#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#include "ompt-specific.h"

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
                              ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
                              ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}
// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(enum sched_type schedule,
                                         bool use_hier = false) {
  // Pick up the nonmonotonic/monotonic bits from the scheduling type
  int monotonicity;
  // default to monotonic
  monotonicity = SCHEDULE_MONOTONIC;
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    monotonicity = SCHEDULE_NONMONOTONIC;
  else if (SCHEDULE_HAS_MONOTONIC(schedule))
    monotonicity = SCHEDULE_MONOTONIC;
  return monotonicity;
}
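// Illustration (not part of the original source): a schedule value that
// carries the OpenMP nonmonotonic modifier bit, e.g.
//   __kmp_get_monotonicity((enum sched_type)(kmp_sch_dynamic_chunked |
//                                            kmp_sch_modifier_nonmonotonic))
// yields SCHEDULE_NONMONOTONIC, while a plain kmp_sch_dynamic_chunked keeps
// the monotonic default.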
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  int monotonicity;
  bool use_hier;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "
                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<ST>::spec,
                            traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
#if KMP_USE_HIER_SCHED
  use_hier = pr->flags.use_hier;
#else
  use_hier = false;
#endif

  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
  monotonicity = __kmp_get_monotonicity(schedule, use_hier);
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
  } else {
    pr->flags.ordered = FALSE;
  }
  // Ordered overrides nonmonotonic
  if (pr->flags.ordered) {
    monotonicity = SCHEDULE_MONOTONIC;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the schedule specified by OMP_SCHEDULE (or the default if not set)
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not set)
      chunk = team->t.t_sched.chunk;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#if KMP_STATIC_STEAL_ENABLED
    // map nonmonotonic:dynamic to static steal
    if (schedule == kmp_sch_dynamic_chunked) {
      if (monotonicity == SCHEDULE_NONMONOTONIC)
        schedule = kmp_sch_static_steal;
    }
#endif
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      monotonicity = __kmp_get_monotonicity(schedule, use_hier);
      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
      // Detail the schedule if needed
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
            " chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");
  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count (cast to unsigned so the division is done in the
  // unsigned domain regardless of the result type)
  if (st < 0) { // descending loop
    tc = (UT)(lb - ub) / (-st) + 1;
  } else { // ascending loop
    tc = (UT)(ub - lb) / st + 1;
  }

#if KMP_STATS_ENABLED
  if (KMP_MASTER_GTID(gtid)) {
    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
  }
#endif

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif

  if (pr->flags.ordered) {
    pr->ordered_bumped = 0;
    pr->u.p.ordered_lower = 1;
    pr->u.p.ordered_upper = 0;
  }
  /* Cases */
  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      // parm3 bounds the number of steal attempts; parm4 remembers the
      // neighbour tid where the victim search starts
      pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
      pr->u.p.parm4 = (id + 1) % nproc;
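      // Worked example (illustrative, not in the original source): with
      // tc = 100 iterations, chunk = 7 and nproc = 4 there are ntc = 15
      // chunks, small_chunk = 3 and extras = 3, so threads 0..3 initially own
      // chunk ranges [0,4), [4,8), [8,12) and [12,15); once a thread exhausts
      // its own range it starts stealing, beginning with the thread parm4
      // points at.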
      if (traits_t<T>::type_size > 4) {
        // For 8-byte induction variables use a dynamically allocated
        // per-thread lock; it is freed in __kmp_dispatch_next when the
        // loop finishes (status == 0).
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
      break;
    } else {
      /* too few chunks: fall back to static balanced scheduling */
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_balanced\n",
                     gtid));
      schedule = kmp_sch_static_balanced;
    } // if
    // FALL-THROUGH to static balanced
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        // at most one iteration per thread
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will
      // match it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
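  // Worked example (illustrative, not in the original source): tc = 10
  // iterations on nproc = 4 threads gives small_chunk = 2 and extras = 2, so
  // the per-thread [init, limit] ranges are [0,2], [3,5], [6,7] and [8,9];
  // the first `extras` threads receive one extra iteration and parm1 records
  // which thread executes the last one.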
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to be a multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));
    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1, fall through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
    }
    break;
  } // case
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));
    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        DBL x;

#if KMP_USE_X87CONTROL
        // save original FPCW and set precision to 64-bit, as Windows* OS on
        // IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC);
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which
           C_i <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          }
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
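      // Note (summary of the computation above): with x = 1 - 0.5/nproc the
      // solver finds the smallest chunk index `cross` with x^cross <= target,
      // first growing an upper bound by repeated squaring and then bisecting;
      // chunk indexes >= cross switch to plain dynamic-style chunks of size
      // chunk.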
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1, fall through to kmp_sch_static_greedy */
    }
  } break; // case
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk size = parm1 */
    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));
    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of chunks */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;
    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing increment of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;
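  // Worked example (illustrative, not in the original source): for tc = 100,
  // nproc = 4 and chunk = 1 the setup above yields parm2 = 12 (first chunk
  // size), parm1 = 1 (last chunk size), parm3 = 16 chunks in total and
  // parm4 = 0 (per-chunk decrement after integer rounding); successive chunks
  // shrink linearly from parm2 towards parm1.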
  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  pr->schedule = schedule;
}
#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif // KMP_USE_HIER_SCHED
// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, else long double
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;
  // Count the loop as static or dynamic for the statistics machinery
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
  } else {
    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
  }

#if KMP_USE_HIER_SCHED
  // Hierarchical scheduling does not work with ordered, so if ordered is
  // detected, revert back to threaded scheduling.
  bool ordered;
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  ordered = (kmp_ord_lower & my_sched);
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical scheduling for ordered parallel loops
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (pr->flags.ordered == 0) {
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
  } else {
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
  }

  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT() cannot be used here: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary? */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
        schedtype = 2;
        break;
      default:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // The static_steal_counter flags that other threads may steal from this
  // thread from now on; victims compare it with their own counter to make
  // sure both threads are in the same loop instance.
  if (schedule == kmp_sch_static_steal) {
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  kmp_int32 last = 0;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte induction variables, CAS for 4-byte ones
      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks left
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;
        // search for a victim among the other threads
        while ((!status) && (while_limit != ++while_index)) {
          T remaining;
          T victimIdx = pr->u.p.parm4; // start search with remembered tid
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // no victim ready yet; try once more
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, go to next victim
          }

          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // stealing succeeded: reduce victim's ub by 1/4 of the undone
          // chunks, or by 1
          if (remaining > 3) {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with the stolen range
          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable: pack (count, ub) into one 64-bit word so
      // both fields can be updated with a single CAS
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      {
        union_i4 vold, vnew;
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
        while (!KMP_COMPARE_AND_STORE_ACQ64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew = vold;
          vnew.p.count++;
        }
        vnew = vold;
        init = vnew.p.count;
        status = (init < (UT)vnew.p.ub);
      }

      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = pr->u.p.parm3;
        int while_index = 0;

        // search for a victim among the other threads
        while ((!status) && (while_limit != ++while_index)) {
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // no victim ready yet; try once more
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, go to next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // steal 1/4 of remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (CAS succeeded)
            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
          } // while (steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
    break;
  } // case
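  // Note (summary of the case above): for 4-byte induction variables the
  // victim's (count, ub) pair is packed into one 64-bit word (union_i4) so
  // both fields can be updated with a single KMP_COMPARE_AND_STORE_ACQ64;
  // a successful steal moves roughly a quarter of the victim's remaining
  // chunks (remaining >> 2) to the thief.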
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iterations to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = pr->u.p.parm1;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
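  // Worked example (illustrative, not in the original source): with chunk = 4
  // and tc = 100 (trip = 99), the atomic test_then_inc_acq on
  // sh->u.s.iteration hands out chunk indexes 0, 1, 2, ... across the team,
  // so a thread that draws index 5 executes iterations [20, 23] of the
  // canonical iteration space.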
  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) {
        // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule:
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      limit = init +
              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
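  // Note (summary of the case above): each grab either takes a plain chunk of
  // chunkspec iterations via test_then_add once few iterations remain, or
  // computes a span proportional to the remaining trip count (the fraction
  // stored in parm3) and claims it with compare_and_swap on
  // sh->u.s.iteration; a failed CAS means another thread advanced the shared
  // counter first and the loop simply retries.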
  case kmp_sch_guided_simd: {
    // same as iterative but the current chunk is adjusted to be a multiple of
    // the given chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) {
        status = 0; // nothing to do, don't try atomic op
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule:
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          } // if
        } // if
        break;
      } // if
      // divide by K*nproc
      UT span = remaining * (*(double *)&pr->u.p.parm3);
      UT rem = span % chunk;
      if (rem) // adjust so that span % chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      } // if
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing original FPCW value for Windows* OS on IA-32 architecture
       8-byte version */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
         * calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
        /* use exponential-style scheduling */
#if KMP_USE_X87CONTROL
        /* if not already done, save original FPCW and set precision to
           64-bit, as Windows* OS on IA-32 architecture defaults to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        } // if
      } // if
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary; check fpcwSet first because oldFpcw may be
       uninitialized here */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } // if
  } // case
  break;
  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      } // if
    } // if
  } // case
  break;
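  // Note (summary of the case above): chunk `index` starts at iteration
  // index*(2*parm2 - (index-1)*parm4)/2, i.e. the chunk sizes form the
  // decreasing arithmetic sequence parm2, parm2 - parm4, ... down to parm1,
  // which is why the schedule is called trapezoid self-scheduling.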
  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
    );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));
  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        KMP_MB(); /* Flush all pending memory write invalidates. */

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates. */
      } // if

      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  } // if

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  KMP_STATS_LOOP_END;
  return status;
}
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif
  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal: lower/upper bounds are inconsistent with the
      // sign of the increment.
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }
  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper;
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper;
      }
    }
  }
}
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
  );
}
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
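// Sketch (illustrative, not part of the original source) of how a compiler
// typically drives these entry points for a dynamically scheduled loop; the
// local names below are made up for the example:
//
//   kmp_int32 lb, ub, st, last;
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
//                          /*lb=*/0, /*ub=*/n - 1, /*st=*/1, /*chunk=*/4);
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
//     for (kmp_int32 i = lb; i <= ub; i += st)
//       body(i);
//   }
//
// __kmpc_dispatch_next_4 returns nonzero while chunks remain; for an ordered
// loop the compiler additionally calls __kmpc_dispatch_fini_4 so that threads
// waiting on the ordered iteration counter can be released.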
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}
kmp_uint32
__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), void *obj) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    // if we have waited a bit, or are oversubscribed, yield
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
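// Example use (illustrative, under the signature above): spinning until a
// shared 32-bit counter reaches a value, e.g.
//   __kmp_wait_4(&counter, expected, __kmp_eq_4, NULL);
// blocks until pred(*spinner, checker) becomes true, yielding the CPU when
// the machine is oversubscribed.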
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
                      kmp_uint32 (*pred)(void *, kmp_uint32), void *obj) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    // if we have waited a bit, or are oversubscribed, yield
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}
#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */