#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_USE_X87CONTROL
#include <float.h>
#endif
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#include "ompt-specific.h"

void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}
// Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk.  The loop description is found in lb (lower
// bound), ub (upper bound), and st (stride).
template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
                                   dispatch_private_info_template<T> *pr,
                                   enum sched_type schedule, T lb, T ub,
                                   typename traits_t<T>::signed_t st,
#if USE_ITT_BUILD
                                   kmp_uint64 *cur_chunk,
#endif
                                   typename traits_t<T>::signed_t chunk,
                                   T nproc, T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;

#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init_algorithm: T#%%d called "
        "pr:%%p lb:%%%s ub:%%%s st:%%%s "
        "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec,
        traits_t<ST>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;

#if USE_ITT_BUILD
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif

#if (KMP_STATIC_STEAL_ENABLED)
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    // there is only one implementation of stealing, so use it
    schedule = kmp_sch_static_steal;
  else
#endif
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->flags.nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->flags.nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->flags.ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->flags.ordered = FALSE;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or the default if not
      // specified)
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
            "schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
#if OMP_45_ENABLED
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      if (cur_chunk)
        *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
                                " chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
#endif // OMP_45_ENABLED
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");
  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    tc = (ub >= lb) ? ub - lb + 1 : 0; // zero-trip if ub < lb
  } else if (st < 0) {
    // cast to unsigned so the division stays well defined for full-range loops
    tc = (lb >= ub) ? (UT)(lb - ub) / (-st) + 1 : 0;
  } else { // st > 0
    tc = (ub >= lb) ? (UT)(ub - lb) / st + 1 : 0;
  }
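  // Example: for a loop like "for (i = 5; i <= 20; i += 3)" we get lb = 5,
  // ub = 20, st = 3, so tc = (20 - 5) / 3 + 1 = 6 iterations
  // (i = 5, 8, 11, 14, 17, 20).  The unsigned cast keeps the division well
  // defined when ub - lb would overflow the signed type.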
  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */
  if (active) {
    if (pr->flags.ordered) {
      pr->ordered_bumped = 0;
      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
      T id = tid;
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

      pr->u.p.parm2 = lb;
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
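      // The ntc chunks are divided as evenly as possible among the nproc
      // threads: each thread owns small_chunk = ntc / nproc chunks and the
      // first extras = ntc % nproc threads own one extra chunk.  E.g. with
      // ntc = 10 and nproc = 4, threads 0..3 initially own chunk ranges
      // [0,3), [3,6), [6,8), [8,10) via the count/ub pair set above; parm4
      // records the neighbour to try first when stealing.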
      if (traits_t<T>::type_size > 4) {
        // 8-byte induction variable: use a dynamically allocated per-thread
        // lock; the memory is freed in __kmp_dispatch_next when status == 0
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
      break;
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_balanced\n",
                     gtid));
      schedule = kmp_sch_static_balanced;
      /* too few chunks: fall through to kmp_sch_static_balanced */
    }
    /* FALL-THROUGH to static balanced */
  } // case
#endif
  case kmp_sch_static_balanced: {
    T init, limit;

    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));

    if (nproc > 1) {
      T id = tid;

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else {
        // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      if (cur_chunk)
        *cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will
      // match it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->flags.ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
#if OMP_45_ENABLED
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to be a multiple of simd width
    T nth = nproc;
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
  case kmp_sch_guided_simd:
#endif // OMP_45_ENABLED
  case kmp_sch_guided_iterative_chunked: {
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
         " case\n",
         gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iterations become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
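        // parm2 = guided_int_param * nproc * (chunk + 1) is the threshold
        // below which the schedule degrades to plain dynamic chunks, and
        // parm3 caches guided_flt_param / nproc, the fraction of the
        // remaining iterations that a thread grabs on each successful CAS in
        // __kmp_dispatch_next_algorithm (never less than the requested chunk).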
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(
          100,
          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
           gtid));
    }
    break;
  } // case
  case kmp_sch_guided_analytical_chunked: {
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        DBL x;

#if KMP_USE_X87CONTROL
        /* Windows* OS on IA-32 architecture defaults to 53-bit precision;
           save the original FPCW and set precision to 64-bit. */
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC);
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which
           C_i <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper bound; any positive value works, but it
             affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          }
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;
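        /* Under the analytical guided model chunk sizes shrink roughly
           geometrically with ratio x = 1 - 1/(2*nproc); "cross" is the
           smallest chunk index i with x^i <= target, i.e. roughly the first
           chunk whose analytically computed size has fallen to the requested
           chunk.  From that index on, dispatching switches to fixed chunks
           of the requested size. */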
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_USE_X87CONTROL
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
    }
    break;
  } // case
  case kmp_sch_static_greedy:
    KD_TRACE(
        100,
        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
         gtid));
    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * nproc));
    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of chunks */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;
    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;
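  // Trapezoid self-scheduling hands out chunks whose sizes decrease linearly
  // from parm2 (first chunk, about tc / (2 * nproc)) down to parm1 (last
  // chunk, at least 1 and at most parm2), in parm3 chunks total, shrinking
  // by parm4 = (parm2 - parm1) / (parm3 - 1) per chunk.  E.g. with tc = 100,
  // nproc = 4, chunk = 1: parm2 = 12, parm1 = 1, parm3 = (200 + 13 - 1) / 13
  // = 16, parm4 = 11 / 15 = 0 in integer math, so every chunk works out to
  // 12 iterations except the final partial one.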
  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  pr->schedule = schedule;
}
#if KMP_USE_HIER_SCHED
template <typename T>
inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
                                             typename traits_t<T>::signed_t st);
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
                                            kmp_int32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_int32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
                                             kmp_uint32 ub, kmp_int32 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint32>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
                                            kmp_int64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_int64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}
template <>
inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
                                             kmp_uint64 ub, kmp_int64 st) {
  __kmp_dispatch_init_hierarchy<kmp_uint64>(
      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
}

// free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  for (int i = 0; i < num_disp_buff; ++i) {
    // type does not matter here so use kmp_int32
    auto sh =
        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
            &team->t.t_disp_buffer[i]);
    if (sh->hier) {
      sh->hier->deallocate();
      __kmp_free(sh->hier);
    }
  }
}
#endif // KMP_USE_HIER_SCHED
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;

  int active;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<T> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;
  if (schedule == __kmp_static) {
    /* ... */
  }
#if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE.
  // Hierarchical scheduling does not work with ordered loops.
  enum sched_type my_sched = schedule;
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
      &th->th.th_dispatch
           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
    my_sched =
        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
  bool ordered = (kmp_ord_lower & my_sched) != 0;
  if (pr->flags.use_hier) {
    if (ordered) {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
                     "Disabling hierarchical scheduling.\n",
                     gtid));
      pr->flags.use_hier = FALSE;
    }
  }
  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
    // Don't use hierarchical scheduling for ordered parallel loops and don't
    // use the runtime hierarchy if one was specified in the program
    if (!ordered && !pr->flags.use_hier)
      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
  }
#endif // KMP_USE_HIER_SCHED

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting =
      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
      team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
                  my_buffer_index));
  }

  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
#if USE_ITT_BUILD
                                &cur_chunk,
#endif
                                chunk, (T)th->th.th_team_nproc,
                                (T)th->th.th_info.ds.ds_tid);
  if (pr->flags.ordered == 0) {
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
  } else {
    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
  }

  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                                 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: buffer index and my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary? */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->flags.ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
#if OMP_45_ENABLED
      case kmp_sch_guided_simd:
#endif
        schedtype = 2;
        break;
      default:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
    }
#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier) {
      pr->u.p.count = 0;
      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
    }
#endif // KMP_USE_HIER_SCHED
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // The dedicated 'static_steal_counter' flags that other threads may steal
  // from this thread from now on.
  if (schedule == kmp_sch_static_steal) {
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
}
/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}
#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */
template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
                                  dispatch_private_info_template<T> *pr,
                                  dispatch_shared_info_template<T> volatile *sh,
                                  kmp_int32 *p_last, T *p_lb, T *p_ub,
                                  typename traits_t<T>::signed_t *p_st, T nproc,
                                  T tid) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;
  int status = 0;
  kmp_int32 last = 0;
  T start;
  ST incr;
  UT limit, trip, init;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  KMP_DEBUG_ASSERT(pr);
  KMP_DEBUG_ASSERT(sh);
  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff =
        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
                         "sh:%%p nproc:%%%s tid:%%%s\n",
                         traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
    __kmp_str_free(&buff);
  }
#endif

  // zero trip count
  if (pr->u.p.tc == 0) {
    KD_TRACE(10,
             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
              "zero status:%d\n",
              gtid, status));
    return 0;
  }

  switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
              gtid));

    trip = pr->u.p.tc - 1;

    if (traits_t<T>::type_size > 4) {
      // use lock for 8-byte induction variable
      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
      KMP_DEBUG_ASSERT(lck != NULL);
      if (pr->u.p.count < (UT)pr->u.p.ub) {
        __kmp_acquire_lock(lck, gtid);
        // try to get own chunk of iterations
        init = (pr->u.p.count)++;
        status = (init < (UT)pr->u.p.ub);
        __kmp_release_lock(lck, gtid);
      } else {
        status = 0; // no own chunks
      }
      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = nproc; // nproc attempts to find a victim
        int while_index = 0;
        // search for a victim with leftover chunks
        while ((!status) && (while_limit != ++while_index)) {
          T remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total); the victim
                      // has not passed __kmp_dispatch_init yet
          }
          if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
            continue; // not enough chunks to steal, goto next victim
          }

          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
          KMP_ASSERT(lck != NULL);
          __kmp_acquire_lock(lck, gtid);
          limit = victim->u.p.ub; // keep initial ub
          if (victim->u.p.count >= limit ||
              (remaining = limit - victim->u.p.count) < 2) {
            __kmp_release_lock(lck, gtid);
            pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
            continue; // not enough chunks to steal
          }
          // steal 1/4 of the remaining chunks, or 1 of the 2-3 remaining
          if (remaining > 3) {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
            init = (victim->u.p.ub -= (remaining >> 2));
          } else {
            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
            init = (victim->u.p.ub -= 1);
          }
          __kmp_release_lock(lck, gtid);

          KMP_DEBUG_ASSERT(init + 1 <= limit);
          pr->u.p.parm4 = victimIdx; // remember victim to steal from
          status = 1;
          while_index = 0;
          // now update own count and ub with the stolen range minus init chunk
          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
          pr->u.p.count = init + 1;
          pr->u.p.ub = limit;
          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
        } // while (search for victim)
      } // if (try to find victim and steal)
    } else {
      // 4-byte induction variable: use an 8-byte CAS over the packed
      // (count, ub) pair instead of a lock
      typedef union {
        struct {
          UT count;
          T ub;
        } p;
        kmp_int64 b;
      } union_i4;
      {
        union_i4 vold, vnew;
        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
        vnew = vold;
        vnew.p.count++;
        while (!KMP_COMPARE_AND_STORE_ACQ64(
            (volatile kmp_int64 *)&pr->u.p.count,
            *VOLATILE_CAST(kmp_int64 *) & vold.b,
            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
          KMP_CPU_PAUSE();
          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
          vnew = vold;
          vnew.p.count++;
        }
        vnew = vold;
        init = vnew.p.count;
        status = (init < (UT)vnew.p.ub);
      }

      if (!status) { // try to steal
        kmp_info_t **other_threads = team->t.t_threads;
        int while_limit = nproc; // nproc attempts to find a victim
        int while_index = 0;

        // search for a victim with leftover chunks
        while ((!status) && (while_limit != ++while_index)) {
          union_i4 vold, vnew;
          kmp_int32 remaining;
          T victimIdx = pr->u.p.parm4;
          T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
          dispatch_private_info_template<T> *victim =
              reinterpret_cast<dispatch_private_info_template<T> *>(
                  other_threads[victimIdx]
                      ->th.th_dispatch->th_dispatch_pr_current);
          while ((victim == NULL || victim == pr ||
                  (*(volatile T *)&victim->u.p.static_steal_counter !=
                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
                 oldVictimIdx != victimIdx) {
            victimIdx = (victimIdx + 1) % nproc;
            victim = reinterpret_cast<dispatch_private_info_template<T> *>(
                other_threads[victimIdx]
                    ->th.th_dispatch->th_dispatch_pr_current);
          }
          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
                          *(volatile T *)&pr->u.p.static_steal_counter)) {
            continue; // try once more (nproc attempts in total)
          }
          pr->u.p.parm4 = victimIdx; // new victim found
          while (1) { // CAS loop if victim has enough chunks to steal
            vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
            vnew = vold;

            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (vnew.p.count >= (UT)vnew.p.ub ||
                (remaining = vnew.p.ub - vnew.p.count) < 2) {
              pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
              break; // not enough chunks to steal, goto next victim
            }
            if (remaining > 3) {
              vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
            } else {
              vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
            }
            KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
            if (KMP_COMPARE_AND_STORE_ACQ64(
                    (volatile kmp_int64 *)&victim->u.p.count,
                    *VOLATILE_CAST(kmp_int64 *) & vold.b,
                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
              // stealing succeeded
              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
                                        vold.p.ub - vnew.p.ub);
              status = 1;
              while_index = 0;
              // now update own count and ub
              init = vnew.p.ub;
              vold.p.count = init + 1;
#if KMP_ARCH_X86
              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
#else
              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
#endif
              break;
            } // if (check CAS result)
            KMP_CPU_PAUSE(); // CAS failed, repeat attempt
          } // while (try to steal from particular victim)
        } // while (search for victim)
      } // if (try to find victim and steal)
    } // if (4-byte induction variable)
    if (!status) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.parm2;
      init *= chunk;
      limit = chunk + init - 1;
      incr = pr->u.p.st;
      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);

      KMP_DEBUG_ASSERT(init <= trip);
      if ((last = (limit >= trip)) != 0)
        limit = trip;
      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
    break;
  } // case
#endif // ( KMP_STATIC_STEAL_ENABLED )
  case kmp_sch_static_balanced: {
    KD_TRACE(
        10,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
         gtid));
    /* check if thread has any iteration to do */
    if ((status = !pr->u.p.count) != 0) {
      pr->u.p.count = 1;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
      last = pr->u.p.parm1;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } else { /* no iterations to do */
      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
    }
  } // case
  break;
  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
                                 merged here */
  case kmp_sch_static_chunked: {
    T parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_static_[affinity|chunked] case\n",
                   gtid));
    parm1 = pr->u.p.parm1;

    trip = pr->u.p.tc - 1;
    init = parm1 * (pr->u.p.count + tid);

    if ((status = (init <= trip)) != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      limit = parm1 + init - 1;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      pr->u.p.count += nproc;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } // case
  break;
  case kmp_sch_dynamic_chunked: {
    T chunk = pr->u.p.parm1;

    KD_TRACE(
        100,
        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
         gtid));

    init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
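    // Each call claims the next chunk index with an atomic fetch-and-add on
    // the shared iteration counter, so no thread-private bookkeeping is
    // needed: chunk index k covers iterations [k * chunk, (k + 1) * chunk)
    // of the canonical 0..tc-1 space, clipped to the trip count below.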
    trip = pr->u.p.tc - 1;

    if ((status = (init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = chunk + init - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } // case
  break;
  case kmp_sch_guided_iterative_chunked: {
    T chunkspec = pr->u.p.parm1;
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
                   "iterative case\n",
                   gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      if ((T)remaining < pr->u.p.parm2) {
        // compare with K*nproc*(chunk+1), K=2 by default:
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunkspec);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunkspec) {
            limit = init + chunkspec - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          }
        }
        break;
      }
      limit = init +
              (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      }
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;
#if OMP_45_ENABLED
  case kmp_sch_guided_simd: {
    // same as iterative but curr-chunk adjusted to be a multiple of given chunk
    T chunk = pr->u.p.parm1;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
              gtid));
    trip = pr->u.p.tc;
    // Start atomic part of calculations
    while (1) {
      ST remaining; // signed, because can be < 0
      init = sh->u.s.iteration; // shared value
      remaining = trip - init;
      if (remaining <= 0) { // nothing to do, don't try atomic op
        status = 0;
        break;
      }
      KMP_DEBUG_ASSERT(init % chunk == 0);
      // compare with K*nproc*(chunk+1), K=2 by default
      if ((T)remaining < pr->u.p.parm2) {
        // use dynamic-style schedule
        // atomically increment iterations, get old value
        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                                 (ST)chunk);
        remaining = trip - init;
        if (remaining <= 0) {
          status = 0; // all iterations got by other threads
        } else {
          // got some iterations to work on
          status = 1;
          if ((T)remaining > chunk) {
            limit = init + chunk - 1;
          } else {
            last = 1; // the last chunk
            limit = init + remaining - 1;
          }
        }
        break;
      }
      // divide by K*nproc
      UT span = remaining * (*(double *)&pr->u.p.parm3);
      UT rem = span % chunk;
      if (rem) // adjust so that span%chunk == 0
        span += chunk - rem;
      limit = init + span;
      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
                               (ST)init, (ST)limit)) {
        // CAS was successful, chunk obtained
        status = 1;
        --limit;
        break;
      }
    } // while
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;
#endif // OMP_45_ENABLED

  case kmp_sch_guided_analytical_chunked: {
    T chunkspec = pr->u.p.parm1;
    UT chunkIdx;
#if KMP_USE_X87CONTROL
    /* for storing original FPCW value for Windows* OS on IA-32 architecture */
    unsigned int oldFpcw;
    unsigned int fpcwSet = 0;
#endif
    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
                   "kmp_sch_guided_analytical_chunked case\n",
                   gtid));

    trip = pr->u.p.tc;

    KMP_DEBUG_ASSERT(nproc > 1);
    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);

    while (1) { /* this while loop is a safeguard against unexpected zero
                   chunk sizes */
      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
      if (chunkIdx >= (UT)pr->u.p.parm2) {
        --trip;
        /* use dynamic-style scheduling */
        init = chunkIdx * chunkspec + pr->u.p.count;
        /* need to verify init > 0 in case of overflow in the above
         * calculation */
        if ((status = (init > 0 && init <= trip)) != 0) {
          limit = init + chunkspec - 1;

          if ((last = (limit >= trip)) != 0)
            limit = trip;
        }
        break;
      } else {
        /* use exponential-style scheduling */
#if KMP_USE_X87CONTROL
        /* If we haven't already done so, save original FPCW and set
           precision to 64-bit, as Windows* OS on IA-32 architecture
           defaults to 53-bit */
        if (!fpcwSet) {
          oldFpcw = _control87(0, 0);
          _control87(_PC_64, _MCW_PC);
          fpcwSet = 0x30000;
        }
#endif
        if (chunkIdx) {
          init = __kmp_dispatch_guided_remaining<T>(
              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
          KMP_DEBUG_ASSERT(init);
          init = trip - init;
        } else
          init = 0;
        limit = trip - __kmp_dispatch_guided_remaining<T>(
                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
        KMP_ASSERT(init <= limit);
        if (init < limit) {
          KMP_DEBUG_ASSERT(limit <= trip);
          --limit;
          status = 1;
          break;
        }
      }
    } // while (1)
#if KMP_USE_X87CONTROL
    /* restore FPCW if necessary; check fpcwSet flag first because oldFpcw can
       be uninitialized here */
    if (fpcwSet && (oldFpcw & fpcwSet))
      _control87(oldFpcw, _MCW_PC);
#endif
    if (status != 0) {
      start = pr->u.p.lb;
      incr = pr->u.p.st;
      if (p_st != NULL)
        *p_st = incr;
      *p_lb = start + init * incr;
      *p_ub = start + limit * incr;
      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    } else {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    }
  } // case
  break;
  case kmp_sch_trapezoidal: {
    UT index;
    T parm2 = pr->u.p.parm2;
    T parm3 = pr->u.p.parm3;
    T parm4 = pr->u.p.parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
              gtid));

    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
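    // The chunk starting at index i has size parm2 - i * parm4, so the first
    // iteration of chunk "index" is the sum of the preceding chunk sizes:
    //   sum_{i=0..index-1} (parm2 - i * parm4)
    //     = index * (2 * parm2 - (index - 1) * parm4) / 2,
    // which is exactly the init value computed above; limit below is the last
    // iteration of this chunk, derived from the same sum taken one chunk
    // further.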
    trip = pr->u.p.tc - 1;

    if ((status = ((T)index < parm3 && init <= trip)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
    } else {
      start = pr->u.p.lb;
      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
      incr = pr->u.p.st;

      if ((last = (limit >= trip)) != 0)
        limit = trip;

      if (p_st != NULL)
        *p_st = incr;

      if (incr == 1) {
        *p_lb = start + init;
        *p_ub = start + limit;
      } else {
        *p_lb = start + init * incr;
        *p_ub = start + limit * incr;
      }

      if (pr->flags.ordered) {
        pr->u.p.ordered_lower = init;
        pr->u.p.ordered_upper = limit;
      }
    }
  } // case
  break;
  default: {
    status = 0; // to avoid complaints on uninitialized variable use
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  if (p_last)
    *p_last = last;
#ifdef KMP_DEBUG
  if (pr->flags.ordered) {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
                            "ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t<UT>::spec, traits_t<UT>::spec);
    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
    __kmp_str_free(&buff);
  }
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
    __kmp_str_free(&buff);
  }
#endif
  return status;
}
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled.ompt_callback_work) {                                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
          &(task_info->task_data), 0, codeptr);                                \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

#if KMP_STATS_ENABLED
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    kmp_int64 u, l, t, i;                                                      \
    l = (kmp_int64)(*p_lb);                                                    \
    u = (kmp_int64)(*p_ub);                                                    \
    i = (kmp_int64)(pr->u.p.st);                                               \
    if (status == 0) {                                                         \
      t = 0;                                                                   \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (i == 1) {                                                       \
      if (u >= l)                                                              \
        t = u - l + 1;                                                         \
      else                                                                     \
        t = 0;                                                                 \
    } else if (i < 0) {                                                        \
      if (l >= u)                                                              \
        t = (l - u) / (-i) + 1;                                                \
      else                                                                     \
        t = 0;                                                                 \
    } else {                                                                   \
      if (u >= l)                                                              \
        t = (u - l) / i + 1;                                                   \
      else                                                                     \
        t = 0;                                                                 \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                               ,
                               void *codeptr
#endif
                               ) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  // This is potentially slightly misleading: schedule(runtime) is counted
  // here even if the actual runtime schedule is static.
  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
  KD_TRACE(
      1000,
      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
       gtid, p_lb, p_ub, p_st, p_last));

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->flags.nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->flags.ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        }
      }
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    }
#ifdef KMP_DEBUG
    {
      char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    KMP_STATS_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<T> volatile *sh;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

#if KMP_USE_HIER_SCHED
    if (pr->flags.use_hier)
      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
    else
#endif // KMP_USE_HIER_SCHED
      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
                                                p_st, th->th.th_team_nproc,
                                                th->th.th_info.ds.ds_tid);
    // status == 0: no more iterations to execute
    if (status == 0) {
      UT num_done;

      num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
            traits_t<UT>::spec);
        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
        __kmp_str_free(&buff);
      }
#endif

#if KMP_USE_HIER_SCHED
      pr->flags.use_hier = FALSE;
#endif
      if ((ST)num_done == th->th.th_team_nproc - 1) {
#if (KMP_STATIC_STEAL_ENABLED)
        if (pr->schedule == kmp_sch_static_steal &&
            traits_t<T>::type_size > 4) {
          int i;
          kmp_info_t **other_threads = team->t.t_threads;
          // loop complete, safe to destroy locks used for stealing
          for (i = 0; i < th->th.th_team_nproc; ++i) {
            kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
            KMP_ASSERT(lck != NULL);
            __kmp_destroy_lock(lck);
            __kmp_free(lck);
            other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
          }
        }
#endif
        /* NOTE: release this buffer to be reused */

        KMP_MB(); /* Flush all pending memory write invalidates.  */

        sh->u.s.num_done = 0;
        sh->u.s.iteration = 0;

        /* TODO replace with general release procedure? */
        if (pr->flags.ordered) {
          sh->u.s.ordered_iteration = 0;
        }

        KMP_MB(); /* Flush all pending memory write invalidates.  */

        sh->buffer_index += __kmp_dispatch_num_buffers;
        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
                       gtid, sh->buffer_index));

        KMP_MB(); /* Flush all pending memory write invalidates.  */
      }
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }

      th->th.th_dispatch->th_deo_fcn = NULL;
      th->th.th_dispatch->th_dxo_fcn = NULL;
      th->th.th_dispatch->th_dispatch_sh_current = NULL;
      th->th.th_dispatch->th_dispatch_pr_current = NULL;
    } // if (status == 0)
#if KMP_OS_WINDOWS
    else if (last) {
      pr->u.p.last_upper = pr->u.p.ub;
    }
#endif /* KMP_OS_WINDOWS */
    if (p_last != NULL && status != 0)
      *p_last = last;
  }

#ifdef KMP_DEBUG
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_next: T#%%d normal case: "
        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
                  (p_last ? *p_last : 0), status));
    __kmp_str_free(&buff);
  }
#endif
#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_NEXT();
#endif
  OMPT_LOOP_END;
  KMP_STATS_LOOP_END;
  return status;
}
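// __kmp_dist_get_bounds splits the iteration space of a distribute loop
// across the teams of a teams construct before the per-team schedule is
// initialized.  With the balanced static schedule each team gets either
// trip_count / nteams or trip_count / nteams + 1 consecutive iterations
// (the first trip_count % nteams teams get the larger share); with the
// greedy schedule every team gets ceil(trip_count / nteams) iterations and
// the last team's upper bound is clamped back to the user's bound.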
template <typename T>
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
                                  kmp_int32 *plastiter, T *plower, T *pupper,
                                  typename traits_t<T>::signed_t incr) {
  typedef typename traits_t<T>::unsigned_t UT;
  kmp_uint32 team_id;
  kmp_uint32 nteams;
  UT trip_count;
  kmp_team_t *team;
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
#ifdef KMP_DEBUG
  typedef typename traits_t<T>::signed_t ST;
  {
    char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec, traits_t<T>::spec);
    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
    __kmp_str_free(&buff);
  }
#endif

  if (__kmp_env_consistency_check) {
    if (incr == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
                            loc);
    }
    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
      // The loop is illegal; some zero-trip loops are maintained by the
      // compiler and only detectable at run time.
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
    }
  }
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
  nteams = th->th.th_teams_size.nteams;
  team_id = team->t.t_master_tid;
  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);

  // compute global trip count
  if (incr == 1) {
    trip_count = *pupper - *plower + 1;
  } else if (incr == -1) {
    trip_count = *plower - *pupper + 1;
  } else if (incr > 0) {
    // upper-lower can exceed the limit of signed type
    trip_count = (UT)(*pupper - *plower) / incr + 1;
  } else {
    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
  }

  if (trip_count <= nteams) {
    KMP_DEBUG_ASSERT(
        __kmp_static == kmp_sch_static_greedy ||
        __kmp_static ==
            kmp_sch_static_balanced); // Unknown static scheduling type.
    // only some teams get single iteration, others get nothing
    if (team_id < trip_count) {
      *pupper = *plower = *plower + team_id * incr;
    } else {
      *plower = *pupper + incr; // zero-trip loop
    }
    if (plastiter != NULL)
      *plastiter = (team_id == trip_count - 1);
  } else {
    if (__kmp_static == kmp_sch_static_balanced) {
      UT chunk = trip_count / nteams;
      UT extras = trip_count % nteams;
      *plower +=
          incr * (team_id * chunk + (team_id < extras ? team_id : extras));
      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
      if (plastiter != NULL)
        *plastiter = (team_id == nteams - 1);
    } else {
      T chunk_inc_count =
          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
      T upper = *pupper;
      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
      // Unknown static scheduling type.
      *plower += team_id * chunk_inc_count;
      *pupper = *plower + chunk_inc_count - incr;
      // Check/correct bounds if needed
      if (incr > 0) {
        if (*pupper < *plower)
          *pupper = traits_t<T>::max_value;
        if (plastiter != NULL)
          *plastiter = *plower <= upper && *pupper > upper - incr;
        if (*pupper > upper)
          *pupper = upper;
      } else {
        if (*pupper > *plower)
          *pupper = traits_t<T>::min_value;
        if (plastiter != NULL)
          *plastiter = *plower >= upper && *pupper < upper - incr;
        if (*pupper < upper)
          *pupper = upper;
      }
    }
  }
}
/*!
@ingroup WORK_SHARING

These functions prepare the runtime to start a dynamically scheduled for
loop, saving the loop arguments.  They are identical apart from the types of
the arguments.
*/
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int32 lb,
                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint32 lb,
                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                            enum sched_type schedule, kmp_int64 lb,
                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4
*/
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                             enum sched_type schedule, kmp_uint64 lb,
                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

/*!
See @ref __kmpc_dispatch_init_4

The difference from the __kmpc_dispatch_init set of functions is that these
are called for the composite distribute parallel for construct, so the
per-team iteration space is computed before regular dispatching starts.
*/
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
                                 kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
                                  kmp_int32 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                                 enum sched_type schedule, kmp_int32 *p_last,
                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
                                 kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}

void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                  enum sched_type schedule, kmp_int32 *p_last,
                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
                                  kmp_int64 chunk) {
  KMP_DEBUG_ASSERT(__kmp_init_serial);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
}
/*!
Get the next dynamically allocated chunk of work for this thread.
Returns one if there is work to be done, zero otherwise; if there is no more
work, lb, ub and stride need not be modified.
*/
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                            );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,
                            kmp_int32 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                             );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                        ,
                                        OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                            );
}

/*!
See @ref __kmpc_dispatch_next_4
*/
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,
                            kmp_int64 *p_st) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
#if OMPT_SUPPORT && OMPT_OPTIONAL
                                         ,
                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
#endif
                                             );
}

/*!
Mark the end of a dynamic loop.
*/
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
}

/*!
See @ref __kmpc_dispatch_fini_4
*/
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
}
/*! @} */
// Non-template routines from kmp_dispatch.cpp used in other sources

kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value == checker;
}

kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
  return value != checker;
}

kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
  return value < checker;
}

kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
  return value >= checker;
}

kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
  return value <= checker;
}

kmp_uint32
__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
                   kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
                   void *obj // Higher-level synchronization object, or NULL.
                   ) {
  // note: we may not belong to a team at this point
  volatile kmp_uint32 *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
  kmp_uint32 r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = TCR_4(*spin), check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield;
       pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

void __kmp_wait_yield_4_ptr(
    void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
    void *obj // Higher-level synchronization object, or NULL.
    ) {
  // note: we may not belong to a team at this point
  void *spin = spinner;
  kmp_uint32 check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  KMP_FSYNC_SPIN_INIT(obj, spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* if we have waited a bit, or are oversubscribed, yield;
       pause is in the following code */
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
}
#ifdef KMP_GOMP_COMPAT

void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int32 lb,
                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint32 lb,
                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
                               enum sched_type schedule, kmp_int64 lb,
                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
                               int push_ws) {
  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
                                 push_ws);
}

void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
                                enum sched_type schedule, kmp_uint64 lb,
                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
                                int push_ws) {
  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
                                  push_ws);
}

void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
}

void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
}

#endif /* KMP_GOMP_COMPAT */