LLVM OpenMP* Runtime Library
kmp_stats.h
1 #ifndef KMP_STATS_H
2 #define KMP_STATS_H
3 
8 //===----------------------------------------------------------------------===//
9 //
10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
11 // See https://llvm.org/LICENSE.txt for license information.
12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "kmp_config.h"
17 #include "kmp_debug.h"
18 
19 #if KMP_STATS_ENABLED
20 /* Statistics accumulator.
21  Accumulates number of samples and computes min, max, mean, standard deviation
22  on the fly.
23 
24  Online variance calculation algorithm from
25  http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
26  */
27 
28 #include "kmp_stats_timing.h"
29 #include <limits>
30 #include <math.h>
31 #include <new> // placement new
32 #include <stdint.h>
33 #include <string>
34 #include <vector>
35 
36 /* Enable developer statistics here if you want them. They are more detailed
37  than is useful for application characterisation and are intended for the
38  runtime library developer. */
39 #define KMP_DEVELOPER_STATS 0
40 
41 /* Enable/Disable histogram output */
42 #define KMP_STATS_HIST 0
43 
50  noTotal = 1 << 0,
51  onlyInMaster = 1 << 1,
52  noUnits = 1 << 2,
53  notInMaster = 1 << 3,
54  logEvent = 1 << 4
55 };
57 
64  IDLE,
65  SERIAL_REGION,
66  FORK_JOIN_BARRIER,
67  PLAIN_BARRIER,
68  TASKWAIT,
69  TASKYIELD,
70  TASKGROUP,
71  IMPLICIT_TASK,
72  EXPLICIT_TASK,
73  TEAMS_REGION
74 };
75 
94 // clang-format off
95 #define KMP_FOREACH_COUNTER(macro, arg) \
96  macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \
97  macro(OMP_NESTED_PARALLEL, 0, arg) \
98  macro(OMP_LOOP_STATIC, 0, arg) \
99  macro(OMP_LOOP_STATIC_STEAL, 0, arg) \
100  macro(OMP_LOOP_DYNAMIC, 0, arg) \
101  macro(OMP_DISTRIBUTE, 0, arg) \
102  macro(OMP_BARRIER, 0, arg) \
103  macro(OMP_CRITICAL, 0, arg) \
104  macro(OMP_SINGLE, 0, arg) \
105  macro(OMP_MASTER, 0, arg) \
106  macro(OMP_TEAMS, 0, arg) \
107  macro(OMP_set_lock, 0, arg) \
108  macro(OMP_test_lock, 0, arg) \
109  macro(REDUCE_wait, 0, arg) \
110  macro(REDUCE_nowait, 0, arg) \
111  macro(OMP_TASKYIELD, 0, arg) \
112  macro(OMP_TASKLOOP, 0, arg) \
113  macro(TASK_executed, 0, arg) \
114  macro(TASK_cancelled, 0, arg) \
115  macro(TASK_stolen, 0, arg)
116 // clang-format on
117 
136 // clang-format off
137 #define KMP_FOREACH_TIMER(macro, arg) \
138  macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \
139  macro (OMP_parallel, stats_flags_e::logEvent, arg) \
140  macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \
141  macro (OMP_teams, stats_flags_e::logEvent, arg) \
142  macro (OMP_teams_overhead, stats_flags_e::logEvent, arg) \
143  macro (OMP_loop_static, 0, arg) \
144  macro (OMP_loop_static_scheduling, 0, arg) \
145  macro (OMP_loop_dynamic, 0, arg) \
146  macro (OMP_loop_dynamic_scheduling, 0, arg) \
147  macro (OMP_distribute, 0, arg) \
148  macro (OMP_distribute_scheduling, 0, arg) \
149  macro (OMP_critical, 0, arg) \
150  macro (OMP_critical_wait, 0, arg) \
151  macro (OMP_single, 0, arg) \
152  macro (OMP_master, 0, arg) \
153  macro (OMP_task_immediate, 0, arg) \
154  macro (OMP_task_taskwait, 0, arg) \
155  macro (OMP_task_taskyield, 0, arg) \
156  macro (OMP_task_taskgroup, 0, arg) \
157  macro (OMP_task_join_bar, 0, arg) \
158  macro (OMP_task_plain_bar, 0, arg) \
159  macro (OMP_taskloop_scheduling, 0, arg) \
160  macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \
161  macro (OMP_idle, stats_flags_e::logEvent, arg) \
162  macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \
163  macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \
164  macro (OMP_serial, stats_flags_e::logEvent, arg) \
165  macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \
166  arg) \
167  macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \
168  arg) \
169  macro (OMP_loop_static_iterations, \
170  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
171  macro (OMP_loop_static_total_iterations, \
172  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
173  macro (OMP_loop_dynamic_iterations, \
174  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
175  macro (OMP_loop_dynamic_total_iterations, \
176  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
177  macro (OMP_distribute_iterations, \
178  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
179  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
180 // clang-format on
181 
182 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
183 // initializing OpenMP or being created by a master)
184 // until the thread is destroyed
185 // OMP_parallel -- Time thread spends executing work directly
186 // within a #pragma omp parallel
187 // OMP_parallel_overhead -- Time thread spends setting up a parallel region
188 // OMP_loop_static -- Time thread spends executing loop iterations from
189 // a statically scheduled loop
190 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
191 // from a statically scheduled loop
192 // OMP_loop_dynamic -- Time thread spends executing loop iterations from
193 // a dynamically scheduled loop
194 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
195 // from a dynamically scheduled loop
196 // OMP_critical -- Time thread spends executing critical section
197 // OMP_critical_wait -- Time thread spends waiting to enter
198 // a critcal seciton
199 // OMP_single -- Time spent executing a "single" region
200 // OMP_master -- Time spent executing a "master" region
201 // OMP_task_immediate -- Time spent executing non-deferred tasks
202 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait
203 // construct
204 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield
205 // construct
206 // OMP_task_taskgroup -- Time spent executing tasks inside a taskygroup
207 // construct
208 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier
209 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier
210 // construct
211 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
212 // construct
213 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or
214 // inside implicit barrier at end of worksharing
215 // construct
216 // OMP_idle -- Time worker threads spend waiting for next
217 // parallel region
218 // OMP_fork_barrier -- Time spent in a the fork barrier surrounding a
219 // parallel region
220 // OMP_join_barrier -- Time spent in a the join barrier surrounding a
221 // parallel region
222 // OMP_serial -- Time thread zero spends executing serial code
223 // OMP_set_numthreads -- Values passed to omp_set_num_threads
224 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region
225 // OMP_loop_static_iterations -- Number of iterations thread is assigned for
226 // statically scheduled loops
227 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
228 // dynamically scheduled loops
229 
230 #if (KMP_DEVELOPER_STATS)
231 // Timers which are of interest to runtime library developers, not end users.
232 // These have to be explicitly enabled in addition to the other stats.
233 
234 // KMP_fork_barrier -- time in __kmp_fork_barrier
235 // KMP_join_barrier -- time in __kmp_join_barrier
236 // KMP_barrier -- time in __kmp_barrier
237 // KMP_end_split_barrier -- time in __kmp_end_split_barrier
238 // KMP_setup_icv_copy -- time in __kmp_setup_icv_copy
239 // KMP_icv_copy -- start/stop timer for any ICV copying
240 // KMP_linear_gather -- time in __kmp_linear_barrier_gather
241 // KMP_linear_release -- time in __kmp_linear_barrier_release
242 // KMP_tree_gather -- time in __kmp_tree_barrier_gather
243 // KMP_tree_release -- time in __kmp_tree_barrier_release
244 // KMP_hyper_gather -- time in __kmp_hyper_barrier_gather
245 // KMP_hyper_release -- time in __kmp_hyper_barrier_release
246 // clang-format off
247 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \
248  macro(KMP_fork_call, 0, arg) \
249  macro(KMP_join_call, 0, arg) \
250  macro(KMP_end_split_barrier, 0, arg) \
251  macro(KMP_hier_gather, 0, arg) \
252  macro(KMP_hier_release, 0, arg) \
253  macro(KMP_hyper_gather, 0, arg) \
254  macro(KMP_hyper_release, 0, arg) \
255  macro(KMP_linear_gather, 0, arg) \
256  macro(KMP_linear_release, 0, arg) \
257  macro(KMP_tree_gather, 0, arg) \
258  macro(KMP_tree_release, 0, arg) \
259  macro(USER_resume, 0, arg) \
260  macro(USER_suspend, 0, arg) \
261  macro(KMP_allocate_team, 0, arg) \
262  macro(KMP_setup_icv_copy, 0, arg) \
263  macro(USER_icv_copy, 0, arg) \
264  macro (FOR_static_steal_stolen, \
265  stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \
266  macro (FOR_static_steal_chunks, \
267  stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
268 #else
269 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
270 #endif
271 // clang-format on
272 
292 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg)
293 
294 #define ENUMERATE(name, ignore, prefix) prefix##name,
295 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST };
296 
297 enum explicit_timer_e {
298  KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST
299 };
300 
301 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
302 #undef ENUMERATE
303 
304 /*
305  * A logarithmic histogram. It accumulates the number of values in each power of
306  * ten bin. So 1<=x<10, 10<=x<100, ...
307  * Mostly useful where we have some big outliers and want to see information
308  * about them.
309  */
310 class logHistogram {
311  enum {
312  numBins = 31, /* Number of powers of 10. If this changes you need to change
313  * the initializer for binMax */
314 
315  /*
316  * If you want to use this to analyse values that may be less than 1, (for
317  * instance times in s), then the logOffset gives you negative powers.
318  * In our case here, we're just looking at times in ticks, or counts, so we
319  * can never see values with magnitude < 1 (other than zero), so we can set
320  * it to 0. As above change the initializer if you change this.
321  */
322  logOffset = 0
323  };
324  uint32_t KMP_ALIGN_CACHE zeroCount;
325  struct {
326  uint32_t count;
327  double total;
328  } bins[numBins];
329 
330  static double binMax[numBins];
331 
332 #ifdef KMP_DEBUG
333  uint64_t _total;
334 
335  void check() const {
336  uint64_t t = zeroCount;
337  for (int i = 0; i < numBins; i++)
338  t += bins[i].count;
339  KMP_DEBUG_ASSERT(t == _total);
340  }
341 #else
342  void check() const {}
343 #endif
344 
345 public:
346  logHistogram() { reset(); }
347 
348  logHistogram(logHistogram const &o) {
349  for (int i = 0; i < numBins; i++)
350  bins[i] = o.bins[i];
351 #ifdef KMP_DEBUG
352  _total = o._total;
353 #endif
354  }
355 
356  void reset() {
357  zeroCount = 0;
358  for (int i = 0; i < numBins; i++) {
359  bins[i].count = 0;
360  bins[i].total = 0;
361  }
362 
363 #ifdef KMP_DEBUG
364  _total = 0;
365 #endif
366  }
367  uint32_t count(int b) const { return bins[b + logOffset].count; }
368  double total(int b) const { return bins[b + logOffset].total; }
369  static uint32_t findBin(double sample);
370 
371  logHistogram &operator+=(logHistogram const &o) {
372  zeroCount += o.zeroCount;
373  for (int i = 0; i < numBins; i++) {
374  bins[i].count += o.bins[i].count;
375  bins[i].total += o.bins[i].total;
376  }
377 #ifdef KMP_DEBUG
378  _total += o._total;
379  check();
380 #endif
381 
382  return *this;
383  }
384 
385  void addSample(double sample);
386  int minBin() const;
387  int maxBin() const;
388 
389  std::string format(char) const;
390 };
391 
392 class statistic {
393  double KMP_ALIGN_CACHE minVal;
394  double maxVal;
395  double meanVal;
396  double m2;
397  uint64_t sampleCount;
398  double offset;
399  bool collectingHist;
400  logHistogram hist;
401 
402 public:
403  statistic(bool doHist = bool(KMP_STATS_HIST)) {
404  reset();
405  collectingHist = doHist;
406  }
407  statistic(statistic const &o)
408  : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
409  sampleCount(o.sampleCount), offset(o.offset),
410  collectingHist(o.collectingHist), hist(o.hist) {}
411  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
412  : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
413  sampleCount(sc), offset(0.0), collectingHist(false) {}
414  bool haveHist() const { return collectingHist; }
415  double getMin() const { return minVal; }
416  double getMean() const { return meanVal; }
417  double getMax() const { return maxVal; }
418  uint64_t getCount() const { return sampleCount; }
419  double getSD() const { return sqrt(m2 / sampleCount); }
420  double getTotal() const { return sampleCount * meanVal; }
421  logHistogram const *getHist() const { return &hist; }
422  void setOffset(double d) { offset = d; }
423 
424  void reset() {
425  minVal = std::numeric_limits<double>::max();
426  maxVal = -minVal;
427  meanVal = 0.0;
428  m2 = 0.0;
429  sampleCount = 0;
430  offset = 0.0;
431  hist.reset();
432  }
433  void addSample(double sample);
434  void scale(double factor);
435  void scaleDown(double f) { scale(1. / f); }
436  void forceCount(uint64_t count) { sampleCount = count; }
437  statistic &operator+=(statistic const &other);
438 
439  std::string format(char unit, bool total = false) const;
440  std::string formatHist(char unit) const { return hist.format(unit); }
441 };
442 
443 struct statInfo {
444  const char *name;
445  uint32_t flags;
446 };
447 
448 class timeStat : public statistic {
449  static statInfo timerInfo[];
450 
451 public:
452  timeStat() : statistic() {}
453  static const char *name(timer_e e) { return timerInfo[e].name; }
454  static bool noTotal(timer_e e) {
455  return timerInfo[e].flags & stats_flags_e::noTotal;
456  }
457  static bool masterOnly(timer_e e) {
458  return timerInfo[e].flags & stats_flags_e::onlyInMaster;
459  }
460  static bool workerOnly(timer_e e) {
461  return timerInfo[e].flags & stats_flags_e::notInMaster;
462  }
463  static bool noUnits(timer_e e) {
464  return timerInfo[e].flags & stats_flags_e::noUnits;
465  }
466  static bool logEvent(timer_e e) {
467  return timerInfo[e].flags & stats_flags_e::logEvent;
468  }
469  static void clearEventFlags() {
470  for (int i = 0; i < TIMER_LAST; i++) {
471  timerInfo[i].flags &= (~(stats_flags_e::logEvent));
472  }
473  }
474 };
475 
476 // Where we need explicitly to start and end the timer, this version can be used
477 // Since these timers normally aren't nicely scoped, so don't have a good place
478 // to live on the stack of the thread, they're more work to use.
479 class explicitTimer {
480  timeStat *stat;
481  timer_e timerEnumValue;
482  tsc_tick_count startTime;
483  tsc_tick_count pauseStartTime;
484  tsc_tick_count::tsc_interval_t totalPauseTime;
485 
486 public:
487  explicitTimer(timeStat *s, timer_e te)
488  : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
489  totalPauseTime() {}
490 
491  // void setStat(timeStat *s) { stat = s; }
492  void start(tsc_tick_count tick);
493  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
494  void resume(tsc_tick_count tick) {
495  totalPauseTime += (tick - pauseStartTime);
496  }
497  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
498  void reset() {
499  startTime = 0;
500  pauseStartTime = 0;
501  totalPauseTime = 0;
502  }
503  timer_e get_type() const { return timerEnumValue; }
504 };
505 
506 // Where you need to partition a threads clock ticks into separate states
507 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and
508 // DOING_NOTHING would render these conditions:
509 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive
510 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice
511 // versa
512 class partitionedTimers {
513 private:
514  std::vector<explicitTimer> timer_stack;
515 
516 public:
517  partitionedTimers();
518  void init(explicitTimer timer);
519  void exchange(explicitTimer timer);
520  void push(explicitTimer timer);
521  void pop();
522  void windup();
523 };
524 
525 // Special wrapper around the partioned timers to aid timing code blocks
526 // It avoids the need to have an explicit end, leaving the scope suffices.
527 class blockPartitionedTimer {
528  partitionedTimers *part_timers;
529 
530 public:
531  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
532  : part_timers(pt) {
533  part_timers->push(timer);
534  }
535  ~blockPartitionedTimer() { part_timers->pop(); }
536 };
537 
538 // Special wrapper around the thread state to aid in keeping state in code
539 // blocks It avoids the need to have an explicit end, leaving the scope
540 // suffices.
541 class blockThreadState {
542  stats_state_e *state_pointer;
543  stats_state_e old_state;
544 
545 public:
546  blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state)
547  : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) {
548  *state_pointer = new_state;
549  }
550  ~blockThreadState() { *state_pointer = old_state; }
551 };
552 
553 // If all you want is a count, then you can use this...
554 // The individual per-thread counts will be aggregated into a statistic at
555 // program exit.
556 class counter {
557  uint64_t value;
558  static const statInfo counterInfo[];
559 
560 public:
561  counter() : value(0) {}
562  void increment() { value++; }
563  uint64_t getValue() const { return value; }
564  void reset() { value = 0; }
565  static const char *name(counter_e e) { return counterInfo[e].name; }
566  static bool masterOnly(counter_e e) {
567  return counterInfo[e].flags & stats_flags_e::onlyInMaster;
568  }
569 };
570 
571 /* ****************************************************************
572  Class to implement an event
573 
574  There are four components to an event: start time, stop time
575  nest_level, and timer_name.
576  The start and stop time should be obvious (recorded in clock ticks).
577  The nest_level relates to the bar width in the timeline graph.
578  The timer_name is used to determine which timer event triggered this event.
579 
580  the interface to this class is through four read-only operations:
581  1) getStart() -- returns the start time as 64 bit integer
582  2) getStop() -- returns the stop time as 64 bit integer
583  3) getNestLevel() -- returns the nest level of the event
584  4) getTimerName() -- returns the timer name that triggered event
585 
586  *MORE ON NEST_LEVEL*
587  The nest level is used in the bar graph that represents the timeline.
588  Its main purpose is for showing how events are nested inside eachother.
589  For example, say events, A, B, and C are recorded. If the timeline
590  looks like this:
591 
592 Begin -------------------------------------------------------------> Time
593  | | | | | |
594  A B C C B A
595  start start start end end end
596 
597  Then A, B, C will have a nest level of 1, 2, 3 respectively.
598  These values are then used to calculate the barwidth so you can
599  see that inside A, B has occurred, and inside B, C has occurred.
600  Currently, this is shown with A's bar width being larger than B's
601  bar width, and B's bar width being larger than C's bar width.
602 
603 **************************************************************** */
604 class kmp_stats_event {
605  uint64_t start;
606  uint64_t stop;
607  int nest_level;
608  timer_e timer_name;
609 
610 public:
611  kmp_stats_event()
612  : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {}
613  kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme)
614  : start(strt), stop(stp), nest_level(nst), timer_name(nme) {}
615  inline uint64_t getStart() const { return start; }
616  inline uint64_t getStop() const { return stop; }
617  inline int getNestLevel() const { return nest_level; }
618  inline timer_e getTimerName() const { return timer_name; }
619 };
620 
621 /* ****************************************************************
622  Class to implement a dynamically expandable array of events
623 
624  ---------------------------------------------------------
625  | event 1 | event 2 | event 3 | event 4 | ... | event N |
626  ---------------------------------------------------------
627 
628  An event is pushed onto the back of this array at every
629  explicitTimer->stop() call. The event records the thread #,
630  start time, stop time, and nest level related to the bar width.
631 
632  The event vector starts at size INIT_SIZE and grows (doubles in size)
633  if needed. An implication of this behavior is that log(N)
634  reallocations are needed (where N is number of events). If you want
635  to avoid reallocations, then set INIT_SIZE to a large value.
636 
637  the interface to this class is through six operations:
638  1) reset() -- sets the internal_size back to 0 but does not deallocate any
639  memory
640  2) size() -- returns the number of valid elements in the vector
641  3) push_back(start, stop, nest, timer_name) -- pushes an event onto
642  the back of the array
643  4) deallocate() -- frees all memory associated with the vector
644  5) sort() -- sorts the vector by start time
645  6) operator[index] or at(index) -- returns event reference at that index
646 **************************************************************** */
647 class kmp_stats_event_vector {
648  kmp_stats_event *events;
649  int internal_size;
650  int allocated_size;
651  static const int INIT_SIZE = 1024;
652 
653 public:
654  kmp_stats_event_vector() {
655  events =
656  (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE);
657  internal_size = 0;
658  allocated_size = INIT_SIZE;
659  }
660  ~kmp_stats_event_vector() {}
661  inline void reset() { internal_size = 0; }
662  inline int size() const { return internal_size; }
663  void push_back(uint64_t start_time, uint64_t stop_time, int nest_level,
664  timer_e name) {
665  int i;
666  if (internal_size == allocated_size) {
667  kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate(
668  sizeof(kmp_stats_event) * allocated_size * 2);
669  for (i = 0; i < internal_size; i++)
670  tmp[i] = events[i];
671  __kmp_free(events);
672  events = tmp;
673  allocated_size *= 2;
674  }
675  events[internal_size] =
676  kmp_stats_event(start_time, stop_time, nest_level, name);
677  internal_size++;
678  return;
679  }
680  void deallocate();
681  void sort();
682  const kmp_stats_event &operator[](int index) const { return events[index]; }
683  kmp_stats_event &operator[](int index) { return events[index]; }
684  const kmp_stats_event &at(int index) const { return events[index]; }
685  kmp_stats_event &at(int index) { return events[index]; }
686 };
687 
688 /* ****************************************************************
689  Class to implement a doubly-linked, circular, statistics list
690 
691  |---| ---> |---| ---> |---| ---> |---| ---> ... next
692  | | | | | | | |
693  |---| <--- |---| <--- |---| <--- |---| <--- ... prev
694  Sentinel first second third
695  Node node node node
696 
697  The Sentinel Node is the user handle on the list.
698  The first node corresponds to thread 0's statistics.
699  The second node corresponds to thread 1's statistics and so on...
700 
701  Each node has a _timers, _counters, and _explicitTimers array to hold that
702  thread's statistics. The _explicitTimers point to the correct _timer and
703  update its statistics at every stop() call. The explicitTimers' pointers are
704  set up in the constructor. Each node also has an event vector to hold that
705  thread's timing events. The event vector expands as necessary and records
706  the start-stop times for each timer.
707 
708  The nestLevel variable is for plotting events and is related
709  to the bar width in the timeline graph.
710 
711  Every thread will have a thread local pointer to its node in
712  the list. The sentinel node is used by the master thread to
713  store "dummy" statistics before __kmp_create_worker() is called.
714 **************************************************************** */
715 class kmp_stats_list {
716  int gtid;
717  timeStat _timers[TIMER_LAST + 1];
718  counter _counters[COUNTER_LAST + 1];
719  explicitTimer thread_life_timer;
720  partitionedTimers _partitionedTimers;
721  int _nestLevel; // one per thread
722  kmp_stats_event_vector _event_vector;
723  kmp_stats_list *next;
724  kmp_stats_list *prev;
725  stats_state_e state;
726  int thread_is_idle_flag;
727 
728 public:
729  kmp_stats_list()
730  : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
731  TIMER_OMP_worker_thread_life),
732  _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
733  thread_is_idle_flag(0) {}
734  ~kmp_stats_list() {}
735  inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
736  inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
737  inline partitionedTimers *getPartitionedTimers() {
738  return &_partitionedTimers;
739  }
740  inline timeStat *getTimers() { return _timers; }
741  inline counter *getCounters() { return _counters; }
742  inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
743  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
744  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
745  inline void resetEventVector() { _event_vector.reset(); }
746  inline void incrementNestValue() { _nestLevel++; }
747  inline int getNestValue() { return _nestLevel; }
748  inline void decrementNestValue() { _nestLevel--; }
749  inline int getGtid() const { return gtid; }
750  inline void setGtid(int newgtid) { gtid = newgtid; }
751  inline void setState(stats_state_e newstate) { state = newstate; }
752  inline stats_state_e getState() const { return state; }
753  inline stats_state_e *getStatePointer() { return &state; }
754  inline bool isIdle() { return thread_is_idle_flag == 1; }
755  inline void setIdleFlag() { thread_is_idle_flag = 1; }
756  inline void resetIdleFlag() { thread_is_idle_flag = 0; }
757  kmp_stats_list *push_back(int gtid); // returns newly created list node
758  inline void push_event(uint64_t start_time, uint64_t stop_time,
759  int nest_level, timer_e name) {
760  _event_vector.push_back(start_time, stop_time, nest_level, name);
761  }
762  void deallocate();
763  class iterator;
764  kmp_stats_list::iterator begin();
765  kmp_stats_list::iterator end();
766  int size();
767  class iterator {
768  kmp_stats_list *ptr;
769  friend kmp_stats_list::iterator kmp_stats_list::begin();
770  friend kmp_stats_list::iterator kmp_stats_list::end();
771 
772  public:
773  iterator();
774  ~iterator();
775  iterator operator++();
776  iterator operator++(int dummy);
777  iterator operator--();
778  iterator operator--(int dummy);
779  bool operator!=(const iterator &rhs);
780  bool operator==(const iterator &rhs);
781  kmp_stats_list *operator*() const; // dereference operator
782  };
783 };
784 
785 /* ****************************************************************
786  Class to encapsulate all output functions and the environment variables
787 
788  This module holds filenames for various outputs (normal stats, events, plot
789  file), as well as coloring information for the plot file.
790 
791  The filenames and flags variables are read from environment variables.
792  These are read once by the constructor of the global variable
793  __kmp_stats_output which calls init().
794 
795  During this init() call, event flags for the timeStat::timerInfo[] global
796  array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes).
797 
798  The only interface function that is public is outputStats(heading). This
799  function should print out everything it needs to, either to files or stderr,
800  depending on the environment variables described below
801 
802  ENVIRONMENT VARIABLES:
803  KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this
804  file, otherwise, print to stderr
805  KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to
806  either KMP_STATS_FILE or stderr
807  KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename,
808  otherwise, the plot file is sent to "events.plt"
809  KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log
810  events
811  KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file,
812  otherwise, output is sent to "events.dat"
813 **************************************************************** */
814 class kmp_stats_output_module {
815 
816 public:
817  struct rgb_color {
818  float r;
819  float g;
820  float b;
821  };
822 
823 private:
824  std::string outputFileName;
825  static const char *eventsFileName;
826  static const char *plotFileName;
827  static int printPerThreadFlag;
828  static int printPerThreadEventsFlag;
829  static const rgb_color globalColorArray[];
830  static rgb_color timerColorInfo[];
831 
832  void init();
833  static void setupEventColors();
834  static void printPloticusFile();
835  static void printHeaderInfo(FILE *statsOut);
836  static void printTimerStats(FILE *statsOut, statistic const *theStats,
837  statistic const *totalStats);
838  static void printCounterStats(FILE *statsOut, statistic const *theStats);
839  static void printCounters(FILE *statsOut, counter const *theCounters);
840  static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents,
841  int gtid);
842  static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; }
843  static void windupExplicitTimers();
844  bool eventPrintingEnabled() const { return printPerThreadEventsFlag; }
845 
846 public:
847  kmp_stats_output_module() { init(); }
848  void outputStats(const char *heading);
849 };
850 
851 #ifdef __cplusplus
852 extern "C" {
853 #endif
854 void __kmp_stats_init();
855 void __kmp_stats_fini();
856 void __kmp_reset_stats();
857 void __kmp_output_stats(const char *);
858 void __kmp_accumulate_stats_at_exit(void);
859 // thread local pointer to stats node within list
860 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr;
861 // head to stats list.
862 extern kmp_stats_list *__kmp_stats_list;
863 // lock for __kmp_stats_list
864 extern kmp_tas_lock_t __kmp_stats_lock;
865 // reference start time
866 extern tsc_tick_count __kmp_stats_start_time;
867 // interface to output
868 extern kmp_stats_output_module __kmp_stats_output;
869 
870 #ifdef __cplusplus
871 }
872 #endif
873 
874 // Simple, standard interfaces that drop out completely if stats aren't enabled
875 
887 #define KMP_COUNT_VALUE(name, value) \
888  __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value)
889 
900 #define KMP_COUNT_BLOCK(name) \
901  __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
902 
920 #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string)
921 
929 #define KMP_INIT_PARTITIONED_TIMERS(name) \
930  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \
931  __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
932 
933 #define KMP_TIME_PARTITIONED_BLOCK(name) \
934  blockPartitionedTimer __PBLOCKTIME__( \
935  __kmp_stats_thread_ptr->getPartitionedTimers(), \
936  explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \
937  TIMER_##name))
938 
939 #define KMP_PUSH_PARTITIONED_TIMER(name) \
940  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \
941  __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
942 
943 #define KMP_POP_PARTITIONED_TIMER() \
944  __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
945 
946 #define KMP_EXCHANGE_PARTITIONED_TIMER(name) \
947  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \
948  __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
949 
950 #define KMP_SET_THREAD_STATE(state_name) \
951  __kmp_stats_thread_ptr->setState(state_name)
952 
953 #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState()
954 
955 #define KMP_SET_THREAD_STATE_BLOCK(state_name) \
956  blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \
957  state_name)
958 
966 #define KMP_RESET_STATS() __kmp_reset_stats()
967 
968 #if (KMP_DEVELOPER_STATS)
969 #define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v)
970 #define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n)
971 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n)
972 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n)
973 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n)
974 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) \
975  KMP_EXCHANGE_PARTITIONED_TIMER(n)
976 #else
977 // Null definitions
978 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
979 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
980 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
981 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
982 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
983 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
984 #endif
985 
986 #else // KMP_STATS_ENABLED
987 
988 // Null definitions
989 #define KMP_COUNT_VALUE(n, v) ((void)0)
990 #define KMP_COUNT_BLOCK(n) ((void)0)
991 
992 #define KMP_OUTPUT_STATS(heading_string) ((void)0)
993 #define KMP_RESET_STATS() ((void)0)
994 
995 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0)
996 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0)
997 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0)
998 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
999 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1000 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0)
1001 #define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0)
1002 #define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0)
1003 #define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0)
1004 #define KMP_POP_PARTITIONED_TIMER() ((void)0)
1005 #define KMP_SET_THREAD_STATE(state_name) ((void)0)
1006 #define KMP_GET_THREAD_STATE() ((void)0)
1007 #define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0)
1008 #endif // KMP_STATS_ENABLED
1009 
1010 #endif // KMP_STATS_H
statistic is valid only for master
Definition: kmp_stats.h:51
statistic is valid only for non-master threads
Definition: kmp_stats.h:53
do not show a TOTAL_aggregation for this statistic
Definition: kmp_stats.h:50
#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg)
Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro.
Definition: kmp_stats.h:292
statistic doesn&#39;t need units printed next to it
Definition: kmp_stats.h:52
stats_flags_e
flags to describe the statistic (timer or counter)
Definition: kmp_stats.h:49
#define KMP_FOREACH_COUNTER(macro, arg)
Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h.
Definition: kmp_stats.h:95
stats_state_e
the states which a thread can be in
Definition: kmp_stats.h:63