Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_taskq.cpp =================================================================== Property changes on: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_taskq.cpp ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp.h =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp.h (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp.h (revision 357059) @@ -1,3917 +1,3916 @@ /*! \file */ /* * kmp.h -- KPTS runtime header file. */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef KMP_H #define KMP_H #include "kmp_config.h" /* #define BUILD_PARALLEL_ORDERED 1 */ /* This fix replaces gettimeofday with clock_gettime for better scalability on the Altix. Requires user code to be linked with -lrt. 
*/ //#define FIX_SGI_CLOCK /* Defines for OpenMP 3.0 tasking and auto scheduling */ #ifndef KMP_STATIC_STEAL_ENABLED #define KMP_STATIC_STEAL_ENABLED 1 #endif #define TASK_CURRENT_NOT_QUEUED 0 #define TASK_CURRENT_QUEUED 1 #ifdef BUILD_TIED_TASK_STACK #define TASK_STACK_EMPTY 0 // entries when the stack is empty #define TASK_STACK_BLOCK_BITS 5 // Used in TASK_STACK_SIZE and TASK_STACK_MASK // Number of entries in each task stack array #define TASK_STACK_BLOCK_SIZE (1 << TASK_STACK_BLOCK_BITS) // Mask for determining index into stack block #define TASK_STACK_INDEX_MASK (TASK_STACK_BLOCK_SIZE - 1) #endif // BUILD_TIED_TASK_STACK #define TASK_NOT_PUSHED 1 #define TASK_SUCCESSFULLY_PUSHED 0 #define TASK_TIED 1 #define TASK_UNTIED 0 #define TASK_EXPLICIT 1 #define TASK_IMPLICIT 0 #define TASK_PROXY 1 #define TASK_FULL 0 #define TASK_DETACHABLE 1 #define TASK_UNDETACHABLE 0 #define KMP_CANCEL_THREADS #define KMP_THREAD_ATTR // Android does not have pthread_cancel. Undefine KMP_CANCEL_THREADS if being // built on Android #if defined(__ANDROID__) #undef KMP_CANCEL_THREADS #endif #include #include #include #include #include #include /* include don't use; problems with /MD on Windows* OS NT due to bad Microsoft library. 
Some macros provided below to replace these functions */ #ifndef __ABSOFT_WIN #include #endif #include #include #include #include "kmp_os.h" #include "kmp_safe_c_api.h" #if KMP_STATS_ENABLED class kmp_stats_list; #endif #if KMP_USE_HIER_SCHED // Only include hierarchical scheduling if affinity is supported #undef KMP_USE_HIER_SCHED #define KMP_USE_HIER_SCHED KMP_AFFINITY_SUPPORTED #endif #if KMP_USE_HWLOC && KMP_AFFINITY_SUPPORTED #include "hwloc.h" #ifndef HWLOC_OBJ_NUMANODE #define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE #endif #ifndef HWLOC_OBJ_PACKAGE #define HWLOC_OBJ_PACKAGE HWLOC_OBJ_SOCKET #endif #if HWLOC_API_VERSION >= 0x00020000 // hwloc 2.0 changed type of depth of object from unsigned to int typedef int kmp_hwloc_depth_t; #else typedef unsigned int kmp_hwloc_depth_t; #endif #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 #include #endif #include "kmp_debug.h" #include "kmp_lock.h" #include "kmp_version.h" #if USE_DEBUGGER #include "kmp_debugger.h" #endif #include "kmp_i18n.h" #define KMP_HANDLE_SIGNALS (KMP_OS_UNIX || KMP_OS_WINDOWS) #include "kmp_wrapper_malloc.h" #if KMP_OS_UNIX #include #if !defined NSIG && defined _NSIG #define NSIG _NSIG #endif #endif #if KMP_OS_LINUX #pragma weak clock_gettime #endif #if OMPT_SUPPORT #include "ompt-internal.h" #endif // Affinity format function #include "kmp_str.h" // 0 - no fast memory allocation, alignment: 8-byte on x86, 16-byte on x64. // 3 - fast allocation using sync, non-sync free lists of any size, non-self // free lists of limited size. #ifndef USE_FAST_MEMORY #define USE_FAST_MEMORY 3 #endif #ifndef KMP_NESTED_HOT_TEAMS #define KMP_NESTED_HOT_TEAMS 0 #define USE_NESTED_HOT_ARG(x) #else #if KMP_NESTED_HOT_TEAMS #define USE_NESTED_HOT_ARG(x) , x #else #define USE_NESTED_HOT_ARG(x) #endif #endif // Assume using BGET compare_exchange instruction instead of lock by default. 
#ifndef USE_CMP_XCHG_FOR_BGET #define USE_CMP_XCHG_FOR_BGET 1 #endif // Test to see if queuing lock is better than bootstrap lock for bget // #ifndef USE_QUEUING_LOCK_FOR_BGET // #define USE_QUEUING_LOCK_FOR_BGET // #endif #define KMP_NSEC_PER_SEC 1000000000L #define KMP_USEC_PER_SEC 1000000L /*! @ingroup BASIC_TYPES @{ */ /*! Values for bit flags used in the ident_t to describe the fields. */ enum { /*! Use trampoline for internal microtasks */ KMP_IDENT_IMB = 0x01, /*! Use c-style ident structure */ KMP_IDENT_KMPC = 0x02, /* 0x04 is no longer used */ /*! Entry point generated by auto-parallelization */ KMP_IDENT_AUTOPAR = 0x08, /*! Compiler generates atomic reduction option for kmpc_reduce* */ KMP_IDENT_ATOMIC_REDUCE = 0x10, /*! To mark a 'barrier' directive in user code */ KMP_IDENT_BARRIER_EXPL = 0x20, /*! To Mark implicit barriers. */ KMP_IDENT_BARRIER_IMPL = 0x0040, KMP_IDENT_BARRIER_IMPL_MASK = 0x01C0, KMP_IDENT_BARRIER_IMPL_FOR = 0x0040, KMP_IDENT_BARRIER_IMPL_SECTIONS = 0x00C0, KMP_IDENT_BARRIER_IMPL_SINGLE = 0x0140, KMP_IDENT_BARRIER_IMPL_WORKSHARE = 0x01C0, /*! To mark a static loop in OMPT callbacks */ KMP_IDENT_WORK_LOOP = 0x200, /*! To mark a sections directive in OMPT callbacks */ KMP_IDENT_WORK_SECTIONS = 0x400, /*! To mark a distirbute construct in OMPT callbacks */ KMP_IDENT_WORK_DISTRIBUTE = 0x800, /*! Atomic hint; bottom four bits as omp_sync_hint_t. Top four reserved and not currently used. If one day we need more bits, then we can use an invalid combination of hints to mean that another, larger field should be used in a different flag. */ KMP_IDENT_ATOMIC_HINT_MASK = 0xFF0000, KMP_IDENT_ATOMIC_HINT_UNCONTENDED = 0x010000, KMP_IDENT_ATOMIC_HINT_CONTENDED = 0x020000, KMP_IDENT_ATOMIC_HINT_NONSPECULATIVE = 0x040000, KMP_IDENT_ATOMIC_HINT_SPECULATIVE = 0x080000, }; /*! * The ident structure that describes a source location. 
*/ typedef struct ident { kmp_int32 reserved_1; /**< might be used in Fortran; see above */ kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC identifies this union member */ kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */ #if USE_ITT_BUILD /* but currently used for storing region-specific ITT */ /* contextual information. */ #endif /* USE_ITT_BUILD */ kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */ char const *psource; /**< String describing the source location. The string is composed of semi-colon separated fields which describe the source file, the function and a pair of line numbers that delimit the construct. */ } ident_t; /*! @} */ // Some forward declarations. typedef union kmp_team kmp_team_t; typedef struct kmp_taskdata kmp_taskdata_t; typedef union kmp_task_team kmp_task_team_t; typedef union kmp_team kmp_team_p; typedef union kmp_info kmp_info_p; typedef union kmp_root kmp_root_p; #ifdef __cplusplus extern "C" { #endif /* ------------------------------------------------------------------------ */ /* Pack two 32-bit signed integers into a 64-bit signed integer */ /* ToDo: Fix word ordering for big-endian machines. */ #define KMP_PACK_64(HIGH_32, LOW_32) \ ((kmp_int64)((((kmp_uint64)(HIGH_32)) << 32) | (kmp_uint64)(LOW_32))) // Generic string manipulation macros. Assume that _x is of type char * #define SKIP_WS(_x) \ { \ while (*(_x) == ' ' || *(_x) == '\t') \ (_x)++; \ } #define SKIP_DIGITS(_x) \ { \ while (*(_x) >= '0' && *(_x) <= '9') \ (_x)++; \ } #define SKIP_TOKEN(_x) \ { \ while ((*(_x) >= '0' && *(_x) <= '9') || (*(_x) >= 'a' && *(_x) <= 'z') || \ (*(_x) >= 'A' && *(_x) <= 'Z') || *(_x) == '_') \ (_x)++; \ } #define SKIP_TO(_x, _c) \ { \ while (*(_x) != '\0' && *(_x) != (_c)) \ (_x)++; \ } /* ------------------------------------------------------------------------ */ #define KMP_MAX(x, y) ((x) > (y) ? (x) : (y)) #define KMP_MIN(x, y) ((x) < (y) ? 
(x) : (y)) /* ------------------------------------------------------------------------ */ /* Enumeration types */ enum kmp_state_timer { ts_stop, ts_start, ts_pause, ts_last_state }; enum dynamic_mode { dynamic_default, #ifdef USE_LOAD_BALANCE dynamic_load_balance, #endif /* USE_LOAD_BALANCE */ dynamic_random, dynamic_thread_limit, dynamic_max }; /* external schedule constants, duplicate enum omp_sched in omp.h in order to * not include it here */ #ifndef KMP_SCHED_TYPE_DEFINED #define KMP_SCHED_TYPE_DEFINED typedef enum kmp_sched { kmp_sched_lower = 0, // lower and upper bounds are for routine parameter check // Note: need to adjust __kmp_sch_map global array in case enum is changed kmp_sched_static = 1, // mapped to kmp_sch_static_chunked (33) kmp_sched_dynamic = 2, // mapped to kmp_sch_dynamic_chunked (35) kmp_sched_guided = 3, // mapped to kmp_sch_guided_chunked (36) kmp_sched_auto = 4, // mapped to kmp_sch_auto (38) kmp_sched_upper_std = 5, // upper bound for standard schedules kmp_sched_lower_ext = 100, // lower bound of Intel extension schedules kmp_sched_trapezoidal = 101, // mapped to kmp_sch_trapezoidal (39) #if KMP_STATIC_STEAL_ENABLED kmp_sched_static_steal = 102, // mapped to kmp_sch_static_steal (44) #endif kmp_sched_upper, kmp_sched_default = kmp_sched_static, // default scheduling kmp_sched_monotonic = 0x80000000 } kmp_sched_t; #endif /*! @ingroup WORK_SHARING * Describes the loop schedule to be used for a parallel for loop. 
*/ enum sched_type : kmp_int32 { kmp_sch_lower = 32, /**< lower bound for unordered values */ kmp_sch_static_chunked = 33, kmp_sch_static = 34, /**< static unspecialized */ kmp_sch_dynamic_chunked = 35, kmp_sch_guided_chunked = 36, /**< guided unspecialized */ kmp_sch_runtime = 37, kmp_sch_auto = 38, /**< auto */ kmp_sch_trapezoidal = 39, /* accessible only through KMP_SCHEDULE environment variable */ kmp_sch_static_greedy = 40, kmp_sch_static_balanced = 41, /* accessible only through KMP_SCHEDULE environment variable */ kmp_sch_guided_iterative_chunked = 42, kmp_sch_guided_analytical_chunked = 43, /* accessible only through KMP_SCHEDULE environment variable */ kmp_sch_static_steal = 44, /* static with chunk adjustment (e.g., simd) */ kmp_sch_static_balanced_chunked = 45, kmp_sch_guided_simd = 46, /**< guided with chunk adjustment */ kmp_sch_runtime_simd = 47, /**< runtime with chunk adjustment */ /* accessible only through KMP_SCHEDULE environment variable */ kmp_sch_upper, /**< upper bound for unordered values */ kmp_ord_lower = 64, /**< lower bound for ordered values, must be power of 2 */ kmp_ord_static_chunked = 65, kmp_ord_static = 66, /**< ordered static unspecialized */ kmp_ord_dynamic_chunked = 67, kmp_ord_guided_chunked = 68, kmp_ord_runtime = 69, kmp_ord_auto = 70, /**< ordered auto */ kmp_ord_trapezoidal = 71, kmp_ord_upper, /**< upper bound for ordered values */ /* Schedules for Distribute construct */ kmp_distribute_static_chunked = 91, /**< distribute static chunked */ kmp_distribute_static = 92, /**< distribute static unspecialized */ /* For the "nomerge" versions, kmp_dispatch_next*() will always return a single iteration/chunk, even if the loop is serialized. For the schedule types listed above, the entire iteration vector is returned if the loop is serialized. This doesn't work for gcc/gcomp sections. 
*/ kmp_nm_lower = 160, /**< lower bound for nomerge values */ kmp_nm_static_chunked = (kmp_sch_static_chunked - kmp_sch_lower + kmp_nm_lower), kmp_nm_static = 162, /**< static unspecialized */ kmp_nm_dynamic_chunked = 163, kmp_nm_guided_chunked = 164, /**< guided unspecialized */ kmp_nm_runtime = 165, kmp_nm_auto = 166, /**< auto */ kmp_nm_trapezoidal = 167, /* accessible only through KMP_SCHEDULE environment variable */ kmp_nm_static_greedy = 168, kmp_nm_static_balanced = 169, /* accessible only through KMP_SCHEDULE environment variable */ kmp_nm_guided_iterative_chunked = 170, kmp_nm_guided_analytical_chunked = 171, kmp_nm_static_steal = 172, /* accessible only through OMP_SCHEDULE environment variable */ kmp_nm_ord_static_chunked = 193, kmp_nm_ord_static = 194, /**< ordered static unspecialized */ kmp_nm_ord_dynamic_chunked = 195, kmp_nm_ord_guided_chunked = 196, kmp_nm_ord_runtime = 197, kmp_nm_ord_auto = 198, /**< auto */ kmp_nm_ord_trapezoidal = 199, kmp_nm_upper, /**< upper bound for nomerge values */ /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. Since we need to distinguish the three possible cases (no modifier, monotonic modifier, nonmonotonic modifier), we need separate bits for each modifier. The absence of monotonic does not imply nonmonotonic, especially since 4.5 says that the behaviour of the "no modifier" case is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. Since we're passing a full 32 bit value, we can use a couple of high bits for these flags; out of paranoia we avoid the sign bit. These modifiers can be or-ed into non-static schedules by the compiler to pass the additional information. They will be stripped early in the processing in __kmp_dispatch_init when setting up schedules, so most of the code won't ever see schedules with these bits set. 
*/ kmp_sch_modifier_monotonic = (1 << 29), /**< Set if the monotonic schedule modifier was present */ kmp_sch_modifier_nonmonotonic = (1 << 30), /**< Set if the nonmonotonic schedule modifier was present */ #define SCHEDULE_WITHOUT_MODIFIERS(s) \ (enum sched_type)( \ (s) & ~(kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) #define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sch_modifier_monotonic) != 0) #define SCHEDULE_HAS_NONMONOTONIC(s) (((s)&kmp_sch_modifier_nonmonotonic) != 0) #define SCHEDULE_HAS_NO_MODIFIERS(s) \ (((s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) == 0) #define SCHEDULE_GET_MODIFIERS(s) \ ((enum sched_type)( \ (s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic))) #define SCHEDULE_SET_MODIFIERS(s, m) \ (s = (enum sched_type)((kmp_int32)s | (kmp_int32)m)) #define SCHEDULE_NONMONOTONIC 0 #define SCHEDULE_MONOTONIC 1 kmp_sch_default = kmp_sch_static /**< default scheduling algorithm */ }; // Apply modifiers on internal kind to standard kind static inline void __kmp_sched_apply_mods_stdkind(kmp_sched_t *kind, enum sched_type internal_kind) { if (SCHEDULE_HAS_MONOTONIC(internal_kind)) { *kind = (kmp_sched_t)((int)*kind | (int)kmp_sched_monotonic); } } // Apply modifiers on standard kind to internal kind static inline void __kmp_sched_apply_mods_intkind(kmp_sched_t kind, enum sched_type *internal_kind) { if ((int)kind & (int)kmp_sched_monotonic) { *internal_kind = (enum sched_type)((int)*internal_kind | (int)kmp_sch_modifier_monotonic); } } // Get standard schedule without modifiers static inline kmp_sched_t __kmp_sched_without_mods(kmp_sched_t kind) { return (kmp_sched_t)((int)kind & ~((int)kmp_sched_monotonic)); } /* Type to keep runtime schedule set via OMP_SCHEDULE or omp_set_schedule() */ typedef union kmp_r_sched { struct { enum sched_type r_sched_type; int chunk; }; kmp_int64 sched; } kmp_r_sched_t; extern enum sched_type __kmp_sch_map[]; // map OMP 3.0 schedule types with our // internal schedule types 
enum library_type { library_none, library_serial, library_turnaround, library_throughput }; #if KMP_OS_LINUX enum clock_function_type { clock_function_gettimeofday, clock_function_clock_gettime }; #endif /* KMP_OS_LINUX */ #if KMP_MIC_SUPPORTED enum mic_type { non_mic, mic1, mic2, mic3, dummy }; #endif /* -- fast reduction stuff ------------------------------------------------ */ #undef KMP_FAST_REDUCTION_BARRIER #define KMP_FAST_REDUCTION_BARRIER 1 #undef KMP_FAST_REDUCTION_CORE_DUO #if KMP_ARCH_X86 || KMP_ARCH_X86_64 #define KMP_FAST_REDUCTION_CORE_DUO 1 #endif enum _reduction_method { reduction_method_not_defined = 0, critical_reduce_block = (1 << 8), atomic_reduce_block = (2 << 8), tree_reduce_block = (3 << 8), empty_reduce_block = (4 << 8) }; // Description of the packed_reduction_method variable: // The packed_reduction_method variable consists of two enum types variables // that are packed together into 0-th byte and 1-st byte: // 0: (packed_reduction_method & 0x000000FF) is a 'enum barrier_type' value of // barrier that will be used in fast reduction: bs_plain_barrier or // bs_reduction_barrier // 1: (packed_reduction_method & 0x0000FF00) is a reduction method that will // be used in fast reduction; // Reduction method is of 'enum _reduction_method' type and it's defined the way // so that the bits of 0-th byte are empty, so no need to execute a shift // instruction while packing/unpacking #if KMP_FAST_REDUCTION_BARRIER #define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method, barrier_type) \ ((reduction_method) | (barrier_type)) #define UNPACK_REDUCTION_METHOD(packed_reduction_method) \ ((enum _reduction_method)((packed_reduction_method) & (0x0000FF00))) #define UNPACK_REDUCTION_BARRIER(packed_reduction_method) \ ((enum barrier_type)((packed_reduction_method) & (0x000000FF))) #else #define PACK_REDUCTION_METHOD_AND_BARRIER(reduction_method, barrier_type) \ (reduction_method) #define UNPACK_REDUCTION_METHOD(packed_reduction_method) \ 
(packed_reduction_method) #define UNPACK_REDUCTION_BARRIER(packed_reduction_method) (bs_plain_barrier) #endif #define TEST_REDUCTION_METHOD(packed_reduction_method, which_reduction_block) \ ((UNPACK_REDUCTION_METHOD(packed_reduction_method)) == \ (which_reduction_block)) #if KMP_FAST_REDUCTION_BARRIER #define TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER \ (PACK_REDUCTION_METHOD_AND_BARRIER(tree_reduce_block, bs_reduction_barrier)) #define TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER \ (PACK_REDUCTION_METHOD_AND_BARRIER(tree_reduce_block, bs_plain_barrier)) #endif typedef int PACKED_REDUCTION_METHOD_T; /* -- end of fast reduction stuff ----------------------------------------- */ #if KMP_OS_WINDOWS #define USE_CBLKDATA #if KMP_MSVC_COMPAT #pragma warning(push) #pragma warning(disable : 271 310) #endif #include #if KMP_MSVC_COMPAT #pragma warning(pop) #endif #endif #if KMP_OS_UNIX #include #include #endif /* Only Linux* OS and Windows* OS support thread affinity. */ #if KMP_AFFINITY_SUPPORTED // GROUP_AFFINITY is already defined for _MSC_VER>=1600 (VS2010 and later). 
#if KMP_OS_WINDOWS #if _MSC_VER < 1600 && KMP_MSVC_COMPAT typedef struct GROUP_AFFINITY { KAFFINITY Mask; WORD Group; WORD Reserved[3]; } GROUP_AFFINITY; #endif /* _MSC_VER < 1600 */ #if KMP_GROUP_AFFINITY extern int __kmp_num_proc_groups; #else static const int __kmp_num_proc_groups = 1; #endif /* KMP_GROUP_AFFINITY */ typedef DWORD (*kmp_GetActiveProcessorCount_t)(WORD); extern kmp_GetActiveProcessorCount_t __kmp_GetActiveProcessorCount; typedef WORD (*kmp_GetActiveProcessorGroupCount_t)(void); extern kmp_GetActiveProcessorGroupCount_t __kmp_GetActiveProcessorGroupCount; typedef BOOL (*kmp_GetThreadGroupAffinity_t)(HANDLE, GROUP_AFFINITY *); extern kmp_GetThreadGroupAffinity_t __kmp_GetThreadGroupAffinity; typedef BOOL (*kmp_SetThreadGroupAffinity_t)(HANDLE, const GROUP_AFFINITY *, GROUP_AFFINITY *); extern kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity; #endif /* KMP_OS_WINDOWS */ #if KMP_USE_HWLOC extern hwloc_topology_t __kmp_hwloc_topology; extern int __kmp_hwloc_error; extern int __kmp_numa_detected; extern int __kmp_tile_depth; #endif extern size_t __kmp_affin_mask_size; #define KMP_AFFINITY_CAPABLE() (__kmp_affin_mask_size > 0) #define KMP_AFFINITY_DISABLE() (__kmp_affin_mask_size = 0) #define KMP_AFFINITY_ENABLE(mask_size) (__kmp_affin_mask_size = mask_size) #define KMP_CPU_SET_ITERATE(i, mask) \ for (i = (mask)->begin(); (int)i != (mask)->end(); i = (mask)->next(i)) #define KMP_CPU_SET(i, mask) (mask)->set(i) #define KMP_CPU_ISSET(i, mask) (mask)->is_set(i) #define KMP_CPU_CLR(i, mask) (mask)->clear(i) #define KMP_CPU_ZERO(mask) (mask)->zero() #define KMP_CPU_COPY(dest, src) (dest)->copy(src) #define KMP_CPU_AND(dest, src) (dest)->bitwise_and(src) #define KMP_CPU_COMPLEMENT(max_bit_number, mask) (mask)->bitwise_not() #define KMP_CPU_UNION(dest, src) (dest)->bitwise_or(src) #define KMP_CPU_ALLOC(ptr) (ptr = __kmp_affinity_dispatch->allocate_mask()) #define KMP_CPU_FREE(ptr) __kmp_affinity_dispatch->deallocate_mask(ptr) #define 
KMP_CPU_ALLOC_ON_STACK(ptr) KMP_CPU_ALLOC(ptr) #define KMP_CPU_FREE_FROM_STACK(ptr) KMP_CPU_FREE(ptr) #define KMP_CPU_INTERNAL_ALLOC(ptr) KMP_CPU_ALLOC(ptr) #define KMP_CPU_INTERNAL_FREE(ptr) KMP_CPU_FREE(ptr) #define KMP_CPU_INDEX(arr, i) __kmp_affinity_dispatch->index_mask_array(arr, i) #define KMP_CPU_ALLOC_ARRAY(arr, n) \ (arr = __kmp_affinity_dispatch->allocate_mask_array(n)) #define KMP_CPU_FREE_ARRAY(arr, n) \ __kmp_affinity_dispatch->deallocate_mask_array(arr) #define KMP_CPU_INTERNAL_ALLOC_ARRAY(arr, n) KMP_CPU_ALLOC_ARRAY(arr, n) #define KMP_CPU_INTERNAL_FREE_ARRAY(arr, n) KMP_CPU_FREE_ARRAY(arr, n) #define __kmp_get_system_affinity(mask, abort_bool) \ (mask)->get_system_affinity(abort_bool) #define __kmp_set_system_affinity(mask, abort_bool) \ (mask)->set_system_affinity(abort_bool) #define __kmp_get_proc_group(mask) (mask)->get_proc_group() class KMPAffinity { public: class Mask { public: void *operator new(size_t n); void operator delete(void *p); void *operator new[](size_t n); void operator delete[](void *p); virtual ~Mask() {} // Set bit i to 1 virtual void set(int i) {} // Return bit i virtual bool is_set(int i) const { return false; } // Set bit i to 0 virtual void clear(int i) {} // Zero out entire mask virtual void zero() {} // Copy src into this mask virtual void copy(const Mask *src) {} // this &= rhs virtual void bitwise_and(const Mask *rhs) {} // this |= rhs virtual void bitwise_or(const Mask *rhs) {} // this = ~this virtual void bitwise_not() {} // API for iterating over an affinity mask // for (int i = mask->begin(); i != mask->end(); i = mask->next(i)) virtual int begin() const { return 0; } virtual int end() const { return 0; } virtual int next(int previous) const { return 0; } // Set the system's affinity to this affinity mask's value virtual int set_system_affinity(bool abort_on_error) const { return -1; } // Set this affinity mask to the current system affinity virtual int get_system_affinity(bool abort_on_error) { return -1; } // 
Only 1 DWORD in the mask should have any procs set. // Return the appropriate index, or -1 for an invalid mask. virtual int get_proc_group() const { return -1; } }; void *operator new(size_t n); void operator delete(void *p); // Need virtual destructor virtual ~KMPAffinity() = default; // Determine if affinity is capable virtual void determine_capable(const char *env_var) {} // Bind the current thread to os proc virtual void bind_thread(int proc) {} // Factory functions to allocate/deallocate a mask virtual Mask *allocate_mask() { return nullptr; } virtual void deallocate_mask(Mask *m) {} virtual Mask *allocate_mask_array(int num) { return nullptr; } virtual void deallocate_mask_array(Mask *m) {} virtual Mask *index_mask_array(Mask *m, int index) { return nullptr; } static void pick_api(); static void destroy_api(); enum api_type { NATIVE_OS #if KMP_USE_HWLOC , HWLOC #endif }; virtual api_type get_api_type() const { KMP_ASSERT(0); return NATIVE_OS; } private: static bool picked_api; }; typedef KMPAffinity::Mask kmp_affin_mask_t; extern KMPAffinity *__kmp_affinity_dispatch; // Declare local char buffers with this size for printing debug and info // messages, using __kmp_affinity_print_mask(). #define KMP_AFFIN_MASK_PRINT_LEN 1024 enum affinity_type { affinity_none = 0, affinity_physical, affinity_logical, affinity_compact, affinity_scatter, affinity_explicit, affinity_balanced, affinity_disabled, // not used outsize the env var parser affinity_default }; enum affinity_gran { affinity_gran_fine = 0, affinity_gran_thread, affinity_gran_core, affinity_gran_tile, affinity_gran_numa, affinity_gran_package, affinity_gran_node, #if KMP_GROUP_AFFINITY // The "group" granularity isn't necesssarily coarser than all of the // other levels, but we put it last in the enum. 
affinity_gran_group, #endif /* KMP_GROUP_AFFINITY */ affinity_gran_default }; enum affinity_top_method { affinity_top_method_all = 0, // try all (supported) methods, in order #if KMP_ARCH_X86 || KMP_ARCH_X86_64 affinity_top_method_apicid, affinity_top_method_x2apicid, #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ affinity_top_method_cpuinfo, // KMP_CPUINFO_FILE is usable on Windows* OS, too #if KMP_GROUP_AFFINITY affinity_top_method_group, #endif /* KMP_GROUP_AFFINITY */ affinity_top_method_flat, #if KMP_USE_HWLOC affinity_top_method_hwloc, #endif affinity_top_method_default }; #define affinity_respect_mask_default (-1) extern enum affinity_type __kmp_affinity_type; /* Affinity type */ extern enum affinity_gran __kmp_affinity_gran; /* Affinity granularity */ extern int __kmp_affinity_gran_levels; /* corresponding int value */ extern int __kmp_affinity_dups; /* Affinity duplicate masks */ extern enum affinity_top_method __kmp_affinity_top_method; extern int __kmp_affinity_compact; /* Affinity 'compact' value */ extern int __kmp_affinity_offset; /* Affinity offset value */ extern int __kmp_affinity_verbose; /* Was verbose specified for KMP_AFFINITY? */ extern int __kmp_affinity_warnings; /* KMP_AFFINITY warnings enabled ? */ extern int __kmp_affinity_respect_mask; // Respect process' init affinity mask? extern char *__kmp_affinity_proclist; /* proc ID list */ extern kmp_affin_mask_t *__kmp_affinity_masks; extern unsigned __kmp_affinity_num_masks; extern void __kmp_affinity_bind_thread(int which); extern kmp_affin_mask_t *__kmp_affin_fullMask; extern char *__kmp_cpuinfo_file; #endif /* KMP_AFFINITY_SUPPORTED */ // This needs to be kept in sync with the values in omp.h !!! 
typedef enum kmp_proc_bind_t { proc_bind_false = 0, proc_bind_true, proc_bind_master, proc_bind_close, proc_bind_spread, proc_bind_intel, // use KMP_AFFINITY interface proc_bind_default } kmp_proc_bind_t; typedef struct kmp_nested_proc_bind_t { kmp_proc_bind_t *bind_types; int size; int used; } kmp_nested_proc_bind_t; extern kmp_nested_proc_bind_t __kmp_nested_proc_bind; extern int __kmp_display_affinity; extern char *__kmp_affinity_format; static const size_t KMP_AFFINITY_FORMAT_SIZE = 512; #if KMP_AFFINITY_SUPPORTED #define KMP_PLACE_ALL (-1) #define KMP_PLACE_UNDEFINED (-2) // Is KMP_AFFINITY is being used instead of OMP_PROC_BIND/OMP_PLACES? #define KMP_AFFINITY_NON_PROC_BIND \ ((__kmp_nested_proc_bind.bind_types[0] == proc_bind_false || \ __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) && \ (__kmp_affinity_num_masks > 0 || __kmp_affinity_type == affinity_balanced)) #endif /* KMP_AFFINITY_SUPPORTED */ extern int __kmp_affinity_num_places; typedef enum kmp_cancel_kind_t { cancel_noreq = 0, cancel_parallel = 1, cancel_loop = 2, cancel_sections = 3, cancel_taskgroup = 4 } kmp_cancel_kind_t; // KMP_HW_SUBSET support: typedef struct kmp_hws_item { int num; int offset; } kmp_hws_item_t; extern kmp_hws_item_t __kmp_hws_socket; extern kmp_hws_item_t __kmp_hws_node; extern kmp_hws_item_t __kmp_hws_tile; extern kmp_hws_item_t __kmp_hws_core; extern kmp_hws_item_t __kmp_hws_proc; extern int __kmp_hws_requested; extern int __kmp_hws_abs_flag; // absolute or per-item number requested /* ------------------------------------------------------------------------ */ #define KMP_PAD(type, sz) \ (sizeof(type) + (sz - ((sizeof(type) - 1) % (sz)) - 1)) // We need to avoid using -1 as a GTID as +1 is added to the gtid // when storing it in a lock, and the value 0 is reserved. 
#define KMP_GTID_DNE (-2) /* Does not exist */ #define KMP_GTID_SHUTDOWN (-3) /* Library is shutting down */ #define KMP_GTID_MONITOR (-4) /* Monitor thread ID */ #define KMP_GTID_UNKNOWN (-5) /* Is not known */ #define KMP_GTID_MIN (-6) /* Minimal gtid for low bound check in DEBUG */ /* OpenMP 5.0 Memory Management support */ #ifndef __OMP_H // Duplicate type definitios from omp.h typedef uintptr_t omp_uintptr_t; typedef enum { OMP_ATK_THREADMODEL = 1, OMP_ATK_ALIGNMENT = 2, OMP_ATK_ACCESS = 3, OMP_ATK_POOL_SIZE = 4, OMP_ATK_FALLBACK = 5, OMP_ATK_FB_DATA = 6, OMP_ATK_PINNED = 7, OMP_ATK_PARTITION = 8 } omp_alloctrait_key_t; typedef enum { OMP_ATV_FALSE = 0, OMP_ATV_TRUE = 1, OMP_ATV_DEFAULT = 2, OMP_ATV_CONTENDED = 3, OMP_ATV_UNCONTENDED = 4, OMP_ATV_SEQUENTIAL = 5, OMP_ATV_PRIVATE = 6, OMP_ATV_ALL = 7, OMP_ATV_THREAD = 8, OMP_ATV_PTEAM = 9, OMP_ATV_CGROUP = 10, OMP_ATV_DEFAULT_MEM_FB = 11, OMP_ATV_NULL_FB = 12, OMP_ATV_ABORT_FB = 13, OMP_ATV_ALLOCATOR_FB = 14, OMP_ATV_ENVIRONMENT = 15, OMP_ATV_NEAREST = 16, OMP_ATV_BLOCKED = 17, OMP_ATV_INTERLEAVED = 18 } omp_alloctrait_value_t; typedef void *omp_memspace_handle_t; extern omp_memspace_handle_t const omp_default_mem_space; extern omp_memspace_handle_t const omp_large_cap_mem_space; extern omp_memspace_handle_t const omp_const_mem_space; extern omp_memspace_handle_t const omp_high_bw_mem_space; extern omp_memspace_handle_t const omp_low_lat_mem_space; typedef struct { omp_alloctrait_key_t key; omp_uintptr_t value; } omp_alloctrait_t; typedef void *omp_allocator_handle_t; extern omp_allocator_handle_t const omp_null_allocator; extern omp_allocator_handle_t const omp_default_mem_alloc; extern omp_allocator_handle_t const omp_large_cap_mem_alloc; extern omp_allocator_handle_t const omp_const_mem_alloc; extern omp_allocator_handle_t const omp_high_bw_mem_alloc; extern omp_allocator_handle_t const omp_low_lat_mem_alloc; extern omp_allocator_handle_t const omp_cgroup_mem_alloc; extern omp_allocator_handle_t const 
omp_pteam_mem_alloc;
extern omp_allocator_handle_t const omp_thread_mem_alloc;
extern omp_allocator_handle_t const kmp_max_mem_alloc;
extern omp_allocator_handle_t __kmp_def_allocator;
// end of duplicate type definitions from omp.h
#endif

extern int __kmp_memkind_available;

typedef omp_memspace_handle_t kmp_memspace_t; // placeholder

typedef struct kmp_allocator_t {
  omp_memspace_handle_t memspace;
  void **memkind; // pointer to memkind
  int alignment;
  omp_alloctrait_value_t fb;
  kmp_allocator_t *fb_data;
  kmp_uint64 pool_size;
  kmp_uint64 pool_used;
} kmp_allocator_t;

extern omp_allocator_handle_t __kmpc_init_allocator(int gtid,
                                                    omp_memspace_handle_t,
                                                    int ntraits,
                                                    omp_alloctrait_t traits[]);
extern void __kmpc_destroy_allocator(int gtid, omp_allocator_handle_t al);
extern void __kmpc_set_default_allocator(int gtid, omp_allocator_handle_t al);
extern omp_allocator_handle_t __kmpc_get_default_allocator(int gtid);
extern void *__kmpc_alloc(int gtid, size_t sz, omp_allocator_handle_t al);
extern void __kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al);

extern void __kmp_init_memkind();
extern void __kmp_fini_memkind();

/* ------------------------------------------------------------------------ */

#define KMP_UINT64_MAX                                                         \
  (~((kmp_uint64)1 << ((sizeof(kmp_uint64) * (1 << 3)) - 1)))

#define KMP_MIN_NTH 1

#ifndef KMP_MAX_NTH
#if defined(PTHREAD_THREADS_MAX) && PTHREAD_THREADS_MAX < INT_MAX
#define KMP_MAX_NTH PTHREAD_THREADS_MAX
#else
#define KMP_MAX_NTH INT_MAX
#endif
#endif /* KMP_MAX_NTH */

#ifdef PTHREAD_STACK_MIN
#define KMP_MIN_STKSIZE PTHREAD_STACK_MIN
#else
#define KMP_MIN_STKSIZE ((size_t)(32 * 1024))
#endif

#define KMP_MAX_STKSIZE (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1)))

#if KMP_ARCH_X86
#define KMP_DEFAULT_STKSIZE ((size_t)(2 * 1024 * 1024))
#elif KMP_ARCH_X86_64
#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024))
#define KMP_BACKUP_STKSIZE ((size_t)(2 * 1024 * 1024))
#else
#define KMP_DEFAULT_STKSIZE ((size_t)(1024 * 1024))
#endif

#define
KMP_DEFAULT_MALLOC_POOL_INCR ((size_t)(1024 * 1024)) #define KMP_MIN_MALLOC_POOL_INCR ((size_t)(4 * 1024)) #define KMP_MAX_MALLOC_POOL_INCR \ (~((size_t)1 << ((sizeof(size_t) * (1 << 3)) - 1))) #define KMP_MIN_STKOFFSET (0) #define KMP_MAX_STKOFFSET KMP_MAX_STKSIZE #if KMP_OS_DARWIN #define KMP_DEFAULT_STKOFFSET KMP_MIN_STKOFFSET #else #define KMP_DEFAULT_STKOFFSET CACHE_LINE #endif #define KMP_MIN_STKPADDING (0) #define KMP_MAX_STKPADDING (2 * 1024 * 1024) #define KMP_BLOCKTIME_MULTIPLIER \ (1000) /* number of blocktime units per second */ #define KMP_MIN_BLOCKTIME (0) #define KMP_MAX_BLOCKTIME \ (INT_MAX) /* Must be this for "infinite" setting the work */ #define KMP_DEFAULT_BLOCKTIME (200) /* __kmp_blocktime is in milliseconds */ #if KMP_USE_MONITOR #define KMP_DEFAULT_MONITOR_STKSIZE ((size_t)(64 * 1024)) #define KMP_MIN_MONITOR_WAKEUPS (1) // min times monitor wakes up per second #define KMP_MAX_MONITOR_WAKEUPS (1000) // max times monitor can wake up per sec /* Calculate new number of monitor wakeups for a specific block time based on previous monitor_wakeups. Only allow increasing number of wakeups */ #define KMP_WAKEUPS_FROM_BLOCKTIME(blocktime, monitor_wakeups) \ (((blocktime) == KMP_MAX_BLOCKTIME) \ ? (monitor_wakeups) \ : ((blocktime) == KMP_MIN_BLOCKTIME) \ ? KMP_MAX_MONITOR_WAKEUPS \ : ((monitor_wakeups) > (KMP_BLOCKTIME_MULTIPLIER / (blocktime))) \ ? (monitor_wakeups) \ : (KMP_BLOCKTIME_MULTIPLIER) / (blocktime)) /* Calculate number of intervals for a specific block time based on monitor_wakeups */ #define KMP_INTERVALS_FROM_BLOCKTIME(blocktime, monitor_wakeups) \ (((blocktime) + (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups)) - 1) / \ (KMP_BLOCKTIME_MULTIPLIER / (monitor_wakeups))) #else #define KMP_BLOCKTIME(team, tid) \ (get__bt_set(team, tid) ? get__blocktime(team, tid) : __kmp_dflt_blocktime) #if KMP_OS_UNIX && (KMP_ARCH_X86 || KMP_ARCH_X86_64) // HW TSC is used to reduce overhead (clock tick instead of nanosecond). 
extern kmp_uint64 __kmp_ticks_per_msec; #if KMP_COMPILER_ICC #define KMP_NOW() ((kmp_uint64)_rdtsc()) #else #define KMP_NOW() __kmp_hardware_timestamp() #endif #define KMP_NOW_MSEC() (KMP_NOW() / __kmp_ticks_per_msec) #define KMP_BLOCKTIME_INTERVAL(team, tid) \ (KMP_BLOCKTIME(team, tid) * __kmp_ticks_per_msec) #define KMP_BLOCKING(goal, count) ((goal) > KMP_NOW()) #else // System time is retrieved sporadically while blocking. extern kmp_uint64 __kmp_now_nsec(); #define KMP_NOW() __kmp_now_nsec() #define KMP_NOW_MSEC() (KMP_NOW() / KMP_USEC_PER_SEC) #define KMP_BLOCKTIME_INTERVAL(team, tid) \ (KMP_BLOCKTIME(team, tid) * KMP_USEC_PER_SEC) #define KMP_BLOCKING(goal, count) ((count) % 1000 != 0 || (goal) > KMP_NOW()) #endif #endif // KMP_USE_MONITOR #define KMP_MIN_STATSCOLS 40 #define KMP_MAX_STATSCOLS 4096 #define KMP_DEFAULT_STATSCOLS 80 #define KMP_MIN_INTERVAL 0 #define KMP_MAX_INTERVAL (INT_MAX - 1) #define KMP_DEFAULT_INTERVAL 0 #define KMP_MIN_CHUNK 1 #define KMP_MAX_CHUNK (INT_MAX - 1) #define KMP_DEFAULT_CHUNK 1 #define KMP_DFLT_DISP_NUM_BUFF 7 #define KMP_MAX_ORDERED 8 #define KMP_MAX_FIELDS 32 #define KMP_MAX_BRANCH_BITS 31 #define KMP_MAX_ACTIVE_LEVELS_LIMIT INT_MAX #define KMP_MAX_DEFAULT_DEVICE_LIMIT INT_MAX #define KMP_MAX_TASK_PRIORITY_LIMIT INT_MAX /* Minimum number of threads before switch to TLS gtid (experimentally determined) */ /* josh TODO: what about OS X* tuning? 
*/
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define KMP_TLS_GTID_MIN 5
#else
#define KMP_TLS_GTID_MIN INT_MAX
#endif

#define KMP_MASTER_TID(tid) ((tid) == 0)
#define KMP_WORKER_TID(tid) ((tid) != 0)

#define KMP_MASTER_GTID(gtid) (__kmp_tid_from_gtid((gtid)) == 0)
#define KMP_WORKER_GTID(gtid) (__kmp_tid_from_gtid((gtid)) != 0)
#define KMP_INITIAL_GTID(gtid) ((gtid) == 0)

#ifndef TRUE
#define FALSE 0
#define TRUE (!FALSE)
#endif

/* NOTE: all of the following constants must be even */

#if KMP_OS_WINDOWS
#define KMP_INIT_WAIT 64U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 32U /* subsequent number of spin-tests */
#elif KMP_OS_CNK
#define KMP_INIT_WAIT 16U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 8U /* subsequent number of spin-tests */
#elif KMP_OS_LINUX
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
#elif KMP_OS_DARWIN
/* TODO: tune for KMP_OS_DARWIN */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
#elif KMP_OS_DRAGONFLY
/* TODO: tune for KMP_OS_DRAGONFLY */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
#elif KMP_OS_FREEBSD
/* TODO: tune for KMP_OS_FREEBSD */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
#elif KMP_OS_NETBSD
/* TODO: tune for KMP_OS_NETBSD */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
#elif KMP_OS_HURD
/* TODO: tune for KMP_OS_HURD */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* subsequent number of spin-tests */
#elif KMP_OS_OPENBSD
/* TODO: tune for KMP_OS_OPENBSD */
#define KMP_INIT_WAIT 1024U /* initial number of spin-tests */
#define KMP_NEXT_WAIT 512U /* subsequent number of
spin-tests */ #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 typedef struct kmp_cpuid { kmp_uint32 eax; kmp_uint32 ebx; kmp_uint32 ecx; kmp_uint32 edx; } kmp_cpuid_t; typedef struct kmp_cpuinfo { int initialized; // If 0, other fields are not initialized. int signature; // CPUID(1).EAX int family; // CPUID(1).EAX[27:20]+CPUID(1).EAX[11:8] (Extended Family+Family) int model; // ( CPUID(1).EAX[19:16] << 4 ) + CPUID(1).EAX[7:4] ( ( Extended // Model << 4 ) + Model) int stepping; // CPUID(1).EAX[3:0] ( Stepping ) int sse2; // 0 if SSE2 instructions are not supported, 1 otherwise. int rtm; // 0 if RTM instructions are not supported, 1 otherwise. int cpu_stackoffset; int apic_id; int physical_id; int logical_id; kmp_uint64 frequency; // Nominal CPU frequency in Hz. char name[3 * sizeof(kmp_cpuid_t)]; // CPUID(0x80000002,0x80000003,0x80000004) } kmp_cpuinfo_t; extern void __kmp_query_cpuid(kmp_cpuinfo_t *p); #if KMP_OS_UNIX // subleaf is only needed for cache and topology discovery and can be set to // zero in most cases static inline void __kmp_x86_cpuid(int leaf, int subleaf, struct kmp_cpuid *p) { __asm__ __volatile__("cpuid" : "=a"(p->eax), "=b"(p->ebx), "=c"(p->ecx), "=d"(p->edx) : "a"(leaf), "c"(subleaf)); } // Load p into FPU control word static inline void __kmp_load_x87_fpu_control_word(const kmp_int16 *p) { __asm__ __volatile__("fldcw %0" : : "m"(*p)); } // Store FPU control word into p static inline void __kmp_store_x87_fpu_control_word(kmp_int16 *p) { __asm__ __volatile__("fstcw %0" : "=m"(*p)); } static inline void __kmp_clear_x87_fpu_status_word() { #if KMP_MIC // 32-bit protected mode x87 FPU state struct x87_fpu_state { unsigned cw; unsigned sw; unsigned tw; unsigned fip; unsigned fips; unsigned fdp; unsigned fds; }; struct x87_fpu_state fpu_state = {0, 0, 0, 0, 0, 0, 0}; __asm__ __volatile__("fstenv %0\n\t" // store FP env "andw $0x7f00, %1\n\t" // clear 0-7,15 bits of FP SW "fldenv %0\n\t" // load FP env back : "+m"(fpu_state), "+m"(fpu_state.sw)); #else 
__asm__ __volatile__("fnclex"); #endif // KMP_MIC } #if __SSE__ static inline void __kmp_load_mxcsr(const kmp_uint32 *p) { _mm_setcsr(*p); } static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); } #else static inline void __kmp_load_mxcsr(const kmp_uint32 *p) {} static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = 0; } #endif #else // Windows still has these as external functions in assembly file extern void __kmp_x86_cpuid(int mode, int mode2, struct kmp_cpuid *p); extern void __kmp_load_x87_fpu_control_word(const kmp_int16 *p); extern void __kmp_store_x87_fpu_control_word(kmp_int16 *p); extern void __kmp_clear_x87_fpu_status_word(); static inline void __kmp_load_mxcsr(const kmp_uint32 *p) { _mm_setcsr(*p); } static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); } #endif // KMP_OS_UNIX #define KMP_X86_MXCSR_MASK 0xffffffc0 /* ignore status flags (6 lsb) */ #if KMP_ARCH_X86 extern void __kmp_x86_pause(void); #elif KMP_MIC // Performance testing on KNC (C0QS-7120 P/A/X/D, 61-core, 16 GB Memory) showed // regression after removal of extra PAUSE from spin loops. Changing // the delay from 100 to 300 showed even better performance than double PAUSE // on Spec OMP2001 and LCPC tasking tests, no regressions on EPCC. static inline void __kmp_x86_pause(void) { _mm_delay_32(300); } #else static inline void __kmp_x86_pause(void) { _mm_pause(); } #endif #define KMP_CPU_PAUSE() __kmp_x86_pause() #elif KMP_ARCH_PPC64 #define KMP_PPC64_PRI_LOW() __asm__ volatile("or 1, 1, 1") #define KMP_PPC64_PRI_MED() __asm__ volatile("or 2, 2, 2") #define KMP_PPC64_PRI_LOC_MB() __asm__ volatile("" : : : "memory") #define KMP_CPU_PAUSE() \ do { \ KMP_PPC64_PRI_LOW(); \ KMP_PPC64_PRI_MED(); \ KMP_PPC64_PRI_LOC_MB(); \ } while (0) #else #define KMP_CPU_PAUSE() /* nothing to do */ #endif #define KMP_INIT_YIELD(count) \ { (count) = __kmp_yield_init; } #define KMP_OVERSUBSCRIBED \ (TCR_4(__kmp_nth) > (__kmp_avail_proc ? 
__kmp_avail_proc : __kmp_xproc)) #define KMP_TRY_YIELD \ ((__kmp_use_yield == 1) || (__kmp_use_yield == 2 && (KMP_OVERSUBSCRIBED))) #define KMP_TRY_YIELD_OVERSUB \ ((__kmp_use_yield == 1 || __kmp_use_yield == 2) && (KMP_OVERSUBSCRIBED)) #define KMP_YIELD(cond) \ { \ KMP_CPU_PAUSE(); \ if ((cond) && (KMP_TRY_YIELD)) \ __kmp_yield(); \ } #define KMP_YIELD_OVERSUB() \ { \ KMP_CPU_PAUSE(); \ if ((KMP_TRY_YIELD_OVERSUB)) \ __kmp_yield(); \ } // Note the decrement of 2 in the following Macros. With KMP_LIBRARY=turnaround, // there should be no yielding since initial value from KMP_INIT_YIELD() is odd. #define KMP_YIELD_SPIN(count) \ { \ KMP_CPU_PAUSE(); \ if (KMP_TRY_YIELD) { \ (count) -= 2; \ if (!(count)) { \ __kmp_yield(); \ (count) = __kmp_yield_next; \ } \ } \ } #define KMP_YIELD_OVERSUB_ELSE_SPIN(count) \ { \ KMP_CPU_PAUSE(); \ if ((KMP_TRY_YIELD_OVERSUB)) \ __kmp_yield(); \ else if (__kmp_use_yield == 1) { \ (count) -= 2; \ if (!(count)) { \ __kmp_yield(); \ (count) = __kmp_yield_next; \ } \ } \ } /* ------------------------------------------------------------------------ */ /* Support datatypes for the orphaned construct nesting checks. 
*/ /* ------------------------------------------------------------------------ */ enum cons_type { ct_none, ct_parallel, ct_pdo, ct_pdo_ordered, ct_psections, ct_psingle, ct_critical, ct_ordered_in_parallel, ct_ordered_in_pdo, ct_master, ct_reduce, ct_barrier }; #define IS_CONS_TYPE_ORDERED(ct) ((ct) == ct_pdo_ordered) struct cons_data { ident_t const *ident; enum cons_type type; int prev; kmp_user_lock_p name; /* address exclusively for critical section name comparison */ }; struct cons_header { int p_top, w_top, s_top; int stack_size, stack_top; struct cons_data *stack_data; }; struct kmp_region_info { char *text; int offset[KMP_MAX_FIELDS]; int length[KMP_MAX_FIELDS]; }; /* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */ #if KMP_OS_WINDOWS typedef HANDLE kmp_thread_t; typedef DWORD kmp_key_t; #endif /* KMP_OS_WINDOWS */ #if KMP_OS_UNIX typedef pthread_t kmp_thread_t; typedef pthread_key_t kmp_key_t; #endif extern kmp_key_t __kmp_gtid_threadprivate_key; typedef struct kmp_sys_info { long maxrss; /* the maximum resident set size utilized (in kilobytes) */ long minflt; /* the number of page faults serviced without any I/O */ long majflt; /* the number of page faults serviced that required I/O */ long nswap; /* the number of times a process was "swapped" out of memory */ long inblock; /* the number of times the file system had to perform input */ long oublock; /* the number of times the file system had to perform output */ long nvcsw; /* the number of times a context switch was voluntarily */ long nivcsw; /* the number of times a context switch was forced */ } kmp_sys_info_t; #if USE_ITT_BUILD // We cannot include "kmp_itt.h" due to circular dependency. Declare the only // required type here. Later we will check the type meets requirements. typedef int kmp_itt_mark_t; #define KMP_ITT_DEBUG 0 #endif /* USE_ITT_BUILD */ typedef kmp_int32 kmp_critical_name[8]; /*! 
@ingroup PARALLEL
The type for a microtask which gets passed to @ref __kmpc_fork_call().
The arguments to the outlined function are
@param global_tid the global thread identity of the thread executing the
function.
@param bound_tid the local identity of the thread executing the function
@param ... pointers to shared variables accessed by the function.
*/
typedef void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...);
typedef void (*kmpc_micro_bound)(kmp_int32 *bound_tid, kmp_int32 *bound_nth,
                                 ...);

/*!
@ingroup THREADPRIVATE
@{
*/
/* --------------------------------------------------------------------------- */
/* Threadprivate initialization/finalization function declarations */

/* for non-array objects: __kmpc_threadprivate_register() */

/*!
 Pointer to the constructor function.
 The first argument is the this pointer
*/
typedef void *(*kmpc_ctor)(void *);

/*!
 Pointer to the destructor function.
 The first argument is the this pointer
*/
typedef void (*kmpc_dtor)(
    void * /*, size_t */); /* 2nd arg: magic number for KCC unused by Intel
                              compiler */

/*!
 Pointer to an alternate constructor.
 The first argument is the this pointer.
*/
typedef void *(*kmpc_cctor)(void *, void *);

/* for array objects: __kmpc_threadprivate_register_vec() */
/* First arg: "this" pointer */
/* Last arg: number of array elements */

/*!
 Array constructor.
 First argument is the this pointer
 Second argument the number of array elements.
*/
typedef void *(*kmpc_ctor_vec)(void *, size_t);
/*!
 Pointer to the array destructor function.
 The first argument is the this pointer
 Second argument the number of array elements.
*/
typedef void (*kmpc_dtor_vec)(void *, size_t);
/*!
 Array constructor.
 First argument is the this pointer
 Third argument the number of array elements.
*/
typedef void *(*kmpc_cctor_vec)(void *, void *,
                                size_t); /* function unused by compiler */

/*!
@} */ /* keeps tracked of threadprivate cache allocations for cleanup later */ typedef struct kmp_cached_addr { void **addr; /* address of allocated cache */ void ***compiler_cache; /* pointer to compiler's cache */ void *data; /* pointer to global data */ struct kmp_cached_addr *next; /* pointer to next cached address */ } kmp_cached_addr_t; struct private_data { struct private_data *next; /* The next descriptor in the list */ void *data; /* The data buffer for this descriptor */ int more; /* The repeat count for this descriptor */ size_t size; /* The data size for this descriptor */ }; struct private_common { struct private_common *next; struct private_common *link; void *gbl_addr; void *par_addr; /* par_addr == gbl_addr for MASTER thread */ size_t cmn_size; }; struct shared_common { struct shared_common *next; struct private_data *pod_init; void *obj_init; void *gbl_addr; union { kmpc_ctor ctor; kmpc_ctor_vec ctorv; } ct; union { kmpc_cctor cctor; kmpc_cctor_vec cctorv; } cct; union { kmpc_dtor dtor; kmpc_dtor_vec dtorv; } dt; size_t vec_len; int is_vec; size_t cmn_size; }; #define KMP_HASH_TABLE_LOG2 9 /* log2 of the hash table size */ #define KMP_HASH_TABLE_SIZE \ (1 << KMP_HASH_TABLE_LOG2) /* size of the hash table */ #define KMP_HASH_SHIFT 3 /* throw away this many low bits from the address */ #define KMP_HASH(x) \ ((((kmp_uintptr_t)x) >> KMP_HASH_SHIFT) & (KMP_HASH_TABLE_SIZE - 1)) struct common_table { struct private_common *data[KMP_HASH_TABLE_SIZE]; }; struct shared_table { struct shared_common *data[KMP_HASH_TABLE_SIZE]; }; /* ------------------------------------------------------------------------ */ #if KMP_USE_HIER_SCHED // Shared barrier data that exists inside a single unit of the scheduling // hierarchy typedef struct kmp_hier_private_bdata_t { kmp_int32 num_active; kmp_uint64 index; kmp_uint64 wait_val[2]; } kmp_hier_private_bdata_t; #endif typedef struct kmp_sched_flags { unsigned ordered : 1; unsigned nomerge : 1; unsigned contains_last : 1; 
#if KMP_USE_HIER_SCHED unsigned use_hier : 1; unsigned unused : 28; #else unsigned unused : 29; #endif } kmp_sched_flags_t; KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4); #if KMP_STATIC_STEAL_ENABLED typedef struct KMP_ALIGN_CACHE dispatch_private_info32 { kmp_int32 count; kmp_int32 ub; /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ kmp_int32 lb; kmp_int32 st; kmp_int32 tc; kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put after ub */ // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on ) // a) parm3 is properly aligned and // b) all parm1-4 are in the same cache line. // Because of parm1-4 are used together, performance seems to be better // if they are in the same line (not measured though). struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should kmp_int32 parm2; // make no real change at least while padding is off. kmp_int32 parm3; kmp_int32 parm4; }; kmp_uint32 ordered_lower; kmp_uint32 ordered_upper; #if KMP_OS_WINDOWS // This var can be placed in the hole between 'tc' and 'parm1', instead of // 'static_steal_counter'. It would be nice to measure execution times. // Conditional if/endif can be removed at all. 
kmp_int32 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info32_t; typedef struct KMP_ALIGN_CACHE dispatch_private_info64 { kmp_int64 count; // current chunk number for static & static-steal scheduling kmp_int64 ub; /* upper-bound */ /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */ kmp_int64 lb; /* lower-bound */ kmp_int64 st; /* stride */ kmp_int64 tc; /* trip count (number of iterations) */ kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put after ub */ /* parm[1-4] are used in different ways by different scheduling algorithms */ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) // a) parm3 is properly aligned and // b) all parm1-4 are in the same cache line. // Because of parm1-4 are used together, performance seems to be better // if they are in the same line (not measured though). struct KMP_ALIGN(32) { kmp_int64 parm1; kmp_int64 parm2; kmp_int64 parm3; kmp_int64 parm4; }; kmp_uint64 ordered_lower; kmp_uint64 ordered_upper; #if KMP_OS_WINDOWS // This var can be placed in the hole between 'tc' and 'parm1', instead of // 'static_steal_counter'. It would be nice to measure execution times. // Conditional if/endif can be removed at all. 
kmp_int64 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info64_t; #else /* KMP_STATIC_STEAL_ENABLED */ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 { kmp_int32 lb; kmp_int32 ub; kmp_int32 st; kmp_int32 tc; kmp_int32 parm1; kmp_int32 parm2; kmp_int32 parm3; kmp_int32 parm4; kmp_int32 count; kmp_uint32 ordered_lower; kmp_uint32 ordered_upper; #if KMP_OS_WINDOWS kmp_int32 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info32_t; typedef struct KMP_ALIGN_CACHE dispatch_private_info64 { kmp_int64 lb; /* lower-bound */ kmp_int64 ub; /* upper-bound */ kmp_int64 st; /* stride */ kmp_int64 tc; /* trip count (number of iterations) */ /* parm[1-4] are used in different ways by different scheduling algorithms */ kmp_int64 parm1; kmp_int64 parm2; kmp_int64 parm3; kmp_int64 parm4; kmp_int64 count; /* current chunk number for static scheduling */ kmp_uint64 ordered_lower; kmp_uint64 ordered_upper; #if KMP_OS_WINDOWS kmp_int64 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info64_t; #endif /* KMP_STATIC_STEAL_ENABLED */ typedef struct KMP_ALIGN_CACHE dispatch_private_info { union private_info { dispatch_private_info32_t p32; dispatch_private_info64_t p64; } u; enum sched_type schedule; /* scheduling algorithm */ kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) 
*/ kmp_int32 ordered_bumped; // To retain the structure size after making ordered_iteration scalar kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3]; // Stack of buffers for nest of serial regions struct dispatch_private_info *next; kmp_int32 type_size; /* the size of types in private_info */ #if KMP_USE_HIER_SCHED kmp_int32 hier_id; void *parent; /* hierarchical scheduling parent pointer */ #endif enum cons_type pushed_ws; } dispatch_private_info_t; typedef struct dispatch_shared_info32 { /* chunk index under dynamic, number of idle threads under static-steal; iteration index otherwise */ volatile kmp_uint32 iteration; volatile kmp_uint32 num_done; volatile kmp_uint32 ordered_iteration; // Dummy to retain the structure size after making ordered_iteration scalar kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 1]; } dispatch_shared_info32_t; typedef struct dispatch_shared_info64 { /* chunk index under dynamic, number of idle threads under static-steal; iteration index otherwise */ volatile kmp_uint64 iteration; volatile kmp_uint64 num_done; volatile kmp_uint64 ordered_iteration; // Dummy to retain the structure size after making ordered_iteration scalar kmp_int64 ordered_dummy[KMP_MAX_ORDERED - 3]; } dispatch_shared_info64_t; typedef struct dispatch_shared_info { union shared_info { dispatch_shared_info32_t s32; dispatch_shared_info64_t s64; } u; volatile kmp_uint32 buffer_index; volatile kmp_int32 doacross_buf_idx; // teamwise index volatile kmp_uint32 *doacross_flags; // shared array of iteration flags (0/1) kmp_int32 doacross_num_done; // count finished threads #if KMP_USE_HIER_SCHED void *hier; #endif #if KMP_USE_HWLOC // When linking with libhwloc, the ORDERED EPCC test slows down on big // machines (> 48 cores). Performance analysis showed that a cache thrash // was occurring and this padding helps alleviate the problem. 
char padding[64]; #endif } dispatch_shared_info_t; typedef struct kmp_disp { /* Vector for ORDERED SECTION */ void (*th_deo_fcn)(int *gtid, int *cid, ident_t *); /* Vector for END ORDERED SECTION */ void (*th_dxo_fcn)(int *gtid, int *cid, ident_t *); dispatch_shared_info_t *th_dispatch_sh_current; dispatch_private_info_t *th_dispatch_pr_current; dispatch_private_info_t *th_disp_buffer; kmp_int32 th_disp_index; kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags union { // we can use union here because doacross cannot be used in // nonmonotonic loops kmp_int64 *th_doacross_info; // info on loop bounds kmp_lock_t *th_steal_lock; // lock used for chunk stealing (8-byte variable) }; #if KMP_USE_INTERNODE_ALIGNMENT char more_padding[INTERNODE_CACHE_LINE]; #endif } kmp_disp_t; /* ------------------------------------------------------------------------ */ /* Barrier stuff */ /* constants for barrier state update */ #define KMP_INIT_BARRIER_STATE 0 /* should probably start from zero */ #define KMP_BARRIER_SLEEP_BIT 0 /* bit used for suspend/sleep part of state */ #define KMP_BARRIER_UNUSED_BIT 1 // bit that must never be set for valid state #define KMP_BARRIER_BUMP_BIT 2 /* lsb used for bump of go/arrived state */ #define KMP_BARRIER_SLEEP_STATE (1 << KMP_BARRIER_SLEEP_BIT) #define KMP_BARRIER_UNUSED_STATE (1 << KMP_BARRIER_UNUSED_BIT) #define KMP_BARRIER_STATE_BUMP (1 << KMP_BARRIER_BUMP_BIT) #if (KMP_BARRIER_SLEEP_BIT >= KMP_BARRIER_BUMP_BIT) #error "Barrier sleep bit must be smaller than barrier bump bit" #endif #if (KMP_BARRIER_UNUSED_BIT >= KMP_BARRIER_BUMP_BIT) #error "Barrier unused bit must be smaller than barrier bump bit" #endif // Constants for release barrier wait state: currently, hierarchical only #define KMP_BARRIER_NOT_WAITING 0 // Normal state; worker not in wait_sleep #define KMP_BARRIER_OWN_FLAG \ 1 // Normal state; worker waiting on own b_go flag in release #define 
KMP_BARRIER_PARENT_FLAG \ 2 // Special state; worker waiting on parent's b_go flag in release #define KMP_BARRIER_SWITCH_TO_OWN_FLAG \ 3 // Special state; tells worker to shift from parent to own b_go #define KMP_BARRIER_SWITCHING \ 4 // Special state; worker resets appropriate flag on wake-up #define KMP_NOT_SAFE_TO_REAP \ 0 // Thread th_reap_state: not safe to reap (tasking) #define KMP_SAFE_TO_REAP 1 // Thread th_reap_state: safe to reap (not tasking) enum barrier_type { bs_plain_barrier = 0, /* 0, All non-fork/join barriers (except reduction barriers if enabled) */ bs_forkjoin_barrier, /* 1, All fork/join (parallel region) barriers */ #if KMP_FAST_REDUCTION_BARRIER bs_reduction_barrier, /* 2, All barriers that are used in reduction */ #endif // KMP_FAST_REDUCTION_BARRIER bs_last_barrier /* Just a placeholder to mark the end */ }; // to work with reduction barriers just like with plain barriers #if !KMP_FAST_REDUCTION_BARRIER #define bs_reduction_barrier bs_plain_barrier #endif // KMP_FAST_REDUCTION_BARRIER typedef enum kmp_bar_pat { /* Barrier communication patterns */ bp_linear_bar = 0, /* Single level (degenerate) tree */ bp_tree_bar = 1, /* Balanced tree with branching factor 2^n */ bp_hyper_bar = 2, /* Hypercube-embedded tree with min branching factor 2^n */ bp_hierarchical_bar = 3, /* Machine hierarchy tree */ bp_last_bar /* Placeholder to mark the end */ } kmp_bar_pat_e; #define KMP_BARRIER_ICV_PUSH 1 /* Record for holding the values of the internal controls stack records */ typedef struct kmp_internal_control { int serial_nesting_level; /* corresponds to the value of the th_team_serialized field */ kmp_int8 dynamic; /* internal control for dynamic adjustment of threads (per thread) */ kmp_int8 bt_set; /* internal control for whether blocktime is explicitly set */ int blocktime; /* internal control for blocktime */ #if KMP_USE_MONITOR int bt_intervals; /* internal control for blocktime intervals */ #endif int nproc; /* internal control for #threads for 
next parallel region (per thread) */
  int thread_limit; /* internal control for thread-limit-var */
  int max_active_levels; /* internal control for max_active_levels */
  kmp_r_sched_t sched; /* internal control for runtime schedule
                          {sched,chunk} pair */
  kmp_proc_bind_t proc_bind; /* internal control for affinity */
  kmp_int32 default_device; /* internal control for default device */
  struct kmp_internal_control *next; /* next record on the stack of saved
                                        internal-control records */
} kmp_internal_control_t;

/* Copy a complete set of internal control variables (ICVs) from *src to *dst.
   Plain struct assignment: every field is copied, including the 'next' stack
   link. */
static inline void copy_icvs(kmp_internal_control_t *dst,
                             kmp_internal_control_t *src) {
  *dst = *src;
}

/* Thread barrier needs volatile barrier fields */
typedef struct KMP_ALIGN_CACHE kmp_bstate {
  // th_fixed_icvs is aligned by virtue of kmp_bstate being aligned (and all
  // uses of it). It is not explicitly aligned below, because we *don't* want
  // it to be padded -- instead, we fit b_go into the same cache line with
  // th_fixed_icvs, enabling NGO cache lines stores in the hierarchical barrier.
  kmp_internal_control_t th_fixed_icvs; // Initial ICVs for the thread
  // Tuck b_go into end of th_fixed_icvs cache line, so it can be stored with
  // same NGO store
  volatile kmp_uint64 b_go; // STATE => task should proceed (hierarchical)
  KMP_ALIGN_CACHE volatile kmp_uint64
      b_arrived; // STATE => task reached synch point.
  // NOTE(review): the fields below are bookkeeping for the hierarchical
  // barrier (bp_hierarchical_bar); their exact semantics live in the barrier
  // implementation, not in this header -- confirm there before relying on
  // the brief descriptions given.
  kmp_uint32 *skip_per_level; // per-level data for the barrier tree -- TODO
                              // confirm against barrier implementation
  kmp_uint32 my_level; // presumably this thread's level in the barrier tree
  kmp_int32 parent_tid;
  kmp_int32 old_tid;
  kmp_uint32 depth;
  struct kmp_bstate *parent_bar; // barrier state of the parent thread
  kmp_team_t *team;
  kmp_uint64 leaf_state;
  kmp_uint32 nproc;
  kmp_uint8 base_leaf_kids;
  kmp_uint8 leaf_kids;
  kmp_uint8 offset;
  kmp_uint8 wait_flag;
  kmp_uint8 use_oncore_barrier;
#if USE_DEBUGGER
  // The following field is intended for the debugger solely. Only the worker
  // thread itself accesses this field: the worker increases it by 1 when it
  // arrives to a barrier.
KMP_ALIGN_CACHE kmp_uint b_worker_arrived;
#endif /* USE_DEBUGGER */
} kmp_bstate_t;

union KMP_ALIGN_CACHE kmp_barrier_union {
  double b_align; /* use worst case alignment */
  char b_pad[KMP_PAD(kmp_bstate_t, CACHE_LINE)];
  kmp_bstate_t bb;
};

typedef union kmp_barrier_union kmp_balign_t;

/* Team barrier needs only non-volatile arrived counter */
union KMP_ALIGN_CACHE kmp_barrier_team_union {
  double b_align; /* use worst case alignment */
  char b_pad[CACHE_LINE];
  struct {
    kmp_uint64 b_arrived; /* STATE => task reached synch point. */
#if USE_DEBUGGER
    // The following two fields are intended for the debugger solely. Only
    // master of the team accesses these fields: the first one is increased by
    // 1 when master arrives to a barrier, the second one is increased by one
    // when all the threads arrived.
    kmp_uint b_master_arrived;
    kmp_uint b_team_arrived;
#endif
  };
};

typedef union kmp_barrier_team_union kmp_balign_team_t;

/* Padding for Linux* OS pthreads condition variables and mutexes used to signal
   threads when a condition changes. This is to workaround an NPTL bug where
   padding was added to pthread_cond_t which caused the initialization routine
   to write outside of the structure if compiled on pre-NPTL threads. */
#if KMP_OS_WINDOWS
typedef struct kmp_win32_mutex {
  /* The Lock */
  CRITICAL_SECTION cs;
} kmp_win32_mutex_t;

typedef struct kmp_win32_cond {
  /* Count of the number of waiters. */
  int waiters_count_;

  /* Serialize access to <waiters_count_> */
  kmp_win32_mutex_t waiters_count_lock_;

  /* Number of threads to release via a <cond_broadcast> or a <cond_signal> */
  int release_count_;

  /* Keeps track of the current "generation" so that we don't allow */
  /* one thread to steal all the "releases" from the broadcast. */
  int wait_generation_count_;

  /* A manual-reset event that's used to block and release waiting threads.
*/ HANDLE event_; } kmp_win32_cond_t; #endif #if KMP_OS_UNIX union KMP_ALIGN_CACHE kmp_cond_union { double c_align; char c_pad[CACHE_LINE]; pthread_cond_t c_cond; }; typedef union kmp_cond_union kmp_cond_align_t; union KMP_ALIGN_CACHE kmp_mutex_union { double m_align; char m_pad[CACHE_LINE]; pthread_mutex_t m_mutex; }; typedef union kmp_mutex_union kmp_mutex_align_t; #endif /* KMP_OS_UNIX */ typedef struct kmp_desc_base { void *ds_stackbase; size_t ds_stacksize; int ds_stackgrow; kmp_thread_t ds_thread; volatile int ds_tid; int ds_gtid; #if KMP_OS_WINDOWS volatile int ds_alive; DWORD ds_thread_id; /* ds_thread keeps thread handle on Windows* OS. It is enough for RTL purposes. However, debugger support (libomp_db) cannot work with handles, because they uncomparable. For example, debugger requests info about thread with handle h. h is valid within debugger process, and meaningless within debugee process. Even if h is duped by call to DuplicateHandle(), so the result h' is valid within debugee process, but it is a *new* handle which does *not* equal to any other handle in debugee... The only way to compare handles is convert them to system-wide ids. GetThreadId() function is available only in Longhorn and Server 2003. :-( In contrast, GetCurrentThreadId() is available on all Windows* OS flavours (including Windows* 95). Thus, we have to get thread id by call to GetCurrentThreadId() from within the thread and save it to let libomp_db identify threads. 
*/ #endif /* KMP_OS_WINDOWS */ } kmp_desc_base_t; typedef union KMP_ALIGN_CACHE kmp_desc { double ds_align; /* use worst case alignment */ char ds_pad[KMP_PAD(kmp_desc_base_t, CACHE_LINE)]; kmp_desc_base_t ds; } kmp_desc_t; typedef struct kmp_local { volatile int this_construct; /* count of single's encountered by thread */ void *reduce_data; #if KMP_USE_BGET void *bget_data; void *bget_list; #if !USE_CMP_XCHG_FOR_BGET #ifdef USE_QUEUING_LOCK_FOR_BGET kmp_lock_t bget_lock; /* Lock for accessing bget free list */ #else kmp_bootstrap_lock_t bget_lock; // Lock for accessing bget free list. Must be // bootstrap lock so we can use it at library // shutdown. #endif /* USE_LOCK_FOR_BGET */ #endif /* ! USE_CMP_XCHG_FOR_BGET */ #endif /* KMP_USE_BGET */ PACKED_REDUCTION_METHOD_T packed_reduction_method; /* stored by __kmpc_reduce*(), used by __kmpc_end_reduce*() */ } kmp_local_t; #define KMP_CHECK_UPDATE(a, b) \ if ((a) != (b)) \ (a) = (b) #define KMP_CHECK_UPDATE_SYNC(a, b) \ if ((a) != (b)) \ TCW_SYNC_PTR((a), (b)) #define get__blocktime(xteam, xtid) \ ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime) #define get__bt_set(xteam, xtid) \ ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set) #if KMP_USE_MONITOR #define get__bt_intervals(xteam, xtid) \ ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals) #endif #define get__dynamic_2(xteam, xtid) \ ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.dynamic) #define get__nproc_2(xteam, xtid) \ ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nproc) #define get__sched_2(xteam, xtid) \ ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.sched) #define set__blocktime_team(xteam, xtid, xval) \ (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime) = \ (xval)) #if KMP_USE_MONITOR #define set__bt_intervals_team(xteam, xtid, xval) \ (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals) = \ (xval)) #endif #define 
set__bt_set_team(xteam, xtid, xval) \ (((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set) = (xval)) #define set__dynamic(xthread, xval) \ (((xthread)->th.th_current_task->td_icvs.dynamic) = (xval)) #define get__dynamic(xthread) \ (((xthread)->th.th_current_task->td_icvs.dynamic) ? (FTN_TRUE) : (FTN_FALSE)) #define set__nproc(xthread, xval) \ (((xthread)->th.th_current_task->td_icvs.nproc) = (xval)) #define set__thread_limit(xthread, xval) \ (((xthread)->th.th_current_task->td_icvs.thread_limit) = (xval)) #define set__max_active_levels(xthread, xval) \ (((xthread)->th.th_current_task->td_icvs.max_active_levels) = (xval)) #define get__max_active_levels(xthread) \ ((xthread)->th.th_current_task->td_icvs.max_active_levels) #define set__sched(xthread, xval) \ (((xthread)->th.th_current_task->td_icvs.sched) = (xval)) #define set__proc_bind(xthread, xval) \ (((xthread)->th.th_current_task->td_icvs.proc_bind) = (xval)) #define get__proc_bind(xthread) \ ((xthread)->th.th_current_task->td_icvs.proc_bind) // OpenMP tasking data structures typedef enum kmp_tasking_mode { tskm_immediate_exec = 0, tskm_extra_barrier = 1, tskm_task_teams = 2, tskm_max = 2 } kmp_tasking_mode_t; extern kmp_tasking_mode_t __kmp_tasking_mode; /* determines how/when to execute tasks */ extern int __kmp_task_stealing_constraint; extern int __kmp_enable_task_throttling; extern kmp_int32 __kmp_default_device; // Set via OMP_DEFAULT_DEVICE if // specified, defaults to 0 otherwise // Set via OMP_MAX_TASK_PRIORITY if specified, defaults to 0 otherwise extern kmp_int32 __kmp_max_task_priority; // Set via KMP_TASKLOOP_MIN_TASKS if specified, defaults to 0 otherwise extern kmp_uint64 __kmp_taskloop_min_tasks; /* NOTE: kmp_taskdata_t and kmp_task_t structures allocated in single block with taskdata first */ #define KMP_TASK_TO_TASKDATA(task) (((kmp_taskdata_t *)task) - 1) #define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1) // The tt_found_tasks flag is a signal to all threads in 
the team that tasks // were spawned and queued since the previous barrier release. #define KMP_TASKING_ENABLED(task_team) \ (TCR_SYNC_4((task_team)->tt.tt_found_tasks) == TRUE) /*! @ingroup BASIC_TYPES @{ */ /*! */ typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, void *); typedef union kmp_cmplrdata { kmp_int32 priority; /**< priority specified by user for the task */ kmp_routine_entry_t destructors; /* pointer to function to invoke deconstructors of firstprivate C++ objects */ /* future data */ } kmp_cmplrdata_t; /* sizeof_kmp_task_t passed as arg to kmpc_omp_task call */ /*! */ typedef struct kmp_task { /* GEH: Shouldn't this be aligned somehow? */ void *shareds; /**< pointer to block of pointers to shared vars */ kmp_routine_entry_t routine; /**< pointer to routine to call for executing task */ kmp_int32 part_id; /**< part id for the task */ kmp_cmplrdata_t data1; /* Two known optional additions: destructors and priority */ kmp_cmplrdata_t data2; /* Process destructors first, priority second */ /* future data */ /* private vars */ } kmp_task_t; /*! 
@} */ typedef struct kmp_taskgroup { std::atomic count; // number of allocated and incomplete tasks std::atomic cancel_request; // request for cancellation of this taskgroup struct kmp_taskgroup *parent; // parent taskgroup // Block of data to perform task reduction void *reduce_data; // reduction related info kmp_int32 reduce_num_data; // number of data items to reduce } kmp_taskgroup_t; // forward declarations typedef union kmp_depnode kmp_depnode_t; typedef struct kmp_depnode_list kmp_depnode_list_t; typedef struct kmp_dephash_entry kmp_dephash_entry_t; // Compiler sends us this info: typedef struct kmp_depend_info { kmp_intptr_t base_addr; size_t len; struct { bool in : 1; bool out : 1; bool mtx : 1; } flags; } kmp_depend_info_t; // Internal structures to work with task dependencies: struct kmp_depnode_list { kmp_depnode_t *node; kmp_depnode_list_t *next; }; // Max number of mutexinoutset dependencies per node #define MAX_MTX_DEPS 4 typedef struct kmp_base_depnode { kmp_depnode_list_t *successors; /* used under lock */ kmp_task_t *task; /* non-NULL if depnode is active, used under lock */ kmp_lock_t *mtx_locks[MAX_MTX_DEPS]; /* lock mutexinoutset dependent tasks */ kmp_int32 mtx_num_locks; /* number of locks in mtx_locks array */ kmp_lock_t lock; /* guards shared fields: task, successors */ #if KMP_SUPPORT_GRAPH_OUTPUT kmp_uint32 id; #endif std::atomic npredecessors; std::atomic nrefs; } kmp_base_depnode_t; union KMP_ALIGN_CACHE kmp_depnode { double dn_align; /* use worst case alignment */ char dn_pad[KMP_PAD(kmp_base_depnode_t, CACHE_LINE)]; kmp_base_depnode_t dn; }; struct kmp_dephash_entry { kmp_intptr_t addr; kmp_depnode_t *last_out; kmp_depnode_list_t *last_ins; kmp_depnode_list_t *last_mtxs; kmp_int32 last_flag; kmp_lock_t *mtx_lock; /* is referenced by depnodes w/mutexinoutset dep */ kmp_dephash_entry_t *next_in_bucket; }; typedef struct kmp_dephash { kmp_dephash_entry_t **buckets; size_t size; -#ifdef KMP_DEBUG + size_t generation; kmp_uint32 nelements; 
kmp_uint32 nconflicts; -#endif } kmp_dephash_t; typedef struct kmp_task_affinity_info { kmp_intptr_t base_addr; size_t len; struct { bool flag1 : 1; bool flag2 : 1; kmp_int32 reserved : 30; } flags; } kmp_task_affinity_info_t; typedef enum kmp_event_type_t { KMP_EVENT_UNINITIALIZED = 0, KMP_EVENT_ALLOW_COMPLETION = 1 } kmp_event_type_t; typedef struct { kmp_event_type_t type; kmp_tas_lock_t lock; union { kmp_task_t *task; } ed; } kmp_event_t; #ifdef BUILD_TIED_TASK_STACK /* Tied Task stack definitions */ typedef struct kmp_stack_block { kmp_taskdata_t *sb_block[TASK_STACK_BLOCK_SIZE]; struct kmp_stack_block *sb_next; struct kmp_stack_block *sb_prev; } kmp_stack_block_t; typedef struct kmp_task_stack { kmp_stack_block_t ts_first_block; // first block of stack entries kmp_taskdata_t **ts_top; // pointer to the top of stack kmp_int32 ts_entries; // number of entries on the stack } kmp_task_stack_t; #endif // BUILD_TIED_TASK_STACK typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */ /* Compiler flags */ /* Total compiler flags must be 16 bits */ unsigned tiedness : 1; /* task is either tied (1) or untied (0) */ unsigned final : 1; /* task is final(1) so execute immediately */ unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0 code path */ unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to invoke destructors from the runtime */ unsigned proxy : 1; /* task is a proxy task (it will be executed outside the context of the RTL) */ unsigned priority_specified : 1; /* set if the compiler provides priority setting for the task */ unsigned detachable : 1; /* 1 == can detach */ unsigned reserved : 9; /* reserved for compiler use */ /* Library flags */ /* Total library flags must be 16 bits */ unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */ unsigned task_serial : 1; // task is executed immediately (1) or deferred (0) unsigned tasking_ser : 1; // all tasks in team are either executed 
immediately // (1) or may be deferred (0) unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel // (0) [>= 2 threads] /* If either team_serial or tasking_ser is set, task team may be NULL */ /* Task State Flags: */ unsigned started : 1; /* 1==started, 0==not started */ unsigned executing : 1; /* 1==executing, 0==not executing */ unsigned complete : 1; /* 1==complete, 0==not complete */ unsigned freed : 1; /* 1==freed, 0==allocateed */ unsigned native : 1; /* 1==gcc-compiled task, 0==intel */ unsigned reserved31 : 7; /* reserved for library use */ } kmp_tasking_flags_t; struct kmp_taskdata { /* aligned during dynamic allocation */ kmp_int32 td_task_id; /* id, assigned by debugger */ kmp_tasking_flags_t td_flags; /* task flags */ kmp_team_t *td_team; /* team for this task */ kmp_info_p *td_alloc_thread; /* thread that allocated data structures */ /* Currently not used except for perhaps IDB */ kmp_taskdata_t *td_parent; /* parent task */ kmp_int32 td_level; /* task nesting level */ std::atomic td_untied_count; // untied task active parts counter ident_t *td_ident; /* task identifier */ // Taskwait data. ident_t *td_taskwait_ident; kmp_uint32 td_taskwait_counter; kmp_int32 td_taskwait_thread; /* gtid + 1 of thread encountered taskwait */ KMP_ALIGN_CACHE kmp_internal_control_t td_icvs; /* Internal control variables for the task */ KMP_ALIGN_CACHE std::atomic td_allocated_child_tasks; /* Child tasks (+ current task) not yet deallocated */ std::atomic td_incomplete_child_tasks; /* Child tasks not yet complete */ kmp_taskgroup_t *td_taskgroup; // Each task keeps pointer to its current taskgroup kmp_dephash_t *td_dephash; // Dependencies for children tasks are tracked from here kmp_depnode_t *td_depnode; // Pointer to graph node if this task has dependencies kmp_task_team_t *td_task_team; kmp_int32 td_size_alloc; // The size of task structure, including shareds etc. 
#if defined(KMP_GOMP_COMPAT) // 4 or 8 byte integers for the loop bounds in GOMP_taskloop kmp_int32 td_size_loop_bounds; #endif kmp_taskdata_t *td_last_tied; // keep tied task for task scheduling constraint #if defined(KMP_GOMP_COMPAT) // GOMP sends in a copy function for copy constructors void (*td_copy_func)(void *, void *); #endif kmp_event_t td_allow_completion_event; #if OMPT_SUPPORT ompt_task_info_t ompt_task_info; #endif }; // struct kmp_taskdata // Make sure padding above worked KMP_BUILD_ASSERT(sizeof(kmp_taskdata_t) % sizeof(void *) == 0); // Data for task team but per thread typedef struct kmp_base_thread_data { kmp_info_p *td_thr; // Pointer back to thread info // Used only in __kmp_execute_tasks_template, maybe not avail until task is // queued? kmp_bootstrap_lock_t td_deque_lock; // Lock for accessing deque kmp_taskdata_t * *td_deque; // Deque of tasks encountered by td_thr, dynamically allocated kmp_int32 td_deque_size; // Size of deck kmp_uint32 td_deque_head; // Head of deque (will wrap) kmp_uint32 td_deque_tail; // Tail of deque (will wrap) kmp_int32 td_deque_ntasks; // Number of tasks in deque // GEH: shouldn't this be volatile since used in while-spin? 
kmp_int32 td_deque_last_stolen; // Thread number of last successful steal #ifdef BUILD_TIED_TASK_STACK kmp_task_stack_t td_susp_tied_tasks; // Stack of suspended tied tasks for task // scheduling constraint #endif // BUILD_TIED_TASK_STACK } kmp_base_thread_data_t; #define TASK_DEQUE_BITS 8 // Used solely to define INITIAL_TASK_DEQUE_SIZE #define INITIAL_TASK_DEQUE_SIZE (1 << TASK_DEQUE_BITS) #define TASK_DEQUE_SIZE(td) ((td).td_deque_size) #define TASK_DEQUE_MASK(td) ((td).td_deque_size - 1) typedef union KMP_ALIGN_CACHE kmp_thread_data { kmp_base_thread_data_t td; double td_align; /* use worst case alignment */ char td_pad[KMP_PAD(kmp_base_thread_data_t, CACHE_LINE)]; } kmp_thread_data_t; // Data for task teams which are used when tasking is enabled for the team typedef struct kmp_base_task_team { kmp_bootstrap_lock_t tt_threads_lock; /* Lock used to allocate per-thread part of task team */ /* must be bootstrap lock since used at library shutdown*/ kmp_task_team_t *tt_next; /* For linking the task team free list */ kmp_thread_data_t *tt_threads_data; /* Array of per-thread structures for task team */ /* Data survives task team deallocation */ kmp_int32 tt_found_tasks; /* Have we found tasks and queued them while executing this team? 
*/ /* TRUE means tt_threads_data is set up and initialized */ kmp_int32 tt_nproc; /* #threads in team */ kmp_int32 tt_max_threads; // # entries allocated for threads_data array kmp_int32 tt_found_proxy_tasks; // found proxy tasks since last barrier kmp_int32 tt_untied_task_encountered; KMP_ALIGN_CACHE std::atomic tt_unfinished_threads; /* #threads still active */ KMP_ALIGN_CACHE volatile kmp_uint32 tt_active; /* is the team still actively executing tasks */ } kmp_base_task_team_t; union KMP_ALIGN_CACHE kmp_task_team { kmp_base_task_team_t tt; double tt_align; /* use worst case alignment */ char tt_pad[KMP_PAD(kmp_base_task_team_t, CACHE_LINE)]; }; #if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5) // Free lists keep same-size free memory slots for fast memory allocation // routines typedef struct kmp_free_list { void *th_free_list_self; // Self-allocated tasks free list void *th_free_list_sync; // Self-allocated tasks stolen/returned by other // threads void *th_free_list_other; // Non-self free list (to be returned to owner's // sync list) } kmp_free_list_t; #endif #if KMP_NESTED_HOT_TEAMS // Hot teams array keeps hot teams and their sizes for given thread. Hot teams // are not put in teams pool, and they don't put threads in threads pool. typedef struct kmp_hot_team_ptr { kmp_team_p *hot_team; // pointer to hot_team of given nesting level kmp_int32 hot_team_nth; // number of threads allocated for the hot_team } kmp_hot_team_ptr_t; #endif typedef struct kmp_teams_size { kmp_int32 nteams; // number of teams in a league kmp_int32 nth; // number of threads in each team of the league } kmp_teams_size_t; // This struct stores a thread that acts as a "root" for a contention // group. Contention groups are rooted at kmp_root threads, but also at // each master thread of each team created in the teams construct. 
// This struct therefore also stores a thread_limit associated with // that contention group, and a counter to track the number of threads // active in that contention group. Each thread has a list of these: CG // root threads have an entry in their list in which cg_root refers to // the thread itself, whereas other workers in the CG will have a // single entry where cg_root is same as the entry containing their CG // root. When a thread encounters a teams construct, it will add a new // entry to the front of its list, because it now roots a new CG. typedef struct kmp_cg_root { kmp_info_p *cg_root; // "root" thread for a contention group // The CG root's limit comes from OMP_THREAD_LIMIT for root threads, or // thread_limit clause for teams masters kmp_int32 cg_thread_limit; kmp_int32 cg_nthreads; // Count of active threads in CG rooted at cg_root struct kmp_cg_root *up; // pointer to higher level CG root in list } kmp_cg_root_t; // OpenMP thread data structures typedef struct KMP_ALIGN_CACHE kmp_base_info { /* Start with the readonly data which is cache aligned and padded. This is written before the thread starts working by the master. Uber masters may update themselves later. Usage does not consider serialized regions. 
*/ kmp_desc_t th_info; kmp_team_p *th_team; /* team we belong to */ kmp_root_p *th_root; /* pointer to root of task hierarchy */ kmp_info_p *th_next_pool; /* next available thread in the pool */ kmp_disp_t *th_dispatch; /* thread's dispatch data */ int th_in_pool; /* in thread pool (32 bits for TCR/TCW) */ /* The following are cached from the team info structure */ /* TODO use these in more places as determined to be needed via profiling */ int th_team_nproc; /* number of threads in a team */ kmp_info_p *th_team_master; /* the team's master thread */ int th_team_serialized; /* team is serialized */ microtask_t th_teams_microtask; /* save entry address for teams construct */ int th_teams_level; /* save initial level of teams construct */ /* it is 0 on device but may be any on host */ /* The blocktime info is copied from the team struct to the thread sruct */ /* at the start of a barrier, and the values stored in the team are used */ /* at points in the code where the team struct is no longer guaranteed */ /* to exist (from the POV of worker threads). 
*/ #if KMP_USE_MONITOR int th_team_bt_intervals; int th_team_bt_set; #else kmp_uint64 th_team_bt_intervals; #endif #if KMP_AFFINITY_SUPPORTED kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */ #endif omp_allocator_handle_t th_def_allocator; /* default allocator */ /* The data set by the master at reinit, then R/W by the worker */ KMP_ALIGN_CACHE int th_set_nproc; /* if > 0, then only use this request for the next fork */ #if KMP_NESTED_HOT_TEAMS kmp_hot_team_ptr_t *th_hot_teams; /* array of hot teams */ #endif kmp_proc_bind_t th_set_proc_bind; /* if != proc_bind_default, use request for next fork */ kmp_teams_size_t th_teams_size; /* number of teams/threads in teams construct */ #if KMP_AFFINITY_SUPPORTED int th_current_place; /* place currently bound to */ int th_new_place; /* place to bind to in par reg */ int th_first_place; /* first place in partition */ int th_last_place; /* last place in partition */ #endif int th_prev_level; /* previous level for affinity format */ int th_prev_num_threads; /* previous num_threads for affinity format */ #if USE_ITT_BUILD kmp_uint64 th_bar_arrive_time; /* arrival to barrier timestamp */ kmp_uint64 th_bar_min_time; /* minimum arrival time at the barrier */ kmp_uint64 th_frame_time; /* frame timestamp */ #endif /* USE_ITT_BUILD */ kmp_local_t th_local; struct private_common *th_pri_head; /* Now the data only used by the worker (after initial allocation) */ /* TODO the first serial team should actually be stored in the info_t structure. 
this will help reduce initial allocation overhead */ KMP_ALIGN_CACHE kmp_team_p *th_serial_team; /*serialized team held in reserve*/ #if OMPT_SUPPORT ompt_thread_info_t ompt_thread_info; #endif /* The following are also read by the master during reinit */ struct common_table *th_pri_common; volatile kmp_uint32 th_spin_here; /* thread-local location for spinning */ /* while awaiting queuing lock acquire */ volatile void *th_sleep_loc; // this points at a kmp_flag ident_t *th_ident; unsigned th_x; // Random number generator data unsigned th_a; // Random number generator data /* Tasking-related data for the thread */ kmp_task_team_t *th_task_team; // Task team struct kmp_taskdata_t *th_current_task; // Innermost Task being executed kmp_uint8 th_task_state; // alternating 0/1 for task team identification kmp_uint8 *th_task_state_memo_stack; // Stack holding memos of th_task_state // at nested levels kmp_uint32 th_task_state_top; // Top element of th_task_state_memo_stack kmp_uint32 th_task_state_stack_sz; // Size of th_task_state_memo_stack kmp_uint32 th_reap_state; // Non-zero indicates thread is not // tasking, thus safe to reap /* More stuff for keeping track of active/sleeping threads (this part is written by the worker thread) */ kmp_uint8 th_active_in_pool; // included in count of #active threads in pool int th_active; // ! sleeping; 32 bits for TCR/TCW struct cons_header *th_cons; // used for consistency check #if KMP_USE_HIER_SCHED // used for hierarchical scheduling kmp_hier_private_bdata_t *th_hier_bar_data; #endif /* Add the syncronizing data which is cache aligned and padded. 
*/ KMP_ALIGN_CACHE kmp_balign_t th_bar[bs_last_barrier]; KMP_ALIGN_CACHE volatile kmp_int32 th_next_waiting; /* gtid+1 of next thread on lock wait queue, 0 if none */ #if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5) #define NUM_LISTS 4 kmp_free_list_t th_free_lists[NUM_LISTS]; // Free lists for fast memory // allocation routines #endif #if KMP_OS_WINDOWS kmp_win32_cond_t th_suspend_cv; kmp_win32_mutex_t th_suspend_mx; std::atomic th_suspend_init; #endif #if KMP_OS_UNIX kmp_cond_align_t th_suspend_cv; kmp_mutex_align_t th_suspend_mx; std::atomic th_suspend_init_count; #endif #if USE_ITT_BUILD kmp_itt_mark_t th_itt_mark_single; // alignment ??? #endif /* USE_ITT_BUILD */ #if KMP_STATS_ENABLED kmp_stats_list *th_stats; #endif #if KMP_OS_UNIX std::atomic th_blocking; #endif kmp_cg_root_t *th_cg_roots; // list of cg_roots associated with this thread } kmp_base_info_t; typedef union KMP_ALIGN_CACHE kmp_info { double th_align; /* use worst case alignment */ char th_pad[KMP_PAD(kmp_base_info_t, CACHE_LINE)]; kmp_base_info_t th; } kmp_info_t; // OpenMP thread team data structures typedef struct kmp_base_data { volatile kmp_uint32 t_value; } kmp_base_data_t; typedef union KMP_ALIGN_CACHE kmp_sleep_team { double dt_align; /* use worst case alignment */ char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)]; kmp_base_data_t dt; } kmp_sleep_team_t; typedef union KMP_ALIGN_CACHE kmp_ordered_team { double dt_align; /* use worst case alignment */ char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)]; kmp_base_data_t dt; } kmp_ordered_team_t; typedef int (*launch_t)(int gtid); /* Minimum number of ARGV entries to malloc if necessary */ #define KMP_MIN_MALLOC_ARGV_ENTRIES 100 // Set up how many argv pointers will fit in cache lines containing // t_inline_argv. Historically, we have supported at least 96 bytes. Using a // larger value for more space between the master write/worker read section and // read/write by all section seems to buy more performance on EPCC PARALLEL. 
#if KMP_ARCH_X86 || KMP_ARCH_X86_64 #define KMP_INLINE_ARGV_BYTES \ (4 * CACHE_LINE - \ ((3 * KMP_PTR_SKIP + 2 * sizeof(int) + 2 * sizeof(kmp_int8) + \ sizeof(kmp_int16) + sizeof(kmp_uint32)) % \ CACHE_LINE)) #else #define KMP_INLINE_ARGV_BYTES \ (2 * CACHE_LINE - ((3 * KMP_PTR_SKIP + 2 * sizeof(int)) % CACHE_LINE)) #endif #define KMP_INLINE_ARGV_ENTRIES (int)(KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP) typedef struct KMP_ALIGN_CACHE kmp_base_team { // Synchronization Data // --------------------------------------------------------------------------- KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered; kmp_balign_team_t t_bar[bs_last_barrier]; std::atomic t_construct; // count of single directive encountered by team char pad[sizeof(kmp_lock_t)]; // padding to maintain performance on big iron // [0] - parallel / [1] - worksharing task reduction data shared by taskgroups std::atomic t_tg_reduce_data[2]; // to support task modifier std::atomic t_tg_fini_counter[2]; // sync end of task reductions // Master only // --------------------------------------------------------------------------- KMP_ALIGN_CACHE int t_master_tid; // tid of master in parent team int t_master_this_cons; // "this_construct" single counter of master in parent // team ident_t *t_ident; // if volatile, have to change too much other crud to // volatile too kmp_team_p *t_parent; // parent team kmp_team_p *t_next_pool; // next free team in the team pool kmp_disp_t *t_dispatch; // thread's dispatch data kmp_task_team_t *t_task_team[2]; // Task team struct; switch between 2 kmp_proc_bind_t t_proc_bind; // bind type for par region #if USE_ITT_BUILD kmp_uint64 t_region_time; // region begin timestamp #endif /* USE_ITT_BUILD */ // Master write, workers read // -------------------------------------------------------------------------- KMP_ALIGN_CACHE void **t_argv; int t_argc; int t_nproc; // number of threads in team microtask_t t_pkfn; launch_t t_invoke; // procedure to launch the microtask #if OMPT_SUPPORT 
ompt_team_info_t ompt_team_info; ompt_lw_taskteam_t *ompt_serialized_team_info; #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 kmp_int8 t_fp_control_saved; kmp_int8 t_pad2b; kmp_int16 t_x87_fpu_control_word; // FP control regs kmp_uint32 t_mxcsr; #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ void *t_inline_argv[KMP_INLINE_ARGV_ENTRIES]; KMP_ALIGN_CACHE kmp_info_t **t_threads; kmp_taskdata_t *t_implicit_task_taskdata; // Taskdata for the thread's implicit task int t_level; // nested parallel level KMP_ALIGN_CACHE int t_max_argc; int t_max_nproc; // max threads this team can handle (dynamicly expandable) int t_serialized; // levels deep of serialized teams dispatch_shared_info_t *t_disp_buffer; // buffers for dispatch system int t_id; // team's id, assigned by debugger. int t_active_level; // nested active parallel level kmp_r_sched_t t_sched; // run-time schedule for the team #if KMP_AFFINITY_SUPPORTED int t_first_place; // first & last place in parent thread's partition. int t_last_place; // Restore these values to master after par region. #endif // KMP_AFFINITY_SUPPORTED int t_display_affinity; int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via // omp_set_num_threads() call omp_allocator_handle_t t_def_allocator; /* default allocator */ // Read/write by workers as well #if (KMP_ARCH_X86 || KMP_ARCH_X86_64) // Using CACHE_LINE=64 reduces memory footprint, but causes a big perf // regression of epcc 'parallel' and 'barrier' on fxe256lin01. This extra // padding serves to fix the performance of epcc 'parallel' and 'barrier' when // CACHE_LINE=64. TODO: investigate more and get rid if this padding. char dummy_padding[1024]; #endif // Internal control stack for additional nested teams. 
KMP_ALIGN_CACHE kmp_internal_control_t *t_control_stack_top; // for SERIALIZED teams nested 2 or more levels deep // typed flag to store request state of cancellation std::atomic t_cancel_request; int t_master_active; // save on fork, restore on join void *t_copypriv_data; // team specific pointer to copyprivate data array #if KMP_OS_WINDOWS std::atomic t_copyin_counter; #endif #if USE_ITT_BUILD void *t_stack_id; // team specific stack stitching id (for ittnotify) #endif /* USE_ITT_BUILD */ } kmp_base_team_t; union KMP_ALIGN_CACHE kmp_team { kmp_base_team_t t; double t_align; /* use worst case alignment */ char t_pad[KMP_PAD(kmp_base_team_t, CACHE_LINE)]; }; typedef union KMP_ALIGN_CACHE kmp_time_global { double dt_align; /* use worst case alignment */ char dt_pad[KMP_PAD(kmp_base_data_t, CACHE_LINE)]; kmp_base_data_t dt; } kmp_time_global_t; typedef struct kmp_base_global { /* cache-aligned */ kmp_time_global_t g_time; /* non cache-aligned */ volatile int g_abort; volatile int g_done; int g_dynamic; enum dynamic_mode g_dynamic_mode; } kmp_base_global_t; typedef union KMP_ALIGN_CACHE kmp_global { kmp_base_global_t g; double g_align; /* use worst case alignment */ char g_pad[KMP_PAD(kmp_base_global_t, CACHE_LINE)]; } kmp_global_t; typedef struct kmp_base_root { // TODO: GEH - combine r_active with r_in_parallel then r_active == // (r_in_parallel>= 0) // TODO: GEH - then replace r_active with t_active_levels if we can to reduce // the synch overhead or keeping r_active volatile int r_active; /* TRUE if some region in a nest has > 1 thread */ // keeps a count of active parallel regions per root std::atomic r_in_parallel; // GEH: This is misnamed, should be r_active_levels kmp_team_t *r_root_team; kmp_team_t *r_hot_team; kmp_info_t *r_uber_thread; kmp_lock_t r_begin_lock; volatile int r_begin; int r_blocktime; /* blocktime for this root and descendants */ } kmp_base_root_t; typedef union KMP_ALIGN_CACHE kmp_root { kmp_base_root_t r; double r_align; /* use worst case 
alignment */ char r_pad[KMP_PAD(kmp_base_root_t, CACHE_LINE)]; } kmp_root_t; struct fortran_inx_info { kmp_int32 data; }; /* ------------------------------------------------------------------------ */ extern int __kmp_settings; extern int __kmp_duplicate_library_ok; #if USE_ITT_BUILD extern int __kmp_forkjoin_frames; extern int __kmp_forkjoin_frames_mode; #endif extern PACKED_REDUCTION_METHOD_T __kmp_force_reduction_method; extern int __kmp_determ_red; #ifdef KMP_DEBUG extern int kmp_a_debug; extern int kmp_b_debug; extern int kmp_c_debug; extern int kmp_d_debug; extern int kmp_e_debug; extern int kmp_f_debug; #endif /* KMP_DEBUG */ /* For debug information logging using rotating buffer */ #define KMP_DEBUG_BUF_LINES_INIT 512 #define KMP_DEBUG_BUF_LINES_MIN 1 #define KMP_DEBUG_BUF_CHARS_INIT 128 #define KMP_DEBUG_BUF_CHARS_MIN 2 extern int __kmp_debug_buf; /* TRUE means use buffer, FALSE means print to stderr */ extern int __kmp_debug_buf_lines; /* How many lines of debug stored in buffer */ extern int __kmp_debug_buf_chars; /* How many characters allowed per line in buffer */ extern int __kmp_debug_buf_atomic; /* TRUE means use atomic update of buffer entry pointer */ extern char *__kmp_debug_buffer; /* Debug buffer itself */ extern std::atomic __kmp_debug_count; /* Counter for number of lines printed in buffer so far */ extern int __kmp_debug_buf_warn_chars; /* Keep track of char increase recommended in warnings */ /* end rotating debug buffer */ #ifdef KMP_DEBUG extern int __kmp_par_range; /* +1 => only go par for constructs in range */ #define KMP_PAR_RANGE_ROUTINE_LEN 1024 extern char __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN]; #define KMP_PAR_RANGE_FILENAME_LEN 1024 extern char __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN]; extern int __kmp_par_range_lb; extern int __kmp_par_range_ub; #endif /* For printing out dynamic storage map for threads and teams */ extern int __kmp_storage_map; /* True means print storage map for threads and teams */ 
extern int __kmp_storage_map_verbose; /* True means storage map includes placement info */ extern int __kmp_storage_map_verbose_specified; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 extern kmp_cpuinfo_t __kmp_cpuinfo; #endif extern volatile int __kmp_init_serial; extern volatile int __kmp_init_gtid; extern volatile int __kmp_init_common; extern volatile int __kmp_init_middle; extern volatile int __kmp_init_parallel; #if KMP_USE_MONITOR extern volatile int __kmp_init_monitor; #endif extern volatile int __kmp_init_user_locks; extern int __kmp_init_counter; extern int __kmp_root_counter; extern int __kmp_version; /* list of address of allocated caches for commons */ extern kmp_cached_addr_t *__kmp_threadpriv_cache_list; /* Barrier algorithm types and options */ extern kmp_uint32 __kmp_barrier_gather_bb_dflt; extern kmp_uint32 __kmp_barrier_release_bb_dflt; extern kmp_bar_pat_e __kmp_barrier_gather_pat_dflt; extern kmp_bar_pat_e __kmp_barrier_release_pat_dflt; extern kmp_uint32 __kmp_barrier_gather_branch_bits[bs_last_barrier]; extern kmp_uint32 __kmp_barrier_release_branch_bits[bs_last_barrier]; extern kmp_bar_pat_e __kmp_barrier_gather_pattern[bs_last_barrier]; extern kmp_bar_pat_e __kmp_barrier_release_pattern[bs_last_barrier]; extern char const *__kmp_barrier_branch_bit_env_name[bs_last_barrier]; extern char const *__kmp_barrier_pattern_env_name[bs_last_barrier]; extern char const *__kmp_barrier_type_name[bs_last_barrier]; extern char const *__kmp_barrier_pattern_name[bp_last_bar]; /* Global Locks */ extern kmp_bootstrap_lock_t __kmp_initz_lock; /* control initialization */ extern kmp_bootstrap_lock_t __kmp_forkjoin_lock; /* control fork/join access */ extern kmp_bootstrap_lock_t __kmp_task_team_lock; extern kmp_bootstrap_lock_t __kmp_exit_lock; /* exit() is not always thread-safe */ #if KMP_USE_MONITOR extern kmp_bootstrap_lock_t __kmp_monitor_lock; /* control monitor thread creation */ #endif extern kmp_bootstrap_lock_t __kmp_tp_cached_lock; /* used for the hack to 
allow threadprivate cache and __kmp_threads expansion to co-exist */ extern kmp_lock_t __kmp_global_lock; /* control OS/global access */ extern kmp_queuing_lock_t __kmp_dispatch_lock; /* control dispatch access */ extern kmp_lock_t __kmp_debug_lock; /* control I/O access for KMP_DEBUG */ extern enum library_type __kmp_library; extern enum sched_type __kmp_sched; /* default runtime scheduling */ extern enum sched_type __kmp_static; /* default static scheduling method */ extern enum sched_type __kmp_guided; /* default guided scheduling method */ extern enum sched_type __kmp_auto; /* default auto scheduling method */ extern int __kmp_chunk; /* default runtime chunk size */ extern size_t __kmp_stksize; /* stack size per thread */ #if KMP_USE_MONITOR extern size_t __kmp_monitor_stksize; /* stack size for monitor thread */ #endif extern size_t __kmp_stkoffset; /* stack offset per thread */ extern int __kmp_stkpadding; /* Should we pad root thread(s) stack */ extern size_t __kmp_malloc_pool_incr; /* incremental size of pool for kmp_malloc() */ extern int __kmp_env_stksize; /* was KMP_STACKSIZE specified? */ extern int __kmp_env_blocktime; /* was KMP_BLOCKTIME specified? */ extern int __kmp_env_checks; /* was KMP_CHECKS specified? */ extern int __kmp_env_consistency_check; // was KMP_CONSISTENCY_CHECK specified? extern int __kmp_generate_warnings; /* should we issue warnings? */ extern int __kmp_reserve_warn; /* have we issued reserve_threads warning? 
*/ #ifdef DEBUG_SUSPEND extern int __kmp_suspend_count; /* count inside __kmp_suspend_template() */ #endif extern kmp_int32 __kmp_use_yield; extern kmp_int32 __kmp_use_yield_exp_set; extern kmp_uint32 __kmp_yield_init; extern kmp_uint32 __kmp_yield_next; /* ------------------------------------------------------------------------- */ extern int __kmp_allThreadsSpecified; extern size_t __kmp_align_alloc; /* following data protected by initialization routines */ extern int __kmp_xproc; /* number of processors in the system */ extern int __kmp_avail_proc; /* number of processors available to the process */ extern size_t __kmp_sys_min_stksize; /* system-defined minimum stack size */ extern int __kmp_sys_max_nth; /* system-imposed maximum number of threads */ // maximum total number of concurrently-existing threads on device extern int __kmp_max_nth; // maximum total number of concurrently-existing threads in a contention group extern int __kmp_cg_max_nth; extern int __kmp_teams_max_nth; // max threads used in a teams construct extern int __kmp_threads_capacity; /* capacity of the arrays __kmp_threads and __kmp_root */ extern int __kmp_dflt_team_nth; /* default number of threads in a parallel region a la OMP_NUM_THREADS */ extern int __kmp_dflt_team_nth_ub; /* upper bound on "" determined at serial initialization */ extern int __kmp_tp_capacity; /* capacity of __kmp_threads if threadprivate is used (fixed) */ extern int __kmp_tp_cached; /* whether threadprivate cache has been created (__kmpc_threadprivate_cached()) */ extern int __kmp_dflt_blocktime; /* number of milliseconds to wait before blocking (env setting) */ #if KMP_USE_MONITOR extern int __kmp_monitor_wakeups; /* number of times monitor wakes up per second */ extern int __kmp_bt_intervals; /* number of monitor timestamp intervals before blocking */ #endif #ifdef KMP_ADJUST_BLOCKTIME extern int __kmp_zero_bt; /* whether blocktime has been forced to zero */ #endif /* KMP_ADJUST_BLOCKTIME */ #ifdef 
KMP_DFLT_NTH_CORES extern int __kmp_ncores; /* Total number of cores for threads placement */ #endif /* Number of millisecs to delay on abort for Intel(R) VTune(TM) tools */ extern int __kmp_abort_delay; extern int __kmp_need_register_atfork_specified; extern int __kmp_need_register_atfork; /* At initialization, call pthread_atfork to install fork handler */ extern int __kmp_gtid_mode; /* Method of getting gtid, values: 0 - not set, will be set at runtime 1 - using stack search 2 - dynamic TLS (pthread_getspecific(Linux* OS/OS X*) or TlsGetValue(Windows* OS)) 3 - static TLS (__declspec(thread) __kmp_gtid), Linux* OS .so only. */ extern int __kmp_adjust_gtid_mode; /* If true, adjust method based on #threads */ #ifdef KMP_TDATA_GTID extern KMP_THREAD_LOCAL int __kmp_gtid; #endif extern int __kmp_tls_gtid_min; /* #threads below which use sp search for gtid */ extern int __kmp_foreign_tp; // If true, separate TP var for each foreign thread #if KMP_ARCH_X86 || KMP_ARCH_X86_64 extern int __kmp_inherit_fp_control; // copy fp creg(s) parent->workers at fork extern kmp_int16 __kmp_init_x87_fpu_control_word; // init thread's FP ctrl reg extern kmp_uint32 __kmp_init_mxcsr; /* init thread's mxscr */ #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ // max_active_levels for nested parallelism enabled by default via // OMP_MAX_ACTIVE_LEVELS, OMP_NESTED, OMP_NUM_THREADS, and OMP_PROC_BIND extern int __kmp_dflt_max_active_levels; // Indicates whether value of __kmp_dflt_max_active_levels was already // explicitly set by OMP_MAX_ACTIVE_LEVELS or OMP_NESTED=false extern bool __kmp_dflt_max_active_levels_set; extern int __kmp_dispatch_num_buffers; /* max possible dynamic loops in concurrent execution per team */ #if KMP_NESTED_HOT_TEAMS extern int __kmp_hot_teams_mode; extern int __kmp_hot_teams_max_level; #endif #if KMP_OS_LINUX extern enum clock_function_type __kmp_clock_function; extern int __kmp_clock_function_param; #endif /* KMP_OS_LINUX */ #if KMP_MIC_SUPPORTED extern enum mic_type 
__kmp_mic_type; #endif #ifdef USE_LOAD_BALANCE extern double __kmp_load_balance_interval; // load balance algorithm interval #endif /* USE_LOAD_BALANCE */ // OpenMP 3.1 - Nested num threads array typedef struct kmp_nested_nthreads_t { int *nth; int size; int used; } kmp_nested_nthreads_t; extern kmp_nested_nthreads_t __kmp_nested_nth; #if KMP_USE_ADAPTIVE_LOCKS // Parameters for the speculative lock backoff system. struct kmp_adaptive_backoff_params_t { // Number of soft retries before it counts as a hard retry. kmp_uint32 max_soft_retries; // Badness is a bit mask : 0,1,3,7,15,... on each hard failure we move one to // the right kmp_uint32 max_badness; }; extern kmp_adaptive_backoff_params_t __kmp_adaptive_backoff_params; #if KMP_DEBUG_ADAPTIVE_LOCKS extern const char *__kmp_speculative_statsfile; #endif #endif // KMP_USE_ADAPTIVE_LOCKS extern int __kmp_display_env; /* TRUE or FALSE */ extern int __kmp_display_env_verbose; /* TRUE if OMP_DISPLAY_ENV=VERBOSE */ extern int __kmp_omp_cancellation; /* TRUE or FALSE */ /* ------------------------------------------------------------------------- */ /* the following are protected by the fork/join lock */ /* write: lock read: anytime */ extern kmp_info_t **__kmp_threads; /* Descriptors for the threads */ /* read/write: lock */ extern volatile kmp_team_t *__kmp_team_pool; extern volatile kmp_info_t *__kmp_thread_pool; extern kmp_info_t *__kmp_thread_pool_insert_pt; // total num threads reachable from some root thread including all root threads extern volatile int __kmp_nth; /* total number of threads reachable from some root thread including all root threads, and those in the thread pool */ extern volatile int __kmp_all_nth; extern std::atomic __kmp_thread_pool_active_nth; extern kmp_root_t **__kmp_root; /* root of thread hierarchy */ /* end data protected by fork/join lock */ /* ------------------------------------------------------------------------- */ #define __kmp_get_gtid() __kmp_get_global_thread_id() #define 
__kmp_entry_gtid() __kmp_get_global_thread_id_reg() #define __kmp_get_tid() (__kmp_tid_from_gtid(__kmp_get_gtid())) #define __kmp_get_team() (__kmp_threads[(__kmp_get_gtid())]->th.th_team) #define __kmp_get_thread() (__kmp_thread_from_gtid(__kmp_get_gtid())) // AT: Which way is correct? // AT: 1. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team -> t.t_nproc; // AT: 2. nproc = __kmp_threads[ ( gtid ) ] -> th.th_team_nproc; #define __kmp_get_team_num_threads(gtid) \ (__kmp_threads[(gtid)]->th.th_team->t.t_nproc) static inline bool KMP_UBER_GTID(int gtid) { KMP_DEBUG_ASSERT(gtid >= KMP_GTID_MIN); KMP_DEBUG_ASSERT(gtid < __kmp_threads_capacity); return (gtid >= 0 && __kmp_root[gtid] && __kmp_threads[gtid] && __kmp_threads[gtid] == __kmp_root[gtid]->r.r_uber_thread); } static inline int __kmp_tid_from_gtid(int gtid) { KMP_DEBUG_ASSERT(gtid >= 0); return __kmp_threads[gtid]->th.th_info.ds.ds_tid; } static inline int __kmp_gtid_from_tid(int tid, const kmp_team_t *team) { KMP_DEBUG_ASSERT(tid >= 0 && team); return team->t.t_threads[tid]->th.th_info.ds.ds_gtid; } static inline int __kmp_gtid_from_thread(const kmp_info_t *thr) { KMP_DEBUG_ASSERT(thr); return thr->th.th_info.ds.ds_gtid; } static inline kmp_info_t *__kmp_thread_from_gtid(int gtid) { KMP_DEBUG_ASSERT(gtid >= 0); return __kmp_threads[gtid]; } static inline kmp_team_t *__kmp_team_from_gtid(int gtid) { KMP_DEBUG_ASSERT(gtid >= 0); return __kmp_threads[gtid]->th.th_team; } /* ------------------------------------------------------------------------- */ extern kmp_global_t __kmp_global; /* global status */ extern kmp_info_t __kmp_monitor; // For Debugging Support Library extern std::atomic __kmp_team_counter; // For Debugging Support Library extern std::atomic __kmp_task_counter; #if USE_DEBUGGER #define _KMP_GEN_ID(counter) \ (__kmp_debugging ? 
KMP_ATOMIC_INC(&counter) + 1 : ~0) #else #define _KMP_GEN_ID(counter) (~0) #endif /* USE_DEBUGGER */ #define KMP_GEN_TASK_ID() _KMP_GEN_ID(__kmp_task_counter) #define KMP_GEN_TEAM_ID() _KMP_GEN_ID(__kmp_team_counter) /* ------------------------------------------------------------------------ */ extern void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, char const *format, ...); extern void __kmp_serial_initialize(void); extern void __kmp_middle_initialize(void); extern void __kmp_parallel_initialize(void); extern void __kmp_internal_begin(void); extern void __kmp_internal_end_library(int gtid); extern void __kmp_internal_end_thread(int gtid); extern void __kmp_internal_end_atexit(void); extern void __kmp_internal_end_fini(void); extern void __kmp_internal_end_dtor(void); extern void __kmp_internal_end_dest(void *); extern int __kmp_register_root(int initial_thread); extern void __kmp_unregister_root(int gtid); extern int __kmp_ignore_mppbeg(void); extern int __kmp_ignore_mppend(void); extern int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws); extern void __kmp_exit_single(int gtid); extern void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref); extern void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref); #ifdef USE_LOAD_BALANCE extern int __kmp_get_load_balance(int); #endif extern int __kmp_get_global_thread_id(void); extern int __kmp_get_global_thread_id_reg(void); extern void __kmp_exit_thread(int exit_status); extern void __kmp_abort(char const *format, ...); extern void __kmp_abort_thread(void); KMP_NORETURN extern void __kmp_abort_process(void); extern void __kmp_warn(char const *format, ...); extern void __kmp_set_num_threads(int new_nth, int gtid); // Returns current thread (pointer to kmp_info_t). Current thread *must* be // registered. 
static inline kmp_info_t *__kmp_entry_thread() { int gtid = __kmp_entry_gtid(); return __kmp_threads[gtid]; } extern void __kmp_set_max_active_levels(int gtid, int new_max_active_levels); extern int __kmp_get_max_active_levels(int gtid); extern int __kmp_get_ancestor_thread_num(int gtid, int level); extern int __kmp_get_team_size(int gtid, int level); extern void __kmp_set_schedule(int gtid, kmp_sched_t new_sched, int chunk); extern void __kmp_get_schedule(int gtid, kmp_sched_t *sched, int *chunk); extern unsigned short __kmp_get_random(kmp_info_t *thread); extern void __kmp_init_random(kmp_info_t *thread); extern kmp_r_sched_t __kmp_get_schedule_global(void); extern void __kmp_adjust_num_threads(int new_nproc); extern void __kmp_check_stksize(size_t *val); extern void *___kmp_allocate(size_t size KMP_SRC_LOC_DECL); extern void *___kmp_page_allocate(size_t size KMP_SRC_LOC_DECL); extern void ___kmp_free(void *ptr KMP_SRC_LOC_DECL); #define __kmp_allocate(size) ___kmp_allocate((size)KMP_SRC_LOC_CURR) #define __kmp_page_allocate(size) ___kmp_page_allocate((size)KMP_SRC_LOC_CURR) #define __kmp_free(ptr) ___kmp_free((ptr)KMP_SRC_LOC_CURR) #if USE_FAST_MEMORY extern void *___kmp_fast_allocate(kmp_info_t *this_thr, size_t size KMP_SRC_LOC_DECL); extern void ___kmp_fast_free(kmp_info_t *this_thr, void *ptr KMP_SRC_LOC_DECL); extern void __kmp_free_fast_memory(kmp_info_t *this_thr); extern void __kmp_initialize_fast_memory(kmp_info_t *this_thr); #define __kmp_fast_allocate(this_thr, size) \ ___kmp_fast_allocate((this_thr), (size)KMP_SRC_LOC_CURR) #define __kmp_fast_free(this_thr, ptr) \ ___kmp_fast_free((this_thr), (ptr)KMP_SRC_LOC_CURR) #endif extern void *___kmp_thread_malloc(kmp_info_t *th, size_t size KMP_SRC_LOC_DECL); extern void *___kmp_thread_calloc(kmp_info_t *th, size_t nelem, size_t elsize KMP_SRC_LOC_DECL); extern void *___kmp_thread_realloc(kmp_info_t *th, void *ptr, size_t size KMP_SRC_LOC_DECL); extern void ___kmp_thread_free(kmp_info_t *th, void *ptr 
KMP_SRC_LOC_DECL); #define __kmp_thread_malloc(th, size) \ ___kmp_thread_malloc((th), (size)KMP_SRC_LOC_CURR) #define __kmp_thread_calloc(th, nelem, elsize) \ ___kmp_thread_calloc((th), (nelem), (elsize)KMP_SRC_LOC_CURR) #define __kmp_thread_realloc(th, ptr, size) \ ___kmp_thread_realloc((th), (ptr), (size)KMP_SRC_LOC_CURR) #define __kmp_thread_free(th, ptr) \ ___kmp_thread_free((th), (ptr)KMP_SRC_LOC_CURR) #define KMP_INTERNAL_MALLOC(sz) malloc(sz) #define KMP_INTERNAL_FREE(p) free(p) #define KMP_INTERNAL_REALLOC(p, sz) realloc((p), (sz)) #define KMP_INTERNAL_CALLOC(n, sz) calloc((n), (sz)) extern void __kmp_push_num_threads(ident_t *loc, int gtid, int num_threads); extern void __kmp_push_proc_bind(ident_t *loc, int gtid, kmp_proc_bind_t proc_bind); extern void __kmp_push_num_teams(ident_t *loc, int gtid, int num_teams, int num_threads); extern void __kmp_yield(); extern void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk); extern void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk); extern void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk); extern void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk); extern int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st); extern int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st); extern int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st); extern int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_uint64 *p_lb, 
kmp_uint64 *p_ub, kmp_int64 *p_st); extern void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid); extern void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid); extern void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid); extern void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid); #ifdef KMP_GOMP_COMPAT extern void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk, int push_ws); extern void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk, int push_ws); extern void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk, int push_ws); extern void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk, int push_ws); extern void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid); extern void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid); extern void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid); extern void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid); #endif /* KMP_GOMP_COMPAT */ extern kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker); extern kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker); extern kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker); extern kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker); extern kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker); extern kmp_uint32 __kmp_wait_4(kmp_uint32 volatile *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(kmp_uint32, kmp_uint32), void *obj); extern void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32), void *obj); class kmp_flag_32; class kmp_flag_64; class kmp_flag_oncore; extern void 
__kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, int final_spin #if USE_ITT_BUILD , void *itt_sync_obj #endif ); extern void __kmp_release_64(kmp_flag_64 *flag); extern void __kmp_infinite_loop(void); extern void __kmp_cleanup(void); #if KMP_HANDLE_SIGNALS extern int __kmp_handle_signals; extern void __kmp_install_signals(int parallel_init); extern void __kmp_remove_signals(void); #endif extern void __kmp_clear_system_time(void); extern void __kmp_read_system_time(double *delta); extern void __kmp_check_stack_overlap(kmp_info_t *thr); extern void __kmp_expand_host_name(char *buffer, size_t size); extern void __kmp_expand_file_name(char *result, size_t rlen, char *pattern); #if KMP_ARCH_X86 || KMP_ARCH_X86_64 extern void __kmp_initialize_system_tick(void); /* Initialize timer tick value */ #endif extern void __kmp_runtime_initialize(void); /* machine specific initialization */ extern void __kmp_runtime_destroy(void); #if KMP_AFFINITY_SUPPORTED extern char *__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask); extern kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, kmp_affin_mask_t *mask); extern void __kmp_affinity_initialize(void); extern void __kmp_affinity_uninitialize(void); extern void __kmp_affinity_set_init_mask( int gtid, int isa_root); /* set affinity according to KMP_AFFINITY */ extern void __kmp_affinity_set_place(int gtid); extern void __kmp_affinity_determine_capable(const char *env_var); extern int __kmp_aux_set_affinity(void **mask); extern int __kmp_aux_get_affinity(void **mask); extern int __kmp_aux_get_affinity_max_proc(); extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask); extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask); extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask); extern void __kmp_balanced_affinity(kmp_info_t *th, int team_size); -#if KMP_OS_LINUX +#if KMP_OS_LINUX || KMP_OS_FREEBSD extern int kmp_set_thread_affinity_mask_initial(void); #endif 
#endif /* KMP_AFFINITY_SUPPORTED */ // No need for KMP_AFFINITY_SUPPORTED guard as only one field in the // format string is for affinity, so platforms that do not support // affinity can still use the other fields, e.g., %n for num_threads extern size_t __kmp_aux_capture_affinity(int gtid, const char *format, kmp_str_buf_t *buffer); extern void __kmp_aux_display_affinity(int gtid, const char *format); extern void __kmp_cleanup_hierarchy(); extern void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar); #if KMP_USE_FUTEX extern int __kmp_futex_determine_capable(void); #endif // KMP_USE_FUTEX extern void __kmp_gtid_set_specific(int gtid); extern int __kmp_gtid_get_specific(void); extern double __kmp_read_cpu_time(void); extern int __kmp_read_system_info(struct kmp_sys_info *info); #if KMP_USE_MONITOR extern void __kmp_create_monitor(kmp_info_t *th); #endif extern void *__kmp_launch_thread(kmp_info_t *thr); extern void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size); #if KMP_OS_WINDOWS extern int __kmp_still_running(kmp_info_t *th); extern int __kmp_is_thread_alive(kmp_info_t *th, DWORD *exit_val); extern void __kmp_free_handle(kmp_thread_t tHandle); #endif #if KMP_USE_MONITOR extern void __kmp_reap_monitor(kmp_info_t *th); #endif extern void __kmp_reap_worker(kmp_info_t *th); extern void __kmp_terminate_thread(int gtid); extern int __kmp_try_suspend_mx(kmp_info_t *th); extern void __kmp_lock_suspend_mx(kmp_info_t *th); extern void __kmp_unlock_suspend_mx(kmp_info_t *th); extern void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag); extern void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag); extern void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag); extern void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag); extern void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag); extern void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag); extern void __kmp_elapsed(double *); extern void __kmp_elapsed_tick(double *); 
extern void __kmp_enable(int old_state); extern void __kmp_disable(int *old_state); extern void __kmp_thread_sleep(int millis); extern void __kmp_common_initialize(void); extern void __kmp_common_destroy(void); extern void __kmp_common_destroy_gtid(int gtid); #if KMP_OS_UNIX extern void __kmp_register_atfork(void); #endif extern void __kmp_suspend_initialize(void); extern void __kmp_suspend_initialize_thread(kmp_info_t *th); extern void __kmp_suspend_uninitialize_thread(kmp_info_t *th); extern kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, int tid); extern kmp_team_t * __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #if OMPT_SUPPORT ompt_data_t ompt_parallel_data, #endif kmp_proc_bind_t proc_bind, kmp_internal_control_t *new_icvs, int argc USE_NESTED_HOT_ARG(kmp_info_t *thr)); extern void __kmp_free_thread(kmp_info_t *); extern void __kmp_free_team(kmp_root_t *, kmp_team_t *USE_NESTED_HOT_ARG(kmp_info_t *)); extern kmp_team_t *__kmp_reap_team(kmp_team_t *); /* ------------------------------------------------------------------------ */ extern void __kmp_initialize_bget(kmp_info_t *th); extern void __kmp_finalize_bget(kmp_info_t *th); KMP_EXPORT void *kmpc_malloc(size_t size); KMP_EXPORT void *kmpc_aligned_malloc(size_t size, size_t alignment); KMP_EXPORT void *kmpc_calloc(size_t nelem, size_t elsize); KMP_EXPORT void *kmpc_realloc(void *ptr, size_t size); KMP_EXPORT void kmpc_free(void *ptr); /* declarations for internal use */ extern int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size, void *reduce_data, void (*reduce)(void *, void *)); extern void __kmp_end_split_barrier(enum barrier_type bt, int gtid); extern int __kmp_barrier_gomp_cancel(int gtid); /*! * Tell the fork call which compiler generated the fork call, and therefore how * to deal with the call. */ enum fork_context_e { fork_context_gnu, /**< Called from GNU generated code, so must not invoke the microtask internally. 
*/ fork_context_intel, /**< Called from Intel generated code. */ fork_context_last }; extern int __kmp_fork_call(ident_t *loc, int gtid, enum fork_context_e fork_context, kmp_int32 argc, microtask_t microtask, launch_t invoker, /* TODO: revert workaround for Intel(R) 64 tracker #96 */ #if (KMP_ARCH_ARM || KMP_ARCH_X86_64 || KMP_ARCH_AARCH64) && KMP_OS_LINUX va_list *ap #else va_list ap #endif ); extern void __kmp_join_call(ident_t *loc, int gtid #if OMPT_SUPPORT , enum fork_context_e fork_context #endif , int exit_teams = 0); extern void __kmp_serialized_parallel(ident_t *id, kmp_int32 gtid); extern void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team); extern void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team); extern int __kmp_invoke_task_func(int gtid); extern void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, kmp_team_t *team); extern void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, kmp_team_t *team); // should never have been exported KMP_EXPORT int __kmpc_invoke_task_func(int gtid); extern int __kmp_invoke_teams_master(int gtid); extern void __kmp_teams_master(int gtid); extern int __kmp_aux_get_team_num(); extern int __kmp_aux_get_num_teams(); extern void __kmp_save_internal_controls(kmp_info_t *thread); extern void __kmp_user_set_library(enum library_type arg); extern void __kmp_aux_set_library(enum library_type arg); extern void __kmp_aux_set_stacksize(size_t arg); extern void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid); extern void __kmp_aux_set_defaults(char const *str, int len); /* Functions called from __kmp_aux_env_initialize() in kmp_settings.cpp */ void kmpc_set_blocktime(int arg); void ompc_set_nested(int flag); void ompc_set_dynamic(int flag); void ompc_set_num_threads(int arg); extern void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team, int tid); extern void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr); extern 
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds, kmp_routine_entry_t task_entry); extern void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *team, int tid, int set_curr_task); extern void __kmp_finish_implicit_task(kmp_info_t *this_thr); extern void __kmp_free_implicit_task(kmp_info_t *this_thr); extern kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid, kmp_task_t *task); extern void __kmp_fulfill_event(kmp_event_t *event); int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, int *thread_finished, #if USE_ITT_BUILD void *itt_sync_obj, #endif /* USE_ITT_BUILD */ kmp_int32 is_constrained); int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, int *thread_finished, #if USE_ITT_BUILD void *itt_sync_obj, #endif /* USE_ITT_BUILD */ kmp_int32 is_constrained); int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, int *thread_finished, #if USE_ITT_BUILD void *itt_sync_obj, #endif /* USE_ITT_BUILD */ kmp_int32 is_constrained); extern void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team); extern void __kmp_reap_task_teams(void); extern void __kmp_wait_to_unref_task_teams(void); extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always); extern void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team); extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team #if USE_ITT_BUILD , void *itt_sync_obj #endif /* USE_ITT_BUILD */ , int wait = 1); extern void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid); extern int __kmp_is_address_mapped(void *addr); extern kmp_uint64 __kmp_hardware_timestamp(void); #if KMP_OS_UNIX extern int __kmp_read_from_file(char const *path, char const *format, ...); #endif /* 
------------------------------------------------------------------------ */ // // Assembly routines that have no compiler intrinsic replacement // extern int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int npr, int argc, void *argv[] #if OMPT_SUPPORT , void **exit_frame_ptr #endif ); /* ------------------------------------------------------------------------ */ KMP_EXPORT void __kmpc_begin(ident_t *, kmp_int32 flags); KMP_EXPORT void __kmpc_end(ident_t *); KMP_EXPORT void __kmpc_threadprivate_register_vec(ident_t *, void *data, kmpc_ctor_vec ctor, kmpc_cctor_vec cctor, kmpc_dtor_vec dtor, size_t vector_length); KMP_EXPORT void __kmpc_threadprivate_register(ident_t *, void *data, kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor); KMP_EXPORT void *__kmpc_threadprivate(ident_t *, kmp_int32 global_tid, void *data, size_t size); KMP_EXPORT kmp_int32 __kmpc_global_thread_num(ident_t *); KMP_EXPORT kmp_int32 __kmpc_global_num_threads(ident_t *); KMP_EXPORT kmp_int32 __kmpc_bound_thread_num(ident_t *); KMP_EXPORT kmp_int32 __kmpc_bound_num_threads(ident_t *); KMP_EXPORT kmp_int32 __kmpc_ok_to_fork(ident_t *); KMP_EXPORT void __kmpc_fork_call(ident_t *, kmp_int32 nargs, kmpc_micro microtask, ...); KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_flush(ident_t *); KMP_EXPORT void __kmpc_barrier(ident_t *, kmp_int32 global_tid); KMP_EXPORT kmp_int32 __kmpc_master(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_end_master(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_ordered(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_end_ordered(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_critical(ident_t *, kmp_int32 global_tid, kmp_critical_name *); KMP_EXPORT void __kmpc_end_critical(ident_t *, kmp_int32 global_tid, kmp_critical_name *); KMP_EXPORT void __kmpc_critical_with_hint(ident_t *, kmp_int32 
global_tid, kmp_critical_name *, uint32_t hint); KMP_EXPORT kmp_int32 __kmpc_barrier_master(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_end_barrier_master(ident_t *, kmp_int32 global_tid); KMP_EXPORT kmp_int32 __kmpc_barrier_master_nowait(ident_t *, kmp_int32 global_tid); KMP_EXPORT kmp_int32 __kmpc_single(ident_t *, kmp_int32 global_tid); KMP_EXPORT void __kmpc_end_single(ident_t *, kmp_int32 global_tid); KMP_EXPORT void KMPC_FOR_STATIC_INIT(ident_t *loc, kmp_int32 global_tid, kmp_int32 schedtype, kmp_int32 *plastiter, kmp_int *plower, kmp_int *pupper, kmp_int *pstride, kmp_int incr, kmp_int chunk); KMP_EXPORT void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid); KMP_EXPORT void __kmpc_copyprivate(ident_t *loc, kmp_int32 global_tid, size_t cpy_size, void *cpy_data, void (*cpy_func)(void *, void *), kmp_int32 didit); extern void KMPC_SET_NUM_THREADS(int arg); extern void KMPC_SET_DYNAMIC(int flag); extern void KMPC_SET_NESTED(int flag); /* OMP 3.0 tasking interface routines */ KMP_EXPORT kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task); KMP_EXPORT kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds, kmp_routine_entry_t task_entry); KMP_EXPORT kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds, kmp_routine_entry_t task_entry, kmp_int64 device_id); KMP_EXPORT void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task); KMP_EXPORT void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task); KMP_EXPORT kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task); KMP_EXPORT kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid); KMP_EXPORT kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part); #if TASK_UNUSED void 
__kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task); void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task); #endif // TASK_UNUSED /* ------------------------------------------------------------------------ */ KMP_EXPORT void __kmpc_taskgroup(ident_t *loc, int gtid); KMP_EXPORT void __kmpc_end_taskgroup(ident_t *loc, int gtid); KMP_EXPORT kmp_int32 __kmpc_omp_task_with_deps( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list); KMP_EXPORT void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list); extern kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, bool serialize_immediate); KMP_EXPORT kmp_int32 __kmpc_cancel(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind); KMP_EXPORT kmp_int32 __kmpc_cancellationpoint(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind); KMP_EXPORT kmp_int32 __kmpc_cancel_barrier(ident_t *loc_ref, kmp_int32 gtid); KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind); KMP_EXPORT void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask); KMP_EXPORT void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask); KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task, kmp_int32 if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize, void *task_dup); KMP_EXPORT void *__kmpc_task_reduction_init(int gtid, int num_data, void *data); KMP_EXPORT void *__kmpc_taskred_init(int gtid, int num_data, void *data); KMP_EXPORT void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *d); KMP_EXPORT void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, int num, void *data); KMP_EXPORT void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, 
int num, void *data); KMP_EXPORT void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws); KMP_EXPORT kmp_int32 __kmpc_omp_reg_task_with_affinity( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 naffins, kmp_task_affinity_info_t *affin_list); /* Lock interface routines (fast versions with gtid passed in) */ KMP_EXPORT void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); KMP_EXPORT void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); KMP_EXPORT void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); KMP_EXPORT void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); KMP_EXPORT void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); KMP_EXPORT void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); KMP_EXPORT void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); KMP_EXPORT void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); KMP_EXPORT int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); KMP_EXPORT int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock); KMP_EXPORT void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint); KMP_EXPORT void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint); /* Interface to fast scalable reduce methods routines */ KMP_EXPORT kmp_int32 __kmpc_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck); KMP_EXPORT void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck); KMP_EXPORT kmp_int32 __kmpc_reduce( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck); 
KMP_EXPORT void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
                                  kmp_critical_name *lck);

/* Internal fast reduction routines */

// Selects which reduction algorithm (critical section, atomic, tree, ...)
// the runtime will use for the given reduction; result is a packed method id.
extern PACKED_REDUCTION_METHOD_T __kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck);

// this function is for testing set/get/determine reduce method
KMP_EXPORT kmp_int32 __kmp_get_reduce_method(void);

KMP_EXPORT kmp_uint64 __kmpc_get_taskid();
KMP_EXPORT kmp_uint64 __kmpc_get_parent_taskid();

// C++ port
// missing 'extern "C"' declarations
KMP_EXPORT kmp_int32 __kmpc_in_parallel(ident_t *loc);
KMP_EXPORT void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid);
KMP_EXPORT void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                                        kmp_int32 num_threads);

KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                                      int proc_bind);
KMP_EXPORT void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                                      kmp_int32 num_teams,
                                      kmp_int32 num_threads);
KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc,
                                  kmpc_micro microtask, ...);

// Per-dimension bounds descriptor used by the doacross (ordered depend)
// interface below.
struct kmp_dim { // loop bounds info casted to kmp_int64
  kmp_int64 lo; // lower
  kmp_int64 up; // upper
  kmp_int64 st; // stride
};
KMP_EXPORT void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid,
                                     kmp_int32 num_dims,
                                     const struct kmp_dim *dims);
KMP_EXPORT void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid,
                                     const kmp_int64 *vec);
KMP_EXPORT void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid,
                                     const kmp_int64 *vec);
KMP_EXPORT void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);

KMP_EXPORT void *__kmpc_threadprivate_cached(ident_t *loc, kmp_int32 global_tid,
                                             void *data, size_t size,
                                             void ***cache);

// Symbols for MS mutual detection.
extern int _You_must_link_with_exactly_one_OpenMP_library; extern int _You_must_link_with_Intel_OpenMP_library; #if KMP_OS_WINDOWS && (KMP_VERSION_MAJOR > 4) extern int _You_must_link_with_Microsoft_OpenMP_library; #endif // The routines below are not exported. // Consider making them 'static' in corresponding source files. void kmp_threadprivate_insert_private_data(int gtid, void *pc_addr, void *data_addr, size_t pc_size); struct private_common *kmp_threadprivate_insert(int gtid, void *pc_addr, void *data_addr, size_t pc_size); void __kmp_threadprivate_resize_cache(int newCapacity); void __kmp_cleanup_threadprivate_caches(); // ompc_, kmpc_ entries moved from omp.h. #if KMP_OS_WINDOWS #define KMPC_CONVENTION __cdecl #else #define KMPC_CONVENTION #endif #ifndef __OMP_H typedef enum omp_sched_t { omp_sched_static = 1, omp_sched_dynamic = 2, omp_sched_guided = 3, omp_sched_auto = 4 } omp_sched_t; typedef void *kmp_affinity_mask_t; #endif KMP_EXPORT void KMPC_CONVENTION ompc_set_max_active_levels(int); KMP_EXPORT void KMPC_CONVENTION ompc_set_schedule(omp_sched_t, int); KMP_EXPORT int KMPC_CONVENTION ompc_get_ancestor_thread_num(int); KMP_EXPORT int KMPC_CONVENTION ompc_get_team_size(int); KMP_EXPORT int KMPC_CONVENTION kmpc_set_affinity_mask_proc(int, kmp_affinity_mask_t *); KMP_EXPORT int KMPC_CONVENTION kmpc_unset_affinity_mask_proc(int, kmp_affinity_mask_t *); KMP_EXPORT int KMPC_CONVENTION kmpc_get_affinity_mask_proc(int, kmp_affinity_mask_t *); KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize(int); KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize_s(size_t); KMP_EXPORT void KMPC_CONVENTION kmpc_set_library(int); KMP_EXPORT void KMPC_CONVENTION kmpc_set_defaults(char const *); KMP_EXPORT void KMPC_CONVENTION kmpc_set_disp_num_buffers(int); enum kmp_target_offload_kind { tgt_disabled = 0, tgt_default = 1, tgt_mandatory = 2 }; typedef enum kmp_target_offload_kind kmp_target_offload_kind_t; // Set via OMP_TARGET_OFFLOAD if specified, defaults to tgt_default 
otherwise extern kmp_target_offload_kind_t __kmp_target_offload; extern int __kmpc_get_target_offload(); // Constants used in libomptarget #define KMP_DEVICE_DEFAULT -1 // This is libomptarget's default device. #define KMP_HOST_DEVICE -10 // This is what it is in libomptarget, go figure. #define KMP_DEVICE_ALL -11 // This is libomptarget's "all devices". // OMP Pause Resource // The following enum is used both to set the status in __kmp_pause_status, and // as the internal equivalent of the externally-visible omp_pause_resource_t. typedef enum kmp_pause_status_t { kmp_not_paused = 0, // status is not paused, or, requesting resume kmp_soft_paused = 1, // status is soft-paused, or, requesting soft pause kmp_hard_paused = 2 // status is hard-paused, or, requesting hard pause } kmp_pause_status_t; // This stores the pause state of the runtime extern kmp_pause_status_t __kmp_pause_status; extern int __kmpc_pause_resource(kmp_pause_status_t level); extern int __kmp_pause_resource(kmp_pause_status_t level); // Soft resume sets __kmp_pause_status, and wakes up all threads. extern void __kmp_resume_if_soft_paused(); // Hard resume simply resets the status to not paused. Library will appear to // be uninitialized after hard pause. Let OMP constructs trigger required // initializations. 
static inline void __kmp_resume_if_hard_paused() { if (__kmp_pause_status == kmp_hard_paused) { __kmp_pause_status = kmp_not_paused; } } #ifdef __cplusplus } #endif #endif /* KMP_H */ Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_affinity.cpp =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_affinity.cpp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_affinity.cpp (revision 357059) @@ -1,5342 +1,5342 @@ /* * kmp_affinity.cpp -- affinity management */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "kmp.h" #include "kmp_affinity.h" #include "kmp_i18n.h" #include "kmp_io.h" #include "kmp_str.h" #include "kmp_wrapper_getpid.h" #if KMP_USE_HIER_SCHED #include "kmp_dispatch_hier.h" #endif // Store the real or imagined machine hierarchy here static hierarchy_info machine_hierarchy; void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); } void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { kmp_uint32 depth; // The test below is true if affinity is available, but set to "none". Need to // init on first use of hierarchical barrier. 
if (TCR_1(machine_hierarchy.uninitialized)) machine_hierarchy.init(NULL, nproc); // Adjust the hierarchy in case num threads exceeds original if (nproc > machine_hierarchy.base_num_threads) machine_hierarchy.resize(nproc); depth = machine_hierarchy.depth; KMP_DEBUG_ASSERT(depth > 0); thr_bar->depth = depth; thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1; thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; } #if KMP_AFFINITY_SUPPORTED bool KMPAffinity::picked_api = false; void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); } void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); } void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); } void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); } void KMPAffinity::operator delete(void *p) { __kmp_free(p); } void KMPAffinity::pick_api() { KMPAffinity *affinity_dispatch; if (picked_api) return; #if KMP_USE_HWLOC // Only use Hwloc if affinity isn't explicitly disabled and // user requests Hwloc topology method if (__kmp_affinity_top_method == affinity_top_method_hwloc && __kmp_affinity_type != affinity_disabled) { affinity_dispatch = new KMPHwlocAffinity(); } else #endif { affinity_dispatch = new KMPNativeAffinity(); } __kmp_affinity_dispatch = affinity_dispatch; picked_api = true; } void KMPAffinity::destroy_api() { if (__kmp_affinity_dispatch != NULL) { delete __kmp_affinity_dispatch; __kmp_affinity_dispatch = NULL; picked_api = false; } } #define KMP_ADVANCE_SCAN(scan) \ while (*scan != '\0') { \ scan++; \ } // Print the affinity mask to the character array in a pretty format. 
// The format is a comma separated list of non-negative integers or integer // ranges: e.g., 1,2,3-5,7,9-15 // The format can also be the string "{}" if no bits are set in mask char *__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask) { int start = 0, finish = 0, previous = 0; bool first_range; KMP_ASSERT(buf); KMP_ASSERT(buf_len >= 40); KMP_ASSERT(mask); char *scan = buf; char *end = buf + buf_len - 1; // Check for empty set. if (mask->begin() == mask->end()) { KMP_SNPRINTF(scan, end - scan + 1, "{}"); KMP_ADVANCE_SCAN(scan); KMP_ASSERT(scan <= end); return buf; } first_range = true; start = mask->begin(); while (1) { // Find next range // [start, previous] is inclusive range of contiguous bits in mask for (finish = mask->next(start), previous = start; finish == previous + 1 && finish != mask->end(); finish = mask->next(finish)) { previous = finish; } // The first range does not need a comma printed before it, but the rest // of the ranges do need a comma beforehand if (!first_range) { KMP_SNPRINTF(scan, end - scan + 1, "%s", ","); KMP_ADVANCE_SCAN(scan); } else { first_range = false; } // Range with three or more contiguous bits in the affinity mask if (previous - start > 1) { KMP_SNPRINTF(scan, end - scan + 1, "%d-%d", static_cast(start), static_cast(previous)); } else { // Range with one or two contiguous bits in the affinity mask KMP_SNPRINTF(scan, end - scan + 1, "%d", static_cast(start)); KMP_ADVANCE_SCAN(scan); if (previous - start > 0) { KMP_SNPRINTF(scan, end - scan + 1, ",%d", static_cast(previous)); } } KMP_ADVANCE_SCAN(scan); // Start over with new start point start = finish; if (start == mask->end()) break; // Check for overflow if (end - scan < 2) break; } // Check for overflow KMP_ASSERT(scan <= end); return buf; } #undef KMP_ADVANCE_SCAN // Print the affinity mask to the string buffer object in a pretty format // The format is a comma separated list of non-negative integers or integer // ranges: e.g., 1,2,3-5,7,9-15 // The 
format can also be the string "{}" if no bits are set in mask kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, kmp_affin_mask_t *mask) { int start = 0, finish = 0, previous = 0; bool first_range; KMP_ASSERT(buf); KMP_ASSERT(mask); __kmp_str_buf_clear(buf); // Check for empty set. if (mask->begin() == mask->end()) { __kmp_str_buf_print(buf, "%s", "{}"); return buf; } first_range = true; start = mask->begin(); while (1) { // Find next range // [start, previous] is inclusive range of contiguous bits in mask for (finish = mask->next(start), previous = start; finish == previous + 1 && finish != mask->end(); finish = mask->next(finish)) { previous = finish; } // The first range does not need a comma printed before it, but the rest // of the ranges do need a comma beforehand if (!first_range) { __kmp_str_buf_print(buf, "%s", ","); } else { first_range = false; } // Range with three or more contiguous bits in the affinity mask if (previous - start > 1) { __kmp_str_buf_print(buf, "%d-%d", static_cast(start), static_cast(previous)); } else { // Range with one or two contiguous bits in the affinity mask __kmp_str_buf_print(buf, "%d", static_cast(start)); if (previous - start > 0) { __kmp_str_buf_print(buf, ",%d", static_cast(previous)); } } // Start over with new start point start = finish; if (start == mask->end()) break; } return buf; } void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { KMP_CPU_ZERO(mask); #if KMP_GROUP_AFFINITY if (__kmp_num_proc_groups > 1) { int group; KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL); for (group = 0; group < __kmp_num_proc_groups; group++) { int i; int num = __kmp_GetActiveProcessorCount(group); for (i = 0; i < num; i++) { KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); } } } else #endif /* KMP_GROUP_AFFINITY */ { int proc; for (proc = 0; proc < __kmp_xproc; proc++) { KMP_CPU_SET(proc, mask); } } } // When sorting by labels, __kmp_affinity_assign_child_nums() must first be // called to 
renumber the labels from [0..n] and place them into the child_num // vector of the address object. This is done in case the labels used for // the children at one node of the hierarchy differ from those used for // another node at the same level. Example: suppose the machine has 2 nodes // with 2 packages each. The first node contains packages 601 and 602, and // second node contains packages 603 and 604. If we try to sort the table // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604 // because we are paying attention to the labels themselves, not the ordinal // child numbers. By using the child numbers in the sort, the result is // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604. static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os, int numAddrs) { KMP_DEBUG_ASSERT(numAddrs > 0); int depth = address2os->first.depth; unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); int labCt; for (labCt = 0; labCt < depth; labCt++) { address2os[0].first.childNums[labCt] = counts[labCt] = 0; lastLabel[labCt] = address2os[0].first.labels[labCt]; } int i; for (i = 1; i < numAddrs; i++) { for (labCt = 0; labCt < depth; labCt++) { if (address2os[i].first.labels[labCt] != lastLabel[labCt]) { int labCt2; for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) { counts[labCt2] = 0; lastLabel[labCt2] = address2os[i].first.labels[labCt2]; } counts[labCt]++; lastLabel[labCt] = address2os[i].first.labels[labCt]; break; } } for (labCt = 0; labCt < depth; labCt++) { address2os[i].first.childNums[labCt] = counts[labCt]; } for (; labCt < (int)Address::maxDepth; labCt++) { address2os[i].first.childNums[labCt] = 0; } } __kmp_free(lastLabel); __kmp_free(counts); } // All of the __kmp_affinity_create_*_map() routines should set // __kmp_affinity_masks to a vector of affinity mask objects of length // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and 
return // the number of levels in the machine topology tree (zero if // __kmp_affinity_type == affinity_none). // // All of the __kmp_affinity_create_*_map() routines should set // *__kmp_affin_fullMask to the affinity mask for the initialization thread. // They need to save and restore the mask, and it could be needed later, so // saving it is just an optimization to avoid calling kmp_get_system_affinity() // again. kmp_affin_mask_t *__kmp_affin_fullMask = NULL; static int nCoresPerPkg, nPackages; static int __kmp_nThreadsPerCore; #ifndef KMP_DFLT_NTH_CORES static int __kmp_ncores; #endif static int *__kmp_pu_os_idx = NULL; // __kmp_affinity_uniform_topology() doesn't work when called from // places which support arbitrarily many levels in the machine topology // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() // __kmp_affinity_create_x2apicid_map(). inline static bool __kmp_affinity_uniform_topology() { return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages); } // Print out the detailed machine topology map, i.e. the physical locations // of each OS proc. 
// Print one KMP_INFORM line per OS proc, spelling out its label at every
// modeled topology level (Package / Node / Core / Thread, or "L<n>" for
// unnamed levels below the package).
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  int proc;

  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    int level;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    for (level = 0; level < depth; level++) {
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        // Levels between package and core are reported as numbered nodes.
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}

#if KMP_USE_HWLOC
// hwloc variant of the topology printer: level names come from the dynamic
// levels[] array (package, [node,] [tile,] core, thread), since hwloc maps
// may contain optional NUMA-node and tile levels.
static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
                                          int depth, int *levels) {
  int proc;
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
                        addrP[proc].first.labels[0]);
    if (depth > 1) {
      int level = 1; // iterate over levels
      int label = 1; // iterate over labels
      if (__kmp_numa_detected)
        // node level follows package
        if (levels[level++] > 0)
          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
                              addrP[proc].first.labels[label++]);
      if (__kmp_tile_depth > 0)
        // tile level follows node if any, or package
        if (levels[level++] > 0)
          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
                              addrP[proc].first.labels[label++]);
      if (levels[level++] > 0)
        // core level follows
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
                            addrP[proc].first.labels[label++]);
      if (levels[level++] > 0)
        // thread level is the latest
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
                            addrP[proc].first.labels[label++]);
      KMP_DEBUG_ASSERT(label == depth);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
    __kmp_str_buf_clear(&buf);
  }
  __kmp_str_buf_free(&buf);
}

// Maximum counts of each sub-object kind observed per parent, filled in by
// the hwloc map-creation code.
static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when you
// have one thread context per core, we don't want the extra thread context
// level if it offers no unique labels. So they are removed.
// return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
                                                  int depth, int *levels) {
  int level;
  int i;
  int radix1_detected;
  int new_depth = depth;
  // Walk from the deepest level up; level 0 (package) is never removed.
  for (level = depth - 1; level > 0; --level) {
    // Detect if this level is radix 1
    radix1_detected = 1;
    for (i = 1; i < nTh; ++i) {
      if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
        // There are differing label values for this level so it stays
        radix1_detected = 0;
        break;
      }
    }
    if (!radix1_detected)
      continue;
    // Radix 1 was detected
    --new_depth;
    levels[level] = -1; // mark level as not present in address2os array
    if (level == new_depth) {
      // "turn off" deepest level, just decrement the depth that removes
      // the level from address2os array
      for (i = 0; i < nTh; ++i) {
        addrP[i].first.depth--;
      }
    } else {
      // For other levels, we move labels over and also reduce the depth
      int j;
      for (j = level; j < new_depth; ++j) {
        for (i = 0; i < nTh; ++i) {
          addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
          addrP[i].first.depth--;
        }
        levels[j + 1] -= 1;
      }
    }
  }
  return new_depth;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) { int retval = 0; hwloc_obj_t first; for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0); first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj; first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first)) { ++retval; } return retval; } static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t, hwloc_obj_t o, kmp_hwloc_depth_t depth, hwloc_obj_t *f) { if (o->depth == depth) { if (*f == NULL) *f = o; // output first descendant found return 1; } int sum = 0; for (unsigned i = 0; i < o->arity; i++) sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f); return sum; // will be 0 if no one found (as PU arity is 0) } static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o, hwloc_obj_type_t type, hwloc_obj_t *f) { if (!hwloc_compare_types(o->type, type)) { if (*f == NULL) *f = o; // output first descendant found return 1; } int sum = 0; for (unsigned i = 0; i < o->arity; i++) sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f); return sum; // will be 0 if no one found (as PU arity is 0) } static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair, int &nActiveThreads, int &num_active_cores, hwloc_obj_t obj, int depth, int *labels) { hwloc_obj_t core = NULL; hwloc_topology_t &tp = __kmp_hwloc_topology; int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core); for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) { hwloc_obj_t pu = NULL; KMP_DEBUG_ASSERT(core != NULL); int num_active_threads = 0; int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu); // int NT = core->arity; pu = core->first_child; // faster? 
for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) { KMP_DEBUG_ASSERT(pu != NULL); if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask)) continue; // skip inactive (inaccessible) unit Address addr(depth + 2); KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n", obj->os_index, obj->logical_index, core->os_index, core->logical_index, pu->os_index, pu->logical_index)); for (int i = 0; i < depth; ++i) addr.labels[i] = labels[i]; // package, etc. addr.labels[depth] = core_id; // core addr.labels[depth + 1] = pu_id; // pu addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index); __kmp_pu_os_idx[nActiveThreads] = pu->os_index; nActiveThreads++; ++num_active_threads; // count active threads per core } if (num_active_threads) { // were there any active threads on the core? ++__kmp_ncores; // count total active cores ++num_active_cores; // count active cores per socket if (num_active_threads > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = num_active_threads; // calc maximum } } return 0; } // Check if NUMA node detected below the package, // and if tile object is detected and return its depth static int __kmp_hwloc_check_numa() { hwloc_topology_t &tp = __kmp_hwloc_topology; hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to) int depth, l2cache_depth, package_depth; // Get some PU hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0); if (hT == NULL) // something has gone wrong return 1; // check NUMA node below PACKAGE hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT); hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT); KMP_DEBUG_ASSERT(hS != NULL); if (hN != NULL && hN->depth > hS->depth) { __kmp_numa_detected = TRUE; // socket includes node(s) if (__kmp_affinity_gran == affinity_gran_node) { __kmp_affinity_gran = affinity_gran_numa; } } package_depth = hwloc_get_type_depth(tp, HWLOC_OBJ_PACKAGE); l2cache_depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); // check tile, get object by 
depth because of multiple caches possible depth = (l2cache_depth < package_depth) ? package_depth : l2cache_depth; hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT); hC = NULL; // not used, but reset it here just in case if (hL != NULL && __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) __kmp_tile_depth = depth; // tile consists of multiple cores return 0; } static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, kmp_i18n_id_t *const msg_id) { hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name *address2os = NULL; *msg_id = kmp_i18n_null; // Save the affinity mask for the current thread. kmp_affin_mask_t *oldMask; KMP_CPU_ALLOC(oldMask); __kmp_get_system_affinity(oldMask, TRUE); __kmp_hwloc_check_numa(); if (!KMP_AFFINITY_CAPABLE()) { // Hack to try and infer the machine topology using only the data // available from cpuid on the current thread, and __kmp_xproc. KMP_ASSERT(__kmp_affinity_type == affinity_none); nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj( hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0), HWLOC_OBJ_CORE); __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj( hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU); __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; if (__kmp_affinity_verbose) { KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (__kmp_affinity_uniform_topology()) { KMP_INFORM(Uniform, "KMP_AFFINITY"); } else { KMP_INFORM(NonUniform, "KMP_AFFINITY"); } KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, __kmp_nThreadsPerCore, __kmp_ncores); } KMP_CPU_FREE(oldMask); return 0; } int depth = 3; int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread int labels[3] = {0}; // package [,node] [,tile] - head of lables array if (__kmp_numa_detected) ++depth; if (__kmp_tile_depth) ++depth; // Allocate the data structure to 
be returned. AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); // When affinity is off, this routine will still be called to set // __kmp_ncores, as well as __kmp_nThreadsPerCore, // nCoresPerPkg, & nPackages. Make sure all these vars are set // correctly, and return if affinity is not enabled. hwloc_obj_t socket, node, tile; int nActiveThreads = 0; int socket_id = 0; // re-calculate globals to count only accessible resources __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0; nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0; for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL; socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket), socket_id++) { labels[0] = socket_id; if (__kmp_numa_detected) { int NN; int n_active_nodes = 0; node = NULL; NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE, &node); for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) { labels[1] = node_id; if (__kmp_tile_depth) { // NUMA + tiles int NT; int n_active_tiles = 0; tile = NULL; NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth, &tile); for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) { labels[2] = tl_id; int n_active_cores = 0; __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores, tile, 3, labels); if (n_active_cores) { // were there any active cores on the socket? ++n_active_tiles; // count active tiles per node if (n_active_cores > nCorePerTile) nCorePerTile = n_active_cores; // calc maximum } } if (n_active_tiles) { // were there any active tiles on the socket? 
++n_active_nodes; // count active nodes per package if (n_active_tiles > nTilePerNode) nTilePerNode = n_active_tiles; // calc maximum } } else { // NUMA, no tiles int n_active_cores = 0; __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores, node, 2, labels); if (n_active_cores) { // were there any active cores on the socket? ++n_active_nodes; // count active nodes per package if (n_active_cores > nCorePerNode) nCorePerNode = n_active_cores; // calc maximum } } } if (n_active_nodes) { // were there any active nodes on the socket? ++nPackages; // count total active packages if (n_active_nodes > nNodePerPkg) nNodePerPkg = n_active_nodes; // calc maximum } } else { if (__kmp_tile_depth) { // no NUMA, tiles int NT; int n_active_tiles = 0; tile = NULL; NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth, &tile); for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) { labels[1] = tl_id; int n_active_cores = 0; __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores, tile, 2, labels); if (n_active_cores) { // were there any active cores on the socket? ++n_active_tiles; // count active tiles per package if (n_active_cores > nCorePerTile) nCorePerTile = n_active_cores; // calc maximum } } if (n_active_tiles) { // were there any active tiles on the socket? ++nPackages; // count total active packages if (n_active_tiles > nTilePerPkg) nTilePerPkg = n_active_tiles; // calc maximum } } else { // no NUMA, no tiles int n_active_cores = 0; __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores, socket, 1, labels); if (n_active_cores) { // were there any active cores on the socket? ++nPackages; // count total active packages if (n_active_cores > nCoresPerPkg) nCoresPerPkg = n_active_cores; // calc maximum } } } } // If there's only one thread context to bind to, return now. 
KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc); KMP_ASSERT(nActiveThreads > 0); if (nActiveThreads == 1) { __kmp_ncores = nPackages = 1; __kmp_nThreadsPerCore = nCoresPerPkg = 1; if (__kmp_affinity_verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); if (__kmp_affinity_respect_mask) { KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); } else { KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); KMP_INFORM(Uniform, "KMP_AFFINITY"); KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, __kmp_nThreadsPerCore, __kmp_ncores); } if (__kmp_affinity_type == affinity_none) { __kmp_free(retval); KMP_CPU_FREE(oldMask); return 0; } // Form an Address object which only includes the package level. Address addr(1); addr.labels[0] = retval[0].first.labels[0]; retval[0].first = addr; if (__kmp_affinity_gran_levels < 0) { __kmp_affinity_gran_levels = 0; } if (__kmp_affinity_verbose) { __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); } *address2os = retval; KMP_CPU_FREE(oldMask); return 1; } // Sort the table by physical Id. qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels); // Check to see if the machine topology is uniform int nPUs = nPackages * __kmp_nThreadsPerCore; if (__kmp_numa_detected) { if (__kmp_tile_depth) { // NUMA + tiles nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile); } else { // NUMA, no tiles nPUs *= (nNodePerPkg * nCorePerNode); } } else { if (__kmp_tile_depth) { // no NUMA, tiles nPUs *= (nTilePerPkg * nCorePerTile); } else { // no NUMA, no tiles nPUs *= nCoresPerPkg; } } unsigned uniform = (nPUs == nActiveThreads); // Print the machine topology summary. 
if (__kmp_affinity_verbose) { char mask[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); if (__kmp_affinity_respect_mask) { KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); } else { KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (uniform) { KMP_INFORM(Uniform, "KMP_AFFINITY"); } else { KMP_INFORM(NonUniform, "KMP_AFFINITY"); } if (__kmp_numa_detected) { if (__kmp_tile_depth) { // NUMA + tiles KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg, nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores); } else { // NUMA, no tiles KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg, nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores); nPUs *= (nNodePerPkg * nCorePerNode); } } else { if (__kmp_tile_depth) { // no NUMA, tiles KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg, nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores); } else { // no NUMA, no tiles kmp_str_buf_t buf; __kmp_str_buf_init(&buf); __kmp_str_buf_print(&buf, "%d", nPackages); KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, __kmp_nThreadsPerCore, __kmp_ncores); __kmp_str_buf_free(&buf); } } } if (__kmp_affinity_type == affinity_none) { __kmp_free(retval); KMP_CPU_FREE(oldMask); return 0; } int depth_full = depth; // number of levels before compressing // Find any levels with radiix 1, and remove them from the map // (except for the package level). depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, levels); KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default); if (__kmp_affinity_gran_levels < 0) { // Set the granularity level based on what levels are modeled // in the machine topology map. __kmp_affinity_gran_levels = 0; // lowest level (e.g. 
// fine)
  if (__kmp_affinity_gran > affinity_gran_thread) {
      // Count how many of the deeper (sub-package) levels are actually
      // present; each present level coarsens the granularity by one.
      for (int i = 1; i <= depth_full; ++i) {
        if (__kmp_affinity_gran <= i) // only count deeper levels
          break;
        if (levels[depth_full - i] > 0)
          __kmp_affinity_gran_levels++;
      }
    }
    if (__kmp_affinity_gran > affinity_gran_package)
      __kmp_affinity_gran_levels++; // e.g. granularity = group
  }

  if (__kmp_affinity_verbose)
    __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);

  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's: a one-level topology in which
// every available OS proc counts as its own package.
//
// On success writes the map to *address2os and returns its depth (1); returns
// 0 when no map is needed (affinity disabled), never fails (-1 unused here).
// Side effects: sets the globals __kmp_ncores, nPackages,
// __kmp_nThreadsPerCore, nCoresPerPkg, and allocates __kmp_pu_os_idx.
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Even if __kmp_affinity_type == affinity_none, this routine might still be
  // called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Not affinity capable: base the counts on __kmp_xproc (total procs
    // reported by the OS) since we cannot query individual procs.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              __kmp_affin_fullMask);

    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  if (__kmp_affinity_type == affinity_none) {
    // Affinity disabled: still record the OS proc index of every proc in the
    // full mask, but build no address map.
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
        continue;
      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
    }
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    // Depth-1 address: the single label is the OS proc id itself.
    Address addr(1);
    addr.labels[0] = i;
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map,
    // so the #levels of granularity is either 0 or 1.
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels = 1;
    } else {
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 1;
}

#if KMP_GROUP_AFFINITY

// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
  if (!KMP_AFFINITY_CAPABLE()) {
    // FIXME set *msg_id
    return -1;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    // Depth-2 address: label 0 is the Windows processor group (64 procs per
    // group on 64-bit, i.e. CHAR_BIT * sizeof(DWORD_PTR)); label 1 is the
    // proc's index within that group.
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      // Granularities between thread and group (core/package/node) are not
      // representable in this 2-level map; fall back to thread granularity.
      const char *gran_str = NULL;
      if (__kmp_affinity_gran == affinity_gran_core) {
        gran_str = "core";
      } else if (__kmp_affinity_gran == affinity_gran_package) {
        gran_str = "package";
      } else if (__kmp_affinity_gran == affinity_gran_node) {
        gran_str = "node";
      } else {
        KMP_ASSERT(0);
      }

      // Warning: can't use affinity granularity \"gran\" with group topology
      // method, using "thread"
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}

#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

// Return the number of bits needed to represent values in [0, count), i.e.
// the smallest r such that (1 << r) >= count. Used to decode the Apic Id
// bit fields reported by cpuid.
static int __kmp_cpuid_mask_width(int count) {
  int r = 0;
  while ((1 << r) < count)
    ++r;
  return r;
}

// Per-OS-proc record gathered by binding to the proc and issuing cpuid;
// the first four fields come directly from the hardware, the last three
// are decoded from apicId using the max* field widths.
class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

// qsort comparator: orders apicThreadInfo records by (pkgId, coreId,
// threadId) ascending; returns -1/0/1.
static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return -1;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and _kmp_get_system_affinity(), then we
  // need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the
    // encoded value.
    //
    // The author of cpu_count.cpp treated this only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block
    // of code.
    //
    // First, we need to check if cpuid(4) is supported on this chip.
// To see if cpuid(n) is supported, issue cpuid(0) and check if eax has the
// value n or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We
    // have seen quite a few machines where maxThreadsPerPkg is 2, yet the
    // machine does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do
    //   not support HT.
    // - The performance penalty for mistakenly identifying a machine as HT
    //   when it isn't (which results in blocktime being incorrectly set to 0)
    //   is greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
  //   value of this field determines the width of the core# + thread# fields
  //   in the Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is
  //   not exact. In particular, on certain OS/chip combinations where
  //   Intel(R) Hyper-Threading Technology is supported by the chip but has
  //   been disabled, the value of this field will be 2 (for a single core
  //   chip). On other OS/chip combinations supporting Intel(R)
  //   Hyper-Threading Technology, the value of this field will be 1 when
  //   Intel(R) Hyper-Threading Technology is disabled and 2 when it is
  //   enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    // Bind to proc i so the cpuid results below describe that proc.
    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      // edx bit 9 is the APIC-on-chip feature flag; without it this method
      // cannot work. Restore the mask and bail out.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return -1;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see
    // if cpuid(n) is supported, issue cpuid(0) and check if eax has the value
    // n or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained
    // locally: the Apic Id is pkg# : core# : thread#, with field widths
    // derived from the max counts reported above.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, form an Address object
  // with depth 1 and return immediately (or, if affinity is off, set
  // address2os to NULL and return).
  //
  // If it is configured to omit the package level when there is only a single
  // package, the logic at the end of this routine won't work if there is only
  // a single thread - it would try to form an Address object with depth 0.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
    Address addr(1);
    addr.labels[0] = threadInfo[0].pkgId;
    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned
  // among the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or
  // the total # packages) are at this point - we want to determine that now.
  // We only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consist checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  // Single pass over the sorted table: count packages, and track the maximum
  // cores-per-package and threads-per-core seen so far.
  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars,
      // though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      // Two entries with identical (pkg, core, thread) ids - the legacy
      // 8-bit Apic Ids are not unique on this machine; cannot proceed.
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return -1;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }
  }
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, &
  // nPackages. Make sure all these vars are set correctly, and return now if
  // affinity is not enabled.
  __kmp_ncores = nCores;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (__kmp_affinity_uniform_topology()) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (i = 0; i < nApics; ++i) {
    __kmp_pu_os_idx[i] = threadInfo[i].osId;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  // Levels with only one entry (e.g. 1 core/pkg) are omitted from the map;
  // the level indices below record where each surviving level landed.
  int pkgLevel = 0;
  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
  int threadLevel =
      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

  KMP_ASSERT(depth > 0);
  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

  for (i = 0; i < nApics; ++i) {
    Address addr(depth);
    unsigned os = threadInfo[i].osId;
    int d = 0;

    if (pkgLevel >= 0) {
      addr.labels[d++] = threadInfo[i].pkgId;
    }
    if (coreLevel >= 0) {
      addr.labels[d++] = threadInfo[i].coreId;
    }
    if (threadLevel >= 0) {
      addr.labels[d++] = threadInfo[i].threadId;
    }
    (*address2os)[i] = AddrUnsPair(addr, os);
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled in the
    // machine topology map.
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  __kmp_free(threadInfo);
  KMP_CPU_FREE(oldMask);
  return depth;
}

// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
                                              kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check to see if cpuid leaf 11 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 11) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }
  __kmp_x86_cpuid(11, 0, &buf);
  if (buf.ebx == 0) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }

  // Find the number of levels in the machine topology. While we're at it, get
  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try
  // to get more accurate values later by explicitly counting them, but get
  // reasonable defaults now, in case we return early.
  int level;
  int threadLevel = -1;
  int coreLevel = -1;
  int pkgLevel = -1;
  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

  for (level = 0;; level++) {
    if (level > 31) {
      // FIXME: Hack for DPD200163180
      //
      // If level is big then something went wrong -> exiting
      //
      // There could actually be 32 valid levels in the machine topology, but
      // so far, the only machine we have seen which does not exit this loop
      // before iteration 32 has fubar x2APIC settings.
      //
      // For now, just reject this case based upon loop trip count.
*msg_id = kmp_i18n_str_InvalidCpuidInfo; return -1; } __kmp_x86_cpuid(11, level, &buf); if (buf.ebx == 0) { if (pkgLevel < 0) { // Will infer nPackages from __kmp_xproc pkgLevel = level; level++; } break; } int kind = (buf.ecx >> 8) & 0xff; if (kind == 1) { // SMT level threadLevel = level; coreLevel = -1; pkgLevel = -1; __kmp_nThreadsPerCore = buf.ebx & 0xffff; if (__kmp_nThreadsPerCore == 0) { *msg_id = kmp_i18n_str_InvalidCpuidInfo; return -1; } } else if (kind == 2) { // core level coreLevel = level; pkgLevel = -1; nCoresPerPkg = buf.ebx & 0xffff; if (nCoresPerPkg == 0) { *msg_id = kmp_i18n_str_InvalidCpuidInfo; return -1; } } else { if (level <= 0) { *msg_id = kmp_i18n_str_InvalidCpuidInfo; return -1; } if (pkgLevel >= 0) { continue; } pkgLevel = level; nPackages = buf.ebx & 0xffff; if (nPackages == 0) { *msg_id = kmp_i18n_str_InvalidCpuidInfo; return -1; } } } int depth = level; // In the above loop, "level" was counted from the finest level (usually // thread) to the coarsest. The caller expects that we will place the labels // in (*address2os)[].first.labels[] in the inverse order, so we need to // invert the vars saying which level means what. if (threadLevel >= 0) { threadLevel = depth - threadLevel - 1; } if (coreLevel >= 0) { coreLevel = depth - coreLevel - 1; } KMP_DEBUG_ASSERT(pkgLevel >= 0); pkgLevel = depth - pkgLevel - 1; // The algorithm used starts by setting the affinity to each available thread // and retrieving info from the cpuid instruction, so if we are not capable of // calling __kmp_get_system_affinity() and _kmp_get_system_affinity(), then we // need to do something else - use the defaults that we calculated from // issuing cpuid without binding to each proc. if (!KMP_AFFINITY_CAPABLE()) { // Hack to try and infer the machine topology using only the data // available from cpuid on the current thread, and __kmp_xproc. 
KMP_ASSERT(__kmp_affinity_type == affinity_none); __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; if (__kmp_affinity_verbose) { KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (__kmp_affinity_uniform_topology()) { KMP_INFORM(Uniform, "KMP_AFFINITY"); } else { KMP_INFORM(NonUniform, "KMP_AFFINITY"); } KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, __kmp_nThreadsPerCore, __kmp_ncores); } return 0; } // From here on, we can assume that it is safe to call // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if // __kmp_affinity_type = affinity_none. // Save the affinity mask for the current thread. kmp_affin_mask_t *oldMask; KMP_CPU_ALLOC(oldMask); __kmp_get_system_affinity(oldMask, TRUE); // Allocate the data structure to be returned. AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); // Run through each of the available contexts, binding the current thread // to it, and obtaining the pertinent information using the cpuid instr. unsigned int proc; int nApics = 0; KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { // Skip this proc if it is not included in the machine model. if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { continue; } KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); __kmp_affinity_dispatch->bind_thread(proc); // Extract labels for each level in the machine topology map from Apic ID. 
Address addr(depth); int prev_shift = 0; for (level = 0; level < depth; level++) { __kmp_x86_cpuid(11, level, &buf); unsigned apicId = buf.edx; if (buf.ebx == 0) { if (level != depth - 1) { KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_InconsistentCpuidInfo; return -1; } addr.labels[depth - level - 1] = apicId >> prev_shift; level++; break; } int shift = buf.eax & 0x1f; int mask = (1 << shift) - 1; addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; prev_shift = shift; } if (level != depth) { KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_InconsistentCpuidInfo; return -1; } retval[nApics] = AddrUnsPair(addr, proc); nApics++; } // We've collected all the info we need. // Restore the old affinity mask for this thread. __kmp_set_system_affinity(oldMask, TRUE); // If there's only one thread context to bind to, return now. KMP_ASSERT(nApics > 0); if (nApics == 1) { __kmp_ncores = nPackages = 1; __kmp_nThreadsPerCore = nCoresPerPkg = 1; if (__kmp_affinity_verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); if (__kmp_affinity_respect_mask) { KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); } else { KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); KMP_INFORM(Uniform, "KMP_AFFINITY"); KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, __kmp_nThreadsPerCore, __kmp_ncores); } if (__kmp_affinity_type == affinity_none) { __kmp_free(retval); KMP_CPU_FREE(oldMask); return 0; } // Form an Address object which only includes the package level. 
Address addr(1); addr.labels[0] = retval[0].first.labels[pkgLevel]; retval[0].first = addr; if (__kmp_affinity_gran_levels < 0) { __kmp_affinity_gran_levels = 0; } if (__kmp_affinity_verbose) { __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); } *address2os = retval; KMP_CPU_FREE(oldMask); return 1; } // Sort the table by physical Id. qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); // Find the radix at each of the levels. unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); for (level = 0; level < depth; level++) { totals[level] = 1; maxCt[level] = 1; counts[level] = 1; last[level] = retval[0].first.labels[level]; } // From here on, the iteration variable "level" runs from the finest level to // the coarsest, i.e. we iterate forward through // (*address2os)[].first.labels[] - in the previous loops, we iterated // backwards. for (proc = 1; (int)proc < nApics; proc++) { int level; for (level = 0; level < depth; level++) { if (retval[proc].first.labels[level] != last[level]) { int j; for (j = level + 1; j < depth; j++) { totals[j]++; counts[j] = 1; // The line below causes printing incorrect topology information in // case the max value for some level (maxCt[level]) is encountered // earlier than some less value while going through the array. For // example, let pkg0 has 4 cores and pkg1 has 2 cores. Then // maxCt[1] == 2 // whereas it must be 4. // TODO!!! 
Check if it can be commented safely // maxCt[j] = 1; last[j] = retval[proc].first.labels[j]; } totals[level]++; counts[level]++; if (counts[level] > maxCt[level]) { maxCt[level] = counts[level]; } last[level] = retval[proc].first.labels[level]; break; } else if (level == depth - 1) { __kmp_free(last); __kmp_free(maxCt); __kmp_free(counts); __kmp_free(totals); __kmp_free(retval); KMP_CPU_FREE(oldMask); *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; return -1; } } } // When affinity is off, this routine will still be called to set // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. // Make sure all these vars are set correctly, and return if affinity is not // enabled. if (threadLevel >= 0) { __kmp_nThreadsPerCore = maxCt[threadLevel]; } else { __kmp_nThreadsPerCore = 1; } nPackages = totals[pkgLevel]; if (coreLevel >= 0) { __kmp_ncores = totals[coreLevel]; nCoresPerPkg = maxCt[coreLevel]; } else { __kmp_ncores = nPackages; nCoresPerPkg = 1; } // Check to see if the machine topology is uniform unsigned prod = maxCt[0]; for (level = 1; level < depth; level++) { prod *= maxCt[level]; } bool uniform = (prod == totals[level - 1]); // Print the machine topology summary. 
if (__kmp_affinity_verbose) { char mask[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); if (__kmp_affinity_respect_mask) { KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); } else { KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (uniform) { KMP_INFORM(Uniform, "KMP_AFFINITY"); } else { KMP_INFORM(NonUniform, "KMP_AFFINITY"); } kmp_str_buf_t buf; __kmp_str_buf_init(&buf); __kmp_str_buf_print(&buf, "%d", totals[0]); for (level = 1; level <= pkgLevel; level++) { __kmp_str_buf_print(&buf, " x %d", maxCt[level]); } KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, __kmp_nThreadsPerCore, __kmp_ncores); __kmp_str_buf_free(&buf); } KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); for (proc = 0; (int)proc < nApics; ++proc) { __kmp_pu_os_idx[proc] = retval[proc].second; } if (__kmp_affinity_type == affinity_none) { __kmp_free(last); __kmp_free(maxCt); __kmp_free(counts); __kmp_free(totals); __kmp_free(retval); KMP_CPU_FREE(oldMask); return 0; } // Find any levels with radiix 1, and remove them from the map // (except for the package level). int new_depth = 0; for (level = 0; level < depth; level++) { if ((maxCt[level] == 1) && (level != pkgLevel)) { continue; } new_depth++; } // If we are removing any levels, allocate a new vector to return, // and copy the relevant information to it. 
if (new_depth != depth) { AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); for (proc = 0; (int)proc < nApics; proc++) { Address addr(new_depth); new_retval[proc] = AddrUnsPair(addr, retval[proc].second); } int new_level = 0; int newPkgLevel = -1; int newCoreLevel = -1; int newThreadLevel = -1; for (level = 0; level < depth; level++) { if ((maxCt[level] == 1) && (level != pkgLevel)) { // Remove this level. Never remove the package level continue; } if (level == pkgLevel) { newPkgLevel = new_level; } if (level == coreLevel) { newCoreLevel = new_level; } if (level == threadLevel) { newThreadLevel = new_level; } for (proc = 0; (int)proc < nApics; proc++) { new_retval[proc].first.labels[new_level] = retval[proc].first.labels[level]; } new_level++; } __kmp_free(retval); retval = new_retval; depth = new_depth; pkgLevel = newPkgLevel; coreLevel = newCoreLevel; threadLevel = newThreadLevel; } if (__kmp_affinity_gran_levels < 0) { // Set the granularity level based on what levels are modeled // in the machine topology map. 
__kmp_affinity_gran_levels = 0; if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { __kmp_affinity_gran_levels++; } if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { __kmp_affinity_gran_levels++; } if (__kmp_affinity_gran > affinity_gran_package) { __kmp_affinity_gran_levels++; } } if (__kmp_affinity_verbose) { __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel, threadLevel); } __kmp_free(last); __kmp_free(maxCt); __kmp_free(counts); __kmp_free(totals); KMP_CPU_FREE(oldMask); *address2os = retval; return depth; } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ #define osIdIndex 0 #define threadIdIndex 1 #define coreIdIndex 2 #define pkgIdIndex 3 #define nodeIdIndex 4 typedef unsigned *ProcCpuInfo; static unsigned maxIndex = pkgIdIndex; static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b) { unsigned i; const unsigned *aa = *(unsigned *const *)a; const unsigned *bb = *(unsigned *const *)b; for (i = maxIndex;; i--) { if (aa[i] < bb[i]) return -1; if (aa[i] > bb[i]) return 1; if (i == osIdIndex) break; } return 0; } #if KMP_USE_HIER_SCHED // Set the array sizes for the hierarchy layers static void __kmp_dispatch_set_hierarchy_values() { // Set the maximum number of L1's to number of cores // Set the maximum number of L2's to to either number of cores / 2 for // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing // Or the number of cores for Intel(R) Xeon(R) processors // Set the maximum number of NUMA nodes and L3's to number of packages __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; -#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) +#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) if (__kmp_mic_type >= mic3) __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; else #endif // 
KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS)
  // Default (non-KNL path): one L2 unit per core.
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
  __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
  // Set the number of threads per unit
  // Number of hardware threads per L1/L2/L3/NUMA/LOOP
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
      __kmp_nThreadsPerCore;
-#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
+#if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS)
  // mic3 and newer: an L2 unit spans two cores, hence twice the hardware
  // threads of a single core.
  if (__kmp_mic_type >= mic3)
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        2 * __kmp_nThreadsPerCore;
  else
#endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS)
    __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
        __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
      nCoresPerPkg * __kmp_nThreadsPerCore;
  __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
      nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
}

// Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
// i.e., this thread's L1 or this thread's L2, etc.
int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
  int index = type + 1;
  int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
  KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
  // At the thread layer the index is the tid itself; the loop layer has
  // exactly one unit, so its index is always 0.
  if (type == kmp_hier_layer_e::LAYER_THREAD)
    return tid;
  else if (type == kmp_hier_layer_e::LAYER_LOOP)
    return 0;
  KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
  // Wrap tids that exceed the number of hardware threads in the machine.
  if (tid >= num_hw_threads)
    tid = tid % num_hw_threads;
  return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
}

// Return the number of t1's per t2
int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
  int i1 = t1 + 1;
  int i2 = t2 + 1;
  KMP_DEBUG_ASSERT(i1 <= i2); // requires t1 <= t2
  KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
  KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
  KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
  // (nthreads/t2) / (nthreads/t1) = t1 / t2
  return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
}
#endif // KMP_USE_HIER_SCHED

// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
// affinity map.
static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
                                             int *line,
                                             kmp_i18n_id_t *const msg_id,
                                             FILE *f) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Scan of the file, and count the number of "processor" (osId) fields,
  // and find the highest level <n> seen in any "node_<n> id" field.
  char buf[256];
  unsigned num_records = 0;
  while (!feof(f)) {
    // Sentinel byte: cleared by fgets only when the line fills the buffer,
    // which lets the second pass detect over-long lines.
    buf[sizeof(buf) - 1] = 1;
    if (!fgets(buf, sizeof(buf), f)) {
      // Read errors presumably because of EOF
      break;
    }

    char s1[] = "processor";
    if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
      num_records++;
      continue;
    }

    // FIXME - this will match "node_<n> <garbage>"
    unsigned level;
    if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
      if (nodeIdIndex + level >= maxIndex) {
        maxIndex = nodeIdIndex + level;
      }
      continue;
    }
  }

  // Check for empty file / no valid processor records, or too many. The number
  // of records can't exceed the number of valid bits in the affinity mask.
  if (num_records == 0) {
    *line = 0;
    *msg_id = kmp_i18n_str_NoProcRecords;
    return -1;
  }
  if (num_records > (unsigned)__kmp_xproc) {
    *line = 0;
    *msg_id = kmp_i18n_str_TooManyProcRecords;
    return -1;
  }

  // Set the file pointer back to the beginning, so that we can scan the file
  // again, this time performing a full parse of the data. Allocate a vector of
  // ProcCpuInfo object, where we will place the data. Adding an extra element
  // at the end allows us to remove a lot of extra checks for termination
  // conditions.
  if (fseek(f, 0, SEEK_SET) != 0) {
    *line = 0;
    *msg_id = kmp_i18n_str_CantRewindCpuinfo;
    return -1;
  }

  // Allocate the array of records to store the proc info in. The dummy
  // element at the end makes the logic in filling them out easier to code.
  unsigned **threadInfo =
      (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
  unsigned i;
  for (i = 0; i <= num_records; i++) {
    threadInfo[i] =
        (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
  }

// Free every per-record row, then the row-pointer array itself.
#define CLEANUP_THREAD_INFO                                                    \
  for (i = 0; i <= num_records; i++) {                                         \
    __kmp_free(threadInfo[i]);                                                 \
  }                                                                            \
  __kmp_free(threadInfo);

  // A value of UINT_MAX means that we didn't find the field
  unsigned __index;

// Mark every field of record (p) as "not seen yet" (UINT_MAX).
#define INIT_PROC_INFO(p)                                                      \
  for (__index = 0; __index <= maxIndex; __index++) {                          \
    (p)[__index] = UINT_MAX;                                                   \
  }

  for (i = 0; i <= num_records; i++) {
    INIT_PROC_INFO(threadInfo[i]);
  }

  unsigned num_avail = 0;
  *line = 0;
  while (!feof(f)) {
    // Create an inner scoping level, so that all the goto targets at the end of
    // the loop appear in an outer scoping level. This avoids warnings about
    // jumping past an initialization to a target in the same block.
    {
      buf[sizeof(buf) - 1] = 1;
      bool long_line = false;
      if (!fgets(buf, sizeof(buf), f)) {
        // Read errors presumably because of EOF
        // If there is valid data in threadInfo[num_avail], then fake
        // a blank line to ensure that the last address gets parsed.
bool valid = false; for (i = 0; i <= maxIndex; i++) { if (threadInfo[num_avail][i] != UINT_MAX) { valid = true; } } if (!valid) { break; } buf[0] = 0; } else if (!buf[sizeof(buf) - 1]) { // The line is longer than the buffer. Set a flag and don't // emit an error if we were going to ignore the line, anyway. long_line = true; #define CHECK_LINE \ if (long_line) { \ CLEANUP_THREAD_INFO; \ *msg_id = kmp_i18n_str_LongLineCpuinfo; \ return -1; \ } } (*line)++; char s1[] = "processor"; if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { CHECK_LINE; char *p = strchr(buf + sizeof(s1) - 1, ':'); unsigned val; if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; if (threadInfo[num_avail][osIdIndex] != UINT_MAX) #if KMP_ARCH_AARCH64 // Handle the old AArch64 /proc/cpuinfo layout differently, // it contains all of the 'processor' entries listed in a // single 'Processor' section, therefore the normal looking // for duplicates in that section will always fail. num_avail++; #else goto dup_field; #endif threadInfo[num_avail][osIdIndex] = val; #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) char path[256]; KMP_SNPRINTF( path, sizeof(path), "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", threadInfo[num_avail][osIdIndex]); __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); KMP_SNPRINTF(path, sizeof(path), "/sys/devices/system/cpu/cpu%u/topology/core_id", threadInfo[num_avail][osIdIndex]); __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); continue; #else } char s2[] = "physical id"; if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { CHECK_LINE; char *p = strchr(buf + sizeof(s2) - 1, ':'); unsigned val; if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; threadInfo[num_avail][pkgIdIndex] = val; continue; } char s3[] = "core id"; if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { CHECK_LINE; char *p = strchr(buf + sizeof(s3) - 1, ':'); 
unsigned val; if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; threadInfo[num_avail][coreIdIndex] = val; continue; #endif // KMP_OS_LINUX && USE_SYSFS_INFO } char s4[] = "thread id"; if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { CHECK_LINE; char *p = strchr(buf + sizeof(s4) - 1, ':'); unsigned val; if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; threadInfo[num_avail][threadIdIndex] = val; continue; } unsigned level; if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { CHECK_LINE; char *p = strchr(buf + sizeof(s4) - 1, ':'); unsigned val; if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; KMP_ASSERT(nodeIdIndex + level <= maxIndex); if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; threadInfo[num_avail][nodeIdIndex + level] = val; continue; } // We didn't recognize the leading token on the line. There are lots of // leading tokens that we don't recognize - if the line isn't empty, go on // to the next line. if ((*buf != 0) && (*buf != '\n')) { // If the line is longer than the buffer, read characters // until we find a newline. if (long_line) { int ch; while (((ch = fgetc(f)) != EOF) && (ch != '\n')) ; } continue; } // A newline has signalled the end of the processor record. // Check that there aren't too many procs specified. if ((int)num_avail == __kmp_xproc) { CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_TooManyEntries; return -1; } // Check for missing fields. The osId field must be there, and we // currently require that the physical id field is specified, also. 
if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_MissingProcField; return -1; } if (threadInfo[0][pkgIdIndex] == UINT_MAX) { CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_MissingPhysicalIDField; return -1; } // Skip this proc if it is not included in the machine model. if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) { INIT_PROC_INFO(threadInfo[num_avail]); continue; } // We have a successful parse of this proc's info. // Increment the counter, and prepare for the next proc. num_avail++; KMP_ASSERT(num_avail <= num_records); INIT_PROC_INFO(threadInfo[num_avail]); } continue; no_val: CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_MissingValCpuinfo; return -1; dup_field: CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; return -1; } *line = 0; #if KMP_MIC && REDUCE_TEAM_SIZE unsigned teamSize = 0; #endif // KMP_MIC && REDUCE_TEAM_SIZE // check for num_records == __kmp_xproc ??? // If there's only one thread context to bind to, form an Address object with // depth 1 and return immediately (or, if affinity is off, set address2os to // NULL and return). // // If it is configured to omit the package level when there is only a single // package, the logic at the end of this routine won't work if there is only a // single thread - it would try to form an Address object with depth 0. 
KMP_ASSERT(num_avail > 0); KMP_ASSERT(num_avail <= num_records); if (num_avail == 1) { __kmp_ncores = 1; __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; if (__kmp_affinity_verbose) { if (!KMP_AFFINITY_CAPABLE()) { KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); KMP_INFORM(Uniform, "KMP_AFFINITY"); } else { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask); KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); if (__kmp_affinity_respect_mask) { KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); } else { KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); KMP_INFORM(Uniform, "KMP_AFFINITY"); } int index; kmp_str_buf_t buf; __kmp_str_buf_init(&buf); __kmp_str_buf_print(&buf, "1"); for (index = maxIndex - 1; index > pkgIdIndex; index--) { __kmp_str_buf_print(&buf, " x 1"); } KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); __kmp_str_buf_free(&buf); } if (__kmp_affinity_type == affinity_none) { CLEANUP_THREAD_INFO; return 0; } *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); Address addr(1); addr.labels[0] = threadInfo[0][pkgIdIndex]; (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); if (__kmp_affinity_gran_levels < 0) { __kmp_affinity_gran_levels = 0; } if (__kmp_affinity_verbose) { __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); } CLEANUP_THREAD_INFO; return 1; } // Sort the threadInfo table by physical Id. qsort(threadInfo, num_avail, sizeof(*threadInfo), __kmp_affinity_cmp_ProcCpuInfo_phys_id); // The table is now sorted by pkgId / coreId / threadId, but we really don't // know the radix of any of the fields. pkgId's may be sparsely assigned among // the chips on a system. Although coreId's are usually assigned // [0 .. 
coresPerPkg-1] and threadId's are usually assigned // [0..threadsPerCore-1], we don't want to make any such assumptions. // // For that matter, we don't know what coresPerPkg and threadsPerCore (or the // total # packages) are at this point - we want to determine that now. We // only have an upper bound on the first two figures. unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); bool assign_thread_ids = false; unsigned threadIdCt; unsigned index; restart_radix_check: threadIdCt = 0; // Initialize the counter arrays with data from threadInfo[0]. if (assign_thread_ids) { if (threadInfo[0][threadIdIndex] == UINT_MAX) { threadInfo[0][threadIdIndex] = threadIdCt++; } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { threadIdCt = threadInfo[0][threadIdIndex] + 1; } } for (index = 0; index <= maxIndex; index++) { counts[index] = 1; maxCt[index] = 1; totals[index] = 1; lastId[index] = threadInfo[0][index]; ; } // Run through the rest of the OS procs. for (i = 1; i < num_avail; i++) { // Find the most significant index whose id differs from the id for the // previous OS proc. for (index = maxIndex; index >= threadIdIndex; index--) { if (assign_thread_ids && (index == threadIdIndex)) { // Auto-assign the thread id field if it wasn't specified. if (threadInfo[i][threadIdIndex] == UINT_MAX) { threadInfo[i][threadIdIndex] = threadIdCt++; } // Apparently the thread id field was specified for some entries and not // others. Start the thread id counter off at the next higher thread id. 
else if (threadIdCt <= threadInfo[i][threadIdIndex]) { threadIdCt = threadInfo[i][threadIdIndex] + 1; } } if (threadInfo[i][index] != lastId[index]) { // Run through all indices which are less significant, and reset the // counts to 1. At all levels up to and including index, we need to // increment the totals and record the last id. unsigned index2; for (index2 = threadIdIndex; index2 < index; index2++) { totals[index2]++; if (counts[index2] > maxCt[index2]) { maxCt[index2] = counts[index2]; } counts[index2] = 1; lastId[index2] = threadInfo[i][index2]; } counts[index]++; totals[index]++; lastId[index] = threadInfo[i][index]; if (assign_thread_ids && (index > threadIdIndex)) { #if KMP_MIC && REDUCE_TEAM_SIZE // The default team size is the total #threads in the machine // minus 1 thread for every core that has 3 or more threads. teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); #endif // KMP_MIC && REDUCE_TEAM_SIZE // Restart the thread counter, as we are on a new core. threadIdCt = 0; // Auto-assign the thread id field if it wasn't specified. if (threadInfo[i][threadIdIndex] == UINT_MAX) { threadInfo[i][threadIdIndex] = threadIdCt++; } // Aparrently the thread id field was specified for some entries and // not others. Start the thread id counter off at the next higher // thread id. else if (threadIdCt <= threadInfo[i][threadIdIndex]) { threadIdCt = threadInfo[i][threadIdIndex] + 1; } } break; } } if (index < threadIdIndex) { // If thread ids were specified, it is an error if they are not unique. // Also, check that we waven't already restarted the loop (to be safe - // shouldn't need to). 
      // Thread ids were explicitly given (or we already auto-assigned them)
      // yet a full record matched its predecessor at every level: the ids are
      // not unique, which is a hard error.
      if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
        __kmp_free(lastId);
        __kmp_free(totals);
        __kmp_free(maxCt);
        __kmp_free(counts);
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
        return -1;
      }

      // If the thread ids were not specified and we see entries that
      // are duplicates, start the loop over and assign the thread ids manually.
      assign_thread_ids = true;
      goto restart_radix_check;
    }
  }

#if KMP_MIC && REDUCE_TEAM_SIZE
  // The default team size is the total #threads in the machine
  // minus 1 thread for every core that has 3 or more threads.
  teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
#endif // KMP_MIC && REDUCE_TEAM_SIZE

  // Fold the still-open counts for the final run of records into the maxima.
  for (index = threadIdIndex; index <= maxIndex; index++) {
    if (counts[index] > maxCt[index]) {
      maxCt[index] = counts[index];
    }
  }

  __kmp_nThreadsPerCore = maxCt[threadIdIndex];
  nCoresPerPkg = maxCt[coreIdIndex];
  nPackages = totals[pkgIdIndex];

  // Check to see if the machine topology is uniform: the product of the
  // per-level maxima must equal the total number of hardware threads seen.
  unsigned prod = totals[maxIndex];
  for (index = threadIdIndex; index < maxIndex; index++) {
    prod *= maxCt[index];
  }
  bool uniform = (prod == totals[threadIdIndex]);

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
__kmp_ncores = totals[coreIdIndex]; if (__kmp_affinity_verbose) { if (!KMP_AFFINITY_CAPABLE()) { KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (uniform) { KMP_INFORM(Uniform, "KMP_AFFINITY"); } else { KMP_INFORM(NonUniform, "KMP_AFFINITY"); } } else { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask); KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); if (__kmp_affinity_respect_mask) { KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); } else { KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); } KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); if (uniform) { KMP_INFORM(Uniform, "KMP_AFFINITY"); } else { KMP_INFORM(NonUniform, "KMP_AFFINITY"); } } kmp_str_buf_t buf; __kmp_str_buf_init(&buf); __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); for (index = maxIndex - 1; index >= pkgIdIndex; index--) { __kmp_str_buf_print(&buf, " x %d", maxCt[index]); } KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], maxCt[threadIdIndex], __kmp_ncores); __kmp_str_buf_free(&buf); } #if KMP_MIC && REDUCE_TEAM_SIZE // Set the default team size. 
if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { __kmp_dflt_team_nth = teamSize; KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " "__kmp_dflt_team_nth = %d\n", __kmp_dflt_team_nth)); } #endif // KMP_MIC && REDUCE_TEAM_SIZE KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); for (i = 0; i < num_avail; ++i) { // fill the os indices __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; } if (__kmp_affinity_type == affinity_none) { __kmp_free(lastId); __kmp_free(totals); __kmp_free(maxCt); __kmp_free(counts); CLEANUP_THREAD_INFO; return 0; } // Count the number of levels which have more nodes at that level than at the // parent's level (with there being an implicit root node of the top level). // This is equivalent to saying that there is at least one node at this level // which has a sibling. These levels are in the map, and the package level is // always in the map. bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); for (index = threadIdIndex; index < maxIndex; index++) { KMP_ASSERT(totals[index] >= totals[index + 1]); inMap[index] = (totals[index] > totals[index + 1]); } inMap[maxIndex] = (totals[maxIndex] > 1); inMap[pkgIdIndex] = true; int depth = 0; for (index = threadIdIndex; index <= maxIndex; index++) { if (inMap[index]) { depth++; } } KMP_ASSERT(depth > 0); // Construct the data structure that is to be returned. 
*address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); int pkgLevel = -1; int coreLevel = -1; int threadLevel = -1; for (i = 0; i < num_avail; ++i) { Address addr(depth); unsigned os = threadInfo[i][osIdIndex]; int src_index; int dst_index = 0; for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { if (!inMap[src_index]) { continue; } addr.labels[dst_index] = threadInfo[i][src_index]; if (src_index == pkgIdIndex) { pkgLevel = dst_index; } else if (src_index == coreIdIndex) { coreLevel = dst_index; } else if (src_index == threadIdIndex) { threadLevel = dst_index; } dst_index++; } (*address2os)[i] = AddrUnsPair(addr, os); } if (__kmp_affinity_gran_levels < 0) { // Set the granularity level based on what levels are modeled // in the machine topology map. unsigned src_index; __kmp_affinity_gran_levels = 0; for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { if (!inMap[src_index]) { continue; } switch (src_index) { case threadIdIndex: if (__kmp_affinity_gran > affinity_gran_thread) { __kmp_affinity_gran_levels++; } break; case coreIdIndex: if (__kmp_affinity_gran > affinity_gran_core) { __kmp_affinity_gran_levels++; } break; case pkgIdIndex: if (__kmp_affinity_gran > affinity_gran_package) { __kmp_affinity_gran_levels++; } break; } } } if (__kmp_affinity_verbose) { __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, coreLevel, threadLevel); } __kmp_free(inMap); __kmp_free(lastId); __kmp_free(totals); __kmp_free(maxCt); __kmp_free(counts); CLEANUP_THREAD_INFO; return depth; } // Create and return a table of affinity masks, indexed by OS thread ID. // This routine handles OR'ing together all the affinity masks of threads // that are sufficiently close, if granularity > fine. static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, AddrUnsPair *address2os, unsigned numAddrs) { // First form a table of affinity masks in order of OS thread id. 
  unsigned depth;
  unsigned maxOsId;
  unsigned i;

  KMP_ASSERT(numAddrs > 0);
  depth = address2os[0].first.depth;

  // Find the highest OS proc id referenced by any entry; it bounds the
  // osId2Mask table size.
  maxOsId = 0;
  for (i = numAddrs - 1;; --i) {
    unsigned osId = address2os[i].second;
    if (osId > maxOsId) {
      maxOsId = osId;
    }
    if (i == 0)
      break;
  }
  kmp_affin_mask_t *osId2Mask;
  KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));

  // Sort the address2os table according to physical order. Doing so will put
  // all threads on the same core/package/node in consecutive locations.
  qsort(address2os, numAddrs, sizeof(*address2os),
        __kmp_affinity_cmp_Address_labels);

  KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
  if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
    KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
  }
  if (__kmp_affinity_gran_levels >= (int)depth) {
    // Granularity is coarser than the whole topology map: every thread may
    // float across all procs, so warn (if warnings are enabled).
    if (__kmp_affinity_verbose ||
        (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
      KMP_WARNING(AffThreadsMayMigrate);
    }
  }

  // Run through the table, forming the masks for all threads on each core.
  // Threads on the same core will have identical "Address" objects, not
  // considering the last level, which must be the thread id. All threads on a
  // core will appear consecutively.
  unsigned unique = 0;
  unsigned j = 0; // index of 1st thread on core
  unsigned leader = 0;
  Address *leaderAddr = &(address2os[0].first);
  kmp_affin_mask_t *sum;
  KMP_CPU_ALLOC_ON_STACK(sum);
  KMP_CPU_ZERO(sum);
  KMP_CPU_SET(address2os[0].second, sum);
  for (i = 1; i < numAddrs; i++) {
    // If this thread is sufficiently close to the leader (within the
    // granularity setting), then set the bit for this os thread in the
    // affinity mask for this group, and go on to the next thread.
    if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) {
      KMP_CPU_SET(address2os[i].second, sum);
      continue;
    }

    // For every thread in this group, copy the mask to the thread's entry in
    // the osId2Mask table.  Mark the first address as a leader.
    for (; j < i; j++) {
      unsigned osId = address2os[j].second;
      KMP_DEBUG_ASSERT(osId <= maxOsId);
      kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
      KMP_CPU_COPY(mask, sum);
      address2os[j].first.leader = (j == leader);
    }
    unique++;

    // Start a new mask.
    leader = i;
    leaderAddr = &(address2os[i].first);
    KMP_CPU_ZERO(sum);
    KMP_CPU_SET(address2os[i].second, sum);
  }

  // For every thread in last group, copy the mask to the thread's
  // entry in the osId2Mask table.
  for (; j < i; j++) {
    unsigned osId = address2os[j].second;
    KMP_DEBUG_ASSERT(osId <= maxOsId);
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
    KMP_CPU_COPY(mask, sum);
    address2os[j].first.leader = (j == leader);
  }
  unique++;
  KMP_CPU_FREE_FROM_STACK(sum);

  *maxIndex = maxOsId;
  *numUnique = unique;
  return osId2Mask;
}

// Stuff for the affinity proclist parsers.  It's easier to declare these vars
// as file-static than to try and pass them through the calling sequence of
// the recursive-descent OMP_PLACES parser.
static kmp_affin_mask_t *newMasks;
static int numNewMasks;
static int nextNewMask;

// Append (_mask) to the growable newMasks array, doubling its capacity when
// full.  Uses the file-static numNewMasks / nextNewMask counters above.
#define ADD_MASK(_mask)                                                        \
  {                                                                            \
    if (nextNewMask >= numNewMasks) {                                          \
      int i;                                                                   \
      numNewMasks *= 2;                                                        \
      kmp_affin_mask_t *temp;                                                  \
      KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);                         \
      for (i = 0; i < numNewMasks / 2; i++) {                                  \
        kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);                    \
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i);                       \
        KMP_CPU_COPY(dest, src);                                               \
      }                                                                        \
      KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2);                  \
      newMasks = temp;                                                         \
    }                                                                          \
    KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));               \
    nextNewMask++;                                                             \
  }

// Append the mask for OS proc (_osId) from the (_osId2Mask) table, warning
// and skipping if the id is out of range or not present in the table.
#define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId)                             \
  {                                                                            \
    if (((_osId) > _maxOsId) ||                                                \
        (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) {     \
      if (__kmp_affinity_verbose ||                                            \
          (__kmp_affinity_warnings &&                                          \
           (__kmp_affinity_type != affinity_none))) {                          \
        KMP_WARNING(AffIgnoreInvalidProcID, _osId);                            \
      }                                                                        \
    } else {                                                                   \
      ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));                            \
    }                                                                          \
  }

// Re-parse the proclist (for the explicit
affinity type), and form the list // of affinity newMasks indexed by gtid. static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, unsigned int *out_numMasks, const char *proclist, kmp_affin_mask_t *osId2Mask, int maxOsId) { int i; const char *scan = proclist; const char *next = proclist; // We use malloc() for the temporary mask vector, so that we can use // realloc() to extend it. numNewMasks = 2; KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); nextNewMask = 0; kmp_affin_mask_t *sumMask; KMP_CPU_ALLOC(sumMask); int setSize = 0; for (;;) { int start, end, stride; SKIP_WS(scan); next = scan; if (*next == '\0') { break; } if (*next == '{') { int num; setSize = 0; next++; // skip '{' SKIP_WS(next); scan = next; // Read the first integer in the set. KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); SKIP_DIGITS(next); num = __kmp_str_to_int(scan, *next); KMP_ASSERT2(num >= 0, "bad explicit proc list"); // Copy the mask for that osId to the sum (union) mask. if ((num > maxOsId) || (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { if (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { KMP_WARNING(AffIgnoreInvalidProcID, num); } KMP_CPU_ZERO(sumMask); } else { KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); setSize = 1; } for (;;) { // Check for end of set. SKIP_WS(next); if (*next == '}') { next++; // skip '}' break; } // Skip optional comma. if (*next == ',') { next++; } SKIP_WS(next); // Read the next integer in the set. scan = next; KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); SKIP_DIGITS(next); num = __kmp_str_to_int(scan, *next); KMP_ASSERT2(num >= 0, "bad explicit proc list"); // Add the mask for that osId to the sum mask. 
if ((num > maxOsId) || (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { if (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { KMP_WARNING(AffIgnoreInvalidProcID, num); } } else { KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); setSize++; } } if (setSize > 0) { ADD_MASK(sumMask); } SKIP_WS(next); if (*next == ',') { next++; } scan = next; continue; } // Read the first integer. KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); SKIP_DIGITS(next); start = __kmp_str_to_int(scan, *next); KMP_ASSERT2(start >= 0, "bad explicit proc list"); SKIP_WS(next); // If this isn't a range, then add a mask to the list and go on. if (*next != '-') { ADD_MASK_OSID(start, osId2Mask, maxOsId); // Skip optional comma. if (*next == ',') { next++; } scan = next; continue; } // This is a range. Skip over the '-' and read in the 2nd int. next++; // skip '-' SKIP_WS(next); scan = next; KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); SKIP_DIGITS(next); end = __kmp_str_to_int(scan, *next); KMP_ASSERT2(end >= 0, "bad explicit proc list"); // Check for a stride parameter stride = 1; SKIP_WS(next); if (*next == ':') { // A stride is specified. Skip over the ':" and read the 3rd int. int sign = +1; next++; // skip ':' SKIP_WS(next); scan = next; if (*next == '-') { sign = -1; next++; SKIP_WS(next); scan = next; } KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); SKIP_DIGITS(next); stride = __kmp_str_to_int(scan, *next); KMP_ASSERT2(stride >= 0, "bad explicit proc list"); stride *= sign; } // Do some range checks. KMP_ASSERT2(stride != 0, "bad explicit proc list"); if (stride > 0) { KMP_ASSERT2(start <= end, "bad explicit proc list"); } else { KMP_ASSERT2(start >= end, "bad explicit proc list"); } KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); // Add the mask for each OS proc # to the list. 
if (stride > 0) { do { ADD_MASK_OSID(start, osId2Mask, maxOsId); start += stride; } while (start <= end); } else { do { ADD_MASK_OSID(start, osId2Mask, maxOsId); start += stride; } while (start >= end); } // Skip optional comma. SKIP_WS(next); if (*next == ',') { next++; } scan = next; } *out_numMasks = nextNewMask; if (nextNewMask == 0) { *out_masks = NULL; KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); return; } KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); for (i = 0; i < nextNewMask; i++) { kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); KMP_CPU_COPY(dest, src); } KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); KMP_CPU_FREE(sumMask); } /*----------------------------------------------------------------------------- Re-parse the OMP_PLACES proc id list, forming the newMasks for the different places. Again, Here is the grammar: place_list := place place_list := place , place_list place := num place := place : num place := place : num : signed place := { subplacelist } place := ! 
place    // (lowest priority)

subplace_list := subplace
subplace_list := subplace , subplace_list
subplace := num
subplace := num : num
subplace := num : num : signed
signed := num
signed := + signed
signed := - signed
-----------------------------------------------------------------------------*/

// Parse one brace-enclosed subplace list (the text between '{' and '}'),
// OR'ing each selected OS proc's mask into tempMask and counting selected
// procs in *setSize.  *scan is advanced past the parsed text; the caller
// consumes the closing '}'.  Invalid/unavailable proc ids warn (when
// enabled) and are skipped.
static void __kmp_process_subplace_list(const char **scan,
                                        kmp_affin_mask_t *osId2Mask,
                                        int maxOsId, kmp_affin_mask_t *tempMask,
                                        int *setSize) {
  const char *next;

  for (;;) {
    int start, count, stride, i;

    // Read in the starting proc id
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    start = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(start >= 0);
    *scan = next;

    // valid follow sets are ',' ':' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      // Bare "num" subplace: select the single proc id.
      if ((start > maxOsId) ||
          (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(AffIgnoreInvalidProcID, start);
        }
      } else {
        KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
        (*setSize)++;
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }
    KMP_ASSERT2(**scan == ':', "bad explicit places list");
    (*scan)++; // skip ':'

    // Read count parameter
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    count = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(count >= 0);
    *scan = next;

    // valid follow sets are ',' ':' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      // "num : count" subplace: select count consecutive proc ids.
      for (i = 0; i < count; i++) {
        if ((start > maxOsId) ||
            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
          if (__kmp_affinity_verbose ||
              (__kmp_affinity_warnings &&
               (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffIgnoreInvalidProcID, start);
          }
          break; // don't proliferate warnings for large count
        } else {
          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
          start++;
          (*setSize)++;
        }
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }
    KMP_ASSERT2(**scan == ':', "bad explicit places list");
    (*scan)++; // skip ':'

    // Read stride parameter (optional leading '+'/'-' signs, possibly
    // repeated; sign is folded into the numeric stride below).
    int sign = +1;
    for (;;) {
      SKIP_WS(*scan);
      if (**scan == '+') {
        (*scan)++; // skip '+'
        continue;
      }
      if (**scan == '-') {
        sign *= -1;
        (*scan)++; // skip '-'
        continue;
      }
      break;
    }
    SKIP_WS(*scan);
    KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
    next = *scan;
    SKIP_DIGITS(next);
    stride = __kmp_str_to_int(*scan, *next);
    KMP_ASSERT(stride >= 0);
    *scan = next;
    stride *= sign;

    // valid follow sets are ',' and '}'
    SKIP_WS(*scan);
    if (**scan == '}' || **scan == ',') {
      // "num : count : stride" subplace: select count proc ids spaced by
      // stride (stride may be negative).
      for (i = 0; i < count; i++) {
        if ((start > maxOsId) ||
            (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
          if (__kmp_affinity_verbose ||
              (__kmp_affinity_warnings &&
               (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffIgnoreInvalidProcID, start);
          }
          break; // don't proliferate warnings for large count
        } else {
          KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
          start += stride;
          (*setSize)++;
        }
      }
      if (**scan == '}') {
        break;
      }
      (*scan)++; // skip ','
      continue;
    }

    KMP_ASSERT2(0, "bad explicit places list");
  }
}

// Parse a single place: "{subplace_list}", "!place" (complement), or a bare
// proc number.  Selected procs are OR'ed into tempMask and counted in
// *setSize; *scan is advanced past the place.
static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
                                int maxOsId, kmp_affin_mask_t *tempMask,
                                int *setSize) {
  const char *next;

  // valid follow sets are '{' '!' and num
  SKIP_WS(*scan);
  if (**scan == '{') {
    (*scan)++; // skip '{'
    __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
    KMP_ASSERT2(**scan == '}', "bad explicit places list");
    (*scan)++; // skip '}'
  } else if (**scan == '!') {
    (*scan)++; // skip '!'
__kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); KMP_CPU_COMPLEMENT(maxOsId, tempMask); } else if ((**scan >= '0') && (**scan <= '9')) { next = *scan; SKIP_DIGITS(next); int num = __kmp_str_to_int(*scan, *next); KMP_ASSERT(num >= 0); if ((num > maxOsId) || (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { if (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { KMP_WARNING(AffIgnoreInvalidProcID, num); } } else { KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); (*setSize)++; } *scan = next; // skip num } else { KMP_ASSERT2(0, "bad explicit places list"); } } // static void void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, unsigned int *out_numMasks, const char *placelist, kmp_affin_mask_t *osId2Mask, int maxOsId) { int i, j, count, stride, sign; const char *scan = placelist; const char *next = placelist; numNewMasks = 2; KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); nextNewMask = 0; // tempMask is modified based on the previous or initial // place to form the current place // previousMask contains the previous place kmp_affin_mask_t *tempMask; kmp_affin_mask_t *previousMask; KMP_CPU_ALLOC(tempMask); KMP_CPU_ZERO(tempMask); KMP_CPU_ALLOC(previousMask); KMP_CPU_ZERO(previousMask); int setSize = 0; for (;;) { __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); // valid follow sets are ',' ':' and EOL SKIP_WS(scan); if (*scan == '\0' || *scan == ',') { if (setSize > 0) { ADD_MASK(tempMask); } KMP_CPU_ZERO(tempMask); setSize = 0; if (*scan == '\0') { break; } scan++; // skip ',' continue; } KMP_ASSERT2(*scan == ':', "bad explicit places list"); scan++; // skip ':' // Read count parameter SKIP_WS(scan); KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); next = scan; SKIP_DIGITS(next); count = __kmp_str_to_int(scan, *next); KMP_ASSERT(count >= 0); scan = next; // valid follow sets are ',' ':' and EOL SKIP_WS(scan); if (*scan == '\0' || 
*scan == ',') { stride = +1; } else { KMP_ASSERT2(*scan == ':', "bad explicit places list"); scan++; // skip ':' // Read stride parameter sign = +1; for (;;) { SKIP_WS(scan); if (*scan == '+') { scan++; // skip '+' continue; } if (*scan == '-') { sign *= -1; scan++; // skip '-' continue; } break; } SKIP_WS(scan); KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); next = scan; SKIP_DIGITS(next); stride = __kmp_str_to_int(scan, *next); KMP_DEBUG_ASSERT(stride >= 0); scan = next; stride *= sign; } // Add places determined by initial_place : count : stride for (i = 0; i < count; i++) { if (setSize == 0) { break; } // Add the current place, then build the next place (tempMask) from that KMP_CPU_COPY(previousMask, tempMask); ADD_MASK(previousMask); KMP_CPU_ZERO(tempMask); setSize = 0; KMP_CPU_SET_ITERATE(j, previousMask) { if (!KMP_CPU_ISSET(j, previousMask)) { continue; } if ((j + stride > maxOsId) || (j + stride < 0) || (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || (!KMP_CPU_ISSET(j + stride, KMP_CPU_INDEX(osId2Mask, j + stride)))) { if ((__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) && i < count - 1) { KMP_WARNING(AffIgnoreInvalidProcID, j + stride); } continue; } KMP_CPU_SET(j + stride, tempMask); setSize++; } } KMP_CPU_ZERO(tempMask); setSize = 0; // valid follow sets are ',' and EOL SKIP_WS(scan); if (*scan == '\0') { break; } if (*scan == ',') { scan++; // skip ',' continue; } KMP_ASSERT2(0, "bad explicit places list"); } *out_numMasks = nextNewMask; if (nextNewMask == 0) { *out_masks = NULL; KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); return; } KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); KMP_CPU_FREE(tempMask); KMP_CPU_FREE(previousMask); for (i = 0; i < nextNewMask; i++) { kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); KMP_CPU_COPY(dest, src); } KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); } #undef ADD_MASK 
#undef ADD_MASK_OSID #if KMP_USE_HWLOC static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { // skip PUs descendants of the object o int skipped = 0; hwloc_obj_t hT = NULL; int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); for (int i = 0; i < N; ++i) { KMP_DEBUG_ASSERT(hT); unsigned idx = hT->os_index; if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { KMP_CPU_CLR(idx, __kmp_affin_fullMask); KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); ++skipped; } hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); } return skipped; // count number of skipped units } static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) { // check if obj has PUs present in fullMask hwloc_obj_t hT = NULL; int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); for (int i = 0; i < N; ++i) { KMP_DEBUG_ASSERT(hT); unsigned idx = hT->os_index; if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) return 1; // found PU hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); } return 0; // no PUs found } #endif // KMP_USE_HWLOC static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { AddrUnsPair *newAddr; if (__kmp_hws_requested == 0) goto _exit; // no topology limiting actions requested, exit #if KMP_USE_HWLOC if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { // Number of subobjects calculated dynamically, this works fine for // any non-uniform topology. // L2 cache objects are determined by depth, other objects - by type. 
    // (continuation of __kmp_apply_thread_places, hwloc branch)
    hwloc_topology_t tp = __kmp_hwloc_topology;
    int nS = 0, nN = 0, nL = 0, nC = 0,
        nT = 0; // logical index including skipped
    int nCr = 0, nTr = 0; // number of requested units
    int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters
    hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
    int L2depth, idx;

    // check support of extensions ----------------------------------
    int numa_support = 0, tile_support = 0;
    if (__kmp_pu_os_idx)
      hT = hwloc_get_pu_obj_by_os_index(tp,
                                        __kmp_pu_os_idx[__kmp_avail_proc - 1]);
    else
      hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
    if (hT == NULL) { // something's gone wrong
      KMP_WARNING(AffHWSubsetUnsupported);
      goto _exit;
    }
    // check NUMA node
    hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
    hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
    if (hN != NULL && hN->depth > hS->depth) {
      numa_support = 1; // 1 in case socket includes node(s)
    } else if (__kmp_hws_node.num > 0) {
      // don't support sockets inside NUMA node (no such HW found for testing)
      KMP_WARNING(AffHWSubsetUnsupported);
      goto _exit;
    }
    // check L2 cache, get object by depth because of multiple caches
    L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
    hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
    if (hL != NULL &&
        __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
      tile_support = 1; // no sense to count L2 if it includes single core
    } else if (__kmp_hws_tile.num > 0) {
      if (__kmp_hws_core.num == 0) {
        __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
        __kmp_hws_tile.num = 0;
      } else {
        // L2 and core are both requested, but represent same object
        KMP_WARNING(AffHWSubsetInvalid);
        goto _exit;
      }
    }
    // end of check of extensions -----------------------------------

    // fill in unset items, validate settings -----------------------
    if (__kmp_hws_socket.num == 0)
      __kmp_hws_socket.num = nPackages; // use all available sockets
    if (__kmp_hws_socket.offset >= nPackages) {
      KMP_WARNING(AffHWSubsetManySockets);
      goto _exit;
    }
    if (numa_support) {
      hN = NULL;
      int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
                                                  &hN); // num nodes in socket
      if (__kmp_hws_node.num == 0)
        __kmp_hws_node.num = NN; // use all available nodes
      if (__kmp_hws_node.offset >= NN) {
        KMP_WARNING(AffHWSubsetManyNodes);
        goto _exit;
      }
      if (tile_support) {
        // get num tiles in node
        int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
        if (__kmp_hws_tile.num == 0) {
          __kmp_hws_tile.num = NL + 1;
        } // use all available tiles, some node may have more tiles, thus +1
        if (__kmp_hws_tile.offset >= NL) {
          KMP_WARNING(AffHWSubsetManyTiles);
          goto _exit;
        }
        int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
                                                    &hC); // num cores in tile
        if (__kmp_hws_core.num == 0)
          __kmp_hws_core.num = NC; // use all available cores
        if (__kmp_hws_core.offset >= NC) {
          KMP_WARNING(AffHWSubsetManyCores);
          goto _exit;
        }
      } else { // tile_support
        int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE,
                                                    &hC); // num cores in node
        if (__kmp_hws_core.num == 0)
          __kmp_hws_core.num = NC; // use all available cores
        if (__kmp_hws_core.offset >= NC) {
          KMP_WARNING(AffHWSubsetManyCores);
          goto _exit;
        }
      } // tile_support
    } else { // numa_support
      if (tile_support) {
        // get num tiles in socket
        int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
        if (__kmp_hws_tile.num == 0)
          __kmp_hws_tile.num = NL; // use all available tiles
        if (__kmp_hws_tile.offset >= NL) {
          KMP_WARNING(AffHWSubsetManyTiles);
          goto _exit;
        }
        int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
                                                    &hC); // num cores in tile
        if (__kmp_hws_core.num == 0)
          __kmp_hws_core.num = NC; // use all available cores
        if (__kmp_hws_core.offset >= NC) {
          KMP_WARNING(AffHWSubsetManyCores);
          goto _exit;
        }
      } else { // tile_support
        int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE,
                                                    &hC); // num cores in socket
        if (__kmp_hws_core.num == 0)
          __kmp_hws_core.num = NC; // use all available cores
        if (__kmp_hws_core.offset >= NC) {
          KMP_WARNING(AffHWSubsetManyCores);
          goto _exit;
        }
      } // tile_support
    }
    if (__kmp_hws_proc.num == 0)
      __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
    if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
      KMP_WARNING(AffHWSubsetManyProcs);
      goto _exit;
    }
    // end of validation --------------------------------------------

    if (pAddr) // pAddr is NULL in case of affinity_none
      newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) *
                                              __kmp_avail_proc); // max size
    // main loop to form HW subset ----------------------------------
    hS = NULL;
    int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
    for (int s = 0; s < NP; ++s) {
      // Check Socket -----------------------------------------------
      hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
      if (!__kmp_hwloc_obj_has_PUs(tp, hS))
        continue; // skip socket if all PUs are out of fullMask
      ++nS; // only count objects those have PUs in affinity mask
      if (nS <= __kmp_hws_socket.offset ||
          nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
        n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
        continue; // move to next socket
      }
      nCr = 0; // count number of cores per socket
      // socket requested, go down the topology tree
      // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
      if (numa_support) {
        nN = 0;
        hN = NULL;
        // num nodes in current socket
        int NN =
            __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN);
        for (int n = 0; n < NN; ++n) {
          // Check NUMA Node ----------------------------------------
          if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
            hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
            continue; // skip node if all PUs are out of fullMask
          }
          ++nN;
          if (nN <= __kmp_hws_node.offset ||
              nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
            // skip node as not requested
            n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
            hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
            continue; // move to next node
          }
          // node requested, go down the topology tree
          if (tile_support) {
            nL = 0;
            hL = NULL;
            int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
            for (int l = 0; l < NL; ++l) {
              // Check L2 (tile) ------------------------------------
              if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
                hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
                continue; // skip tile if all PUs are out of fullMask
              }
              ++nL;
              if (nL <= __kmp_hws_tile.offset ||
                  nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
                // skip tile as not requested
                n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
                hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
                continue; // move to next tile
              }
              // tile requested, go down the topology tree
              nC = 0;
              hC = NULL;
              // num cores in current tile
              int NC = __kmp_hwloc_count_children_by_type(tp, hL,
                                                          HWLOC_OBJ_CORE, &hC);
              for (int c = 0; c < NC; ++c) {
                // Check Core ---------------------------------------
                if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                  continue; // skip core if all PUs are out of fullMask
                }
                ++nC;
                if (nC <= __kmp_hws_core.offset ||
                    nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
                  // skip core as not requested
                  n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
                  hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                  continue; // move to next core
                }
                // core requested, go down to PUs
                nT = 0;
                nTr = 0;
                hT = NULL;
                // num procs in current core
                int NT = __kmp_hwloc_count_children_by_type(tp, hC,
                                                            HWLOC_OBJ_PU, &hT);
                for (int t = 0; t < NT; ++t) {
                  // Check PU ---------------------------------------
                  idx = hT->os_index;
                  if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                    continue; // skip PU if not in fullMask
                  }
                  ++nT;
                  if (nT <= __kmp_hws_proc.offset ||
                      nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
                    // skip PU
                    KMP_CPU_CLR(idx, __kmp_affin_fullMask);
                    ++n_old;
                    KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
                    hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                    continue; // move to next PU
                  }
                  ++nTr;
                  if (pAddr) // collect requested thread's data
                    newAddr[n_new] = (*pAddr)[n_old];
                  ++n_new;
                  ++n_old;
                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                } // threads loop
                if (nTr > 0) {
                  ++nCr; // num cores per socket
                  ++nCo; // total num cores
                  if (nTr > nTpC)
                    nTpC = nTr; // calc max threads per core
                }
                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
              } // cores loop
              hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
            } // tiles loop
          } else { // tile_support
            // no tiles, check cores
            nC = 0;
            hC = NULL;
            // num cores in current node
            int NC =
                __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC);
            for (int c = 0; c < NC; ++c) {
              // Check Core ---------------------------------------
              if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                continue; // skip core if all PUs are out of fullMask
              }
              ++nC;
              if (nC <= __kmp_hws_core.offset ||
                  nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
                // skip core as not requested
                n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                continue; // move to next core
              }
              // core requested, go down to PUs
              nT = 0;
              nTr = 0;
              hT = NULL;
              int NT =
                  __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
              for (int t = 0; t < NT; ++t) {
                // Check PU ---------------------------------------
                idx = hT->os_index;
                if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                  continue; // skip PU if not in fullMask
                }
                ++nT;
                if (nT <= __kmp_hws_proc.offset ||
                    nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
                  // skip PU
                  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
                  ++n_old;
                  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                  continue; // move to next PU
                }
                ++nTr;
                if (pAddr) // collect requested thread's data
                  newAddr[n_new] = (*pAddr)[n_old];
                ++n_new;
                ++n_old;
                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
              } // threads loop
              if (nTr > 0) {
                ++nCr; // num cores per socket
                ++nCo; // total num cores
                if (nTr > nTpC)
                  nTpC = nTr; // calc max threads per core
              }
              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
            } // cores loop
          } // tiles support
          hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
        } // nodes loop
      } else { // numa_support
        // no NUMA support
        if (tile_support) {
          nL = 0;
          hL = NULL;
          // num tiles in current socket
          int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
          for (int l = 0; l < NL; ++l) {
            // Check L2 (tile) ------------------------------------
            if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
              hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
              continue; // skip tile if all PUs are out of fullMask
            }
            ++nL;
            if (nL <= __kmp_hws_tile.offset ||
                nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
              // skip tile as not requested
              n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
              hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
              continue; // move to next tile
            }
            // tile requested, go down the topology tree
            nC = 0;
            hC = NULL;
            // num cores per tile
            int NC =
                __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC);
            for (int c = 0; c < NC; ++c) {
              // Check Core ---------------------------------------
              if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                continue; // skip core if all PUs are out of fullMask
              }
              ++nC;
              if (nC <= __kmp_hws_core.offset ||
                  nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
                // skip core as not requested
                n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
                hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
                continue; // move to next core
              }
              // core requested, go down to PUs
              nT = 0;
              nTr = 0;
              hT = NULL;
              // num procs per core
              int NT =
                  __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
              for (int t = 0; t < NT; ++t) {
                // Check PU ---------------------------------------
                idx = hT->os_index;
                if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                  continue; // skip PU if not in fullMask
                }
                ++nT;
                if (nT <= __kmp_hws_proc.offset ||
                    nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
                  // skip PU
                  KMP_CPU_CLR(idx, __kmp_affin_fullMask);
                  ++n_old;
                  KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
                  hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                  continue; // move to next PU
                }
                ++nTr;
                if (pAddr) // collect requested thread's data
                  newAddr[n_new] = (*pAddr)[n_old];
                ++n_new;
                ++n_old;
                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
              } // threads loop
              if (nTr > 0) {
                ++nCr; // num cores per socket
                ++nCo; // total num cores
                if (nTr > nTpC)
                  nTpC = nTr; // calc max threads per core
              }
              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
            } // cores loop
            hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
          } // tiles loop
        } else { // tile_support
          // no tiles, check cores
          nC = 0;
          hC = NULL;
          // num cores in socket
          int NC =
              __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC);
          for (int c = 0; c < NC; ++c) {
            // Check Core -------------------------------------------
            if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
              continue; // skip core if all PUs are out of fullMask
            }
            ++nC;
            if (nC <= __kmp_hws_core.offset ||
                nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
              // skip core as not requested
              n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
              hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
              continue; // move to next core
            }
            // core requested, go down to PUs
            nT = 0;
            nTr = 0;
            hT = NULL;
            // num procs per core
            int NT =
                __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
            for (int t = 0; t < NT; ++t) {
              // Check PU ---------------------------------------
              idx = hT->os_index;
              if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                continue; // skip PU if not in fullMask
              }
              ++nT;
              if (nT <= __kmp_hws_proc.offset ||
                  nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
                // skip PU
                KMP_CPU_CLR(idx, __kmp_affin_fullMask);
                ++n_old;
                KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
                hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
                continue; // move to next PU
              }
              ++nTr;
              if (pAddr) // collect requested thread's data
                newAddr[n_new] = (*pAddr)[n_old];
              ++n_new;
              ++n_old;
              hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
            } // threads loop
            if (nTr > 0) {
              ++nCr; // num cores per socket
              ++nCo; // total num cores
              if (nTr > nTpC)
                nTpC = nTr; // calc max threads per core
            }
            hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
          } // cores loop
        } // tiles support
      } // numa_support
      if (nCr > 0) { // found cores?
        ++nPkg; // num sockets
        if (nCr > nCpP)
          nCpP = nCr; // calc max cores per socket
      }
    } // sockets loop

    // check the subset is valid
    KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
    KMP_DEBUG_ASSERT(nPkg > 0);
    KMP_DEBUG_ASSERT(nCpP > 0);
    KMP_DEBUG_ASSERT(nTpC > 0);
    KMP_DEBUG_ASSERT(nCo > 0);
    KMP_DEBUG_ASSERT(nPkg <= nPackages);
    KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
    KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
    KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);

    nPackages = nPkg; // correct num sockets
    nCoresPerPkg = nCpP; // correct num cores per socket
    __kmp_nThreadsPerCore = nTpC; // correct num threads per core
    __kmp_avail_proc = n_new; // correct num procs
    __kmp_ncores = nCo; // correct num cores
    // hwloc topology method end
  } else
#endif // KMP_USE_HWLOC
  {
    // Non-hwloc method: only supports a uniform 3-level
    // socket/core/thread topology (no NUMA-node or tile subsetting).
    int n_old = 0, n_new = 0, proc_num = 0;
    if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
      KMP_WARNING(AffHWSubsetNoHWLOC);
      goto _exit;
    }
    if (__kmp_hws_socket.num == 0)
      __kmp_hws_socket.num = nPackages; // use all available sockets
    if (__kmp_hws_core.num == 0)
      __kmp_hws_core.num = nCoresPerPkg; // use all available cores
    if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore)
      __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
    if (!__kmp_affinity_uniform_topology()) {
      KMP_WARNING(AffHWSubsetNonUniform);
      goto _exit; // don't support non-uniform topology
    }
    if (depth > 3) {
      KMP_WARNING(AffHWSubsetNonThreeLevel);
      goto _exit; // don't support not-3-level topology
    }
    if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
      KMP_WARNING(AffHWSubsetManySockets);
      goto _exit;
    }
    if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) {
      KMP_WARNING(AffHWSubsetManyCores);
      goto _exit;
    }
    // Form the requested subset
    if (pAddr) // pAddr is NULL in case of affinity_none
      newAddr = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num *
          __kmp_hws_proc.num);
    for (int i = 0; i < nPackages; ++i) {
      if (i < __kmp_hws_socket.offset ||
          i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
        // skip not-requested socket
        n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
        if (__kmp_pu_os_idx != NULL) {
          // walk through skipped socket
          for (int j = 0; j < nCoresPerPkg; ++j) {
            for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
              KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
              ++proc_num;
            }
          }
        }
      } else {
        // walk through requested socket
        for (int j = 0; j < nCoresPerPkg; ++j) {
          if (j < __kmp_hws_core.offset ||
              j >= __kmp_hws_core.offset + __kmp_hws_core.num) {
            // skip not-requested core
            n_old += __kmp_nThreadsPerCore;
            if (__kmp_pu_os_idx != NULL) {
              for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
                KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
                ++proc_num;
              }
            }
          } else {
            // walk through requested core
            for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
              if (k < __kmp_hws_proc.num) {
                if (pAddr) // collect requested thread's data
                  newAddr[n_new] = (*pAddr)[n_old];
                n_new++;
              } else {
                if (__kmp_pu_os_idx != NULL)
                  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
              }
              n_old++;
              ++proc_num;
            }
          }
        }
      }
    }
    KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
    KMP_DEBUG_ASSERT(n_new ==
                     __kmp_hws_socket.num * __kmp_hws_core.num *
                         __kmp_hws_proc.num);
    nPackages = __kmp_hws_socket.num; // correct nPackages
    nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg
    __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
    __kmp_avail_proc = n_new; // correct avail_proc
    __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
  } // non-hwloc topology method
  if (pAddr) {
    __kmp_free(*pAddr);
    *pAddr = newAddr; // replace old topology with new one
  }
  if (__kmp_affinity_verbose) {
    char m[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN,
                              __kmp_affin_fullMask);
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m);
    }
    KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_str_buf_print(&buf, "%d", nPackages);
    KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
    __kmp_str_buf_free(&buf);
  }
_exit:
  if (__kmp_pu_os_idx != NULL) {
    __kmp_free(__kmp_pu_os_idx);
    __kmp_pu_os_idx = NULL;
  }
}

// This function figures out the deepest level at which there is at least one
// cluster/core with more than one processing unit bound to it.
static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
                                          int nprocs, int bottom_level) {
  int core_level = 0;

  for (int i = 0; i < nprocs; i++) {
    for (int j = bottom_level; j > 0; j--) {
      if (address2os[i].first.labels[j] > 0) {
        // A non-zero label at level j means level j-1 has multiple children;
        // remember the deepest such level.
        if (core_level < (j - 1)) {
          core_level = j - 1;
        }
      }
    }
  }
  return core_level;
}

// This function counts number of clusters/cores at given level.
static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os,
                                         int nprocs, int bottom_level,
                                         int core_level) {
  int ncores = 0;
  int i, j;
  j = bottom_level; // keep j well-defined for the post-loop check when nprocs == 0
  for (i = 0; i < nprocs; i++) {
    // Walk up from the bottom level; presumably a nonzero label on the NEXT
    // proc at level j means proc i+1 still belongs to the same object, so the
    // loop only reaches core_level on the last unit of each cluster/core —
    // TODO confirm label semantics against Address::labels assignment.
    for (j = bottom_level; j > core_level; j--) {
      if ((i + 1) < nprocs) {
        if (address2os[i + 1].first.labels[j] > 0) {
          break;
        }
      }
    }
    if (j == core_level) {
      ncores++;
    }
  }
  if (j > core_level) {
    // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one
    // core. May occur when called from __kmp_affinity_find_core().
    ncores++;
  }
  return ncores;
}

// This function finds to which cluster/core given processing unit is bound.
static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
                                    int bottom_level, int core_level) {
  // Core index of proc == (number of cores among procs [0..proc]) - 1.
  return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
                                       core_level) -
         1;
}

// This function finds maximal number of processing units bound to a
// cluster/core at given level.
static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
                                            int nprocs, int bottom_level,
                                            int core_level) {
  int maxprocpercore = 0;
  if (core_level < bottom_level) {
    for (int i = 0; i < nprocs; i++) {
      // labels[core_level + 1] appears to be a 0-based position within the
      // core, so +1 yields the unit count seen so far on that core.
      int percore = address2os[i].first.labels[core_level + 1] + 1;
      if (percore > maxprocpercore) {
        maxprocpercore = percore;
      }
    }
  } else {
    maxprocpercore = 1; // core level is the bottom level: one unit per core
  }
  return maxprocpercore;
}

// Machine topology table and companions shared by the affinity code below.
static AddrUnsPair *address2os = NULL;
static int *procarr = NULL; // per-(core, slot) OS proc ids, used by balanced affinity
static int __kmp_aff_depth = 0; // topology depth saved for balanced affinity

// Shared bail-out for the topology-detection paths: affinity is off, so
// install the single "all procs" place and return from the caller.
#if KMP_USE_HIER_SCHED
#define KMP_EXIT_AFF_NONE                                                      \
  KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
  KMP_ASSERT(address2os == NULL);                                              \
  __kmp_apply_thread_places(NULL, 0);                                          \
  __kmp_create_affinity_none_places();                                         \
  __kmp_dispatch_set_hierarchy_values();                                       \
  return;
#else
#define KMP_EXIT_AFF_NONE                                                      \
  KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
  KMP_ASSERT(address2os == NULL);                                              \
  __kmp_apply_thread_places(NULL, 0);                                          \
  __kmp_create_affinity_none_places();                                         \
  return;
#endif

// Create a one element mask array (set of places) which only contains the
// initial process's
affinity mask
static void __kmp_create_affinity_none_places() {
  KMP_ASSERT(__kmp_affin_fullMask != NULL);
  KMP_ASSERT(__kmp_affinity_type == affinity_none);
  __kmp_affinity_num_masks = 1;
  KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
  kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
  KMP_CPU_COPY(dest, __kmp_affin_fullMask);
}

// qsort()-style comparator: orders AddrUnsPair entries by their childNums,
// permuted according to __kmp_affinity_compact (the first 'compact' levels
// are compared deepest-first, the remaining levels in natural order).
static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
  const Address *aa = &(((const AddrUnsPair *)a)->first);
  const Address *bb = &(((const AddrUnsPair *)b)->first);
  unsigned depth = aa->depth;
  unsigned i;
  KMP_DEBUG_ASSERT(depth == bb->depth);
  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  // Compare the 'compact' innermost levels first, from the deepest up.
  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->childNums[j] < bb->childNums[j])
      return -1;
    if (aa->childNums[j] > bb->childNums[j])
      return 1;
  }
  // Then the remaining levels in outer-to-inner order.
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->childNums[j] < bb->childNums[j])
      return -1;
    if (aa->childNums[j] > bb->childNums[j])
      return 1;
  }
  return 0;
}

// Detect the machine topology and build the table of affinity masks
// (__kmp_affinity_masks) that threads are later bound to. Idempotent:
// returns immediately if the masks were already created.
static void __kmp_aux_affinity_initialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_ASSERT(__kmp_affin_fullMask != NULL);
    return;
  }

  // Create the "full" mask - this defines all of the processors that we
  // consider to be in the machine model. If respect is set, then it is the
  // initialization thread's affinity mask. Otherwise, it is all processors that
  // we know about on the machine.
  if (__kmp_affin_fullMask == NULL) {
    KMP_CPU_ALLOC(__kmp_affin_fullMask);
  }
  if (KMP_AFFINITY_CAPABLE()) {
    if (__kmp_affinity_respect_mask) {
      __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);

      // Count the number of available processors.
unsigned i; __kmp_avail_proc = 0; KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { continue; } __kmp_avail_proc++; } if (__kmp_avail_proc > __kmp_xproc) { if (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { KMP_WARNING(ErrorInitializeAffinity); } __kmp_affinity_type = affinity_none; KMP_AFFINITY_DISABLE(); return; } } else { __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); __kmp_avail_proc = __kmp_xproc; } } if (__kmp_affinity_gran == affinity_gran_tile && // check if user's request is valid __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) { KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY"); __kmp_affinity_gran = affinity_gran_package; } int depth = -1; kmp_i18n_id_t msg_id = kmp_i18n_null; // For backward compatibility, setting KMP_CPUINFO_FILE => // KMP_TOPOLOGY_METHOD=cpuinfo if ((__kmp_cpuinfo_file != NULL) && (__kmp_affinity_top_method == affinity_top_method_all)) { __kmp_affinity_top_method = affinity_top_method_cpuinfo; } if (__kmp_affinity_top_method == affinity_top_method_all) { // In the default code path, errors are not fatal - we just try using // another method. We only emit a warning message if affinity is on, or the // verbose flag is set, an the nowarnings flag was not set. 
const char *file_name = NULL; int line = 0; #if KMP_USE_HWLOC if (depth < 0 && __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { if (__kmp_affinity_verbose) { KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); } if (!__kmp_hwloc_error) { depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); if (depth == 0) { KMP_EXIT_AFF_NONE; } else if (depth < 0 && __kmp_affinity_verbose) { KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); } } else if (__kmp_affinity_verbose) { KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); } } #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 if (depth < 0) { if (__kmp_affinity_verbose) { KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); } file_name = NULL; depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); if (depth == 0) { KMP_EXIT_AFF_NONE; } if (depth < 0) { if (__kmp_affinity_verbose) { if (msg_id != kmp_i18n_null) { KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), KMP_I18N_STR(DecodingLegacyAPIC)); } else { KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); } } file_name = NULL; depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); if (depth == 0) { KMP_EXIT_AFF_NONE; } } } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ #if KMP_OS_LINUX if (depth < 0) { if (__kmp_affinity_verbose) { if (msg_id != kmp_i18n_null) { KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); } else { KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); } } FILE *f = fopen("/proc/cpuinfo", "r"); if (f == NULL) { msg_id = kmp_i18n_str_CantOpenCpuinfo; } else { file_name = "/proc/cpuinfo"; depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); fclose(f); if (depth == 0) { KMP_EXIT_AFF_NONE; } } } #endif /* KMP_OS_LINUX */ #if KMP_GROUP_AFFINITY if ((depth < 0) && (__kmp_num_proc_groups > 1)) { if (__kmp_affinity_verbose) { KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); } depth = 
__kmp_affinity_create_proc_group_map(&address2os, &msg_id); KMP_ASSERT(depth != 0); } #endif /* KMP_GROUP_AFFINITY */ if (depth < 0) { if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { if (file_name == NULL) { KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); } else if (line == 0) { KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); } else { KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); } } // FIXME - print msg if msg_id = kmp_i18n_null ??? file_name = ""; depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); if (depth == 0) { KMP_EXIT_AFF_NONE; } KMP_ASSERT(depth > 0); KMP_ASSERT(address2os != NULL); } } #if KMP_USE_HWLOC else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); if (__kmp_affinity_verbose) { KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); } depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); if (depth == 0) { KMP_EXIT_AFF_NONE; } } #endif // KMP_USE_HWLOC // If the user has specified that a paricular topology discovery method is to be // used, then we abort if that method fails. The exception is group affinity, // which might have been implicitly set. 
#if KMP_ARCH_X86 || KMP_ARCH_X86_64 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { if (__kmp_affinity_verbose) { KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); } depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); if (depth == 0) { KMP_EXIT_AFF_NONE; } if (depth < 0) { KMP_ASSERT(msg_id != kmp_i18n_null); KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { if (__kmp_affinity_verbose) { KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); } depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); if (depth == 0) { KMP_EXIT_AFF_NONE; } if (depth < 0) { KMP_ASSERT(msg_id != kmp_i18n_null); KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { const char *filename; if (__kmp_cpuinfo_file != NULL) { filename = __kmp_cpuinfo_file; } else { filename = "/proc/cpuinfo"; } if (__kmp_affinity_verbose) { KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); } FILE *f = fopen(filename, "r"); if (f == NULL) { int code = errno; if (__kmp_cpuinfo_file != NULL) { __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null); } else { __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), __kmp_msg_null); } } int line = 0; depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); fclose(f); if (depth < 0) { KMP_ASSERT(msg_id != kmp_i18n_null); if (line > 0) { KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); } else { KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); } } if (__kmp_affinity_type == affinity_none) { KMP_ASSERT(depth == 0); KMP_EXIT_AFF_NONE; } } #if KMP_GROUP_AFFINITY else if (__kmp_affinity_top_method == affinity_top_method_group) { if (__kmp_affinity_verbose) { 
KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); } depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); KMP_ASSERT(depth != 0); if (depth < 0) { KMP_ASSERT(msg_id != kmp_i18n_null); KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } #endif /* KMP_GROUP_AFFINITY */ else if (__kmp_affinity_top_method == affinity_top_method_flat) { if (__kmp_affinity_verbose) { KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); } depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); if (depth == 0) { KMP_EXIT_AFF_NONE; } // should not fail KMP_ASSERT(depth > 0); KMP_ASSERT(address2os != NULL); } #if KMP_USE_HIER_SCHED __kmp_dispatch_set_hierarchy_values(); #endif if (address2os == NULL) { if (KMP_AFFINITY_CAPABLE() && (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { KMP_WARNING(ErrorInitializeAffinity); } __kmp_affinity_type = affinity_none; __kmp_create_affinity_none_places(); KMP_AFFINITY_DISABLE(); return; } if (__kmp_affinity_gran == affinity_gran_tile #if KMP_USE_HWLOC && __kmp_tile_depth == 0 #endif ) { // tiles requested but not detected, warn user on this KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY"); } __kmp_apply_thread_places(&address2os, depth); // Create the table of masks, indexed by thread Id. unsigned maxIndex; unsigned numUnique; kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); if (__kmp_affinity_gran_levels == 0) { KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); } // Set the childNums vector in all Address objects. This must be done before // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into // account the setting of __kmp_affinity_compact. 
__kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); switch (__kmp_affinity_type) { case affinity_explicit: KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { __kmp_affinity_process_proclist( &__kmp_affinity_masks, &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, maxIndex); } else { __kmp_affinity_process_placelist( &__kmp_affinity_masks, &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, maxIndex); } if (__kmp_affinity_num_masks == 0) { if (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { KMP_WARNING(AffNoValidProcID); } __kmp_affinity_type = affinity_none; __kmp_create_affinity_none_places(); return; } break; // The other affinity types rely on sorting the Addresses according to some // permutation of the machine topology tree. Set __kmp_affinity_compact and // __kmp_affinity_offset appropriately, then jump to a common code fragment // to do the sort and create the array of affinity masks. 
case affinity_logical: __kmp_affinity_compact = 0; if (__kmp_affinity_offset) { __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; } goto sortAddresses; case affinity_physical: if (__kmp_nThreadsPerCore > 1) { __kmp_affinity_compact = 1; if (__kmp_affinity_compact >= depth) { __kmp_affinity_compact = 0; } } else { __kmp_affinity_compact = 0; } if (__kmp_affinity_offset) { __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; } goto sortAddresses; case affinity_scatter: if (__kmp_affinity_compact >= depth) { __kmp_affinity_compact = 0; } else { __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; } goto sortAddresses; case affinity_compact: if (__kmp_affinity_compact >= depth) { __kmp_affinity_compact = depth - 1; } goto sortAddresses; case affinity_balanced: if (depth <= 1) { if (__kmp_affinity_verbose || __kmp_affinity_warnings) { KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); } __kmp_affinity_type = affinity_none; __kmp_create_affinity_none_places(); return; } else if (!__kmp_affinity_uniform_topology()) { // Save the depth for further usage __kmp_aff_depth = depth; int core_level = __kmp_affinity_find_core_level( address2os, __kmp_avail_proc, depth - 1); int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, depth - 1, core_level); int maxprocpercore = __kmp_affinity_max_proc_per_core( address2os, __kmp_avail_proc, depth - 1, core_level); int nproc = ncores * maxprocpercore; if ((nproc < 2) || (nproc < __kmp_avail_proc)) { if (__kmp_affinity_verbose || __kmp_affinity_warnings) { KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); } __kmp_affinity_type = affinity_none; return; } procarr = (int *)__kmp_allocate(sizeof(int) * nproc); for (int i = 0; i < nproc; i++) { procarr[i] = -1; } int lastcore = -1; int inlastcore = 0; for (int i = 0; i < __kmp_avail_proc; i++) { int proc = address2os[i].second; int core = __kmp_affinity_find_core(address2os, i, depth - 1, 
core_level); if (core == lastcore) { inlastcore++; } else { inlastcore = 0; } lastcore = core; procarr[core * maxprocpercore + inlastcore] = proc; } } if (__kmp_affinity_compact >= depth) { __kmp_affinity_compact = depth - 1; } sortAddresses: // Allocate the gtid->affinity mask table. if (__kmp_affinity_dups) { __kmp_affinity_num_masks = __kmp_avail_proc; } else { __kmp_affinity_num_masks = numUnique; } if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && (__kmp_affinity_num_places > 0) && ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { __kmp_affinity_num_masks = __kmp_affinity_num_places; } KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); // Sort the address2os table according to the current setting of // __kmp_affinity_compact, then fill out __kmp_affinity_masks. qsort(address2os, __kmp_avail_proc, sizeof(*address2os), __kmp_affinity_cmp_Address_child_num); { int i; unsigned j; for (i = 0, j = 0; i < __kmp_avail_proc; i++) { if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) { continue; } unsigned osId = address2os[i].second; kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); KMP_ASSERT(KMP_CPU_ISSET(osId, src)); KMP_CPU_COPY(dest, src); if (++j >= __kmp_affinity_num_masks) { break; } } KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); } break; default: KMP_ASSERT2(0, "Unexpected affinity setting"); } KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); machine_hierarchy.init(address2os, __kmp_avail_proc); } #undef KMP_EXIT_AFF_NONE void __kmp_affinity_initialize(void) { // Much of the code above was written assumming that if a machine was not // affinity capable, then __kmp_affinity_type == affinity_none. We now // explicitly represent this as __kmp_affinity_type == affinity_disabled. // There are too many checks for __kmp_affinity_type == affinity_none // in this code. 
Instead of trying to change them all, check if
// __kmp_affinity_type == affinity_disabled, and if so, slam it with
// affinity_none, call the real initialization routine, then restore
// __kmp_affinity_type to affinity_disabled.
  int disabled = (__kmp_affinity_type == affinity_disabled);
  if (!KMP_AFFINITY_CAPABLE()) {
    // A machine that is not affinity-capable must have been marked disabled.
    KMP_ASSERT(disabled);
  }
  if (disabled) {
    __kmp_affinity_type = affinity_none;
  }
  __kmp_aux_affinity_initialize();
  if (disabled) {
    __kmp_affinity_type = affinity_disabled;
  }
}

// Release every resource the affinity code allocated and reset all state to
// its pre-initialization defaults (masks, topology table, procarr, hwloc).
void __kmp_affinity_uninitialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
    __kmp_affinity_masks = NULL;
  }
  if (__kmp_affin_fullMask != NULL) {
    KMP_CPU_FREE(__kmp_affin_fullMask);
    __kmp_affin_fullMask = NULL;
  }
  __kmp_affinity_num_masks = 0;
  __kmp_affinity_type = affinity_default;
  __kmp_affinity_num_places = 0;
  if (__kmp_affinity_proclist != NULL) {
    __kmp_free(__kmp_affinity_proclist);
    __kmp_affinity_proclist = NULL;
  }
  if (address2os != NULL) {
    __kmp_free(address2os);
    address2os = NULL;
  }
  if (procarr != NULL) {
    __kmp_free(procarr);
    procarr = NULL;
  }
#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology != NULL) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif
  KMPAffinity::destroy_api();
}

// Compute and apply the initial affinity mask for thread gtid; isa_root is
// nonzero for the root thread of a parallel region.
void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
  if (th->th.th_affin_mask == NULL) {
    KMP_CPU_ALLOC(th->th.th_affin_mask);
  } else {
    KMP_CPU_ZERO(th->th.th_affin_mask);
  }

  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
  // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
  // then the full mask is the same as the mask of the initialization thread.
kmp_affin_mask_t *mask; int i; if (KMP_AFFINITY_NON_PROC_BIND) { if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)) { #if KMP_GROUP_AFFINITY if (__kmp_num_proc_groups > 1) { return; } #endif KMP_ASSERT(__kmp_affin_fullMask != NULL); i = 0; mask = __kmp_affin_fullMask; } else { KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); } } else { if ((!isa_root) || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { #if KMP_GROUP_AFFINITY if (__kmp_num_proc_groups > 1) { return; } #endif KMP_ASSERT(__kmp_affin_fullMask != NULL); i = KMP_PLACE_ALL; mask = __kmp_affin_fullMask; } else { // int i = some hash function or just a counter that doesn't // always start at 0. Use gtid for now. KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); } } th->th.th_current_place = i; if (isa_root) { th->th.th_new_place = i; th->th.th_first_place = 0; th->th.th_last_place = __kmp_affinity_num_masks - 1; } else if (KMP_AFFINITY_NON_PROC_BIND) { // When using a Non-OMP_PROC_BIND affinity method, // set all threads' place-partition-var to the entire place list th->th.th_first_place = 0; th->th.th_last_place = __kmp_affinity_num_masks - 1; } if (i == KMP_PLACE_ALL) { KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", gtid)); } else { KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", gtid, i)); } KMP_CPU_COPY(th->th.th_affin_mask, mask); if (__kmp_affinity_verbose /* to avoid duplicate printing (will be correctly printed on barrier) */ && (__kmp_affinity_type == affinity_none || (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, th->th.th_affin_mask); KMP_INFORM(BoundToOSProcSet, 
"KMP_AFFINITY", (kmp_int32)getpid(), __kmp_gettid(), gtid, buf); } #if KMP_OS_WINDOWS // On Windows* OS, the process affinity mask might have changed. If the user // didn't request affinity and this call fails, just continue silently. // See CQ171393. if (__kmp_affinity_type == affinity_none) { __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); } else #endif __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); } void __kmp_affinity_set_place(int gtid) { if (!KMP_AFFINITY_CAPABLE()) { return; } kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " "place = %d)\n", gtid, th->th.th_new_place, th->th.th_current_place)); // Check that the new place is within this thread's partition. KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); KMP_ASSERT(th->th.th_new_place >= 0); KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); if (th->th.th_first_place <= th->th.th_last_place) { KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) && (th->th.th_new_place <= th->th.th_last_place)); } else { KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) || (th->th.th_new_place >= th->th.th_last_place)); } // Copy the thread mask to the kmp_info_t strucuture, // and set this thread's affinity. 
kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place); KMP_CPU_COPY(th->th.th_affin_mask, mask); th->th.th_current_place = th->th.th_new_place; if (__kmp_affinity_verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, th->th.th_affin_mask); KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), __kmp_gettid(), gtid, buf); } __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); } int __kmp_aux_set_affinity(void **mask) { int gtid; kmp_info_t *th; int retval; if (!KMP_AFFINITY_CAPABLE()) { return -1; } gtid = __kmp_entry_gtid(); KA_TRACE(1000, (""); { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, (kmp_affin_mask_t *)(*mask)); __kmp_debug_printf( "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid, buf); }); if (__kmp_env_consistency_check) { if ((mask == NULL) || (*mask == NULL)) { KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); } else { unsigned proc; int num_procs = 0; KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) { if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); } if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { continue; } num_procs++; } if (num_procs == 0) { KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); } #if KMP_GROUP_AFFINITY if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) { KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); } #endif /* KMP_GROUP_AFFINITY */ } } th = __kmp_threads[gtid]; KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); if (retval == 0) { KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask)); } th->th.th_current_place = KMP_PLACE_UNDEFINED; th->th.th_new_place = KMP_PLACE_UNDEFINED; th->th.th_first_place = 0; th->th.th_last_place = __kmp_affinity_num_masks - 1; // Turn off 4.0 affinity for the current tread at this 
parallel level.
  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;

  return retval;
}

// Backing implementation of kmp_get_affinity(): fills *mask with this
// thread's affinity. Returns -1 when affinity is not capable; otherwise the
// OS query result (non-Windows) or 0 after copying the stored mask (Windows).
int __kmp_aux_get_affinity(void **mask) {
  int gtid;
  int retval;
  kmp_info_t *th;
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }
  gtid = __kmp_entry_gtid();
  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

  KA_TRACE(1000, (""); {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
                 gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
    }
  }

#if !KMP_OS_WINDOWS
  // Ask the OS directly for the current mask.
  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  KA_TRACE(1000, (""); {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
                 gtid, buf);
  });
  return retval;
#else
  // Windows: return the mask cached in kmp_info_t instead of querying the OS.
  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
  return 0;
#endif /* KMP_OS_WINDOWS */
}

// Largest OS proc id + 1 that a user mask may address; 0 when not capable.
int __kmp_aux_get_affinity_max_proc() {
  if (!KMP_AFFINITY_CAPABLE()) {
    return 0;
  }
#if KMP_GROUP_AFFINITY
  if (__kmp_num_proc_groups > 1) {
    // With Windows processor groups the mask covers all groups' bits.
    return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
  }
#endif
  return __kmp_xproc;
}

// Backing implementation of kmp_set_affinity_mask_proc(): sets bit 'proc'
// in the user-supplied mask after validating it.
int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(1000, (""); {
    int gtid = __kmp_entry_gtid();
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
                       "affinity mask for thread %d = %s\n",
                       proc, gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
    }
  }
  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2; // proc exists but is outside the machine-model mask
  }
  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}

// Backing implementation of kmp_unset_affinity_mask_proc(): clears bit
// 'proc' in the user mask. Returns 0 on success, -1 for an out-of-range
// proc (or not capable), -2 when proc is not in the full mask.
int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(1000, (""); {
    int gtid = __kmp_entry_gtid();
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
                       "affinity mask for thread %d = %s\n",
                       proc, gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
    }
  }
  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }
  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}

// Backing implementation of kmp_get_affinity_mask_proc(): tests bit 'proc'
// in the user mask. Returns the bit value, 0 when proc is not in the full
// mask, -1 for an out-of-range proc (or not capable).
int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(1000, (""); {
    int gtid = __kmp_entry_gtid();
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
                       "affinity mask for thread %d = %s\n",
                       proc, gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
    }
  }
  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return 0;
  }

  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}

// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
  KMP_DEBUG_ASSERT(th);
  // fine_gran: bind each thread to a single HW context rather than a whole
  // core/package (true only when the requested granularity is fine enough).
  bool fine_gran = true;
  int tid = th->th.th_info.ds.ds_tid;

  switch (__kmp_affinity_gran) {
  case affinity_gran_fine:
  case affinity_gran_thread:
    break;
  case affinity_gran_core:
    if (__kmp_nThreadsPerCore > 1) {
fine_gran = false; } break; case affinity_gran_package: if (nCoresPerPkg > 1) { fine_gran = false; } break; default: fine_gran = false; } if (__kmp_affinity_uniform_topology()) { int coreID; int threadID; // Number of hyper threads per core in HT machine int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; // Number of cores int ncores = __kmp_ncores; if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { __kmp_nth_per_core = __kmp_avail_proc / nPackages; ncores = nPackages; } // How many threads will be bound to each core int chunk = nthreads / ncores; // How many cores will have an additional thread bound to it - "big cores" int big_cores = nthreads % ncores; // Number of threads on the big cores int big_nth = (chunk + 1) * big_cores; if (tid < big_nth) { coreID = tid / (chunk + 1); threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; } else { // tid >= big_nth coreID = (tid - big_cores) / chunk; threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; } KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal set affinity operation when not capable"); kmp_affin_mask_t *mask = th->th.th_affin_mask; KMP_CPU_ZERO(mask); if (fine_gran) { int osID = address2os[coreID * __kmp_nth_per_core + threadID].second; KMP_CPU_SET(osID, mask); } else { for (int i = 0; i < __kmp_nth_per_core; i++) { int osID; osID = address2os[coreID * __kmp_nth_per_core + i].second; KMP_CPU_SET(osID, mask); } } if (__kmp_affinity_verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), __kmp_gettid(), tid, buf); } __kmp_set_system_affinity(mask, TRUE); } else { // Non-uniform topology kmp_affin_mask_t *mask = th->th.th_affin_mask; KMP_CPU_ZERO(mask); int core_level = __kmp_affinity_find_core_level( address2os, __kmp_avail_proc, __kmp_aff_depth - 1); int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); int nth_per_core = 
__kmp_affinity_max_proc_per_core( address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); // For performance gain consider the special case nthreads == // __kmp_avail_proc if (nthreads == __kmp_avail_proc) { if (fine_gran) { int osID = address2os[tid].second; KMP_CPU_SET(osID, mask); } else { int core = __kmp_affinity_find_core(address2os, tid, __kmp_aff_depth - 1, core_level); for (int i = 0; i < __kmp_avail_proc; i++) { int osID = address2os[i].second; if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1, core_level) == core) { KMP_CPU_SET(osID, mask); } } } } else if (nthreads <= ncores) { int core = 0; for (int i = 0; i < ncores; i++) { // Check if this core from procarr[] is in the mask int in_mask = 0; for (int j = 0; j < nth_per_core; j++) { if (procarr[i * nth_per_core + j] != -1) { in_mask = 1; break; } } if (in_mask) { if (tid == core) { for (int j = 0; j < nth_per_core; j++) { int osID = procarr[i * nth_per_core + j]; if (osID != -1) { KMP_CPU_SET(osID, mask); // For fine granularity it is enough to set the first available // osID for this core if (fine_gran) { break; } } } break; } else { core++; } } } } else { // nthreads > ncores // Array to save the number of processors at each core int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); // Array to save the number of cores with "x" available processors; int *ncores_with_x_procs = (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); // Array to save the number of cores with # procs from x to nth_per_core int *ncores_with_x_to_max_procs = (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); for (int i = 0; i <= nth_per_core; i++) { ncores_with_x_procs[i] = 0; ncores_with_x_to_max_procs[i] = 0; } for (int i = 0; i < ncores; i++) { int cnt = 0; for (int j = 0; j < nth_per_core; j++) { if (procarr[i * nth_per_core + j] != -1) { cnt++; } } nproc_at_core[i] = cnt; ncores_with_x_procs[cnt]++; } for (int i = 0; i <= nth_per_core; i++) { for (int j = i; j <= nth_per_core; j++) { 
ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; } } // Max number of processors int nproc = nth_per_core * ncores; // An array to keep number of threads per each context int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); for (int i = 0; i < nproc; i++) { newarr[i] = 0; } int nth = nthreads; int flag = 0; while (nth > 0) { for (int j = 1; j <= nth_per_core; j++) { int cnt = ncores_with_x_to_max_procs[j]; for (int i = 0; i < ncores; i++) { // Skip the core with 0 processors if (nproc_at_core[i] == 0) { continue; } for (int k = 0; k < nth_per_core; k++) { if (procarr[i * nth_per_core + k] != -1) { if (newarr[i * nth_per_core + k] == 0) { newarr[i * nth_per_core + k] = 1; cnt--; nth--; break; } else { if (flag != 0) { newarr[i * nth_per_core + k]++; cnt--; nth--; break; } } } } if (cnt == 0 || nth == 0) { break; } } if (nth == 0) { break; } } flag = 1; } int sum = 0; for (int i = 0; i < nproc; i++) { sum += newarr[i]; if (sum > tid) { if (fine_gran) { int osID = procarr[i]; KMP_CPU_SET(osID, mask); } else { int coreID = i / nth_per_core; for (int ii = 0; ii < nth_per_core; ii++) { int osID = procarr[coreID * nth_per_core + ii]; if (osID != -1) { KMP_CPU_SET(osID, mask); } } } break; } } __kmp_free(newarr); } if (__kmp_affinity_verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), __kmp_gettid(), tid, buf); } __kmp_set_system_affinity(mask, TRUE); } } #if KMP_OS_LINUX // We don't need this entry for Windows because // there is GetProcessAffinityMask() api // // The intended usage is indicated by these steps: // 1) The user gets the current affinity mask // 2) Then sets the affinity by calling this function // 3) Error check the return value // 4) Use non-OpenMP parallelization // 5) Reset the affinity to what was stored in step 1) #ifdef __cplusplus extern "C" #endif int kmp_set_thread_affinity_mask_initial() // the function returns 
0 on success, // -1 if we cannot bind thread // >0 (errno) if an error happened during binding { int gtid = __kmp_get_gtid(); if (gtid < 0) { // Do not touch non-omp threads KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " "non-omp thread, returning\n")); return -1; } if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " "affinity not initialized, returning\n")); return -1; } KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " "set full mask for thread %d\n", gtid)); KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); } #endif #endif // KMP_AFFINITY_SUPPORTED Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_affinity.h (revision 357059) @@ -1,827 +1,842 @@ /* * kmp_affinity.h -- header for affinity management */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef KMP_AFFINITY_H #define KMP_AFFINITY_H #include "kmp.h" #include "kmp_os.h" #if KMP_AFFINITY_SUPPORTED #if KMP_USE_HWLOC class KMPHwlocAffinity : public KMPAffinity { public: class Mask : public KMPAffinity::Mask { hwloc_cpuset_t mask; public: Mask() { mask = hwloc_bitmap_alloc(); this->zero(); } ~Mask() { hwloc_bitmap_free(mask); } void set(int i) override { hwloc_bitmap_set(mask, i); } bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); } void clear(int i) override { hwloc_bitmap_clr(mask, i); } void zero() override { hwloc_bitmap_zero(mask); } void copy(const KMPAffinity::Mask *src) override { const Mask *convert = static_cast(src); hwloc_bitmap_copy(mask, convert->mask); } void bitwise_and(const KMPAffinity::Mask *rhs) override { const Mask *convert = static_cast(rhs); hwloc_bitmap_and(mask, mask, convert->mask); } void bitwise_or(const KMPAffinity::Mask *rhs) override { const Mask *convert = static_cast(rhs); hwloc_bitmap_or(mask, mask, convert->mask); } void bitwise_not() override { hwloc_bitmap_not(mask, mask); } int begin() const override { return hwloc_bitmap_first(mask); } int end() const override { return -1; } int next(int previous) const override { return hwloc_bitmap_next(mask, previous); } int get_system_affinity(bool abort_on_error) override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal get affinity operation when not capable"); int retval = hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD); if (retval >= 0) { return 0; } int error = errno; if (abort_on_error) { __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null); } return error; } int set_system_affinity(bool abort_on_error) const override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal get affinity operation when not capable"); int retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask, 
HWLOC_CPUBIND_THREAD); if (retval >= 0) { return 0; } int error = errno; if (abort_on_error) { __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null); } return error; } int get_proc_group() const override { int group = -1; #if KMP_OS_WINDOWS if (__kmp_num_proc_groups == 1) { return 1; } for (int i = 0; i < __kmp_num_proc_groups; i++) { // On windows, the long type is always 32 bits unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2); unsigned long second_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1); if (first_32_bits == 0 && second_32_bits == 0) { continue; } if (group >= 0) { return -1; } group = i; } #endif /* KMP_OS_WINDOWS */ return group; } }; void determine_capable(const char *var) override { const hwloc_topology_support *topology_support; if (__kmp_hwloc_topology == NULL) { if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) { __kmp_hwloc_error = TRUE; if (__kmp_affinity_verbose) KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()"); } if (hwloc_topology_load(__kmp_hwloc_topology) < 0) { __kmp_hwloc_error = TRUE; if (__kmp_affinity_verbose) KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()"); } } topology_support = hwloc_topology_get_support(__kmp_hwloc_topology); // Is the system capable of setting/getting this thread's affinity? // Also, is topology discovery possible? (pu indicates ability to discover // processing units). And finally, were there no errors when calling any // hwloc_* API functions? 
if (topology_support && topology_support->cpubind->set_thisthread_cpubind && topology_support->cpubind->get_thisthread_cpubind && topology_support->discovery->pu && !__kmp_hwloc_error) { // enables affinity according to KMP_AFFINITY_CAPABLE() macro KMP_AFFINITY_ENABLE(TRUE); } else { // indicate that hwloc didn't work and disable affinity __kmp_hwloc_error = TRUE; KMP_AFFINITY_DISABLE(); } } void bind_thread(int which) override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal set affinity operation when not capable"); KMPAffinity::Mask *mask; KMP_CPU_ALLOC_ON_STACK(mask); KMP_CPU_ZERO(mask); KMP_CPU_SET(which, mask); __kmp_set_system_affinity(mask, TRUE); KMP_CPU_FREE_FROM_STACK(mask); } KMPAffinity::Mask *allocate_mask() override { return new Mask(); } void deallocate_mask(KMPAffinity::Mask *m) override { delete m; } KMPAffinity::Mask *allocate_mask_array(int num) override { return new Mask[num]; } void deallocate_mask_array(KMPAffinity::Mask *array) override { Mask *hwloc_array = static_cast(array); delete[] hwloc_array; } KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array, int index) override { Mask *hwloc_array = static_cast(array); return &(hwloc_array[index]); } api_type get_api_type() const override { return HWLOC; } }; #endif /* KMP_USE_HWLOC */ +#if KMP_OS_LINUX || KMP_OS_FREEBSD #if KMP_OS_LINUX /* On some of the older OS's that we build on, these constants aren't present in #included from . They must be the same on all systems of the same arch where they are defined, and they cannot change. stone forever. */ #include #if KMP_ARCH_X86 || KMP_ARCH_ARM #ifndef __NR_sched_setaffinity #define __NR_sched_setaffinity 241 #elif __NR_sched_setaffinity != 241 #error Wrong code for setaffinity system call. #endif /* __NR_sched_setaffinity */ #ifndef __NR_sched_getaffinity #define __NR_sched_getaffinity 242 #elif __NR_sched_getaffinity != 242 #error Wrong code for getaffinity system call. 
#endif /* __NR_sched_getaffinity */ #elif KMP_ARCH_AARCH64 #ifndef __NR_sched_setaffinity #define __NR_sched_setaffinity 122 #elif __NR_sched_setaffinity != 122 #error Wrong code for setaffinity system call. #endif /* __NR_sched_setaffinity */ #ifndef __NR_sched_getaffinity #define __NR_sched_getaffinity 123 #elif __NR_sched_getaffinity != 123 #error Wrong code for getaffinity system call. #endif /* __NR_sched_getaffinity */ #elif KMP_ARCH_X86_64 #ifndef __NR_sched_setaffinity #define __NR_sched_setaffinity 203 #elif __NR_sched_setaffinity != 203 #error Wrong code for setaffinity system call. #endif /* __NR_sched_setaffinity */ #ifndef __NR_sched_getaffinity #define __NR_sched_getaffinity 204 #elif __NR_sched_getaffinity != 204 #error Wrong code for getaffinity system call. #endif /* __NR_sched_getaffinity */ #elif KMP_ARCH_PPC64 #ifndef __NR_sched_setaffinity #define __NR_sched_setaffinity 222 #elif __NR_sched_setaffinity != 222 #error Wrong code for setaffinity system call. #endif /* __NR_sched_setaffinity */ #ifndef __NR_sched_getaffinity #define __NR_sched_getaffinity 223 #elif __NR_sched_getaffinity != 223 #error Wrong code for getaffinity system call. #endif /* __NR_sched_getaffinity */ #elif KMP_ARCH_MIPS #ifndef __NR_sched_setaffinity #define __NR_sched_setaffinity 4239 #elif __NR_sched_setaffinity != 4239 #error Wrong code for setaffinity system call. #endif /* __NR_sched_setaffinity */ #ifndef __NR_sched_getaffinity #define __NR_sched_getaffinity 4240 #elif __NR_sched_getaffinity != 4240 #error Wrong code for getaffinity system call. #endif /* __NR_sched_getaffinity */ #elif KMP_ARCH_MIPS64 #ifndef __NR_sched_setaffinity #define __NR_sched_setaffinity 5195 #elif __NR_sched_setaffinity != 5195 #error Wrong code for setaffinity system call. #endif /* __NR_sched_setaffinity */ #ifndef __NR_sched_getaffinity #define __NR_sched_getaffinity 5196 #elif __NR_sched_getaffinity != 5196 #error Wrong code for getaffinity system call. 
#endif /* __NR_sched_getaffinity */ #error Unknown or unsupported architecture #endif /* KMP_ARCH_* */ +#elif KMP_OS_FREEBSD +#include +#include +#endif class KMPNativeAffinity : public KMPAffinity { class Mask : public KMPAffinity::Mask { typedef unsigned char mask_t; static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT; public: mask_t *mask; Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); } ~Mask() { if (mask) __kmp_free(mask); } void set(int i) override { mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T)); } bool is_set(int i) const override { return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T))); } void clear(int i) override { mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T)); } void zero() override { for (size_t i = 0; i < __kmp_affin_mask_size; ++i) mask[i] = 0; } void copy(const KMPAffinity::Mask *src) override { const Mask *convert = static_cast(src); for (size_t i = 0; i < __kmp_affin_mask_size; ++i) mask[i] = convert->mask[i]; } void bitwise_and(const KMPAffinity::Mask *rhs) override { const Mask *convert = static_cast(rhs); for (size_t i = 0; i < __kmp_affin_mask_size; ++i) mask[i] &= convert->mask[i]; } void bitwise_or(const KMPAffinity::Mask *rhs) override { const Mask *convert = static_cast(rhs); for (size_t i = 0; i < __kmp_affin_mask_size; ++i) mask[i] |= convert->mask[i]; } void bitwise_not() override { for (size_t i = 0; i < __kmp_affin_mask_size; ++i) mask[i] = ~(mask[i]); } int begin() const override { int retval = 0; while (retval < end() && !is_set(retval)) ++retval; return retval; } int end() const override { return __kmp_affin_mask_size * BITS_PER_MASK_T; } int next(int previous) const override { int retval = previous + 1; while (retval < end() && !is_set(retval)) ++retval; return retval; } int get_system_affinity(bool abort_on_error) override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal get affinity operation when not capable"); +#if KMP_OS_LINUX int retval = 
syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask); +#elif KMP_OS_FREEBSD + int retval = + pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast(mask)); +#endif if (retval >= 0) { return 0; } int error = errno; if (abort_on_error) { __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null); } return error; } int set_system_affinity(bool abort_on_error) const override { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal get affinity operation when not capable"); +#if KMP_OS_LINUX int retval = syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask); +#elif KMP_OS_FREEBSD + int retval = + pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size, reinterpret_cast(mask)); +#endif if (retval >= 0) { return 0; } int error = errno; if (abort_on_error) { __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null); } return error; } }; void determine_capable(const char *env_var) override { __kmp_affinity_determine_capable(env_var); } void bind_thread(int which) override { __kmp_affinity_bind_thread(which); } KMPAffinity::Mask *allocate_mask() override { KMPNativeAffinity::Mask *retval = new Mask(); return retval; } void deallocate_mask(KMPAffinity::Mask *m) override { KMPNativeAffinity::Mask *native_mask = static_cast(m); delete native_mask; } KMPAffinity::Mask *allocate_mask_array(int num) override { return new Mask[num]; } void deallocate_mask_array(KMPAffinity::Mask *array) override { Mask *linux_array = static_cast(array); delete[] linux_array; } KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array, int index) override { Mask *linux_array = static_cast(array); return &(linux_array[index]); } api_type get_api_type() const override { return NATIVE_OS; } }; -#endif /* KMP_OS_LINUX */ +#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */ #if KMP_OS_WINDOWS class KMPNativeAffinity : public KMPAffinity { class Mask : public KMPAffinity::Mask { typedef ULONG_PTR mask_t; static const int BITS_PER_MASK_T = 
sizeof(mask_t) * CHAR_BIT; mask_t *mask; public: Mask() { mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups); } ~Mask() { if (mask) __kmp_free(mask); } void set(int i) override { mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T)); } bool is_set(int i) const override { return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T))); } void clear(int i) override { mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T)); } void zero() override { for (int i = 0; i < __kmp_num_proc_groups; ++i) mask[i] = 0; } void copy(const KMPAffinity::Mask *src) override { const Mask *convert = static_cast(src); for (int i = 0; i < __kmp_num_proc_groups; ++i) mask[i] = convert->mask[i]; } void bitwise_and(const KMPAffinity::Mask *rhs) override { const Mask *convert = static_cast(rhs); for (int i = 0; i < __kmp_num_proc_groups; ++i) mask[i] &= convert->mask[i]; } void bitwise_or(const KMPAffinity::Mask *rhs) override { const Mask *convert = static_cast(rhs); for (int i = 0; i < __kmp_num_proc_groups; ++i) mask[i] |= convert->mask[i]; } void bitwise_not() override { for (int i = 0; i < __kmp_num_proc_groups; ++i) mask[i] = ~(mask[i]); } int begin() const override { int retval = 0; while (retval < end() && !is_set(retval)) ++retval; return retval; } int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; } int next(int previous) const override { int retval = previous + 1; while (retval < end() && !is_set(retval)) ++retval; return retval; } int set_system_affinity(bool abort_on_error) const override { if (__kmp_num_proc_groups > 1) { // Check for a valid mask. GROUP_AFFINITY ga; int group = get_proc_group(); if (group < 0) { if (abort_on_error) { KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); } return -1; } // Transform the bit vector into a GROUP_AFFINITY struct // and make the system call to set affinity. 
ga.Group = group; ga.Mask = mask[group]; ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0; KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL); if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) { DWORD error = GetLastError(); if (abort_on_error) { __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), __kmp_msg_null); } return error; } } else { if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) { DWORD error = GetLastError(); if (abort_on_error) { __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), __kmp_msg_null); } return error; } } return 0; } int get_system_affinity(bool abort_on_error) override { if (__kmp_num_proc_groups > 1) { this->zero(); GROUP_AFFINITY ga; KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL); if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) { DWORD error = GetLastError(); if (abort_on_error) { __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"), KMP_ERR(error), __kmp_msg_null); } return error; } if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) || (ga.Mask == 0)) { return -1; } mask[ga.Group] = ga.Mask; } else { mask_t newMask, sysMask, retval; if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) { DWORD error = GetLastError(); if (abort_on_error) { __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"), KMP_ERR(error), __kmp_msg_null); } return error; } retval = SetThreadAffinityMask(GetCurrentThread(), newMask); if (!retval) { DWORD error = GetLastError(); if (abort_on_error) { __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"), KMP_ERR(error), __kmp_msg_null); } return error; } newMask = SetThreadAffinityMask(GetCurrentThread(), retval); if (!newMask) { DWORD error = GetLastError(); if (abort_on_error) { __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"), KMP_ERR(error), __kmp_msg_null); } } *mask = retval; } return 0; } int get_proc_group() const override { int group = -1; if (__kmp_num_proc_groups == 1) { return 
1; } for (int i = 0; i < __kmp_num_proc_groups; i++) { if (mask[i] == 0) continue; if (group >= 0) return -1; group = i; } return group; } }; void determine_capable(const char *env_var) override { __kmp_affinity_determine_capable(env_var); } void bind_thread(int which) override { __kmp_affinity_bind_thread(which); } KMPAffinity::Mask *allocate_mask() override { return new Mask(); } void deallocate_mask(KMPAffinity::Mask *m) override { delete m; } KMPAffinity::Mask *allocate_mask_array(int num) override { return new Mask[num]; } void deallocate_mask_array(KMPAffinity::Mask *array) override { Mask *windows_array = static_cast(array); delete[] windows_array; } KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array, int index) override { Mask *windows_array = static_cast(array); return &(windows_array[index]); } api_type get_api_type() const override { return NATIVE_OS; } }; #endif /* KMP_OS_WINDOWS */ #endif /* KMP_AFFINITY_SUPPORTED */ class Address { public: static const unsigned maxDepth = 32; unsigned labels[maxDepth]; unsigned childNums[maxDepth]; unsigned depth; unsigned leader; Address(unsigned _depth) : depth(_depth), leader(FALSE) {} Address &operator=(const Address &b) { depth = b.depth; for (unsigned i = 0; i < depth; i++) { labels[i] = b.labels[i]; childNums[i] = b.childNums[i]; } leader = FALSE; return *this; } bool operator==(const Address &b) const { if (depth != b.depth) return false; for (unsigned i = 0; i < depth; i++) if (labels[i] != b.labels[i]) return false; return true; } bool isClose(const Address &b, int level) const { if (depth != b.depth) return false; if ((unsigned)level >= depth) return true; for (unsigned i = 0; i < (depth - level); i++) if (labels[i] != b.labels[i]) return false; return true; } bool operator!=(const Address &b) const { return !operator==(b); } void print() const { unsigned i; printf("Depth: %u --- ", depth); for (i = 0; i < depth; i++) { printf("%u ", labels[i]); } } }; class AddrUnsPair { public: Address first; 
unsigned second; AddrUnsPair(Address _first, unsigned _second) : first(_first), second(_second) {} AddrUnsPair &operator=(const AddrUnsPair &b) { first = b.first; second = b.second; return *this; } void print() const { printf("first = "); first.print(); printf(" --- second = %u", second); } bool operator==(const AddrUnsPair &b) const { if (first != b.first) return false; if (second != b.second) return false; return true; } bool operator!=(const AddrUnsPair &b) const { return !operator==(b); } }; static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) { const Address *aa = &(((const AddrUnsPair *)a)->first); const Address *bb = &(((const AddrUnsPair *)b)->first); unsigned depth = aa->depth; unsigned i; KMP_DEBUG_ASSERT(depth == bb->depth); for (i = 0; i < depth; i++) { if (aa->labels[i] < bb->labels[i]) return -1; if (aa->labels[i] > bb->labels[i]) return 1; } return 0; } /* A structure for holding machine-specific hierarchy info to be computed once at init. This structure represents a mapping of threads to the actual machine hierarchy, or to our best guess at what the hierarchy might be, for the purpose of performing an efficient barrier. In the worst case, when there is no machine hierarchy information, it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */ class hierarchy_info { public: /* Good default values for number of leaves and branching factor, given no affinity information. Behaves a bit like hyper barrier. */ static const kmp_uint32 maxLeaves = 4; static const kmp_uint32 minBranch = 4; /** Number of levels in the hierarchy. Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine, etc. We don't want to get specific with nomenclature. When the machine is oversubscribed we add levels to duplicate the hierarchy, doubling the thread capacity of the hierarchy each time we add a level. 
*/ kmp_uint32 maxLevels; /** This is specifically the depth of the machine configuration hierarchy, in terms of the number of levels along the longest path from root to any leaf. It corresponds to the number of entries in numPerLevel if we exclude all but one trailing 1. */ kmp_uint32 depth; kmp_uint32 base_num_threads; enum init_status { initialized = 0, not_initialized = 1, initializing = 2 }; volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized, // 2=initialization in progress volatile kmp_int8 resizing; // 0=not resizing, 1=resizing /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a node at level i has. For example, if we have a machine with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */ kmp_uint32 *numPerLevel; kmp_uint32 *skipPerLevel; void deriveLevels(AddrUnsPair *adr2os, int num_addrs) { int hier_depth = adr2os[0].first.depth; int level = 0; for (int i = hier_depth - 1; i >= 0; --i) { int max = -1; for (int j = 0; j < num_addrs; ++j) { int next = adr2os[j].first.childNums[i]; if (next > max) max = next; } numPerLevel[level] = max + 1; ++level; } } hierarchy_info() : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {} void fini() { if (!uninitialized && numPerLevel) { __kmp_free(numPerLevel); numPerLevel = NULL; uninitialized = not_initialized; } } void init(AddrUnsPair *adr2os, int num_addrs) { kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8( &uninitialized, not_initialized, initializing); if (bool_result == 0) { // Wait for initialization while (TCR_1(uninitialized) != initialized) KMP_CPU_PAUSE(); return; } KMP_DEBUG_ASSERT(bool_result == 1); /* Added explicit initialization of the data fields here to prevent usage of dirty value observed when static library is re-initialized multiple times (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). 
*/ depth = 1; resizing = 0; maxLevels = 7; numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32)); skipPerLevel = &(numPerLevel[maxLevels]); for (kmp_uint32 i = 0; i < maxLevels; ++i) { // init numPerLevel[*] to 1 item per level numPerLevel[i] = 1; skipPerLevel[i] = 1; } // Sort table by physical ID if (adr2os) { qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels); deriveLevels(adr2os, num_addrs); } else { numPerLevel[0] = maxLeaves; numPerLevel[1] = num_addrs / maxLeaves; if (num_addrs % maxLeaves) numPerLevel[1]++; } base_num_threads = num_addrs; for (int i = maxLevels - 1; i >= 0; --i) // count non-empty levels to get depth if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1' depth++; kmp_uint32 branch = minBranch; if (numPerLevel[0] == 1) branch = num_addrs / maxLeaves; if (branch < minBranch) branch = minBranch; for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width while (numPerLevel[d] > branch || (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0! 
if (numPerLevel[d] & 1) numPerLevel[d]++; numPerLevel[d] = numPerLevel[d] >> 1; if (numPerLevel[d + 1] == 1) depth++; numPerLevel[d + 1] = numPerLevel[d + 1] << 1; } if (numPerLevel[0] == 1) { branch = branch >> 1; if (branch < 4) branch = minBranch; } } for (kmp_uint32 i = 1; i < depth; ++i) skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1]; // Fill in hierarchy in the case of oversubscription for (kmp_uint32 i = depth; i < maxLevels; ++i) skipPerLevel[i] = 2 * skipPerLevel[i - 1]; uninitialized = initialized; // One writer } // Resize the hierarchy if nproc changes to something larger than before void resize(kmp_uint32 nproc) { kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1); while (bool_result == 0) { // someone else is trying to resize KMP_CPU_PAUSE(); if (nproc <= base_num_threads) // happy with other thread's resize return; else // try to resize bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1); } KMP_DEBUG_ASSERT(bool_result != 0); if (nproc <= base_num_threads) return; // happy with other thread's resize // Calculate new maxLevels kmp_uint32 old_sz = skipPerLevel[depth - 1]; kmp_uint32 incs = 0, old_maxLevels = maxLevels; // First see if old maxLevels is enough to contain new size for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) { skipPerLevel[i] = 2 * skipPerLevel[i - 1]; numPerLevel[i - 1] *= 2; old_sz *= 2; depth++; } if (nproc > old_sz) { // Not enough space, need to expand hierarchy while (nproc > old_sz) { old_sz *= 2; incs++; depth++; } maxLevels += incs; // Resize arrays kmp_uint32 *old_numPerLevel = numPerLevel; kmp_uint32 *old_skipPerLevel = skipPerLevel; numPerLevel = skipPerLevel = NULL; numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32)); skipPerLevel = &(numPerLevel[maxLevels]); // Copy old elements from old arrays for (kmp_uint32 i = 0; i < old_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level numPerLevel[i] = old_numPerLevel[i]; skipPerLevel[i] = 
old_skipPerLevel[i]; } // Init new elements in arrays to 1 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) { // init numPerLevel[*] to 1 item per level numPerLevel[i] = 1; skipPerLevel[i] = 1; } // Free old arrays __kmp_free(old_numPerLevel); } // Fill in oversubscription levels of hierarchy for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) skipPerLevel[i] = 2 * skipPerLevel[i - 1]; base_num_threads = nproc; resizing = 0; // One writer } }; #endif // KMP_AFFINITY_H Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_csupport.cpp (revision 357059) @@ -1,4187 +1,4189 @@ /* * kmp_csupport.cpp -- kfront linkage support for OpenMP. */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #define __KMP_IMP #include "omp.h" /* extern "C" declarations of user-visible routines */ #include "kmp.h" #include "kmp_error.h" #include "kmp_i18n.h" #include "kmp_itt.h" #include "kmp_lock.h" #include "kmp_stats.h" #if OMPT_SUPPORT #include "ompt-specific.h" #endif #define MAX_MESSAGE 512 // flags will be used in future, e.g. to implement openmp_strict library // restrictions /*! * @ingroup STARTUP_SHUTDOWN * @param loc in source location information * @param flags in for future use (currently ignored) * * Initialize the runtime library. This call is optional; if it is not made then * it will be implicitly called by attempts to use other library functions. 
*/ void __kmpc_begin(ident_t *loc, kmp_int32 flags) { // By default __kmpc_begin() is no-op. char *env; if ((env = getenv("KMP_INITIAL_THREAD_BIND")) != NULL && __kmp_str_match_true(env)) { __kmp_middle_initialize(); KC_TRACE(10, ("__kmpc_begin: middle initialization called\n")); } else if (__kmp_ignore_mppbeg() == FALSE) { // By default __kmp_ignore_mppbeg() returns TRUE. __kmp_internal_begin(); KC_TRACE(10, ("__kmpc_begin: called\n")); } } /*! * @ingroup STARTUP_SHUTDOWN * @param loc source location information * * Shutdown the runtime library. This is also optional, and even if called will * not do anything unless the `KMP_IGNORE_MPPEND` environment variable is set to * zero. */ void __kmpc_end(ident_t *loc) { // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end() // call no-op. However, this can be overridden with KMP_IGNORE_MPPEND // environment variable. If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() // returns FALSE and __kmpc_end() will unregister this root (it can cause // library shut down). if (__kmp_ignore_mppend() == FALSE) { KC_TRACE(10, ("__kmpc_end: called\n")); KA_TRACE(30, ("__kmpc_end\n")); __kmp_internal_end_thread(-1); } #if KMP_OS_WINDOWS && OMPT_SUPPORT // Normal exit process on Windows does not allow worker threads of the final // parallel region to finish reporting their events, so shutting down the // library here fixes the issue at least for the cases where __kmpc_end() is // placed properly. if (ompt_enabled.enabled) __kmp_internal_end_library(__kmp_gtid_get_specific()); #endif } /*! @ingroup THREAD_STATES @param loc Source location information. @return The global thread index of the active thread. This function can be called in any context. If the runtime has ony been entered at the outermost level from a single (necessarily non-OpenMP*) thread, then the thread number is that which would be returned by omp_get_thread_num() in the outermost active parallel construct. 
(Or zero if there is no active parallel construct, since the master thread is
necessarily thread zero).

If multiple non-OpenMP threads all enter an OpenMP construct then this will be
a unique thread identifier among all the threads created by the OpenMP runtime
(but the value cannot be defined in terms of OpenMP thread ids returned by
omp_get_thread_num()).
*/
kmp_int32 __kmpc_global_thread_num(ident_t *loc) {
  // Look up (registering the caller with the runtime if necessary) the
  // caller's global thread id.
  kmp_int32 const current_gtid = __kmp_entry_gtid();
  KC_TRACE(10, ("__kmpc_global_thread_num: T#%d\n", current_gtid));
  return current_gtid;
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads under control of the OpenMP* runtime

This function can be called in any context. It returns the total number of
threads under the control of the OpenMP runtime. That is not a number that
can be determined by any OpenMP standard calls, since the library may be
called from more than one non-OpenMP thread, and this reflects the total over
all such calls. Similarly the runtime maintains underlying threads even when
they are not active (since the cost of creating and destroying OS threads is
high), this call counts all such threads even if they are not waiting for
work.
*/
kmp_int32 __kmpc_global_num_threads(ident_t *loc) {
  KC_TRACE(10,
           ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_all_nth));
  // TCR_4 is the runtime's consistent 4-byte read of the shared counter.
  return TCR_4(__kmp_all_nth);
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The thread number of the calling thread in the innermost active
parallel construct.
*/
kmp_int32 __kmpc_bound_thread_num(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_thread_num: called\n"));
  // Translate the caller's global id into its team-local (bound) id.
  kmp_int32 const caller_gtid = __kmp_entry_gtid();
  return __kmp_tid_from_gtid(caller_gtid);
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return The number of threads in the innermost active parallel construct.
*/
kmp_int32 __kmpc_bound_num_threads(ident_t *loc) {
  KC_TRACE(10, ("__kmpc_bound_num_threads: called\n"));
  kmp_info_t *const caller = __kmp_entry_thread();
  return caller->th.th_team->t.t_nproc;
}

/*!
 * @ingroup DEPRECATED
 * @param loc location description
 *
 * This function need not be called. It always returns TRUE.
 */
kmp_int32 __kmpc_ok_to_fork(ident_t *loc) {
#ifndef KMP_DEBUG

  return TRUE;

#else

  const char *semi2;
  const char *semi3;
  int line_no;

  // KMP_PAR_RANGE filtering: decide whether this particular fork site falls
  // inside the user-requested file/routine/line range. __kmp_par_range == 0
  // means no filtering is active.
  if (__kmp_par_range == 0) {
    return TRUE;
  }
  // loc->psource appears to be a ';'-separated record with the file name
  // ending at the second ';' and the routine/line following it —
  // NOTE(review): format assumed from the parsing below; confirm against the
  // ident_t documentation in kmp.h.
  semi2 = loc->psource;
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  semi2 = strchr(semi2 + 1, ';');
  if (semi2 == NULL) {
    return TRUE;
  }
  if (__kmp_par_range_filename[0]) {
    // Walk backwards to strip any directory components before comparing the
    // base file name against the filter.
    const char *name = semi2 - 1;
    while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
      name--;
    }
    if ((*name == '/') || (*name == ';')) {
      name++;
    }
    if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
      // File does not match: honor the sign convention (negative range means
      // "parallelize everything outside the range").
      return __kmp_par_range < 0;
    }
  }
  semi3 = strchr(semi2 + 1, ';');
  if (__kmp_par_range_routine[0]) {
    if ((semi3 != NULL) && (semi3 > semi2) &&
        (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
      return __kmp_par_range < 0;
    }
  }
  // NOTE(review): if semi3 is NULL here (no third ';' and no routine filter),
  // semi3 + 1 is undefined behavior — looks like psource is trusted to always
  // carry three ';' separators; verify upstream.
  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
    if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
      return __kmp_par_range > 0;
    }
    return __kmp_par_range < 0;
  }
  return TRUE;

#endif /* KMP_DEBUG */
}

/*!
@ingroup THREAD_STATES
@param loc Source location information.
@return 1 if this thread is executing inside an active parallel region, zero
if not.
*/
kmp_int32 __kmpc_in_parallel(ident_t *loc) {
  // r_active is set on the root while it has an active parallel region.
  return __kmp_entry_thread()->th.th_root->r.r_active;
}

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_threads number of threads requested for this parallel construct

Set the number of threads to be used by the next fork spawned by this thread.
This call is only required if the parallel construct has a `num_threads` clause.
*/
void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                             kmp_int32 num_threads) {
  KA_TRACE(20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
                global_tid, num_threads));

  // Record the request; it is consumed (and reset) by the next fork.
  __kmp_push_num_threads(loc, global_tid, num_threads);
}

void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
  KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));

  /* the num_threads are automatically popped */
}

void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 proc_bind) {
  KA_TRACE(20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n", global_tid,
                proc_bind));

  // Record the proc_bind clause value for the next parallel region.
  __kmp_push_proc_bind(loc, global_tid, (kmp_proc_bind_t)proc_bind);
}

/*!
@ingroup PARALLEL
@param loc  source location information
@param argc  total number of arguments in the ellipsis
@param microtask  pointer to callback routine consisting of outlined parallel
construct
@param ...  pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) {
  int gtid = __kmp_entry_gtid();

#if (KMP_STATS_ENABLED)
  // If we were in a serial region, then stop the serial timer, record
  // the event, and start parallel region timer
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
  }
  int inParallel = __kmpc_in_parallel(loc);
  if (inParallel) {
    KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
  } else {
    KMP_COUNT_BLOCK(OMP_PARALLEL);
  }
#endif

  // maybe to save thr_state is enough here
  {
    va_list ap;
    va_start(ap, microtask);

#if OMPT_SUPPORT
    // Publish the master's enter frame and return address so OMPT tools can
    // attribute the upcoming parallel region; the frame comes from the
    // serialized team info if we are inside a serialized parallel, otherwise
    // from this thread's implicit task.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      kmp_info_t *master_th = __kmp_threads[gtid];
      kmp_team_t *parent_team = master_th->th.th_team;
      ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
      if (lwt)
        ompt_frame = &(lwt->ompt_task_info.frame);
      else {
        int tid = __kmp_tid_from_gtid(gtid);
        ompt_frame = &(
            parent_team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame);
      }
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(gtid);
    }
#endif

#if INCLUDE_SSC_MARKS
    SSC_MARK_FORKING();
#endif
    __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                    VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
                    VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    // On these ABIs va_list is an array type, so it is passed
                    // by address to survive the call boundary.
                    &ap
#else
                    ap
#endif
                    );
#if INCLUDE_SSC_MARKS
    SSC_MARK_JOINING();
#endif
    __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                    ,
                    fork_context_intel
#endif
                    );

    va_end(ap);
  }

#if KMP_STATS_ENABLED
  // Restore the timer state that was in effect before the region.
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}

/*!
@ingroup PARALLEL
@param loc source location information
@param global_tid global thread number
@param num_teams number of teams requested for the teams construct
@param num_threads number of threads per team requested for the teams construct

Set the number of teams to be used by the teams construct.
This call is only required if the teams construct has a `num_teams` clause
or a `thread_limit` clause (or both).
*/
void __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid,
                           kmp_int32 num_teams, kmp_int32 num_threads) {
  KA_TRACE(20,
           ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
            global_tid, num_teams, num_threads));

  // Record the requested teams geometry for the upcoming teams construct.
  __kmp_push_num_teams(loc, global_tid, num_teams, num_threads);
}

/*!
@ingroup PARALLEL
@param loc  source location information
@param argc  total number of arguments in the ellipsis
@param microtask  pointer to callback routine consisting of outlined teams
construct
@param ...  pointers to shared variables that aren't global

Do the actual fork and call the microtask in the relevant number of threads.
*/
void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask,
                       ...) {
  int gtid = __kmp_entry_gtid();
  kmp_info_t *this_thr = __kmp_threads[gtid];
  va_list ap;
  va_start(ap, microtask);

#if KMP_STATS_ENABLED
  KMP_COUNT_BLOCK(OMP_TEAMS);
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_teams_overhead);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_teams_overhead);
  }
#endif

  // remember teams entry point and nesting level
  this_thr->th.th_teams_microtask = microtask;
  this_thr->th.th_teams_level =
      this_thr->th.th_team->t.t_level; // AC: can be >0 on host

#if OMPT_SUPPORT
  // Publish the enter frame and return address for tools before forking.
  kmp_team_t *parent_team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);
  if (ompt_enabled.enabled) {
    parent_team->t.t_implicit_task_taskdata[tid]
        .ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif

  // check if __kmpc_push_num_teams called, set default number of teams
  // otherwise
  if (this_thr->th.th_teams_size.nteams == 0) {
    __kmp_push_num_teams(loc, gtid, 0, 0);
  }
  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);

  // Fork with __kmp_teams_master as the wrapped task; it will in turn launch
  // the league of teams.
  __kmp_fork_call(loc, gtid, fork_context_intel, argc,
                  VOLATILE_CAST(microtask_t)
                      __kmp_teams_master, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                  &ap
#else
                  ap
#endif
                  );
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  );

  // Pop current CG root off list
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
  this_thr->th.th_cg_roots = tmp->up;
  KA_TRACE(100, ("__kmpc_fork_teams: Thread %p popping node %p and moving up"
                 " to node %p. cg_nthreads was %d\n",
                 this_thr, tmp, this_thr->th.th_cg_roots, tmp->cg_nthreads));
  KMP_DEBUG_ASSERT(tmp->cg_nthreads);
  int i = tmp->cg_nthreads--;
  if (i == 1) { // check if we are the last thread in CG (not always the case)
    __kmp_free(tmp);
  }
  // Restore current task's thread_limit from CG root
  KMP_DEBUG_ASSERT(this_thr->th.th_cg_roots);
  this_thr->th.th_current_task->td_icvs.thread_limit =
      this_thr->th.th_cg_roots->cg_thread_limit;

  // Clear the teams bookkeeping recorded at entry.
  this_thr->th.th_teams_microtask = NULL;
  this_thr->th.th_teams_level = 0;
  // Zero both nteams and nth in one store; assumes th_teams_size fits in
  // 64 bits — TODO confirm against the kmp_teams_size_t layout in kmp.h.
  *(kmp_int64 *)(&this_thr->th.th_teams_size) = 0L;
  va_end(ap);
#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::SERIAL_REGION) {
    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
  } else {
    KMP_POP_PARTITIONED_TIMER();
  }
#endif // KMP_STATS_ENABLED
}

// I don't think this function should ever have been exported.
// The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
// openmp code ever called it, but it's been exported from the RTL for so
// long that I'm afraid to remove the definition.
int __kmpc_invoke_task_func(int gtid) { return __kmp_invoke_task_func(gtid); }

/*!
@ingroup PARALLEL
@param loc  source location information
@param global_tid  global thread number

Enter a serialized parallel construct. This interface is used to handle a
conditional parallel region, like this,
@code
#pragma omp parallel if (condition)
@endcode
when the condition is false.
*/
void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  // The implementation is now in kmp_runtime.cpp so that it can share static
  // functions with kmp_fork_call since the tasks to be done are similar in
  // each case.
#if OMPT_SUPPORT
  OMPT_STORE_RETURN_ADDRESS(global_tid);
#endif
  __kmp_serialized_parallel(loc, global_tid);
}

/*!
@ingroup PARALLEL
@param loc  source location information
@param global_tid  global thread number

Leave a serialized parallel construct.
*/ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { kmp_internal_control_t *top; kmp_info_t *this_thr; kmp_team_t *serial_team; KC_TRACE(10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid)); /* skip all this code for autopar serialized loops since it results in unacceptable overhead */ if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) return; // Not autopar code if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); __kmp_resume_if_soft_paused(); this_thr = __kmp_threads[global_tid]; serial_team = this_thr->th.th_serial_team; kmp_task_team_t *task_team = this_thr->th.th_task_team; // we need to wait for the proxy tasks before finishing the thread if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL)); KMP_MB(); KMP_DEBUG_ASSERT(serial_team); KMP_ASSERT(serial_team->t.t_serialized); KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); KMP_DEBUG_ASSERT(serial_team != this_thr->th.th_root->r.r_root_team); KMP_DEBUG_ASSERT(serial_team->t.t_threads); KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); #if OMPT_SUPPORT if (ompt_enabled.enabled && this_thr->th.ompt_thread_info.state != ompt_state_overhead) { OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = ompt_data_none; if (ompt_enabled.ompt_callback_implicit_task) { ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( ompt_scope_end, NULL, OMPT_CUR_TASK_DATA(this_thr), 1, OMPT_CUR_TASK_INFO(this_thr)->thread_num, ompt_task_implicit); } // reset clear the task id only after unlinking the task ompt_data_t *parent_task_data; __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL); if (ompt_enabled.ompt_callback_parallel_end) { ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( &(serial_team->t.ompt_team_info.parallel_data), parent_task_data, - ompt_parallel_invoker_program, OMPT_LOAD_RETURN_ADDRESS(global_tid)); + ompt_parallel_invoker_program | 
ompt_parallel_team, + OMPT_LOAD_RETURN_ADDRESS(global_tid)); } __ompt_lw_taskteam_unlink(this_thr); this_thr->th.ompt_thread_info.state = ompt_state_overhead; } #endif /* If necessary, pop the internal control stack values and replace the team * values */ top = serial_team->t.t_control_stack_top; if (top && top->serial_nesting_level == serial_team->t.t_serialized) { copy_icvs(&serial_team->t.t_threads[0]->th.th_current_task->td_icvs, top); serial_team->t.t_control_stack_top = top->next; __kmp_free(top); } // if( serial_team -> t.t_serialized > 1 ) serial_team->t.t_level--; /* pop dispatch buffers stack */ KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer); { dispatch_private_info_t *disp_buffer = serial_team->t.t_dispatch->th_disp_buffer; serial_team->t.t_dispatch->th_disp_buffer = serial_team->t.t_dispatch->th_disp_buffer->next; __kmp_free(disp_buffer); } this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore --serial_team->t.t_serialized; if (serial_team->t.t_serialized == 0) { /* return to the parallel section */ #if KMP_ARCH_X86 || KMP_ARCH_X86_64 if (__kmp_inherit_fp_control && serial_team->t.t_fp_control_saved) { __kmp_clear_x87_fpu_status_word(); __kmp_load_x87_fpu_control_word(&serial_team->t.t_x87_fpu_control_word); __kmp_load_mxcsr(&serial_team->t.t_mxcsr); } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ this_thr->th.th_team = serial_team->t.t_parent; this_thr->th.th_info.ds.ds_tid = serial_team->t.t_master_tid; /* restore values cached in the thread */ this_thr->th.th_team_nproc = serial_team->t.t_parent->t.t_nproc; /* JPH */ this_thr->th.th_team_master = serial_team->t.t_parent->t.t_threads[0]; /* JPH */ this_thr->th.th_team_serialized = this_thr->th.th_team->t.t_serialized; /* TODO the below shouldn't need to be adjusted for serialized teams */ this_thr->th.th_dispatch = &this_thr->th.th_team->t.t_dispatch[serial_team->t.t_master_tid]; __kmp_pop_current_task_from_thread(this_thr); 
KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 0); this_thr->th.th_current_task->td_flags.executing = 1; if (__kmp_tasking_mode != tskm_immediate_exec) { // Copy the task team from the new child / old parent team to the thread. this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]; KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d restoring task_team %p / " "team %p\n", global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); } } else { if (__kmp_tasking_mode != tskm_immediate_exec) { KA_TRACE(20, ("__kmpc_end_serialized_parallel: T#%d decreasing nesting " "depth of serial team %p to %d\n", global_tid, serial_team, serial_team->t.t_serialized)); } } if (__kmp_env_consistency_check) __kmp_pop_parallel(global_tid, NULL); #if OMPT_SUPPORT if (ompt_enabled.enabled) this_thr->th.ompt_thread_info.state = ((this_thr->th.th_team_serialized) ? ompt_state_work_serial : ompt_state_work_parallel); #endif } /*! @ingroup SYNCHRONIZATION @param loc source location information. Execute flush. This is implemented as a full memory fence. (Though depending on the memory ordering convention obeyed by the compiler even that may not be necessary). */ void __kmpc_flush(ident_t *loc) { KC_TRACE(10, ("__kmpc_flush: called\n")); /* need explicit __mf() here since use volatile instead in library */ KMP_MB(); /* Flush all pending memory write invalidates. */ #if (KMP_ARCH_X86 || KMP_ARCH_X86_64) #if KMP_MIC // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used. // We shouldn't need it, though, since the ABI rules require that // * If the compiler generates NGO stores it also generates the fence // * If users hand-code NGO stores they should insert the fence // therefore no incomplete unordered stores should be visible. #else // C74404 // This is to address non-temporal store instructions (sfence needed). // The clflush instruction is addressed either (mfence needed). 
// Probably the non-temporal load monvtdqa instruction should also be // addressed. // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2. if (!__kmp_cpuinfo.initialized) { __kmp_query_cpuid(&__kmp_cpuinfo); } if (!__kmp_cpuinfo.sse2) { // CPU cannot execute SSE2 instructions. } else { #if KMP_COMPILER_ICC _mm_mfence(); #elif KMP_COMPILER_MSVC MemoryBarrier(); #else __sync_synchronize(); #endif // KMP_COMPILER_ICC } #endif // KMP_MIC -#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64) +#elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64 || \ + KMP_ARCH_RISCV64) // Nothing to see here move along #elif KMP_ARCH_PPC64 // Nothing needed here (we have a real MB above). #if KMP_OS_CNK // The flushing thread needs to yield here; this prevents a // busy-waiting thread from saturating the pipeline. flush is // often used in loops like this: // while (!flag) { // #pragma omp flush(flag) // } // and adding the yield here is good for at least a 10x speedup // when running >2 threads per core (on the NAS LU benchmark). __kmp_yield(); #endif #else #error Unknown or unsupported architecture #endif #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.ompt_callback_flush) { ompt_callbacks.ompt_callback(ompt_callback_flush)( __ompt_get_thread_data_internal(), OMPT_GET_RETURN_ADDRESS(0)); } #endif } /* -------------------------------------------------------------------------- */ /*! @ingroup SYNCHRONIZATION @param loc source location information @param global_tid thread id. Execute a barrier. */ void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) { KMP_COUNT_BLOCK(OMP_BARRIER); KC_TRACE(10, ("__kmpc_barrier: called T#%d\n", global_tid)); if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); __kmp_resume_if_soft_paused(); if (__kmp_env_consistency_check) { if (loc == 0) { KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user? 
} __kmp_check_barrier(global_tid, ct_barrier, loc); } #if OMPT_SUPPORT ompt_frame_t *ompt_frame; if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); if (ompt_frame->enter_frame.ptr == NULL) ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); OMPT_STORE_RETURN_ADDRESS(global_tid); } #endif __kmp_threads[global_tid]->th.th_ident = loc; // TODO: explicit barrier_wait_id: // this function is called when 'barrier' directive is present or // implicit barrier at the end of a worksharing construct. // 1) better to add a per-thread barrier counter to a thread data structure // 2) set to 0 when a new team is created // 4) no sync is required __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL); #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled) { ompt_frame->enter_frame = ompt_data_none; } #endif } /* The BARRIER for a MASTER section is always explicit */ /*! @ingroup WORK_SHARING @param loc source location information. @param global_tid global thread number . @return 1 if this thread should execute the master block, 0 otherwise. 
*/
kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) {
  int status = 0;

  KC_TRACE(10, ("__kmpc_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  // Only the team's master thread (gtid with tid 0) executes the block.
  if (KMP_MASTER_GTID(global_tid)) {
    KMP_COUNT_BLOCK(OMP_MASTER);
    KMP_PUSH_PARTITIONED_TIMER(OMP_master);
    status = 1;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  // Fire the scope-begin master callback only on the thread that will
  // actually execute the region.
  if (status) {
    if (ompt_enabled.ompt_callback_master) {
      kmp_info_t *this_thr = __kmp_threads[global_tid];
      kmp_team_t *team = this_thr->th.th_team;

      int tid = __kmp_tid_from_gtid(global_tid);
      ompt_callbacks.ompt_callback(ompt_callback_master)(
          ompt_scope_begin, &(team->t.ompt_team_info.parallel_data),
          &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  if (__kmp_env_consistency_check) {
    // The executing thread pushes the construct on its sync stack; the
    // non-executing threads only validate nesting.
#if KMP_USE_DYNAMIC_LOCK
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL, 0);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL, 0);
#else
    if (status)
      __kmp_push_sync(global_tid, ct_master, loc, NULL);
    else
      __kmp_check_sync(global_tid, ct_master, loc, NULL);
#endif
  }

  return status;
}

/*!
@ingroup WORK_SHARING
@param loc  source location information.
@param global_tid  global thread number.

Mark the end of a master region. This should only be called by the thread
that executes the master region.
*/ void __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) { KC_TRACE(10, ("__kmpc_end_master: called T#%d\n", global_tid)); KMP_DEBUG_ASSERT(KMP_MASTER_GTID(global_tid)); KMP_POP_PARTITIONED_TIMER(); #if OMPT_SUPPORT && OMPT_OPTIONAL kmp_info_t *this_thr = __kmp_threads[global_tid]; kmp_team_t *team = this_thr->th.th_team; if (ompt_enabled.ompt_callback_master) { int tid = __kmp_tid_from_gtid(global_tid); ompt_callbacks.ompt_callback(ompt_callback_master)( ompt_scope_end, &(team->t.ompt_team_info.parallel_data), &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), OMPT_GET_RETURN_ADDRESS(0)); } #endif if (__kmp_env_consistency_check) { if (global_tid < 0) KMP_WARNING(ThreadIdentInvalid); if (KMP_MASTER_GTID(global_tid)) __kmp_pop_sync(global_tid, ct_master, loc); } } /*! @ingroup WORK_SHARING @param loc source location information. @param gtid global thread number. Start execution of an ordered construct. */ void __kmpc_ordered(ident_t *loc, kmp_int32 gtid) { int cid = 0; kmp_info_t *th; KMP_DEBUG_ASSERT(__kmp_init_serial); KC_TRACE(10, ("__kmpc_ordered: called T#%d\n", gtid)); if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); __kmp_resume_if_soft_paused(); #if USE_ITT_BUILD __kmp_itt_ordered_prep(gtid); // TODO: ordered_wait_id #endif /* USE_ITT_BUILD */ th = __kmp_threads[gtid]; #if OMPT_SUPPORT && OMPT_OPTIONAL kmp_team_t *team; ompt_wait_id_t lck; void *codeptr_ra; if (ompt_enabled.enabled) { OMPT_STORE_RETURN_ADDRESS(gtid); team = __kmp_team_from_gtid(gtid); lck = (ompt_wait_id_t)(uintptr_t)&team->t.t_ordered.dt.t_value; /* OMPT state update */ th->th.ompt_thread_info.wait_id = lck; th->th.ompt_thread_info.state = ompt_state_wait_ordered; /* OMPT event callback */ codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid); if (ompt_enabled.ompt_callback_mutex_acquire) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( ompt_mutex_ordered, omp_lock_hint_none, kmp_mutex_impl_spin, lck, codeptr_ra); } } #endif if 
(th->th.th_dispatch->th_deo_fcn != 0) (*th->th.th_dispatch->th_deo_fcn)(>id, &cid, loc); else __kmp_parallel_deo(>id, &cid, loc); #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled) { /* OMPT state update */ th->th.ompt_thread_info.state = ompt_state_work_parallel; th->th.ompt_thread_info.wait_id = 0; /* OMPT event callback */ if (ompt_enabled.ompt_callback_mutex_acquired) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra); } } #endif #if USE_ITT_BUILD __kmp_itt_ordered_start(gtid); #endif /* USE_ITT_BUILD */ } /*! @ingroup WORK_SHARING @param loc source location information. @param gtid global thread number. End execution of an ordered construct. */ void __kmpc_end_ordered(ident_t *loc, kmp_int32 gtid) { int cid = 0; kmp_info_t *th; KC_TRACE(10, ("__kmpc_end_ordered: called T#%d\n", gtid)); #if USE_ITT_BUILD __kmp_itt_ordered_end(gtid); // TODO: ordered_wait_id #endif /* USE_ITT_BUILD */ th = __kmp_threads[gtid]; if (th->th.th_dispatch->th_dxo_fcn != 0) (*th->th.th_dispatch->th_dxo_fcn)(>id, &cid, loc); else __kmp_parallel_dxo(>id, &cid, loc); #if OMPT_SUPPORT && OMPT_OPTIONAL OMPT_STORE_RETURN_ADDRESS(gtid); if (ompt_enabled.ompt_callback_mutex_released) { ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( ompt_mutex_ordered, (ompt_wait_id_t)(uintptr_t)&__kmp_team_from_gtid(gtid) ->t.t_ordered.dt.t_value, OMPT_LOAD_RETURN_ADDRESS(gtid)); } #endif } #if KMP_USE_DYNAMIC_LOCK static __forceinline void __kmp_init_indirect_csptr(kmp_critical_name *crit, ident_t const *loc, kmp_int32 gtid, kmp_indirect_locktag_t tag) { // Pointer to the allocated indirect lock is written to crit, while indexing // is ignored. 
void *idx; kmp_indirect_lock_t **lck; lck = (kmp_indirect_lock_t **)crit; kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag); KMP_I_LOCK_FUNC(ilk, init)(ilk->lock); KMP_SET_I_LOCK_LOCATION(ilk, loc); KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section); KA_TRACE(20, ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag)); #if USE_ITT_BUILD __kmp_itt_critical_creating(ilk->lock, loc); #endif int status = KMP_COMPARE_AND_STORE_PTR(lck, nullptr, ilk); if (status == 0) { #if USE_ITT_BUILD __kmp_itt_critical_destroyed(ilk->lock); #endif // We don't really need to destroy the unclaimed lock here since it will be // cleaned up at program exit. // KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx); } KMP_DEBUG_ASSERT(*lck != NULL); } // Fast-path acquire tas lock #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) \ { \ kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ if (KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)) { \ kmp_uint32 spins; \ KMP_FSYNC_PREPARE(l); \ KMP_INIT_YIELD(spins); \ kmp_backoff_t backoff = __kmp_spin_backoff_params; \ do { \ if (TCR_4(__kmp_nth) > \ (__kmp_avail_proc ? 
__kmp_avail_proc : __kmp_xproc)) { \ KMP_YIELD(TRUE); \ } else { \ KMP_YIELD_SPIN(spins); \ } \ __kmp_spin_backoff(&backoff); \ } while ( \ KMP_ATOMIC_LD_RLX(&l->lk.poll) != tas_free || \ !__kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy)); \ } \ KMP_FSYNC_ACQUIRED(l); \ } // Fast-path test tas lock #define KMP_TEST_TAS_LOCK(lock, gtid, rc) \ { \ kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \ kmp_int32 tas_free = KMP_LOCK_FREE(tas); \ kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); \ rc = KMP_ATOMIC_LD_RLX(&l->lk.poll) == tas_free && \ __kmp_atomic_compare_store_acq(&l->lk.poll, tas_free, tas_busy); \ } // Fast-path release tas lock #define KMP_RELEASE_TAS_LOCK(lock, gtid) \ { KMP_ATOMIC_ST_REL(&((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); } #if KMP_USE_FUTEX #include #include #ifndef FUTEX_WAIT #define FUTEX_WAIT 0 #endif #ifndef FUTEX_WAKE #define FUTEX_WAKE 1 #endif // Fast-path acquire futex lock #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) \ { \ kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ kmp_int32 gtid_code = (gtid + 1) << 1; \ KMP_MB(); \ KMP_FSYNC_PREPARE(ftx); \ kmp_int32 poll_val; \ while ((poll_val = KMP_COMPARE_AND_STORE_RET32( \ &(ftx->lk.poll), KMP_LOCK_FREE(futex), \ KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \ kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \ if (!cond) { \ if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, \ poll_val | \ KMP_LOCK_BUSY(1, futex))) { \ continue; \ } \ poll_val |= KMP_LOCK_BUSY(1, futex); \ } \ kmp_int32 rc; \ if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, \ NULL, NULL, 0)) != 0) { \ continue; \ } \ gtid_code |= 1; \ } \ KMP_FSYNC_ACQUIRED(ftx); \ } // Fast-path test futex lock #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) \ { \ kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \ KMP_LOCK_BUSY(gtid + 1 << 1, futex))) { \ KMP_FSYNC_ACQUIRED(ftx); \ rc = TRUE; \ } else { 
\ rc = FALSE; \ } \ } // Fast-path release futex lock #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) \ { \ kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \ KMP_MB(); \ KMP_FSYNC_RELEASING(ftx); \ kmp_int32 poll_val = \ KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \ if (KMP_LOCK_STRIP(poll_val) & 1) { \ syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, \ KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \ } \ KMP_MB(); \ KMP_YIELD_OVERSUB(); \ } #endif // KMP_USE_FUTEX #else // KMP_USE_DYNAMIC_LOCK static kmp_user_lock_p __kmp_get_critical_section_ptr(kmp_critical_name *crit, ident_t const *loc, kmp_int32 gtid) { kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit; // Because of the double-check, the following load doesn't need to be volatile kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); if (lck == NULL) { void *idx; // Allocate & initialize the lock. // Remember alloc'ed locks in table in order to free them in __kmp_cleanup() lck = __kmp_user_lock_allocate(&idx, gtid, kmp_lf_critical_section); __kmp_init_user_lock_with_checks(lck); __kmp_set_user_lock_location(lck, loc); #if USE_ITT_BUILD __kmp_itt_critical_creating(lck); // __kmp_itt_critical_creating() should be called *before* the first usage // of underlying lock. It is the only place where we can guarantee it. There // are chances the lock will destroyed with no usage, but it is not a // problem, because this is not real event seen by user but rather setting // name for object (lock). See more details in kmp_itt.h. #endif /* USE_ITT_BUILD */ // Use a cmpxchg instruction to slam the start of the critical section with // the lock pointer. If another thread beat us to it, deallocate the lock, // and use the lock that the other thread allocated. int status = KMP_COMPARE_AND_STORE_PTR(lck_pp, 0, lck); if (status == 0) { // Deallocate the lock and reload the value. 
#if USE_ITT_BUILD __kmp_itt_critical_destroyed(lck); // Let ITT know the lock is destroyed and the same memory location may be reused // for another purpose. #endif /* USE_ITT_BUILD */ __kmp_destroy_user_lock_with_checks(lck); __kmp_user_lock_free(&idx, gtid, lck); lck = (kmp_user_lock_p)TCR_PTR(*lck_pp); KMP_DEBUG_ASSERT(lck != NULL); } } return lck; } #endif // KMP_USE_DYNAMIC_LOCK /*! @ingroup WORK_SHARING @param loc source location information. @param global_tid global thread number . @param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or some other suitably unique value. Enter code protected by a `critical` construct. This function blocks until the executing thread can enter the critical section. */ void __kmpc_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit) { #if KMP_USE_DYNAMIC_LOCK #if OMPT_SUPPORT && OMPT_OPTIONAL OMPT_STORE_RETURN_ADDRESS(global_tid); #endif // OMPT_SUPPORT __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none); #else KMP_COUNT_BLOCK(OMP_CRITICAL); #if OMPT_SUPPORT && OMPT_OPTIONAL ompt_state_t prev_state = ompt_state_undefined; ompt_thread_info_t ti; #endif kmp_user_lock_p lck; KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid)); // TODO: add THR_OVHD_STATE KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait); KMP_CHECK_USER_LOCK_INIT(); if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) { lck = (kmp_user_lock_p)crit; } #if KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) { lck = (kmp_user_lock_p)crit; } #endif else { // ticket, queuing or drdpa lck = __kmp_get_critical_section_ptr(crit, loc, global_tid); } if (__kmp_env_consistency_check) __kmp_push_sync(global_tid, ct_critical, loc, lck); // since the critical directive binds to all threads, not just the current // team we have to check this even if we are in a serialized team. 
// also, even if we are the uber thread, we still have to conduct the lock, // as we have to contend with sibling threads. #if USE_ITT_BUILD __kmp_itt_critical_acquiring(lck); #endif /* USE_ITT_BUILD */ #if OMPT_SUPPORT && OMPT_OPTIONAL OMPT_STORE_RETURN_ADDRESS(gtid); void *codeptr_ra = NULL; if (ompt_enabled.enabled) { ti = __kmp_threads[global_tid]->th.ompt_thread_info; /* OMPT state update */ prev_state = ti.state; ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck; ti.state = ompt_state_wait_critical; /* OMPT event callback */ codeptr_ra = OMPT_LOAD_RETURN_ADDRESS(gtid); if (ompt_enabled.ompt_callback_mutex_acquire) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( ompt_mutex_critical, omp_lock_hint_none, __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra); } } #endif // Value of 'crit' should be good for using as a critical_id of the critical // section directive. __kmp_acquire_user_lock_with_checks(lck, global_tid); #if USE_ITT_BUILD __kmp_itt_critical_acquired(lck); #endif /* USE_ITT_BUILD */ #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled) { /* OMPT state update */ ti.state = prev_state; ti.wait_id = 0; /* OMPT event callback */ if (ompt_enabled.ompt_callback_mutex_acquired) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr_ra); } } #endif KMP_POP_PARTITIONED_TIMER(); KMP_PUSH_PARTITIONED_TIMER(OMP_critical); KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid)); #endif // KMP_USE_DYNAMIC_LOCK } #if KMP_USE_DYNAMIC_LOCK // Converts the given hint to an internal lock implementation static __forceinline kmp_dyna_lockseq_t __kmp_map_hint_to_lock(uintptr_t hint) { #if KMP_USE_TSX #define KMP_TSX_LOCK(seq) lockseq_##seq #else #define KMP_TSX_LOCK(seq) __kmp_user_lock_seq #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 #define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm) #else #define KMP_CPUINFO_RTM 0 #endif // Hints that do not require further logic if (hint & 
      kmp_lock_hint_hle)
    return KMP_TSX_LOCK(hle);
  if (hint & kmp_lock_hint_rtm)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm) : __kmp_user_lock_seq;
  if (hint & kmp_lock_hint_adaptive)
    return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive) : __kmp_user_lock_seq;

  // Rule out conflicting hints first by returning the default lock
  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
    return __kmp_user_lock_seq;
  if ((hint & omp_lock_hint_speculative) &&
      (hint & omp_lock_hint_nonspeculative))
    return __kmp_user_lock_seq;

  // Do not even consider speculation when it appears to be contended
  if (hint & omp_lock_hint_contended)
    return lockseq_queuing;

  // Uncontended lock without speculation
  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
    return lockseq_tas;

  // HLE lock for speculation
  if (hint & omp_lock_hint_speculative)
    return KMP_TSX_LOCK(hle);

  return __kmp_user_lock_seq;
}

#if OMPT_SUPPORT && OMPT_OPTIONAL
#if KMP_USE_DYNAMIC_LOCK
// Map a dynamically-dispatched user lock to the mutex-implementation enum
// reported to OMPT tools. Direct locks are classified from the tag embedded
// in the lock word; indirect locks use the lock-table entry, either passed
// in by the caller or looked up here.
static kmp_mutex_impl_t
__ompt_get_mutex_impl_type(void *user_lock, kmp_indirect_lock_t *ilock = 0) {
  if (user_lock) {
    switch (KMP_EXTRACT_D_TAG(user_lock)) {
    case 0:
      // Tag 0 means indirect: fall through to the table lookup below.
      break;
#if KMP_USE_FUTEX
    case locktag_futex:
      return kmp_mutex_impl_queuing;
#endif
    case locktag_tas:
      return kmp_mutex_impl_spin;
#if KMP_USE_TSX
    case locktag_hle:
      return kmp_mutex_impl_speculative;
#endif
    default:
      return kmp_mutex_impl_none;
    }
    ilock = KMP_LOOKUP_I_LOCK(user_lock);
  }
  KMP_ASSERT(ilock);
  switch (ilock->type) {
#if KMP_USE_TSX
  case locktag_adaptive:
  case locktag_rtm:
    return kmp_mutex_impl_speculative;
#endif
  case locktag_nested_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case locktag_nested_futex:
#endif
  case locktag_ticket:
  case locktag_queuing:
  case locktag_drdpa:
  case locktag_nested_ticket:
  case locktag_nested_queuing:
  case locktag_nested_drdpa:
    return kmp_mutex_impl_queuing;
  default:
    return kmp_mutex_impl_none;
  }
}
#else
// For locks without dynamic binding
static kmp_mutex_impl_t __ompt_get_mutex_impl_type() {
  switch
(__kmp_user_lock_kind) {
  case lk_tas:
    return kmp_mutex_impl_spin;
#if KMP_USE_FUTEX
  case lk_futex:
#endif
  case lk_ticket:
  case lk_queuing:
  case lk_drdpa:
    return kmp_mutex_impl_queuing;
#if KMP_USE_TSX
  case lk_hle:
  case lk_rtm:
  case lk_adaptive:
    return kmp_mutex_impl_speculative;
#endif
  default:
    return kmp_mutex_impl_none;
  }
}
#endif // KMP_USE_DYNAMIC_LOCK
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.
@param hint the lock hint.

Enter code protected by a `critical` construct with a hint. The hint value is
used to suggest a lock implementation. This function blocks until the executing
thread can enter the critical section unless the hint suggests use of
speculative execution and the hardware supports it.
*/
void __kmpc_critical_with_hint(ident_t *loc, kmp_int32 global_tid,
                               kmp_critical_name *crit, uint32_t hint) {
  KMP_COUNT_BLOCK(OMP_CRITICAL);
  kmp_user_lock_p lck;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_state_t prev_state = ompt_state_undefined;
  ompt_thread_info_t ti;
  // This is the case, if called from __kmpc_critical:
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (!codeptr)
    codeptr = OMPT_GET_RETURN_ADDRESS(0);
#endif

  KC_TRACE(10, ("__kmpc_critical: called T#%d\n", global_tid));

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized; first thread through publishes the lock via
  // compare-and-store (direct locks) or the indirect-lock initializer.
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
  if (*lk == 0) {
    kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
    if (KMP_IS_D_LOCK(lckseq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(lckseq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    // Direct lock: the lock state lives in the critical name word itself.
    lck = (kmp_user_lock_p)lk;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(crit), (ompt_wait_id_t)(uintptr_t)lck,
            codeptr);
      }
    }
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
    }
  } else {
    // Indirect lock: dereference the pointer stored in the name word.
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck,
                      __kmp_map_hint_to_lock(hint));
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_acquiring(lck);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ti = __kmp_threads[global_tid]->th.ompt_thread_info;
      /* OMPT state update */
      prev_state = ti.state;
      ti.wait_id = (ompt_wait_id_t)(uintptr_t)lck;
      ti.state = ompt_state_wait_critical;

      /* OMPT event callback */
      if (ompt_enabled.ompt_callback_mutex_acquire) {
        ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)(
            ompt_mutex_critical, (unsigned int)hint,
            __ompt_get_mutex_impl_type(0, ilk), (ompt_wait_id_t)(uintptr_t)lck,
            codeptr);
      }
    }
#endif
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }
  KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD
  __kmp_itt_critical_acquired(lck);
#endif /* USE_ITT_BUILD */
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    /* OMPT state update */
    ti.state = prev_state;
    ti.wait_id = 0;

    /* OMPT event callback */
    if (ompt_enabled.ompt_callback_mutex_acquired) {
      ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)(
          ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck, codeptr);
    }
  }
#endif
  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);

  KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
} // __kmpc_critical_with_hint

#endif // KMP_USE_DYNAMIC_LOCK

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param global_tid global thread number.
@param crit identity of the critical section. This could be a pointer to a lock
associated with the critical section, or some other suitably unique value.

Leave a critical section, releasing any lock that was held during its execution.
*/
void __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid,
                         kmp_critical_name *crit) {
  kmp_user_lock_p lck;

  KC_TRACE(10, ("__kmpc_end_critical: called T#%d\n", global_tid));

#if KMP_USE_DYNAMIC_LOCK
  // Direct lock: the lock state lives in the critical name word itself.
  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    lck = (kmp_user_lock_p)crit;
    KMP_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
#if KMP_USE_INLINED_TAS
    if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
      KMP_RELEASE_TAS_LOCK(lck, global_tid);
    } else
#elif KMP_USE_INLINED_FUTEX
    if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
      KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
    } else
#endif
    {
      KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
    }
  } else {
    // Indirect lock: the name word stores a pointer to the lock-table entry.
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    KMP_ASSERT(ilk != NULL);
    lck = ilk->lock;
    if (__kmp_env_consistency_check) {
      __kmp_pop_sync(global_tid, ct_critical, loc);
    }
#if USE_ITT_BUILD
    __kmp_itt_critical_releasing(lck);
#endif
    KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // Same small-lock-in-place vs. allocated-lock split as in __kmpc_critical.
  if ((__kmp_user_lock_kind == lk_tas) &&
      (sizeof(lck->tas.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#if KMP_USE_FUTEX
  else if ((__kmp_user_lock_kind == lk_futex) &&
           (sizeof(lck->futex.lk.poll) <= OMP_CRITICAL_SIZE)) {
    lck = (kmp_user_lock_p)crit;
  }
#endif
  else { // ticket, queuing or drdpa
    lck = (kmp_user_lock_p)TCR_PTR(*((kmp_user_lock_p *)crit));
  }

  KMP_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

#if USE_ITT_BUILD
  __kmp_itt_critical_releasing(lck);
#endif /* USE_ITT_BUILD */
  // Value of 'crit' should be good for using as a critical_id of the critical
  // section directive.
  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK

#if OMPT_SUPPORT && OMPT_OPTIONAL
  /* OMPT release event triggers after lock is released; place here to trigger
   * for all #if branches */
  OMPT_STORE_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.ompt_callback_mutex_released) {
    ompt_callbacks.ompt_callback(ompt_callback_mutex_released)(
        ompt_mutex_critical, (ompt_wait_id_t)(uintptr_t)lck,
        OMPT_LOAD_RETURN_ADDRESS(0));
  }
#endif

  KMP_POP_PARTITIONED_TIMER();
  KA_TRACE(15, ("__kmpc_end_critical: done T#%d\n", global_tid));
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master. The barrier is executed inside
this function.
*/
kmp_int32 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  int status;

  KC_TRACE(10, ("__kmpc_barrier_master: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check)
    __kmp_check_barrier(global_tid, ct_barrier, loc);

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  status = __kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  // __kmp_barrier's return value selects the single thread that executes the
  // master block; map it to the 1 = execute / 0 = skip contract of this API.
  return (status != 0) ? 0 : 1;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.

Complete the execution of a combined barrier and master. This function should
only be called at the completion of the master code. Other threads will still be
waiting at the barrier and this call releases them.
*/
void __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid) {
  KC_TRACE(10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid));
  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid thread id.
@return one if the thread should execute the master block, zero otherwise

Start execution of a combined barrier and master(nowait) construct.
The barrier is executed inside this function.
There is no equivalent "end" function, since the
*/
kmp_int32 __kmpc_barrier_master_nowait(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 ret;

  KC_TRACE(10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid); // ??? What does it mean for the user?
    }
    __kmp_check_barrier(global_tid, ct_barrier, loc);
  }

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(global_tid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[global_tid]->th.th_ident = loc;
#endif
  __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif

  // After the barrier, elect the master thread via the regular master entry.
  ret = __kmpc_master(loc, global_tid);

  if (__kmp_env_consistency_check) {
    /* there's no __kmpc_end_master called; so the (stats) */
    /* actions of __kmpc_end_master are done here */
    if (global_tid < 0) {
      KMP_WARNING(ThreadIdentInvalid);
    }
    if (ret) {
      /* only one thread should do the pop since only */
      /* one did the push (see __kmpc_master()) */
      __kmp_pop_sync(global_tid, ct_master, loc);
    }
  }

  return (ret);
}

/* The BARRIER for a SINGLE process section is always explicit */
/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number
@return One if this thread should execute the single construct, zero otherwise.

Test whether to execute a single construct.
There are no implicit barriers in the two "single" calls, rather the compiler
should introduce an explicit barrier if it is required.
*/
kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) {
  kmp_int32 rc = __kmp_enter_single(global_tid, loc, TRUE);

  if (rc) {
    // We are going to execute the single statement, so we should count it.
    KMP_COUNT_BLOCK(OMP_SINGLE);
    KMP_PUSH_PARTITIONED_TIMER(OMP_single);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.enabled) {
    if (rc) {
      // Executor: scope_begin here; scope_end is reported by
      // __kmpc_end_single.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_executor, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    } else {
      // Non-executor: the "other" region begins and ends right here.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_end,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    }
  }
#endif

  return rc;
}

/*!
@ingroup WORK_SHARING
@param loc source location information
@param global_tid global thread number

Mark the end of a single construct.  This function should
only be called by the thread that executed the block of code protected
by the `single` construct.
*/
void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) {
  __kmp_exit_single(global_tid);
  KMP_POP_PARTITIONED_TIMER();

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[global_tid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(global_tid);

  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_single_executor, ompt_scope_end,
        &(team->t.ompt_team_info.parallel_data),
        &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data), 1,
        OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
}

/*!
@ingroup WORK_SHARING
@param loc Source location
@param global_tid Global thread id

Mark the end of a statically scheduled loop.
*/
void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
  KMP_POP_PARTITIONED_TIMER();
  KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_work_t ompt_work_type = ompt_work_loop;
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
    // Determine workshare type from the ident flags set by the compiler.
    if (loc != NULL) {
      if ((loc->flags & KMP_IDENT_WORK_LOOP) != 0) {
        ompt_work_type = ompt_work_loop;
      } else if ((loc->flags & KMP_IDENT_WORK_SECTIONS) != 0) {
        ompt_work_type = ompt_work_sections;
      } else if ((loc->flags & KMP_IDENT_WORK_DISTRIBUTE) != 0) {
        ompt_work_type = ompt_work_distribute;
      } else {
        // use default set above.
        // a warning about this case is provided in __kmpc_for_static_init
      }
      KMP_DEBUG_ASSERT(ompt_work_type);
    }
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_type, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(global_tid, ct_pdo, loc);
}

// User routines which take C-style arguments (call by value)
// different from the Fortran equivalent routines

void ompc_set_num_threads(int arg) {
  // !!!!! TODO: check the per-task binding
  __kmp_set_num_threads(arg, __kmp_entry_gtid());
}

void ompc_set_dynamic(int flag) {
  kmp_info_t *thread;

  /* For the thread-private implementation of the internal controls */
  thread = __kmp_entry_thread();

  __kmp_save_internal_controls(thread);

  set__dynamic(thread, flag ? TRUE : FALSE);
}

void ompc_set_nested(int flag) {
  kmp_info_t *thread;

  /* For the thread-private internal controls implementation */
  thread = __kmp_entry_thread();

  __kmp_save_internal_controls(thread);

  // Nesting on/off is expressed through the max-active-levels control.
  set__max_active_levels(thread, flag ? __kmp_dflt_max_active_levels : 1);
}

void ompc_set_max_active_levels(int max_active_levels) {
  /* TO DO */
  /* we want per-task implementation of this internal control */

  /* For the per-thread internal controls implementation */
  __kmp_set_max_active_levels(__kmp_entry_gtid(), max_active_levels);
}

void ompc_set_schedule(omp_sched_t kind, int modifier) {
  // !!!!! TODO: check the per-task binding
  __kmp_set_schedule(__kmp_entry_gtid(), (kmp_sched_t)kind, modifier);
}

int ompc_get_ancestor_thread_num(int level) {
  return __kmp_get_ancestor_thread_num(__kmp_entry_gtid(), level);
}

int ompc_get_team_size(int level) {
  return __kmp_get_team_size(__kmp_entry_gtid(), level);
}

/* OpenMP 5.0 Affinity Format API */

void ompc_set_affinity_format(char const *format) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_strncpy_truncate(__kmp_affinity_format, KMP_AFFINITY_FORMAT_SIZE,
                         format, KMP_STRLEN(format) + 1);
}

// Copies the stored affinity format into 'buffer' (if non-NULL) and returns
// the length of the stored format string.
size_t ompc_get_affinity_format(char *buffer, size_t size) {
  size_t format_size;
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  format_size = KMP_STRLEN(__kmp_affinity_format);
  if (buffer && size) {
    __kmp_strncpy_truncate(buffer, size, __kmp_affinity_format,
                           format_size + 1);
  }
  return format_size;
}

void ompc_display_affinity(char const *format) {
  int gtid;
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  gtid = __kmp_get_gtid();
  __kmp_aux_display_affinity(gtid, format);
}

size_t ompc_capture_affinity(char
*buffer, size_t buf_size,
                             char const *format) {
  int gtid;
  size_t num_required;
  kmp_str_buf_t capture_buf;
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  gtid = __kmp_get_gtid();
  __kmp_str_buf_init(&capture_buf);
  num_required = __kmp_aux_capture_affinity(gtid, format, &capture_buf);
  // Copy out only if the caller provided a destination; the required size is
  // returned either way.
  if (buffer && buf_size) {
    __kmp_strncpy_truncate(buffer, buf_size, capture_buf.str,
                           capture_buf.used + 1);
  }
  __kmp_str_buf_free(&capture_buf);
  return num_required;
}

void kmpc_set_stacksize(int arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}

void kmpc_set_stacksize_s(size_t arg) {
  // __kmp_aux_set_stacksize initializes the library if needed
  __kmp_aux_set_stacksize(arg);
}

void kmpc_set_blocktime(int arg) {
  int gtid, tid;
  kmp_info_t *thread;

  gtid = __kmp_entry_gtid();
  tid = __kmp_tid_from_gtid(gtid);
  thread = __kmp_thread_from_gtid(gtid);

  __kmp_aux_set_blocktime(arg, thread, tid);
}

void kmpc_set_library(int arg) {
  // __kmp_user_set_library initializes the library if needed
  __kmp_user_set_library((enum library_type)arg);
}

void kmpc_set_defaults(char const *str) {
  // __kmp_aux_set_defaults initializes the library if needed
  __kmp_aux_set_defaults(str, KMP_STRLEN(str));
}

void kmpc_set_disp_num_buffers(int arg) {
  // ignore after initialization because some teams have already
  // allocated dispatch buffers
  if (__kmp_init_serial == 0 && arg > 0)
    __kmp_dispatch_num_buffers = arg;
}

int kmpc_set_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_set_affinity_mask_proc(proc, mask);
#endif
}

int kmpc_unset_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_unset_affinity_mask_proc(proc, mask);
#endif
}

int kmpc_get_affinity_mask_proc(int proc, void **mask) {
#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
  return -1;
#else
  if (!TCR_4(__kmp_init_middle)) {
    __kmp_middle_initialize();
  }
  return __kmp_aux_get_affinity_mask_proc(proc, mask);
#endif
}

/* -------------------------------------------------------------------------- */

/*!
@ingroup THREADPRIVATE
@param loc source location information
@param gtid global thread number
@param cpy_size size of the cpy_data buffer
@param cpy_data pointer to data to be copied
@param cpy_func helper function to call for copying data
@param didit flag variable: 1=single thread; 0=not single thread

__kmpc_copyprivate implements the interface for the private data broadcast
needed for the copyprivate clause associated with a single region in an
OpenMP* program (both C and Fortran).
All threads participating in the parallel region call this routine.
One of the threads (called the single thread) should have the didit variable
set to 1 and all other threads should have that variable set to 0.
All threads pass a pointer to a data buffer (cpy_data) that they have built.

The OpenMP specification forbids the use of nowait on the single region when a
copyprivate clause is present. However, @ref __kmpc_copyprivate implements a
barrier internally to avoid race conditions, so the code generation for the
single region should avoid generating a barrier after the call to @ref
__kmpc_copyprivate.

The gtid parameter is the global thread id for the current thread.
The loc parameter is a pointer to source location information.

Internal implementation: The single thread will first copy its descriptor
address (cpy_data) to a team-private location, then the other threads will each
call the function pointed to by the parameter cpy_func, which carries out the
copy by copying the data using the cpy_data buffer.

The cpy_func routine used for the copy and the contents of the data area defined
by cpy_data and cpy_size may be built in any fashion that will allow the copy
to be done. For instance, the cpy_data buffer can hold the actual data to be
copied or it may hold a list of pointers to the data. The cpy_func routine must
interpret the cpy_data buffer appropriately.

The interface to cpy_func is as follows:
@code
void cpy_func( void *destination, void *source )
@endcode
where void *destination is the cpy_data pointer for the thread being copied to
and void *source is the cpy_data pointer for the thread being copied from.
*/
void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size,
                        void *cpy_data, void (*cpy_func)(void *, void *),
                        kmp_int32 didit) {
  void **data_ptr;

  KC_TRACE(10, ("__kmpc_copyprivate: called T#%d\n", gtid));

  KMP_MB();

  // Team-private slot used to publish the single thread's cpy_data pointer.
  data_ptr = &__kmp_team_from_gtid(gtid)->t.t_copypriv_data;

  if (__kmp_env_consistency_check) {
    if (loc == 0) {
      KMP_WARNING(ConstructIdentInvalid);
    }
  }

  // ToDo: Optimize the following two barriers into some kind of split barrier

  if (didit)
    *data_ptr = cpy_data;

#if OMPT_SUPPORT
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    if (ompt_frame->enter_frame.ptr == NULL)
      ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
/* This barrier is not a barrier region boundary */
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc;
#endif
  // First barrier: ensures the single thread's pointer is visible to all
  // before anyone starts copying from it.
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  if (!didit)
    (*cpy_func)(cpy_data, *data_ptr);

  // Consider next barrier a user-visible barrier for barrier region boundaries
  // Nesting checks are already handled by the single construct checks

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
#if USE_ITT_NOTIFY
  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g.
// tasks can overwrite the location) #endif __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled) { ompt_frame->enter_frame = ompt_data_none; } #endif } /* -------------------------------------------------------------------------- */ #define INIT_LOCK __kmp_init_user_lock_with_checks #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks #define ACQUIRE_NESTED_LOCK_TIMED \ __kmp_acquire_nested_user_lock_with_checks_timed #define RELEASE_LOCK __kmp_release_user_lock_with_checks #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks #define TEST_LOCK __kmp_test_user_lock_with_checks #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks // TODO: Make check abort messages use location info & pass it into // with_checks routines #if KMP_USE_DYNAMIC_LOCK // internal lock initializer static __forceinline void __kmp_init_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq) { if (KMP_IS_D_LOCK(seq)) { KMP_INIT_D_LOCK(lock, seq); #if USE_ITT_BUILD __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL); #endif } else { KMP_INIT_I_LOCK(lock, seq); #if USE_ITT_BUILD kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); __kmp_itt_lock_creating(ilk->lock, loc); #endif } } // internal nest lock initializer static __forceinline void __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq) { #if KMP_USE_TSX // Don't have nested lock implementation for speculative locks if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive) seq = __kmp_user_lock_seq; #endif switch (seq) { case lockseq_tas: seq = 
lockseq_nested_tas; break; #if KMP_USE_FUTEX case lockseq_futex: seq = lockseq_nested_futex; break; #endif case lockseq_ticket: seq = lockseq_nested_ticket; break; case lockseq_queuing: seq = lockseq_nested_queuing; break; case lockseq_drdpa: seq = lockseq_nested_drdpa; break; default: seq = lockseq_nested_queuing; } KMP_INIT_I_LOCK(lock, seq); #if USE_ITT_BUILD kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock); __kmp_itt_lock_creating(ilk->lock, loc); #endif } /* initialize the lock with a hint */ void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint) { KMP_DEBUG_ASSERT(__kmp_init_serial); if (__kmp_env_consistency_check && user_lock == NULL) { KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint"); } __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_lock_init) { ompt_callbacks.ompt_callback(ompt_callback_lock_init)( ompt_mutex_lock, (omp_lock_hint_t)hint, __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif } /* initialize the lock with a hint */ void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint) { KMP_DEBUG_ASSERT(__kmp_init_serial); if (__kmp_env_consistency_check && user_lock == NULL) { KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint"); } __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint)); #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_lock_init) { ompt_callbacks.ompt_callback(ompt_callback_lock_init)( ompt_mutex_nest_lock, (omp_lock_hint_t)hint, 
__ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif } #endif // KMP_USE_DYNAMIC_LOCK /* initialize the lock */ void __kmpc_init_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { #if KMP_USE_DYNAMIC_LOCK KMP_DEBUG_ASSERT(__kmp_init_serial); if (__kmp_env_consistency_check && user_lock == NULL) { KMP_FATAL(LockIsUninitialized, "omp_init_lock"); } __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_lock_init) { ompt_callbacks.ompt_callback(ompt_callback_lock_init)( ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif #else // KMP_USE_DYNAMIC_LOCK static char const *const func = "omp_init_lock"; kmp_user_lock_p lck; KMP_DEBUG_ASSERT(__kmp_init_serial); if (__kmp_env_consistency_check) { if (user_lock == NULL) { KMP_FATAL(LockIsUninitialized, func); } } KMP_CHECK_USER_LOCK_INIT(); if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #if KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #endif else { lck = __kmp_user_lock_allocate(user_lock, gtid, 0); } INIT_LOCK(lck); __kmp_set_user_lock_location(lck, loc); #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_lock_init) { ompt_callbacks.ompt_callback(ompt_callback_lock_init)( ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif #if USE_ITT_BUILD 
__kmp_itt_lock_creating(lck); #endif /* USE_ITT_BUILD */ #endif // KMP_USE_DYNAMIC_LOCK } // __kmpc_init_lock /* initialize the lock */ void __kmpc_init_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { #if KMP_USE_DYNAMIC_LOCK KMP_DEBUG_ASSERT(__kmp_init_serial); if (__kmp_env_consistency_check && user_lock == NULL) { KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock"); } __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq); #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_lock_init) { ompt_callbacks.ompt_callback(ompt_callback_lock_init)( ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif #else // KMP_USE_DYNAMIC_LOCK static char const *const func = "omp_init_nest_lock"; kmp_user_lock_p lck; KMP_DEBUG_ASSERT(__kmp_init_serial); if (__kmp_env_consistency_check) { if (user_lock == NULL) { KMP_FATAL(LockIsUninitialized, func); } } KMP_CHECK_USER_LOCK_INIT(); if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #if KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #endif else { lck = __kmp_user_lock_allocate(user_lock, gtid, 0); } INIT_NESTED_LOCK(lck); __kmp_set_user_lock_location(lck, loc); #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_lock_init) { ompt_callbacks.ompt_callback(ompt_callback_lock_init)( ompt_mutex_nest_lock, omp_lock_hint_none, 
__ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif #if USE_ITT_BUILD __kmp_itt_lock_creating(lck); #endif /* USE_ITT_BUILD */ #endif // KMP_USE_DYNAMIC_LOCK } // __kmpc_init_nest_lock void __kmpc_destroy_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { #if KMP_USE_DYNAMIC_LOCK #if USE_ITT_BUILD kmp_user_lock_p lck; if (KMP_EXTRACT_D_TAG(user_lock) == 0) { lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; } else { lck = (kmp_user_lock_p)user_lock; } __kmp_itt_lock_destroyed(lck); #endif #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_lock_destroy) { kmp_user_lock_p lck; if (KMP_EXTRACT_D_TAG(user_lock) == 0) { lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock; } else { lck = (kmp_user_lock_p)user_lock; } ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); #else kmp_user_lock_p lck; if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #if KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #endif else { lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_lock"); } #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_lock_destroy) { ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif #if USE_ITT_BUILD __kmp_itt_lock_destroyed(lck); #endif /* USE_ITT_BUILD */ 
DESTROY_LOCK(lck); if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { ; } #if KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { ; } #endif else { __kmp_user_lock_free(user_lock, gtid, lck); } #endif // KMP_USE_DYNAMIC_LOCK } // __kmpc_destroy_lock /* destroy the lock */ void __kmpc_destroy_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { #if KMP_USE_DYNAMIC_LOCK #if USE_ITT_BUILD kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock); __kmp_itt_lock_destroyed(ilk->lock); #endif #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_lock_destroy) { ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock); #else // KMP_USE_DYNAMIC_LOCK kmp_user_lock_p lck; if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #if KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #endif else { lck = __kmp_lookup_user_lock(user_lock, "omp_destroy_nest_lock"); } #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_lock_destroy) { ompt_callbacks.ompt_callback(ompt_callback_lock_destroy)( ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif #if USE_ITT_BUILD __kmp_itt_lock_destroyed(lck); #endif /* USE_ITT_BUILD */ 
DESTROY_NESTED_LOCK(lck); if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { ; } #if KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { ; } #endif else { __kmp_user_lock_free(user_lock, gtid, lck); } #endif // KMP_USE_DYNAMIC_LOCK } // __kmpc_destroy_nest_lock void __kmpc_set_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { KMP_COUNT_BLOCK(OMP_set_lock); #if KMP_USE_DYNAMIC_LOCK int tag = KMP_EXTRACT_D_TAG(user_lock); #if USE_ITT_BUILD __kmp_itt_lock_acquiring( (kmp_user_lock_p) user_lock); // itt function will get to the right lock object. #endif #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_mutex_acquire) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif #if KMP_USE_INLINED_TAS if (tag == locktag_tas && !__kmp_env_consistency_check) { KMP_ACQUIRE_TAS_LOCK(user_lock, gtid); } else #elif KMP_USE_INLINED_FUTEX if (tag == locktag_futex && !__kmp_env_consistency_check) { KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid); } else #endif { __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid); } #if USE_ITT_BUILD __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); #endif #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.ompt_callback_mutex_acquired) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif #else // KMP_USE_DYNAMIC_LOCK kmp_user_lock_p lck; if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #if KMP_USE_FUTEX 
else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #endif else { lck = __kmp_lookup_user_lock(user_lock, "omp_set_lock"); } #if USE_ITT_BUILD __kmp_itt_lock_acquiring(lck); #endif /* USE_ITT_BUILD */ #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_mutex_acquire) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, codeptr); } #endif ACQUIRE_LOCK(lck, gtid); #if USE_ITT_BUILD __kmp_itt_lock_acquired(lck); #endif /* USE_ITT_BUILD */ #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.ompt_callback_mutex_acquired) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); } #endif #endif // KMP_USE_DYNAMIC_LOCK } void __kmpc_set_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { #if KMP_USE_DYNAMIC_LOCK #if USE_ITT_BUILD __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); #endif #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.enabled) { if (ompt_enabled.ompt_callback_mutex_acquire) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } } #endif int acquire_status = KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid); (void) acquire_status; #if USE_ITT_BUILD __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); #endif #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled) { if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { 
if (ompt_enabled.ompt_callback_mutex_acquired) { // lock_first ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } } else { if (ompt_enabled.ompt_callback_nest_lock) { // lock_next ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } } } #endif #else // KMP_USE_DYNAMIC_LOCK int acquire_status; kmp_user_lock_p lck; if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #if KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #endif else { lck = __kmp_lookup_user_lock(user_lock, "omp_set_nest_lock"); } #if USE_ITT_BUILD __kmp_itt_lock_acquiring(lck); #endif /* USE_ITT_BUILD */ #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.enabled) { if (ompt_enabled.ompt_callback_mutex_acquire) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, codeptr); } } #endif ACQUIRE_NESTED_LOCK(lck, gtid, &acquire_status); #if USE_ITT_BUILD __kmp_itt_lock_acquired(lck); #endif /* USE_ITT_BUILD */ #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled) { if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) { if (ompt_enabled.ompt_callback_mutex_acquired) { // lock_first ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); } } else { if (ompt_enabled.ompt_callback_nest_lock) { // lock_next ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( ompt_scope_begin, 
(ompt_wait_id_t)(uintptr_t)lck, codeptr); } } } #endif #endif // KMP_USE_DYNAMIC_LOCK } void __kmpc_unset_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { #if KMP_USE_DYNAMIC_LOCK int tag = KMP_EXTRACT_D_TAG(user_lock); #if USE_ITT_BUILD __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); #endif #if KMP_USE_INLINED_TAS if (tag == locktag_tas && !__kmp_env_consistency_check) { KMP_RELEASE_TAS_LOCK(user_lock, gtid); } else #elif KMP_USE_INLINED_FUTEX if (tag == locktag_futex && !__kmp_env_consistency_check) { KMP_RELEASE_FUTEX_LOCK(user_lock, gtid); } else #endif { __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid); } #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_mutex_released) { ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif #else // KMP_USE_DYNAMIC_LOCK kmp_user_lock_p lck; /* Can't use serial interval since not block structured */ /* release the lock */ if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) <= OMP_LOCK_T_SIZE)) { #if KMP_OS_LINUX && \ (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) // "fast" path implemented to fix customer performance issue #if USE_ITT_BUILD __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); #endif /* USE_ITT_BUILD */ TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0); KMP_MB(); #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_mutex_released) { ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); } #endif return; #else lck = (kmp_user_lock_p)user_lock; #endif } #if 
KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #endif else { lck = __kmp_lookup_user_lock(user_lock, "omp_unset_lock"); } #if USE_ITT_BUILD __kmp_itt_lock_releasing(lck); #endif /* USE_ITT_BUILD */ RELEASE_LOCK(lck, gtid); #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_mutex_released) { ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); } #endif #endif // KMP_USE_DYNAMIC_LOCK } /* release the lock */ void __kmpc_unset_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { #if KMP_USE_DYNAMIC_LOCK #if USE_ITT_BUILD __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); #endif int release_status = KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid); (void) release_status; #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.enabled) { if (release_status == KMP_LOCK_RELEASED) { if (ompt_enabled.ompt_callback_mutex_released) { // release_lock_last ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } } else if (ompt_enabled.ompt_callback_nest_lock) { // release_lock_prev ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( ompt_scope_end, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } } #endif #else // KMP_USE_DYNAMIC_LOCK kmp_user_lock_p lck; /* Can't use serial interval since not block structured */ if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { #if KMP_OS_LINUX && \ (KMP_ARCH_X86 || 
KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) // "fast" path implemented to fix customer performance issue kmp_tas_lock_t *tl = (kmp_tas_lock_t *)user_lock; #if USE_ITT_BUILD __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock); #endif /* USE_ITT_BUILD */ #if OMPT_SUPPORT && OMPT_OPTIONAL int release_status = KMP_LOCK_STILL_HELD; #endif if (--(tl->lk.depth_locked) == 0) { TCW_4(tl->lk.poll, 0); #if OMPT_SUPPORT && OMPT_OPTIONAL release_status = KMP_LOCK_RELEASED; #endif } KMP_MB(); #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.enabled) { if (release_status == KMP_LOCK_RELEASED) { if (ompt_enabled.ompt_callback_mutex_released) { // release_lock_last ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); } } else if (ompt_enabled.ompt_callback_nest_lock) { // release_lock_previous ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr); } } #endif return; #else lck = (kmp_user_lock_p)user_lock; #endif } #if KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #endif else { lck = __kmp_lookup_user_lock(user_lock, "omp_unset_nest_lock"); } #if USE_ITT_BUILD __kmp_itt_lock_releasing(lck); #endif /* USE_ITT_BUILD */ int release_status; release_status = RELEASE_NESTED_LOCK(lck, gtid); #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.enabled) { if (release_status == KMP_LOCK_RELEASED) { if (ompt_enabled.ompt_callback_mutex_released) { // release_lock_last 
ompt_callbacks.ompt_callback(ompt_callback_mutex_released)( ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); } } else if (ompt_enabled.ompt_callback_nest_lock) { // release_lock_previous ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( ompt_mutex_scope_end, (ompt_wait_id_t)(uintptr_t)lck, codeptr); } } #endif #endif // KMP_USE_DYNAMIC_LOCK } /* try to acquire the lock */ int __kmpc_test_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { KMP_COUNT_BLOCK(OMP_test_lock); #if KMP_USE_DYNAMIC_LOCK int rc; int tag = KMP_EXTRACT_D_TAG(user_lock); #if USE_ITT_BUILD __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); #endif #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_mutex_acquire) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif #if KMP_USE_INLINED_TAS if (tag == locktag_tas && !__kmp_env_consistency_check) { KMP_TEST_TAS_LOCK(user_lock, gtid, rc); } else #elif KMP_USE_INLINED_FUTEX if (tag == locktag_futex && !__kmp_env_consistency_check) { KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc); } else #endif { rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid); } if (rc) { #if USE_ITT_BUILD __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); #endif #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.ompt_callback_mutex_acquired) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif return FTN_TRUE; } else { #if USE_ITT_BUILD __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); #endif return FTN_FALSE; } #else // KMP_USE_DYNAMIC_LOCK kmp_user_lock_p lck; int rc; if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) <= 
OMP_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #if KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) <= OMP_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #endif else { lck = __kmp_lookup_user_lock(user_lock, "omp_test_lock"); } #if USE_ITT_BUILD __kmp_itt_lock_acquiring(lck); #endif /* USE_ITT_BUILD */ #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_mutex_acquire) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( ompt_mutex_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, codeptr); } #endif rc = TEST_LOCK(lck, gtid); #if USE_ITT_BUILD if (rc) { __kmp_itt_lock_acquired(lck); } else { __kmp_itt_lock_cancelled(lck); } #endif /* USE_ITT_BUILD */ #if OMPT_SUPPORT && OMPT_OPTIONAL if (rc && ompt_enabled.ompt_callback_mutex_acquired) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( ompt_mutex_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); } #endif return (rc ? 
FTN_TRUE : FTN_FALSE); /* Can't use serial interval since not block structured */ #endif // KMP_USE_DYNAMIC_LOCK } /* try to acquire the lock */ int __kmpc_test_nest_lock(ident_t *loc, kmp_int32 gtid, void **user_lock) { #if KMP_USE_DYNAMIC_LOCK int rc; #if USE_ITT_BUILD __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); #endif #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.ompt_callback_mutex_acquire) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(user_lock), (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } #endif rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid); #if USE_ITT_BUILD if (rc) { __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock); } else { __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock); } #endif #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled && rc) { if (rc == 1) { if (ompt_enabled.ompt_callback_mutex_acquired) { // lock_first ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } } else { if (ompt_enabled.ompt_callback_nest_lock) { // lock_next ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( ompt_scope_begin, (ompt_wait_id_t)(uintptr_t)user_lock, codeptr); } } } #endif return rc; #else // KMP_USE_DYNAMIC_LOCK kmp_user_lock_p lck; int rc; if ((__kmp_user_lock_kind == lk_tas) && (sizeof(lck->tas.lk.poll) + sizeof(lck->tas.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #if KMP_USE_FUTEX else if ((__kmp_user_lock_kind == lk_futex) && (sizeof(lck->futex.lk.poll) + sizeof(lck->futex.lk.depth_locked) <= OMP_NEST_LOCK_T_SIZE)) { lck = (kmp_user_lock_p)user_lock; } #endif else { lck = __kmp_lookup_user_lock(user_lock, "omp_test_nest_lock"); } #if 
USE_ITT_BUILD __kmp_itt_lock_acquiring(lck); #endif /* USE_ITT_BUILD */ #if OMPT_SUPPORT && OMPT_OPTIONAL // This is the case, if called from omp_init_lock_with_hint: void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid); if (!codeptr) codeptr = OMPT_GET_RETURN_ADDRESS(0); if (ompt_enabled.enabled) && ompt_enabled.ompt_callback_mutex_acquire) { ompt_callbacks.ompt_callback(ompt_callback_mutex_acquire)( ompt_mutex_nest_lock, omp_lock_hint_none, __ompt_get_mutex_impl_type(), (ompt_wait_id_t)(uintptr_t)lck, codeptr); } #endif rc = TEST_NESTED_LOCK(lck, gtid); #if USE_ITT_BUILD if (rc) { __kmp_itt_lock_acquired(lck); } else { __kmp_itt_lock_cancelled(lck); } #endif /* USE_ITT_BUILD */ #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled && rc) { if (rc == 1) { if (ompt_enabled.ompt_callback_mutex_acquired) { // lock_first ompt_callbacks.ompt_callback(ompt_callback_mutex_acquired)( ompt_mutex_nest_lock, (ompt_wait_id_t)(uintptr_t)lck, codeptr); } } else { if (ompt_enabled.ompt_callback_nest_lock) { // lock_next ompt_callbacks.ompt_callback(ompt_callback_nest_lock)( ompt_mutex_scope_begin, (ompt_wait_id_t)(uintptr_t)lck, codeptr); } } } #endif return rc; /* Can't use serial interval since not block structured */ #endif // KMP_USE_DYNAMIC_LOCK } // Interface to fast scalable reduce methods routines // keep the selected method in a thread local structure for cross-function // usage: will be used in __kmpc_end_reduce* functions; // another solution: to re-determine the method one more time in // __kmpc_end_reduce* functions (new prototype required then) // AT: which solution is better? 
// Store the reduction method chosen for this thread so the matching
// __kmpc_end_reduce* call can retrieve it (see packed_reduction_method macros
// in kmp.h).
#define __KMP_SET_REDUCTION_METHOD(gtid, rmethod) \
  ((__kmp_threads[(gtid)]->th.th_local.packed_reduction_method) = (rmethod))

#define __KMP_GET_REDUCTION_METHOD(gtid) \
  (__kmp_threads[(gtid)]->th.th_local.packed_reduction_method)

// description of the packed_reduction_method variable: look at the macros in
// kmp.h

// used in a critical section reduce block
// Acquire the per-reduction critical-section lock stored in (or reachable
// from) *crit, initializing the lock lazily on first use.
static __forceinline void
__kmp_enter_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
                                          kmp_critical_name *crit) {

  // this lock was visible to a customer and to the threading profile tool as a
  // serial overhead span (although it's used for an internal purpose only)
  // why was it visible in previous implementation?
  // should we keep it visible in new reduce block?
  kmp_user_lock_p lck;

#if KMP_USE_DYNAMIC_LOCK

  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
  // Check if it is initialized. Zero means "never used"; the CAS below makes
  // exactly one thread perform the initialization.
  if (*lk == 0) {
    if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
      KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0,
                                  KMP_GET_D_TAG(__kmp_user_lock_seq));
    } else {
      __kmp_init_indirect_csptr(crit, loc, global_tid,
                                KMP_GET_I_TAG(__kmp_user_lock_seq));
    }
  }
  // Branch for accessing the actual lock object and set operation. This
  // branching is inevitable since this lock initialization does not follow the
  // normal dispatch path (lock table is not used).
  if (KMP_EXTRACT_D_TAG(lk) != 0) {
    // Direct lock: the lock word itself is the lock object.
    lck = (kmp_user_lock_p)lk;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
  } else {
    // Indirect lock: the word holds a pointer to the lock descriptor.
    kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
    lck = ilk->lock;
    KMP_DEBUG_ASSERT(lck != NULL);
    if (__kmp_env_consistency_check) {
      __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
    }
    KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // We know that the fast reduction code is only emitted by Intel compilers
  // with 32 byte critical sections. If there isn't enough space, then we
  // have to use a pointer.
  if (__kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE) {
    lck = (kmp_user_lock_p)crit;
  } else {
    lck = __kmp_get_critical_section_ptr(crit, loc, global_tid);
  }
  KMP_DEBUG_ASSERT(lck != NULL);

  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_critical, loc, lck);

  __kmp_acquire_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK
}

// used in a critical section reduce block
// Release the critical-section lock acquired by
// __kmp_enter_critical_section_reduce_block.
static __forceinline void
__kmp_end_critical_section_reduce_block(ident_t *loc, kmp_int32 global_tid,
                                        kmp_critical_name *crit) {

  kmp_user_lock_p lck;

#if KMP_USE_DYNAMIC_LOCK

  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
    lck = (kmp_user_lock_p)crit;
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_critical, loc);
    KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
  } else {
    kmp_indirect_lock_t *ilk =
        (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_critical, loc);
    KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
  }

#else // KMP_USE_DYNAMIC_LOCK

  // We know that the fast reduction code is only emitted by Intel compilers
  // with 32 byte critical sections. If there isn't enough space, then we have
  // to use a pointer.
  if (__kmp_base_user_lock_size > 32) {
    lck = *((kmp_user_lock_p *)crit);
    KMP_ASSERT(lck != NULL);
  } else {
    lck = (kmp_user_lock_p)crit;
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_critical, loc);

  __kmp_release_user_lock_with_checks(lck, global_tid);

#endif // KMP_USE_DYNAMIC_LOCK
} // __kmp_end_critical_section_reduce_block

// If the thread is the master of a team inside a teams construct performing a
// reduction at teams level, temporarily rebind it to the parent team and
// return 1 (undone by __kmp_restore_swapped_teams); otherwise return 0.
static __forceinline int
__kmp_swap_teams_for_teams_reduction(kmp_info_t *th, kmp_team_t **team_p,
                                     int *task_state) {
  kmp_team_t *team;

  // Check if we are inside the teams construct?
  if (th->th.th_teams_microtask) {
    *team_p = team = th->th.th_team;
    if (team->t.t_level == th->th.th_teams_level) {
      // This is reduction at teams construct.
      KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
      // Let's swap teams temporarily for the reduction.
      th->th.th_info.ds.ds_tid = team->t.t_master_tid;
      th->th.th_team = team->t.t_parent;
      th->th.th_team_nproc = th->th.th_team->t.t_nproc;
      th->th.th_task_team = th->th.th_team->t.t_task_team[0];
      *task_state = th->th.th_task_state;
      th->th.th_task_state = 0;

      return 1;
    }
  }
  return 0;
}

static __forceinline void __kmp_restore_swapped_teams(kmp_info_t *th,
                                                      kmp_team_t *team,
                                                      int task_state) {
  // Restore thread structure swapped in __kmp_swap_teams_for_teams_reduction.
  th->th.th_info.ds.ds_tid = 0;
  th->th.th_team = team;
  th->th.th_team_nproc = team->t.t_nproc;
  th->th.th_task_team = team->t.t_task_team[task_state];
  th->th.th_task_state = task_state;
}

/* 2.a.i. Reduce Block without a terminating barrier */

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread number
@param num_vars number of items (variables) to be reduced
@param reduce_size size of data in bytes to be reduced
@param reduce_data pointer to data to be reduced
@param reduce_func callback function providing reduction operation on two
operands and returning result of reduction in lhs_data
@param lck pointer to the unique lock data structure
@result 1 for the master thread, 0 for all other team threads, 2 for all team
threads if atomic reduction needed

The nowait version is used for a reduce clause with the nowait argument.
*/
kmp_int32
__kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                     size_t reduce_size, void *reduce_data,
                     void (*reduce_func)(void *lhs_data, void *rhs_data),
                     kmp_critical_name *lck) {

  KMP_COUNT_BLOCK(REDUCE_nowait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;
  KA_TRACE(10, ("__kmpc_reduce_nowait() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be used as a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);

  // packed_reduction_method value will be reused by __kmp_end_reduce* function,
  // the value should be kept in a variable
  // the variable should be either a construct-specific or thread-specific
  // property, not a team specific property
  //   (a thread can reach the next reduce block on the next construct, reduce
  //   method may differ on the next construct)
  // an ident_t "loc" parameter could be used as a construct-specific property
  // (what if loc == 0?)
  //   (if both construct-specific and team-specific variables were shared,
  //   then unness extra syncs should be needed)
  // a thread-specific variable is better regarding two issues above (next
  // construct and extra syncs)
  //   a thread-specific "th_local.reduction_method" variable is used currently
  // each thread executes 'determine' and 'set' lines (no need to execute by one
  // thread, to avoid unness extra syncs)

  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

    // all threads should do this pop here (because __kmpc_end_reduce_nowait()
    // won't be called by the code gen)
    // (it's not quite good, because the checking block has been closed by
    // this 'pop',
    //  but atomic operation has not been executed yet, will be executed
    //  slightly later, literally on next instruction)
    if (__kmp_env_consistency_check)
      __kmp_pop_sync(global_tid, ct_reduce, loc);

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // AT: performance issue: a real barrier here
    // AT: (if master goes slow, other threads are blocked here waiting for the
    // master to come and release them)
    // AT: (it's not what a customer might expect specifying NOWAIT clause)
    // AT: (specifying NOWAIT won't result in improvement of performance, it'll
    // be confusing to a customer)
    // AT: another implementation of *barrier_gather*nowait() (or some other
    // design) might go faster and be more in line with sense of NOWAIT
    // AT: TO DO: do epcc test and compare times

    // this barrier should be invisible to a customer and to the threading
    // profile tool (it's neither a terminating barrier nor customer's code,
    // it's used for an internal purpose)
#if OMPT_SUPPORT
    // JP: can this barrier potentially leed to task scheduling?
    // JP: as long as there is a barrier in the implementation, OMPT should and
    // will provide the barrier events
    //     so we set-up the necessary frame/return addresses.
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, FALSE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all other workers except master should do this pop here
    //     ( none of other workers will get to __kmpc_end_reduce_nowait() )
    if (__kmp_env_consistency_check) {
      if (retval == 0) {
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }
  KA_TRACE(
      10,
      ("__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n",
       global_tid, packed_reduction_method, retval));

  return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a reduce nowait.
*/ void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck) { PACKED_REDUCTION_METHOD_T packed_reduction_method; KA_TRACE(10, ("__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid)); packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); if (packed_reduction_method == critical_reduce_block) { __kmp_end_critical_section_reduce_block(loc, global_tid, lck); } else if (packed_reduction_method == empty_reduce_block) { // usage: if team size == 1, no synchronization is required ( on Intel // platforms only ) } else if (packed_reduction_method == atomic_reduce_block) { // neither master nor other workers should get here // (code gen does not generate this call in case 2: atomic reduce block) // actually it's better to remove this elseif at all; // after removal this value will checked by the 'else' and will assert } else if (TEST_REDUCTION_METHOD(packed_reduction_method, tree_reduce_block)) { // only master gets here } else { // should never reach this block KMP_ASSERT(0); // "unexpected method" } if (__kmp_env_consistency_check) __kmp_pop_sync(global_tid, ct_reduce, loc); KA_TRACE(10, ("__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method)); return; } /* 2.a.ii. Reduce Block with a terminating barrier */ /*! @ingroup SYNCHRONIZATION @param loc source location information @param global_tid global thread number @param num_vars number of items (variables) to be reduced @param reduce_size size of data in bytes to be reduced @param reduce_data pointer to data to be reduced @param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data @param lck pointer to the unique lock data structure @result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction needed A blocking reduce that includes an implicit barrier. 
*/
kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars,
                        size_t reduce_size, void *reduce_data,
                        void (*reduce_func)(void *lhs_data, void *rhs_data),
                        kmp_critical_name *lck) {
  KMP_COUNT_BLOCK(REDUCE_wait);
  int retval = 0;
  PACKED_REDUCTION_METHOD_T packed_reduction_method;
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;

  KA_TRACE(10, ("__kmpc_reduce() enter: called T#%d\n", global_tid));

  // why do we need this initialization here at all?
  // Reduction clause can not be a stand-alone directive.

  // do not call __kmp_serial_initialize(), it will be called by
  // __kmp_parallel_initialize() if needed
  // possible detection of false-positive race by the threadchecker ???
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();

// check correctness of reduce block nesting
#if KMP_USE_DYNAMIC_LOCK
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL, 0);
#else
  if (__kmp_env_consistency_check)
    __kmp_push_sync(global_tid, ct_reduce, loc, NULL);
#endif

  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);

  // Method selection and bookkeeping: see the detailed commentary in
  // __kmpc_reduce_nowait(); the stored value is consumed by
  // __kmpc_end_reduce().
  packed_reduction_method = __kmp_determine_reduction_method(
      loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
  __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);

  if (packed_reduction_method == critical_reduce_block) {

    __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
    retval = 1;

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size == 1, no synchronization is required ( Intel
    // platforms only )
    retval = 1;

  } else if (packed_reduction_method == atomic_reduce_block) {

    retval = 2;

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // case tree_reduce_block:
    // this barrier should be visible to a customer and to the threading profile
    // tool (it's a terminating barrier on constructs if NOWAIT not specified)
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident =
        loc; // needed for correct notification of frames
#endif
    retval =
        __kmp_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                      global_tid, TRUE, reduce_size, reduce_data, reduce_func);
    retval = (retval != 0) ? (0) : (1);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

    // all other workers except master should do this pop here
    // ( none of other workers except master will enter __kmpc_end_reduce() )
    if (__kmp_env_consistency_check) {
      if (retval == 0) { // 0: all other workers; 1: master
        __kmp_pop_sync(global_tid, ct_reduce, loc);
      }
    }

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }

  KA_TRACE(10,
           ("__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n",
            global_tid, packed_reduction_method, retval));
  return retval;
}

/*!
@ingroup SYNCHRONIZATION
@param loc source location information
@param global_tid global thread id.
@param lck pointer to the unique lock data structure

Finish the execution of a blocking reduce.
The lck pointer must be the same as that used in the corresponding
start function.
*/
// Unlike the nowait variant, every non-tree method here still executes the
// terminating plain barrier (each branch below calls __kmp_barrier); the tree
// method instead releases the workers parked in __kmpc_reduce().
void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid,
                       kmp_critical_name *lck) {

  PACKED_REDUCTION_METHOD_T packed_reduction_method;
  kmp_info_t *th;
  kmp_team_t *team;
  int teams_swapped = 0, task_state;

  KA_TRACE(10, ("__kmpc_end_reduce() enter: called T#%d\n", global_tid));

  th = __kmp_thread_from_gtid(global_tid);
  teams_swapped = __kmp_swap_teams_for_teams_reduction(th, &team, &task_state);

  packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

  // this barrier should be visible to a customer and to the threading profile
  // tool (it's a terminating barrier on constructs if NOWAIT not specified)

  if (packed_reduction_method == critical_reduce_block) {
    __kmp_end_critical_section_reduce_block(loc, global_tid, lck);

    // TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == empty_reduce_block) {

    // usage: if team size==1, no synchronization is required (Intel platforms
    // only)

    // TODO: implicit barrier: should be exposed
#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (packed_reduction_method == atomic_reduce_block) {

#if OMPT_SUPPORT
    ompt_frame_t *ompt_frame;
    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
      if (ompt_frame->enter_frame.ptr == NULL)
        ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
      OMPT_STORE_RETURN_ADDRESS(global_tid);
    }
#endif
    // TODO: implicit barrier: should be exposed
#if USE_ITT_NOTIFY
    __kmp_threads[global_tid]->th.th_ident = loc;
#endif
    __kmp_barrier(bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt_enabled.enabled) {
      ompt_frame->enter_frame = ompt_data_none;
    }
#endif

  } else if (TEST_REDUCTION_METHOD(packed_reduction_method,
                                   tree_reduce_block)) {

    // only master executes here (master releases all other workers)
    __kmp_end_split_barrier(UNPACK_REDUCTION_BARRIER(packed_reduction_method),
                            global_tid);

  } else {

    // should never reach this block
    KMP_ASSERT(0); // "unexpected method"
  }
  if (teams_swapped) {
    __kmp_restore_swapped_teams(th, team, task_state);
  }

  if (__kmp_env_consistency_check)
    __kmp_pop_sync(global_tid, ct_reduce, loc);

  KA_TRACE(10, ("__kmpc_end_reduce() exit: called T#%d: method %08x\n",
                global_tid, packed_reduction_method));

  return;
}

#undef __KMP_GET_REDUCTION_METHOD
#undef __KMP_SET_REDUCTION_METHOD

/* end of interface to fast scalable reduce routines */

// Return the id of the current task, or 0 when called from outside the
// runtime (no valid gtid).
kmp_uint64 __kmpc_get_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  return thread->th.th_current_task->td_task_id;

} // __kmpc_get_taskid

// Return the id of the current task's parent, or 0 when there is no parent
// task or no valid gtid.
kmp_uint64 __kmpc_get_parent_taskid() {

  kmp_int32 gtid;
  kmp_info_t *thread;
  kmp_taskdata_t *parent_task;

  gtid = __kmp_get_gtid();
  if (gtid < 0) {
    return 0;
  }
  thread = __kmp_thread_from_gtid(gtid);
  parent_task = thread->th.th_current_task->td_parent;
  return (parent_task == NULL ? 0 : parent_task->td_task_id);

} // __kmpc_get_parent_taskid

/*!
@ingroup WORK_SHARING
@param loc source location information.
@param gtid global thread number.
@param num_dims number of associated doacross loops.
@param dims info on loops bounds.

Initialize doacross loop information.
Expect compiler send us inclusive bounds,
e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
*/
void __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims,
                          const struct kmp_dim *dims) {
  int j, idx;
  kmp_int64 last, trace_count;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_uint32 *flags;
  kmp_disp_t *pr_buf = th->th.th_dispatch;
  dispatch_shared_info_t *sh_buf;

  KA_TRACE(
      20,
      ("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
       gtid, num_dims, !team->t.t_serialized));
  KMP_DEBUG_ASSERT(dims != NULL);
  KMP_DEBUG_ASSERT(num_dims > 0);

  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_init() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }
  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for
  // the next loop
  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];

  // Save bounds info into allocated private buffer
  // Private layout: [0]=num_dims, [1]=&doacross_num_done, then per dimension
  // (range_length, lo, up, st) — dim 0 stores (lo, up, st) at [2..4].
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
  pr_buf->th_doacross_info = (kmp_int64 *)__kmp_thread_malloc(
      th, sizeof(kmp_int64) * (4 * num_dims + 1));
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  pr_buf->th_doacross_info[0] =
      (kmp_int64)num_dims; // first element is number of dimensions
  // Save also address of num_done in order to access it later without knowing
  // the buffer index
  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
  pr_buf->th_doacross_info[2] = dims[0].lo;
  pr_buf->th_doacross_info[3] = dims[0].up;
  pr_buf->th_doacross_info[4] = dims[0].st;
  last = 5;
  for (j = 1; j < num_dims; ++j) {
    kmp_int64
        range_length; // To keep ranges of all dimensions but the first dims[0]
    if (dims[j].st == 1) { // most common case
      // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
      range_length = dims[j].up - dims[j].lo + 1;
    } else {
      if (dims[j].st > 0) {
        KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
        range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
      } else { // negative increment
        KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
        range_length =
            (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
      }
    }
    pr_buf->th_doacross_info[last++] = range_length;
    pr_buf->th_doacross_info[last++] = dims[j].lo;
    pr_buf->th_doacross_info[last++] = dims[j].up;
    pr_buf->th_doacross_info[last++] = dims[j].st;
  }

  // Compute total trip count.
  // Start with range of dims[0] which we don't need to keep in the buffer.
  if (dims[0].st == 1) { // most common case
    trace_count = dims[0].up - dims[0].lo + 1;
  } else if (dims[0].st > 0) {
    KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
    trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
  } else { // negative increment
    KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
    trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
  }
  for (j = 1; j < num_dims; ++j) {
    trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
  }
  KMP_DEBUG_ASSERT(trace_count > 0);

  // Check if shared buffer is not occupied by other loop (idx -
  // __kmp_dispatch_num_buffers)
  if (idx != sh_buf->doacross_buf_idx) {
    // Shared buffer is occupied, wait for it to be free
    __kmp_wait_4((volatile kmp_uint32 *)&sh_buf->doacross_buf_idx, idx,
                 __kmp_eq_4, NULL);
  }
#if KMP_32_BIT_ARCH
  // Check if we are the first thread. After the CAS the first thread gets 0,
  // others get 1 if initialization is in progress, allocated pointer otherwise.
  // Treat pointer as volatile integer (value 0 or 1) until memory is allocated.
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET32(
      (volatile kmp_int32 *)&sh_buf->doacross_flags, NULL, 1);
#else
  flags = (kmp_uint32 *)KMP_COMPARE_AND_STORE_RET64(
      (volatile kmp_int64 *)&sh_buf->doacross_flags, NULL, 1LL);
#endif
  if (flags == NULL) {
    // we are the first thread, allocate the array of flags
    size_t size = trace_count / 8 + 8; // in bytes, use single bit per iteration
    flags = (kmp_uint32 *)__kmp_thread_calloc(th, size, 1);
    KMP_MB();
    sh_buf->doacross_flags = flags;
  } else if (flags == (kmp_uint32 *)1) {
#if KMP_32_BIT_ARCH
    // initialization is still in progress, need to wait
    while (*(volatile kmp_int32 *)&sh_buf->doacross_flags == 1)
#else
    while (*(volatile kmp_int64 *)&sh_buf->doacross_flags == 1LL)
#endif
      KMP_YIELD(TRUE);
    KMP_MB();
  } else {
    KMP_MB();
  }
  KMP_DEBUG_ASSERT(sh_buf->doacross_flags > (kmp_uint32 *)1); // check ptr value
  pr_buf->th_doacross_flags =
      sh_buf->doacross_flags; // save private copy in order to not
  // touch shared buffer on each iteration
  KA_TRACE(20, ("__kmpc_doacross_init() exit: T#%d\n", gtid)); 
}

// Block until the source iteration identified by vec (one value per
// dimension) has been posted; out-of-range vectors are treated as satisfied
// and simply return.
void __kmpc_doacross_wait(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, up, st;

  KA_TRACE(20, ("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_wait() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number and check out-of-bounds condition
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  up = pr_buf->th_doacross_info[3];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    if (vec[0] < lo || vec[0] > up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    if (vec[0] > lo || vec[0] < up) {
      KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                    "bounds [%lld,%lld]\n",
                    gtid, vec[0], lo, up));
      return;
    }
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  // Fold remaining dimensions into a single linear iteration number.
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    up = pr_buf->th_doacross_info[j + 3];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = vec[i] - lo;
    } else if (st > 0) {
      if (vec[i] < lo || vec[i] > up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      if (vec[i] > lo || vec[i] < up) {
        KA_TRACE(20, ("__kmpc_doacross_wait() exit: T#%d iter %lld is out of "
                      "bounds [%lld,%lld]\n",
                      gtid, vec[i], lo, up));
        return;
      }
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  // Spin on the single bit corresponding to the iteration number.
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  while ((flag & pr_buf->th_doacross_flags[iter_number]) == 0) {
    KMP_YIELD(TRUE);
  }
  KMP_MB();
  KA_TRACE(20,
           ("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
            gtid, (iter_number << 5) + shft));
}

// Mark the iteration identified by vec as completed by atomically setting its
// bit in the shared flags array (same linearization as in "wait", but without
// out-of-bounds checks).
void __kmpc_doacross_post(ident_t *loc, int gtid, const kmp_int64 *vec) {
  kmp_int32 shft, num_dims, i;
  kmp_uint32 flag;
  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf;
  kmp_int64 lo, st;

  KA_TRACE(20, ("__kmpc_doacross_post() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_post() exit: serialized team\n"));
    return; // no dependencies if team is serialized
  }

  // calculate sequential iteration number (same as in "wait" but no
  // out-of-bounds checks)
  pr_buf = th->th.th_dispatch;
  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
  num_dims = pr_buf->th_doacross_info[0];
  lo = pr_buf->th_doacross_info[2];
  st = pr_buf->th_doacross_info[4];
  if (st == 1) { // most common case
    iter_number = vec[0] - lo;
  } else if (st > 0) {
    iter_number = (kmp_uint64)(vec[0] - lo) / st;
  } else { // negative increment
    iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
  }
  for (i = 1; i < num_dims; ++i) {
    kmp_int64 iter, ln;
    kmp_int32 j = i * 4;
    ln = pr_buf->th_doacross_info[j + 1];
    lo = pr_buf->th_doacross_info[j + 2];
    st = pr_buf->th_doacross_info[j + 4];
    if (st == 1) {
      iter = vec[i] - lo;
    } else if (st > 0) {
      iter = (kmp_uint64)(vec[i] - lo) / st;
    } else { // st < 0
      iter = (kmp_uint64)(lo - vec[i]) / (-st);
    }
    iter_number = iter + ln * iter_number;
  }
  shft = iter_number % 32; // use 32-bit granularity
  iter_number >>= 5; // divided by 32
  flag = 1 << shft;
  KMP_MB();
  // Only set the bit if it is not already set, to avoid needless atomics.
  if ((flag & pr_buf->th_doacross_flags[iter_number]) == 0)
    KMP_TEST_THEN_OR32(&pr_buf->th_doacross_flags[iter_number], flag);
  KA_TRACE(20, ("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", gtid,
                (iter_number << 5) + shft));
}

// Finalize a doacross loop: the last thread to arrive frees the shared flags
// buffer and recycles the dispatch buffer; every thread frees its private
// bounds info.
void __kmpc_doacross_fini(ident_t *loc, int gtid) {
  kmp_int32 num_done;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;
  kmp_disp_t *pr_buf = th->th.th_dispatch;

  KA_TRACE(20, ("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
  if (team->t.t_serialized) {
    KA_TRACE(20, ("__kmpc_doacross_fini() exit: serialized team %p\n", team));
    return; // nothing to do
  }
  num_done = KMP_TEST_THEN_INC32((kmp_int32 *)pr_buf->th_doacross_info[1]) + 1;
  if (num_done == th->th.th_team_nproc) {
    // we are the last thread, need to free shared resources
    int idx = pr_buf->th_doacross_buf_idx - 1;
    dispatch_shared_info_t *sh_buf =
        &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] ==
                     (kmp_int64)&sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(num_done == sh_buf->doacross_num_done);
    KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
    __kmp_thread_free(th, CCAST(kmp_uint32 *, sh_buf->doacross_flags));
    sh_buf->doacross_flags = NULL;
    sh_buf->doacross_num_done = 0;
    sh_buf->doacross_buf_idx +=
        __kmp_dispatch_num_buffers; // free buffer for future re-use
  }
  // free private resources (need to keep buffer index forever)
  pr_buf->th_doacross_flags = NULL;
  __kmp_thread_free(th, (void *)pr_buf->th_doacross_info);
  pr_buf->th_doacross_info = NULL;
  KA_TRACE(20, ("__kmpc_doacross_fini() exit: T#%d\n", gtid));
}

/* omp_alloc/omp_free only defined for C/C++, not for Fortran */
// Thin wrappers forwarding to the internal allocator entry points with the
// caller's global thread id.
void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
  return __kmpc_alloc(__kmp_entry_gtid(), size, allocator);
}

void omp_free(void *ptr, omp_allocator_handle_t allocator) {
  __kmpc_free(__kmp_entry_gtid(), ptr, allocator);
}

// Report the current target-offload policy, initializing the serial runtime
// first if necessary.
int __kmpc_get_target_offload(void) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  return __kmp_target_offload;
}

int __kmpc_pause_resource(kmp_pause_status_t level) {
  if (!__kmp_init_serial) {
    return 1; // Can't pause if runtime is not initialized
  }
  return __kmp_pause_resource(level);
}

Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_ftn_os.h
===================================================================
--- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_ftn_os.h	(revision 357058)
+++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_ftn_os.h	(revision 357059)
@@ -1,637 +1,657 @@
/*
 * kmp_ftn_os.h -- KPTS Fortran defines header file.
*/ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef KMP_FTN_OS_H #define KMP_FTN_OS_H // KMP_FNT_ENTRIES may be one of: KMP_FTN_PLAIN, KMP_FTN_UPPER, KMP_FTN_APPEND, // KMP_FTN_UAPPEND. /* -------------------------- External definitions ------------------------ */ #if KMP_FTN_ENTRIES == KMP_FTN_PLAIN #define FTN_SET_STACKSIZE kmp_set_stacksize #define FTN_SET_STACKSIZE_S kmp_set_stacksize_s #define FTN_GET_STACKSIZE kmp_get_stacksize #define FTN_GET_STACKSIZE_S kmp_get_stacksize_s #define FTN_SET_BLOCKTIME kmp_set_blocktime #define FTN_GET_BLOCKTIME kmp_get_blocktime #define FTN_SET_LIBRARY_SERIAL kmp_set_library_serial #define FTN_SET_LIBRARY_TURNAROUND kmp_set_library_turnaround #define FTN_SET_LIBRARY_THROUGHPUT kmp_set_library_throughput #define FTN_SET_LIBRARY kmp_set_library #define FTN_GET_LIBRARY kmp_get_library #define FTN_SET_DEFAULTS kmp_set_defaults #define FTN_SET_DISP_NUM_BUFFERS kmp_set_disp_num_buffers #define FTN_SET_AFFINITY kmp_set_affinity #define FTN_GET_AFFINITY kmp_get_affinity #define FTN_GET_AFFINITY_MAX_PROC kmp_get_affinity_max_proc #define FTN_CREATE_AFFINITY_MASK kmp_create_affinity_mask #define FTN_DESTROY_AFFINITY_MASK kmp_destroy_affinity_mask #define FTN_SET_AFFINITY_MASK_PROC kmp_set_affinity_mask_proc #define FTN_UNSET_AFFINITY_MASK_PROC kmp_unset_affinity_mask_proc #define FTN_GET_AFFINITY_MASK_PROC kmp_get_affinity_mask_proc #define FTN_MALLOC kmp_malloc #define FTN_ALIGNED_MALLOC kmp_aligned_malloc #define FTN_CALLOC kmp_calloc #define FTN_REALLOC kmp_realloc #define FTN_KFREE kmp_free #define FTN_GET_NUM_KNOWN_THREADS kmp_get_num_known_threads #define FTN_SET_NUM_THREADS omp_set_num_threads #define 
FTN_GET_NUM_THREADS omp_get_num_threads #define FTN_GET_MAX_THREADS omp_get_max_threads #define FTN_GET_THREAD_NUM omp_get_thread_num #define FTN_GET_NUM_PROCS omp_get_num_procs #define FTN_SET_DYNAMIC omp_set_dynamic #define FTN_GET_DYNAMIC omp_get_dynamic #define FTN_SET_NESTED omp_set_nested #define FTN_GET_NESTED omp_get_nested #define FTN_IN_PARALLEL omp_in_parallel #define FTN_GET_THREAD_LIMIT omp_get_thread_limit #define FTN_SET_SCHEDULE omp_set_schedule #define FTN_GET_SCHEDULE omp_get_schedule #define FTN_SET_MAX_ACTIVE_LEVELS omp_set_max_active_levels #define FTN_GET_MAX_ACTIVE_LEVELS omp_get_max_active_levels #define FTN_GET_ACTIVE_LEVEL omp_get_active_level #define FTN_GET_LEVEL omp_get_level #define FTN_GET_ANCESTOR_THREAD_NUM omp_get_ancestor_thread_num #define FTN_GET_TEAM_SIZE omp_get_team_size #define FTN_IN_FINAL omp_in_final #define FTN_GET_PROC_BIND omp_get_proc_bind #define FTN_GET_NUM_TEAMS omp_get_num_teams #define FTN_GET_TEAM_NUM omp_get_team_num #define FTN_INIT_LOCK omp_init_lock #if KMP_USE_DYNAMIC_LOCK #define FTN_INIT_LOCK_WITH_HINT omp_init_lock_with_hint #define FTN_INIT_NEST_LOCK_WITH_HINT omp_init_nest_lock_with_hint #endif #define FTN_DESTROY_LOCK omp_destroy_lock #define FTN_SET_LOCK omp_set_lock #define FTN_UNSET_LOCK omp_unset_lock #define FTN_TEST_LOCK omp_test_lock #define FTN_INIT_NEST_LOCK omp_init_nest_lock #define FTN_DESTROY_NEST_LOCK omp_destroy_nest_lock #define FTN_SET_NEST_LOCK omp_set_nest_lock #define FTN_UNSET_NEST_LOCK omp_unset_nest_lock #define FTN_TEST_NEST_LOCK omp_test_nest_lock #define FTN_SET_WARNINGS_ON kmp_set_warnings_on #define FTN_SET_WARNINGS_OFF kmp_set_warnings_off #define FTN_GET_WTIME omp_get_wtime #define FTN_GET_WTICK omp_get_wtick #define FTN_GET_NUM_DEVICES omp_get_num_devices #define FTN_GET_DEFAULT_DEVICE omp_get_default_device #define FTN_SET_DEFAULT_DEVICE omp_set_default_device #define FTN_IS_INITIAL_DEVICE omp_is_initial_device #define FTN_GET_CANCELLATION omp_get_cancellation #define 
FTN_GET_CANCELLATION_STATUS kmp_get_cancellation_status #define FTN_GET_MAX_TASK_PRIORITY omp_get_max_task_priority #define FTN_GET_NUM_PLACES omp_get_num_places #define FTN_GET_PLACE_NUM_PROCS omp_get_place_num_procs #define FTN_GET_PLACE_PROC_IDS omp_get_place_proc_ids #define FTN_GET_PLACE_NUM omp_get_place_num #define FTN_GET_PARTITION_NUM_PLACES omp_get_partition_num_places #define FTN_GET_PARTITION_PLACE_NUMS omp_get_partition_place_nums #define FTN_GET_INITIAL_DEVICE omp_get_initial_device #ifdef KMP_STUB #define FTN_TARGET_ALLOC omp_target_alloc #define FTN_TARGET_FREE omp_target_free #define FTN_TARGET_IS_PRESENT omp_target_is_present #define FTN_TARGET_MEMCPY omp_target_memcpy #define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect #define FTN_TARGET_ASSOCIATE_PTR omp_target_associate_ptr #define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr #endif #define FTN_CONTROL_TOOL omp_control_tool #define FTN_INIT_ALLOCATOR omp_init_allocator #define FTN_DESTROY_ALLOCATOR omp_destroy_allocator #define FTN_SET_DEFAULT_ALLOCATOR omp_set_default_allocator #define FTN_GET_DEFAULT_ALLOCATOR omp_get_default_allocator #define FTN_GET_DEVICE_NUM omp_get_device_num #define FTN_SET_AFFINITY_FORMAT omp_set_affinity_format #define FTN_GET_AFFINITY_FORMAT omp_get_affinity_format #define FTN_DISPLAY_AFFINITY omp_display_affinity #define FTN_CAPTURE_AFFINITY omp_capture_affinity #define FTN_PAUSE_RESOURCE omp_pause_resource #define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all #define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels #define FTN_FULFILL_EVENT omp_fulfill_event #endif /* KMP_FTN_PLAIN */ /* ------------------------------------------------------------------------ */ #if KMP_FTN_ENTRIES == KMP_FTN_APPEND #define FTN_SET_STACKSIZE kmp_set_stacksize_ #define FTN_SET_STACKSIZE_S kmp_set_stacksize_s_ #define FTN_GET_STACKSIZE kmp_get_stacksize_ #define FTN_GET_STACKSIZE_S kmp_get_stacksize_s_ #define FTN_SET_BLOCKTIME kmp_set_blocktime_ #define 
FTN_GET_BLOCKTIME kmp_get_blocktime_ #define FTN_SET_LIBRARY_SERIAL kmp_set_library_serial_ #define FTN_SET_LIBRARY_TURNAROUND kmp_set_library_turnaround_ #define FTN_SET_LIBRARY_THROUGHPUT kmp_set_library_throughput_ #define FTN_SET_LIBRARY kmp_set_library_ #define FTN_GET_LIBRARY kmp_get_library_ #define FTN_SET_DEFAULTS kmp_set_defaults_ #define FTN_SET_DISP_NUM_BUFFERS kmp_set_disp_num_buffers_ #define FTN_SET_AFFINITY kmp_set_affinity_ #define FTN_GET_AFFINITY kmp_get_affinity_ #define FTN_GET_AFFINITY_MAX_PROC kmp_get_affinity_max_proc_ #define FTN_CREATE_AFFINITY_MASK kmp_create_affinity_mask_ #define FTN_DESTROY_AFFINITY_MASK kmp_destroy_affinity_mask_ #define FTN_SET_AFFINITY_MASK_PROC kmp_set_affinity_mask_proc_ #define FTN_UNSET_AFFINITY_MASK_PROC kmp_unset_affinity_mask_proc_ #define FTN_GET_AFFINITY_MASK_PROC kmp_get_affinity_mask_proc_ #define FTN_MALLOC kmp_malloc_ #define FTN_ALIGNED_MALLOC kmp_aligned_malloc_ #define FTN_CALLOC kmp_calloc_ #define FTN_REALLOC kmp_realloc_ #define FTN_KFREE kmp_free_ #define FTN_GET_NUM_KNOWN_THREADS kmp_get_num_known_threads_ #define FTN_SET_NUM_THREADS omp_set_num_threads_ #define FTN_GET_NUM_THREADS omp_get_num_threads_ #define FTN_GET_MAX_THREADS omp_get_max_threads_ #define FTN_GET_THREAD_NUM omp_get_thread_num_ #define FTN_GET_NUM_PROCS omp_get_num_procs_ #define FTN_SET_DYNAMIC omp_set_dynamic_ #define FTN_GET_DYNAMIC omp_get_dynamic_ #define FTN_SET_NESTED omp_set_nested_ #define FTN_GET_NESTED omp_get_nested_ #define FTN_IN_PARALLEL omp_in_parallel_ #define FTN_GET_THREAD_LIMIT omp_get_thread_limit_ #define FTN_SET_SCHEDULE omp_set_schedule_ #define FTN_GET_SCHEDULE omp_get_schedule_ #define FTN_SET_MAX_ACTIVE_LEVELS omp_set_max_active_levels_ #define FTN_GET_MAX_ACTIVE_LEVELS omp_get_max_active_levels_ #define FTN_GET_ACTIVE_LEVEL omp_get_active_level_ #define FTN_GET_LEVEL omp_get_level_ #define FTN_GET_ANCESTOR_THREAD_NUM omp_get_ancestor_thread_num_ #define FTN_GET_TEAM_SIZE omp_get_team_size_ #define 
FTN_IN_FINAL omp_in_final_ #define FTN_GET_PROC_BIND omp_get_proc_bind_ #define FTN_GET_NUM_TEAMS omp_get_num_teams_ #define FTN_GET_TEAM_NUM omp_get_team_num_ #define FTN_INIT_LOCK omp_init_lock_ #if KMP_USE_DYNAMIC_LOCK #define FTN_INIT_LOCK_WITH_HINT omp_init_lock_with_hint_ #define FTN_INIT_NEST_LOCK_WITH_HINT omp_init_nest_lock_with_hint_ #endif #define FTN_DESTROY_LOCK omp_destroy_lock_ #define FTN_SET_LOCK omp_set_lock_ #define FTN_UNSET_LOCK omp_unset_lock_ #define FTN_TEST_LOCK omp_test_lock_ #define FTN_INIT_NEST_LOCK omp_init_nest_lock_ #define FTN_DESTROY_NEST_LOCK omp_destroy_nest_lock_ #define FTN_SET_NEST_LOCK omp_set_nest_lock_ #define FTN_UNSET_NEST_LOCK omp_unset_nest_lock_ #define FTN_TEST_NEST_LOCK omp_test_nest_lock_ #define FTN_SET_WARNINGS_ON kmp_set_warnings_on_ #define FTN_SET_WARNINGS_OFF kmp_set_warnings_off_ #define FTN_GET_WTIME omp_get_wtime_ #define FTN_GET_WTICK omp_get_wtick_ #define FTN_GET_NUM_DEVICES omp_get_num_devices_ #define FTN_GET_DEFAULT_DEVICE omp_get_default_device_ #define FTN_SET_DEFAULT_DEVICE omp_set_default_device_ #define FTN_IS_INITIAL_DEVICE omp_is_initial_device_ #define FTN_GET_CANCELLATION omp_get_cancellation_ #define FTN_GET_CANCELLATION_STATUS kmp_get_cancellation_status_ #define FTN_GET_MAX_TASK_PRIORITY omp_get_max_task_priority_ #define FTN_GET_NUM_PLACES omp_get_num_places_ #define FTN_GET_PLACE_NUM_PROCS omp_get_place_num_procs_ #define FTN_GET_PLACE_PROC_IDS omp_get_place_proc_ids_ #define FTN_GET_PLACE_NUM omp_get_place_num_ #define FTN_GET_PARTITION_NUM_PLACES omp_get_partition_num_places_ #define FTN_GET_PARTITION_PLACE_NUMS omp_get_partition_place_nums_ #define FTN_GET_INITIAL_DEVICE omp_get_initial_device_ #ifdef KMP_STUB #define FTN_TARGET_ALLOC omp_target_alloc_ #define FTN_TARGET_FREE omp_target_free_ #define FTN_TARGET_IS_PRESENT omp_target_is_present_ #define FTN_TARGET_MEMCPY omp_target_memcpy_ #define FTN_TARGET_MEMCPY_RECT omp_target_memcpy_rect_ #define FTN_TARGET_ASSOCIATE_PTR 
omp_target_associate_ptr_ #define FTN_TARGET_DISASSOCIATE_PTR omp_target_disassociate_ptr_ #endif #define FTN_CONTROL_TOOL omp_control_tool_ #define FTN_INIT_ALLOCATOR omp_init_allocator_ #define FTN_DESTROY_ALLOCATOR omp_destroy_allocator_ #define FTN_SET_DEFAULT_ALLOCATOR omp_set_default_allocator_ #define FTN_GET_DEFAULT_ALLOCATOR omp_get_default_allocator_ #define FTN_ALLOC omp_alloc_ #define FTN_FREE omp_free_ #define FTN_GET_DEVICE_NUM omp_get_device_num_ #define FTN_SET_AFFINITY_FORMAT omp_set_affinity_format_ #define FTN_GET_AFFINITY_FORMAT omp_get_affinity_format_ #define FTN_DISPLAY_AFFINITY omp_display_affinity_ #define FTN_CAPTURE_AFFINITY omp_capture_affinity_ #define FTN_PAUSE_RESOURCE omp_pause_resource_ #define FTN_PAUSE_RESOURCE_ALL omp_pause_resource_all_ #define FTN_GET_SUPPORTED_ACTIVE_LEVELS omp_get_supported_active_levels_ #define FTN_FULFILL_EVENT omp_fulfill_event_ #endif /* KMP_FTN_APPEND */ /* ------------------------------------------------------------------------ */ #if KMP_FTN_ENTRIES == KMP_FTN_UPPER #define FTN_SET_STACKSIZE KMP_SET_STACKSIZE #define FTN_SET_STACKSIZE_S KMP_SET_STACKSIZE_S #define FTN_GET_STACKSIZE KMP_GET_STACKSIZE #define FTN_GET_STACKSIZE_S KMP_GET_STACKSIZE_S #define FTN_SET_BLOCKTIME KMP_SET_BLOCKTIME #define FTN_GET_BLOCKTIME KMP_GET_BLOCKTIME #define FTN_SET_LIBRARY_SERIAL KMP_SET_LIBRARY_SERIAL #define FTN_SET_LIBRARY_TURNAROUND KMP_SET_LIBRARY_TURNAROUND #define FTN_SET_LIBRARY_THROUGHPUT KMP_SET_LIBRARY_THROUGHPUT #define FTN_SET_LIBRARY KMP_SET_LIBRARY #define FTN_GET_LIBRARY KMP_GET_LIBRARY #define FTN_SET_DEFAULTS KMP_SET_DEFAULTS #define FTN_SET_DISP_NUM_BUFFERS KMP_SET_DISP_NUM_BUFFERS #define FTN_SET_AFFINITY KMP_SET_AFFINITY #define FTN_GET_AFFINITY KMP_GET_AFFINITY #define FTN_GET_AFFINITY_MAX_PROC KMP_GET_AFFINITY_MAX_PROC #define FTN_CREATE_AFFINITY_MASK KMP_CREATE_AFFINITY_MASK #define FTN_DESTROY_AFFINITY_MASK KMP_DESTROY_AFFINITY_MASK #define FTN_SET_AFFINITY_MASK_PROC KMP_SET_AFFINITY_MASK_PROC 
#define FTN_UNSET_AFFINITY_MASK_PROC KMP_UNSET_AFFINITY_MASK_PROC #define FTN_GET_AFFINITY_MASK_PROC KMP_GET_AFFINITY_MASK_PROC #define FTN_MALLOC KMP_MALLOC #define FTN_ALIGNED_MALLOC KMP_ALIGNED_MALLOC #define FTN_CALLOC KMP_CALLOC #define FTN_REALLOC KMP_REALLOC #define FTN_KFREE KMP_FREE #define FTN_GET_NUM_KNOWN_THREADS KMP_GET_NUM_KNOWN_THREADS #define FTN_SET_NUM_THREADS OMP_SET_NUM_THREADS #define FTN_GET_NUM_THREADS OMP_GET_NUM_THREADS #define FTN_GET_MAX_THREADS OMP_GET_MAX_THREADS #define FTN_GET_THREAD_NUM OMP_GET_THREAD_NUM #define FTN_GET_NUM_PROCS OMP_GET_NUM_PROCS #define FTN_SET_DYNAMIC OMP_SET_DYNAMIC #define FTN_GET_DYNAMIC OMP_GET_DYNAMIC #define FTN_SET_NESTED OMP_SET_NESTED #define FTN_GET_NESTED OMP_GET_NESTED #define FTN_IN_PARALLEL OMP_IN_PARALLEL #define FTN_GET_THREAD_LIMIT OMP_GET_THREAD_LIMIT #define FTN_SET_SCHEDULE OMP_SET_SCHEDULE #define FTN_GET_SCHEDULE OMP_GET_SCHEDULE #define FTN_SET_MAX_ACTIVE_LEVELS OMP_SET_MAX_ACTIVE_LEVELS #define FTN_GET_MAX_ACTIVE_LEVELS OMP_GET_MAX_ACTIVE_LEVELS #define FTN_GET_ACTIVE_LEVEL OMP_GET_ACTIVE_LEVEL #define FTN_GET_LEVEL OMP_GET_LEVEL #define FTN_GET_ANCESTOR_THREAD_NUM OMP_GET_ANCESTOR_THREAD_NUM #define FTN_GET_TEAM_SIZE OMP_GET_TEAM_SIZE #define FTN_IN_FINAL OMP_IN_FINAL #define FTN_GET_PROC_BIND OMP_GET_PROC_BIND #define FTN_GET_NUM_TEAMS OMP_GET_NUM_TEAMS #define FTN_GET_TEAM_NUM OMP_GET_TEAM_NUM #define FTN_INIT_LOCK OMP_INIT_LOCK #if KMP_USE_DYNAMIC_LOCK #define FTN_INIT_LOCK_WITH_HINT OMP_INIT_LOCK_WITH_HINT #define FTN_INIT_NEST_LOCK_WITH_HINT OMP_INIT_NEST_LOCK_WITH_HINT #endif #define FTN_DESTROY_LOCK OMP_DESTROY_LOCK #define FTN_SET_LOCK OMP_SET_LOCK #define FTN_UNSET_LOCK OMP_UNSET_LOCK #define FTN_TEST_LOCK OMP_TEST_LOCK #define FTN_INIT_NEST_LOCK OMP_INIT_NEST_LOCK #define FTN_DESTROY_NEST_LOCK OMP_DESTROY_NEST_LOCK #define FTN_SET_NEST_LOCK OMP_SET_NEST_LOCK #define FTN_UNSET_NEST_LOCK OMP_UNSET_NEST_LOCK #define FTN_TEST_NEST_LOCK OMP_TEST_NEST_LOCK #define FTN_SET_WARNINGS_ON 
KMP_SET_WARNINGS_ON #define FTN_SET_WARNINGS_OFF KMP_SET_WARNINGS_OFF #define FTN_GET_WTIME OMP_GET_WTIME #define FTN_GET_WTICK OMP_GET_WTICK #define FTN_GET_NUM_DEVICES OMP_GET_NUM_DEVICES #define FTN_GET_DEFAULT_DEVICE OMP_GET_DEFAULT_DEVICE #define FTN_SET_DEFAULT_DEVICE OMP_SET_DEFAULT_DEVICE #define FTN_IS_INITIAL_DEVICE OMP_IS_INITIAL_DEVICE #define FTN_GET_CANCELLATION OMP_GET_CANCELLATION #define FTN_GET_CANCELLATION_STATUS KMP_GET_CANCELLATION_STATUS #define FTN_GET_MAX_TASK_PRIORITY OMP_GET_MAX_TASK_PRIORITY #define FTN_GET_NUM_PLACES OMP_GET_NUM_PLACES #define FTN_GET_PLACE_NUM_PROCS OMP_GET_PLACE_NUM_PROCS #define FTN_GET_PLACE_PROC_IDS OMP_GET_PLACE_PROC_IDS #define FTN_GET_PLACE_NUM OMP_GET_PLACE_NUM #define FTN_GET_PARTITION_NUM_PLACES OMP_GET_PARTITION_NUM_PLACES #define FTN_GET_PARTITION_PLACE_NUMS OMP_GET_PARTITION_PLACE_NUMS #define FTN_GET_INITIAL_DEVICE OMP_GET_INITIAL_DEVICE #ifdef KMP_STUB #define FTN_TARGET_ALLOC OMP_TARGET_ALLOC #define FTN_TARGET_FREE OMP_TARGET_FREE #define FTN_TARGET_IS_PRESENT OMP_TARGET_IS_PRESENT #define FTN_TARGET_MEMCPY OMP_TARGET_MEMCPY #define FTN_TARGET_MEMCPY_RECT OMP_TARGET_MEMCPY_RECT #define FTN_TARGET_ASSOCIATE_PTR OMP_TARGET_ASSOCIATE_PTR #define FTN_TARGET_DISASSOCIATE_PTR OMP_TARGET_DISASSOCIATE_PTR #endif #define FTN_CONTROL_TOOL OMP_CONTROL_TOOL #define FTN_INIT_ALLOCATOR OMP_INIT_ALLOCATOR #define FTN_DESTROY_ALLOCATOR OMP_DESTROY_ALLOCATOR #define FTN_SET_DEFAULT_ALLOCATOR OMP_SET_DEFAULT_ALLOCATOR #define FTN_GET_DEFAULT_ALLOCATOR OMP_GET_DEFAULT_ALLOCATOR #define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM #define FTN_SET_AFFINITY_FORMAT OMP_SET_AFFINITY_FORMAT #define FTN_GET_AFFINITY_FORMAT OMP_GET_AFFINITY_FORMAT #define FTN_DISPLAY_AFFINITY OMP_DISPLAY_AFFINITY #define FTN_CAPTURE_AFFINITY OMP_CAPTURE_AFFINITY #define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE #define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL #define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS #define 
FTN_FULFILL_EVENT OMP_FULFILL_EVENT #endif /* KMP_FTN_UPPER */ /* ------------------------------------------------------------------------ */ #if KMP_FTN_ENTRIES == KMP_FTN_UAPPEND #define FTN_SET_STACKSIZE KMP_SET_STACKSIZE_ #define FTN_SET_STACKSIZE_S KMP_SET_STACKSIZE_S_ #define FTN_GET_STACKSIZE KMP_GET_STACKSIZE_ #define FTN_GET_STACKSIZE_S KMP_GET_STACKSIZE_S_ #define FTN_SET_BLOCKTIME KMP_SET_BLOCKTIME_ #define FTN_GET_BLOCKTIME KMP_GET_BLOCKTIME_ #define FTN_SET_LIBRARY_SERIAL KMP_SET_LIBRARY_SERIAL_ #define FTN_SET_LIBRARY_TURNAROUND KMP_SET_LIBRARY_TURNAROUND_ #define FTN_SET_LIBRARY_THROUGHPUT KMP_SET_LIBRARY_THROUGHPUT_ #define FTN_SET_LIBRARY KMP_SET_LIBRARY_ #define FTN_GET_LIBRARY KMP_GET_LIBRARY_ #define FTN_SET_DEFAULTS KMP_SET_DEFAULTS_ #define FTN_SET_DISP_NUM_BUFFERS KMP_SET_DISP_NUM_BUFFERS_ #define FTN_SET_AFFINITY KMP_SET_AFFINITY_ #define FTN_GET_AFFINITY KMP_GET_AFFINITY_ #define FTN_GET_AFFINITY_MAX_PROC KMP_GET_AFFINITY_MAX_PROC_ #define FTN_CREATE_AFFINITY_MASK KMP_CREATE_AFFINITY_MASK_ #define FTN_DESTROY_AFFINITY_MASK KMP_DESTROY_AFFINITY_MASK_ #define FTN_SET_AFFINITY_MASK_PROC KMP_SET_AFFINITY_MASK_PROC_ #define FTN_UNSET_AFFINITY_MASK_PROC KMP_UNSET_AFFINITY_MASK_PROC_ #define FTN_GET_AFFINITY_MASK_PROC KMP_GET_AFFINITY_MASK_PROC_ #define FTN_MALLOC KMP_MALLOC_ #define FTN_ALIGNED_MALLOC KMP_ALIGNED_MALLOC_ #define FTN_CALLOC KMP_CALLOC_ #define FTN_REALLOC KMP_REALLOC_ #define FTN_KFREE KMP_FREE_ #define FTN_GET_NUM_KNOWN_THREADS KMP_GET_NUM_KNOWN_THREADS_ #define FTN_SET_NUM_THREADS OMP_SET_NUM_THREADS_ #define FTN_GET_NUM_THREADS OMP_GET_NUM_THREADS_ #define FTN_GET_MAX_THREADS OMP_GET_MAX_THREADS_ #define FTN_GET_THREAD_NUM OMP_GET_THREAD_NUM_ #define FTN_GET_NUM_PROCS OMP_GET_NUM_PROCS_ #define FTN_SET_DYNAMIC OMP_SET_DYNAMIC_ #define FTN_GET_DYNAMIC OMP_GET_DYNAMIC_ #define FTN_SET_NESTED OMP_SET_NESTED_ #define FTN_GET_NESTED OMP_GET_NESTED_ #define FTN_IN_PARALLEL OMP_IN_PARALLEL_ #define FTN_GET_THREAD_LIMIT 
OMP_GET_THREAD_LIMIT_ #define FTN_SET_SCHEDULE OMP_SET_SCHEDULE_ #define FTN_GET_SCHEDULE OMP_GET_SCHEDULE_ #define FTN_SET_MAX_ACTIVE_LEVELS OMP_SET_MAX_ACTIVE_LEVELS_ #define FTN_GET_MAX_ACTIVE_LEVELS OMP_GET_MAX_ACTIVE_LEVELS_ #define FTN_GET_ACTIVE_LEVEL OMP_GET_ACTIVE_LEVEL_ #define FTN_GET_LEVEL OMP_GET_LEVEL_ #define FTN_GET_ANCESTOR_THREAD_NUM OMP_GET_ANCESTOR_THREAD_NUM_ #define FTN_GET_TEAM_SIZE OMP_GET_TEAM_SIZE_ #define FTN_IN_FINAL OMP_IN_FINAL_ #define FTN_GET_PROC_BIND OMP_GET_PROC_BIND_ #define FTN_GET_NUM_TEAMS OMP_GET_NUM_TEAMS_ #define FTN_GET_TEAM_NUM OMP_GET_TEAM_NUM_ #define FTN_INIT_LOCK OMP_INIT_LOCK_ #if KMP_USE_DYNAMIC_LOCK #define FTN_INIT_LOCK_WITH_HINT OMP_INIT_LOCK_WITH_HINT_ #define FTN_INIT_NEST_LOCK_WITH_HINT OMP_INIT_NEST_LOCK_WITH_HINT_ #endif #define FTN_DESTROY_LOCK OMP_DESTROY_LOCK_ #define FTN_SET_LOCK OMP_SET_LOCK_ #define FTN_UNSET_LOCK OMP_UNSET_LOCK_ #define FTN_TEST_LOCK OMP_TEST_LOCK_ #define FTN_INIT_NEST_LOCK OMP_INIT_NEST_LOCK_ #define FTN_DESTROY_NEST_LOCK OMP_DESTROY_NEST_LOCK_ #define FTN_SET_NEST_LOCK OMP_SET_NEST_LOCK_ #define FTN_UNSET_NEST_LOCK OMP_UNSET_NEST_LOCK_ #define FTN_TEST_NEST_LOCK OMP_TEST_NEST_LOCK_ #define FTN_SET_WARNINGS_ON KMP_SET_WARNINGS_ON_ #define FTN_SET_WARNINGS_OFF KMP_SET_WARNINGS_OFF_ #define FTN_GET_WTIME OMP_GET_WTIME_ #define FTN_GET_WTICK OMP_GET_WTICK_ #define FTN_GET_NUM_DEVICES OMP_GET_NUM_DEVICES_ #define FTN_GET_DEFAULT_DEVICE OMP_GET_DEFAULT_DEVICE_ #define FTN_SET_DEFAULT_DEVICE OMP_SET_DEFAULT_DEVICE_ #define FTN_IS_INITIAL_DEVICE OMP_IS_INITIAL_DEVICE_ #define FTN_GET_CANCELLATION OMP_GET_CANCELLATION_ #define FTN_GET_CANCELLATION_STATUS KMP_GET_CANCELLATION_STATUS_ #define FTN_GET_MAX_TASK_PRIORITY OMP_GET_MAX_TASK_PRIORITY_ #define FTN_GET_NUM_PLACES OMP_GET_NUM_PLACES_ #define FTN_GET_PLACE_NUM_PROCS OMP_GET_PLACE_NUM_PROCS_ #define FTN_GET_PLACE_PROC_IDS OMP_GET_PLACE_PROC_IDS_ #define FTN_GET_PLACE_NUM OMP_GET_PLACE_NUM_ #define FTN_GET_PARTITION_NUM_PLACES 
OMP_GET_PARTITION_NUM_PLACES_ #define FTN_GET_PARTITION_PLACE_NUMS OMP_GET_PARTITION_PLACE_NUMS_ #define FTN_GET_INITIAL_DEVICE OMP_GET_INITIAL_DEVICE_ #ifdef KMP_STUB #define FTN_TARGET_ALLOC OMP_TARGET_ALLOC_ #define FTN_TARGET_FREE OMP_TARGET_FREE_ #define FTN_TARGET_IS_PRESENT OMP_TARGET_IS_PRESENT_ #define FTN_TARGET_MEMCPY OMP_TARGET_MEMCPY_ #define FTN_TARGET_MEMCPY_RECT OMP_TARGET_MEMCPY_RECT_ #define FTN_TARGET_ASSOCIATE_PTR OMP_TARGET_ASSOCIATE_PTR_ #define FTN_TARGET_DISASSOCIATE_PTR OMP_TARGET_DISASSOCIATE_PTR_ #endif #define FTN_CONTROL_TOOL OMP_CONTROL_TOOL_ #define FTN_INIT_ALLOCATOR OMP_INIT_ALLOCATOR_ #define FTN_DESTROY_ALLOCATOR OMP_DESTROY_ALLOCATOR_ #define FTN_SET_DEFAULT_ALLOCATOR OMP_SET_DEFAULT_ALLOCATOR_ #define FTN_GET_DEFAULT_ALLOCATOR OMP_GET_DEFAULT_ALLOCATOR_ #define FTN_ALLOC OMP_ALLOC_ #define FTN_FREE OMP_FREE_ #define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM_ #define FTN_SET_AFFINITY_FORMAT OMP_SET_AFFINITY_FORMAT_ #define FTN_GET_AFFINITY_FORMAT OMP_GET_AFFINITY_FORMAT_ #define FTN_DISPLAY_AFFINITY OMP_DISPLAY_AFFINITY_ #define FTN_CAPTURE_AFFINITY OMP_CAPTURE_AFFINITY_ #define FTN_PAUSE_RESOURCE OMP_PAUSE_RESOURCE_ #define FTN_PAUSE_RESOURCE_ALL OMP_PAUSE_RESOURCE_ALL_ #define FTN_GET_SUPPORTED_ACTIVE_LEVELS OMP_GET_SUPPORTED_ACTIVE_LEVELS_ #define FTN_FULFILL_EVENT OMP_FULFILL_EVENT_ #endif /* KMP_FTN_UAPPEND */ /* -------------------------- GOMP API NAMES ------------------------ */ // All GOMP_1.0 symbols #define KMP_API_NAME_GOMP_ATOMIC_END GOMP_atomic_end #define KMP_API_NAME_GOMP_ATOMIC_START GOMP_atomic_start #define KMP_API_NAME_GOMP_BARRIER GOMP_barrier #define KMP_API_NAME_GOMP_CRITICAL_END GOMP_critical_end #define KMP_API_NAME_GOMP_CRITICAL_NAME_END GOMP_critical_name_end #define KMP_API_NAME_GOMP_CRITICAL_NAME_START GOMP_critical_name_start #define KMP_API_NAME_GOMP_CRITICAL_START GOMP_critical_start #define KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT GOMP_loop_dynamic_next #define KMP_API_NAME_GOMP_LOOP_DYNAMIC_START 
GOMP_loop_dynamic_start #define KMP_API_NAME_GOMP_LOOP_END GOMP_loop_end #define KMP_API_NAME_GOMP_LOOP_END_NOWAIT GOMP_loop_end_nowait #define KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT GOMP_loop_guided_next #define KMP_API_NAME_GOMP_LOOP_GUIDED_START GOMP_loop_guided_start #define KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT \ GOMP_loop_ordered_dynamic_next #define KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START \ GOMP_loop_ordered_dynamic_start #define KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT GOMP_loop_ordered_guided_next #define KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START \ GOMP_loop_ordered_guided_start #define KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT \ GOMP_loop_ordered_runtime_next #define KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START \ GOMP_loop_ordered_runtime_start #define KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT GOMP_loop_ordered_static_next #define KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START \ GOMP_loop_ordered_static_start #define KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT GOMP_loop_runtime_next #define KMP_API_NAME_GOMP_LOOP_RUNTIME_START GOMP_loop_runtime_start #define KMP_API_NAME_GOMP_LOOP_STATIC_NEXT GOMP_loop_static_next #define KMP_API_NAME_GOMP_LOOP_STATIC_START GOMP_loop_static_start #define KMP_API_NAME_GOMP_ORDERED_END GOMP_ordered_end #define KMP_API_NAME_GOMP_ORDERED_START GOMP_ordered_start #define KMP_API_NAME_GOMP_PARALLEL_END GOMP_parallel_end #define KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START \ GOMP_parallel_loop_dynamic_start #define KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START \ GOMP_parallel_loop_guided_start #define KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START \ GOMP_parallel_loop_runtime_start #define KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START \ GOMP_parallel_loop_static_start #define KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START GOMP_parallel_sections_start #define KMP_API_NAME_GOMP_PARALLEL_START GOMP_parallel_start #define KMP_API_NAME_GOMP_SECTIONS_END GOMP_sections_end #define KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT 
GOMP_sections_end_nowait #define KMP_API_NAME_GOMP_SECTIONS_NEXT GOMP_sections_next #define KMP_API_NAME_GOMP_SECTIONS_START GOMP_sections_start #define KMP_API_NAME_GOMP_SINGLE_COPY_END GOMP_single_copy_end #define KMP_API_NAME_GOMP_SINGLE_COPY_START GOMP_single_copy_start #define KMP_API_NAME_GOMP_SINGLE_START GOMP_single_start // All GOMP_2.0 symbols #define KMP_API_NAME_GOMP_TASK GOMP_task #define KMP_API_NAME_GOMP_TASKWAIT GOMP_taskwait #define KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT GOMP_loop_ull_dynamic_next #define KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START GOMP_loop_ull_dynamic_start #define KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT GOMP_loop_ull_guided_next #define KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START GOMP_loop_ull_guided_start #define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT \ GOMP_loop_ull_ordered_dynamic_next #define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START \ GOMP_loop_ull_ordered_dynamic_start #define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT \ GOMP_loop_ull_ordered_guided_next #define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START \ GOMP_loop_ull_ordered_guided_start #define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT \ GOMP_loop_ull_ordered_runtime_next #define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START \ GOMP_loop_ull_ordered_runtime_start #define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT \ GOMP_loop_ull_ordered_static_next #define KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START \ GOMP_loop_ull_ordered_static_start #define KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT GOMP_loop_ull_runtime_next #define KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START GOMP_loop_ull_runtime_start #define KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT GOMP_loop_ull_static_next #define KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START GOMP_loop_ull_static_start // All GOMP_3.0 symbols #define KMP_API_NAME_GOMP_TASKYIELD GOMP_taskyield // All GOMP_4.0 symbols #define KMP_API_NAME_GOMP_BARRIER_CANCEL GOMP_barrier_cancel #define KMP_API_NAME_GOMP_CANCEL GOMP_cancel 
#define KMP_API_NAME_GOMP_CANCELLATION_POINT GOMP_cancellation_point #define KMP_API_NAME_GOMP_LOOP_END_CANCEL GOMP_loop_end_cancel #define KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC GOMP_parallel_loop_dynamic #define KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED GOMP_parallel_loop_guided #define KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME GOMP_parallel_loop_runtime #define KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC GOMP_parallel_loop_static #define KMP_API_NAME_GOMP_PARALLEL_SECTIONS GOMP_parallel_sections #define KMP_API_NAME_GOMP_PARALLEL GOMP_parallel #define KMP_API_NAME_GOMP_SECTIONS_END_CANCEL GOMP_sections_end_cancel #define KMP_API_NAME_GOMP_TASKGROUP_START GOMP_taskgroup_start #define KMP_API_NAME_GOMP_TASKGROUP_END GOMP_taskgroup_end /* Target functions should be taken care of by liboffload */ #define KMP_API_NAME_GOMP_TARGET GOMP_target #define KMP_API_NAME_GOMP_TARGET_DATA GOMP_target_data #define KMP_API_NAME_GOMP_TARGET_END_DATA GOMP_target_end_data #define KMP_API_NAME_GOMP_TARGET_UPDATE GOMP_target_update #define KMP_API_NAME_GOMP_TEAMS GOMP_teams // All GOMP_4.5 symbols #define KMP_API_NAME_GOMP_TASKLOOP GOMP_taskloop #define KMP_API_NAME_GOMP_TASKLOOP_ULL GOMP_taskloop_ull #define KMP_API_NAME_GOMP_DOACROSS_POST GOMP_doacross_post #define KMP_API_NAME_GOMP_DOACROSS_WAIT GOMP_doacross_wait #define KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START \ GOMP_loop_doacross_static_start #define KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START \ GOMP_loop_doacross_dynamic_start #define KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START \ GOMP_loop_doacross_guided_start #define KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START \ GOMP_loop_doacross_runtime_start #define KMP_API_NAME_GOMP_DOACROSS_ULL_POST GOMP_doacross_ull_post #define KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT GOMP_doacross_ull_wait #define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START \ GOMP_loop_ull_doacross_static_start #define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START \ GOMP_loop_ull_doacross_dynamic_start #define 
KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START \ GOMP_loop_ull_doacross_guided_start #define KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START \ GOMP_loop_ull_doacross_runtime_start +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_NEXT \ + GOMP_loop_nonmonotonic_dynamic_next +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_START \ + GOMP_loop_nonmonotonic_dynamic_start +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_NEXT \ + GOMP_loop_nonmonotonic_guided_next +#define KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_START \ + GOMP_loop_nonmonotonic_guided_start +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_NEXT \ + GOMP_loop_ull_nonmonotonic_dynamic_next +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_START \ + GOMP_loop_ull_nonmonotonic_dynamic_start +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_NEXT \ + GOMP_loop_ull_nonmonotonic_guided_next +#define KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_START \ + GOMP_loop_ull_nonmonotonic_guided_start +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_DYNAMIC \ + GOMP_parallel_loop_nonmonotonic_dynamic +#define KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED \ + GOMP_parallel_loop_nonmonotonic_guided #endif /* KMP_FTN_OS_H */ Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_global.cpp =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_global.cpp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_global.cpp (revision 357059) @@ -1,534 +1,534 @@ /* * kmp_global.cpp -- KPTS global variables for runtime support library */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "kmp.h" #include "kmp_affinity.h" #if KMP_USE_HIER_SCHED #include "kmp_dispatch_hier.h" #endif kmp_key_t __kmp_gtid_threadprivate_key; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 kmp_cpuinfo_t __kmp_cpuinfo = {0}; // Not initialized #endif #if KMP_STATS_ENABLED #include "kmp_stats.h" // lock for modifying the global __kmp_stats_list kmp_tas_lock_t __kmp_stats_lock; // global list of per thread stats, the head is a sentinel node which // accumulates all stats produced before __kmp_create_worker is called. kmp_stats_list *__kmp_stats_list; // thread local pointer to stats node within list KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr = NULL; // gives reference tick for all events (considered the 0 tick) tsc_tick_count __kmp_stats_start_time; #endif /* ----------------------------------------------------- */ /* INITIALIZATION VARIABLES */ /* they are syncronized to write during init, but read anytime */ volatile int __kmp_init_serial = FALSE; volatile int __kmp_init_gtid = FALSE; volatile int __kmp_init_common = FALSE; volatile int __kmp_init_middle = FALSE; volatile int __kmp_init_parallel = FALSE; #if KMP_USE_MONITOR volatile int __kmp_init_monitor = 0; /* 1 - launched, 2 - actually started (Windows* OS only) */ #endif volatile int __kmp_init_user_locks = FALSE; /* list of address of allocated caches for commons */ kmp_cached_addr_t *__kmp_threadpriv_cache_list = NULL; int __kmp_init_counter = 0; int __kmp_root_counter = 0; int __kmp_version = 0; std::atomic __kmp_team_counter = ATOMIC_VAR_INIT(0); std::atomic __kmp_task_counter = ATOMIC_VAR_INIT(0); size_t __kmp_stksize = KMP_DEFAULT_STKSIZE; #if KMP_USE_MONITOR size_t __kmp_monitor_stksize = 0; // auto adjust #endif size_t __kmp_stkoffset = KMP_DEFAULT_STKOFFSET; int __kmp_stkpadding = KMP_MIN_STKPADDING; size_t __kmp_malloc_pool_incr = 
KMP_DEFAULT_MALLOC_POOL_INCR; // Barrier method defaults, settings, and strings. // branch factor = 2^branch_bits (only relevant for tree & hyper barrier types) kmp_uint32 __kmp_barrier_gather_bb_dflt = 2; /* branch_factor = 4 */ /* hyper2: C78980 */ kmp_uint32 __kmp_barrier_release_bb_dflt = 2; /* branch_factor = 4 */ /* hyper2: C78980 */ kmp_bar_pat_e __kmp_barrier_gather_pat_dflt = bp_hyper_bar; /* hyper2: C78980 */ kmp_bar_pat_e __kmp_barrier_release_pat_dflt = bp_hyper_bar; /* hyper2: C78980 */ kmp_uint32 __kmp_barrier_gather_branch_bits[bs_last_barrier] = {0}; kmp_uint32 __kmp_barrier_release_branch_bits[bs_last_barrier] = {0}; kmp_bar_pat_e __kmp_barrier_gather_pattern[bs_last_barrier] = {bp_linear_bar}; kmp_bar_pat_e __kmp_barrier_release_pattern[bs_last_barrier] = {bp_linear_bar}; char const *__kmp_barrier_branch_bit_env_name[bs_last_barrier] = { "KMP_PLAIN_BARRIER", "KMP_FORKJOIN_BARRIER" #if KMP_FAST_REDUCTION_BARRIER , "KMP_REDUCTION_BARRIER" #endif // KMP_FAST_REDUCTION_BARRIER }; char const *__kmp_barrier_pattern_env_name[bs_last_barrier] = { "KMP_PLAIN_BARRIER_PATTERN", "KMP_FORKJOIN_BARRIER_PATTERN" #if KMP_FAST_REDUCTION_BARRIER , "KMP_REDUCTION_BARRIER_PATTERN" #endif // KMP_FAST_REDUCTION_BARRIER }; char const *__kmp_barrier_type_name[bs_last_barrier] = {"plain", "forkjoin" #if KMP_FAST_REDUCTION_BARRIER , "reduction" #endif // KMP_FAST_REDUCTION_BARRIER }; char const *__kmp_barrier_pattern_name[bp_last_bar] = {"linear", "tree", "hyper", "hierarchical"}; int __kmp_allThreadsSpecified = 0; size_t __kmp_align_alloc = CACHE_LINE; int __kmp_generate_warnings = kmp_warnings_low; int __kmp_reserve_warn = 0; int __kmp_xproc = 0; int __kmp_avail_proc = 0; size_t __kmp_sys_min_stksize = KMP_MIN_STKSIZE; int __kmp_sys_max_nth = KMP_MAX_NTH; int __kmp_max_nth = 0; int __kmp_cg_max_nth = 0; int __kmp_teams_max_nth = 0; int __kmp_threads_capacity = 0; int __kmp_dflt_team_nth = 0; int __kmp_dflt_team_nth_ub = 0; int __kmp_tp_capacity = 0; int __kmp_tp_cached = 
0; int __kmp_dispatch_num_buffers = KMP_DFLT_DISP_NUM_BUFF; int __kmp_dflt_max_active_levels = 1; // Nesting off by default bool __kmp_dflt_max_active_levels_set = false; // Don't override set value #if KMP_NESTED_HOT_TEAMS int __kmp_hot_teams_mode = 0; /* 0 - free extra threads when reduced */ /* 1 - keep extra threads when reduced */ int __kmp_hot_teams_max_level = 1; /* nesting level of hot teams */ #endif enum library_type __kmp_library = library_none; enum sched_type __kmp_sched = kmp_sch_default; /* scheduling method for runtime scheduling */ enum sched_type __kmp_static = kmp_sch_static_greedy; /* default static scheduling method */ enum sched_type __kmp_guided = kmp_sch_guided_iterative_chunked; /* default guided scheduling method */ enum sched_type __kmp_auto = kmp_sch_guided_analytical_chunked; /* default auto scheduling method */ #if KMP_USE_HIER_SCHED int __kmp_dispatch_hand_threading = 0; int __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LAST + 1]; int __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LAST + 1]; kmp_hier_sched_env_t __kmp_hier_scheds = {0, 0, NULL, NULL, NULL}; #endif int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; #if KMP_USE_MONITOR int __kmp_monitor_wakeups = KMP_MIN_MONITOR_WAKEUPS; int __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(KMP_DEFAULT_BLOCKTIME, KMP_MIN_MONITOR_WAKEUPS); #endif #ifdef KMP_ADJUST_BLOCKTIME int __kmp_zero_bt = FALSE; #endif /* KMP_ADJUST_BLOCKTIME */ #ifdef KMP_DFLT_NTH_CORES int __kmp_ncores = 0; #endif int __kmp_chunk = 0; int __kmp_abort_delay = 0; #if KMP_OS_LINUX && defined(KMP_TDATA_GTID) int __kmp_gtid_mode = 3; /* use __declspec(thread) TLS to store gtid */ int __kmp_adjust_gtid_mode = FALSE; #elif KMP_OS_WINDOWS int __kmp_gtid_mode = 2; /* use TLS functions to store gtid */ int __kmp_adjust_gtid_mode = FALSE; #else int __kmp_gtid_mode = 0; /* select method to get gtid based on #threads */ int __kmp_adjust_gtid_mode = TRUE; #endif /* KMP_OS_LINUX && defined(KMP_TDATA_GTID) */ #ifdef KMP_TDATA_GTID 
KMP_THREAD_LOCAL int __kmp_gtid = KMP_GTID_DNE; #endif /* KMP_TDATA_GTID */ int __kmp_tls_gtid_min = INT_MAX; int __kmp_foreign_tp = TRUE; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 int __kmp_inherit_fp_control = TRUE; kmp_int16 __kmp_init_x87_fpu_control_word = 0; kmp_uint32 __kmp_init_mxcsr = 0; #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ #ifdef USE_LOAD_BALANCE double __kmp_load_balance_interval = 1.0; #endif /* USE_LOAD_BALANCE */ kmp_nested_nthreads_t __kmp_nested_nth = {NULL, 0, 0}; #if KMP_USE_ADAPTIVE_LOCKS kmp_adaptive_backoff_params_t __kmp_adaptive_backoff_params = { 1, 1024}; // TODO: tune it! #if KMP_DEBUG_ADAPTIVE_LOCKS const char *__kmp_speculative_statsfile = "-"; #endif #endif // KMP_USE_ADAPTIVE_LOCKS int __kmp_display_env = FALSE; int __kmp_display_env_verbose = FALSE; int __kmp_omp_cancellation = FALSE; /* map OMP 3.0 schedule types with our internal schedule types */ enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2] = { kmp_sch_static_chunked, // ==> kmp_sched_static = 1 kmp_sch_dynamic_chunked, // ==> kmp_sched_dynamic = 2 kmp_sch_guided_chunked, // ==> kmp_sched_guided = 3 kmp_sch_auto, // ==> kmp_sched_auto = 4 kmp_sch_trapezoidal // ==> kmp_sched_trapezoidal = 101 // will likely not be used, introduced here just to debug the code // of public intel extension schedules }; #if KMP_OS_LINUX enum clock_function_type __kmp_clock_function; int __kmp_clock_function_param; #endif /* KMP_OS_LINUX */ #if KMP_MIC_SUPPORTED enum mic_type __kmp_mic_type = non_mic; #endif #if KMP_AFFINITY_SUPPORTED KMPAffinity *__kmp_affinity_dispatch = NULL; #if KMP_USE_HWLOC int __kmp_hwloc_error = FALSE; hwloc_topology_t __kmp_hwloc_topology = NULL; int __kmp_numa_detected = FALSE; int __kmp_tile_depth = 0; #endif #if KMP_OS_WINDOWS #if KMP_GROUP_AFFINITY int __kmp_num_proc_groups = 1; #endif /* KMP_GROUP_AFFINITY */ kmp_GetActiveProcessorCount_t __kmp_GetActiveProcessorCount = NULL; 
kmp_GetActiveProcessorGroupCount_t __kmp_GetActiveProcessorGroupCount = NULL; kmp_GetThreadGroupAffinity_t __kmp_GetThreadGroupAffinity = NULL; kmp_SetThreadGroupAffinity_t __kmp_SetThreadGroupAffinity = NULL; #endif /* KMP_OS_WINDOWS */ size_t __kmp_affin_mask_size = 0; enum affinity_type __kmp_affinity_type = affinity_default; enum affinity_gran __kmp_affinity_gran = affinity_gran_default; int __kmp_affinity_gran_levels = -1; int __kmp_affinity_dups = TRUE; enum affinity_top_method __kmp_affinity_top_method = affinity_top_method_default; int __kmp_affinity_compact = 0; int __kmp_affinity_offset = 0; int __kmp_affinity_verbose = FALSE; int __kmp_affinity_warnings = TRUE; int __kmp_affinity_respect_mask = affinity_respect_mask_default; char *__kmp_affinity_proclist = NULL; kmp_affin_mask_t *__kmp_affinity_masks = NULL; unsigned __kmp_affinity_num_masks = 0; char *__kmp_cpuinfo_file = NULL; #endif /* KMP_AFFINITY_SUPPORTED */ kmp_nested_proc_bind_t __kmp_nested_proc_bind = {NULL, 0, 0}; int __kmp_affinity_num_places = 0; int __kmp_display_affinity = FALSE; char *__kmp_affinity_format = NULL; kmp_hws_item_t __kmp_hws_socket = {0, 0}; kmp_hws_item_t __kmp_hws_node = {0, 0}; kmp_hws_item_t __kmp_hws_tile = {0, 0}; kmp_hws_item_t __kmp_hws_core = {0, 0}; kmp_hws_item_t __kmp_hws_proc = {0, 0}; int __kmp_hws_requested = 0; int __kmp_hws_abs_flag = 0; // absolute or per-item number requested kmp_int32 __kmp_default_device = 0; kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams; kmp_int32 __kmp_max_task_priority = 0; kmp_uint64 __kmp_taskloop_min_tasks = 0; int __kmp_memkind_available = 0; omp_allocator_handle_t const omp_null_allocator = NULL; omp_allocator_handle_t const omp_default_mem_alloc = (omp_allocator_handle_t const)1; omp_allocator_handle_t const omp_large_cap_mem_alloc = (omp_allocator_handle_t const)2; omp_allocator_handle_t const omp_const_mem_alloc = (omp_allocator_handle_t const)3; omp_allocator_handle_t const omp_high_bw_mem_alloc = 
(omp_allocator_handle_t const)4; omp_allocator_handle_t const omp_low_lat_mem_alloc = (omp_allocator_handle_t const)5; omp_allocator_handle_t const omp_cgroup_mem_alloc = (omp_allocator_handle_t const)6; omp_allocator_handle_t const omp_pteam_mem_alloc = (omp_allocator_handle_t const)7; omp_allocator_handle_t const omp_thread_mem_alloc = (omp_allocator_handle_t const)8; omp_allocator_handle_t const kmp_max_mem_alloc = (omp_allocator_handle_t const)1024; omp_allocator_handle_t __kmp_def_allocator = omp_default_mem_alloc; omp_memspace_handle_t const omp_default_mem_space = (omp_memspace_handle_t const)0; omp_memspace_handle_t const omp_large_cap_mem_space = (omp_memspace_handle_t const)1; omp_memspace_handle_t const omp_const_mem_space = (omp_memspace_handle_t const)2; omp_memspace_handle_t const omp_high_bw_mem_space = (omp_memspace_handle_t const)3; omp_memspace_handle_t const omp_low_lat_mem_space = (omp_memspace_handle_t const)4; /* This check ensures that the compiler is passing the correct data type for the flags formal parameter of the function kmpc_omp_task_alloc(). If the type is not a 4-byte type, then give an error message about a non-positive length array pointing here. If that happens, the kmp_tasking_flags_t structure must be redefined to have exactly 32 bits. 
*/ KMP_BUILD_ASSERT(sizeof(kmp_tasking_flags_t) == 4); int __kmp_task_stealing_constraint = 1; /* Constrain task stealing by default */ int __kmp_enable_task_throttling = 1; #ifdef DEBUG_SUSPEND int __kmp_suspend_count = 0; #endif int __kmp_settings = FALSE; int __kmp_duplicate_library_ok = 0; #if USE_ITT_BUILD int __kmp_forkjoin_frames = 1; int __kmp_forkjoin_frames_mode = 3; #endif PACKED_REDUCTION_METHOD_T __kmp_force_reduction_method = reduction_method_not_defined; int __kmp_determ_red = FALSE; #ifdef KMP_DEBUG int kmp_a_debug = 0; int kmp_b_debug = 0; int kmp_c_debug = 0; int kmp_d_debug = 0; int kmp_e_debug = 0; int kmp_f_debug = 0; int kmp_diag = 0; #endif /* For debug information logging using rotating buffer */ int __kmp_debug_buf = FALSE; /* TRUE means use buffer, FALSE means print to stderr */ int __kmp_debug_buf_lines = KMP_DEBUG_BUF_LINES_INIT; /* Lines of debug stored in buffer */ int __kmp_debug_buf_chars = KMP_DEBUG_BUF_CHARS_INIT; /* Characters allowed per line in buffer */ int __kmp_debug_buf_atomic = FALSE; /* TRUE means use atomic update of buffer entry pointer */ char *__kmp_debug_buffer = NULL; /* Debug buffer itself */ std::atomic __kmp_debug_count = ATOMIC_VAR_INIT(0); /* number of lines printed in buffer so far */ int __kmp_debug_buf_warn_chars = 0; /* Keep track of char increase recommended in warnings */ /* end rotating debug buffer */ #ifdef KMP_DEBUG int __kmp_par_range; /* +1 => only go par for constructs in range */ /* -1 => only go par for constructs outside range */ char __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN] = {'\0'}; char __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN] = {'\0'}; int __kmp_par_range_lb = 0; int __kmp_par_range_ub = INT_MAX; #endif /* KMP_DEBUG */ /* For printing out dynamic storage map for threads and teams */ int __kmp_storage_map = FALSE; /* True means print storage map for threads and teams */ int __kmp_storage_map_verbose = FALSE; /* True means storage map includes placement info */ int 
__kmp_storage_map_verbose_specified = FALSE; /* Initialize the library data structures when we fork a child process, defaults * to TRUE */ int __kmp_need_register_atfork = TRUE; /* At initialization, call pthread_atfork to install fork handler */ int __kmp_need_register_atfork_specified = TRUE; int __kmp_env_stksize = FALSE; /* KMP_STACKSIZE specified? */ int __kmp_env_blocktime = FALSE; /* KMP_BLOCKTIME specified? */ int __kmp_env_checks = FALSE; /* KMP_CHECKS specified? */ int __kmp_env_consistency_check = FALSE; /* KMP_CONSISTENCY_CHECK specified? */ // From KMP_USE_YIELD: // 0 = never yield; // 1 = always yield (default); // 2 = yield only if oversubscribed kmp_int32 __kmp_use_yield = 1; // This will be 1 if KMP_USE_YIELD environment variable was set explicitly kmp_int32 __kmp_use_yield_exp_set = 0; kmp_uint32 __kmp_yield_init = KMP_INIT_WAIT; kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT; /* ------------------------------------------------------ */ /* STATE mostly syncronized with global lock */ /* data written to rarely by masters, read often by workers */ /* TODO: None of this global padding stuff works consistently because the order of declaration is not necessarily correlated to storage order. To fix this, all the important globals must be put in a big structure instead. 
*/ KMP_ALIGN_CACHE kmp_info_t **__kmp_threads = NULL; kmp_root_t **__kmp_root = NULL; /* data read/written to often by masters */ KMP_ALIGN_CACHE volatile int __kmp_nth = 0; volatile int __kmp_all_nth = 0; volatile kmp_info_t *__kmp_thread_pool = NULL; volatile kmp_team_t *__kmp_team_pool = NULL; KMP_ALIGN_CACHE std::atomic __kmp_thread_pool_active_nth = ATOMIC_VAR_INIT(0); /* ------------------------------------------------- * GLOBAL/ROOT STATE */ KMP_ALIGN_CACHE -kmp_global_t __kmp_global = {{0}}; +kmp_global_t __kmp_global; /* ----------------------------------------------- */ /* GLOBAL SYNCHRONIZATION LOCKS */ /* TODO verify the need for these locks and if they need to be global */ #if KMP_USE_INTERNODE_ALIGNMENT /* Multinode systems have larger cache line granularity which can cause * false sharing if the alignment is not large enough for these locks */ KMP_ALIGN_CACHE_INTERNODE KMP_BOOTSTRAP_LOCK_INIT(__kmp_initz_lock); /* Control initializations */ KMP_ALIGN_CACHE_INTERNODE KMP_BOOTSTRAP_LOCK_INIT(__kmp_forkjoin_lock); /* control fork/join access */ KMP_ALIGN_CACHE_INTERNODE KMP_BOOTSTRAP_LOCK_INIT(__kmp_exit_lock); /* exit() is not always thread-safe */ #if KMP_USE_MONITOR /* control monitor thread creation */ KMP_ALIGN_CACHE_INTERNODE KMP_BOOTSTRAP_LOCK_INIT(__kmp_monitor_lock); #endif /* used for the hack to allow threadprivate cache and __kmp_threads expansion to co-exist */ KMP_ALIGN_CACHE_INTERNODE KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock); KMP_ALIGN_CACHE_INTERNODE KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */ KMP_ALIGN_CACHE_INTERNODE kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */ KMP_ALIGN_CACHE_INTERNODE KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */ #else KMP_ALIGN_CACHE KMP_BOOTSTRAP_LOCK_INIT(__kmp_initz_lock); /* Control initializations */ KMP_BOOTSTRAP_LOCK_INIT(__kmp_forkjoin_lock); /* control fork/join access */ KMP_BOOTSTRAP_LOCK_INIT(__kmp_exit_lock); /* exit() is not 
always thread-safe */ #if KMP_USE_MONITOR /* control monitor thread creation */ KMP_BOOTSTRAP_LOCK_INIT(__kmp_monitor_lock); #endif /* used for the hack to allow threadprivate cache and __kmp_threads expansion to co-exist */ KMP_BOOTSTRAP_LOCK_INIT(__kmp_tp_cached_lock); KMP_ALIGN(128) KMP_LOCK_INIT(__kmp_global_lock); /* Control OS/global access */ KMP_ALIGN(128) kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */ KMP_ALIGN(128) KMP_LOCK_INIT(__kmp_debug_lock); /* Control I/O access for KMP_DEBUG */ #endif /* ----------------------------------------------- */ #if KMP_HANDLE_SIGNALS /* Signal handling is disabled by default, because it confuses users: In case of sigsegv (or other trouble) in user code signal handler catches the signal, which then "appears" in the monitor thread (when the monitor executes raise() function). Users see signal in the monitor thread and blame OpenMP RTL. Grant said signal handling required on some older OSes (Irix?) supported by KAI, because bad applications hung but not aborted. Currently it is not a problem for Linux* OS, OS X* and Windows* OS. Grant: Found new hangs for EL4, EL5, and a Fedora Core machine. So I'm putting the default back for now to see if that fixes hangs on those machines. 2010-04013 Lev: It was a bug in Fortran RTL. Fortran RTL prints a kind of stack backtrace when program is aborting, but the code is not signal-safe. When multiple signals raised at the same time (which occurs in dynamic negative tests because all the worker threads detects the same error), Fortran RTL may hang. The bug finally fixed in Fortran RTL library provided by Steve R., and will be available soon. */ int __kmp_handle_signals = FALSE; #endif #ifdef DEBUG_SUSPEND int get_suspend_count_(void) { int count = __kmp_suspend_count; __kmp_suspend_count = 0; return count; } void set_suspend_count_(int *value) { __kmp_suspend_count = *value; } #endif // Symbols for MS mutual detection. 
int _You_must_link_with_exactly_one_OpenMP_library = 1; int _You_must_link_with_Intel_OpenMP_library = 1; #if KMP_OS_WINDOWS && (KMP_VERSION_MAJOR > 4) int _You_must_link_with_Microsoft_OpenMP_library = 1; #endif kmp_target_offload_kind_t __kmp_target_offload = tgt_default; // OMP Pause Resources kmp_pause_status_t __kmp_pause_status = kmp_not_paused; // end of file // Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_gsupport.cpp =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_gsupport.cpp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_gsupport.cpp (revision 357059) @@ -1,1950 +1,1992 @@ /* * kmp_gsupport.cpp */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "kmp.h" #include "kmp_atomic.h" #if OMPT_SUPPORT #include "ompt-specific.h" #endif #ifdef __cplusplus extern "C" { #endif // __cplusplus #define MKLOC(loc, routine) \ - static ident_t(loc) = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"}; + static ident_t loc = {0, KMP_IDENT_KMPC, 0, 0, ";unknown;unknown;0;0;;"}; #include "kmp_ftn_os.h" void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_BARRIER)(void) { int gtid = __kmp_entry_gtid(); MKLOC(loc, "GOMP_barrier"); KA_TRACE(20, ("GOMP_barrier: T#%d\n", gtid)); #if OMPT_SUPPORT && OMPT_OPTIONAL ompt_frame_t *ompt_frame; if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); OMPT_STORE_RETURN_ADDRESS(gtid); } #endif __kmpc_barrier(&loc, gtid); #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled) { ompt_frame->enter_frame = ompt_data_none; } #endif } // Mutual exclusion // The symbol that icc/ifort generates for unnamed for unnamed critical sections // - .gomp_critical_user_ - is defined using .comm in any objects reference it. // We can't reference it directly here in C code, as the symbol contains a ".". // // The RTL contains an assembly language definition of .gomp_critical_user_ // with another symbol __kmp_unnamed_critical_addr initialized with it's // address. 
// Address of the unnamed critical section symbol (.gomp_critical_user_),
// exported from the runtime's assembly code (see the comment above).
extern kmp_critical_name *__kmp_unnamed_critical_addr;

// Enter the single, unnamed GOMP critical section shared by all unnamed
// `#pragma omp critical` regions in GCC-compiled code.
void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_START)(void) {
  int gtid = __kmp_entry_gtid();
  MKLOC(loc, "GOMP_critical_start");
  KA_TRACE(20, ("GOMP_critical_start: T#%d\n", gtid));
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmpc_critical(&loc, gtid, __kmp_unnamed_critical_addr);
}

// Leave the unnamed GOMP critical section.
void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_END)(void) {
  // __kmp_get_gtid() (not __kmp_entry_gtid()) is sufficient here: the thread
  // must already be registered, since it executed the matching _START.
  int gtid = __kmp_get_gtid();
  MKLOC(loc, "GOMP_critical_end");
  KA_TRACE(20, ("GOMP_critical_end: T#%d\n", gtid));
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmpc_end_critical(&loc, gtid, __kmp_unnamed_critical_addr);
}

// Enter a named GOMP critical section. pptr is the per-name lock storage
// allocated by the GCC codegen; it is reinterpreted as a kmp_critical_name.
void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_NAME_START)(void **pptr) {
  int gtid = __kmp_entry_gtid();
  MKLOC(loc, "GOMP_critical_name_start");
  KA_TRACE(20, ("GOMP_critical_name_start: T#%d\n", gtid));
  __kmpc_critical(&loc, gtid, (kmp_critical_name *)pptr);
}

// Leave a named GOMP critical section (same pptr as the matching _START).
void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CRITICAL_NAME_END)(void **pptr) {
  int gtid = __kmp_get_gtid();
  MKLOC(loc, "GOMP_critical_name_end");
  KA_TRACE(20, ("GOMP_critical_name_end: T#%d\n", gtid));
  __kmpc_end_critical(&loc, gtid, (kmp_critical_name *)pptr);
}

// The Gnu codegen tries to use locked operations to perform atomic updates
// inline. If it can't, then it calls GOMP_atomic_start() before performing
// the update and GOMP_atomic_end() afterward, regardless of the data type.

// Acquire the single global atomic lock backing out-of-line GOMP atomics.
void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ATOMIC_START)(void) {
  int gtid = __kmp_entry_gtid();
  KA_TRACE(20, ("GOMP_atomic_start: T#%d\n", gtid));
#if OMPT_SUPPORT
  __ompt_thread_assign_wait_id(0);
#endif
  __kmp_acquire_atomic_lock(&__kmp_atomic_lock, gtid);
}

// Release the global atomic lock taken by GOMP_atomic_start().
void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ATOMIC_END)(void) {
  int gtid = __kmp_get_gtid();
  KA_TRACE(20, ("GOMP_atomic_end: T#%d\n", gtid));
  __kmp_release_atomic_lock(&__kmp_atomic_lock, gtid);
}

// `#pragma omp single` entry point. Returns nonzero for the one thread that
// should execute the single region, zero for all others. There is no
// GOMP_single_end(); the construct's closing barrier is emitted separately.
int KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_START)(void) {
  int gtid = __kmp_entry_gtid();
  MKLOC(loc, "GOMP_single_start");
  KA_TRACE(20, ("GOMP_single_start: T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  // 3rd parameter == FALSE prevents kmp_enter_single from pushing a
  // workshare when USE_CHECKS is defined. We need to avoid the push,
  // as there is no corresponding GOMP_single_end() call.
  kmp_int32 rc = __kmp_enter_single(gtid, &loc, FALSE);

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int tid = __kmp_tid_from_gtid(gtid);

  if (ompt_enabled.enabled) {
    if (rc) {
      // This thread won the single region: report scope_begin only; the
      // matching scope_end for the executor is reported elsewhere.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_executor, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    } else {
      // Non-executing threads skip the region entirely, so both begin and
      // end of the "single_other" scope are reported back-to-back here.
      if (ompt_enabled.ompt_callback_work) {
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_begin,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
        ompt_callbacks.ompt_callback(ompt_callback_work)(
            ompt_work_single_other, ompt_scope_end,
            &(team->t.ompt_team_info.parallel_data),
            &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data),
            1, OMPT_GET_RETURN_ADDRESS(0));
      }
    }
  }
#endif

  return rc;
}

// `single copyprivate` entry point: the chosen thread returns NULL and later
// publishes its data via GOMP_single_copy_end(); every other thread blocks
// on the barriers below and returns the published copyprivate pointer.
void *KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_COPY_START)(void) {
  void *retval;
  int gtid = __kmp_entry_gtid();
  MKLOC(loc, "GOMP_single_copy_start");
  KA_TRACE(20, ("GOMP_single_copy_start: T#%d\n", gtid));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  // If this is the first thread to enter, return NULL. The generated code will
  // then call GOMP_single_copy_end() for this thread only, with the
  // copyprivate data pointer as an argument.
  if (__kmp_enter_single(gtid, &loc, FALSE))
    return NULL;

  // Wait for the first thread to set the copyprivate data pointer,
  // and for all other threads to reach this point.

#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);

  // Retrieve the value of the copyprivate data pointer, and wait for all
  // threads to do likewise, then return.
  retval = __kmp_team_from_gtid(gtid)->t.t_copypriv_data;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
  return retval;
}

// Called only by the thread that executed the single region, to publish its
// copyprivate data pointer to the rest of the team.
void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SINGLE_COPY_END)(void *data) {
  int gtid = __kmp_get_gtid();
  KA_TRACE(20, ("GOMP_single_copy_end: T#%d\n", gtid));

  // Set the copyprivate data pointer for the team, then hit the barrier so
  // that the other threads will continue on and read it. Hit another barrier
  // before continuing, so that they know the copyprivate data pointer has been
  // propagated to all threads before trying to reuse the t_copypriv_data field.
  __kmp_team_from_gtid(gtid)->t.t_copypriv_data = data;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_frame_t *ompt_frame;
  if (ompt_enabled.enabled) {
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
  }
#endif
  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.enabled) {
    ompt_frame->enter_frame = ompt_data_none;
  }
#endif
}

// `#pragma omp ordered` entry: wait until it is this iteration's turn.
void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_START)(void) {
  int gtid = __kmp_entry_gtid();
  MKLOC(loc, "GOMP_ordered_start");
  KA_TRACE(20, ("GOMP_ordered_start: T#%d\n", gtid));
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmpc_ordered(&loc, gtid);
}

// `#pragma omp ordered` exit: release the next iteration.
void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_END)(void) {
  int gtid = __kmp_get_gtid();
  MKLOC(loc, "GOMP_ordered_end");
  // NOTE(review): trace label says "GOMP_ordered_start" — looks like an
  // upstream copy/paste typo in the trace string; left byte-identical since
  // it is runtime output, not a comment.
  KA_TRACE(20, ("GOMP_ordered_start: T#%d\n", gtid));
#if OMPT_SUPPORT && OMPT_OPTIONAL
  OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  __kmpc_end_ordered(&loc, gtid);
}

// Dispatch macro defs
//
// They come in two flavors: 64-bit unsigned, and either 32-bit signed
// (IA-32 architecture) or 64-bit signed (Intel(R) 64).
#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS #define KMP_DISPATCH_INIT __kmp_aux_dispatch_init_4 #define KMP_DISPATCH_FINI_CHUNK __kmp_aux_dispatch_fini_chunk_4 #define KMP_DISPATCH_NEXT __kmpc_dispatch_next_4 #else #define KMP_DISPATCH_INIT __kmp_aux_dispatch_init_8 #define KMP_DISPATCH_FINI_CHUNK __kmp_aux_dispatch_fini_chunk_8 #define KMP_DISPATCH_NEXT __kmpc_dispatch_next_8 #endif /* KMP_ARCH_X86 */ #define KMP_DISPATCH_INIT_ULL __kmp_aux_dispatch_init_8u #define KMP_DISPATCH_FINI_CHUNK_ULL __kmp_aux_dispatch_fini_chunk_8u #define KMP_DISPATCH_NEXT_ULL __kmpc_dispatch_next_8u // The parallel contruct #ifndef KMP_DEBUG static #endif /* KMP_DEBUG */ void __kmp_GOMP_microtask_wrapper(int *gtid, int *npr, void (*task)(void *), void *data) { #if OMPT_SUPPORT kmp_info_t *thr; ompt_frame_t *ompt_frame; ompt_state_t enclosing_state; if (ompt_enabled.enabled) { // get pointer to thread data structure thr = __kmp_threads[*gtid]; // save enclosing task state; set current state for task enclosing_state = thr->th.ompt_thread_info.state; thr->th.ompt_thread_info.state = ompt_state_work_parallel; // set task frame __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); ompt_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); } #endif task(data); #if OMPT_SUPPORT if (ompt_enabled.enabled) { // clear task frame ompt_frame->exit_frame = ompt_data_none; // restore enclosing state thr->th.ompt_thread_info.state = enclosing_state; } #endif } #ifndef KMP_DEBUG static #endif /* KMP_DEBUG */ void __kmp_GOMP_parallel_microtask_wrapper(int *gtid, int *npr, void (*task)(void *), void *data, unsigned num_threads, ident_t *loc, enum sched_type schedule, long start, long end, long incr, long chunk_size) { // Intialize the loop worksharing construct. 
KMP_DISPATCH_INIT(loc, *gtid, schedule, start, end, incr, chunk_size, schedule != kmp_sch_static); #if OMPT_SUPPORT kmp_info_t *thr; ompt_frame_t *ompt_frame; ompt_state_t enclosing_state; if (ompt_enabled.enabled) { thr = __kmp_threads[*gtid]; // save enclosing task state; set current state for task enclosing_state = thr->th.ompt_thread_info.state; thr->th.ompt_thread_info.state = ompt_state_work_parallel; // set task frame __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); ompt_frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); } #endif // Now invoke the microtask. task(data); #if OMPT_SUPPORT if (ompt_enabled.enabled) { // clear task frame ompt_frame->exit_frame = ompt_data_none; // reset enclosing state thr->th.ompt_thread_info.state = enclosing_state; } #endif } #ifndef KMP_DEBUG static #endif /* KMP_DEBUG */ void __kmp_GOMP_fork_call(ident_t *loc, int gtid, void (*unwrapped_task)(void *), microtask_t wrapper, int argc, ...) { int rc; kmp_info_t *thr = __kmp_threads[gtid]; kmp_team_t *team = thr->th.th_team; int tid = __kmp_tid_from_gtid(gtid); va_list ap; va_start(ap, argc); rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc, wrapper, __kmp_invoke_task_func, #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX &ap #else ap #endif ); va_end(ap); if (rc) { __kmp_run_before_invoked_task(gtid, tid, thr, team); } #if OMPT_SUPPORT int ompt_team_size; if (ompt_enabled.enabled) { ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); ompt_task_info_t *task_info = __ompt_get_task_info_object(0); // implicit task callback if (ompt_enabled.ompt_callback_implicit_task) { ompt_team_size = __kmp_team_from_gtid(gtid)->t.t_nproc; ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( ompt_scope_begin, &(team_info->parallel_data), &(task_info->task_data), ompt_team_size, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
task_info->thread_num = __kmp_tid_from_gtid(gtid); } thr->th.ompt_thread_info.state = ompt_state_work_parallel; } #endif } static void __kmp_GOMP_serialized_parallel(ident_t *loc, kmp_int32 gtid, void (*task)(void *)) { #if OMPT_SUPPORT OMPT_STORE_RETURN_ADDRESS(gtid); #endif __kmp_serialized_parallel(loc, gtid); } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_START)(void (*task)(void *), void *data, unsigned num_threads) { int gtid = __kmp_entry_gtid(); #if OMPT_SUPPORT ompt_frame_t *parent_frame, *frame; if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL); parent_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); OMPT_STORE_RETURN_ADDRESS(gtid); } #endif MKLOC(loc, "GOMP_parallel_start"); KA_TRACE(20, ("GOMP_parallel_start: T#%d\n", gtid)); if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { if (num_threads != 0) { __kmp_push_num_threads(&loc, gtid, num_threads); } __kmp_GOMP_fork_call(&loc, gtid, task, (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, data); } else { __kmp_GOMP_serialized_parallel(&loc, gtid, task); } #if OMPT_SUPPORT if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &frame, NULL, NULL); frame->exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); } #endif } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(void) { int gtid = __kmp_get_gtid(); kmp_info_t *thr; thr = __kmp_threads[gtid]; MKLOC(loc, "GOMP_parallel_end"); KA_TRACE(20, ("GOMP_parallel_end: T#%d\n", gtid)); if (!thr->th.th_team->t.t_serialized) { __kmp_run_after_invoked_task(gtid, __kmp_tid_from_gtid(gtid), thr, thr->th.th_team); #if OMPT_SUPPORT if (ompt_enabled.enabled) { // Implicit task is finished here, in the barrier we might schedule // deferred tasks, // these don't see the implicit task on the stack OMPT_CUR_TASK_INFO(thr)->frame.exit_frame = ompt_data_none; } #endif __kmp_join_call(&loc, gtid #if OMPT_SUPPORT , fork_context_gnu #endif ); } else { __kmpc_end_serialized_parallel(&loc, gtid); } } // Loop 
worksharing constructs // The Gnu codegen passes in an exclusive upper bound for the overall range, // but the libguide dispatch code expects an inclusive upper bound, hence the // "end - incr" 5th argument to KMP_DISPATCH_INIT (and the " ub - str" 11th // argument to __kmp_GOMP_fork_call). // // Conversely, KMP_DISPATCH_NEXT returns and inclusive upper bound in *p_ub, // but the Gnu codegen expects an excluside upper bound, so the adjustment // "*p_ub += stride" compenstates for the discrepancy. // // Correction: the gnu codegen always adjusts the upper bound by +-1, not the // stride value. We adjust the dispatch parameters accordingly (by +-1), but // we still adjust p_ub by the actual stride value. // // The "runtime" versions do not take a chunk_sz parameter. // // The profile lib cannot support construct checking of unordered loops that // are predetermined by the compiler to be statically scheduled, as the gcc // codegen will not always emit calls to GOMP_loop_static_next() to get the // next iteration. Instead, it emits inline code to call omp_get_thread_num() // num and calculate the iteration space using the result. It doesn't do this // with ordered static loop, so they can be checked. #if OMPT_SUPPORT #define IF_OMPT_SUPPORT(code) code #else #define IF_OMPT_SUPPORT(code) #endif #define LOOP_START(func, schedule) \ int func(long lb, long ub, long str, long chunk_sz, long *p_lb, \ long *p_ub) { \ int status; \ long stride; \ int gtid = __kmp_entry_gtid(); \ MKLOC(loc, KMP_STR(func)); \ KA_TRACE( \ 20, \ (KMP_STR( \ func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n", \ gtid, lb, ub, str, chunk_sz)); \ \ if ((str > 0) ? (lb < ub) : (lb > ub)) { \ IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ (str > 0) ? 
(ub - 1) : (ub + 1), str, chunk_sz, \ (schedule) != kmp_sch_static); \ IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ (kmp_int *)p_ub, (kmp_int *)&stride); \ if (status) { \ KMP_DEBUG_ASSERT(stride == str); \ *p_ub += (str > 0) ? 1 : -1; \ } \ } else { \ status = 0; \ } \ \ KA_TRACE( \ 20, \ (KMP_STR( \ func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n", \ gtid, *p_lb, *p_ub, status)); \ return status; \ } #define LOOP_RUNTIME_START(func, schedule) \ int func(long lb, long ub, long str, long *p_lb, long *p_ub) { \ int status; \ long stride; \ long chunk_sz = 0; \ int gtid = __kmp_entry_gtid(); \ MKLOC(loc, KMP_STR(func)); \ KA_TRACE( \ 20, \ (KMP_STR(func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz %d\n", \ gtid, lb, ub, str, chunk_sz)); \ \ if ((str > 0) ? (lb < ub) : (lb > ub)) { \ IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, TRUE); \ IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ (kmp_int *)p_ub, (kmp_int *)&stride); \ if (status) { \ KMP_DEBUG_ASSERT(stride == str); \ *p_ub += (str > 0) ? 
1 : -1; \ } \ } else { \ status = 0; \ } \ \ KA_TRACE( \ 20, \ (KMP_STR( \ func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n", \ gtid, *p_lb, *p_ub, status)); \ return status; \ } #define KMP_DOACROSS_FINI(status, gtid) \ if (!status && __kmp_threads[gtid]->th.th_dispatch->th_doacross_flags) { \ __kmpc_doacross_fini(NULL, gtid); \ } #define LOOP_NEXT(func, fini_code) \ int func(long *p_lb, long *p_ub) { \ int status; \ long stride; \ int gtid = __kmp_get_gtid(); \ MKLOC(loc, KMP_STR(func)); \ KA_TRACE(20, (KMP_STR(func) ": T#%d\n", gtid)); \ \ IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ fini_code status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ (kmp_int *)p_ub, (kmp_int *)&stride); \ if (status) { \ *p_ub += (stride > 0) ? 1 : -1; \ } \ KMP_DOACROSS_FINI(status, gtid) \ \ KA_TRACE( \ 20, \ (KMP_STR(func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, stride 0x%lx, " \ "returning %d\n", \ gtid, *p_lb, *p_ub, stride, status)); \ return status; \ } LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_STATIC_START), kmp_sch_static) LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_STATIC_NEXT), {}) LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DYNAMIC_START), kmp_sch_dynamic_chunked) +LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_START), + kmp_sch_dynamic_chunked) LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT), {}) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_NEXT), {}) LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_GUIDED_START), kmp_sch_guided_chunked) +LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_START), + kmp_sch_guided_chunked) LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT), {}) +LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_NEXT), {}) LOOP_RUNTIME_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_START), kmp_sch_runtime) LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT), {}) 
LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START), kmp_ord_static) LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT), { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); }) LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START), kmp_ord_dynamic_chunked) LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT), { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); }) LOOP_START(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START), kmp_ord_guided_chunked) LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT), { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); }) LOOP_RUNTIME_START( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START), kmp_ord_runtime) LOOP_NEXT(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT), { KMP_DISPATCH_FINI_CHUNK(&loc, gtid); }) #define LOOP_DOACROSS_START(func, schedule) \ bool func(unsigned ncounts, long *counts, long chunk_sz, long *p_lb, \ long *p_ub) { \ int status; \ long stride, lb, ub, str; \ int gtid = __kmp_entry_gtid(); \ struct kmp_dim *dims = \ (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts); \ MKLOC(loc, KMP_STR(func)); \ for (unsigned i = 0; i < ncounts; ++i) { \ dims[i].lo = 0; \ dims[i].up = counts[i] - 1; \ dims[i].st = 1; \ } \ __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims); \ lb = 0; \ ub = counts[0]; \ str = 1; \ KA_TRACE(20, (KMP_STR(func) ": T#%d, ncounts %u, lb 0x%lx, ub 0x%lx, str " \ "0x%lx, chunk_sz " \ "0x%lx\n", \ gtid, ncounts, lb, ub, str, chunk_sz)); \ \ if ((str > 0) ? (lb < ub) : (lb > ub)) { \ KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ (schedule) != kmp_sch_static); \ status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ (kmp_int *)p_ub, (kmp_int *)&stride); \ if (status) { \ KMP_DEBUG_ASSERT(stride == str); \ *p_ub += (str > 0) ? 
1 : -1; \ } \ } else { \ status = 0; \ } \ KMP_DOACROSS_FINI(status, gtid); \ \ KA_TRACE( \ 20, \ (KMP_STR( \ func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n", \ gtid, *p_lb, *p_ub, status)); \ __kmp_free(dims); \ return status; \ } #define LOOP_DOACROSS_RUNTIME_START(func, schedule) \ int func(unsigned ncounts, long *counts, long *p_lb, long *p_ub) { \ int status; \ long stride, lb, ub, str; \ long chunk_sz = 0; \ int gtid = __kmp_entry_gtid(); \ struct kmp_dim *dims = \ (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts); \ MKLOC(loc, KMP_STR(func)); \ for (unsigned i = 0; i < ncounts; ++i) { \ dims[i].lo = 0; \ dims[i].up = counts[i] - 1; \ dims[i].st = 1; \ } \ __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims); \ lb = 0; \ ub = counts[0]; \ str = 1; \ KA_TRACE( \ 20, \ (KMP_STR(func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz %d\n", \ gtid, lb, ub, str, chunk_sz)); \ \ if ((str > 0) ? (lb < ub) : (lb > ub)) { \ KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, TRUE); \ status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, (kmp_int *)p_lb, \ (kmp_int *)p_ub, (kmp_int *)&stride); \ if (status) { \ KMP_DEBUG_ASSERT(stride == str); \ *p_ub += (str > 0) ? 
1 : -1; \ } \ } else { \ status = 0; \ } \ KMP_DOACROSS_FINI(status, gtid); \ \ KA_TRACE( \ 20, \ (KMP_STR( \ func) " exit: T#%d, *p_lb 0x%lx, *p_ub 0x%lx, returning %d\n", \ gtid, *p_lb, *p_ub, status)); \ __kmp_free(dims); \ return status; \ } LOOP_DOACROSS_START( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START), kmp_sch_static) LOOP_DOACROSS_START( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START), kmp_sch_dynamic_chunked) LOOP_DOACROSS_START( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START), kmp_sch_guided_chunked) LOOP_DOACROSS_RUNTIME_START( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START), kmp_sch_runtime) void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_END)(void) { int gtid = __kmp_get_gtid(); KA_TRACE(20, ("GOMP_loop_end: T#%d\n", gtid)) #if OMPT_SUPPORT && OMPT_OPTIONAL ompt_frame_t *ompt_frame; if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); OMPT_STORE_RETURN_ADDRESS(gtid); } #endif __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.enabled) { ompt_frame->enter_frame = ompt_data_none; } #endif KA_TRACE(20, ("GOMP_loop_end exit: T#%d\n", gtid)) } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_END_NOWAIT)(void) { KA_TRACE(20, ("GOMP_loop_end_nowait: T#%d\n", __kmp_get_gtid())) } // Unsigned long long loop worksharing constructs // // These are new with gcc 4.4 #define LOOP_START_ULL(func, schedule) \ int func(int up, unsigned long long lb, unsigned long long ub, \ unsigned long long str, unsigned long long chunk_sz, \ unsigned long long *p_lb, unsigned long long *p_ub) { \ int status; \ long long str2 = up ? 
((long long)str) : -((long long)str); \ long long stride; \ int gtid = __kmp_entry_gtid(); \ MKLOC(loc, KMP_STR(func)); \ \ KA_TRACE(20, (KMP_STR(func) ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str " \ "0x%llx, chunk_sz 0x%llx\n", \ gtid, up, lb, ub, str, chunk_sz)); \ \ if ((str > 0) ? (lb < ub) : (lb > ub)) { \ KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb, \ (str2 > 0) ? (ub - 1) : (ub + 1), str2, chunk_sz, \ (schedule) != kmp_sch_static); \ status = \ KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb, \ (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \ if (status) { \ KMP_DEBUG_ASSERT(stride == str2); \ *p_ub += (str > 0) ? 1 : -1; \ } \ } else { \ status = 0; \ } \ \ KA_TRACE( \ 20, \ (KMP_STR( \ func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \ gtid, *p_lb, *p_ub, status)); \ return status; \ } #define LOOP_RUNTIME_START_ULL(func, schedule) \ int func(int up, unsigned long long lb, unsigned long long ub, \ unsigned long long str, unsigned long long *p_lb, \ unsigned long long *p_ub) { \ int status; \ long long str2 = up ? ((long long)str) : -((long long)str); \ unsigned long long stride; \ unsigned long long chunk_sz = 0; \ int gtid = __kmp_entry_gtid(); \ MKLOC(loc, KMP_STR(func)); \ \ KA_TRACE(20, (KMP_STR(func) ": T#%d, up %d, lb 0x%llx, ub 0x%llx, str " \ "0x%llx, chunk_sz 0x%llx\n", \ gtid, up, lb, ub, str, chunk_sz)); \ \ if ((str > 0) ? (lb < ub) : (lb > ub)) { \ KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb, \ (str2 > 0) ? (ub - 1) : (ub + 1), str2, chunk_sz, \ TRUE); \ status = \ KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb, \ (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \ if (status) { \ KMP_DEBUG_ASSERT((long long)stride == str2); \ *p_ub += (str > 0) ? 
1 : -1; \ } \ } else { \ status = 0; \ } \ \ KA_TRACE( \ 20, \ (KMP_STR( \ func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \ gtid, *p_lb, *p_ub, status)); \ return status; \ } #define LOOP_NEXT_ULL(func, fini_code) \ int func(unsigned long long *p_lb, unsigned long long *p_ub) { \ int status; \ long long stride; \ int gtid = __kmp_get_gtid(); \ MKLOC(loc, KMP_STR(func)); \ KA_TRACE(20, (KMP_STR(func) ": T#%d\n", gtid)); \ \ fini_code status = \ KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb, \ (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \ if (status) { \ *p_ub += (stride > 0) ? 1 : -1; \ } \ \ KA_TRACE( \ 20, \ (KMP_STR( \ func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, stride 0x%llx, " \ "returning %d\n", \ gtid, *p_lb, *p_ub, stride, status)); \ return status; \ } LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START), kmp_sch_static) LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT), {}) LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START), kmp_sch_dynamic_chunked) LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT), {}) LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START), kmp_sch_guided_chunked) LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT), {}) +LOOP_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_START), + kmp_sch_dynamic_chunked) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_NEXT), {}) +LOOP_START_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_START), + kmp_sch_guided_chunked) +LOOP_NEXT_ULL( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_NEXT), {}) LOOP_RUNTIME_START_ULL( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START), kmp_sch_runtime) LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT), {}) LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START), 
kmp_ord_static) LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT), { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); }) LOOP_START_ULL( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START), kmp_ord_dynamic_chunked) LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT), { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); }) LOOP_START_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START), kmp_ord_guided_chunked) LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT), { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); }) LOOP_RUNTIME_START_ULL( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START), kmp_ord_runtime) LOOP_NEXT_ULL(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT), { KMP_DISPATCH_FINI_CHUNK_ULL(&loc, gtid); }) #define LOOP_DOACROSS_START_ULL(func, schedule) \ int func(unsigned ncounts, unsigned long long *counts, \ unsigned long long chunk_sz, unsigned long long *p_lb, \ unsigned long long *p_ub) { \ int status; \ long long stride, str, lb, ub; \ int gtid = __kmp_entry_gtid(); \ struct kmp_dim *dims = \ (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts); \ MKLOC(loc, KMP_STR(func)); \ for (unsigned i = 0; i < ncounts; ++i) { \ dims[i].lo = 0; \ dims[i].up = counts[i] - 1; \ dims[i].st = 1; \ } \ __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims); \ lb = 0; \ ub = counts[0]; \ str = 1; \ \ KA_TRACE(20, (KMP_STR(func) ": T#%d, lb 0x%llx, ub 0x%llx, str " \ "0x%llx, chunk_sz 0x%llx\n", \ gtid, lb, ub, str, chunk_sz)); \ \ if ((str > 0) ? (lb < ub) : (lb > ub)) { \ KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb, \ (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ (schedule) != kmp_sch_static); \ status = \ KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb, \ (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \ if (status) { \ KMP_DEBUG_ASSERT(stride == str); \ *p_ub += (str > 0) ? 
1 : -1; \ } \ } else { \ status = 0; \ } \ KMP_DOACROSS_FINI(status, gtid); \ \ KA_TRACE( \ 20, \ (KMP_STR( \ func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \ gtid, *p_lb, *p_ub, status)); \ __kmp_free(dims); \ return status; \ } #define LOOP_DOACROSS_RUNTIME_START_ULL(func, schedule) \ int func(unsigned ncounts, unsigned long long *counts, \ unsigned long long *p_lb, unsigned long long *p_ub) { \ int status; \ unsigned long long stride, str, lb, ub; \ unsigned long long chunk_sz = 0; \ int gtid = __kmp_entry_gtid(); \ struct kmp_dim *dims = \ (struct kmp_dim *)__kmp_allocate(sizeof(struct kmp_dim) * ncounts); \ MKLOC(loc, KMP_STR(func)); \ for (unsigned i = 0; i < ncounts; ++i) { \ dims[i].lo = 0; \ dims[i].up = counts[i] - 1; \ dims[i].st = 1; \ } \ __kmpc_doacross_init(&loc, gtid, (int)ncounts, dims); \ lb = 0; \ ub = counts[0]; \ str = 1; \ KA_TRACE(20, (KMP_STR(func) ": T#%d, lb 0x%llx, ub 0x%llx, str " \ "0x%llx, chunk_sz 0x%llx\n", \ gtid, lb, ub, str, chunk_sz)); \ \ if ((str > 0) ? (lb < ub) : (lb > ub)) { \ KMP_DISPATCH_INIT_ULL(&loc, gtid, (schedule), lb, \ (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz, \ TRUE); \ status = \ KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, (kmp_uint64 *)p_lb, \ (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \ if (status) { \ KMP_DEBUG_ASSERT(stride == str); \ *p_ub += (str > 0) ? 
1 : -1; \ } \ } else { \ status = 0; \ } \ KMP_DOACROSS_FINI(status, gtid); \ \ KA_TRACE( \ 20, \ (KMP_STR( \ func) " exit: T#%d, *p_lb 0x%llx, *p_ub 0x%llx, returning %d\n", \ gtid, *p_lb, *p_ub, status)); \ __kmp_free(dims); \ return status; \ } LOOP_DOACROSS_START_ULL( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START), kmp_sch_static) LOOP_DOACROSS_START_ULL( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START), kmp_sch_dynamic_chunked) LOOP_DOACROSS_START_ULL( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START), kmp_sch_guided_chunked) LOOP_DOACROSS_RUNTIME_START_ULL( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START), kmp_sch_runtime) // Combined parallel / loop worksharing constructs // // There are no ull versions (yet). #define PARALLEL_LOOP_START(func, schedule, ompt_pre, ompt_post) \ void func(void (*task)(void *), void *data, unsigned num_threads, long lb, \ long ub, long str, long chunk_sz) { \ int gtid = __kmp_entry_gtid(); \ MKLOC(loc, KMP_STR(func)); \ KA_TRACE( \ 20, \ (KMP_STR( \ func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n", \ gtid, lb, ub, str, chunk_sz)); \ \ ompt_pre(); \ \ if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { \ if (num_threads != 0) { \ __kmp_push_num_threads(&loc, gtid, num_threads); \ } \ __kmp_GOMP_fork_call(&loc, gtid, task, \ (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, \ 9, task, data, num_threads, &loc, (schedule), lb, \ (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz); \ IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid)); \ } else { \ __kmp_GOMP_serialized_parallel(&loc, gtid, task); \ IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid)); \ } \ \ KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ (str > 0) ? 
(ub - 1) : (ub + 1), str, chunk_sz, \ (schedule) != kmp_sch_static); \ \ ompt_post(); \ \ KA_TRACE(20, (KMP_STR(func) " exit: T#%d\n", gtid)); \ } #if OMPT_SUPPORT && OMPT_OPTIONAL #define OMPT_LOOP_PRE() \ ompt_frame_t *parent_frame; \ if (ompt_enabled.enabled) { \ __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL); \ parent_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); \ OMPT_STORE_RETURN_ADDRESS(gtid); \ } #define OMPT_LOOP_POST() \ if (ompt_enabled.enabled) { \ parent_frame->enter_frame = ompt_data_none; \ } #else #define OMPT_LOOP_PRE() #define OMPT_LOOP_POST() #endif PARALLEL_LOOP_START( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START), kmp_sch_static, OMPT_LOOP_PRE, OMPT_LOOP_POST) PARALLEL_LOOP_START( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START), kmp_sch_dynamic_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) PARALLEL_LOOP_START( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START), kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) PARALLEL_LOOP_START( KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START), kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) // Tasking constructs void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASK)(void (*func)(void *), void *data, void (*copy_func)(void *, void *), long arg_size, long arg_align, bool if_cond, unsigned gomp_flags, void **depend) { MKLOC(loc, "GOMP_task"); int gtid = __kmp_entry_gtid(); kmp_int32 flags = 0; kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; KA_TRACE(20, ("GOMP_task: T#%d\n", gtid)); // The low-order bit is the "untied" flag if (!(gomp_flags & 1)) { input_flags->tiedness = 1; } // The second low-order bit is the "final" flag if (gomp_flags & 2) { input_flags->final = 1; } input_flags->native = 1; // __kmp_task_alloc() sets up all other flags if (!if_cond) { arg_size = 0; } kmp_task_t *task = __kmp_task_alloc( &loc, gtid, input_flags, sizeof(kmp_task_t), arg_size ? 
arg_size + arg_align - 1 : 0, (kmp_routine_entry_t)func); if (arg_size > 0) { if (arg_align > 0) { task->shareds = (void *)((((size_t)task->shareds) + arg_align - 1) / arg_align * arg_align); } // else error?? if (copy_func) { (*copy_func)(task->shareds, data); } else { KMP_MEMCPY(task->shareds, data, arg_size); } } #if OMPT_SUPPORT kmp_taskdata_t *current_task; if (ompt_enabled.enabled) { OMPT_STORE_RETURN_ADDRESS(gtid); current_task = __kmp_threads[gtid]->th.th_current_task; current_task->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); } #endif if (if_cond) { if (gomp_flags & 8) { KMP_ASSERT(depend); const size_t ndeps = (kmp_intptr_t)depend[0]; const size_t nout = (kmp_intptr_t)depend[1]; kmp_depend_info_t dep_list[ndeps]; for (size_t i = 0U; i < ndeps; i++) { dep_list[i].base_addr = (kmp_intptr_t)depend[2U + i]; dep_list[i].len = 0U; dep_list[i].flags.in = 1; dep_list[i].flags.out = (i < nout); } __kmpc_omp_task_with_deps(&loc, gtid, task, ndeps, dep_list, 0, NULL); } else { __kmpc_omp_task(&loc, gtid, task); } } else { #if OMPT_SUPPORT ompt_thread_info_t oldInfo; kmp_info_t *thread; kmp_taskdata_t *taskdata; if (ompt_enabled.enabled) { // Store the threads states and restore them after the task thread = __kmp_threads[gtid]; taskdata = KMP_TASK_TO_TASKDATA(task); oldInfo = thread->th.ompt_thread_info; thread->th.ompt_thread_info.wait_id = 0; thread->th.ompt_thread_info.state = ompt_state_work_parallel; taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); OMPT_STORE_RETURN_ADDRESS(gtid); } #endif __kmpc_omp_task_begin_if0(&loc, gtid, task); func(data); __kmpc_omp_task_complete_if0(&loc, gtid, task); #if OMPT_SUPPORT if (ompt_enabled.enabled) { thread->th.ompt_thread_info = oldInfo; taskdata->ompt_task_info.frame.exit_frame = ompt_data_none; } #endif } #if OMPT_SUPPORT if (ompt_enabled.enabled) { current_task->ompt_task_info.frame.enter_frame = ompt_data_none; } #endif KA_TRACE(20, ("GOMP_task exit: T#%d\n", gtid)); } void 
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKWAIT)(void) { MKLOC(loc, "GOMP_taskwait"); int gtid = __kmp_entry_gtid(); #if OMPT_SUPPORT if (ompt_enabled.enabled) OMPT_STORE_RETURN_ADDRESS(gtid); #endif KA_TRACE(20, ("GOMP_taskwait: T#%d\n", gtid)); __kmpc_omp_taskwait(&loc, gtid); KA_TRACE(20, ("GOMP_taskwait exit: T#%d\n", gtid)); } // Sections worksharing constructs // // For the sections construct, we initialize a dynamically scheduled loop // worksharing construct with lb 1 and stride 1, and use the iteration #'s // that its returns as sections ids. // // There are no special entry points for ordered sections, so we always use // the dynamically scheduled workshare, even if the sections aren't ordered. unsigned KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_START)(unsigned count) { int status; kmp_int lb, ub, stride; int gtid = __kmp_entry_gtid(); MKLOC(loc, "GOMP_sections_start"); KA_TRACE(20, ("GOMP_sections_start: T#%d\n", gtid)); KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE); status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, &lb, &ub, &stride); if (status) { KMP_DEBUG_ASSERT(stride == 1); KMP_DEBUG_ASSERT(lb > 0); KMP_ASSERT(lb == ub); } else { lb = 0; } KA_TRACE(20, ("GOMP_sections_start exit: T#%d returning %u\n", gtid, (unsigned)lb)); return (unsigned)lb; } unsigned KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_NEXT)(void) { int status; kmp_int lb, ub, stride; int gtid = __kmp_get_gtid(); MKLOC(loc, "GOMP_sections_next"); KA_TRACE(20, ("GOMP_sections_next: T#%d\n", gtid)); #if OMPT_SUPPORT OMPT_STORE_RETURN_ADDRESS(gtid); #endif status = KMP_DISPATCH_NEXT(&loc, gtid, NULL, &lb, &ub, &stride); if (status) { KMP_DEBUG_ASSERT(stride == 1); KMP_DEBUG_ASSERT(lb > 0); KMP_ASSERT(lb == ub); } else { lb = 0; } KA_TRACE( 20, ("GOMP_sections_next exit: T#%d returning %u\n", gtid, (unsigned)lb)); return (unsigned)lb; } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START)( void (*task)(void *), void *data, unsigned num_threads, unsigned count) 
{ int gtid = __kmp_entry_gtid(); #if OMPT_SUPPORT ompt_frame_t *parent_frame; if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &parent_frame, NULL, NULL); parent_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); OMPT_STORE_RETURN_ADDRESS(gtid); } #endif MKLOC(loc, "GOMP_parallel_sections_start"); KA_TRACE(20, ("GOMP_parallel_sections_start: T#%d\n", gtid)); if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { if (num_threads != 0) { __kmp_push_num_threads(&loc, gtid, num_threads); } __kmp_GOMP_fork_call(&loc, gtid, task, (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, task, data, num_threads, &loc, kmp_nm_dynamic_chunked, (kmp_int)1, (kmp_int)count, (kmp_int)1, (kmp_int)1); } else { __kmp_GOMP_serialized_parallel(&loc, gtid, task); } #if OMPT_SUPPORT if (ompt_enabled.enabled) { parent_frame->enter_frame = ompt_data_none; } #endif KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE); KA_TRACE(20, ("GOMP_parallel_sections_start exit: T#%d\n", gtid)); } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_END)(void) { int gtid = __kmp_get_gtid(); KA_TRACE(20, ("GOMP_sections_end: T#%d\n", gtid)) #if OMPT_SUPPORT ompt_frame_t *ompt_frame; if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL); ompt_frame->enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); OMPT_STORE_RETURN_ADDRESS(gtid); } #endif __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); #if OMPT_SUPPORT if (ompt_enabled.enabled) { ompt_frame->enter_frame = ompt_data_none; } #endif KA_TRACE(20, ("GOMP_sections_end exit: T#%d\n", gtid)) } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT)(void) { KA_TRACE(20, ("GOMP_sections_end_nowait: T#%d\n", __kmp_get_gtid())) } // libgomp has an empty function for GOMP_taskyield as of 2013-10-10 void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKYIELD)(void) { KA_TRACE(20, ("GOMP_taskyield: T#%d\n", __kmp_get_gtid())) return; } void 
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL)(void (*task)(void *), void *data, unsigned num_threads, unsigned int flags) { int gtid = __kmp_entry_gtid(); MKLOC(loc, "GOMP_parallel"); KA_TRACE(20, ("GOMP_parallel: T#%d\n", gtid)); #if OMPT_SUPPORT ompt_task_info_t *parent_task_info, *task_info; if (ompt_enabled.enabled) { parent_task_info = __ompt_get_task_info_object(0); parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); OMPT_STORE_RETURN_ADDRESS(gtid); } #endif if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { if (num_threads != 0) { __kmp_push_num_threads(&loc, gtid, num_threads); } if (flags != 0) { __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags); } __kmp_GOMP_fork_call(&loc, gtid, task, (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, data); } else { __kmp_GOMP_serialized_parallel(&loc, gtid, task); } #if OMPT_SUPPORT if (ompt_enabled.enabled) { task_info = __ompt_get_task_info_object(0); task_info->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); } #endif task(data); #if OMPT_SUPPORT if (ompt_enabled.enabled) { OMPT_STORE_RETURN_ADDRESS(gtid); } #endif KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); #if OMPT_SUPPORT if (ompt_enabled.enabled) { task_info->frame.exit_frame = ompt_data_none; parent_task_info->frame.enter_frame = ompt_data_none; } #endif } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task)(void *), void *data, unsigned num_threads, unsigned count, unsigned flags) { int gtid = __kmp_entry_gtid(); MKLOC(loc, "GOMP_parallel_sections"); KA_TRACE(20, ("GOMP_parallel_sections: T#%d\n", gtid)); #if OMPT_SUPPORT OMPT_STORE_RETURN_ADDRESS(gtid); #endif if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { if (num_threads != 0) { __kmp_push_num_threads(&loc, gtid, num_threads); } if (flags != 0) { __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags); } __kmp_GOMP_fork_call(&loc, gtid, task, (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, task, data, num_threads, &loc, kmp_nm_dynamic_chunked, 
(kmp_int)1, (kmp_int)count, (kmp_int)1, (kmp_int)1); } else { __kmp_GOMP_serialized_parallel(&loc, gtid, task); } #if OMPT_SUPPORT OMPT_STORE_RETURN_ADDRESS(gtid); #endif KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE); task(data); KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); KA_TRACE(20, ("GOMP_parallel_sections exit: T#%d\n", gtid)); } #define PARALLEL_LOOP(func, schedule, ompt_pre, ompt_post) \ void func(void (*task)(void *), void *data, unsigned num_threads, long lb, \ long ub, long str, long chunk_sz, unsigned flags) { \ int gtid = __kmp_entry_gtid(); \ MKLOC(loc, KMP_STR(func)); \ KA_TRACE( \ 20, \ (KMP_STR( \ func) ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n", \ gtid, lb, ub, str, chunk_sz)); \ \ ompt_pre(); \ if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { \ if (num_threads != 0) { \ __kmp_push_num_threads(&loc, gtid, num_threads); \ } \ if (flags != 0) { \ __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags); \ } \ __kmp_GOMP_fork_call(&loc, gtid, task, \ (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, \ 9, task, data, num_threads, &loc, (schedule), lb, \ (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz); \ } else { \ __kmp_GOMP_serialized_parallel(&loc, gtid, task); \ } \ \ IF_OMPT_SUPPORT(OMPT_STORE_RETURN_ADDRESS(gtid);) \ KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ (str > 0) ? 
(ub - 1) : (ub + 1), str, chunk_sz, \ (schedule) != kmp_sch_static); \ task(data); \ KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_END)(); \ ompt_post(); \ \ KA_TRACE(20, (KMP_STR(func) " exit: T#%d\n", gtid)); \ } PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC), kmp_sch_static, OMPT_LOOP_PRE, OMPT_LOOP_POST) PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC), kmp_sch_dynamic_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED), + kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) +PARALLEL_LOOP( + KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_DYNAMIC), + kmp_sch_dynamic_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED), kmp_sch_guided_chunked, OMPT_LOOP_PRE, OMPT_LOOP_POST) PARALLEL_LOOP(KMP_EXPAND_NAME(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME), kmp_sch_runtime, OMPT_LOOP_PRE, OMPT_LOOP_POST) void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_START)(void) { int gtid = __kmp_entry_gtid(); MKLOC(loc, "GOMP_taskgroup_start"); KA_TRACE(20, ("GOMP_taskgroup_start: T#%d\n", gtid)); #if OMPT_SUPPORT if (ompt_enabled.enabled) OMPT_STORE_RETURN_ADDRESS(gtid); #endif __kmpc_taskgroup(&loc, gtid); return; } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKGROUP_END)(void) { int gtid = __kmp_get_gtid(); MKLOC(loc, "GOMP_taskgroup_end"); KA_TRACE(20, ("GOMP_taskgroup_end: T#%d\n", gtid)); #if OMPT_SUPPORT if (ompt_enabled.enabled) OMPT_STORE_RETURN_ADDRESS(gtid); #endif __kmpc_end_taskgroup(&loc, gtid); return; } static kmp_int32 __kmp_gomp_to_omp_cancellation_kind(int gomp_kind) { kmp_int32 cncl_kind = 0; switch (gomp_kind) { case 1: cncl_kind = cancel_parallel; break; case 2: cncl_kind = cancel_loop; break; case 4: cncl_kind = cancel_sections; break; case 8: cncl_kind = cancel_taskgroup; break; } return cncl_kind; } // Return true if cancellation should take place, false otherwise bool 
KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CANCELLATION_POINT)(int which) { int gtid = __kmp_get_gtid(); MKLOC(loc, "GOMP_cancellation_point"); KA_TRACE(20, ("GOMP_cancellation_point: T#%d which:%d\n", gtid, which)); kmp_int32 cncl_kind = __kmp_gomp_to_omp_cancellation_kind(which); return __kmpc_cancellationpoint(&loc, gtid, cncl_kind); } // Return true if cancellation should take place, false otherwise bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_CANCEL)(int which, bool do_cancel) { int gtid = __kmp_get_gtid(); MKLOC(loc, "GOMP_cancel"); KA_TRACE(20, ("GOMP_cancel: T#%d which:%d do_cancel:%d\n", gtid, which, (int)do_cancel)); kmp_int32 cncl_kind = __kmp_gomp_to_omp_cancellation_kind(which); if (do_cancel == FALSE) { return __kmpc_cancellationpoint(&loc, gtid, cncl_kind); } else { return __kmpc_cancel(&loc, gtid, cncl_kind); } } // Return true if cancellation should take place, false otherwise bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_BARRIER_CANCEL)(void) { int gtid = __kmp_get_gtid(); KA_TRACE(20, ("GOMP_barrier_cancel: T#%d\n", gtid)); return __kmp_barrier_gomp_cancel(gtid); } // Return true if cancellation should take place, false otherwise bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL)(void) { int gtid = __kmp_get_gtid(); KA_TRACE(20, ("GOMP_sections_end_cancel: T#%d\n", gtid)); return __kmp_barrier_gomp_cancel(gtid); } // Return true if cancellation should take place, false otherwise bool KMP_EXPAND_NAME(KMP_API_NAME_GOMP_LOOP_END_CANCEL)(void) { int gtid = __kmp_get_gtid(); KA_TRACE(20, ("GOMP_loop_end_cancel: T#%d\n", gtid)); return __kmp_barrier_gomp_cancel(gtid); } // All target functions are empty as of 2014-05-29 void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET)(int device, void (*fn)(void *), const void *openmp_target, size_t mapnum, void **hostaddrs, size_t *sizes, unsigned char *kinds) { return; } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET_DATA)( int device, const void *openmp_target, size_t mapnum, void **hostaddrs, size_t *sizes, unsigned char *kinds) { 
return; } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET_END_DATA)(void) { return; } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TARGET_UPDATE)( int device, const void *openmp_target, size_t mapnum, void **hostaddrs, size_t *sizes, unsigned char *kinds) { return; } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TEAMS)(unsigned int num_teams, unsigned int thread_limit) { return; } // Task duplication function which copies src to dest (both are // preallocated task structures) static void __kmp_gomp_task_dup(kmp_task_t *dest, kmp_task_t *src, kmp_int32 last_private) { kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(src); if (taskdata->td_copy_func) { (taskdata->td_copy_func)(dest->shareds, src->shareds); } } #ifdef __cplusplus } // extern "C" #endif template void __GOMP_taskloop(void (*func)(void *), void *data, void (*copy_func)(void *, void *), long arg_size, long arg_align, unsigned gomp_flags, unsigned long num_tasks, int priority, T start, T end, T step) { typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); MKLOC(loc, "GOMP_taskloop"); int sched; T *loop_bounds; int gtid = __kmp_entry_gtid(); kmp_int32 flags = 0; int if_val = gomp_flags & (1u << 10); int nogroup = gomp_flags & (1u << 11); int up = gomp_flags & (1u << 8); p_task_dup_t task_dup = NULL; kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; #ifdef KMP_DEBUG { char *buff; buff = __kmp_str_format( "GOMP_taskloop: T#%%d: func:%%p data:%%p copy_func:%%p " "arg_size:%%ld arg_align:%%ld gomp_flags:0x%%x num_tasks:%%lu " "priority:%%d start:%%%s end:%%%s step:%%%s\n", traits_t::spec, traits_t::spec, traits_t::spec); KA_TRACE(20, (buff, gtid, func, data, copy_func, arg_size, arg_align, gomp_flags, num_tasks, priority, start, end, step)); __kmp_str_free(&buff); } #endif KMP_ASSERT((size_t)arg_size >= 2 * sizeof(T)); KMP_ASSERT(arg_align > 0); // The low-order bit is the "untied" flag if (!(gomp_flags & 1)) { input_flags->tiedness = 1; } // The second low-order bit is the "final" flag if 
(gomp_flags & 2) { input_flags->final = 1; } // Negative step flag if (!up) { // If step is flagged as negative, but isn't properly sign extended // Then manually sign extend it. Could be a short, int, char embedded // in a long. So cannot assume any cast. if (step > 0) { for (int i = sizeof(T) * CHAR_BIT - 1; i >= 0L; --i) { // break at the first 1 bit if (step & ((T)1 << i)) break; step |= ((T)1 << i); } } } input_flags->native = 1; // Figure out if none/grainsize/num_tasks clause specified if (num_tasks > 0) { if (gomp_flags & (1u << 9)) sched = 1; // grainsize specified else sched = 2; // num_tasks specified // neither grainsize nor num_tasks specified } else { sched = 0; } // __kmp_task_alloc() sets up all other flags kmp_task_t *task = __kmp_task_alloc(&loc, gtid, input_flags, sizeof(kmp_task_t), arg_size + arg_align - 1, (kmp_routine_entry_t)func); kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); taskdata->td_copy_func = copy_func; taskdata->td_size_loop_bounds = sizeof(T); // re-align shareds if needed and setup firstprivate copy constructors // through the task_dup mechanism task->shareds = (void *)((((size_t)task->shareds) + arg_align - 1) / arg_align * arg_align); if (copy_func) { task_dup = __kmp_gomp_task_dup; } KMP_MEMCPY(task->shareds, data, arg_size); loop_bounds = (T *)task->shareds; loop_bounds[0] = start; loop_bounds[1] = end + (up ? 
-1 : 1); __kmpc_taskloop(&loc, gtid, task, if_val, (kmp_uint64 *)&(loop_bounds[0]), (kmp_uint64 *)&(loop_bounds[1]), (kmp_int64)step, nogroup, sched, (kmp_uint64)num_tasks, (void *)task_dup); } // 4 byte version of GOMP_doacross_post // This verison needs to create a temporary array which converts 4 byte // integers into 8 byte integeres template void __kmp_GOMP_doacross_post(T *count); template <> void __kmp_GOMP_doacross_post(long *count) { int gtid = __kmp_entry_gtid(); kmp_info_t *th = __kmp_threads[gtid]; MKLOC(loc, "GOMP_doacross_post"); kmp_int64 num_dims = th->th.th_dispatch->th_doacross_info[0]; kmp_int64 *vec = (kmp_int64 *)__kmp_thread_malloc(th, sizeof(kmp_int64) * num_dims); for (kmp_int64 i = 0; i < num_dims; ++i) { vec[i] = (kmp_int64)count[i]; } __kmpc_doacross_post(&loc, gtid, vec); __kmp_thread_free(th, vec); } // 8 byte versions of GOMP_doacross_post // This version can just pass in the count array directly instead of creating // a temporary array template <> void __kmp_GOMP_doacross_post(long *count) { int gtid = __kmp_entry_gtid(); MKLOC(loc, "GOMP_doacross_post"); __kmpc_doacross_post(&loc, gtid, RCAST(kmp_int64 *, count)); } template void __kmp_GOMP_doacross_wait(T first, va_list args) { int gtid = __kmp_entry_gtid(); kmp_info_t *th = __kmp_threads[gtid]; MKLOC(loc, "GOMP_doacross_wait"); kmp_int64 num_dims = th->th.th_dispatch->th_doacross_info[0]; kmp_int64 *vec = (kmp_int64 *)__kmp_thread_malloc(th, sizeof(kmp_int64) * num_dims); vec[0] = (kmp_int64)first; for (kmp_int64 i = 1; i < num_dims; ++i) { T item = va_arg(args, T); vec[i] = (kmp_int64)item; } __kmpc_doacross_wait(&loc, gtid, vec); __kmp_thread_free(th, vec); return; } #ifdef __cplusplus extern "C" { #endif // __cplusplus void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKLOOP)( void (*func)(void *), void *data, void (*copy_func)(void *, void *), long arg_size, long arg_align, unsigned gomp_flags, unsigned long num_tasks, int priority, long start, long end, long step) { 
__GOMP_taskloop(func, data, copy_func, arg_size, arg_align, gomp_flags, num_tasks, priority, start, end, step); } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKLOOP_ULL)( void (*func)(void *), void *data, void (*copy_func)(void *, void *), long arg_size, long arg_align, unsigned gomp_flags, unsigned long num_tasks, int priority, unsigned long long start, unsigned long long end, unsigned long long step) { __GOMP_taskloop(func, data, copy_func, arg_size, arg_align, gomp_flags, num_tasks, priority, start, end, step); } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_POST)(long *count) { __kmp_GOMP_doacross_post(count); } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_WAIT)(long first, ...) { va_list args; va_start(args, first); __kmp_GOMP_doacross_wait(first, args); va_end(args); } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_ULL_POST)( unsigned long long *count) { int gtid = __kmp_entry_gtid(); MKLOC(loc, "GOMP_doacross_ull_post"); __kmpc_doacross_post(&loc, gtid, RCAST(kmp_int64 *, count)); } void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT)( unsigned long long first, ...) { va_list args; va_start(args, first); __kmp_GOMP_doacross_wait(first, args); va_end(args); } /* The following sections of code create aliases for the GOMP_* functions, then create versioned symbols using the assembler directive .symver. This is only pertinent for ELF .so library. 
The KMP_VERSION_SYMBOL macro is defined in kmp_os.h */ #ifdef KMP_USE_VERSION_SYMBOLS // GOMP_1.0 versioned symbols KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ATOMIC_END, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ATOMIC_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_BARRIER, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_END, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_NAME_END, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_NAME_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CRITICAL_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DYNAMIC_NEXT, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DYNAMIC_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_END, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_END_NOWAIT, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_GUIDED_NEXT, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_GUIDED_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_NEXT, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_DYNAMIC_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_NEXT, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_GUIDED_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_NEXT, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_RUNTIME_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_NEXT, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ORDERED_STATIC_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_RUNTIME_NEXT, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_RUNTIME_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_STATIC_NEXT, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_STATIC_START, 10, "GOMP_1.0"); 
KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ORDERED_END, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_ORDERED_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_END, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_SECTIONS_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_END, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_NEXT, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SINGLE_COPY_END, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SINGLE_COPY_START, 10, "GOMP_1.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SINGLE_START, 10, "GOMP_1.0"); // GOMP_2.0 versioned symbols KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASK, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKWAIT, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_START, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_NEXT, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_DYNAMIC_START, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_NEXT, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_GUIDED_START, 20, "GOMP_2.0"); 
KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_NEXT, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_RUNTIME_START, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_NEXT, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_ORDERED_STATIC_START, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_NEXT, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_RUNTIME_START, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_NEXT, 20, "GOMP_2.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START, 20, "GOMP_2.0"); // GOMP_3.0 versioned symbols KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKYIELD, 30, "GOMP_3.0"); // GOMP_4.0 versioned symbols KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_SECTIONS, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKGROUP_START, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKGROUP_END, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_BARRIER_CANCEL, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CANCEL, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_CANCELLATION_POINT, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_END_CANCEL, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET_DATA, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET_END_DATA, 40, "GOMP_4.0"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TARGET_UPDATE, 40, "GOMP_4.0"); 
KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TEAMS, 40, "GOMP_4.0"); // GOMP_4.5 versioned symbols KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKLOOP, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKLOOP_ULL, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_POST, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_WAIT, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_STATIC_START, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_DYNAMIC_START, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_GUIDED_START, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_DOACROSS_RUNTIME_START, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_ULL_POST, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_DOACROSS_ULL_WAIT, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_STATIC_START, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_DYNAMIC_START, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_GUIDED_START, 45, "GOMP_4.5"); KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_DOACROSS_RUNTIME_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_DYNAMIC_NEXT, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_NONMONOTONIC_GUIDED_NEXT, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_DYNAMIC_NEXT, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_START, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_LOOP_ULL_NONMONOTONIC_GUIDED_NEXT, 45, + "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_DYNAMIC, 45, + 
"GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_PARALLEL_LOOP_NONMONOTONIC_GUIDED, 45, "GOMP_4.5"); #endif // KMP_USE_VERSION_SYMBOLS #ifdef __cplusplus } // extern "C" #endif // __cplusplus Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_lock.cpp =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_lock.cpp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_lock.cpp (revision 357059) @@ -1,3942 +1,3942 @@ /* * kmp_lock.cpp -- lock-related functions */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include #include #include "kmp.h" #include "kmp_i18n.h" #include "kmp_io.h" #include "kmp_itt.h" #include "kmp_lock.h" #include "kmp_wait_release.h" #include "kmp_wrapper_getpid.h" #include "tsan_annotations.h" #if KMP_USE_FUTEX #include #include // We should really include , but that causes compatibility problems on // different Linux* OS distributions that either require that you include (or // break when you try to include) . Since all we need is the two // macros below (which are part of the kernel ABI, so can't change) we just // define the constants here and don't include #ifndef FUTEX_WAIT #define FUTEX_WAIT 0 #endif #ifndef FUTEX_WAKE #define FUTEX_WAKE 1 #endif #endif /* Implement spin locks for internal library use. */ /* The algorithm implemented is Lamport's bakery lock [1974]. 
*/ void __kmp_validate_locks(void) { int i; kmp_uint32 x, y; /* Check to make sure unsigned arithmetic does wraps properly */ x = ~((kmp_uint32)0) - 2; y = x - 2; for (i = 0; i < 8; ++i, ++x, ++y) { kmp_uint32 z = (x - y); KMP_ASSERT(z == 2); } KMP_ASSERT(offsetof(kmp_base_queuing_lock, tail_id) % 8 == 0); } /* ------------------------------------------------------------------------ */ /* test and set locks */ // For the non-nested locks, we can only assume that the first 4 bytes were // allocated, since gcc only allocates 4 bytes for omp_lock_t, and the Intel // compiler only allocates a 4 byte pointer on IA-32 architecture. On // Windows* OS on Intel(R) 64, we can assume that all 8 bytes were allocated. // // gcc reserves >= 8 bytes for nested locks, so we can assume that the // entire 8 bytes were allocated for nested locks on all 64-bit platforms. static kmp_int32 __kmp_get_tas_lock_owner(kmp_tas_lock_t *lck) { return KMP_LOCK_STRIP(KMP_ATOMIC_LD_RLX(&lck->lk.poll)) - 1; } static inline bool __kmp_is_tas_lock_nestable(kmp_tas_lock_t *lck) { return lck->lk.depth_locked != -1; } __forceinline static int __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) { KMP_MB(); #ifdef USE_LOCK_PROFILE kmp_uint32 curr = KMP_LOCK_STRIP(lck->lk.poll); if ((curr != 0) && (curr != gtid + 1)) __kmp_printf("LOCK CONTENTION: %p\n", lck); /* else __kmp_printf( "." 
);*/ #endif /* USE_LOCK_PROFILE */ kmp_int32 tas_free = KMP_LOCK_FREE(tas); kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == tas_free && __kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) { KMP_FSYNC_ACQUIRED(lck); return KMP_LOCK_ACQUIRED_FIRST; } kmp_uint32 spins; KMP_FSYNC_PREPARE(lck); KMP_INIT_YIELD(spins); kmp_backoff_t backoff = __kmp_spin_backoff_params; do { __kmp_spin_backoff(&backoff); KMP_YIELD_OVERSUB_ELSE_SPIN(spins); } while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free || !__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)); KMP_FSYNC_ACQUIRED(lck); return KMP_LOCK_ACQUIRED_FIRST; } int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { int retval = __kmp_acquire_tas_lock_timed_template(lck, gtid); ANNOTATE_TAS_ACQUIRED(lck); return retval; } static int __kmp_acquire_tas_lock_with_checks(kmp_tas_lock_t *lck, kmp_int32 gtid) { char const *const func = "omp_set_lock"; if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && __kmp_is_tas_lock_nestable(lck)) { KMP_FATAL(LockNestableUsedAsSimple, func); } if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) == gtid)) { KMP_FATAL(LockIsAlreadyOwned, func); } return __kmp_acquire_tas_lock(lck, gtid); } int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { kmp_int32 tas_free = KMP_LOCK_FREE(tas); kmp_int32 tas_busy = KMP_LOCK_BUSY(gtid + 1, tas); if (KMP_ATOMIC_LD_RLX(&lck->lk.poll) == tas_free && __kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy)) { KMP_FSYNC_ACQUIRED(lck); return TRUE; } return FALSE; } static int __kmp_test_tas_lock_with_checks(kmp_tas_lock_t *lck, kmp_int32 gtid) { char const *const func = "omp_test_lock"; if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && __kmp_is_tas_lock_nestable(lck)) { KMP_FATAL(LockNestableUsedAsSimple, func); } return __kmp_test_tas_lock(lck, gtid); } int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { KMP_MB(); /* Flush all pending memory write 
invalidates. */ KMP_FSYNC_RELEASING(lck); ANNOTATE_TAS_RELEASED(lck); KMP_ATOMIC_ST_REL(&lck->lk.poll, KMP_LOCK_FREE(tas)); KMP_MB(); /* Flush all pending memory write invalidates. */ KMP_YIELD_OVERSUB(); return KMP_LOCK_RELEASED; } static int __kmp_release_tas_lock_with_checks(kmp_tas_lock_t *lck, kmp_int32 gtid) { char const *const func = "omp_unset_lock"; KMP_MB(); /* in case another processor initialized lock */ if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && __kmp_is_tas_lock_nestable(lck)) { KMP_FATAL(LockNestableUsedAsSimple, func); } if (__kmp_get_tas_lock_owner(lck) == -1) { KMP_FATAL(LockUnsettingFree, func); } if ((gtid >= 0) && (__kmp_get_tas_lock_owner(lck) >= 0) && (__kmp_get_tas_lock_owner(lck) != gtid)) { KMP_FATAL(LockUnsettingSetByAnother, func); } return __kmp_release_tas_lock(lck, gtid); } void __kmp_init_tas_lock(kmp_tas_lock_t *lck) { lck->lk.poll = KMP_LOCK_FREE(tas); } void __kmp_destroy_tas_lock(kmp_tas_lock_t *lck) { lck->lk.poll = 0; } static void __kmp_destroy_tas_lock_with_checks(kmp_tas_lock_t *lck) { char const *const func = "omp_destroy_lock"; if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && __kmp_is_tas_lock_nestable(lck)) { KMP_FATAL(LockNestableUsedAsSimple, func); } if (__kmp_get_tas_lock_owner(lck) != -1) { KMP_FATAL(LockStillOwned, func); } __kmp_destroy_tas_lock(lck); } // nested test and set locks int __kmp_acquire_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { KMP_DEBUG_ASSERT(gtid >= 0); if (__kmp_get_tas_lock_owner(lck) == gtid) { lck->lk.depth_locked += 1; return KMP_LOCK_ACQUIRED_NEXT; } else { __kmp_acquire_tas_lock_timed_template(lck, gtid); ANNOTATE_TAS_ACQUIRED(lck); lck->lk.depth_locked = 1; return KMP_LOCK_ACQUIRED_FIRST; } } static int __kmp_acquire_nested_tas_lock_with_checks(kmp_tas_lock_t *lck, kmp_int32 gtid) { char const *const func = "omp_set_nest_lock"; if (!__kmp_is_tas_lock_nestable(lck)) { KMP_FATAL(LockSimpleUsedAsNestable, func); } return __kmp_acquire_nested_tas_lock(lck, gtid); } int 
__kmp_test_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { int retval; KMP_DEBUG_ASSERT(gtid >= 0); if (__kmp_get_tas_lock_owner(lck) == gtid) { retval = ++lck->lk.depth_locked; } else if (!__kmp_test_tas_lock(lck, gtid)) { retval = 0; } else { KMP_MB(); retval = lck->lk.depth_locked = 1; } return retval; } static int __kmp_test_nested_tas_lock_with_checks(kmp_tas_lock_t *lck, kmp_int32 gtid) { char const *const func = "omp_test_nest_lock"; if (!__kmp_is_tas_lock_nestable(lck)) { KMP_FATAL(LockSimpleUsedAsNestable, func); } return __kmp_test_nested_tas_lock(lck, gtid); } int __kmp_release_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid) { KMP_DEBUG_ASSERT(gtid >= 0); KMP_MB(); if (--(lck->lk.depth_locked) == 0) { __kmp_release_tas_lock(lck, gtid); return KMP_LOCK_RELEASED; } return KMP_LOCK_STILL_HELD; } static int __kmp_release_nested_tas_lock_with_checks(kmp_tas_lock_t *lck, kmp_int32 gtid) { char const *const func = "omp_unset_nest_lock"; KMP_MB(); /* in case another processor initialized lock */ if (!__kmp_is_tas_lock_nestable(lck)) { KMP_FATAL(LockSimpleUsedAsNestable, func); } if (__kmp_get_tas_lock_owner(lck) == -1) { KMP_FATAL(LockUnsettingFree, func); } if (__kmp_get_tas_lock_owner(lck) != gtid) { KMP_FATAL(LockUnsettingSetByAnother, func); } return __kmp_release_nested_tas_lock(lck, gtid); } void __kmp_init_nested_tas_lock(kmp_tas_lock_t *lck) { __kmp_init_tas_lock(lck); lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks } void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck) { __kmp_destroy_tas_lock(lck); lck->lk.depth_locked = 0; } static void __kmp_destroy_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) { char const *const func = "omp_destroy_nest_lock"; if (!__kmp_is_tas_lock_nestable(lck)) { KMP_FATAL(LockSimpleUsedAsNestable, func); } if (__kmp_get_tas_lock_owner(lck) != -1) { KMP_FATAL(LockStillOwned, func); } __kmp_destroy_nested_tas_lock(lck); } #if KMP_USE_FUTEX /* 
   ------------------------------------------------------------------------ */
/* futex locks */

// futex locks are really just test and set locks, with a different method
// of handling contention.  They take the same amount of space as test and
// set locks, and are allocated the same way (i.e. use the area allocated by
// the compiler for non-nested locks / allocate nested locks on the heap).

// Owner gtid encoded in poll as ((gtid + 1) << 1); -1 means unowned.
static kmp_int32 __kmp_get_futex_lock_owner(kmp_futex_lock_t *lck) {
  return KMP_LOCK_STRIP((TCR_4(lck->lk.poll) >> 1)) - 1;
}

static inline bool __kmp_is_futex_lock_nestable(kmp_futex_lock_t *lck) {
  return lck->lk.depth_locked != -1;
}

// Core blocking acquire. Spins on a CAS of lck->lk.poll; on contention,
// sets the low "waiter" bit of poll and parks in futex_wait. The low bit
// of poll signals the releasing thread that a futex_wake is required.
__forceinline static int
__kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck,
                                        kmp_int32 gtid) {
  kmp_int32 gtid_code = (gtid + 1) << 1;

  KMP_MB();

#ifdef USE_LOCK_PROFILE
  kmp_uint32 curr = KMP_LOCK_STRIP(TCR_4(lck->lk.poll));
  if ((curr != 0) && (curr != gtid_code))
    __kmp_printf("LOCK CONTENTION: %p\n", lck);
  /* else __kmp_printf( "." );*/
#endif /* USE_LOCK_PROFILE */

  KMP_FSYNC_PREPARE(lck);
  KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d entering\n",
                  lck, lck->lk.poll, gtid));

  kmp_int32 poll_val;

  // Try (free)->(owned by this gtid); loop while someone else holds it.
  while ((poll_val = KMP_COMPARE_AND_STORE_RET32(
              &(lck->lk.poll), KMP_LOCK_FREE(futex),
              KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) {

    kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1;
    KA_TRACE(
        1000,
        ("__kmp_acquire_futex_lock: lck:%p, T#%d poll_val = 0x%x cond = 0x%x\n",
         lck, gtid, poll_val, cond));

    // NOTE: if you try to use the following condition for this branch
    //
    // if ( poll_val & 1 == 0 )
    //
    // Then the 12.0 compiler has a bug where the following block will
    // always be skipped, regardless of the value of the LSB of poll_val.
    if (!cond) {
      // Try to set the lsb in the poll to indicate to the owner
      // thread that they need to wake this thread up.
      if (!KMP_COMPARE_AND_STORE_REL32(&(lck->lk.poll), poll_val,
                                       poll_val | KMP_LOCK_BUSY(1, futex))) {
        KA_TRACE(
            1000,
            ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d can't set bit 0\n",
             lck, lck->lk.poll, gtid));
        continue; // poll changed underneath us; re-read and retry
      }
      poll_val |= KMP_LOCK_BUSY(1, futex);

      KA_TRACE(1000,
               ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d bit 0 set\n", lck,
                lck->lk.poll, gtid));
    }

    KA_TRACE(
        1000,
        ("__kmp_acquire_futex_lock: lck:%p, T#%d before futex_wait(0x%x)\n",
         lck, gtid, poll_val));

    kmp_int32 rc;
    // Sleep until the owner wakes us; a nonzero rc (e.g. EWOULDBLOCK because
    // poll changed before we slept) just means retry the CAS loop.
    if ((rc = syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAIT, poll_val, NULL,
                      NULL, 0)) != 0) {
      KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) "
                      "failed (rc=%d errno=%d)\n",
                      lck, gtid, poll_val, rc, errno));
      continue;
    }

    KA_TRACE(1000,
             ("__kmp_acquire_futex_lock: lck:%p, T#%d after futex_wait(0x%x)\n",
              lck, gtid, poll_val));
    // This thread has now done a successful futex wait call and was entered on
    // the OS futex queue.  We must now perform a futex wake call when releasing
    // the lock, as we have no idea how many other threads are in the queue.
    gtid_code |= 1;
  }

  KMP_FSYNC_ACQUIRED(lck);
  KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
                  lck->lk.poll, gtid));
  return KMP_LOCK_ACQUIRED_FIRST;
}

int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
  int retval = __kmp_acquire_futex_lock_timed_template(lck, gtid);
  ANNOTATE_FUTEX_ACQUIRED(lck);
  return retval;
}

// Checked acquire: rejects nestable locks used as simple and re-acquisition
// by the current owner (omp_set_lock would self-deadlock).
static int __kmp_acquire_futex_lock_with_checks(kmp_futex_lock_t *lck,
                                                kmp_int32 gtid) {
  char const *const func = "omp_set_lock";
  if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
      __kmp_is_futex_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) == gtid)) {
    KMP_FATAL(LockIsAlreadyOwned, func);
  }
  return __kmp_acquire_futex_lock(lck, gtid);
}

// Non-blocking acquire: single CAS from free to owned; never waits.
int __kmp_test_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
  if (KMP_COMPARE_AND_STORE_ACQ32(&(lck->lk.poll), KMP_LOCK_FREE(futex),
                                  KMP_LOCK_BUSY((gtid + 1) << 1, futex))) {
    KMP_FSYNC_ACQUIRED(lck);
    return TRUE;
  }
  return FALSE;
}

static int __kmp_test_futex_lock_with_checks(kmp_futex_lock_t *lck,
                                             kmp_int32 gtid) {
  char const *const func = "omp_test_lock";
  if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
      __kmp_is_futex_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  return __kmp_test_futex_lock(lck, gtid);
}

// Release: atomically swap poll back to free; if the waiter bit was set,
// futex_wake exactly one sleeper.
int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
  KMP_MB(); /* Flush all pending memory write invalidates. */

  KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d entering\n",
                  lck, lck->lk.poll, gtid));

  KMP_FSYNC_RELEASING(lck);
  ANNOTATE_FUTEX_RELEASED(lck);

  kmp_int32 poll_val = KMP_XCHG_FIXED32(&(lck->lk.poll), KMP_LOCK_FREE(futex));

  KA_TRACE(1000,
           ("__kmp_release_futex_lock: lck:%p, T#%d released poll_val = 0x%x\n",
            lck, gtid, poll_val));

  if (KMP_LOCK_STRIP(poll_val) & 1) {
    // At least one thread futex_wait'ed on this lock; wake one up.
    KA_TRACE(1000,
             ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n",
              lck, gtid));
    syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex),
            NULL, NULL, 0);
  }

  KMP_MB(); /* Flush all pending memory write invalidates. */

  KA_TRACE(1000, ("__kmp_release_futex_lock: lck:%p(0x%x), T#%d exiting\n", lck,
                  lck->lk.poll, gtid));

  KMP_YIELD_OVERSUB();
  return KMP_LOCK_RELEASED;
}

// Checked release: validates nestability, set-ness, and ownership.
static int __kmp_release_futex_lock_with_checks(kmp_futex_lock_t *lck,
                                                kmp_int32 gtid) {
  char const *const func = "omp_unset_lock";
  KMP_MB(); /* in case another processor initialized lock */
  if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
      __kmp_is_futex_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  if (__kmp_get_futex_lock_owner(lck) == -1) {
    KMP_FATAL(LockUnsettingFree, func);
  }
  if ((gtid >= 0) && (__kmp_get_futex_lock_owner(lck) >= 0) &&
      (__kmp_get_futex_lock_owner(lck) != gtid)) {
    KMP_FATAL(LockUnsettingSetByAnother, func);
  }
  return __kmp_release_futex_lock(lck, gtid);
}

void __kmp_init_futex_lock(kmp_futex_lock_t *lck) {
  TCW_4(lck->lk.poll, KMP_LOCK_FREE(futex));
}

void __kmp_destroy_futex_lock(kmp_futex_lock_t *lck) { lck->lk.poll = 0; }

static void __kmp_destroy_futex_lock_with_checks(kmp_futex_lock_t *lck) {
  char const *const func = "omp_destroy_lock";
  if ((sizeof(kmp_futex_lock_t) <= OMP_LOCK_T_SIZE) &&
      __kmp_is_futex_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  if (__kmp_get_futex_lock_owner(lck) != -1) {
    KMP_FATAL(LockStillOwned, func);
  }
  __kmp_destroy_futex_lock(lck);
}

// nested futex locks

// Nested acquire: owner re-entry bumps depth; otherwise do a full blocking
// acquire and set depth to 1.
int __kmp_acquire_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
  KMP_DEBUG_ASSERT(gtid >= 0);

  if (__kmp_get_futex_lock_owner(lck) == gtid) {
    lck->lk.depth_locked += 1;
    return KMP_LOCK_ACQUIRED_NEXT;
  } else {
    __kmp_acquire_futex_lock_timed_template(lck, gtid);
    ANNOTATE_FUTEX_ACQUIRED(lck);
    lck->lk.depth_locked = 1;
    return KMP_LOCK_ACQUIRED_FIRST;
  }
}

static int __kmp_acquire_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
                                                       kmp_int32 gtid) {
  char const *const func = "omp_set_nest_lock";
  if (!__kmp_is_futex_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  return __kmp_acquire_nested_futex_lock(lck, gtid);
}

// Nested try-acquire: returns new depth on success, 0 on failure.
int __kmp_test_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
  int retval;

  KMP_DEBUG_ASSERT(gtid >= 0);

  if (__kmp_get_futex_lock_owner(lck) == gtid) {
    retval = ++lck->lk.depth_locked;
  } else if (!__kmp_test_futex_lock(lck, gtid)) {
    retval = 0;
  } else {
    KMP_MB();
    retval = lck->lk.depth_locked = 1;
  }
  return retval;
}

static int __kmp_test_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
                                                    kmp_int32 gtid) {
  char const *const func = "omp_test_nest_lock";
  if (!__kmp_is_futex_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  return __kmp_test_nested_futex_lock(lck, gtid);
}

// Nested release: underlying lock is released only at depth zero.
int __kmp_release_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
  KMP_DEBUG_ASSERT(gtid >= 0);

  KMP_MB();
  if (--(lck->lk.depth_locked) == 0) {
    __kmp_release_futex_lock(lck, gtid);
    return KMP_LOCK_RELEASED;
  }
  return KMP_LOCK_STILL_HELD;
}

static int __kmp_release_nested_futex_lock_with_checks(kmp_futex_lock_t *lck,
                                                       kmp_int32 gtid) {
  char const *const func = "omp_unset_nest_lock";
  KMP_MB(); /* in case another processor initialized lock */
  if (!__kmp_is_futex_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  if (__kmp_get_futex_lock_owner(lck) == -1) {
    KMP_FATAL(LockUnsettingFree, func);
  }
  if (__kmp_get_futex_lock_owner(lck) != gtid) {
    KMP_FATAL(LockUnsettingSetByAnother, func);
  }
  return __kmp_release_nested_futex_lock(lck, gtid);
}

void __kmp_init_nested_futex_lock(kmp_futex_lock_t *lck) {
  __kmp_init_futex_lock(lck);
  lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
}

void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck) {
  __kmp_destroy_futex_lock(lck);
  lck->lk.depth_locked = 0;
}

static void __kmp_destroy_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) {
  char const *const func = "omp_destroy_nest_lock";
  if (!__kmp_is_futex_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  if (__kmp_get_futex_lock_owner(lck) != -1) {
    KMP_FATAL(LockStillOwned, func);
  }
  __kmp_destroy_nested_futex_lock(lck);
}

#endif // KMP_USE_FUTEX

/* ------------------------------------------------------------------------ */
/* ticket (bakery) locks */

// Owner is stored as gtid + 1 (0 == unowned), hence the -1 here.
static kmp_int32 __kmp_get_ticket_lock_owner(kmp_ticket_lock_t *lck) {
  return std::atomic_load_explicit(&lck->lk.owner_id,
                                   std::memory_order_relaxed) -
         1;
}

static inline bool __kmp_is_ticket_lock_nestable(kmp_ticket_lock_t *lck) {
  return std::atomic_load_explicit(&lck->lk.depth_locked,
                                   std::memory_order_relaxed) != -1;
}

// Spin predicate for KMP_WAIT_PTR: true once now_serving reaches my_ticket.
// NOTE(review): the template argument of std::atomic appears lost in
// extraction here (likely std::atomic<unsigned>) — verify against upstream.
static kmp_uint32 __kmp_bakery_check(void *now_serving, kmp_uint32 my_ticket) {
  return std::atomic_load_explicit((std::atomic *)now_serving,
                                   std::memory_order_acquire) == my_ticket;
}

// Core blocking acquire: take a ticket, then wait until now_serving matches.
__forceinline static int
__kmp_acquire_ticket_lock_timed_template(kmp_ticket_lock_t *lck,
                                         kmp_int32 gtid) {
  kmp_uint32 my_ticket = std::atomic_fetch_add_explicit(
      &lck->lk.next_ticket, 1U, std::memory_order_relaxed);

#ifdef USE_LOCK_PROFILE
  if (std::atomic_load_explicit(&lck->lk.now_serving,
                                std::memory_order_relaxed) != my_ticket)
    __kmp_printf("LOCK CONTENTION: %p\n", lck);
  /* else __kmp_printf( "." 
  );*/
#endif /* USE_LOCK_PROFILE */

  // Fast path: we are being served already (acquire load pairs with the
  // release increment in __kmp_release_ticket_lock).
  if (std::atomic_load_explicit(&lck->lk.now_serving,
                                std::memory_order_acquire) == my_ticket) {
    return KMP_LOCK_ACQUIRED_FIRST;
  }
  KMP_WAIT_PTR(&lck->lk.now_serving, my_ticket, __kmp_bakery_check, lck);
  return KMP_LOCK_ACQUIRED_FIRST;
}

int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
  int retval = __kmp_acquire_ticket_lock_timed_template(lck, gtid);
  ANNOTATE_TICKET_ACQUIRED(lck);
  return retval;
}

// Checked acquire: verifies initialization (both the flag and the self
// pointer), simple-vs-nestable use, and no re-acquire by the owner; records
// the owner on success.
static int __kmp_acquire_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
                                                 kmp_int32 gtid) {
  char const *const func = "omp_set_lock";

  if (!std::atomic_load_explicit(&lck->lk.initialized,
                                 std::memory_order_relaxed)) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (lck->lk.self != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_is_ticket_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) == gtid)) {
    KMP_FATAL(LockIsAlreadyOwned, func);
  }

  __kmp_acquire_ticket_lock(lck, gtid);

  std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
                             std::memory_order_relaxed);
  return KMP_LOCK_ACQUIRED_FIRST;
}

// Non-blocking acquire: succeeds only if next_ticket == now_serving and we
// win the CAS that claims that ticket.
int __kmp_test_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
  kmp_uint32 my_ticket = std::atomic_load_explicit(&lck->lk.next_ticket,
                                                   std::memory_order_relaxed);

  if (std::atomic_load_explicit(&lck->lk.now_serving,
                                std::memory_order_relaxed) == my_ticket) {
    kmp_uint32 next_ticket = my_ticket + 1;
    if (std::atomic_compare_exchange_strong_explicit(
            &lck->lk.next_ticket, &my_ticket, next_ticket,
            std::memory_order_acquire, std::memory_order_acquire)) {
      return TRUE;
    }
  }
  return FALSE;
}

static int __kmp_test_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
                                              kmp_int32 gtid) {
  char const *const func = "omp_test_lock";

  if (!std::atomic_load_explicit(&lck->lk.initialized,
                                 std::memory_order_relaxed)) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (lck->lk.self != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_is_ticket_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }

  int retval = __kmp_test_ticket_lock(lck, gtid);

  if (retval) {
    std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
                               std::memory_order_relaxed);
  }
  return retval;
}

// Release: bump now_serving (release order publishes the critical section);
// yield if the waiter queue is longer than the available processors.
int __kmp_release_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
  kmp_uint32 distance = std::atomic_load_explicit(&lck->lk.next_ticket,
                                                  std::memory_order_relaxed) -
                        std::atomic_load_explicit(&lck->lk.now_serving,
                                                  std::memory_order_relaxed);

  ANNOTATE_TICKET_RELEASED(lck);
  std::atomic_fetch_add_explicit(&lck->lk.now_serving, 1U,
                                 std::memory_order_release);

  KMP_YIELD(distance >
            (kmp_uint32)(__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc));
  return KMP_LOCK_RELEASED;
}

static int __kmp_release_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
                                                 kmp_int32 gtid) {
  char const *const func = "omp_unset_lock";

  if (!std::atomic_load_explicit(&lck->lk.initialized,
                                 std::memory_order_relaxed)) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (lck->lk.self != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_is_ticket_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  if (__kmp_get_ticket_lock_owner(lck) == -1) {
    KMP_FATAL(LockUnsettingFree, func);
  }
  if ((gtid >= 0) && (__kmp_get_ticket_lock_owner(lck) >= 0) &&
      (__kmp_get_ticket_lock_owner(lck) != gtid)) {
    KMP_FATAL(LockUnsettingSetByAnother, func);
  }
  std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
  return __kmp_release_ticket_lock(lck, gtid);
}

// Initialize all ticket-lock state; the release store to `initialized` last
// publishes the fully-constructed lock.
void __kmp_init_ticket_lock(kmp_ticket_lock_t *lck) {
  lck->lk.location = NULL;
  lck->lk.self = lck;
  std::atomic_store_explicit(&lck->lk.next_ticket, 0U,
                             std::memory_order_relaxed);
  std::atomic_store_explicit(&lck->lk.now_serving, 0U,
                             std::memory_order_relaxed);
  std::atomic_store_explicit(
      &lck->lk.owner_id, 0,
      std::memory_order_relaxed); // no thread owns the lock.
  std::atomic_store_explicit(
      &lck->lk.depth_locked, -1,
      std::memory_order_relaxed); // -1 => not a nested lock.
  std::atomic_store_explicit(&lck->lk.initialized, true,
                             std::memory_order_release);
}

void __kmp_destroy_ticket_lock(kmp_ticket_lock_t *lck) {
  std::atomic_store_explicit(&lck->lk.initialized, false,
                             std::memory_order_release);
  lck->lk.self = NULL;
  lck->lk.location = NULL;
  std::atomic_store_explicit(&lck->lk.next_ticket, 0U,
                             std::memory_order_relaxed);
  std::atomic_store_explicit(&lck->lk.now_serving, 0U,
                             std::memory_order_relaxed);
  std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
  std::atomic_store_explicit(&lck->lk.depth_locked, -1,
                             std::memory_order_relaxed);
}

static void __kmp_destroy_ticket_lock_with_checks(kmp_ticket_lock_t *lck) {
  char const *const func = "omp_destroy_lock";

  if (!std::atomic_load_explicit(&lck->lk.initialized,
                                 std::memory_order_relaxed)) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (lck->lk.self != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_is_ticket_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  if (__kmp_get_ticket_lock_owner(lck) != -1) {
    KMP_FATAL(LockStillOwned, func);
  }
  __kmp_destroy_ticket_lock(lck);
}

// nested ticket locks

// Nested acquire: owner re-entry increments depth; first acquire takes the
// underlying ticket lock, then sets depth and owner.
int __kmp_acquire_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
  KMP_DEBUG_ASSERT(gtid >= 0);

  if (__kmp_get_ticket_lock_owner(lck) == gtid) {
    std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1,
                                   std::memory_order_relaxed);
    return KMP_LOCK_ACQUIRED_NEXT;
  } else {
    __kmp_acquire_ticket_lock_timed_template(lck, gtid);
    ANNOTATE_TICKET_ACQUIRED(lck);
    std::atomic_store_explicit(&lck->lk.depth_locked, 1,
                               std::memory_order_relaxed);
    std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
                               std::memory_order_relaxed);
    return KMP_LOCK_ACQUIRED_FIRST;
  }
}

static int __kmp_acquire_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
                                                        kmp_int32 gtid) {
  char const *const func = "omp_set_nest_lock";

  if (!std::atomic_load_explicit(&lck->lk.initialized,
                                 std::memory_order_relaxed)) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (lck->lk.self != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (!__kmp_is_ticket_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  return __kmp_acquire_nested_ticket_lock(lck, gtid);
}

// Nested try-acquire: returns the new depth on success, 0 on failure.
int __kmp_test_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
  int retval;

  KMP_DEBUG_ASSERT(gtid >= 0);

  if (__kmp_get_ticket_lock_owner(lck) == gtid) {
    retval = std::atomic_fetch_add_explicit(&lck->lk.depth_locked, 1,
                                            std::memory_order_relaxed) +
             1;
  } else if (!__kmp_test_ticket_lock(lck, gtid)) {
    retval = 0;
  } else {
    std::atomic_store_explicit(&lck->lk.depth_locked, 1,
                               std::memory_order_relaxed);
    std::atomic_store_explicit(&lck->lk.owner_id, gtid + 1,
                               std::memory_order_relaxed);
    retval = 1;
  }
  return retval;
}

static int __kmp_test_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
                                                     kmp_int32 gtid) {
  char const *const func = "omp_test_nest_lock";

  if (!std::atomic_load_explicit(&lck->lk.initialized,
                                 std::memory_order_relaxed)) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (lck->lk.self != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (!__kmp_is_ticket_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  return __kmp_test_nested_ticket_lock(lck, gtid);
}

// Nested release: clear owner and release the underlying lock only when the
// depth reaches zero.
int __kmp_release_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid) {
  KMP_DEBUG_ASSERT(gtid >= 0);

  if ((std::atomic_fetch_add_explicit(&lck->lk.depth_locked, -1,
                                      std::memory_order_relaxed) -
       1) == 0) {
    std::atomic_store_explicit(&lck->lk.owner_id, 0, std::memory_order_relaxed);
    __kmp_release_ticket_lock(lck, gtid);
    return KMP_LOCK_RELEASED;
  }
  return KMP_LOCK_STILL_HELD;
}

static int __kmp_release_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck,
                                                        kmp_int32 gtid) {
  char const *const func = "omp_unset_nest_lock";

  if (!std::atomic_load_explicit(&lck->lk.initialized,
                                 std::memory_order_relaxed)) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (lck->lk.self != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (!__kmp_is_ticket_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  if (__kmp_get_ticket_lock_owner(lck) == -1) {
    KMP_FATAL(LockUnsettingFree, func);
  }
  if (__kmp_get_ticket_lock_owner(lck) != gtid) {
    KMP_FATAL(LockUnsettingSetByAnother, func);
  }
  return __kmp_release_nested_ticket_lock(lck, gtid);
}

void __kmp_init_nested_ticket_lock(kmp_ticket_lock_t *lck) {
  __kmp_init_ticket_lock(lck);
  std::atomic_store_explicit(&lck->lk.depth_locked, 0,
                             std::memory_order_relaxed);
  // >= 0 for nestable locks, -1 for simple locks
}

void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck) {
  __kmp_destroy_ticket_lock(lck);
  std::atomic_store_explicit(&lck->lk.depth_locked, 0,
                             std::memory_order_relaxed);
}

static void __kmp_destroy_nested_ticket_lock_with_checks(
    kmp_ticket_lock_t *lck) {
  char const *const func = "omp_destroy_nest_lock";

  if (!std::atomic_load_explicit(&lck->lk.initialized,
                                 std::memory_order_relaxed)) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (lck->lk.self != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (!__kmp_is_ticket_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  if (__kmp_get_ticket_lock_owner(lck) != -1) {
    KMP_FATAL(LockStillOwned, func);
  }
  __kmp_destroy_nested_ticket_lock(lck);
}

// access functions to fields which don't exist for all lock kinds.
// Ticket-lock accessors for the optional location/flags fields.
static const ident_t *__kmp_get_ticket_lock_location(kmp_ticket_lock_t *lck) {
  return lck->lk.location;
}

static void __kmp_set_ticket_lock_location(kmp_ticket_lock_t *lck,
                                           const ident_t *loc) {
  lck->lk.location = loc;
}

static kmp_lock_flags_t __kmp_get_ticket_lock_flags(kmp_ticket_lock_t *lck) {
  return lck->lk.flags;
}

static void __kmp_set_ticket_lock_flags(kmp_ticket_lock_t *lck,
                                        kmp_lock_flags_t flags) {
  lck->lk.flags = flags;
}

/* ------------------------------------------------------------------------ */
/* queuing locks */

/* First the states
   (head,tail) =              0, 0  means lock is unheld, nobody on queue
                 UINT_MAX or -1, 0  means lock is held, nobody on queue
                              h, h  means lock held or about to transition,
                                    1 element on queue
                              h, t  h <> t, means lock is held or about to
                                    transition, >1 elements on queue

   Now the transitions
      Acquire(0,0)  = -1 ,0
      Release(0,0)  = Error
      Acquire(-1,0) =  h ,h    h > 0
      Release(-1,0) =  0 ,0
      Acquire(h,h)  =  h ,t    h > 0, t > 0, h <> t
      Release(h,h)  = -1 ,0    h > 0
      Acquire(h,t)  =  h ,t'   h > 0, t > 0, t' > 0, h <> t, h <> t', t <> t'
      Release(h,t)  =  h',t    h > 0, t > 0, h <> t, h <> h', h' maybe = t

   And pictorially

            +-----+
            | 0, 0|------- release -------> Error
            +-----+
              |  ^
       acquire|  |release
              |  |
              |  |
              v  |
            +-----+
            |-1, 0|
            +-----+
              |  ^
       acquire|  |release
              |  |
              |  |
              v  |
            +-----+
            | h, h|
            +-----+
              |  ^
       acquire|  |release
              |  |
              |  |
              v  |
            +-----+
            | h, t|----- acquire, release loopback ---+
            +-----+                                   |
                 ^                                    |
                 |                                    |
                 +------------------------------------+
 */

#ifdef DEBUG_QUEUING_LOCKS

/* Stuff for circular trace buffer */
#define TRACE_BUF_ELE 1024
static char traces[TRACE_BUF_ELE][128] = {0};
static int tc = 0;
#define TRACE_LOCK(X, Y)                                                       \
  KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s\n", X, Y);
#define TRACE_LOCK_T(X, Y, Z)                                                  \
  KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s%d\n", X, Y, Z);
#define TRACE_LOCK_HT(X, Y, Z, Q)                                              \
  KMP_SNPRINTF(traces[tc++ % TRACE_BUF_ELE], 128, "t%d at %s %d,%d\n", X, Y,   \
               Z, Q);

// Dump the circular trace buffer and the current queue state (head, tail,
// and the chain of th_next_waiting links) for debugging queuing locks.
static void __kmp_dump_queuing_lock(kmp_info_t *this_thr, kmp_int32 gtid,
                                    kmp_queuing_lock_t *lck, kmp_int32 head_id,
                                    kmp_int32 tail_id) {
  kmp_int32 t, i;

  __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: TRACE BEGINS HERE! \n");

  i = tc % TRACE_BUF_ELE;
  __kmp_printf_no_lock("%s\n", traces[i]);
  i = (i + 1) % TRACE_BUF_ELE;
  while (i != (tc % TRACE_BUF_ELE)) {
    __kmp_printf_no_lock("%s", traces[i]);
    i = (i + 1) % TRACE_BUF_ELE;
  }
  __kmp_printf_no_lock("\n");

  __kmp_printf_no_lock("\n__kmp_dump_queuing_lock: gtid+1:%d, spin_here:%d, "
                       "next_wait:%d, head_id:%d, tail_id:%d\n",
                       gtid + 1, this_thr->th.th_spin_here,
                       this_thr->th.th_next_waiting, head_id, tail_id);

  __kmp_printf_no_lock("\t\thead: %d ", lck->lk.head_id);

  if (lck->lk.head_id >= 1) {
    // Walk the waiter chain starting at the queue head.
    t = __kmp_threads[lck->lk.head_id - 1]->th.th_next_waiting;
    while (t > 0) {
      __kmp_printf_no_lock("-> %d ", t);
      t = __kmp_threads[t - 1]->th.th_next_waiting;
    }
  }
  __kmp_printf_no_lock("; tail: %d ", lck->lk.tail_id);
  __kmp_printf_no_lock("\n\n");
}

#endif /* DEBUG_QUEUING_LOCKS */

// Owner is stored as gtid + 1 (0 == unowned), hence the -1 here.
static kmp_int32 __kmp_get_queuing_lock_owner(kmp_queuing_lock_t *lck) {
  return TCR_4(lck->lk.owner_id) - 1;
}

static inline bool __kmp_is_queuing_lock_nestable(kmp_queuing_lock_t *lck) {
  return lck->lk.depth_locked != -1;
}

/* Acquire a lock using the queuing lock implementation */
// NOTE(review): the template parameter list appears lost in extraction here
// (the bare `template` below) — verify against upstream.
template
/* [TLW] The unused template above is left behind because of what BEB believes
   is a potential compiler problem with __forceinline.
 */
// Core blocking acquire for queuing locks. Implements the (head,tail) state
// machine documented above: either grab the free lock via CAS, or enqueue
// this thread's gtid and spin on its own th_spin_here flag until the
// releasing thread dequeues it.
__forceinline static int
__kmp_acquire_queuing_lock_timed_template(kmp_queuing_lock_t *lck,
                                          kmp_int32 gtid) {
  kmp_info_t *this_thr = __kmp_thread_from_gtid(gtid);
  volatile kmp_int32 *head_id_p = &lck->lk.head_id;
  volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;
  volatile kmp_uint32 *spin_here_p;
  kmp_int32 need_mf = 1;

#if OMPT_SUPPORT
  ompt_state_t prev_state = ompt_state_undefined;
#endif

  KA_TRACE(1000,
           ("__kmp_acquire_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));

  KMP_FSYNC_PREPARE(lck);
  KMP_DEBUG_ASSERT(this_thr != NULL);
  spin_here_p = &this_thr->th.th_spin_here;

#ifdef DEBUG_QUEUING_LOCKS
  TRACE_LOCK(gtid + 1, "acq ent");
  if (*spin_here_p)
    __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
  if (this_thr->th.th_next_waiting != 0)
    __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
#endif
  KMP_DEBUG_ASSERT(!*spin_here_p);
  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);

  /* The following st.rel to spin_here_p needs to precede the cmpxchg.acq to
     head_id_p that may follow, not just in execution order, but also in
     visibility order. This way, when a releasing thread observes the changes to
     the queue by this thread, it can rightly assume that spin_here_p has
     already been set to TRUE, so that when it sets spin_here_p to FALSE, it is
     not premature.  If the releasing thread sets spin_here_p to FALSE before
     this thread sets it to TRUE, this thread will hang. */
  *spin_here_p = TRUE; /* before enqueuing to prevent race */

  while (1) {
    kmp_int32 enqueued;
    kmp_int32 head;
    kmp_int32 tail;

    head = *head_id_p;

    switch (head) {

    case -1: {
      // Lock held, queue empty: try (-1,0)->(tid,tid) via a 64-bit CAS on
      // the packed (tail,head) pair.
#ifdef DEBUG_QUEUING_LOCKS
      tail = *tail_id_p;
      TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
#endif
      tail = 0; /* to make sure next link asynchronously read is not set
                accidentally; this assignment prevents us from entering the
                if ( t > 0 ) condition in the enqueued case below, which is not
                necessary for this state transition */

      need_mf = 0;
      /* try (-1,0)->(tid,tid) */
      enqueued = KMP_COMPARE_AND_STORE_ACQ64((volatile kmp_int64 *)tail_id_p,
                                             KMP_PACK_64(-1, 0),
                                             KMP_PACK_64(gtid + 1, gtid + 1));
#ifdef DEBUG_QUEUING_LOCKS
      if (enqueued)
        TRACE_LOCK(gtid + 1, "acq enq: (-1,0)->(tid,tid)");
#endif
    } break;

    default: {
      // Lock held, queue non-empty: append ourselves by swinging the tail.
      tail = *tail_id_p;
      KMP_DEBUG_ASSERT(tail != gtid + 1);

#ifdef DEBUG_QUEUING_LOCKS
      TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
#endif

      if (tail == 0) {
        enqueued = FALSE; // queue in transition; retry
      } else {
        need_mf = 0;
        /* try (h,t) or (h,h)->(h,tid) */
        enqueued = KMP_COMPARE_AND_STORE_ACQ32(tail_id_p, tail, gtid + 1);

#ifdef DEBUG_QUEUING_LOCKS
        if (enqueued)
          TRACE_LOCK(gtid + 1, "acq enq: (h,t)->(h,tid)");
#endif
      }
    } break;

    case 0: /* empty queue */
    {
      kmp_int32 grabbed_lock;

#ifdef DEBUG_QUEUING_LOCKS
      tail = *tail_id_p;
      TRACE_LOCK_HT(gtid + 1, "acq read: ", head, tail);
#endif
      /* try (0,0)->(-1,0) */

      /* only legal transition out of head = 0 is head = -1 with no change to
       * tail */
      grabbed_lock = KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1);

      if (grabbed_lock) {

        *spin_here_p = FALSE;

        KA_TRACE(
            1000,
            ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: no queuing\n",
             lck, gtid));
#ifdef DEBUG_QUEUING_LOCKS
        TRACE_LOCK_HT(gtid + 1, "acq exit: ", head, 0);
#endif

#if OMPT_SUPPORT
        if (ompt_enabled.enabled && prev_state != ompt_state_undefined) {
          /* change the state before clearing wait_id */
          this_thr->th.ompt_thread_info.state = prev_state;
          this_thr->th.ompt_thread_info.wait_id = 0;
        }
#endif

        KMP_FSYNC_ACQUIRED(lck);
        return KMP_LOCK_ACQUIRED_FIRST; /* lock holder cannot be on queue */
      }
      enqueued = FALSE;
    } break;
    }

#if OMPT_SUPPORT
    if (ompt_enabled.enabled && prev_state == ompt_state_undefined) {
      /* this thread will spin; set wait_id before entering wait state */
      prev_state = this_thr->th.ompt_thread_info.state;
      this_thr->th.ompt_thread_info.wait_id = (uint64_t)lck;
      this_thr->th.ompt_thread_info.state = ompt_state_wait_lock;
    }
#endif

    if (enqueued) {
      if (tail > 0) {
        // Link the previous tail's th_next_waiting to us so the releaser can
        // find us when walking the chain.
        kmp_info_t *tail_thr = __kmp_thread_from_gtid(tail - 1);
        KMP_ASSERT(tail_thr != NULL);
        tail_thr->th.th_next_waiting = gtid + 1;
        /* corresponding wait for this write in release code */
      }
      KA_TRACE(1000,
               ("__kmp_acquire_queuing_lock: lck:%p, T#%d waiting for lock\n",
                lck, gtid));

      KMP_MB();
      // ToDo: Use __kmp_wait_sleep or similar when blocktime != inf
      KMP_WAIT(spin_here_p, FALSE, KMP_EQ, lck);

#ifdef DEBUG_QUEUING_LOCKS
      TRACE_LOCK(gtid + 1, "acq spin");

      if (this_thr->th.th_next_waiting != 0)
        __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
#endif
      KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
      KA_TRACE(1000, ("__kmp_acquire_queuing_lock: lck:%p, T#%d exiting: after "
                      "waiting on queue\n",
                      lck, gtid));

#ifdef DEBUG_QUEUING_LOCKS
      TRACE_LOCK(gtid + 1, "acq exit 2");
#endif

#if OMPT_SUPPORT
      /* change the state before clearing wait_id */
      this_thr->th.ompt_thread_info.state = prev_state;
      this_thr->th.ompt_thread_info.wait_id = 0;
#endif

      /* got lock, we were dequeued by the thread that released lock */
      return KMP_LOCK_ACQUIRED_FIRST;
    }

    /* Yield if number of threads > number of logical processors */
    /* ToDo: Not sure why this should only be in oversubscription case,
       maybe should be traditional YIELD_INIT/YIELD_WHEN loop */
    KMP_YIELD_OVERSUB();

#ifdef DEBUG_QUEUING_LOCKS
    TRACE_LOCK(gtid + 1, "acq retry");
#endif
  }
  KMP_ASSERT2(0, "should not get here");
  return KMP_LOCK_ACQUIRED_FIRST;
}

int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
  KMP_DEBUG_ASSERT(gtid >= 0);

  int retval = __kmp_acquire_queuing_lock_timed_template(lck, gtid);
  ANNOTATE_QUEUING_ACQUIRED(lck);
  return retval;
}

// Checked acquire: verifies initialization, simple-vs-nestable use, and no
// re-acquire by the owner; records the owner on success.
static int __kmp_acquire_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
                                                  kmp_int32 gtid) {
  char const *const func = "omp_set_lock";
  if (lck->lk.initialized != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_is_queuing_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  if (__kmp_get_queuing_lock_owner(lck) == gtid) {
    KMP_FATAL(LockIsAlreadyOwned, func);
  }

  __kmp_acquire_queuing_lock(lck, gtid);

  lck->lk.owner_id = gtid + 1;
  return KMP_LOCK_ACQUIRED_FIRST;
}

// Non-blocking acquire: only succeeds from the (0,0) "free, empty queue"
// state via CAS (0,0)->(-1,0); never enqueues.
int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
  volatile kmp_int32 *head_id_p = &lck->lk.head_id;
  kmp_int32 head;
#ifdef KMP_DEBUG
  kmp_info_t *this_thr;
#endif

  KA_TRACE(1000, ("__kmp_test_queuing_lock: T#%d entering\n", gtid));
  KMP_DEBUG_ASSERT(gtid >= 0);
#ifdef KMP_DEBUG
  this_thr = __kmp_thread_from_gtid(gtid);
  KMP_DEBUG_ASSERT(this_thr != NULL);
  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
#endif

  head = *head_id_p;

  if (head == 0) { /* nobody on queue, nobody holding */
    /* try (0,0)->(-1,0) */
    if (KMP_COMPARE_AND_STORE_ACQ32(head_id_p, 0, -1)) {
      KA_TRACE(1000,
               ("__kmp_test_queuing_lock: T#%d exiting: holding lock\n", gtid));
      KMP_FSYNC_ACQUIRED(lck);
      ANNOTATE_QUEUING_ACQUIRED(lck);
      return TRUE;
    }
  }

  KA_TRACE(1000,
           ("__kmp_test_queuing_lock: T#%d exiting: without lock\n", gtid));
  return FALSE;
}

static int __kmp_test_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
                                               kmp_int32 gtid) {
  char const *const func = "omp_test_lock";
  if (lck->lk.initialized != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_is_queuing_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }

  int retval = __kmp_test_queuing_lock(lck, gtid);

  if (retval) {
    lck->lk.owner_id = gtid + 1;
  }
  return retval;
}

// Release a queuing lock: either return it to free (no waiters), or dequeue
// the head waiter and wake it by clearing its th_spin_here flag. Implements
// the Release transitions of the state machine documented above.
int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) {
  kmp_info_t *this_thr;
  volatile kmp_int32 *head_id_p = &lck->lk.head_id;
  volatile kmp_int32 *tail_id_p = &lck->lk.tail_id;

  KA_TRACE(1000,
           ("__kmp_release_queuing_lock: lck:%p, T#%d entering\n", lck, gtid));
  KMP_DEBUG_ASSERT(gtid >= 0);
  this_thr = __kmp_thread_from_gtid(gtid);
  KMP_DEBUG_ASSERT(this_thr != NULL);
#ifdef DEBUG_QUEUING_LOCKS
  TRACE_LOCK(gtid + 1, "rel ent");

  if (this_thr->th.th_spin_here)
    __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
  if (this_thr->th.th_next_waiting != 0)
    __kmp_dump_queuing_lock(this_thr, gtid, lck, *head_id_p, *tail_id_p);
#endif
  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);

  KMP_FSYNC_RELEASING(lck);
  ANNOTATE_QUEUING_RELEASED(lck);

  while (1) {
    kmp_int32 dequeued;
    kmp_int32 head;
    kmp_int32 tail;

    head = *head_id_p;

#ifdef DEBUG_QUEUING_LOCKS
    tail = *tail_id_p;
    TRACE_LOCK_HT(gtid + 1, "rel read: ", head, tail);
    if (head == 0)
      __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
#endif
    KMP_DEBUG_ASSERT(head !=
                     0); /* holding the lock, head must be -1 or queue head */

    if (head == -1) { /* nobody on queue */
      /* try (-1,0)->(0,0) */
      if (KMP_COMPARE_AND_STORE_REL32(head_id_p, -1, 0)) {
        KA_TRACE(
            1000,
            ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: queue empty\n",
             lck, gtid));
#ifdef DEBUG_QUEUING_LOCKS
        TRACE_LOCK_HT(gtid + 1, "rel exit: ", 0, 0);
#endif

#if OMPT_SUPPORT
/* nothing to do - no other thread is trying to shift blame */
#endif
        return KMP_LOCK_RELEASED;
      }
      dequeued = FALSE;
    } else {
      KMP_MB();
      tail = *tail_id_p;
      if (head == tail) { /* only one thread on the queue */
#ifdef DEBUG_QUEUING_LOCKS
        if (head <= 0)
          __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
#endif
        KMP_DEBUG_ASSERT(head > 0);

        /* try (h,h)->(-1,0) */
        dequeued = KMP_COMPARE_AND_STORE_REL64(
            RCAST(volatile kmp_int64 *, tail_id_p), KMP_PACK_64(head, head),
            KMP_PACK_64(-1, 0));
#ifdef DEBUG_QUEUING_LOCKS
        TRACE_LOCK(gtid + 1, "rel deq: (h,h)->(-1,0)");
#endif

      } else {
        volatile kmp_int32 *waiting_id_p;
        kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
        KMP_DEBUG_ASSERT(head_thr != NULL);
        waiting_id_p = &head_thr->th.th_next_waiting;

/* Does this require synchronous reads? */
#ifdef DEBUG_QUEUING_LOCKS
        if (head <= 0 || tail <= 0)
          __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
#endif
        KMP_DEBUG_ASSERT(head > 0 && tail > 0);

        /* try (h,t)->(h',t) or (t,t) */
        KMP_MB();
        /* make sure enqueuing thread has time to update next waiting thread
         * field */
        *head_id_p =
            KMP_WAIT((volatile kmp_uint32 *)waiting_id_p, 0, KMP_NEQ, NULL);
#ifdef DEBUG_QUEUING_LOCKS
        TRACE_LOCK(gtid + 1, "rel deq: (h,t)->(h',t)");
#endif
        dequeued = TRUE;
      }
    }

    if (dequeued) {
      kmp_info_t *head_thr = __kmp_thread_from_gtid(head - 1);
      KMP_DEBUG_ASSERT(head_thr != NULL);

/* Does this require synchronous reads? */
#ifdef DEBUG_QUEUING_LOCKS
      if (head <= 0 || tail <= 0)
        __kmp_dump_queuing_lock(this_thr, gtid, lck, head, tail);
#endif
      KMP_DEBUG_ASSERT(head > 0 && tail > 0);

      /* For clean code only.  Thread not released until next statement prevents
         race with acquire code. */
      head_thr->th.th_next_waiting = 0;
#ifdef DEBUG_QUEUING_LOCKS
      TRACE_LOCK_T(gtid + 1, "rel nw=0 for t=", head);
#endif

      KMP_MB();
      /* reset spin value */
      head_thr->th.th_spin_here = FALSE;

      KA_TRACE(1000, ("__kmp_release_queuing_lock: lck:%p, T#%d exiting: after "
                      "dequeuing\n",
                      lck, gtid));
#ifdef DEBUG_QUEUING_LOCKS
      TRACE_LOCK(gtid + 1, "rel exit 2");
#endif
      return KMP_LOCK_RELEASED;
    }
/* KMP_CPU_PAUSE(); don't want to make releasing thread hold up acquiring
   threads */

#ifdef DEBUG_QUEUING_LOCKS
    TRACE_LOCK(gtid + 1, "rel retry");
#endif

  } /* while */
  KMP_ASSERT2(0, "should not get here");
  return KMP_LOCK_RELEASED;
}

// Checked release: validates initialization, nestability, set-ness, and
// ownership; clears owner before the real release.
static int __kmp_release_queuing_lock_with_checks(kmp_queuing_lock_t *lck,
                                                  kmp_int32 gtid) {
  char const *const func = "omp_unset_lock";
  KMP_MB(); /* in case another processor initialized lock */
  if (lck->lk.initialized != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_is_queuing_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  if (__kmp_get_queuing_lock_owner(lck) == -1) {
    KMP_FATAL(LockUnsettingFree, func);
  }
  if (__kmp_get_queuing_lock_owner(lck) != gtid) {
    KMP_FATAL(LockUnsettingSetByAnother, func);
  }
  lck->lk.owner_id = 0;
  return __kmp_release_queuing_lock(lck, gtid);
}

void __kmp_init_queuing_lock(kmp_queuing_lock_t *lck) {
  lck->lk.location = NULL;
  lck->lk.head_id = 0;
  lck->lk.tail_id = 0;
  lck->lk.next_ticket = 0;
  lck->lk.now_serving = 0;
  lck->lk.owner_id = 0; // no thread owns the lock.
  lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
lck->lk.initialized = lck; KA_TRACE(1000, ("__kmp_init_queuing_lock: lock %p initialized\n", lck)); } void __kmp_destroy_queuing_lock(kmp_queuing_lock_t *lck) { lck->lk.initialized = NULL; lck->lk.location = NULL; lck->lk.head_id = 0; lck->lk.tail_id = 0; lck->lk.next_ticket = 0; lck->lk.now_serving = 0; lck->lk.owner_id = 0; lck->lk.depth_locked = -1; } static void __kmp_destroy_queuing_lock_with_checks(kmp_queuing_lock_t *lck) { char const *const func = "omp_destroy_lock"; if (lck->lk.initialized != lck) { KMP_FATAL(LockIsUninitialized, func); } if (__kmp_is_queuing_lock_nestable(lck)) { KMP_FATAL(LockNestableUsedAsSimple, func); } if (__kmp_get_queuing_lock_owner(lck) != -1) { KMP_FATAL(LockStillOwned, func); } __kmp_destroy_queuing_lock(lck); } // nested queuing locks int __kmp_acquire_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { KMP_DEBUG_ASSERT(gtid >= 0); if (__kmp_get_queuing_lock_owner(lck) == gtid) { lck->lk.depth_locked += 1; return KMP_LOCK_ACQUIRED_NEXT; } else { __kmp_acquire_queuing_lock_timed_template(lck, gtid); ANNOTATE_QUEUING_ACQUIRED(lck); KMP_MB(); lck->lk.depth_locked = 1; KMP_MB(); lck->lk.owner_id = gtid + 1; return KMP_LOCK_ACQUIRED_FIRST; } } static int __kmp_acquire_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck, kmp_int32 gtid) { char const *const func = "omp_set_nest_lock"; if (lck->lk.initialized != lck) { KMP_FATAL(LockIsUninitialized, func); } if (!__kmp_is_queuing_lock_nestable(lck)) { KMP_FATAL(LockSimpleUsedAsNestable, func); } return __kmp_acquire_nested_queuing_lock(lck, gtid); } int __kmp_test_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { int retval; KMP_DEBUG_ASSERT(gtid >= 0); if (__kmp_get_queuing_lock_owner(lck) == gtid) { retval = ++lck->lk.depth_locked; } else if (!__kmp_test_queuing_lock(lck, gtid)) { retval = 0; } else { KMP_MB(); retval = lck->lk.depth_locked = 1; KMP_MB(); lck->lk.owner_id = gtid + 1; } return retval; } static int 
__kmp_test_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck, kmp_int32 gtid) { char const *const func = "omp_test_nest_lock"; if (lck->lk.initialized != lck) { KMP_FATAL(LockIsUninitialized, func); } if (!__kmp_is_queuing_lock_nestable(lck)) { KMP_FATAL(LockSimpleUsedAsNestable, func); } return __kmp_test_nested_queuing_lock(lck, gtid); } int __kmp_release_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { KMP_DEBUG_ASSERT(gtid >= 0); KMP_MB(); if (--(lck->lk.depth_locked) == 0) { KMP_MB(); lck->lk.owner_id = 0; __kmp_release_queuing_lock(lck, gtid); return KMP_LOCK_RELEASED; } return KMP_LOCK_STILL_HELD; } static int __kmp_release_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck, kmp_int32 gtid) { char const *const func = "omp_unset_nest_lock"; KMP_MB(); /* in case another processor initialized lock */ if (lck->lk.initialized != lck) { KMP_FATAL(LockIsUninitialized, func); } if (!__kmp_is_queuing_lock_nestable(lck)) { KMP_FATAL(LockSimpleUsedAsNestable, func); } if (__kmp_get_queuing_lock_owner(lck) == -1) { KMP_FATAL(LockUnsettingFree, func); } if (__kmp_get_queuing_lock_owner(lck) != gtid) { KMP_FATAL(LockUnsettingSetByAnother, func); } return __kmp_release_nested_queuing_lock(lck, gtid); } void __kmp_init_nested_queuing_lock(kmp_queuing_lock_t *lck) { __kmp_init_queuing_lock(lck); lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks } void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck) { __kmp_destroy_queuing_lock(lck); lck->lk.depth_locked = 0; } static void __kmp_destroy_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) { char const *const func = "omp_destroy_nest_lock"; if (lck->lk.initialized != lck) { KMP_FATAL(LockIsUninitialized, func); } if (!__kmp_is_queuing_lock_nestable(lck)) { KMP_FATAL(LockSimpleUsedAsNestable, func); } if (__kmp_get_queuing_lock_owner(lck) != -1) { KMP_FATAL(LockStillOwned, func); } __kmp_destroy_nested_queuing_lock(lck); } // access functions to fields which don't 
exist for all lock kinds. static const ident_t *__kmp_get_queuing_lock_location(kmp_queuing_lock_t *lck) { return lck->lk.location; } static void __kmp_set_queuing_lock_location(kmp_queuing_lock_t *lck, const ident_t *loc) { lck->lk.location = loc; } static kmp_lock_flags_t __kmp_get_queuing_lock_flags(kmp_queuing_lock_t *lck) { return lck->lk.flags; } static void __kmp_set_queuing_lock_flags(kmp_queuing_lock_t *lck, kmp_lock_flags_t flags) { lck->lk.flags = flags; } #if KMP_USE_ADAPTIVE_LOCKS /* RTM Adaptive locks */ #if (KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300) || \ (KMP_COMPILER_MSVC && _MSC_VER >= 1700) || \ (KMP_COMPILER_CLANG && KMP_MSVC_COMPAT) #include #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT) #else // Values from the status register after failed speculation. #define _XBEGIN_STARTED (~0u) #define _XABORT_EXPLICIT (1 << 0) #define _XABORT_RETRY (1 << 1) #define _XABORT_CONFLICT (1 << 2) #define _XABORT_CAPACITY (1 << 3) #define _XABORT_DEBUG (1 << 4) #define _XABORT_NESTED (1 << 5) #define _XABORT_CODE(x) ((unsigned char)(((x) >> 24) & 0xFF)) // Aborts for which it's worth trying again immediately #define SOFT_ABORT_MASK (_XABORT_RETRY | _XABORT_CONFLICT | _XABORT_EXPLICIT) #define STRINGIZE_INTERNAL(arg) #arg #define STRINGIZE(arg) STRINGIZE_INTERNAL(arg) // Access to RTM instructions /*A version of XBegin which returns -1 on speculation, and the value of EAX on an abort. This is the same definition as the compiler intrinsic that will be supported at some point. */ static __inline int _xbegin() { int res = -1; #if KMP_OS_WINDOWS #if KMP_ARCH_X86_64 _asm { _emit 0xC7 _emit 0xF8 _emit 2 _emit 0 _emit 0 _emit 0 jmp L2 mov res, eax L2: } #else /* IA32 */ _asm { _emit 0xC7 _emit 0xF8 _emit 2 _emit 0 _emit 0 _emit 0 jmp L2 mov res, eax L2: } #endif // KMP_ARCH_X86_64 #else /* Note that %eax must be noted as killed (clobbered), because the XSR is returned in %eax(%rax) on abort. 
Other register values are restored, so don't need to be killed. We must also mark 'res' as an input and an output, since otherwise 'res=-1' may be dropped as being dead, whereas we do need the assignment on the successful (i.e., non-abort) path. */ __asm__ volatile("1: .byte 0xC7; .byte 0xF8;\n" " .long 1f-1b-6\n" " jmp 2f\n" "1: movl %%eax,%0\n" "2:" : "+r"(res)::"memory", "%eax"); #endif // KMP_OS_WINDOWS return res; } /* Transaction end */ static __inline void _xend() { #if KMP_OS_WINDOWS __asm { _emit 0x0f _emit 0x01 _emit 0xd5 } #else __asm__ volatile(".byte 0x0f; .byte 0x01; .byte 0xd5" ::: "memory"); #endif } /* This is a macro, the argument must be a single byte constant which can be evaluated by the inline assembler, since it is emitted as a byte into the assembly code. */ // clang-format off #if KMP_OS_WINDOWS #define _xabort(ARG) _asm _emit 0xc6 _asm _emit 0xf8 _asm _emit ARG #else #define _xabort(ARG) \ __asm__ volatile(".byte 0xC6; .byte 0xF8; .byte " STRINGIZE(ARG):::"memory"); #endif // clang-format on #endif // KMP_COMPILER_ICC && __INTEL_COMPILER >= 1300 // Statistics is collected for testing purpose #if KMP_DEBUG_ADAPTIVE_LOCKS // We accumulate speculative lock statistics when the lock is destroyed. We // keep locks that haven't been destroyed in the liveLocks list so that we can // grab their statistics too. static kmp_adaptive_lock_statistics_t destroyedStats; // To hold the list of live locks. static kmp_adaptive_lock_info_t liveLocks; // A lock so we can safely update the list of locks. static kmp_bootstrap_lock_t chain_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER(chain_lock); // Initialize the list of stats. 
// Set up the liveLocks sentinel as a self-linked circular list head and
// initialize the bootstrap lock that protects the list. Called once before
// any adaptive lock is created (KMP_DEBUG_ADAPTIVE_LOCKS builds only).
void __kmp_init_speculative_stats() {
  kmp_adaptive_lock_info_t *lck = &liveLocks;

  memset(CCAST(kmp_adaptive_lock_statistics_t *, &(lck->stats)), 0,
         sizeof(lck->stats));
  lck->stats.next = lck;
  lck->stats.prev = lck;

  // Sanity-check the self-links of the freshly built sentinel.
  KMP_ASSERT(lck->stats.next->stats.prev == lck);
  KMP_ASSERT(lck->stats.prev->stats.next == lck);

  __kmp_init_bootstrap_lock(&chain_lock);
}

// Insert the lock into the circular list
static void __kmp_remember_lock(kmp_adaptive_lock_info_t *lck) {
  __kmp_acquire_bootstrap_lock(&chain_lock);

  // Splice lck in right after the liveLocks sentinel.
  lck->stats.next = liveLocks.stats.next;
  lck->stats.prev = &liveLocks;

  liveLocks.stats.next = lck;
  lck->stats.next->stats.prev = lck;

  KMP_ASSERT(lck->stats.next->stats.prev == lck);
  KMP_ASSERT(lck->stats.prev->stats.next == lck);

  __kmp_release_bootstrap_lock(&chain_lock);
}

// Unlink lck from the circular list. Caller must hold chain_lock (see
// __kmp_accumulate_speculative_stats).
static void __kmp_forget_lock(kmp_adaptive_lock_info_t *lck) {
  KMP_ASSERT(lck->stats.next->stats.prev == lck);
  KMP_ASSERT(lck->stats.prev->stats.next == lck);

  kmp_adaptive_lock_info_t *n = lck->stats.next;
  kmp_adaptive_lock_info_t *p = lck->stats.prev;

  n->stats.prev = p;
  p->stats.next = n;
}

// Zero a lock's statistics and register it in the live-lock list.
static void __kmp_zero_speculative_stats(kmp_adaptive_lock_info_t *lck) {
  memset(CCAST(kmp_adaptive_lock_statistics_t *, &lck->stats), 0,
         sizeof(lck->stats));
  __kmp_remember_lock(lck);
}

// Accumulate one lock's counters into the running totals in *t.
static void __kmp_add_stats(kmp_adaptive_lock_statistics_t *t,
                            kmp_adaptive_lock_info_t *lck) {
  kmp_adaptive_lock_statistics_t volatile *s = &lck->stats;

  t->nonSpeculativeAcquireAttempts += lck->acquire_attempts;
  t->successfulSpeculations += s->successfulSpeculations;
  t->hardFailedSpeculations += s->hardFailedSpeculations;
  t->softFailedSpeculations += s->softFailedSpeculations;
  t->nonSpeculativeAcquires += s->nonSpeculativeAcquires;
  t->lemmingYields += s->lemmingYields;
}

// Fold a dying lock's statistics into destroyedStats and drop it from the
// live list, so its numbers survive lock destruction.
static void __kmp_accumulate_speculative_stats(kmp_adaptive_lock_info_t *lck) {
  __kmp_acquire_bootstrap_lock(&chain_lock);

  __kmp_add_stats(&destroyedStats, lck);
  __kmp_forget_lock(lck);

  __kmp_release_bootstrap_lock(&chain_lock);
}

// Percentage of count in total; 0 when total is 0 (avoids div-by-zero).
// Computed in double, returned as float.
static float percent(kmp_uint32 count, kmp_uint32 total) {
  return (total == 0) ? 0.0 : (100.0 * count) / total;
}

// Open the statistics output stream: stdout when the configured name is "-",
// otherwise the configured file name. NOTE: the name is passed to
// KMP_SNPRINTF as a format string with the pid as argument, so it may embed
// a %d to get per-process files. Falls back to stdout if fopen fails.
static FILE *__kmp_open_stats_file() {
  if (strcmp(__kmp_speculative_statsfile, "-") == 0)
    return stdout;

  size_t buffLen = KMP_STRLEN(__kmp_speculative_statsfile) + 20;
  char buffer[buffLen];
  KMP_SNPRINTF(&buffer[0], buffLen, __kmp_speculative_statsfile,
               (kmp_int32)getpid());
  FILE *result = fopen(&buffer[0], "w");

  // Maybe we should issue a warning here...
  return result ? result : stdout;
}

// Print the totals accumulated from destroyed locks plus every still-live
// lock. Does nothing if no critical sections were ever executed.
void __kmp_print_speculative_stats() {
  kmp_adaptive_lock_statistics_t total = destroyedStats;
  kmp_adaptive_lock_info_t *lck;

  // Add in the stats of all locks that are still alive.
  for (lck = liveLocks.stats.next; lck != &liveLocks; lck = lck->stats.next) {
    __kmp_add_stats(&total, lck);
  }
  kmp_adaptive_lock_statistics_t *t = &total;
  kmp_uint32 totalSections =
      t->nonSpeculativeAcquires + t->successfulSpeculations;
  kmp_uint32 totalSpeculations = t->successfulSpeculations +
                                 t->hardFailedSpeculations +
                                 t->softFailedSpeculations;
  if (totalSections <= 0)
    return;

  FILE *statsFile = __kmp_open_stats_file();

  fprintf(statsFile, "Speculative lock statistics (all approximate!)\n");
  fprintf(statsFile, " Lock parameters: \n"
                     " max_soft_retries : %10d\n"
                     " max_badness : %10d\n",
          __kmp_adaptive_backoff_params.max_soft_retries,
          __kmp_adaptive_backoff_params.max_badness);
  fprintf(statsFile, " Non-speculative acquire attempts : %10d\n",
          t->nonSpeculativeAcquireAttempts);
  fprintf(statsFile, " Total critical sections : %10d\n", totalSections);
  fprintf(statsFile, " Successful speculations : %10d (%5.1f%%)\n",
          t->successfulSpeculations,
          percent(t->successfulSpeculations, totalSections));
  fprintf(statsFile, " Non-speculative acquires : %10d (%5.1f%%)\n",
          t->nonSpeculativeAcquires,
          percent(t->nonSpeculativeAcquires, totalSections));
  fprintf(statsFile, " Lemming yields : %10d\n\n", t->lemmingYields);

  fprintf(statsFile, " Speculative acquire attempts : %10d\n",
          totalSpeculations);
  fprintf(statsFile, " Successes : %10d (%5.1f%%)\n",
          t->successfulSpeculations,
          percent(t->successfulSpeculations, totalSpeculations));
  fprintf(statsFile, " Soft failures : %10d (%5.1f%%)\n",
          t->softFailedSpeculations,
          percent(t->softFailedSpeculations, totalSpeculations));
  fprintf(statsFile, " Hard failures : %10d (%5.1f%%)\n",
          t->hardFailedSpeculations,
          percent(t->hardFailedSpeculations, totalSpeculations));

  if (statsFile != stdout)
    fclose(statsFile);
}

#define KMP_INC_STAT(lck, stat) (lck->lk.adaptive.stats.stat++)

#else // KMP_DEBUG_ADAPTIVE_LOCKS

// Statistics disabled: counting compiles away to nothing.
#define KMP_INC_STAT(lck, stat)

#endif // KMP_DEBUG_ADAPTIVE_LOCKS

// Return true if the underlying queuing lock currently looks free.
static inline bool __kmp_is_unlocked_queuing_lock(kmp_queuing_lock_t *lck) {
  // It is enough to check that the head_id is zero.
  // We don't also need to check the tail.
  bool res = lck->lk.head_id == 0;

// We need a fence here, since we must ensure that no memory operations
// from later in this thread float above that read.
#if KMP_COMPILER_ICC
  _mm_mfence();
#else
  __sync_synchronize();
#endif

  return res;
}

// Functions for manipulating the badness
static __inline void
__kmp_update_badness_after_success(kmp_adaptive_lock_t *lck) {
  // Reset the badness to zero so we eagerly try to speculate again
  lck->lk.adaptive.badness = 0;
  KMP_INC_STAT(lck, successfulSpeculations);
}

// Create a bit mask with one more set bit.
static __inline void __kmp_step_badness(kmp_adaptive_lock_t *lck) {
  kmp_uint32 newBadness = (lck->lk.adaptive.badness << 1) | 1;
  if (newBadness > lck->lk.adaptive.max_badness) {
    // Saturate: never exceed the configured maximum badness.
    return;
  } else {
    lck->lk.adaptive.badness = newBadness;
  }
}

// Check whether speculation should be attempted.
// Speculate when the low bits of the attempt counter selected by the badness
// mask are all zero, i.e. with frequency 1/(badness+1).
static __inline int __kmp_should_speculate(kmp_adaptive_lock_t *lck,
                                           kmp_int32 gtid) {
  kmp_uint32 badness = lck->lk.adaptive.badness;
  kmp_uint32 attempts = lck->lk.adaptive.acquire_attempts;
  int res = (attempts & badness) == 0;
  return res;
}

// Attempt to acquire only the speculative lock.
// Does not back off to the non-speculative lock.
// Try to enter the lock's critical section via an RTM transaction only.
// Returns 1 if a transaction was successfully started with the underlying
// queuing lock observed free (the caller is now executing speculatively),
// 0 after exhausting max_soft_retries soft aborts or on the first hard abort
// (in which case the badness is stepped up).
static int __kmp_test_adaptive_lock_only(kmp_adaptive_lock_t *lck,
                                         kmp_int32 gtid) {
  int retries = lck->lk.adaptive.max_soft_retries;

  // We don't explicitly count the start of speculation, rather we record the
  // results (success, hard fail, soft fail). The sum of all of those is the
  // total number of times we started speculation since all speculations must
  // end one of those ways.
  do {
    kmp_uint32 status = _xbegin();
    // Switch this in to disable actual speculation but exercise at least some
    // of the rest of the code. Useful for debugging...
    // kmp_uint32 status = _XABORT_NESTED;
    if (status == _XBEGIN_STARTED) {
      /* We have successfully started speculation. Check that no-one acquired
         the lock for real between when we last looked and now. This also gets
         the lock cache line into our read-set, which we need so that we'll
         abort if anyone later claims it for real. */
      if (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
        // Lock is now visibly acquired, so someone beat us to it. Abort the
        // transaction so we'll restart from _xbegin with the failure status.
        _xabort(0x01);
        KMP_ASSERT2(0, "should not get here");
      }
      return 1; // Lock has been acquired (speculatively)
    } else {
      // We have aborted, update the statistics
      if (status & SOFT_ABORT_MASK) {
        KMP_INC_STAT(lck, softFailedSpeculations);
        // and loop round to retry.
      } else {
        KMP_INC_STAT(lck, hardFailedSpeculations);
        // Give up if we had a hard failure.
        break;
      }
    }
  } while (retries--); // Loop while we have retries, and didn't fail hard.

  // Either we had a hard failure or we didn't succeed softly after
  // the full set of attempts, so back off the badness.
  __kmp_step_badness(lck);
  return 0;
}

// Attempt to acquire the speculative lock, or back off to the non-speculative
// one if the speculative lock cannot be acquired.
// We can succeed speculatively, non-speculatively, or fail.
// Non-blocking acquire of an adaptive lock: speculate first (if the badness
// heuristic allows), then make a single non-blocking attempt on the base
// queuing lock. Returns 1 on success (speculative or real), 0 otherwise.
static int __kmp_test_adaptive_lock(kmp_adaptive_lock_t *lck, kmp_int32 gtid) {
  // Speculative path first, when the heuristic says it is worth trying.
  if (__kmp_should_speculate(lck, gtid) &&
      __kmp_test_adaptive_lock_only(lck, gtid))
    return 1;

  // Speculation was skipped or failed; this now counts as a
  // non-speculative acquire attempt.
  lck->lk.adaptive.acquire_attempts++;

  // One shot at the underlying, non-speculative queuing lock.
  if (!__kmp_test_queuing_lock(GET_QLK_PTR(lck), gtid))
    return 0; // Already visibly locked by another thread.

  KMP_INC_STAT(lck, nonSpeculativeAcquires);
  return 1; // Acquired for real (non-speculatively).
}

// omp_test_lock entry point with consistency checks: verify initialization,
// delegate to __kmp_test_adaptive_lock, and record ownership on success.
static int __kmp_test_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
                                                kmp_int32 gtid) {
  char const *const func = "omp_test_lock";
  if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
    KMP_FATAL(LockIsUninitialized, func);
  }

  int acquired = __kmp_test_adaptive_lock(lck, gtid);
  if (acquired)
    lck->lk.qlk.owner_id = gtid + 1;

  return acquired;
}

// Block until we can acquire a speculative, adaptive lock. We check whether we
// should be trying to speculate. If we should be, we check the real lock to see
// if it is free, and, if not, pause without attempting to acquire it until it
// is. Then we try the speculative acquire. This means that although we suffer
// from lemmings a little (because all we can't acquire the lock speculatively
// until the queue of threads waiting has cleared), we don't get into a state
// where we can never acquire the lock speculatively (because we force the queue
// to clear by preventing new arrivals from entering the queue). This does mean
// that when we're trying to break lemmings, the lock is no longer fair. However
// OpenMP makes no guarantee that its locks are fair, so this isn't a real
// problem.
// Blocking acquire of an adaptive lock: try speculation while the underlying
// queuing lock looks free (yielding while it drains), then fall back to a
// real, blocking acquire of the queuing lock.
static void __kmp_acquire_adaptive_lock(kmp_adaptive_lock_t *lck,
                                        kmp_int32 gtid) {
  if (__kmp_should_speculate(lck, gtid)) {
    if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
      if (__kmp_test_adaptive_lock_only(lck, gtid))
        return;
      // We tried speculation and failed, so give up.
    } else {
      // We can't try speculation until the lock is free, so we pause here
      // (without suspending on the queueing lock, to allow it to drain, then
      // try again. All other threads will also see the same result for
      // shouldSpeculate, so will be doing the same if they try to claim the
      // lock from now on.
      while (!__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(lck))) {
        KMP_INC_STAT(lck, lemmingYields);
        KMP_YIELD(TRUE);
      }

      if (__kmp_test_adaptive_lock_only(lck, gtid))
        return;
    }
  }

  // Speculative acquisition failed, so acquire it non-speculatively.
  // Count the non-speculative acquire attempt
  lck->lk.adaptive.acquire_attempts++;

  __kmp_acquire_queuing_lock_timed_template<FALSE>(GET_QLK_PTR(lck), gtid);
  // We have acquired the base lock, so count that.
  KMP_INC_STAT(lck, nonSpeculativeAcquires);
  ANNOTATE_QUEUING_ACQUIRED(lck);
}

// omp_set_lock entry point with consistency checks.
static void __kmp_acquire_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
                                                    kmp_int32 gtid) {
  char const *const func = "omp_set_lock";
  if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == gtid) {
    KMP_FATAL(LockIsAlreadyOwned, func);
  }

  __kmp_acquire_adaptive_lock(lck, gtid);

  lck->lk.qlk.owner_id = gtid + 1;
}

// Release an adaptive lock: end the transaction if we were speculating
// (detected by the base lock still looking free), otherwise release the
// underlying queuing lock.
static int __kmp_release_adaptive_lock(kmp_adaptive_lock_t *lck,
                                       kmp_int32 gtid) {
  if (__kmp_is_unlocked_queuing_lock(GET_QLK_PTR(
          lck))) { // If the lock doesn't look claimed we must be speculating.
    // (Or the user's code is buggy and they're releasing without locking;
    // if we had XTEST we'd be able to check that case...)
    _xend(); // Exit speculation
    __kmp_update_badness_after_success(lck);
  } else { // Since the lock *is* visibly locked we're not speculating,
    // so should use the underlying lock's release scheme.
    __kmp_release_queuing_lock(GET_QLK_PTR(lck), gtid);
  }
  return KMP_LOCK_RELEASED;
}

// omp_unset_lock entry point with consistency checks.
static int __kmp_release_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck,
                                                   kmp_int32 gtid) {
  char const *const func = "omp_unset_lock";
  KMP_MB(); /* in case another processor initialized lock */
  if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) == -1) {
    KMP_FATAL(LockUnsettingFree, func);
  }
  if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != gtid) {
    KMP_FATAL(LockUnsettingSetByAnother, func);
  }
  lck->lk.qlk.owner_id = 0;
  __kmp_release_adaptive_lock(lck, gtid);
  return KMP_LOCK_RELEASED;
}

// Initialize both the base queuing lock and the speculation state
// (badness heuristic counters, retry limits, optional statistics).
static void __kmp_init_adaptive_lock(kmp_adaptive_lock_t *lck) {
  __kmp_init_queuing_lock(GET_QLK_PTR(lck));
  lck->lk.adaptive.badness = 0;
  lck->lk.adaptive.acquire_attempts = 0; // nonSpeculativeAcquireAttempts = 0;
  lck->lk.adaptive.max_soft_retries =
      __kmp_adaptive_backoff_params.max_soft_retries;
  lck->lk.adaptive.max_badness = __kmp_adaptive_backoff_params.max_badness;
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_zero_speculative_stats(&lck->lk.adaptive);
#endif
  KA_TRACE(1000, ("__kmp_init_adaptive_lock: lock %p initialized\n", lck));
}

// Destroy an adaptive lock; statistics (when enabled) are folded into the
// global totals first.
static void __kmp_destroy_adaptive_lock(kmp_adaptive_lock_t *lck) {
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_accumulate_speculative_stats(&lck->lk.adaptive);
#endif
  __kmp_destroy_queuing_lock(GET_QLK_PTR(lck));
  // Nothing needed for the speculative part.
}

// omp_destroy_lock entry point with consistency checks.
static void __kmp_destroy_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) {
  char const *const func = "omp_destroy_lock";
  if (lck->lk.qlk.initialized != GET_QLK_PTR(lck)) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_get_queuing_lock_owner(GET_QLK_PTR(lck)) != -1) {
    KMP_FATAL(LockStillOwned, func);
  }
  __kmp_destroy_adaptive_lock(lck);
}

#endif // KMP_USE_ADAPTIVE_LOCKS

/* ------------------------------------------------------------------------ */
/* DRDPA ticket locks */
/* "DRDPA" means Dynamically Reconfigurable Distributed Polling Area */

// owner_id is stored as gtid+1; -1 therefore means "unowned".
static kmp_int32 __kmp_get_drdpa_lock_owner(kmp_drdpa_lock_t *lck) {
  return lck->lk.owner_id - 1;
}

static inline bool __kmp_is_drdpa_lock_nestable(kmp_drdpa_lock_t *lck) {
  return lck->lk.depth_locked != -1;
}

// Core ticket-lock acquire: take a ticket, spin on the polling slot for that
// ticket, then (once inside the critical section) possibly reconfigure the
// polling area — shrink to 1 slot under oversubscription, or grow it to at
// least the number of waiters otherwise. The KMP_MB() fences order the
// polls/mask publication; do not reorder statements here.
__forceinline static int
__kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck,
                                        kmp_int32 gtid) {
  kmp_uint64 ticket = KMP_ATOMIC_INC(&lck->lk.next_ticket);
  kmp_uint64 mask = lck->lk.mask; // atomic load
  std::atomic<kmp_uint64> *polls = lck->lk.polls;

#ifdef USE_LOCK_PROFILE
  if (polls[ticket & mask] != ticket)
    __kmp_printf("LOCK CONTENTION: %p\n", lck);
/* else __kmp_printf( "." );*/
#endif /* USE_LOCK_PROFILE */

  // Now spin-wait, but reload the polls pointer and mask, in case the
  // polling area has been reconfigured. Unless it is reconfigured, the
  // reloads stay in L1 cache and are cheap.
  //
  // Keep this code in sync with KMP_WAIT, in kmp_dispatch.cpp !!!
  // The current implementation of KMP_WAIT doesn't allow for mask
  // and poll to be re-read every spin iteration.
  kmp_uint32 spins;
  KMP_FSYNC_PREPARE(lck);
  KMP_INIT_YIELD(spins);
  while (polls[ticket & mask] < ticket) { // atomic load
    KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
    // Re-read the mask and the poll pointer from the lock structure.
    //
    // Make certain that "mask" is read before "polls" !!!
    //
    // If another thread picks reconfigures the polling area and updates their
    // values, and we get the new value of mask and the old polls pointer, we
    // could access memory beyond the end of the old polling area.
    mask = lck->lk.mask; // atomic load
    polls = lck->lk.polls; // atomic load
  }

  // Critical section starts here
  KMP_FSYNC_ACQUIRED(lck);
  KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld acquired lock %p\n",
                  ticket, lck));
  lck->lk.now_serving = ticket; // non-volatile store

  // Deallocate a garbage polling area if we know that we are the last
  // thread that could possibly access it.
  //
  // The >= check is in case __kmp_test_drdpa_lock() allocated the cleanup
  // ticket.
  if ((lck->lk.old_polls != NULL) && (ticket >= lck->lk.cleanup_ticket)) {
    __kmp_free(lck->lk.old_polls);
    lck->lk.old_polls = NULL;
    lck->lk.cleanup_ticket = 0;
  }

  // Check to see if we should reconfigure the polling area.
  // If there is still a garbage polling area to be deallocated from a
  // previous reconfiguration, let a later thread reconfigure it.
  if (lck->lk.old_polls == NULL) {
    bool reconfigure = false;
    std::atomic<kmp_uint64> *old_polls = polls;
    kmp_uint32 num_polls = TCR_4(lck->lk.num_polls);

    if (TCR_4(__kmp_nth) >
        (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) {
      // We are in oversubscription mode. Contract the polling area
      // down to a single location, if that hasn't been done already.
      if (num_polls > 1) {
        reconfigure = true;
        num_polls = TCR_4(lck->lk.num_polls);
        mask = 0;
        num_polls = 1;
        polls = (std::atomic<kmp_uint64> *)__kmp_allocate(num_polls *
                                                          sizeof(*polls));
        polls[0] = ticket;
      }
    } else {
      // We are in under/fully subscribed mode. Check the number of
      // threads waiting on the lock. The size of the polling area
      // should be at least the number of threads waiting.
      kmp_uint64 num_waiting = TCR_8(lck->lk.next_ticket) - ticket - 1;
      if (num_waiting > num_polls) {
        kmp_uint32 old_num_polls = num_polls;
        reconfigure = true;
        // Double the size (and widen the mask) until it covers all waiters.
        do {
          mask = (mask << 1) | 1;
          num_polls *= 2;
        } while (num_polls <= num_waiting);

        // Allocate the new polling area, and copy the relevant portion
        // of the old polling area to the new area. __kmp_allocate()
        // zeroes the memory it allocates, and most of the old area is
        // just zero padding, so we only copy the release counters.
        polls = (std::atomic<kmp_uint64> *)__kmp_allocate(num_polls *
                                                          sizeof(*polls));
        kmp_uint32 i;
        for (i = 0; i < old_num_polls; i++) {
          polls[i].store(old_polls[i]);
        }
      }
    }

    if (reconfigure) {
      // Now write the updated fields back to the lock structure.
      //
      // Make certain that "polls" is written before "mask" !!!
      //
      // If another thread picks up the new value of mask and the old polls
      // pointer , it could access memory beyond the end of the old polling
      // area.
      //
      // On x86, we need memory fences.
      KA_TRACE(1000, ("__kmp_acquire_drdpa_lock: ticket #%lld reconfiguring "
                      "lock %p to %d polls\n",
                      ticket, lck, num_polls));

      lck->lk.old_polls = old_polls;
      lck->lk.polls = polls; // atomic store

      KMP_MB();

      lck->lk.num_polls = num_polls;
      lck->lk.mask = mask; // atomic store

      KMP_MB();

      // Only after the new polling area and mask have been flushed
      // to main memory can we update the cleanup ticket field.
      //
      // volatile load / non-volatile store
      lck->lk.cleanup_ticket = lck->lk.next_ticket;
    }
  }
  return KMP_LOCK_ACQUIRED_FIRST;
}

int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
  int retval = __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
  ANNOTATE_DRDPA_ACQUIRED(lck);
  return retval;
}

// omp_set_lock entry point with consistency checks.
static int __kmp_acquire_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
                                                kmp_int32 gtid) {
  char const *const func = "omp_set_lock";
  if (lck->lk.initialized != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_is_drdpa_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) == gtid)) {
    KMP_FATAL(LockIsAlreadyOwned, func);
  }

  __kmp_acquire_drdpa_lock(lck, gtid);

  lck->lk.owner_id = gtid + 1;
  return KMP_LOCK_ACQUIRED_FIRST;
}

// Non-blocking acquire: succeed only if our prospective ticket is already
// being served and we win the CAS on next_ticket.
int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
  // First get a ticket, then read the polls pointer and the mask.
  // The polls pointer must be read before the mask!!! (See above)
  kmp_uint64 ticket = lck->lk.next_ticket; // atomic load
  std::atomic<kmp_uint64> *polls = lck->lk.polls;
  kmp_uint64 mask = lck->lk.mask; // atomic load
  if (polls[ticket & mask] == ticket) {
    kmp_uint64 next_ticket = ticket + 1;
    if (__kmp_atomic_compare_store_acq(&lck->lk.next_ticket, ticket,
                                       next_ticket)) {
      KMP_FSYNC_ACQUIRED(lck);
      KA_TRACE(1000, ("__kmp_test_drdpa_lock: ticket #%lld acquired lock %p\n",
                      ticket, lck));
      lck->lk.now_serving = ticket; // non-volatile store

      // Since no threads are waiting, there is no possibility that we would
      // want to reconfigure the polling area. We might have the cleanup ticket
      // value (which says that it is now safe to deallocate old_polls), but
      // we'll let a later thread which calls __kmp_acquire_lock do that - this
      // routine isn't supposed to block, and we would risk blocks if we called
      // __kmp_free() to do the deallocation.
      return TRUE;
    }
  }
  return FALSE;
}

// omp_test_lock entry point with consistency checks.
static int __kmp_test_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
                                             kmp_int32 gtid) {
  char const *const func = "omp_test_lock";
  if (lck->lk.initialized != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_is_drdpa_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }

  int retval = __kmp_test_drdpa_lock(lck, gtid);

  if (retval) {
    lck->lk.owner_id = gtid + 1;
  }
  return retval;
}

// Release: publish now_serving+1 into its polling slot so the next waiter's
// spin loop observes its ticket and proceeds.
int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
  // Read the ticket value from the lock data struct, then the polls pointer and
  // the mask. The polls pointer must be read before the mask!!! (See above)
  kmp_uint64 ticket = lck->lk.now_serving + 1; // non-atomic load
  std::atomic<kmp_uint64> *polls = lck->lk.polls; // atomic load
  kmp_uint64 mask = lck->lk.mask; // atomic load
  KA_TRACE(1000, ("__kmp_release_drdpa_lock: ticket #%lld released lock %p\n",
                  ticket - 1, lck));
  KMP_FSYNC_RELEASING(lck);
  ANNOTATE_DRDPA_RELEASED(lck);
  polls[ticket & mask] = ticket; // atomic store
  return KMP_LOCK_RELEASED;
}

// omp_unset_lock entry point with consistency checks.
static int __kmp_release_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
                                                kmp_int32 gtid) {
  char const *const func = "omp_unset_lock";
  KMP_MB(); /* in case another processor initialized lock */
  if (lck->lk.initialized != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_is_drdpa_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  if (__kmp_get_drdpa_lock_owner(lck) == -1) {
    KMP_FATAL(LockUnsettingFree, func);
  }
  if ((gtid >= 0) && (__kmp_get_drdpa_lock_owner(lck) >= 0) &&
      (__kmp_get_drdpa_lock_owner(lck) != gtid)) {
    KMP_FATAL(LockUnsettingSetByAnother, func);
  }
  lck->lk.owner_id = 0;
  return __kmp_release_drdpa_lock(lck, gtid);
}

// Initialize a DRDPA lock with a single-slot polling area.
void __kmp_init_drdpa_lock(kmp_drdpa_lock_t *lck) {
  lck->lk.location = NULL;
  lck->lk.mask = 0;
  lck->lk.num_polls = 1;
  lck->lk.polls = (std::atomic<kmp_uint64> *)__kmp_allocate(
      lck->lk.num_polls * sizeof(*(lck->lk.polls)));
  lck->lk.cleanup_ticket = 0;
  lck->lk.old_polls = NULL;
  lck->lk.next_ticket = 0;
  lck->lk.now_serving = 0;
  lck->lk.owner_id = 0; // no thread owns the lock.
  lck->lk.depth_locked = -1; // >= 0 for nestable locks, -1 for simple locks.
  lck->lk.initialized = lck;

  KA_TRACE(1000, ("__kmp_init_drdpa_lock: lock %p initialized\n", lck));
}

// Free both the current and any garbage polling area, then reset all fields.
void __kmp_destroy_drdpa_lock(kmp_drdpa_lock_t *lck) {
  lck->lk.initialized = NULL;
  lck->lk.location = NULL;
  if (lck->lk.polls.load() != NULL) {
    __kmp_free(lck->lk.polls.load());
    lck->lk.polls = NULL;
  }
  if (lck->lk.old_polls != NULL) {
    __kmp_free(lck->lk.old_polls);
    lck->lk.old_polls = NULL;
  }
  lck->lk.mask = 0;
  lck->lk.num_polls = 0;
  lck->lk.cleanup_ticket = 0;
  lck->lk.next_ticket = 0;
  lck->lk.now_serving = 0;
  lck->lk.owner_id = 0;
  lck->lk.depth_locked = -1;
}

// omp_destroy_lock entry point with consistency checks.
static void __kmp_destroy_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
  char const *const func = "omp_destroy_lock";
  if (lck->lk.initialized != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (__kmp_is_drdpa_lock_nestable(lck)) {
    KMP_FATAL(LockNestableUsedAsSimple, func);
  }
  if (__kmp_get_drdpa_lock_owner(lck) != -1) {
    KMP_FATAL(LockStillOwned, func);
  }
  __kmp_destroy_drdpa_lock(lck);
}

// nested drdpa ticket locks

// Re-entrant acquire: bump the depth when already owned, otherwise do a
// full acquire and set depth/ownership.
int __kmp_acquire_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
  KMP_DEBUG_ASSERT(gtid >= 0);

  if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
    lck->lk.depth_locked += 1;
    return KMP_LOCK_ACQUIRED_NEXT;
  } else {
    __kmp_acquire_drdpa_lock_timed_template(lck, gtid);
    ANNOTATE_DRDPA_ACQUIRED(lck);
    KMP_MB();
    lck->lk.depth_locked = 1;
    KMP_MB();
    lck->lk.owner_id = gtid + 1;
    return KMP_LOCK_ACQUIRED_FIRST;
  }
}

// omp_set_nest_lock entry point with consistency checks.
static void __kmp_acquire_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
                                                        kmp_int32 gtid) {
  char const *const func = "omp_set_nest_lock";
  if (lck->lk.initialized != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (!__kmp_is_drdpa_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  __kmp_acquire_nested_drdpa_lock(lck, gtid);
}

// Non-blocking re-entrant acquire; returns new depth on success, 0 otherwise.
int __kmp_test_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
  int retval;

  KMP_DEBUG_ASSERT(gtid >= 0);

  if (__kmp_get_drdpa_lock_owner(lck) == gtid) {
    retval = ++lck->lk.depth_locked;
  } else if (!__kmp_test_drdpa_lock(lck, gtid)) {
    retval = 0;
  } else {
    KMP_MB();
    retval = lck->lk.depth_locked = 1;
    KMP_MB();
    lck->lk.owner_id = gtid + 1;
  }
  return retval;
}

// omp_test_nest_lock entry point with consistency checks.
static int __kmp_test_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
                                                    kmp_int32 gtid) {
  char const *const func = "omp_test_nest_lock";
  if (lck->lk.initialized != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (!__kmp_is_drdpa_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  return __kmp_test_nested_drdpa_lock(lck, gtid);
}

// Drop one nesting level; release the lock for real when depth reaches 0.
int __kmp_release_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
  KMP_DEBUG_ASSERT(gtid >= 0);

  KMP_MB();
  if (--(lck->lk.depth_locked) == 0) {
    KMP_MB();
    lck->lk.owner_id = 0;
    __kmp_release_drdpa_lock(lck, gtid);
    return KMP_LOCK_RELEASED;
  }
  return KMP_LOCK_STILL_HELD;
}

// omp_unset_nest_lock entry point with consistency checks.
static int __kmp_release_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck,
                                                       kmp_int32 gtid) {
  char const *const func = "omp_unset_nest_lock";
  KMP_MB(); /* in case another processor initialized lock */
  if (lck->lk.initialized != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (!__kmp_is_drdpa_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  if (__kmp_get_drdpa_lock_owner(lck) == -1) {
    KMP_FATAL(LockUnsettingFree, func);
  }
  if (__kmp_get_drdpa_lock_owner(lck) != gtid) {
    KMP_FATAL(LockUnsettingSetByAnother, func);
  }
  return __kmp_release_nested_drdpa_lock(lck, gtid);
}

void __kmp_init_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
  __kmp_init_drdpa_lock(lck);
  lck->lk.depth_locked = 0; // >= 0 for nestable locks, -1 for simple locks
}

void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck) {
  __kmp_destroy_drdpa_lock(lck);
  lck->lk.depth_locked = 0;
}

// omp_destroy_nest_lock entry point with consistency checks.
static void __kmp_destroy_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) {
  char const *const func = "omp_destroy_nest_lock";
  if (lck->lk.initialized != lck) {
    KMP_FATAL(LockIsUninitialized, func);
  }
  if (!__kmp_is_drdpa_lock_nestable(lck)) {
    KMP_FATAL(LockSimpleUsedAsNestable, func);
  }
  if (__kmp_get_drdpa_lock_owner(lck) != -1) {
    KMP_FATAL(LockStillOwned, func);
  }
  __kmp_destroy_nested_drdpa_lock(lck);
}

// access functions to fields which don't exist for all lock kinds.

static const ident_t *__kmp_get_drdpa_lock_location(kmp_drdpa_lock_t *lck) {
  return lck->lk.location;
}

static void __kmp_set_drdpa_lock_location(kmp_drdpa_lock_t *lck,
                                          const ident_t *loc) {
  lck->lk.location = loc;
}

static kmp_lock_flags_t __kmp_get_drdpa_lock_flags(kmp_drdpa_lock_t *lck) {
  return lck->lk.flags;
}

static void __kmp_set_drdpa_lock_flags(kmp_drdpa_lock_t *lck,
                                       kmp_lock_flags_t flags) {
  lck->lk.flags = flags;
}

// Time stamp counter
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
#define __kmp_tsc() __kmp_hardware_timestamp()
// Runtime's default backoff parameters
kmp_backoff_t __kmp_spin_backoff_params = {1, 4096, 100};
#else
// Use nanoseconds for other platforms
extern kmp_uint64 __kmp_now_nsec();
kmp_backoff_t __kmp_spin_backoff_params = {1, 256, 100};
#define __kmp_tsc() __kmp_now_nsec()
#endif

// A useful predicate for dealing with timestamps that may wrap.
// Is a before b? Since the timestamps may wrap, this is asking whether it's
// shorter to go clockwise from a to b around the clock-face, or anti-clockwise.
// Times where going clockwise is less distance than going anti-clockwise
// are in the future, others are in the past. e.g. a = MAX-1, b = MAX+1 (=0),
// then a > b (true) does not mean a reached b; whereas signed(a) = -2,
// signed(b) = 0 captures the actual difference
static inline bool before(kmp_uint64 a, kmp_uint64 b) {
  return ((kmp_int64)b - (kmp_int64)a) > 0;
}

// Truncated binary exponential backoff function
void __kmp_spin_backoff(kmp_backoff_t *boff) {
  // We could flatten this loop, but making it a nested loop gives better result
  kmp_uint32 i;
  for (i = boff->step; i > 0; i--) {
    // Busy-pause for one min_tick interval per remaining step.
    kmp_uint64 goal = __kmp_tsc() + boff->min_tick;
    do {
      KMP_CPU_PAUSE();
    } while (before(__kmp_tsc(), goal));
  }
  // Double (plus one) the step, truncated to max_backoff - 1.
  boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1);
}

#if KMP_USE_DYNAMIC_LOCK

// Direct lock initializers. It simply writes a tag to the low 8 bits of the
// lock word.
static void __kmp_init_direct_lock(kmp_dyna_lock_t *lck,
                                   kmp_dyna_lockseq_t seq) {
  TCW_4(*lck, KMP_GET_D_TAG(seq));
  KA_TRACE(
      20,
      ("__kmp_init_direct_lock: initialized direct lock with type#%d\n", seq));
}

#if KMP_USE_TSX

// HLE lock functions - imported from the testbed runtime.
#define HLE_ACQUIRE ".byte 0xf2;" #define HLE_RELEASE ".byte 0xf3;" static inline kmp_uint32 swap4(kmp_uint32 volatile *p, kmp_uint32 v) { __asm__ volatile(HLE_ACQUIRE "xchg %1,%0" : "+r"(v), "+m"(*p) : : "memory"); return v; } static void __kmp_destroy_hle_lock(kmp_dyna_lock_t *lck) { TCW_4(*lck, 0); } static void __kmp_destroy_hle_lock_with_checks(kmp_dyna_lock_t *lck) { TCW_4(*lck, 0); } static void __kmp_acquire_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) { // Use gtid for KMP_LOCK_BUSY if necessary if (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle)) { int delay = 1; do { while (*(kmp_uint32 volatile *)lck != KMP_LOCK_FREE(hle)) { for (int i = delay; i != 0; --i) KMP_CPU_PAUSE(); delay = ((delay << 1) | 1) & 7; } } while (swap4(lck, KMP_LOCK_BUSY(1, hle)) != KMP_LOCK_FREE(hle)); } } static void __kmp_acquire_hle_lock_with_checks(kmp_dyna_lock_t *lck, kmp_int32 gtid) { __kmp_acquire_hle_lock(lck, gtid); // TODO: add checks } static int __kmp_release_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) { __asm__ volatile(HLE_RELEASE "movl %1,%0" : "=m"(*lck) : "r"(KMP_LOCK_FREE(hle)) : "memory"); return KMP_LOCK_RELEASED; } static int __kmp_release_hle_lock_with_checks(kmp_dyna_lock_t *lck, kmp_int32 gtid) { return __kmp_release_hle_lock(lck, gtid); // TODO: add checks } static int __kmp_test_hle_lock(kmp_dyna_lock_t *lck, kmp_int32 gtid) { return swap4(lck, KMP_LOCK_BUSY(1, hle)) == KMP_LOCK_FREE(hle); } static int __kmp_test_hle_lock_with_checks(kmp_dyna_lock_t *lck, kmp_int32 gtid) { return __kmp_test_hle_lock(lck, gtid); // TODO: add checks } static void __kmp_init_rtm_lock(kmp_queuing_lock_t *lck) { __kmp_init_queuing_lock(lck); } static void __kmp_destroy_rtm_lock(kmp_queuing_lock_t *lck) { __kmp_destroy_queuing_lock(lck); } static void __kmp_destroy_rtm_lock_with_checks(kmp_queuing_lock_t *lck) { __kmp_destroy_queuing_lock_with_checks(lck); } static void __kmp_acquire_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { unsigned retries = 3, status; 
do { status = _xbegin(); if (status == _XBEGIN_STARTED) { if (__kmp_is_unlocked_queuing_lock(lck)) return; _xabort(0xff); } if ((status & _XABORT_EXPLICIT) && _XABORT_CODE(status) == 0xff) { // Wait until lock becomes free while (!__kmp_is_unlocked_queuing_lock(lck)) { KMP_YIELD(TRUE); } } else if (!(status & _XABORT_RETRY)) break; } while (retries--); // Fall-back non-speculative lock (xchg) __kmp_acquire_queuing_lock(lck, gtid); } static void __kmp_acquire_rtm_lock_with_checks(kmp_queuing_lock_t *lck, kmp_int32 gtid) { __kmp_acquire_rtm_lock(lck, gtid); } static int __kmp_release_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { if (__kmp_is_unlocked_queuing_lock(lck)) { // Releasing from speculation _xend(); } else { // Releasing from a real lock __kmp_release_queuing_lock(lck, gtid); } return KMP_LOCK_RELEASED; } static int __kmp_release_rtm_lock_with_checks(kmp_queuing_lock_t *lck, kmp_int32 gtid) { return __kmp_release_rtm_lock(lck, gtid); } static int __kmp_test_rtm_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid) { unsigned retries = 3, status; do { status = _xbegin(); if (status == _XBEGIN_STARTED && __kmp_is_unlocked_queuing_lock(lck)) { return 1; } if (!(status & _XABORT_RETRY)) break; } while (retries--); return (__kmp_is_unlocked_queuing_lock(lck)) ? 
1 : 0; } static int __kmp_test_rtm_lock_with_checks(kmp_queuing_lock_t *lck, kmp_int32 gtid) { return __kmp_test_rtm_lock(lck, gtid); } #endif // KMP_USE_TSX // Entry functions for indirect locks (first element of direct lock jump tables) static void __kmp_init_indirect_lock(kmp_dyna_lock_t *l, kmp_dyna_lockseq_t tag); static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock); static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32); static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32); static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32); static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock, kmp_int32); static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock, kmp_int32); static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock, kmp_int32); // Lock function definitions for the union parameter type #define KMP_FOREACH_LOCK_KIND(m, a) m(ticket, a) m(queuing, a) m(drdpa, a) #define expand1(lk, op) \ static void __kmp_##op##_##lk##_##lock(kmp_user_lock_p lock) { \ __kmp_##op##_##lk##_##lock(&lock->lk); \ } #define expand2(lk, op) \ static int __kmp_##op##_##lk##_##lock(kmp_user_lock_p lock, \ kmp_int32 gtid) { \ return __kmp_##op##_##lk##_##lock(&lock->lk, gtid); \ } #define expand3(lk, op) \ static void __kmp_set_##lk##_##lock_flags(kmp_user_lock_p lock, \ kmp_lock_flags_t flags) { \ __kmp_set_##lk##_lock_flags(&lock->lk, flags); \ } #define expand4(lk, op) \ static void __kmp_set_##lk##_##lock_location(kmp_user_lock_p lock, \ const ident_t *loc) { \ __kmp_set_##lk##_lock_location(&lock->lk, loc); \ } KMP_FOREACH_LOCK_KIND(expand1, init) KMP_FOREACH_LOCK_KIND(expand1, init_nested) KMP_FOREACH_LOCK_KIND(expand1, destroy) KMP_FOREACH_LOCK_KIND(expand1, destroy_nested) KMP_FOREACH_LOCK_KIND(expand2, acquire) KMP_FOREACH_LOCK_KIND(expand2, acquire_nested) KMP_FOREACH_LOCK_KIND(expand2, release) KMP_FOREACH_LOCK_KIND(expand2, release_nested) KMP_FOREACH_LOCK_KIND(expand2, test) 
KMP_FOREACH_LOCK_KIND(expand2, test_nested) KMP_FOREACH_LOCK_KIND(expand3, ) KMP_FOREACH_LOCK_KIND(expand4, ) #undef expand1 #undef expand2 #undef expand3 #undef expand4 // Jump tables for the indirect lock functions // Only fill in the odd entries, that avoids the need to shift out the low bit // init functions #define expand(l, op) 0, __kmp_init_direct_lock, void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t) = { __kmp_init_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, init)}; #undef expand // destroy functions #define expand(l, op) 0, (void (*)(kmp_dyna_lock_t *))__kmp_##op##_##l##_lock, static void (*direct_destroy[])(kmp_dyna_lock_t *) = { __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)}; #undef expand #define expand(l, op) \ 0, (void (*)(kmp_dyna_lock_t *))__kmp_destroy_##l##_lock_with_checks, static void (*direct_destroy_check[])(kmp_dyna_lock_t *) = { __kmp_destroy_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, destroy)}; #undef expand // set/acquire functions #define expand(l, op) \ 0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock, static int (*direct_set[])(kmp_dyna_lock_t *, kmp_int32) = { __kmp_set_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, acquire)}; #undef expand #define expand(l, op) \ 0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock_with_checks, static int (*direct_set_check[])(kmp_dyna_lock_t *, kmp_int32) = { __kmp_set_indirect_lock_with_checks, 0, KMP_FOREACH_D_LOCK(expand, acquire)}; #undef expand // unset/release and test functions #define expand(l, op) \ 0, (int (*)(kmp_dyna_lock_t *, kmp_int32))__kmp_##op##_##l##_lock, static int (*direct_unset[])(kmp_dyna_lock_t *, kmp_int32) = { __kmp_unset_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, release)}; static int (*direct_test[])(kmp_dyna_lock_t *, kmp_int32) = { __kmp_test_indirect_lock, 0, KMP_FOREACH_D_LOCK(expand, test)}; #undef expand #define expand(l, op) \ 0, (int (*)(kmp_dyna_lock_t *, 
kmp_int32))__kmp_##op##_##l##_lock_with_checks, static int (*direct_unset_check[])(kmp_dyna_lock_t *, kmp_int32) = { __kmp_unset_indirect_lock_with_checks, 0, KMP_FOREACH_D_LOCK(expand, release)}; static int (*direct_test_check[])(kmp_dyna_lock_t *, kmp_int32) = { __kmp_test_indirect_lock_with_checks, 0, KMP_FOREACH_D_LOCK(expand, test)}; #undef expand // Exposes only one set of jump tables (*lock or *lock_with_checks). -void (*(*__kmp_direct_destroy))(kmp_dyna_lock_t *) = 0; -int (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32) = 0; -int (*(*__kmp_direct_unset))(kmp_dyna_lock_t *, kmp_int32) = 0; -int (*(*__kmp_direct_test))(kmp_dyna_lock_t *, kmp_int32) = 0; +void (**__kmp_direct_destroy)(kmp_dyna_lock_t *) = 0; +int (**__kmp_direct_set)(kmp_dyna_lock_t *, kmp_int32) = 0; +int (**__kmp_direct_unset)(kmp_dyna_lock_t *, kmp_int32) = 0; +int (**__kmp_direct_test)(kmp_dyna_lock_t *, kmp_int32) = 0; // Jump tables for the indirect lock functions #define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock, void (*__kmp_indirect_init[])(kmp_user_lock_p) = { KMP_FOREACH_I_LOCK(expand, init)}; #undef expand #define expand(l, op) (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock, static void (*indirect_destroy[])(kmp_user_lock_p) = { KMP_FOREACH_I_LOCK(expand, destroy)}; #undef expand #define expand(l, op) \ (void (*)(kmp_user_lock_p)) __kmp_##op##_##l##_##lock_with_checks, static void (*indirect_destroy_check[])(kmp_user_lock_p) = { KMP_FOREACH_I_LOCK(expand, destroy)}; #undef expand // set/acquire functions #define expand(l, op) \ (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock, static int (*indirect_set[])(kmp_user_lock_p, kmp_int32) = {KMP_FOREACH_I_LOCK(expand, acquire)}; #undef expand #define expand(l, op) \ (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks, static int (*indirect_set_check[])(kmp_user_lock_p, kmp_int32) = { KMP_FOREACH_I_LOCK(expand, acquire)}; #undef expand // unset/release and test 
functions #define expand(l, op) \ (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock, static int (*indirect_unset[])(kmp_user_lock_p, kmp_int32) = { KMP_FOREACH_I_LOCK(expand, release)}; static int (*indirect_test[])(kmp_user_lock_p, kmp_int32) = {KMP_FOREACH_I_LOCK(expand, test)}; #undef expand #define expand(l, op) \ (int (*)(kmp_user_lock_p, kmp_int32)) __kmp_##op##_##l##_##lock_with_checks, static int (*indirect_unset_check[])(kmp_user_lock_p, kmp_int32) = { KMP_FOREACH_I_LOCK(expand, release)}; static int (*indirect_test_check[])(kmp_user_lock_p, kmp_int32) = { KMP_FOREACH_I_LOCK(expand, test)}; #undef expand // Exposes only one jump tables (*lock or *lock_with_checks). -void (*(*__kmp_indirect_destroy))(kmp_user_lock_p) = 0; -int (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32) = 0; -int (*(*__kmp_indirect_unset))(kmp_user_lock_p, kmp_int32) = 0; -int (*(*__kmp_indirect_test))(kmp_user_lock_p, kmp_int32) = 0; +void (**__kmp_indirect_destroy)(kmp_user_lock_p) = 0; +int (**__kmp_indirect_set)(kmp_user_lock_p, kmp_int32) = 0; +int (**__kmp_indirect_unset)(kmp_user_lock_p, kmp_int32) = 0; +int (**__kmp_indirect_test)(kmp_user_lock_p, kmp_int32) = 0; // Lock index table. kmp_indirect_lock_table_t __kmp_i_lock_table; // Size of indirect locks. static kmp_uint32 __kmp_indirect_lock_size[KMP_NUM_I_LOCKS] = {0}; // Jump tables for lock accessor/modifier. void (*__kmp_indirect_set_location[KMP_NUM_I_LOCKS])(kmp_user_lock_p, const ident_t *) = {0}; void (*__kmp_indirect_set_flags[KMP_NUM_I_LOCKS])(kmp_user_lock_p, kmp_lock_flags_t) = {0}; const ident_t *(*__kmp_indirect_get_location[KMP_NUM_I_LOCKS])( kmp_user_lock_p) = {0}; kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])( kmp_user_lock_p) = {0}; // Use different lock pools for different lock types. static kmp_indirect_lock_t *__kmp_indirect_lock_pool[KMP_NUM_I_LOCKS] = {0}; // User lock allocator for dynamically dispatched indirect locks. 
Every entry of
// the indirect lock table holds the address and type of the allocated indrect
// lock (kmp_indirect_lock_t), and the size of the table doubles when it is
// full. A destroyed indirect lock object is returned to the reusable pool of
// locks, unique to each lock type.
kmp_indirect_lock_t *__kmp_allocate_indirect_lock(void **user_lock,
                                                  kmp_int32 gtid,
                                                  kmp_indirect_locktag_t tag) {
  kmp_indirect_lock_t *lck;
  kmp_lock_index_t idx;

  // All table/pool manipulation happens under the global lock.
  __kmp_acquire_lock(&__kmp_global_lock, gtid);

  if (__kmp_indirect_lock_pool[tag] != NULL) {
    // Reuse the allocated and destroyed lock object
    lck = __kmp_indirect_lock_pool[tag];
    if (OMP_LOCK_T_SIZE < sizeof(void *))
      idx = lck->lock->pool.index;
    __kmp_indirect_lock_pool[tag] = (kmp_indirect_lock_t *)lck->lock->pool.next;
    KA_TRACE(20, ("__kmp_allocate_indirect_lock: reusing an existing lock %p\n",
                  lck));
  } else {
    idx = __kmp_i_lock_table.next;
    // Check capacity and double the size if it is full
    if (idx == __kmp_i_lock_table.size) {
      // Double up the space for block pointers
      int row = __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK;
      kmp_indirect_lock_t **new_table = (kmp_indirect_lock_t **)__kmp_allocate(
          2 * row * sizeof(kmp_indirect_lock_t *));
      KMP_MEMCPY(new_table, __kmp_i_lock_table.table,
                 row * sizeof(kmp_indirect_lock_t *));
      kmp_indirect_lock_t **old_table = __kmp_i_lock_table.table;
      __kmp_i_lock_table.table = new_table;
      __kmp_free(old_table);
      // Allocate new objects in the new blocks
      for (int i = row; i < 2 * row; ++i)
        *(__kmp_i_lock_table.table + i) = (kmp_indirect_lock_t *)__kmp_allocate(
            KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t));
      __kmp_i_lock_table.size = 2 * idx;
    }
    __kmp_i_lock_table.next++;
    lck = KMP_GET_I_LOCK(idx);
    // Allocate a new base lock object
    lck->lock = (kmp_user_lock_p)__kmp_allocate(__kmp_indirect_lock_size[tag]);
    KA_TRACE(20,
             ("__kmp_allocate_indirect_lock: allocated a new lock %p\n", lck));
  }

  __kmp_release_lock(&__kmp_global_lock, gtid);

  lck->type = tag;

  // Store either the table index (tagged by shifting left; the low bit
  // distinguishes indirect lock words) or the pointer itself, depending on
  // whether a pointer fits in omp_lock_t.
  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
    *((kmp_lock_index_t *)user_lock) = idx
                                       << 1; // indirect lock word must be even
  } else {
    *((kmp_indirect_lock_t **)user_lock) = lck;
  }

  return lck;
}

// User lock lookup for dynamically dispatched locks.
// With consistency checking on, validates the user lock word and reports
// fatal errors via KMP_FATAL; otherwise performs the raw lookup.
static __forceinline kmp_indirect_lock_t *
__kmp_lookup_indirect_lock(void **user_lock, const char *func) {
  if (__kmp_env_consistency_check) {
    kmp_indirect_lock_t *lck = NULL;
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
    if (OMP_LOCK_T_SIZE < sizeof(void *)) {
      kmp_lock_index_t idx = KMP_EXTRACT_I_INDEX(user_lock);
      if (idx >= __kmp_i_lock_table.size) {
        KMP_FATAL(LockIsUninitialized, func);
      }
      lck = KMP_GET_I_LOCK(idx);
    } else {
      lck = *((kmp_indirect_lock_t **)user_lock);
    }
    if (lck == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
    return lck;
  } else {
    if (OMP_LOCK_T_SIZE < sizeof(void *)) {
      return KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(user_lock));
    } else {
      return *((kmp_indirect_lock_t **)user_lock);
    }
  }
}

// Allocate and initialize an indirect lock for the requested sequence,
// downgrading adaptive/rtm sequences to queuing when the CPU lacks RTM.
static void __kmp_init_indirect_lock(kmp_dyna_lock_t *lock,
                                     kmp_dyna_lockseq_t seq) {
#if KMP_USE_ADAPTIVE_LOCKS
  if (seq == lockseq_adaptive && !__kmp_cpuinfo.rtm) {
    KMP_WARNING(AdaptiveNotSupported, "kmp_lockseq_t", "adaptive");
    seq = lockseq_queuing;
  }
#endif
#if KMP_USE_TSX
  if (seq == lockseq_rtm && !__kmp_cpuinfo.rtm) {
    seq = lockseq_queuing;
  }
#endif
  kmp_indirect_locktag_t tag = KMP_GET_I_TAG(seq);
  kmp_indirect_lock_t *l =
      __kmp_allocate_indirect_lock((void **)lock, __kmp_entry_gtid(), tag);
  KMP_I_LOCK_FUNC(l, init)(l->lock);
  KA_TRACE(
      20, ("__kmp_init_indirect_lock: initialized indirect lock with type#%d\n",
           seq));
}

// Destroy an indirect lock and return it to the per-type reusable pool.
static void __kmp_destroy_indirect_lock(kmp_dyna_lock_t *lock) {
  kmp_uint32 gtid = __kmp_entry_gtid();
  kmp_indirect_lock_t *l =
      __kmp_lookup_indirect_lock((void **)lock, "omp_destroy_lock");
  KMP_I_LOCK_FUNC(l, destroy)(l->lock);
  kmp_indirect_locktag_t tag = l->type;

  __kmp_acquire_lock(&__kmp_global_lock, gtid);

  // Use the base lock's space to keep the pool chain.
  l->lock->pool.next = (kmp_user_lock_p)__kmp_indirect_lock_pool[tag];
  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
    l->lock->pool.index = KMP_EXTRACT_I_INDEX(lock);
  }
  __kmp_indirect_lock_pool[tag] = l;

  __kmp_release_lock(&__kmp_global_lock, gtid);
}

static int __kmp_set_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
  kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
  return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
}

static int __kmp_unset_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
  kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
  return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
}

static int __kmp_test_indirect_lock(kmp_dyna_lock_t *lock, kmp_int32 gtid) {
  kmp_indirect_lock_t *l = KMP_LOOKUP_I_LOCK(lock);
  return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
}

static int __kmp_set_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
                                               kmp_int32 gtid) {
  kmp_indirect_lock_t *l =
      __kmp_lookup_indirect_lock((void **)lock, "omp_set_lock");
  return KMP_I_LOCK_FUNC(l, set)(l->lock, gtid);
}

static int __kmp_unset_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
                                                 kmp_int32 gtid) {
  kmp_indirect_lock_t *l =
      __kmp_lookup_indirect_lock((void **)lock, "omp_unset_lock");
  return KMP_I_LOCK_FUNC(l, unset)(l->lock, gtid);
}

static int __kmp_test_indirect_lock_with_checks(kmp_dyna_lock_t *lock,
                                                kmp_int32 gtid) {
  kmp_indirect_lock_t *l =
      __kmp_lookup_indirect_lock((void **)lock, "omp_test_lock");
  return KMP_I_LOCK_FUNC(l, test)(l->lock, gtid);
}

kmp_dyna_lockseq_t __kmp_user_lock_seq = lockseq_queuing;

// This is used only in kmp_error.cpp when consistency checking is on.
// Return the owning gtid of a user lock given its lock sequence; 0 for
// unknown sequences.
kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p lck, kmp_uint32 seq) {
  switch (seq) {
  case lockseq_tas:
  case lockseq_nested_tas:
    return __kmp_get_tas_lock_owner((kmp_tas_lock_t *)lck);
#if KMP_USE_FUTEX
  case lockseq_futex:
  case lockseq_nested_futex:
    return __kmp_get_futex_lock_owner((kmp_futex_lock_t *)lck);
#endif
  case lockseq_ticket:
  case lockseq_nested_ticket:
    return __kmp_get_ticket_lock_owner((kmp_ticket_lock_t *)lck);
  case lockseq_queuing:
  case lockseq_nested_queuing:
#if KMP_USE_ADAPTIVE_LOCKS
  case lockseq_adaptive:
#endif
    return __kmp_get_queuing_lock_owner((kmp_queuing_lock_t *)lck);
  case lockseq_drdpa:
  case lockseq_nested_drdpa:
    return __kmp_get_drdpa_lock_owner((kmp_drdpa_lock_t *)lck);
  default:
    return 0;
  }
}

// Initializes data for dynamic user locks.
void __kmp_init_dynamic_user_locks() {
  // Initialize jump table for the lock functions
  if (__kmp_env_consistency_check) {
    __kmp_direct_set = direct_set_check;
    __kmp_direct_unset = direct_unset_check;
    __kmp_direct_test = direct_test_check;
    __kmp_direct_destroy = direct_destroy_check;
    __kmp_indirect_set = indirect_set_check;
    __kmp_indirect_unset = indirect_unset_check;
    __kmp_indirect_test = indirect_test_check;
    __kmp_indirect_destroy = indirect_destroy_check;
  } else {
    __kmp_direct_set = direct_set;
    __kmp_direct_unset = direct_unset;
    __kmp_direct_test = direct_test;
    __kmp_direct_destroy = direct_destroy;
    __kmp_indirect_set = indirect_set;
    __kmp_indirect_unset = indirect_unset;
    __kmp_indirect_test = indirect_test;
    __kmp_indirect_destroy = indirect_destroy;
  }
  // If the user locks have already been initialized, then return. Allow the
  // switch between different KMP_CONSISTENCY_CHECK values, but do not allocate
  // new lock tables if they have already been allocated.
if (__kmp_init_user_locks) return; // Initialize lock index table __kmp_i_lock_table.size = KMP_I_LOCK_CHUNK; __kmp_i_lock_table.table = (kmp_indirect_lock_t **)__kmp_allocate(sizeof(kmp_indirect_lock_t *)); *(__kmp_i_lock_table.table) = (kmp_indirect_lock_t *)__kmp_allocate( KMP_I_LOCK_CHUNK * sizeof(kmp_indirect_lock_t)); __kmp_i_lock_table.next = 0; // Indirect lock size __kmp_indirect_lock_size[locktag_ticket] = sizeof(kmp_ticket_lock_t); __kmp_indirect_lock_size[locktag_queuing] = sizeof(kmp_queuing_lock_t); #if KMP_USE_ADAPTIVE_LOCKS __kmp_indirect_lock_size[locktag_adaptive] = sizeof(kmp_adaptive_lock_t); #endif __kmp_indirect_lock_size[locktag_drdpa] = sizeof(kmp_drdpa_lock_t); #if KMP_USE_TSX __kmp_indirect_lock_size[locktag_rtm] = sizeof(kmp_queuing_lock_t); #endif __kmp_indirect_lock_size[locktag_nested_tas] = sizeof(kmp_tas_lock_t); #if KMP_USE_FUTEX __kmp_indirect_lock_size[locktag_nested_futex] = sizeof(kmp_futex_lock_t); #endif __kmp_indirect_lock_size[locktag_nested_ticket] = sizeof(kmp_ticket_lock_t); __kmp_indirect_lock_size[locktag_nested_queuing] = sizeof(kmp_queuing_lock_t); __kmp_indirect_lock_size[locktag_nested_drdpa] = sizeof(kmp_drdpa_lock_t); // Initialize lock accessor/modifier #define fill_jumps(table, expand, sep) \ { \ table[locktag##sep##ticket] = expand(ticket); \ table[locktag##sep##queuing] = expand(queuing); \ table[locktag##sep##drdpa] = expand(drdpa); \ } #if KMP_USE_ADAPTIVE_LOCKS #define fill_table(table, expand) \ { \ fill_jumps(table, expand, _); \ table[locktag_adaptive] = expand(queuing); \ fill_jumps(table, expand, _nested_); \ } #else #define fill_table(table, expand) \ { \ fill_jumps(table, expand, _); \ fill_jumps(table, expand, _nested_); \ } #endif // KMP_USE_ADAPTIVE_LOCKS #define expand(l) \ (void (*)(kmp_user_lock_p, const ident_t *)) __kmp_set_##l##_lock_location fill_table(__kmp_indirect_set_location, expand); #undef expand #define expand(l) \ (void (*)(kmp_user_lock_p, kmp_lock_flags_t)) 
__kmp_set_##l##_lock_flags fill_table(__kmp_indirect_set_flags, expand); #undef expand #define expand(l) \ (const ident_t *(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_location fill_table(__kmp_indirect_get_location, expand); #undef expand #define expand(l) \ (kmp_lock_flags_t(*)(kmp_user_lock_p)) __kmp_get_##l##_lock_flags fill_table(__kmp_indirect_get_flags, expand); #undef expand __kmp_init_user_locks = TRUE; } // Clean up the lock table. void __kmp_cleanup_indirect_user_locks() { kmp_lock_index_t i; int k; // Clean up locks in the pools first (they were already destroyed before going // into the pools). for (k = 0; k < KMP_NUM_I_LOCKS; ++k) { kmp_indirect_lock_t *l = __kmp_indirect_lock_pool[k]; while (l != NULL) { kmp_indirect_lock_t *ll = l; l = (kmp_indirect_lock_t *)l->lock->pool.next; KA_TRACE(20, ("__kmp_cleanup_indirect_user_locks: freeing %p from pool\n", ll)); __kmp_free(ll->lock); ll->lock = NULL; } __kmp_indirect_lock_pool[k] = NULL; } // Clean up the remaining undestroyed locks. for (i = 0; i < __kmp_i_lock_table.next; i++) { kmp_indirect_lock_t *l = KMP_GET_I_LOCK(i); if (l->lock != NULL) { // Locks not destroyed explicitly need to be destroyed here. 
KMP_I_LOCK_FUNC(l, destroy)(l->lock); KA_TRACE( 20, ("__kmp_cleanup_indirect_user_locks: destroy/freeing %p from table\n", l)); __kmp_free(l->lock); } } // Free the table for (i = 0; i < __kmp_i_lock_table.size / KMP_I_LOCK_CHUNK; i++) __kmp_free(__kmp_i_lock_table.table[i]); __kmp_free(__kmp_i_lock_table.table); __kmp_init_user_locks = FALSE; } enum kmp_lock_kind __kmp_user_lock_kind = lk_default; int __kmp_num_locks_in_block = 1; // FIXME - tune this value #else // KMP_USE_DYNAMIC_LOCK static void __kmp_init_tas_lock_with_checks(kmp_tas_lock_t *lck) { __kmp_init_tas_lock(lck); } static void __kmp_init_nested_tas_lock_with_checks(kmp_tas_lock_t *lck) { __kmp_init_nested_tas_lock(lck); } #if KMP_USE_FUTEX static void __kmp_init_futex_lock_with_checks(kmp_futex_lock_t *lck) { __kmp_init_futex_lock(lck); } static void __kmp_init_nested_futex_lock_with_checks(kmp_futex_lock_t *lck) { __kmp_init_nested_futex_lock(lck); } #endif static int __kmp_is_ticket_lock_initialized(kmp_ticket_lock_t *lck) { return lck == lck->lk.self; } static void __kmp_init_ticket_lock_with_checks(kmp_ticket_lock_t *lck) { __kmp_init_ticket_lock(lck); } static void __kmp_init_nested_ticket_lock_with_checks(kmp_ticket_lock_t *lck) { __kmp_init_nested_ticket_lock(lck); } static int __kmp_is_queuing_lock_initialized(kmp_queuing_lock_t *lck) { return lck == lck->lk.initialized; } static void __kmp_init_queuing_lock_with_checks(kmp_queuing_lock_t *lck) { __kmp_init_queuing_lock(lck); } static void __kmp_init_nested_queuing_lock_with_checks(kmp_queuing_lock_t *lck) { __kmp_init_nested_queuing_lock(lck); } #if KMP_USE_ADAPTIVE_LOCKS static void __kmp_init_adaptive_lock_with_checks(kmp_adaptive_lock_t *lck) { __kmp_init_adaptive_lock(lck); } #endif static int __kmp_is_drdpa_lock_initialized(kmp_drdpa_lock_t *lck) { return lck == lck->lk.initialized; } static void __kmp_init_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) { __kmp_init_drdpa_lock(lck); } static void 
__kmp_init_nested_drdpa_lock_with_checks(kmp_drdpa_lock_t *lck) { __kmp_init_nested_drdpa_lock(lck); } /* user locks * They are implemented as a table of function pointers which are set to the * lock functions of the appropriate kind, once that has been determined. */ enum kmp_lock_kind __kmp_user_lock_kind = lk_default; size_t __kmp_base_user_lock_size = 0; size_t __kmp_user_lock_size = 0; kmp_int32 (*__kmp_get_user_lock_owner_)(kmp_user_lock_p lck) = NULL; int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid) = NULL; int (*__kmp_test_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid) = NULL; int (*__kmp_release_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid) = NULL; void (*__kmp_init_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL; void (*__kmp_destroy_user_lock_)(kmp_user_lock_p lck) = NULL; void (*__kmp_destroy_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL; int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid) = NULL; int (*__kmp_test_nested_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid) = NULL; int (*__kmp_release_nested_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid) = NULL; void (*__kmp_init_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL; void (*__kmp_destroy_nested_user_lock_with_checks_)(kmp_user_lock_p lck) = NULL; int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck) = NULL; const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck) = NULL; void (*__kmp_set_user_lock_location_)(kmp_user_lock_p lck, const ident_t *loc) = NULL; kmp_lock_flags_t (*__kmp_get_user_lock_flags_)(kmp_user_lock_p lck) = NULL; void (*__kmp_set_user_lock_flags_)(kmp_user_lock_p lck, kmp_lock_flags_t flags) = NULL; void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind) { switch (user_lock_kind) { case lk_default: default: KMP_ASSERT(0); case lk_tas: { __kmp_base_user_lock_size = sizeof(kmp_base_tas_lock_t); __kmp_user_lock_size 
= sizeof(kmp_tas_lock_t); __kmp_get_user_lock_owner_ = (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_tas_lock_owner); if (__kmp_env_consistency_check) { KMP_BIND_USER_LOCK_WITH_CHECKS(tas); KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(tas); } else { KMP_BIND_USER_LOCK(tas); KMP_BIND_NESTED_USER_LOCK(tas); } __kmp_destroy_user_lock_ = (void (*)(kmp_user_lock_p))(&__kmp_destroy_tas_lock); __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL; __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL; __kmp_set_user_lock_location_ = (void (*)(kmp_user_lock_p, const ident_t *))NULL; __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL; __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL; } break; #if KMP_USE_FUTEX case lk_futex: { __kmp_base_user_lock_size = sizeof(kmp_base_futex_lock_t); __kmp_user_lock_size = sizeof(kmp_futex_lock_t); __kmp_get_user_lock_owner_ = (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_futex_lock_owner); if (__kmp_env_consistency_check) { KMP_BIND_USER_LOCK_WITH_CHECKS(futex); KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(futex); } else { KMP_BIND_USER_LOCK(futex); KMP_BIND_NESTED_USER_LOCK(futex); } __kmp_destroy_user_lock_ = (void (*)(kmp_user_lock_p))(&__kmp_destroy_futex_lock); __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))NULL; __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))NULL; __kmp_set_user_lock_location_ = (void (*)(kmp_user_lock_p, const ident_t *))NULL; __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))NULL; __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))NULL; } break; #endif // KMP_USE_FUTEX case lk_ticket: { __kmp_base_user_lock_size = sizeof(kmp_base_ticket_lock_t); __kmp_user_lock_size = sizeof(kmp_ticket_lock_t); __kmp_get_user_lock_owner_ = (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_owner); if (__kmp_env_consistency_check) { KMP_BIND_USER_LOCK_WITH_CHECKS(ticket); 
KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(ticket); } else { KMP_BIND_USER_LOCK(ticket); KMP_BIND_NESTED_USER_LOCK(ticket); } __kmp_destroy_user_lock_ = (void (*)(kmp_user_lock_p))(&__kmp_destroy_ticket_lock); __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))(&__kmp_is_ticket_lock_initialized); __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_location); __kmp_set_user_lock_location_ = (void (*)( kmp_user_lock_p, const ident_t *))(&__kmp_set_ticket_lock_location); __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_ticket_lock_flags); __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))( &__kmp_set_ticket_lock_flags); } break; case lk_queuing: { __kmp_base_user_lock_size = sizeof(kmp_base_queuing_lock_t); __kmp_user_lock_size = sizeof(kmp_queuing_lock_t); __kmp_get_user_lock_owner_ = (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner); if (__kmp_env_consistency_check) { KMP_BIND_USER_LOCK_WITH_CHECKS(queuing); KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(queuing); } else { KMP_BIND_USER_LOCK(queuing); KMP_BIND_NESTED_USER_LOCK(queuing); } __kmp_destroy_user_lock_ = (void (*)(kmp_user_lock_p))(&__kmp_destroy_queuing_lock); __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized); __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location); __kmp_set_user_lock_location_ = (void (*)( kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location); __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags); __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))( &__kmp_set_queuing_lock_flags); } break; #if KMP_USE_ADAPTIVE_LOCKS case lk_adaptive: { __kmp_base_user_lock_size = sizeof(kmp_base_adaptive_lock_t); __kmp_user_lock_size = sizeof(kmp_adaptive_lock_t); __kmp_get_user_lock_owner_ = 
(kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_owner); if (__kmp_env_consistency_check) { KMP_BIND_USER_LOCK_WITH_CHECKS(adaptive); } else { KMP_BIND_USER_LOCK(adaptive); } __kmp_destroy_user_lock_ = (void (*)(kmp_user_lock_p))(&__kmp_destroy_adaptive_lock); __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))(&__kmp_is_queuing_lock_initialized); __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_location); __kmp_set_user_lock_location_ = (void (*)( kmp_user_lock_p, const ident_t *))(&__kmp_set_queuing_lock_location); __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_queuing_lock_flags); __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))( &__kmp_set_queuing_lock_flags); } break; #endif // KMP_USE_ADAPTIVE_LOCKS case lk_drdpa: { __kmp_base_user_lock_size = sizeof(kmp_base_drdpa_lock_t); __kmp_user_lock_size = sizeof(kmp_drdpa_lock_t); __kmp_get_user_lock_owner_ = (kmp_int32(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_owner); if (__kmp_env_consistency_check) { KMP_BIND_USER_LOCK_WITH_CHECKS(drdpa); KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(drdpa); } else { KMP_BIND_USER_LOCK(drdpa); KMP_BIND_NESTED_USER_LOCK(drdpa); } __kmp_destroy_user_lock_ = (void (*)(kmp_user_lock_p))(&__kmp_destroy_drdpa_lock); __kmp_is_user_lock_initialized_ = (int (*)(kmp_user_lock_p))(&__kmp_is_drdpa_lock_initialized); __kmp_get_user_lock_location_ = (const ident_t *(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_location); __kmp_set_user_lock_location_ = (void (*)( kmp_user_lock_p, const ident_t *))(&__kmp_set_drdpa_lock_location); __kmp_get_user_lock_flags_ = (kmp_lock_flags_t(*)(kmp_user_lock_p))(&__kmp_get_drdpa_lock_flags); __kmp_set_user_lock_flags_ = (void (*)(kmp_user_lock_p, kmp_lock_flags_t))( &__kmp_set_drdpa_lock_flags); } break; } } // ---------------------------------------------------------------------------- // User lock table & lock allocation kmp_lock_table_t 
__kmp_user_lock_table = {1, 0, NULL}; kmp_user_lock_p __kmp_lock_pool = NULL; // Lock block-allocation support. kmp_block_of_locks *__kmp_lock_blocks = NULL; int __kmp_num_locks_in_block = 1; // FIXME - tune this value static kmp_lock_index_t __kmp_lock_table_insert(kmp_user_lock_p lck) { // Assume that kmp_global_lock is held upon entry/exit. kmp_lock_index_t index; if (__kmp_user_lock_table.used >= __kmp_user_lock_table.allocated) { kmp_lock_index_t size; kmp_user_lock_p *table; // Reallocate lock table. if (__kmp_user_lock_table.allocated == 0) { size = 1024; } else { size = __kmp_user_lock_table.allocated * 2; } table = (kmp_user_lock_p *)__kmp_allocate(sizeof(kmp_user_lock_p) * size); KMP_MEMCPY(table + 1, __kmp_user_lock_table.table + 1, sizeof(kmp_user_lock_p) * (__kmp_user_lock_table.used - 1)); table[0] = (kmp_user_lock_p)__kmp_user_lock_table.table; // We cannot free the previous table now, since it may be in use by other // threads. So save the pointer to the previous table in in the first // element of the new table. All the tables will be organized into a list, // and could be freed when library shutting down. __kmp_user_lock_table.table = table; __kmp_user_lock_table.allocated = size; } KMP_DEBUG_ASSERT(__kmp_user_lock_table.used < __kmp_user_lock_table.allocated); index = __kmp_user_lock_table.used; __kmp_user_lock_table.table[index] = lck; ++__kmp_user_lock_table.used; return index; } static kmp_user_lock_p __kmp_lock_block_allocate() { // Assume that kmp_global_lock is held upon entry/exit. static int last_index = 0; if ((last_index >= __kmp_num_locks_in_block) || (__kmp_lock_blocks == NULL)) { // Restart the index. last_index = 0; // Need to allocate a new block. KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0); size_t space_for_locks = __kmp_user_lock_size * __kmp_num_locks_in_block; char *buffer = (char *)__kmp_allocate(space_for_locks + sizeof(kmp_block_of_locks)); // Set up the new block. 
kmp_block_of_locks *new_block = (kmp_block_of_locks *)(&buffer[space_for_locks]); new_block->next_block = __kmp_lock_blocks; new_block->locks = (void *)buffer; // Publish the new block. KMP_MB(); __kmp_lock_blocks = new_block; } kmp_user_lock_p ret = (kmp_user_lock_p)(&( ((char *)(__kmp_lock_blocks->locks))[last_index * __kmp_user_lock_size])); last_index++; return ret; } // Get memory for a lock. It may be freshly allocated memory or reused memory // from lock pool. kmp_user_lock_p __kmp_user_lock_allocate(void **user_lock, kmp_int32 gtid, kmp_lock_flags_t flags) { kmp_user_lock_p lck; kmp_lock_index_t index; KMP_DEBUG_ASSERT(user_lock); __kmp_acquire_lock(&__kmp_global_lock, gtid); if (__kmp_lock_pool == NULL) { // Lock pool is empty. Allocate new memory. // ANNOTATION: Found no good way to express the syncronisation // between allocation and usage, so ignore the allocation ANNOTATE_IGNORE_WRITES_BEGIN(); if (__kmp_num_locks_in_block <= 1) { // Tune this cutoff point. lck = (kmp_user_lock_p)__kmp_allocate(__kmp_user_lock_size); } else { lck = __kmp_lock_block_allocate(); } ANNOTATE_IGNORE_WRITES_END(); // Insert lock in the table so that it can be freed in __kmp_cleanup, // and debugger has info on all allocated locks. index = __kmp_lock_table_insert(lck); } else { // Pick up lock from pool. lck = __kmp_lock_pool; index = __kmp_lock_pool->pool.index; __kmp_lock_pool = __kmp_lock_pool->pool.next; } // We could potentially differentiate between nested and regular locks // here, and do the lock table lookup for regular locks only. if (OMP_LOCK_T_SIZE < sizeof(void *)) { *((kmp_lock_index_t *)user_lock) = index; } else { *((kmp_user_lock_p *)user_lock) = lck; } // mark the lock if it is critical section lock. __kmp_set_user_lock_flags(lck, flags); __kmp_release_lock(&__kmp_global_lock, gtid); // AC: TODO move this line upper return lck; } // Put lock's memory to pool for reusing. 
// Return lck to the head of the free pool. If the user-visible handle holds a
// table index (omp_lock_t smaller than a pointer), remember that index in the
// pool node so the lock can be handed out again with the same identity.
// Serialized on __kmp_global_lock.
void __kmp_user_lock_free(void **user_lock, kmp_int32 gtid,
                          kmp_user_lock_p lck) {
  KMP_DEBUG_ASSERT(user_lock != NULL);
  KMP_DEBUG_ASSERT(lck != NULL);

  __kmp_acquire_lock(&__kmp_global_lock, gtid);

  lck->pool.next = __kmp_lock_pool;
  __kmp_lock_pool = lck;
  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
    kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
    KMP_DEBUG_ASSERT(0 < index && index <= __kmp_user_lock_table.used);
    lck->pool.index = index;
  }

  __kmp_release_lock(&__kmp_global_lock, gtid);
}

// Translate a user-visible lock handle into the internal lock object:
// either an index into __kmp_user_lock_table (small omp_lock_t) or a direct
// pointer. `func` is the OpenMP API name used in consistency-check fatals.
kmp_user_lock_p __kmp_lookup_user_lock(void **user_lock, char const *func) {
  kmp_user_lock_p lck = NULL;
  if (__kmp_env_consistency_check) {
    if (user_lock == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  if (OMP_LOCK_T_SIZE < sizeof(void *)) {
    kmp_lock_index_t index = *((kmp_lock_index_t *)user_lock);
    if (__kmp_env_consistency_check) {
      if (!(0 < index && index < __kmp_user_lock_table.used)) {
        KMP_FATAL(LockIsUninitialized, func);
      }
    }
    KMP_DEBUG_ASSERT(0 < index && index < __kmp_user_lock_table.used);
    KMP_DEBUG_ASSERT(__kmp_user_lock_size > 0);
    lck = __kmp_user_lock_table.table[index];
  } else {
    lck = *((kmp_user_lock_p *)user_lock);
  }

  if (__kmp_env_consistency_check) {
    if (lck == NULL) {
      KMP_FATAL(LockIsUninitialized, func);
    }
  }

  return lck;
}

// Library-shutdown teardown for user locks: destroy every still-initialized
// lock in the table (warning about undisposed non-critical locks when
// consistency checking is on), free per-lock memory when block allocation is
// not in use, then free the chained lock tables and the blocks of locks.
void __kmp_cleanup_user_locks(void) {
  // Reset lock pool. Don't worry about lock in the pool--we will free them when
  // iterating through lock table (it includes all the locks, dead or alive).
  __kmp_lock_pool = NULL;

#define IS_CRITICAL(lck)                                                       \
  ((__kmp_get_user_lock_flags_ != NULL) &&                                     \
   ((*__kmp_get_user_lock_flags_)(lck)&kmp_lf_critical_section))

  // Loop through lock table, free all locks.
  // Do not free item [0], it is reserved for lock tables list.
  //
  // FIXME - we are iterating through a list of (pointers to) objects of type
  // union kmp_user_lock, but we have no way of knowing whether the base type is
  // currently "pool" or whatever the global user lock type is.
  //
  // We are relying on the fact that for all of the user lock types
  // (except "tas"), the first field in the lock struct is the "initialized"
  // field, which is set to the address of the lock object itself when
  // the lock is initialized. When the union is of type "pool", the
  // first field is a pointer to the next object in the free list, which
  // will not be the same address as the object itself.
  //
  // This means that the check (*__kmp_is_user_lock_initialized_)(lck) will fail
  // for "pool" objects on the free list. This must happen as the "location"
  // field of real user locks overlaps the "index" field of "pool" objects.
  //
  // It would be better to run through the free list, and remove all "pool"
  // objects from the lock table before executing this loop. However,
  // "pool" objects do not always have their index field set (only on
  // lin_32e), and I don't want to search the lock table for the address
  // of every "pool" object on the free list.
  while (__kmp_user_lock_table.used > 1) {
    const ident *loc;
    // reduce __kmp_user_lock_table.used before freeing the lock,
    // so that state of locks is consistent
    kmp_user_lock_p lck =
        __kmp_user_lock_table.table[--__kmp_user_lock_table.used];

    if ((__kmp_is_user_lock_initialized_ != NULL) &&
        (*__kmp_is_user_lock_initialized_)(lck)) {
      // Issue a warning if: KMP_CONSISTENCY_CHECK AND lock is initialized AND
      // it is NOT a critical section (user is not responsible for destroying
      // criticals) AND we know source location to report.
      if (__kmp_env_consistency_check && (!IS_CRITICAL(lck)) &&
          ((loc = __kmp_get_user_lock_location(lck)) != NULL) &&
          (loc->psource != NULL)) {
        kmp_str_loc_t str_loc = __kmp_str_loc_init(loc->psource, 0);
        KMP_WARNING(CnsLockNotDestroyed, str_loc.file, str_loc.line);
        __kmp_str_loc_free(&str_loc);
      }

#ifdef KMP_DEBUG
      if (IS_CRITICAL(lck)) {
        KA_TRACE(
            20,
            ("__kmp_cleanup_user_locks: free critical section lock %p (%p)\n",
             lck, *(void **)lck));
      } else {
        KA_TRACE(20, ("__kmp_cleanup_user_locks: free lock %p (%p)\n", lck,
                      *(void **)lck));
      }
#endif // KMP_DEBUG

      // Cleanup internal lock dynamic resources (for drdpa locks particularly).
      __kmp_destroy_user_lock(lck);
    }

    // Free the lock if block allocation of locks is not used.
    if (__kmp_lock_blocks == NULL) {
      __kmp_free(lck);
    }
  }

#undef IS_CRITICAL

  // delete lock table(s).
  kmp_user_lock_p *table_ptr = __kmp_user_lock_table.table;
  __kmp_user_lock_table.table = NULL;
  __kmp_user_lock_table.allocated = 0;

  while (table_ptr != NULL) {
    // In the first element we saved the pointer to the previous
    // (smaller) lock table.
    kmp_user_lock_p *next = (kmp_user_lock_p *)(table_ptr[0]);
    __kmp_free(table_ptr);
    table_ptr = next;
  }

  // Free buffers allocated for blocks of locks.
  kmp_block_of_locks_t *block_ptr = __kmp_lock_blocks;
  __kmp_lock_blocks = NULL;

  while (block_ptr != NULL) {
    kmp_block_of_locks_t *next = block_ptr->next_block;
    __kmp_free(block_ptr->locks);
    // *block_ptr itself was allocated at the end of the locks vector.
    block_ptr = next;
  }

  TCW_4(__kmp_init_user_locks, FALSE);
}

#endif // KMP_USE_DYNAMIC_LOCK
Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_lock.h
===================================================================
--- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_lock.h	(revision 357058)
+++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_lock.h	(revision 357059)
@@ -1,1275 +1,1275 @@
/*
 * kmp_lock.h -- lock header file
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_LOCK_H
#define KMP_LOCK_H

#include // CHAR_BIT
#include // offsetof

#include "kmp_debug.h"
#include "kmp_os.h"

#ifdef __cplusplus
#include

extern "C" {
#endif // __cplusplus

// ----------------------------------------------------------------------------
// Have to copy these definitions from kmp.h because kmp.h cannot be included
// due to circular dependencies. Will undef these at end of file.

#define KMP_PAD(type, sz)                                                      \
  (sizeof(type) + (sz - ((sizeof(type) - 1) % (sz)) - 1))
#define KMP_GTID_DNE (-2)

// Forward declaration of ident and ident_t

struct ident;
typedef struct ident ident_t;

// End of copied code.
// ----------------------------------------------------------------------------

// We need to know the size of the area we can assume that the compiler(s)
// allocated for obects of type omp_lock_t and omp_nest_lock_t. The Intel
// compiler always allocates a pointer-sized area, as does visual studio.
//
// gcc however, only allocates 4 bytes for regular locks, even on 64-bit
// intel archs.
It allocates at least 8 bytes for nested lock (more on // recent versions), but we are bounded by the pointer-sized chunks that // the Intel compiler allocates. #if KMP_OS_LINUX && defined(KMP_GOMP_COMPAT) #define OMP_LOCK_T_SIZE sizeof(int) #define OMP_NEST_LOCK_T_SIZE sizeof(void *) #else #define OMP_LOCK_T_SIZE sizeof(void *) #define OMP_NEST_LOCK_T_SIZE sizeof(void *) #endif // The Intel compiler allocates a 32-byte chunk for a critical section. // Both gcc and visual studio only allocate enough space for a pointer. // Sometimes we know that the space was allocated by the Intel compiler. #define OMP_CRITICAL_SIZE sizeof(void *) #define INTEL_CRITICAL_SIZE 32 // lock flags typedef kmp_uint32 kmp_lock_flags_t; #define kmp_lf_critical_section 1 // When a lock table is used, the indices are of kmp_lock_index_t typedef kmp_uint32 kmp_lock_index_t; // When memory allocated for locks are on the lock pool (free list), // it is treated as structs of this type. struct kmp_lock_pool { union kmp_user_lock *next; kmp_lock_index_t index; }; typedef struct kmp_lock_pool kmp_lock_pool_t; extern void __kmp_validate_locks(void); // ---------------------------------------------------------------------------- // There are 5 lock implementations: // 1. Test and set locks. // 2. futex locks (Linux* OS on x86 and // Intel(R) Many Integrated Core Architecture) // 3. Ticket (Lamport bakery) locks. // 4. Queuing locks (with separate spin fields). // 5. DRPA (Dynamically Reconfigurable Distributed Polling Area) locks // // and 3 lock purposes: // 1. Bootstrap locks -- Used for a few locks available at library // startup-shutdown time. // These do not require non-negative global thread ID's. // 2. Internal RTL locks -- Used everywhere else in the RTL // 3. User locks (includes critical sections) // ---------------------------------------------------------------------------- // ============================================================================ // Lock implementations. 
// // Test and set locks. // // Non-nested test and set locks differ from the other lock kinds (except // futex) in that we use the memory allocated by the compiler for the lock, // rather than a pointer to it. // // On lin32, lin_32e, and win_32, the space allocated may be as small as 4 // bytes, so we have to use a lock table for nested locks, and avoid accessing // the depth_locked field for non-nested locks. // // Information normally available to the tools, such as lock location, lock // usage (normal lock vs. critical section), etc. is not available with test and // set locks. // ---------------------------------------------------------------------------- struct kmp_base_tas_lock { // KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread std::atomic poll; kmp_int32 depth_locked; // depth locked, for nested locks only }; typedef struct kmp_base_tas_lock kmp_base_tas_lock_t; union kmp_tas_lock { kmp_base_tas_lock_t lk; kmp_lock_pool_t pool; // make certain struct is large enough double lk_align; // use worst case alignment; no cache line padding }; typedef union kmp_tas_lock kmp_tas_lock_t; // Static initializer for test and set lock variables. 
Usage: // kmp_tas_lock_t xlock = KMP_TAS_LOCK_INITIALIZER( xlock ); #define KMP_TAS_LOCK_INITIALIZER(lock) \ { \ { ATOMIC_VAR_INIT(KMP_LOCK_FREE(tas)), 0 } \ } extern int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid); extern void __kmp_init_tas_lock(kmp_tas_lock_t *lck); extern void __kmp_destroy_tas_lock(kmp_tas_lock_t *lck); extern int __kmp_acquire_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_nested_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid); extern void __kmp_init_nested_tas_lock(kmp_tas_lock_t *lck); extern void __kmp_destroy_nested_tas_lock(kmp_tas_lock_t *lck); #define KMP_LOCK_RELEASED 1 #define KMP_LOCK_STILL_HELD 0 #define KMP_LOCK_ACQUIRED_FIRST 1 #define KMP_LOCK_ACQUIRED_NEXT 0 #ifndef KMP_USE_FUTEX #define KMP_USE_FUTEX \ (KMP_OS_LINUX && !KMP_OS_CNK && \ (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)) #endif #if KMP_USE_FUTEX // ---------------------------------------------------------------------------- // futex locks. futex locks are only available on Linux* OS. // // Like non-nested test and set lock, non-nested futex locks use the memory // allocated by the compiler for the lock, rather than a pointer to it. // // Information normally available to the tools, such as lock location, lock // usage (normal lock vs. critical section), etc. is not available with test and // set locks. With non-nested futex locks, the lock owner is not even available. 
// ---------------------------------------------------------------------------- struct kmp_base_futex_lock { volatile kmp_int32 poll; // KMP_LOCK_FREE(futex) => unlocked // 2*(gtid+1) of owning thread, 0 if unlocked // locked: (gtid+1) of owning thread kmp_int32 depth_locked; // depth locked, for nested locks only }; typedef struct kmp_base_futex_lock kmp_base_futex_lock_t; union kmp_futex_lock { kmp_base_futex_lock_t lk; kmp_lock_pool_t pool; // make certain struct is large enough double lk_align; // use worst case alignment // no cache line padding }; typedef union kmp_futex_lock kmp_futex_lock_t; // Static initializer for futex lock variables. Usage: // kmp_futex_lock_t xlock = KMP_FUTEX_LOCK_INITIALIZER( xlock ); #define KMP_FUTEX_LOCK_INITIALIZER(lock) \ { \ { KMP_LOCK_FREE(futex), 0 } \ } extern int __kmp_acquire_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid); extern void __kmp_init_futex_lock(kmp_futex_lock_t *lck); extern void __kmp_destroy_futex_lock(kmp_futex_lock_t *lck); extern int __kmp_acquire_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_nested_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid); extern void __kmp_init_nested_futex_lock(kmp_futex_lock_t *lck); extern void __kmp_destroy_nested_futex_lock(kmp_futex_lock_t *lck); #endif // KMP_USE_FUTEX // ---------------------------------------------------------------------------- // Ticket locks. #ifdef __cplusplus #ifdef _MSC_VER // MSVC won't allow use of std::atomic<> in a union since it has non-trivial // copy constructor. struct kmp_base_ticket_lock { // `initialized' must be the first entry in the lock data structure! 
std::atomic_bool initialized; volatile union kmp_ticket_lock *self; // points to the lock union ident_t const *location; // Source code location of omp_init_lock(). std::atomic_uint next_ticket; // ticket number to give to next thread which acquires std::atomic_uint now_serving; // ticket number for thread which holds the lock std::atomic_int owner_id; // (gtid+1) of owning thread, 0 if unlocked std::atomic_int depth_locked; // depth locked, for nested locks only kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock }; #else struct kmp_base_ticket_lock { // `initialized' must be the first entry in the lock data structure! std::atomic initialized; volatile union kmp_ticket_lock *self; // points to the lock union ident_t const *location; // Source code location of omp_init_lock(). std::atomic next_ticket; // ticket number to give to next thread which acquires std::atomic now_serving; // ticket number for thread which holds the lock std::atomic owner_id; // (gtid+1) of owning thread, 0 if unlocked std::atomic depth_locked; // depth locked, for nested locks only kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock }; #endif #else // __cplusplus struct kmp_base_ticket_lock; #endif // !__cplusplus typedef struct kmp_base_ticket_lock kmp_base_ticket_lock_t; union KMP_ALIGN_CACHE kmp_ticket_lock { kmp_base_ticket_lock_t lk; // This field must be first to allow static initializing. kmp_lock_pool_t pool; double lk_align; // use worst case alignment char lk_pad[KMP_PAD(kmp_base_ticket_lock_t, CACHE_LINE)]; }; typedef union kmp_ticket_lock kmp_ticket_lock_t; // Static initializer for simple ticket lock variables. Usage: // kmp_ticket_lock_t xlock = KMP_TICKET_LOCK_INITIALIZER( xlock ); // Note the macro argument. It is important to make var properly initialized. 
#define KMP_TICKET_LOCK_INITIALIZER(lock) \ { \ { \ ATOMIC_VAR_INIT(true) \ , &(lock), NULL, ATOMIC_VAR_INIT(0U), ATOMIC_VAR_INIT(0U), \ ATOMIC_VAR_INIT(0), ATOMIC_VAR_INIT(-1) \ } \ } extern int __kmp_acquire_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_ticket_lock_with_cheks(kmp_ticket_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid); extern void __kmp_init_ticket_lock(kmp_ticket_lock_t *lck); extern void __kmp_destroy_ticket_lock(kmp_ticket_lock_t *lck); extern int __kmp_acquire_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_nested_ticket_lock(kmp_ticket_lock_t *lck, kmp_int32 gtid); extern void __kmp_init_nested_ticket_lock(kmp_ticket_lock_t *lck); extern void __kmp_destroy_nested_ticket_lock(kmp_ticket_lock_t *lck); // ---------------------------------------------------------------------------- // Queuing locks. #if KMP_USE_ADAPTIVE_LOCKS struct kmp_adaptive_lock_info; typedef struct kmp_adaptive_lock_info kmp_adaptive_lock_info_t; #if KMP_DEBUG_ADAPTIVE_LOCKS struct kmp_adaptive_lock_statistics { /* So we can get stats from locks that haven't been destroyed. */ kmp_adaptive_lock_info_t *next; kmp_adaptive_lock_info_t *prev; /* Other statistics */ kmp_uint32 successfulSpeculations; kmp_uint32 hardFailedSpeculations; kmp_uint32 softFailedSpeculations; kmp_uint32 nonSpeculativeAcquires; kmp_uint32 nonSpeculativeAcquireAttempts; kmp_uint32 lemmingYields; }; typedef struct kmp_adaptive_lock_statistics kmp_adaptive_lock_statistics_t; extern void __kmp_print_speculative_stats(); extern void __kmp_init_speculative_stats(); #endif // KMP_DEBUG_ADAPTIVE_LOCKS struct kmp_adaptive_lock_info { /* Values used for adaptivity. 
Although these are accessed from multiple threads we don't access them atomically, because if we miss updates it probably doesn't matter much. (It just affects our decision about whether to try speculation on the lock). */ kmp_uint32 volatile badness; kmp_uint32 volatile acquire_attempts; /* Parameters of the lock. */ kmp_uint32 max_badness; kmp_uint32 max_soft_retries; #if KMP_DEBUG_ADAPTIVE_LOCKS kmp_adaptive_lock_statistics_t volatile stats; #endif }; #endif // KMP_USE_ADAPTIVE_LOCKS struct kmp_base_queuing_lock { // `initialized' must be the first entry in the lock data structure! volatile union kmp_queuing_lock *initialized; // Points to the lock union if in initialized state. ident_t const *location; // Source code location of omp_init_lock(). KMP_ALIGN(8) // tail_id must be 8-byte aligned! volatile kmp_int32 tail_id; // (gtid+1) of thread at tail of wait queue, 0 if empty // Must be no padding here since head/tail used in 8-byte CAS volatile kmp_int32 head_id; // (gtid+1) of thread at head of wait queue, 0 if empty // Decl order assumes little endian // bakery-style lock volatile kmp_uint32 next_ticket; // ticket number to give to next thread which acquires volatile kmp_uint32 now_serving; // ticket number for thread which holds the lock volatile kmp_int32 owner_id; // (gtid+1) of owning thread, 0 if unlocked kmp_int32 depth_locked; // depth locked, for nested locks only kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock }; typedef struct kmp_base_queuing_lock kmp_base_queuing_lock_t; KMP_BUILD_ASSERT(offsetof(kmp_base_queuing_lock_t, tail_id) % 8 == 0); union KMP_ALIGN_CACHE kmp_queuing_lock { kmp_base_queuing_lock_t lk; // This field must be first to allow static initializing. 
kmp_lock_pool_t pool; double lk_align; // use worst case alignment char lk_pad[KMP_PAD(kmp_base_queuing_lock_t, CACHE_LINE)]; }; typedef union kmp_queuing_lock kmp_queuing_lock_t; extern int __kmp_acquire_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); extern void __kmp_init_queuing_lock(kmp_queuing_lock_t *lck); extern void __kmp_destroy_queuing_lock(kmp_queuing_lock_t *lck); extern int __kmp_acquire_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_nested_queuing_lock(kmp_queuing_lock_t *lck, kmp_int32 gtid); extern void __kmp_init_nested_queuing_lock(kmp_queuing_lock_t *lck); extern void __kmp_destroy_nested_queuing_lock(kmp_queuing_lock_t *lck); #if KMP_USE_ADAPTIVE_LOCKS // ---------------------------------------------------------------------------- // Adaptive locks. struct kmp_base_adaptive_lock { kmp_base_queuing_lock qlk; KMP_ALIGN(CACHE_LINE) kmp_adaptive_lock_info_t adaptive; // Information for the speculative adaptive lock }; typedef struct kmp_base_adaptive_lock kmp_base_adaptive_lock_t; union KMP_ALIGN_CACHE kmp_adaptive_lock { kmp_base_adaptive_lock_t lk; kmp_lock_pool_t pool; double lk_align; char lk_pad[KMP_PAD(kmp_base_adaptive_lock_t, CACHE_LINE)]; }; typedef union kmp_adaptive_lock kmp_adaptive_lock_t; #define GET_QLK_PTR(l) ((kmp_queuing_lock_t *)&(l)->lk.qlk) #endif // KMP_USE_ADAPTIVE_LOCKS // ---------------------------------------------------------------------------- // DRDPA ticket locks. struct kmp_base_drdpa_lock { // All of the fields on the first cache line are only written when // initializing or reconfiguring the lock. 
These are relatively rare // operations, so data from the first cache line will usually stay resident in // the cache of each thread trying to acquire the lock. // // initialized must be the first entry in the lock data structure! KMP_ALIGN_CACHE volatile union kmp_drdpa_lock *initialized; // points to the lock union if in initialized state ident_t const *location; // Source code location of omp_init_lock(). std::atomic *> polls; std::atomic mask; // is 2**num_polls-1 for mod op kmp_uint64 cleanup_ticket; // thread with cleanup ticket std::atomic *old_polls; // will deallocate old_polls kmp_uint32 num_polls; // must be power of 2 // next_ticket it needs to exist in a separate cache line, as it is // invalidated every time a thread takes a new ticket. KMP_ALIGN_CACHE std::atomic next_ticket; // now_serving is used to store our ticket value while we hold the lock. It // has a slightly different meaning in the DRDPA ticket locks (where it is // written by the acquiring thread) than it does in the simple ticket locks // (where it is written by the releasing thread). // // Since now_serving is only read an written in the critical section, // it is non-volatile, but it needs to exist on a separate cache line, // as it is invalidated at every lock acquire. // // Likewise, the vars used for nested locks (owner_id and depth_locked) are // only written by the thread owning the lock, so they are put in this cache // line. owner_id is read by other threads, so it must be declared volatile. KMP_ALIGN_CACHE kmp_uint64 now_serving; // doesn't have to be volatile volatile kmp_uint32 owner_id; // (gtid+1) of owning thread, 0 if unlocked kmp_int32 depth_locked; // depth locked kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock }; typedef struct kmp_base_drdpa_lock kmp_base_drdpa_lock_t; union KMP_ALIGN_CACHE kmp_drdpa_lock { kmp_base_drdpa_lock_t lk; // This field must be first to allow static initializing. 
*/ kmp_lock_pool_t pool; double lk_align; // use worst case alignment char lk_pad[KMP_PAD(kmp_base_drdpa_lock_t, CACHE_LINE)]; }; typedef union kmp_drdpa_lock kmp_drdpa_lock_t; extern int __kmp_acquire_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid); extern void __kmp_init_drdpa_lock(kmp_drdpa_lock_t *lck); extern void __kmp_destroy_drdpa_lock(kmp_drdpa_lock_t *lck); extern int __kmp_acquire_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid); extern int __kmp_test_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid); extern int __kmp_release_nested_drdpa_lock(kmp_drdpa_lock_t *lck, kmp_int32 gtid); extern void __kmp_init_nested_drdpa_lock(kmp_drdpa_lock_t *lck); extern void __kmp_destroy_nested_drdpa_lock(kmp_drdpa_lock_t *lck); // ============================================================================ // Lock purposes. // ============================================================================ // Bootstrap locks. // // Bootstrap locks -- very few locks used at library initialization time. // Bootstrap locks are currently implemented as ticket locks. // They could also be implemented as test and set lock, but cannot be // implemented with other lock kinds as they require gtids which are not // available at initialization time. 
// Bootstrap locks are ticket locks acquired with KMP_GTID_DNE, since no
// global thread ID exists yet at library startup/shutdown time.
typedef kmp_ticket_lock_t kmp_bootstrap_lock_t;

#define KMP_BOOTSTRAP_LOCK_INITIALIZER(lock) KMP_TICKET_LOCK_INITIALIZER((lock))
#define KMP_BOOTSTRAP_LOCK_INIT(lock)                                          \
  kmp_bootstrap_lock_t lock = KMP_TICKET_LOCK_INITIALIZER(lock)

// Acquire a bootstrap lock; blocks until the lock is obtained.
static inline int __kmp_acquire_bootstrap_lock(kmp_bootstrap_lock_t *lck) {
  return __kmp_acquire_ticket_lock(lck, KMP_GTID_DNE);
}

// Try to acquire a bootstrap lock without blocking.
static inline int __kmp_test_bootstrap_lock(kmp_bootstrap_lock_t *lck) {
  return __kmp_test_ticket_lock(lck, KMP_GTID_DNE);
}

// Release a bootstrap lock previously acquired by this thread.
static inline void __kmp_release_bootstrap_lock(kmp_bootstrap_lock_t *lck) {
  __kmp_release_ticket_lock(lck, KMP_GTID_DNE);
}

// Initialize a bootstrap lock at runtime (alternative to the static
// initializer macros above).
static inline void __kmp_init_bootstrap_lock(kmp_bootstrap_lock_t *lck) {
  __kmp_init_ticket_lock(lck);
}

// Destroy a bootstrap lock.
static inline void __kmp_destroy_bootstrap_lock(kmp_bootstrap_lock_t *lck) {
  __kmp_destroy_ticket_lock(lck);
}

// Internal RTL locks.
//
// Internal RTL locks are also implemented as ticket locks, for now.
//
// FIXME - We should go through and figure out which lock kind works best for
// each internal lock, and use the type declaration and function calls for
// that explicit lock kind (and get rid of this section).

typedef kmp_ticket_lock_t kmp_lock_t;

#define KMP_LOCK_INIT(lock) kmp_lock_t lock = KMP_TICKET_LOCK_INITIALIZER(lock)

// Internal RTL lock wrappers: identical to the bootstrap wrappers except that
// a real global thread ID is available and passed through to the ticket lock.
static inline int __kmp_acquire_lock(kmp_lock_t *lck, kmp_int32 gtid) {
  return __kmp_acquire_ticket_lock(lck, gtid);
}

static inline int __kmp_test_lock(kmp_lock_t *lck, kmp_int32 gtid) {
  return __kmp_test_ticket_lock(lck, gtid);
}

static inline void __kmp_release_lock(kmp_lock_t *lck, kmp_int32 gtid) {
  __kmp_release_ticket_lock(lck, gtid);
}

static inline void __kmp_init_lock(kmp_lock_t *lck) {
  __kmp_init_ticket_lock(lck);
}

static inline void __kmp_destroy_lock(kmp_lock_t *lck) {
  __kmp_destroy_ticket_lock(lck);
}

// User locks.
//
// Do not allocate objects of type union kmp_user_lock!!! This will waste space
// unless __kmp_user_lock_kind == lk_drdpa.
Instead, check the value of // __kmp_user_lock_kind and allocate objects of the type of the appropriate // union member, and cast their addresses to kmp_user_lock_p. enum kmp_lock_kind { lk_default = 0, lk_tas, #if KMP_USE_FUTEX lk_futex, #endif #if KMP_USE_DYNAMIC_LOCK && KMP_USE_TSX lk_hle, lk_rtm, #endif lk_ticket, lk_queuing, lk_drdpa, #if KMP_USE_ADAPTIVE_LOCKS lk_adaptive #endif // KMP_USE_ADAPTIVE_LOCKS }; typedef enum kmp_lock_kind kmp_lock_kind_t; extern kmp_lock_kind_t __kmp_user_lock_kind; union kmp_user_lock { kmp_tas_lock_t tas; #if KMP_USE_FUTEX kmp_futex_lock_t futex; #endif kmp_ticket_lock_t ticket; kmp_queuing_lock_t queuing; kmp_drdpa_lock_t drdpa; #if KMP_USE_ADAPTIVE_LOCKS kmp_adaptive_lock_t adaptive; #endif // KMP_USE_ADAPTIVE_LOCKS kmp_lock_pool_t pool; }; typedef union kmp_user_lock *kmp_user_lock_p; #if !KMP_USE_DYNAMIC_LOCK extern size_t __kmp_base_user_lock_size; extern size_t __kmp_user_lock_size; extern kmp_int32 (*__kmp_get_user_lock_owner_)(kmp_user_lock_p lck); static inline kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p lck) { KMP_DEBUG_ASSERT(__kmp_get_user_lock_owner_ != NULL); return (*__kmp_get_user_lock_owner_)(lck); } extern int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid); #if KMP_OS_LINUX && \ (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) #define __kmp_acquire_user_lock_with_checks(lck, gtid) \ if (__kmp_user_lock_kind == lk_tas) { \ if (__kmp_env_consistency_check) { \ char const *const func = "omp_set_lock"; \ if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && \ lck->tas.lk.depth_locked != -1) { \ KMP_FATAL(LockNestableUsedAsSimple, func); \ } \ if ((gtid >= 0) && (lck->tas.lk.poll - 1 == gtid)) { \ KMP_FATAL(LockIsAlreadyOwned, func); \ } \ } \ if (lck->tas.lk.poll != 0 || \ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \ kmp_uint32 spins; \ KMP_FSYNC_PREPARE(lck); \ KMP_INIT_YIELD(spins); \ do { \ KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \ } 
while ( \ lck->tas.lk.poll != 0 || \ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \ } \ KMP_FSYNC_ACQUIRED(lck); \ } else { \ KMP_DEBUG_ASSERT(__kmp_acquire_user_lock_with_checks_ != NULL); \ (*__kmp_acquire_user_lock_with_checks_)(lck, gtid); \ } #else static inline int __kmp_acquire_user_lock_with_checks(kmp_user_lock_p lck, kmp_int32 gtid) { KMP_DEBUG_ASSERT(__kmp_acquire_user_lock_with_checks_ != NULL); return (*__kmp_acquire_user_lock_with_checks_)(lck, gtid); } #endif extern int (*__kmp_test_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid); #if KMP_OS_LINUX && \ (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) #include "kmp_i18n.h" /* AC: KMP_FATAL definition */ extern int __kmp_env_consistency_check; /* AC: copy from kmp.h here */ static inline int __kmp_test_user_lock_with_checks(kmp_user_lock_p lck, kmp_int32 gtid) { if (__kmp_user_lock_kind == lk_tas) { if (__kmp_env_consistency_check) { char const *const func = "omp_test_lock"; if ((sizeof(kmp_tas_lock_t) <= OMP_LOCK_T_SIZE) && lck->tas.lk.depth_locked != -1) { KMP_FATAL(LockNestableUsedAsSimple, func); } } return ((lck->tas.lk.poll == 0) && __kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); } else { KMP_DEBUG_ASSERT(__kmp_test_user_lock_with_checks_ != NULL); return (*__kmp_test_user_lock_with_checks_)(lck, gtid); } } #else static inline int __kmp_test_user_lock_with_checks(kmp_user_lock_p lck, kmp_int32 gtid) { KMP_DEBUG_ASSERT(__kmp_test_user_lock_with_checks_ != NULL); return (*__kmp_test_user_lock_with_checks_)(lck, gtid); } #endif extern int (*__kmp_release_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid); static inline void __kmp_release_user_lock_with_checks(kmp_user_lock_p lck, kmp_int32 gtid) { KMP_DEBUG_ASSERT(__kmp_release_user_lock_with_checks_ != NULL); (*__kmp_release_user_lock_with_checks_)(lck, gtid); } extern void (*__kmp_init_user_lock_with_checks_)(kmp_user_lock_p lck); static inline void 
__kmp_init_user_lock_with_checks(kmp_user_lock_p lck) { KMP_DEBUG_ASSERT(__kmp_init_user_lock_with_checks_ != NULL); (*__kmp_init_user_lock_with_checks_)(lck); } // We need a non-checking version of destroy lock for when the RTL is // doing the cleanup as it can't always tell if the lock is nested or not. extern void (*__kmp_destroy_user_lock_)(kmp_user_lock_p lck); static inline void __kmp_destroy_user_lock(kmp_user_lock_p lck) { KMP_DEBUG_ASSERT(__kmp_destroy_user_lock_ != NULL); (*__kmp_destroy_user_lock_)(lck); } extern void (*__kmp_destroy_user_lock_with_checks_)(kmp_user_lock_p lck); static inline void __kmp_destroy_user_lock_with_checks(kmp_user_lock_p lck) { KMP_DEBUG_ASSERT(__kmp_destroy_user_lock_with_checks_ != NULL); (*__kmp_destroy_user_lock_with_checks_)(lck); } extern int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid); #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64) #define __kmp_acquire_nested_user_lock_with_checks(lck, gtid, depth) \ if (__kmp_user_lock_kind == lk_tas) { \ if (__kmp_env_consistency_check) { \ char const *const func = "omp_set_nest_lock"; \ if ((sizeof(kmp_tas_lock_t) <= OMP_NEST_LOCK_T_SIZE) && \ lck->tas.lk.depth_locked == -1) { \ KMP_FATAL(LockSimpleUsedAsNestable, func); \ } \ } \ if (lck->tas.lk.poll - 1 == gtid) { \ lck->tas.lk.depth_locked += 1; \ *depth = KMP_LOCK_ACQUIRED_NEXT; \ } else { \ if ((lck->tas.lk.poll != 0) || \ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \ kmp_uint32 spins; \ KMP_FSYNC_PREPARE(lck); \ KMP_INIT_YIELD(spins); \ do { \ KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \ } while ( \ (lck->tas.lk.poll != 0) || \ !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \ } \ lck->tas.lk.depth_locked = 1; \ *depth = KMP_LOCK_ACQUIRED_FIRST; \ } \ KMP_FSYNC_ACQUIRED(lck); \ } else { \ KMP_DEBUG_ASSERT(__kmp_acquire_nested_user_lock_with_checks_ != NULL); \ *depth = (*__kmp_acquire_nested_user_lock_with_checks_)(lck, gtid); \ } #else static 
inline void __kmp_acquire_nested_user_lock_with_checks(kmp_user_lock_p lck, kmp_int32 gtid, int *depth) { KMP_DEBUG_ASSERT(__kmp_acquire_nested_user_lock_with_checks_ != NULL); *depth = (*__kmp_acquire_nested_user_lock_with_checks_)(lck, gtid); } #endif extern int (*__kmp_test_nested_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid); #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64) static inline int __kmp_test_nested_user_lock_with_checks(kmp_user_lock_p lck, kmp_int32 gtid) { if (__kmp_user_lock_kind == lk_tas) { int retval; if (__kmp_env_consistency_check) { char const *const func = "omp_test_nest_lock"; if ((sizeof(kmp_tas_lock_t) <= OMP_NEST_LOCK_T_SIZE) && lck->tas.lk.depth_locked == -1) { KMP_FATAL(LockSimpleUsedAsNestable, func); } } KMP_DEBUG_ASSERT(gtid >= 0); if (lck->tas.lk.poll - 1 == gtid) { /* __kmp_get_tas_lock_owner( lck ) == gtid */ return ++lck->tas.lk.depth_locked; /* same owner, depth increased */ } retval = ((lck->tas.lk.poll == 0) && __kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); if (retval) { KMP_MB(); lck->tas.lk.depth_locked = 1; } return retval; } else { KMP_DEBUG_ASSERT(__kmp_test_nested_user_lock_with_checks_ != NULL); return (*__kmp_test_nested_user_lock_with_checks_)(lck, gtid); } } #else static inline int __kmp_test_nested_user_lock_with_checks(kmp_user_lock_p lck, kmp_int32 gtid) { KMP_DEBUG_ASSERT(__kmp_test_nested_user_lock_with_checks_ != NULL); return (*__kmp_test_nested_user_lock_with_checks_)(lck, gtid); } #endif extern int (*__kmp_release_nested_user_lock_with_checks_)(kmp_user_lock_p lck, kmp_int32 gtid); static inline int __kmp_release_nested_user_lock_with_checks(kmp_user_lock_p lck, kmp_int32 gtid) { KMP_DEBUG_ASSERT(__kmp_release_nested_user_lock_with_checks_ != NULL); return (*__kmp_release_nested_user_lock_with_checks_)(lck, gtid); } extern void (*__kmp_init_nested_user_lock_with_checks_)(kmp_user_lock_p lck); static inline void __kmp_init_nested_user_lock_with_checks(kmp_user_lock_p 
lck) { KMP_DEBUG_ASSERT(__kmp_init_nested_user_lock_with_checks_ != NULL); (*__kmp_init_nested_user_lock_with_checks_)(lck); } extern void (*__kmp_destroy_nested_user_lock_with_checks_)(kmp_user_lock_p lck); static inline void __kmp_destroy_nested_user_lock_with_checks(kmp_user_lock_p lck) { KMP_DEBUG_ASSERT(__kmp_destroy_nested_user_lock_with_checks_ != NULL); (*__kmp_destroy_nested_user_lock_with_checks_)(lck); } // user lock functions which do not necessarily exist for all lock kinds. // // The "set" functions usually have wrapper routines that check for a NULL set // function pointer and call it if non-NULL. // // In some cases, it makes sense to have a "get" wrapper function check for a // NULL get function pointer and return NULL / invalid value / error code if // the function pointer is NULL. // // In other cases, the calling code really should differentiate between an // unimplemented function and one that is implemented but returning NULL / // invalied value. If this is the case, no get function wrapper exists. 
extern int (*__kmp_is_user_lock_initialized_)(kmp_user_lock_p lck); // no set function; fields set durining local allocation extern const ident_t *(*__kmp_get_user_lock_location_)(kmp_user_lock_p lck); static inline const ident_t *__kmp_get_user_lock_location(kmp_user_lock_p lck) { if (__kmp_get_user_lock_location_ != NULL) { return (*__kmp_get_user_lock_location_)(lck); } else { return NULL; } } extern void (*__kmp_set_user_lock_location_)(kmp_user_lock_p lck, const ident_t *loc); static inline void __kmp_set_user_lock_location(kmp_user_lock_p lck, const ident_t *loc) { if (__kmp_set_user_lock_location_ != NULL) { (*__kmp_set_user_lock_location_)(lck, loc); } } extern kmp_lock_flags_t (*__kmp_get_user_lock_flags_)(kmp_user_lock_p lck); extern void (*__kmp_set_user_lock_flags_)(kmp_user_lock_p lck, kmp_lock_flags_t flags); static inline void __kmp_set_user_lock_flags(kmp_user_lock_p lck, kmp_lock_flags_t flags) { if (__kmp_set_user_lock_flags_ != NULL) { (*__kmp_set_user_lock_flags_)(lck, flags); } } // The fuction which sets up all of the vtbl pointers for kmp_user_lock_t. extern void __kmp_set_user_lock_vptrs(kmp_lock_kind_t user_lock_kind); // Macros for binding user lock functions. 
#define KMP_BIND_USER_LOCK_TEMPLATE(nest, kind, suffix) \ { \ __kmp_acquire##nest##user_lock_with_checks_ = (int (*)( \ kmp_user_lock_p, kmp_int32))__kmp_acquire##nest##kind##_##suffix; \ __kmp_release##nest##user_lock_with_checks_ = (int (*)( \ kmp_user_lock_p, kmp_int32))__kmp_release##nest##kind##_##suffix; \ __kmp_test##nest##user_lock_with_checks_ = (int (*)( \ kmp_user_lock_p, kmp_int32))__kmp_test##nest##kind##_##suffix; \ __kmp_init##nest##user_lock_with_checks_ = \ (void (*)(kmp_user_lock_p))__kmp_init##nest##kind##_##suffix; \ __kmp_destroy##nest##user_lock_with_checks_ = \ (void (*)(kmp_user_lock_p))__kmp_destroy##nest##kind##_##suffix; \ } #define KMP_BIND_USER_LOCK(kind) KMP_BIND_USER_LOCK_TEMPLATE(_, kind, lock) #define KMP_BIND_USER_LOCK_WITH_CHECKS(kind) \ KMP_BIND_USER_LOCK_TEMPLATE(_, kind, lock_with_checks) #define KMP_BIND_NESTED_USER_LOCK(kind) \ KMP_BIND_USER_LOCK_TEMPLATE(_nested_, kind, lock) #define KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(kind) \ KMP_BIND_USER_LOCK_TEMPLATE(_nested_, kind, lock_with_checks) // User lock table & lock allocation /* On 64-bit Linux* OS (and OS X*) GNU compiler allocates only 4 bytems memory for lock variable, which is not enough to store a pointer, so we have to use lock indexes instead of pointers and maintain lock table to map indexes to pointers. Note: The first element of the table is not a pointer to lock! It is a pointer to previously allocated table (or NULL if it is the first table). Usage: if ( OMP_LOCK_T_SIZE < sizeof( ) ) { // or OMP_NEST_LOCK_T_SIZE Lock table is fully utilized. User locks are indexes, so table is used on user lock operation. Note: it may be the case (lin_32) that we don't need to use a lock table for regular locks, but do need the table for nested locks. } else { Lock table initialized but not actually used. } */ struct kmp_lock_table { kmp_lock_index_t used; // Number of used elements kmp_lock_index_t allocated; // Number of allocated elements kmp_user_lock_p *table; // Lock table. 
}; typedef struct kmp_lock_table kmp_lock_table_t; extern kmp_lock_table_t __kmp_user_lock_table; extern kmp_user_lock_p __kmp_lock_pool; struct kmp_block_of_locks { struct kmp_block_of_locks *next_block; void *locks; }; typedef struct kmp_block_of_locks kmp_block_of_locks_t; extern kmp_block_of_locks_t *__kmp_lock_blocks; extern int __kmp_num_locks_in_block; extern kmp_user_lock_p __kmp_user_lock_allocate(void **user_lock, kmp_int32 gtid, kmp_lock_flags_t flags); extern void __kmp_user_lock_free(void **user_lock, kmp_int32 gtid, kmp_user_lock_p lck); extern kmp_user_lock_p __kmp_lookup_user_lock(void **user_lock, char const *func); extern void __kmp_cleanup_user_locks(); #define KMP_CHECK_USER_LOCK_INIT() \ { \ if (!TCR_4(__kmp_init_user_locks)) { \ __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); \ if (!TCR_4(__kmp_init_user_locks)) { \ TCW_4(__kmp_init_user_locks, TRUE); \ } \ __kmp_release_bootstrap_lock(&__kmp_initz_lock); \ } \ } #endif // KMP_USE_DYNAMIC_LOCK #undef KMP_PAD #undef KMP_GTID_DNE #if KMP_USE_DYNAMIC_LOCK // KMP_USE_DYNAMIC_LOCK enables dynamic dispatch of lock functions without // breaking the current compatibility. Essential functionality of this new code // is dynamic dispatch, but it also implements (or enables implementation of) // hinted user lock and critical section which will be part of OMP 4.5 soon. // // Lock type can be decided at creation time (i.e., lock initialization), and // subsequent lock function call on the created lock object requires type // extraction and call through jump table using the extracted type. This type // information is stored in two different ways depending on the size of the lock // object, and we differentiate lock types by this size requirement - direct and // indirect locks. // // Direct locks: // A direct lock object fits into the space created by the compiler for an // omp_lock_t object, and TAS/Futex lock falls into this category. 
We use low // one byte of the lock object as the storage for the lock type, and appropriate // bit operation is required to access the data meaningful to the lock // algorithms. Also, to differentiate direct lock from indirect lock, 1 is // written to LSB of the lock object. The newly introduced "hle" lock is also a // direct lock. // // Indirect locks: // An indirect lock object requires more space than the compiler-generated // space, and it should be allocated from heap. Depending on the size of the // compiler-generated space for the lock (i.e., size of omp_lock_t), this // omp_lock_t object stores either the address of the heap-allocated indirect // lock (void * fits in the object) or an index to the indirect lock table entry // that holds the address. Ticket/Queuing/DRDPA/Adaptive lock falls into this // category, and the newly introduced "rtm" lock is also an indirect lock which // was implemented on top of the Queuing lock. When the omp_lock_t object holds // an index (not lock address), 0 is written to LSB to differentiate the lock // from a direct lock, and the remaining part is the actual index to the // indirect lock table. #include // for uintptr_t // Shortcuts #define KMP_USE_INLINED_TAS \ (KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM)) && 1 #define KMP_USE_INLINED_FUTEX KMP_USE_FUTEX && 0 // List of lock definitions; all nested locks are indirect locks. // hle lock is xchg lock prefixed with XACQUIRE/XRELEASE. // All nested locks are indirect lock types. 
#if KMP_USE_TSX #if KMP_USE_FUTEX #define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a) m(hle, a) #define KMP_FOREACH_I_LOCK(m, a) \ m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm, a) \ m(nested_tas, a) m(nested_futex, a) m(nested_ticket, a) \ m(nested_queuing, a) m(nested_drdpa, a) #else #define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(hle, a) #define KMP_FOREACH_I_LOCK(m, a) \ m(ticket, a) m(queuing, a) m(adaptive, a) m(drdpa, a) m(rtm, a) \ m(nested_tas, a) m(nested_ticket, a) m(nested_queuing, a) \ m(nested_drdpa, a) #endif // KMP_USE_FUTEX #define KMP_LAST_D_LOCK lockseq_hle #else #if KMP_USE_FUTEX #define KMP_FOREACH_D_LOCK(m, a) m(tas, a) m(futex, a) #define KMP_FOREACH_I_LOCK(m, a) \ m(ticket, a) m(queuing, a) m(drdpa, a) m(nested_tas, a) m(nested_futex, a) \ m(nested_ticket, a) m(nested_queuing, a) m(nested_drdpa, a) #define KMP_LAST_D_LOCK lockseq_futex #else #define KMP_FOREACH_D_LOCK(m, a) m(tas, a) #define KMP_FOREACH_I_LOCK(m, a) \ m(ticket, a) m(queuing, a) m(drdpa, a) m(nested_tas, a) m(nested_ticket, a) \ m(nested_queuing, a) m(nested_drdpa, a) #define KMP_LAST_D_LOCK lockseq_tas #endif // KMP_USE_FUTEX #endif // KMP_USE_TSX // Information used in dynamic dispatch #define KMP_LOCK_SHIFT \ 8 // number of low bits to be used as tag for direct locks #define KMP_FIRST_D_LOCK lockseq_tas #define KMP_FIRST_I_LOCK lockseq_ticket #define KMP_LAST_I_LOCK lockseq_nested_drdpa #define KMP_NUM_I_LOCKS \ (locktag_nested_drdpa + 1) // number of indirect lock types // Base type for dynamic locks. typedef kmp_uint32 kmp_dyna_lock_t; // Lock sequence that enumerates all lock kinds. Always make this enumeration // consistent with kmp_lockseq_t in the include directory. typedef enum { lockseq_indirect = 0, #define expand_seq(l, a) lockseq_##l, KMP_FOREACH_D_LOCK(expand_seq, 0) KMP_FOREACH_I_LOCK(expand_seq, 0) #undef expand_seq } kmp_dyna_lockseq_t; // Enumerates indirect lock tags. 
typedef enum { #define expand_tag(l, a) locktag_##l, KMP_FOREACH_I_LOCK(expand_tag, 0) #undef expand_tag } kmp_indirect_locktag_t; // Utility macros that extract information from lock sequences. #define KMP_IS_D_LOCK(seq) \ ((seq) >= KMP_FIRST_D_LOCK && (seq) <= KMP_LAST_D_LOCK) #define KMP_IS_I_LOCK(seq) \ ((seq) >= KMP_FIRST_I_LOCK && (seq) <= KMP_LAST_I_LOCK) #define KMP_GET_I_TAG(seq) (kmp_indirect_locktag_t)((seq)-KMP_FIRST_I_LOCK) #define KMP_GET_D_TAG(seq) ((seq) << 1 | 1) // Enumerates direct lock tags starting from indirect tag. typedef enum { #define expand_tag(l, a) locktag_##l = KMP_GET_D_TAG(lockseq_##l), KMP_FOREACH_D_LOCK(expand_tag, 0) #undef expand_tag } kmp_direct_locktag_t; // Indirect lock type typedef struct { kmp_user_lock_p lock; kmp_indirect_locktag_t type; } kmp_indirect_lock_t; // Function tables for direct locks. Set/unset/test differentiate functions // with/without consistency checking. extern void (*__kmp_direct_init[])(kmp_dyna_lock_t *, kmp_dyna_lockseq_t); -extern void (*(*__kmp_direct_destroy))(kmp_dyna_lock_t *); -extern int (*(*__kmp_direct_set))(kmp_dyna_lock_t *, kmp_int32); -extern int (*(*__kmp_direct_unset))(kmp_dyna_lock_t *, kmp_int32); -extern int (*(*__kmp_direct_test))(kmp_dyna_lock_t *, kmp_int32); +extern void (**__kmp_direct_destroy)(kmp_dyna_lock_t *); +extern int (**__kmp_direct_set)(kmp_dyna_lock_t *, kmp_int32); +extern int (**__kmp_direct_unset)(kmp_dyna_lock_t *, kmp_int32); +extern int (**__kmp_direct_test)(kmp_dyna_lock_t *, kmp_int32); // Function tables for indirect locks. Set/unset/test differentiate functions // with/withuot consistency checking. 
extern void (*__kmp_indirect_init[])(kmp_user_lock_p); -extern void (*(*__kmp_indirect_destroy))(kmp_user_lock_p); -extern int (*(*__kmp_indirect_set))(kmp_user_lock_p, kmp_int32); -extern int (*(*__kmp_indirect_unset))(kmp_user_lock_p, kmp_int32); -extern int (*(*__kmp_indirect_test))(kmp_user_lock_p, kmp_int32); +extern void (**__kmp_indirect_destroy)(kmp_user_lock_p); +extern int (**__kmp_indirect_set)(kmp_user_lock_p, kmp_int32); +extern int (**__kmp_indirect_unset)(kmp_user_lock_p, kmp_int32); +extern int (**__kmp_indirect_test)(kmp_user_lock_p, kmp_int32); // Extracts direct lock tag from a user lock pointer #define KMP_EXTRACT_D_TAG(l) \ (*((kmp_dyna_lock_t *)(l)) & ((1 << KMP_LOCK_SHIFT) - 1) & \ -(*((kmp_dyna_lock_t *)(l)) & 1)) // Extracts indirect lock index from a user lock pointer #define KMP_EXTRACT_I_INDEX(l) (*(kmp_lock_index_t *)(l) >> 1) // Returns function pointer to the direct lock function with l (kmp_dyna_lock_t // *) and op (operation type). #define KMP_D_LOCK_FUNC(l, op) __kmp_direct_##op[KMP_EXTRACT_D_TAG(l)] // Returns function pointer to the indirect lock function with l // (kmp_indirect_lock_t *) and op (operation type). #define KMP_I_LOCK_FUNC(l, op) \ __kmp_indirect_##op[((kmp_indirect_lock_t *)(l))->type] // Initializes a direct lock with the given lock pointer and lock sequence. #define KMP_INIT_D_LOCK(l, seq) \ __kmp_direct_init[KMP_GET_D_TAG(seq)]((kmp_dyna_lock_t *)l, seq) // Initializes an indirect lock with the given lock pointer and lock sequence. #define KMP_INIT_I_LOCK(l, seq) \ __kmp_direct_init[0]((kmp_dyna_lock_t *)(l), seq) // Returns "free" lock value for the given lock type. #define KMP_LOCK_FREE(type) (locktag_##type) // Returns "busy" lock value for the given lock teyp. #define KMP_LOCK_BUSY(v, type) ((v) << KMP_LOCK_SHIFT | locktag_##type) // Returns lock value after removing (shifting) lock tag. 
#define KMP_LOCK_STRIP(v) ((v) >> KMP_LOCK_SHIFT) // Initializes global states and data structures for managing dynamic user // locks. extern void __kmp_init_dynamic_user_locks(); // Allocates and returns an indirect lock with the given indirect lock tag. extern kmp_indirect_lock_t * __kmp_allocate_indirect_lock(void **, kmp_int32, kmp_indirect_locktag_t); // Cleans up global states and data structures for managing dynamic user locks. extern void __kmp_cleanup_indirect_user_locks(); // Default user lock sequence when not using hinted locks. extern kmp_dyna_lockseq_t __kmp_user_lock_seq; // Jump table for "set lock location", available only for indirect locks. extern void (*__kmp_indirect_set_location[KMP_NUM_I_LOCKS])(kmp_user_lock_p, const ident_t *); #define KMP_SET_I_LOCK_LOCATION(lck, loc) \ { \ if (__kmp_indirect_set_location[(lck)->type] != NULL) \ __kmp_indirect_set_location[(lck)->type]((lck)->lock, loc); \ } // Jump table for "set lock flags", available only for indirect locks. extern void (*__kmp_indirect_set_flags[KMP_NUM_I_LOCKS])(kmp_user_lock_p, kmp_lock_flags_t); #define KMP_SET_I_LOCK_FLAGS(lck, flag) \ { \ if (__kmp_indirect_set_flags[(lck)->type] != NULL) \ __kmp_indirect_set_flags[(lck)->type]((lck)->lock, flag); \ } // Jump table for "get lock location", available only for indirect locks. extern const ident_t *(*__kmp_indirect_get_location[KMP_NUM_I_LOCKS])( kmp_user_lock_p); #define KMP_GET_I_LOCK_LOCATION(lck) \ (__kmp_indirect_get_location[(lck)->type] != NULL \ ? __kmp_indirect_get_location[(lck)->type]((lck)->lock) \ : NULL) // Jump table for "get lock flags", available only for indirect locks. extern kmp_lock_flags_t (*__kmp_indirect_get_flags[KMP_NUM_I_LOCKS])( kmp_user_lock_p); #define KMP_GET_I_LOCK_FLAGS(lck) \ (__kmp_indirect_get_flags[(lck)->type] != NULL \ ? 
__kmp_indirect_get_flags[(lck)->type]((lck)->lock) \ : NULL) #define KMP_I_LOCK_CHUNK \ 1024 // number of kmp_indirect_lock_t objects to be allocated together // Lock table for indirect locks. typedef struct kmp_indirect_lock_table { kmp_indirect_lock_t **table; // blocks of indirect locks allocated kmp_lock_index_t size; // size of the indirect lock table kmp_lock_index_t next; // index to the next lock to be allocated } kmp_indirect_lock_table_t; extern kmp_indirect_lock_table_t __kmp_i_lock_table; // Returns the indirect lock associated with the given index. #define KMP_GET_I_LOCK(index) \ (*(__kmp_i_lock_table.table + (index) / KMP_I_LOCK_CHUNK) + \ (index) % KMP_I_LOCK_CHUNK) // Number of locks in a lock block, which is fixed to "1" now. // TODO: No lock block implementation now. If we do support, we need to manage // lock block data structure for each indirect lock type. extern int __kmp_num_locks_in_block; // Fast lock table lookup without consistency checking #define KMP_LOOKUP_I_LOCK(l) \ ((OMP_LOCK_T_SIZE < sizeof(void *)) ? KMP_GET_I_LOCK(KMP_EXTRACT_I_INDEX(l)) \ : *((kmp_indirect_lock_t **)(l))) // Used once in kmp_error.cpp extern kmp_int32 __kmp_get_user_lock_owner(kmp_user_lock_p, kmp_uint32); #else // KMP_USE_DYNAMIC_LOCK #define KMP_LOCK_BUSY(v, type) (v) #define KMP_LOCK_FREE(type) 0 #define KMP_LOCK_STRIP(v) (v) #endif // KMP_USE_DYNAMIC_LOCK // data structure for using backoff within spin locks. 
typedef struct { kmp_uint32 step; // current step kmp_uint32 max_backoff; // upper bound of outer delay loop kmp_uint32 min_tick; // size of inner delay loop in ticks (machine-dependent) } kmp_backoff_t; // Runtime's default backoff parameters extern kmp_backoff_t __kmp_spin_backoff_params; // Backoff function extern void __kmp_spin_backoff(kmp_backoff_t *); #ifdef __cplusplus } // extern "C" #endif // __cplusplus #endif /* KMP_LOCK_H */ Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_os.h =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_os.h (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_os.h (revision 357059) @@ -1,1040 +1,1041 @@ /* * kmp_os.h -- KPTS runtime header file. */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef KMP_OS_H #define KMP_OS_H #include "kmp_config.h" #include #include #define KMP_FTN_PLAIN 1 #define KMP_FTN_APPEND 2 #define KMP_FTN_UPPER 3 /* #define KMP_FTN_PREPEND 4 #define KMP_FTN_UAPPEND 5 */ #define KMP_PTR_SKIP (sizeof(void *)) /* -------------------------- Compiler variations ------------------------ */ #define KMP_OFF 0 #define KMP_ON 1 #define KMP_MEM_CONS_VOLATILE 0 #define KMP_MEM_CONS_FENCE 1 #ifndef KMP_MEM_CONS_MODEL #define KMP_MEM_CONS_MODEL KMP_MEM_CONS_VOLATILE #endif #ifndef __has_cpp_attribute #define __has_cpp_attribute(x) 0 #endif #ifndef __has_attribute #define __has_attribute(x) 0 #endif /* ------------------------- Compiler recognition ---------------------- */ #define KMP_COMPILER_ICC 0 #define KMP_COMPILER_GCC 0 #define KMP_COMPILER_CLANG 0 #define KMP_COMPILER_MSVC 0 #if defined(__INTEL_COMPILER) #undef KMP_COMPILER_ICC #define KMP_COMPILER_ICC 1 #elif defined(__clang__) #undef KMP_COMPILER_CLANG #define KMP_COMPILER_CLANG 1 #elif defined(__GNUC__) #undef KMP_COMPILER_GCC #define KMP_COMPILER_GCC 1 #elif defined(_MSC_VER) #undef KMP_COMPILER_MSVC #define KMP_COMPILER_MSVC 1 #else #error Unknown compiler #endif -#if (KMP_OS_LINUX || KMP_OS_WINDOWS) && !KMP_OS_CNK +#if (KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_FREEBSD) && !KMP_OS_CNK #define KMP_AFFINITY_SUPPORTED 1 #if KMP_OS_WINDOWS && KMP_ARCH_X86_64 #define KMP_GROUP_AFFINITY 1 #else #define KMP_GROUP_AFFINITY 0 #endif #else #define KMP_AFFINITY_SUPPORTED 0 #define KMP_GROUP_AFFINITY 0 #endif /* Check for quad-precision extension. 
*/ #define KMP_HAVE_QUAD 0 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 #if KMP_COMPILER_ICC /* _Quad is already defined for icc */ #undef KMP_HAVE_QUAD #define KMP_HAVE_QUAD 1 #elif KMP_COMPILER_CLANG /* Clang doesn't support a software-implemented 128-bit extended precision type yet */ typedef long double _Quad; #elif KMP_COMPILER_GCC /* GCC on NetBSD lacks __multc3/__divtc3 builtins needed for quad */ #if !KMP_OS_NETBSD typedef __float128 _Quad; #undef KMP_HAVE_QUAD #define KMP_HAVE_QUAD 1 #endif #elif KMP_COMPILER_MSVC typedef long double _Quad; #endif #else #if __LDBL_MAX_EXP__ >= 16384 && KMP_COMPILER_GCC typedef long double _Quad; #undef KMP_HAVE_QUAD #define KMP_HAVE_QUAD 1 #endif #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ #define KMP_USE_X87CONTROL 0 #if KMP_OS_WINDOWS #define KMP_END_OF_LINE "\r\n" typedef char kmp_int8; typedef unsigned char kmp_uint8; typedef short kmp_int16; typedef unsigned short kmp_uint16; typedef int kmp_int32; typedef unsigned int kmp_uint32; #define KMP_INT32_SPEC "d" #define KMP_UINT32_SPEC "u" #ifndef KMP_STRUCT64 typedef __int64 kmp_int64; typedef unsigned __int64 kmp_uint64; #define KMP_INT64_SPEC "I64d" #define KMP_UINT64_SPEC "I64u" #else struct kmp_struct64 { kmp_int32 a, b; }; typedef struct kmp_struct64 kmp_int64; typedef struct kmp_struct64 kmp_uint64; /* Not sure what to use for KMP_[U]INT64_SPEC here */ #endif #if KMP_ARCH_X86 && KMP_MSVC_COMPAT #undef KMP_USE_X87CONTROL #define KMP_USE_X87CONTROL 1 #endif #if KMP_ARCH_X86_64 #define KMP_INTPTR 1 typedef __int64 kmp_intptr_t; typedef unsigned __int64 kmp_uintptr_t; #define KMP_INTPTR_SPEC "I64d" #define KMP_UINTPTR_SPEC "I64u" #endif #endif /* KMP_OS_WINDOWS */ #if KMP_OS_UNIX #define KMP_END_OF_LINE "\n" typedef char kmp_int8; typedef unsigned char kmp_uint8; typedef short kmp_int16; typedef unsigned short kmp_uint16; typedef int kmp_int32; typedef unsigned int kmp_uint32; typedef long long kmp_int64; typedef unsigned long long kmp_uint64; #define KMP_INT32_SPEC "d" #define 
KMP_UINT32_SPEC "u" #define KMP_INT64_SPEC "lld" #define KMP_UINT64_SPEC "llu" #endif /* KMP_OS_UNIX */ #if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS #define KMP_SIZE_T_SPEC KMP_UINT32_SPEC -#elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 +#elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 #define KMP_SIZE_T_SPEC KMP_UINT64_SPEC #else #error "Can't determine size_t printf format specifier." #endif #if KMP_ARCH_X86 #define KMP_SIZE_T_MAX (0xFFFFFFFF) #else #define KMP_SIZE_T_MAX (0xFFFFFFFFFFFFFFFF) #endif typedef size_t kmp_size_t; typedef float kmp_real32; typedef double kmp_real64; #ifndef KMP_INTPTR #define KMP_INTPTR 1 typedef long kmp_intptr_t; typedef unsigned long kmp_uintptr_t; #define KMP_INTPTR_SPEC "ld" #define KMP_UINTPTR_SPEC "lu" #endif #ifdef BUILD_I8 typedef kmp_int64 kmp_int; typedef kmp_uint64 kmp_uint; #else typedef kmp_int32 kmp_int; typedef kmp_uint32 kmp_uint; #endif /* BUILD_I8 */ #define KMP_INT_MAX ((kmp_int32)0x7FFFFFFF) #define KMP_INT_MIN ((kmp_int32)0x80000000) #ifdef __cplusplus // macros to cast out qualifiers and to re-interpret types #define CCAST(type, var) const_cast(var) #define RCAST(type, var) reinterpret_cast(var) //------------------------------------------------------------------------- // template for debug prints specification ( d, u, lld, llu ), and to obtain // signed/unsigned flavors of a type template struct traits_t {}; // int template <> struct traits_t { typedef signed int signed_t; typedef unsigned int unsigned_t; typedef double floating_t; static char const *spec; static const signed_t max_value = 0x7fffffff; static const signed_t min_value = 0x80000000; static const int type_size = sizeof(signed_t); }; // unsigned int template <> struct traits_t { typedef signed int signed_t; typedef unsigned int unsigned_t; typedef double floating_t; static char const *spec; static const unsigned_t max_value = 0xffffffff; static const 
unsigned_t min_value = 0x00000000; static const int type_size = sizeof(unsigned_t); }; // long template <> struct traits_t { typedef signed long signed_t; typedef unsigned long unsigned_t; typedef long double floating_t; static char const *spec; static const int type_size = sizeof(signed_t); }; // long long template <> struct traits_t { typedef signed long long signed_t; typedef unsigned long long unsigned_t; typedef long double floating_t; static char const *spec; static const signed_t max_value = 0x7fffffffffffffffLL; static const signed_t min_value = 0x8000000000000000LL; static const int type_size = sizeof(signed_t); }; // unsigned long long template <> struct traits_t { typedef signed long long signed_t; typedef unsigned long long unsigned_t; typedef long double floating_t; static char const *spec; static const unsigned_t max_value = 0xffffffffffffffffLL; static const unsigned_t min_value = 0x0000000000000000LL; static const int type_size = sizeof(unsigned_t); }; //------------------------------------------------------------------------- #else #define CCAST(type, var) (type)(var) #define RCAST(type, var) (type)(var) #endif // __cplusplus #define KMP_EXPORT extern /* export declaration in guide libraries */ #if __GNUC__ >= 4 && !defined(__MINGW32__) #define __forceinline __inline #endif #if KMP_OS_WINDOWS #include static inline int KMP_GET_PAGE_SIZE(void) { SYSTEM_INFO si; GetSystemInfo(&si); return si.dwPageSize; } #else #define KMP_GET_PAGE_SIZE() getpagesize() #endif #define PAGE_ALIGNED(_addr) \ (!((size_t)_addr & (size_t)(KMP_GET_PAGE_SIZE() - 1))) #define ALIGN_TO_PAGE(x) \ (void *)(((size_t)(x)) & ~((size_t)(KMP_GET_PAGE_SIZE() - 1))) /* ---------- Support for cache alignment, padding, etc. 
----------------*/ #ifdef __cplusplus extern "C" { #endif // __cplusplus #define INTERNODE_CACHE_LINE 4096 /* for multi-node systems */ /* Define the default size of the cache line */ #ifndef CACHE_LINE #define CACHE_LINE 128 /* cache line size in bytes */ #else #if (CACHE_LINE < 64) && !defined(KMP_OS_DARWIN) // 2006-02-13: This produces too many warnings on OS X*. Disable for now #warning CACHE_LINE is too small. #endif #endif /* CACHE_LINE */ #define KMP_CACHE_PREFETCH(ADDR) /* nothing */ // Define attribute that indicates that the fall through from the previous // case label is intentional and should not be diagnosed by a compiler // Code from libcxx/include/__config // Use a function like macro to imply that it must be followed by a semicolon #if __cplusplus > 201402L && __has_cpp_attribute(fallthrough) # define KMP_FALLTHROUGH() [[fallthrough]] #elif __has_cpp_attribute(clang::fallthrough) # define KMP_FALLTHROUGH() [[clang::fallthrough]] #elif __has_attribute(fallthough) || __GNUC__ >= 7 # define KMP_FALLTHROUGH() __attribute__((__fallthrough__)) #else # define KMP_FALLTHROUGH() ((void)0) #endif // Define attribute that indicates a function does not return #if __cplusplus >= 201103L #define KMP_NORETURN [[noreturn]] #elif KMP_OS_WINDOWS #define KMP_NORETURN __declspec(noreturn) #else #define KMP_NORETURN __attribute__((noreturn)) #endif #if KMP_OS_WINDOWS && KMP_MSVC_COMPAT #define KMP_ALIGN(bytes) __declspec(align(bytes)) #define KMP_THREAD_LOCAL __declspec(thread) #define KMP_ALIAS /* Nothing */ #else #define KMP_ALIGN(bytes) __attribute__((aligned(bytes))) #define KMP_THREAD_LOCAL __thread #define KMP_ALIAS(alias_of) __attribute__((alias(alias_of))) #endif #if KMP_HAVE_WEAK_ATTRIBUTE #define KMP_WEAK_ATTRIBUTE __attribute__((weak)) #else #define KMP_WEAK_ATTRIBUTE /* Nothing */ #endif // Define KMP_VERSION_SYMBOL and KMP_EXPAND_NAME #ifndef KMP_STR #define KMP_STR(x) _KMP_STR(x) #define _KMP_STR(x) #x #endif #ifdef KMP_USE_VERSION_SYMBOLS // If using 
versioned symbols, KMP_EXPAND_NAME prepends // __kmp_api_ to the real API name #define KMP_EXPAND_NAME(api_name) _KMP_EXPAND_NAME(api_name) #define _KMP_EXPAND_NAME(api_name) __kmp_api_##api_name #define KMP_VERSION_SYMBOL(api_name, ver_num, ver_str) \ _KMP_VERSION_SYMBOL(api_name, ver_num, ver_str, "VERSION") #define _KMP_VERSION_SYMBOL(api_name, ver_num, ver_str, default_ver) \ __typeof__(__kmp_api_##api_name) __kmp_api_##api_name##_##ver_num##_alias \ __attribute__((alias(KMP_STR(__kmp_api_##api_name)))); \ __asm__( \ ".symver " KMP_STR(__kmp_api_##api_name##_##ver_num##_alias) "," KMP_STR( \ api_name) "@" ver_str "\n\t"); \ __asm__(".symver " KMP_STR(__kmp_api_##api_name) "," KMP_STR( \ api_name) "@@" default_ver "\n\t") #else // KMP_USE_VERSION_SYMBOLS #define KMP_EXPAND_NAME(api_name) api_name #define KMP_VERSION_SYMBOL(api_name, ver_num, ver_str) /* Nothing */ #endif // KMP_USE_VERSION_SYMBOLS /* Temporary note: if performance testing of this passes, we can remove all references to KMP_DO_ALIGN and replace with KMP_ALIGN. */ #define KMP_DO_ALIGN(bytes) KMP_ALIGN(bytes) #define KMP_ALIGN_CACHE KMP_ALIGN(CACHE_LINE) #define KMP_ALIGN_CACHE_INTERNODE KMP_ALIGN(INTERNODE_CACHE_LINE) /* General purpose fence types for memory operations */ enum kmp_mem_fence_type { kmp_no_fence, /* No memory fence */ kmp_acquire_fence, /* Acquire (read) memory fence */ kmp_release_fence, /* Release (write) memory fence */ kmp_full_fence /* Full (read+write) memory fence */ }; // Synchronization primitives #if KMP_ASM_INTRINS && KMP_OS_WINDOWS #if KMP_MSVC_COMPAT && !KMP_COMPILER_CLANG #pragma intrinsic(InterlockedExchangeAdd) #pragma intrinsic(InterlockedCompareExchange) #pragma intrinsic(InterlockedExchange) #pragma intrinsic(InterlockedExchange64) #endif // Using InterlockedIncrement / InterlockedDecrement causes a library loading // ordering problem, so we use InterlockedExchangeAdd instead. 
#define KMP_TEST_THEN_INC32(p) InterlockedExchangeAdd((volatile long *)(p), 1) #define KMP_TEST_THEN_INC_ACQ32(p) \ InterlockedExchangeAdd((volatile long *)(p), 1) #define KMP_TEST_THEN_ADD4_32(p) InterlockedExchangeAdd((volatile long *)(p), 4) #define KMP_TEST_THEN_ADD4_ACQ32(p) \ InterlockedExchangeAdd((volatile long *)(p), 4) #define KMP_TEST_THEN_DEC32(p) InterlockedExchangeAdd((volatile long *)(p), -1) #define KMP_TEST_THEN_DEC_ACQ32(p) \ InterlockedExchangeAdd((volatile long *)(p), -1) #define KMP_TEST_THEN_ADD32(p, v) \ InterlockedExchangeAdd((volatile long *)(p), (v)) #define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \ InterlockedCompareExchange((volatile long *)(p), (long)(sv), (long)(cv)) #define KMP_XCHG_FIXED32(p, v) \ InterlockedExchange((volatile long *)(p), (long)(v)) #define KMP_XCHG_FIXED64(p, v) \ InterlockedExchange64((volatile kmp_int64 *)(p), (kmp_int64)(v)) inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) { kmp_int32 tmp = InterlockedExchange((volatile long *)p, *(long *)&v); return *(kmp_real32 *)&tmp; } // Routines that we still need to implement in assembly. 
extern kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v); extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v); extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v); extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v); extern kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 v); extern kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 v); extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v); extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v); extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v); extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv); extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv); extern kmp_int32 __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv); extern kmp_int32 __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv); extern kmp_int8 __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv); extern kmp_int16 __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv); extern kmp_int32 __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv); extern kmp_int64 __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv); extern kmp_int8 __kmp_xchg_fixed8(volatile kmp_int8 *p, kmp_int8 v); extern kmp_int16 __kmp_xchg_fixed16(volatile kmp_int16 *p, kmp_int16 v); extern kmp_int32 __kmp_xchg_fixed32(volatile kmp_int32 *p, kmp_int32 v); extern kmp_int64 __kmp_xchg_fixed64(volatile kmp_int64 *p, kmp_int64 v); extern kmp_real32 __kmp_xchg_real32(volatile kmp_real32 *p, kmp_real32 v); extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v); //#define KMP_TEST_THEN_INC32(p) __kmp_test_then_add32((p), 1) //#define KMP_TEST_THEN_INC_ACQ32(p) 
__kmp_test_then_add32((p), 1) #define KMP_TEST_THEN_INC64(p) __kmp_test_then_add64((p), 1LL) #define KMP_TEST_THEN_INC_ACQ64(p) __kmp_test_then_add64((p), 1LL) //#define KMP_TEST_THEN_ADD4_32(p) __kmp_test_then_add32((p), 4) //#define KMP_TEST_THEN_ADD4_ACQ32(p) __kmp_test_then_add32((p), 4) #define KMP_TEST_THEN_ADD4_64(p) __kmp_test_then_add64((p), 4LL) #define KMP_TEST_THEN_ADD4_ACQ64(p) __kmp_test_then_add64((p), 4LL) //#define KMP_TEST_THEN_DEC32(p) __kmp_test_then_add32((p), -1) //#define KMP_TEST_THEN_DEC_ACQ32(p) __kmp_test_then_add32((p), -1) #define KMP_TEST_THEN_DEC64(p) __kmp_test_then_add64((p), -1LL) #define KMP_TEST_THEN_DEC_ACQ64(p) __kmp_test_then_add64((p), -1LL) //#define KMP_TEST_THEN_ADD32(p, v) __kmp_test_then_add32((p), (v)) #define KMP_TEST_THEN_ADD8(p, v) __kmp_test_then_add8((p), (v)) #define KMP_TEST_THEN_ADD64(p, v) __kmp_test_then_add64((p), (v)) #define KMP_TEST_THEN_OR8(p, v) __kmp_test_then_or8((p), (v)) #define KMP_TEST_THEN_AND8(p, v) __kmp_test_then_and8((p), (v)) #define KMP_TEST_THEN_OR32(p, v) __kmp_test_then_or32((p), (v)) #define KMP_TEST_THEN_AND32(p, v) __kmp_test_then_and32((p), (v)) #define KMP_TEST_THEN_OR64(p, v) __kmp_test_then_or64((p), (v)) #define KMP_TEST_THEN_AND64(p, v) __kmp_test_then_and64((p), (v)) #define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \ __kmp_compare_and_store8((p), (cv), (sv)) #define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \ __kmp_compare_and_store8((p), (cv), (sv)) #define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \ __kmp_compare_and_store16((p), (cv), (sv)) #define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \ __kmp_compare_and_store16((p), (cv), (sv)) #define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \ __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ (kmp_int32)(sv)) #define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \ __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ (kmp_int32)(sv)) #define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \ __kmp_compare_and_store64((volatile 
kmp_int64 *)(p), (kmp_int64)(cv), \ (kmp_int64)(sv)) #define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \ __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ (kmp_int64)(sv)) #if KMP_ARCH_X86 #define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ (kmp_int32)(sv)) #else /* 64 bit pointers */ #define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ (kmp_int64)(sv)) #endif /* KMP_ARCH_X86 */ #define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \ __kmp_compare_and_store_ret8((p), (cv), (sv)) #define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \ __kmp_compare_and_store_ret16((p), (cv), (sv)) #define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \ __kmp_compare_and_store_ret64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ (kmp_int64)(sv)) #define KMP_XCHG_FIXED8(p, v) \ __kmp_xchg_fixed8((volatile kmp_int8 *)(p), (kmp_int8)(v)); #define KMP_XCHG_FIXED16(p, v) __kmp_xchg_fixed16((p), (v)); //#define KMP_XCHG_FIXED32(p, v) __kmp_xchg_fixed32((p), (v)); //#define KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v)); //#define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v)); #define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v)); #elif (KMP_ASM_INTRINS && KMP_OS_UNIX) || !(KMP_ARCH_X86 || KMP_ARCH_X86_64) /* cast p to correct type so that proper intrinsic will be used */ #define KMP_TEST_THEN_INC32(p) \ __sync_fetch_and_add((volatile kmp_int32 *)(p), 1) #define KMP_TEST_THEN_INC_ACQ32(p) \ __sync_fetch_and_add((volatile kmp_int32 *)(p), 1) #if KMP_ARCH_MIPS #define KMP_TEST_THEN_INC64(p) \ __atomic_fetch_add((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST) #define KMP_TEST_THEN_INC_ACQ64(p) \ __atomic_fetch_add((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST) #else #define KMP_TEST_THEN_INC64(p) \ __sync_fetch_and_add((volatile kmp_int64 *)(p), 1LL) #define KMP_TEST_THEN_INC_ACQ64(p) \ __sync_fetch_and_add((volatile kmp_int64 *)(p), 1LL) #endif 
#define KMP_TEST_THEN_ADD4_32(p) \ __sync_fetch_and_add((volatile kmp_int32 *)(p), 4) #define KMP_TEST_THEN_ADD4_ACQ32(p) \ __sync_fetch_and_add((volatile kmp_int32 *)(p), 4) #if KMP_ARCH_MIPS #define KMP_TEST_THEN_ADD4_64(p) \ __atomic_fetch_add((volatile kmp_int64 *)(p), 4LL, __ATOMIC_SEQ_CST) #define KMP_TEST_THEN_ADD4_ACQ64(p) \ __atomic_fetch_add((volatile kmp_int64 *)(p), 4LL, __ATOMIC_SEQ_CST) #define KMP_TEST_THEN_DEC64(p) \ __atomic_fetch_sub((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST) #define KMP_TEST_THEN_DEC_ACQ64(p) \ __atomic_fetch_sub((volatile kmp_int64 *)(p), 1LL, __ATOMIC_SEQ_CST) #else #define KMP_TEST_THEN_ADD4_64(p) \ __sync_fetch_and_add((volatile kmp_int64 *)(p), 4LL) #define KMP_TEST_THEN_ADD4_ACQ64(p) \ __sync_fetch_and_add((volatile kmp_int64 *)(p), 4LL) #define KMP_TEST_THEN_DEC64(p) \ __sync_fetch_and_sub((volatile kmp_int64 *)(p), 1LL) #define KMP_TEST_THEN_DEC_ACQ64(p) \ __sync_fetch_and_sub((volatile kmp_int64 *)(p), 1LL) #endif #define KMP_TEST_THEN_DEC32(p) \ __sync_fetch_and_sub((volatile kmp_int32 *)(p), 1) #define KMP_TEST_THEN_DEC_ACQ32(p) \ __sync_fetch_and_sub((volatile kmp_int32 *)(p), 1) #define KMP_TEST_THEN_ADD8(p, v) \ __sync_fetch_and_add((volatile kmp_int8 *)(p), (kmp_int8)(v)) #define KMP_TEST_THEN_ADD32(p, v) \ __sync_fetch_and_add((volatile kmp_int32 *)(p), (kmp_int32)(v)) #if KMP_ARCH_MIPS #define KMP_TEST_THEN_ADD64(p, v) \ __atomic_fetch_add((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \ __ATOMIC_SEQ_CST) #else #define KMP_TEST_THEN_ADD64(p, v) \ __sync_fetch_and_add((volatile kmp_int64 *)(p), (kmp_int64)(v)) #endif #define KMP_TEST_THEN_OR8(p, v) \ __sync_fetch_and_or((volatile kmp_int8 *)(p), (kmp_int8)(v)) #define KMP_TEST_THEN_AND8(p, v) \ __sync_fetch_and_and((volatile kmp_int8 *)(p), (kmp_int8)(v)) #define KMP_TEST_THEN_OR32(p, v) \ __sync_fetch_and_or((volatile kmp_uint32 *)(p), (kmp_uint32)(v)) #define KMP_TEST_THEN_AND32(p, v) \ __sync_fetch_and_and((volatile kmp_uint32 *)(p), (kmp_uint32)(v)) #if 
KMP_ARCH_MIPS #define KMP_TEST_THEN_OR64(p, v) \ __atomic_fetch_or((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \ __ATOMIC_SEQ_CST) #define KMP_TEST_THEN_AND64(p, v) \ __atomic_fetch_and((volatile kmp_uint64 *)(p), (kmp_uint64)(v), \ __ATOMIC_SEQ_CST) #else #define KMP_TEST_THEN_OR64(p, v) \ __sync_fetch_and_or((volatile kmp_uint64 *)(p), (kmp_uint64)(v)) #define KMP_TEST_THEN_AND64(p, v) \ __sync_fetch_and_and((volatile kmp_uint64 *)(p), (kmp_uint64)(v)) #endif #define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \ __sync_bool_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \ (kmp_uint8)(sv)) #define KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \ __sync_bool_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \ (kmp_uint8)(sv)) #define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \ __sync_bool_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \ (kmp_uint16)(sv)) #define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \ __sync_bool_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \ (kmp_uint16)(sv)) #define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \ __sync_bool_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \ (kmp_uint32)(sv)) #define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \ __sync_bool_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \ (kmp_uint32)(sv)) #define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ __sync_bool_compare_and_swap((void *volatile *)(p), (void *)(cv), \ (void *)(sv)) #define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \ __sync_val_compare_and_swap((volatile kmp_uint8 *)(p), (kmp_uint8)(cv), \ (kmp_uint8)(sv)) #define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \ __sync_val_compare_and_swap((volatile kmp_uint16 *)(p), (kmp_uint16)(cv), \ (kmp_uint16)(sv)) #define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \ __sync_val_compare_and_swap((volatile kmp_uint32 *)(p), (kmp_uint32)(cv), \ (kmp_uint32)(sv)) #if KMP_ARCH_MIPS static inline bool mips_sync_bool_compare_and_swap( volatile kmp_uint64 *p, kmp_uint64 cv, kmp_uint64 
sv) { return __atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } static inline bool mips_sync_val_compare_and_swap( volatile kmp_uint64 *p, kmp_uint64 cv, kmp_uint64 sv) { __atomic_compare_exchange(p, &cv, &sv, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); return cv; } #define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \ mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),\ (kmp_uint64)(sv)) #define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \ mips_sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv),\ (kmp_uint64)(sv)) #define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \ mips_sync_val_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \ (kmp_uint64)(sv)) #else #define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \ __sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \ (kmp_uint64)(sv)) #define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \ __sync_bool_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \ (kmp_uint64)(sv)) #define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \ __sync_val_compare_and_swap((volatile kmp_uint64 *)(p), (kmp_uint64)(cv), \ (kmp_uint64)(sv)) #endif #define KMP_XCHG_FIXED8(p, v) \ __sync_lock_test_and_set((volatile kmp_uint8 *)(p), (kmp_uint8)(v)) #define KMP_XCHG_FIXED16(p, v) \ __sync_lock_test_and_set((volatile kmp_uint16 *)(p), (kmp_uint16)(v)) #define KMP_XCHG_FIXED32(p, v) \ __sync_lock_test_and_set((volatile kmp_uint32 *)(p), (kmp_uint32)(v)) #define KMP_XCHG_FIXED64(p, v) \ __sync_lock_test_and_set((volatile kmp_uint64 *)(p), (kmp_uint64)(v)) inline kmp_real32 KMP_XCHG_REAL32(volatile kmp_real32 *p, kmp_real32 v) { kmp_int32 tmp = __sync_lock_test_and_set((volatile kmp_uint32 *)(p), *(kmp_uint32 *)&v); return *(kmp_real32 *)&tmp; } inline kmp_real64 KMP_XCHG_REAL64(volatile kmp_real64 *p, kmp_real64 v) { kmp_int64 tmp = __sync_lock_test_and_set((volatile kmp_uint64 *)(p), *(kmp_uint64 *)&v); return *(kmp_real64 *)&tmp; } #else extern kmp_int8 
__kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 v); extern kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 v); extern kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 v); extern kmp_int32 __kmp_test_then_add32(volatile kmp_int32 *p, kmp_int32 v); extern kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 v); extern kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 v); extern kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 v); extern kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 v); extern kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 v); extern kmp_int8 __kmp_compare_and_store8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv); extern kmp_int16 __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv); extern kmp_int32 __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv); extern kmp_int32 __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv); extern kmp_int8 __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv); extern kmp_int16 __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv); extern kmp_int32 __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv); extern kmp_int64 __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv); extern kmp_int8 __kmp_xchg_fixed8(volatile kmp_int8 *p, kmp_int8 v); extern kmp_int16 __kmp_xchg_fixed16(volatile kmp_int16 *p, kmp_int16 v); extern kmp_int32 __kmp_xchg_fixed32(volatile kmp_int32 *p, kmp_int32 v); extern kmp_int64 __kmp_xchg_fixed64(volatile kmp_int64 *p, kmp_int64 v); extern kmp_real32 __kmp_xchg_real32(volatile kmp_real32 *p, kmp_real32 v); extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v); #define KMP_TEST_THEN_INC32(p) \ __kmp_test_then_add32((volatile kmp_int32 *)(p), 1) #define KMP_TEST_THEN_INC_ACQ32(p) \ 
__kmp_test_then_add32((volatile kmp_int32 *)(p), 1) #define KMP_TEST_THEN_INC64(p) \ __kmp_test_then_add64((volatile kmp_int64 *)(p), 1LL) #define KMP_TEST_THEN_INC_ACQ64(p) \ __kmp_test_then_add64((volatile kmp_int64 *)(p), 1LL) #define KMP_TEST_THEN_ADD4_32(p) \ __kmp_test_then_add32((volatile kmp_int32 *)(p), 4) #define KMP_TEST_THEN_ADD4_ACQ32(p) \ __kmp_test_then_add32((volatile kmp_int32 *)(p), 4) #define KMP_TEST_THEN_ADD4_64(p) \ __kmp_test_then_add64((volatile kmp_int64 *)(p), 4LL) #define KMP_TEST_THEN_ADD4_ACQ64(p) \ __kmp_test_then_add64((volatile kmp_int64 *)(p), 4LL) #define KMP_TEST_THEN_DEC32(p) \ __kmp_test_then_add32((volatile kmp_int32 *)(p), -1) #define KMP_TEST_THEN_DEC_ACQ32(p) \ __kmp_test_then_add32((volatile kmp_int32 *)(p), -1) #define KMP_TEST_THEN_DEC64(p) \ __kmp_test_then_add64((volatile kmp_int64 *)(p), -1LL) #define KMP_TEST_THEN_DEC_ACQ64(p) \ __kmp_test_then_add64((volatile kmp_int64 *)(p), -1LL) #define KMP_TEST_THEN_ADD8(p, v) \ __kmp_test_then_add8((volatile kmp_int8 *)(p), (kmp_int8)(v)) #define KMP_TEST_THEN_ADD32(p, v) \ __kmp_test_then_add32((volatile kmp_int32 *)(p), (kmp_int32)(v)) #define KMP_TEST_THEN_ADD64(p, v) \ __kmp_test_then_add64((volatile kmp_int64 *)(p), (kmp_int64)(v)) #define KMP_TEST_THEN_OR8(p, v) \ __kmp_test_then_or8((volatile kmp_int8 *)(p), (kmp_int8)(v)) #define KMP_TEST_THEN_AND8(p, v) \ __kmp_test_then_and8((volatile kmp_int8 *)(p), (kmp_int8)(v)) #define KMP_TEST_THEN_OR32(p, v) \ __kmp_test_then_or32((volatile kmp_uint32 *)(p), (kmp_uint32)(v)) #define KMP_TEST_THEN_AND32(p, v) \ __kmp_test_then_and32((volatile kmp_uint32 *)(p), (kmp_uint32)(v)) #define KMP_TEST_THEN_OR64(p, v) \ __kmp_test_then_or64((volatile kmp_uint64 *)(p), (kmp_uint64)(v)) #define KMP_TEST_THEN_AND64(p, v) \ __kmp_test_then_and64((volatile kmp_uint64 *)(p), (kmp_uint64)(v)) #define KMP_COMPARE_AND_STORE_ACQ8(p, cv, sv) \ __kmp_compare_and_store8((volatile kmp_int8 *)(p), (kmp_int8)(cv), \ (kmp_int8)(sv)) #define 
KMP_COMPARE_AND_STORE_REL8(p, cv, sv) \ __kmp_compare_and_store8((volatile kmp_int8 *)(p), (kmp_int8)(cv), \ (kmp_int8)(sv)) #define KMP_COMPARE_AND_STORE_ACQ16(p, cv, sv) \ __kmp_compare_and_store16((volatile kmp_int16 *)(p), (kmp_int16)(cv), \ (kmp_int16)(sv)) #define KMP_COMPARE_AND_STORE_REL16(p, cv, sv) \ __kmp_compare_and_store16((volatile kmp_int16 *)(p), (kmp_int16)(cv), \ (kmp_int16)(sv)) #define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv) \ __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ (kmp_int32)(sv)) #define KMP_COMPARE_AND_STORE_REL32(p, cv, sv) \ __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ (kmp_int32)(sv)) #define KMP_COMPARE_AND_STORE_ACQ64(p, cv, sv) \ __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ (kmp_int64)(sv)) #define KMP_COMPARE_AND_STORE_REL64(p, cv, sv) \ __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ (kmp_int64)(sv)) #if KMP_ARCH_X86 #define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ __kmp_compare_and_store32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ (kmp_int32)(sv)) #else /* 64 bit pointers */ #define KMP_COMPARE_AND_STORE_PTR(p, cv, sv) \ __kmp_compare_and_store64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ (kmp_int64)(sv)) #endif /* KMP_ARCH_X86 */ #define KMP_COMPARE_AND_STORE_RET8(p, cv, sv) \ __kmp_compare_and_store_ret8((p), (cv), (sv)) #define KMP_COMPARE_AND_STORE_RET16(p, cv, sv) \ __kmp_compare_and_store_ret16((p), (cv), (sv)) #define KMP_COMPARE_AND_STORE_RET32(p, cv, sv) \ __kmp_compare_and_store_ret32((volatile kmp_int32 *)(p), (kmp_int32)(cv), \ (kmp_int32)(sv)) #define KMP_COMPARE_AND_STORE_RET64(p, cv, sv) \ __kmp_compare_and_store_ret64((volatile kmp_int64 *)(p), (kmp_int64)(cv), \ (kmp_int64)(sv)) #define KMP_XCHG_FIXED8(p, v) \ __kmp_xchg_fixed8((volatile kmp_int8 *)(p), (kmp_int8)(v)); #define KMP_XCHG_FIXED16(p, v) __kmp_xchg_fixed16((p), (v)); #define KMP_XCHG_FIXED32(p, v) __kmp_xchg_fixed32((p), (v)); #define 
KMP_XCHG_FIXED64(p, v) __kmp_xchg_fixed64((p), (v)); #define KMP_XCHG_REAL32(p, v) __kmp_xchg_real32((p), (v)); #define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64((p), (v)); #endif /* KMP_ASM_INTRINS */ /* ------------- relaxed consistency memory model stuff ------------------ */ #if KMP_OS_WINDOWS #ifdef __ABSOFT_WIN #define KMP_MB() asm("nop") #define KMP_IMB() asm("nop") #else #define KMP_MB() /* _asm{ nop } */ #define KMP_IMB() /* _asm{ nop } */ #endif #endif /* KMP_OS_WINDOWS */ #if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \ - KMP_ARCH_MIPS64 + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 #define KMP_MB() __sync_synchronize() #endif #ifndef KMP_MB #define KMP_MB() /* nothing to do */ #endif #ifndef KMP_IMB #define KMP_IMB() /* nothing to do */ #endif #ifndef KMP_ST_REL32 #define KMP_ST_REL32(A, D) (*(A) = (D)) #endif #ifndef KMP_ST_REL64 #define KMP_ST_REL64(A, D) (*(A) = (D)) #endif #ifndef KMP_LD_ACQ32 #define KMP_LD_ACQ32(A) (*(A)) #endif #ifndef KMP_LD_ACQ64 #define KMP_LD_ACQ64(A) (*(A)) #endif /* ------------------------------------------------------------------------ */ // FIXME - maybe this should this be // // #define TCR_4(a) (*(volatile kmp_int32 *)(&a)) // #define TCW_4(a,b) (a) = (*(volatile kmp_int32 *)&(b)) // // #define TCR_8(a) (*(volatile kmp_int64 *)(a)) // #define TCW_8(a,b) (a) = (*(volatile kmp_int64 *)(&b)) // // I'm fairly certain this is the correct thing to do, but I'm afraid // of performance regressions. 
#define TCR_1(a) (a) #define TCW_1(a, b) (a) = (b) #define TCR_4(a) (a) #define TCW_4(a, b) (a) = (b) #define TCI_4(a) (++(a)) #define TCD_4(a) (--(a)) #define TCR_8(a) (a) #define TCW_8(a, b) (a) = (b) #define TCI_8(a) (++(a)) #define TCD_8(a) (--(a)) #define TCR_SYNC_4(a) (a) #define TCW_SYNC_4(a, b) (a) = (b) #define TCX_SYNC_4(a, b, c) \ KMP_COMPARE_AND_STORE_REL32((volatile kmp_int32 *)(volatile void *)&(a), \ (kmp_int32)(b), (kmp_int32)(c)) #define TCR_SYNC_8(a) (a) #define TCW_SYNC_8(a, b) (a) = (b) #define TCX_SYNC_8(a, b, c) \ KMP_COMPARE_AND_STORE_REL64((volatile kmp_int64 *)(volatile void *)&(a), \ (kmp_int64)(b), (kmp_int64)(c)) #if KMP_ARCH_X86 || KMP_ARCH_MIPS // What about ARM? #define TCR_PTR(a) ((void *)TCR_4(a)) #define TCW_PTR(a, b) TCW_4((a), (b)) #define TCR_SYNC_PTR(a) ((void *)TCR_SYNC_4(a)) #define TCW_SYNC_PTR(a, b) TCW_SYNC_4((a), (b)) #define TCX_SYNC_PTR(a, b, c) ((void *)TCX_SYNC_4((a), (b), (c))) #else /* 64 bit pointers */ #define TCR_PTR(a) ((void *)TCR_8(a)) #define TCW_PTR(a, b) TCW_8((a), (b)) #define TCR_SYNC_PTR(a) ((void *)TCR_SYNC_8(a)) #define TCW_SYNC_PTR(a, b) TCW_SYNC_8((a), (b)) #define TCX_SYNC_PTR(a, b, c) ((void *)TCX_SYNC_8((a), (b), (c))) #endif /* KMP_ARCH_X86 */ /* If these FTN_{TRUE,FALSE} values change, may need to change several places where they are used to check that language is Fortran, not C. 
*/ #ifndef FTN_TRUE #define FTN_TRUE TRUE #endif #ifndef FTN_FALSE #define FTN_FALSE FALSE #endif typedef void (*microtask_t)(int *gtid, int *npr, ...); #ifdef USE_VOLATILE_CAST #define VOLATILE_CAST(x) (volatile x) #else #define VOLATILE_CAST(x) (x) #endif #define KMP_WAIT __kmp_wait_4 #define KMP_WAIT_PTR __kmp_wait_4_ptr #define KMP_EQ __kmp_eq_4 #define KMP_NEQ __kmp_neq_4 #define KMP_LT __kmp_lt_4 #define KMP_GE __kmp_ge_4 #define KMP_LE __kmp_le_4 /* Workaround for Intel(R) 64 code gen bug when taking address of static array * (Intel(R) 64 Tracker #138) */ #if (KMP_ARCH_X86_64 || KMP_ARCH_PPC64) && KMP_OS_LINUX #define STATIC_EFI2_WORKAROUND #else #define STATIC_EFI2_WORKAROUND static #endif // Support of BGET usage #ifndef KMP_USE_BGET #define KMP_USE_BGET 1 #endif // Switches for OSS builds #ifndef USE_CMPXCHG_FIX #define USE_CMPXCHG_FIX 1 #endif // Enable dynamic user lock #define KMP_USE_DYNAMIC_LOCK 1 // Enable Intel(R) Transactional Synchronization Extensions (Intel(R) TSX) if // dynamic user lock is turned on #if KMP_USE_DYNAMIC_LOCK // Visual studio can't handle the asm sections in this code #define KMP_USE_TSX (KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_COMPILER_MSVC #ifdef KMP_USE_ADAPTIVE_LOCKS #undef KMP_USE_ADAPTIVE_LOCKS #endif #define KMP_USE_ADAPTIVE_LOCKS KMP_USE_TSX #endif // Enable tick time conversion of ticks to seconds #if KMP_STATS_ENABLED #define KMP_HAVE_TICK_TIME \ (KMP_OS_LINUX && (KMP_MIC || KMP_ARCH_X86 || KMP_ARCH_X86_64)) #endif // Warning levels enum kmp_warnings_level { kmp_warnings_off = 0, /* No warnings */ kmp_warnings_low, /* Minimal warnings (default) */ kmp_warnings_explicit = 6, /* Explicitly set to ON - more warnings */ kmp_warnings_verbose /* reserved */ }; #ifdef __cplusplus } // extern "C" #endif // __cplusplus // Macros for C++11 atomic functions #define KMP_ATOMIC_LD(p, order) (p)->load(std::memory_order_##order) #define KMP_ATOMIC_OP(op, p, v, order) (p)->op(v, std::memory_order_##order) // For non-default 
load/store #define KMP_ATOMIC_LD_ACQ(p) KMP_ATOMIC_LD(p, acquire) #define KMP_ATOMIC_LD_RLX(p) KMP_ATOMIC_LD(p, relaxed) #define KMP_ATOMIC_ST_REL(p, v) KMP_ATOMIC_OP(store, p, v, release) #define KMP_ATOMIC_ST_RLX(p, v) KMP_ATOMIC_OP(store, p, v, relaxed) // For non-default fetch_ #define KMP_ATOMIC_ADD(p, v) KMP_ATOMIC_OP(fetch_add, p, v, acq_rel) #define KMP_ATOMIC_SUB(p, v) KMP_ATOMIC_OP(fetch_sub, p, v, acq_rel) #define KMP_ATOMIC_AND(p, v) KMP_ATOMIC_OP(fetch_and, p, v, acq_rel) #define KMP_ATOMIC_OR(p, v) KMP_ATOMIC_OP(fetch_or, p, v, acq_rel) #define KMP_ATOMIC_INC(p) KMP_ATOMIC_OP(fetch_add, p, 1, acq_rel) #define KMP_ATOMIC_DEC(p) KMP_ATOMIC_OP(fetch_sub, p, 1, acq_rel) #define KMP_ATOMIC_ADD_RLX(p, v) KMP_ATOMIC_OP(fetch_add, p, v, relaxed) #define KMP_ATOMIC_INC_RLX(p) KMP_ATOMIC_OP(fetch_add, p, 1, relaxed) // Callers of the following functions cannot see the side effect on "expected". template bool __kmp_atomic_compare_store(std::atomic *p, T expected, T desired) { return p->compare_exchange_strong( expected, desired, std::memory_order_acq_rel, std::memory_order_relaxed); } template bool __kmp_atomic_compare_store_acq(std::atomic *p, T expected, T desired) { return p->compare_exchange_strong( expected, desired, std::memory_order_acquire, std::memory_order_relaxed); } template bool __kmp_atomic_compare_store_rel(std::atomic *p, T expected, T desired) { return p->compare_exchange_strong( expected, desired, std::memory_order_release, std::memory_order_relaxed); } #endif /* KMP_OS_H */ // Safe C API #include "kmp_safe_c_api.h" Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_platform.h =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_platform.h (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_platform.h (revision 357059) @@ -1,206 +1,210 @@ /* * kmp_platform.h -- header for determining 
operating system and architecture */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef KMP_PLATFORM_H #define KMP_PLATFORM_H /* ---------------------- Operating system recognition ------------------- */ #define KMP_OS_LINUX 0 #define KMP_OS_DRAGONFLY 0 #define KMP_OS_FREEBSD 0 #define KMP_OS_NETBSD 0 #define KMP_OS_OPENBSD 0 #define KMP_OS_DARWIN 0 #define KMP_OS_WINDOWS 0 #define KMP_OS_CNK 0 #define KMP_OS_HURD 0 #define KMP_OS_UNIX 0 /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. */ #ifdef _WIN32 #undef KMP_OS_WINDOWS #define KMP_OS_WINDOWS 1 #endif #if (defined __APPLE__ && defined __MACH__) #undef KMP_OS_DARWIN #define KMP_OS_DARWIN 1 #endif // in some ppc64 linux installations, only the second condition is met #if (defined __linux) #undef KMP_OS_LINUX #define KMP_OS_LINUX 1 #elif (defined __linux__) #undef KMP_OS_LINUX #define KMP_OS_LINUX 1 #else #endif #if (defined __DragonFly__) #undef KMP_OS_DRAGONFLY #define KMP_OS_DRAGONFLY 1 #endif #if (defined __FreeBSD__) #undef KMP_OS_FREEBSD #define KMP_OS_FREEBSD 1 #endif #if (defined __NetBSD__) #undef KMP_OS_NETBSD #define KMP_OS_NETBSD 1 #endif #if (defined __OpenBSD__) #undef KMP_OS_OPENBSD #define KMP_OS_OPENBSD 1 #endif #if (defined __bgq__) #undef KMP_OS_CNK #define KMP_OS_CNK 1 #endif #if (defined __GNU__) #undef KMP_OS_HURD #define KMP_OS_HURD 1 #endif #if (1 != \ KMP_OS_LINUX + KMP_OS_DRAGONFLY + KMP_OS_FREEBSD + KMP_OS_NETBSD + \ KMP_OS_OPENBSD + KMP_OS_DARWIN + KMP_OS_WINDOWS + KMP_OS_HURD) #error Unknown OS #endif #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ KMP_OS_OPENBSD || KMP_OS_DARWIN || KMP_OS_HURD #undef KMP_OS_UNIX #define KMP_OS_UNIX 1 
#endif /* ---------------------- Architecture recognition ------------------- */ #define KMP_ARCH_X86 0 #define KMP_ARCH_X86_64 0 #define KMP_ARCH_AARCH64 0 #define KMP_ARCH_PPC64_ELFv1 0 #define KMP_ARCH_PPC64_ELFv2 0 #define KMP_ARCH_PPC64 (KMP_ARCH_PPC64_ELFv2 || KMP_ARCH_PPC64_ELFv1) #define KMP_ARCH_MIPS 0 #define KMP_ARCH_MIPS64 0 +#define KMP_ARCH_RISCV64 0 #if KMP_OS_WINDOWS #if defined(_M_AMD64) || defined(__x86_64) #undef KMP_ARCH_X86_64 #define KMP_ARCH_X86_64 1 #else #undef KMP_ARCH_X86 #define KMP_ARCH_X86 1 #endif #endif #if KMP_OS_UNIX #if defined __x86_64 #undef KMP_ARCH_X86_64 #define KMP_ARCH_X86_64 1 #elif defined __i386 #undef KMP_ARCH_X86 #define KMP_ARCH_X86 1 #elif defined __powerpc64__ #if defined(_CALL_ELF) && _CALL_ELF == 2 #undef KMP_ARCH_PPC64_ELFv2 #define KMP_ARCH_PPC64_ELFv2 1 #else #undef KMP_ARCH_PPC64_ELFv1 #define KMP_ARCH_PPC64_ELFv1 1 #endif #elif defined __aarch64__ #undef KMP_ARCH_AARCH64 #define KMP_ARCH_AARCH64 1 #elif defined __mips__ #if defined __mips64 #undef KMP_ARCH_MIPS64 #define KMP_ARCH_MIPS64 1 #else #undef KMP_ARCH_MIPS #define KMP_ARCH_MIPS 1 #endif +#elif defined __riscv && __riscv_xlen == 64 +#undef KMP_ARCH_RISCV64 +#define KMP_ARCH_RISCV64 1 #endif #endif #if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7R__) || \ defined(__ARM_ARCH_7A__) #define KMP_ARCH_ARMV7 1 #endif #if defined(KMP_ARCH_ARMV7) || defined(__ARM_ARCH_6__) || \ defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6T2__) || \ defined(__ARM_ARCH_6ZK__) #define KMP_ARCH_ARMV6 1 #endif #if defined(KMP_ARCH_ARMV6) || defined(__ARM_ARCH_5T__) || \ defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ defined(__ARM_ARCH_5TEJ__) #define KMP_ARCH_ARMV5 1 #endif #if defined(KMP_ARCH_ARMV5) || defined(__ARM_ARCH_4__) || \ defined(__ARM_ARCH_4T__) #define KMP_ARCH_ARMV4 1 #endif #if defined(KMP_ARCH_ARMV4) || defined(__ARM_ARCH_3__) || \ defined(__ARM_ARCH_3M__) #define KMP_ARCH_ARMV3 1 #endif 
#if defined(KMP_ARCH_ARMV3) || defined(__ARM_ARCH_2__) #define KMP_ARCH_ARMV2 1 #endif #if defined(KMP_ARCH_ARMV2) #define KMP_ARCH_ARM 1 #endif #if defined(__MIC__) || defined(__MIC2__) #define KMP_MIC 1 #if __MIC2__ || __KNC__ #define KMP_MIC1 0 #define KMP_MIC2 1 #else #define KMP_MIC1 1 #define KMP_MIC2 0 #endif #else #define KMP_MIC 0 #define KMP_MIC1 0 #define KMP_MIC2 0 #endif /* Specify 32 bit architectures here */ #define KMP_32_BIT_ARCH (KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS) // Platforms which support Intel(R) Many Integrated Core Architecture #define KMP_MIC_SUPPORTED \ ((KMP_ARCH_X86 || KMP_ARCH_X86_64) && (KMP_OS_LINUX || KMP_OS_WINDOWS)) // TODO: Fixme - This is clever, but really fugly #if (1 != \ KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 + \ - KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64) + KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 + KMP_ARCH_RISCV64) #error Unknown or unsupported architecture #endif #endif // KMP_PLATFORM_H Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_runtime.cpp (revision 357059) @@ -1,8217 +1,8305 @@ /* * kmp_runtime.cpp -- KPTS runtime support library */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "kmp.h" #include "kmp_affinity.h" #include "kmp_atomic.h" #include "kmp_environment.h" #include "kmp_error.h" #include "kmp_i18n.h" #include "kmp_io.h" #include "kmp_itt.h" #include "kmp_settings.h" #include "kmp_stats.h" #include "kmp_str.h" #include "kmp_wait_release.h" #include "kmp_wrapper_getpid.h" #include "kmp_dispatch.h" #if KMP_USE_HIER_SCHED #include "kmp_dispatch_hier.h" #endif #if OMPT_SUPPORT #include "ompt-specific.h" #endif /* these are temporary issues to be dealt with */ #define KMP_USE_PRCTL 0 #if KMP_OS_WINDOWS #include #endif #include "tsan_annotations.h" #if defined(KMP_GOMP_COMPAT) char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes"; #endif /* defined(KMP_GOMP_COMPAT) */ char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: 5.0 (201611)"; #ifdef KMP_DEBUG char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable"; #endif /* KMP_DEBUG */ #define KMP_MIN(x, y) ((x) < (y) ? 
(x) : (y)) /* ------------------------------------------------------------------------ */ #if KMP_USE_MONITOR kmp_info_t __kmp_monitor; #endif /* Forward declarations */ void __kmp_cleanup(void); static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, int gtid); static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc); #if KMP_AFFINITY_SUPPORTED static void __kmp_partition_places(kmp_team_t *team, int update_master_only = 0); #endif static void __kmp_do_serial_initialize(void); void __kmp_fork_barrier(int gtid, int tid); void __kmp_join_barrier(int gtid); void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc); #ifdef USE_LOAD_BALANCE static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); #endif static int __kmp_expand_threads(int nNeed); #if KMP_OS_WINDOWS static int __kmp_unregister_root_other_thread(int gtid); #endif static void __kmp_unregister_library(void); // called by __kmp_internal_end() static void __kmp_reap_thread(kmp_info_t *thread, int is_root); kmp_info_t *__kmp_thread_pool_insert_pt = NULL; /* Calculate the identifier of the current thread */ /* fast (and somewhat portable) way to get unique identifier of executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */ int __kmp_get_global_thread_id() { int i; kmp_info_t **other_threads; size_t stack_data; char *stack_addr; size_t stack_size; char *stack_base; KA_TRACE( 1000, ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", __kmp_nth, __kmp_all_nth)); /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a parallel region, made it return KMP_GTID_DNE to force serial_initialize by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee __kmp_init_gtid for this to work. 
*/ if (!TCR_4(__kmp_init_gtid)) return KMP_GTID_DNE; #ifdef KMP_TDATA_GTID if (TCR_4(__kmp_gtid_mode) >= 3) { KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); return __kmp_gtid; } #endif if (TCR_4(__kmp_gtid_mode) >= 2) { KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); return __kmp_gtid_get_specific(); } KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); stack_addr = (char *)&stack_data; other_threads = __kmp_threads; /* ATT: The code below is a source of potential bugs due to unsynchronized access to __kmp_threads array. For example: 1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL. 2. Current thread is suspended by OS. 3. Another thread unregisters and finishes (debug versions of free() may fill memory with something like 0xEF). 4. Current thread is resumed. 5. Current thread reads junk from *thr. TODO: Fix it. --ln */ for (i = 0; i < __kmp_threads_capacity; i++) { kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); if (!thr) continue; stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); /* stack grows down -- search through all of the active threads */ if (stack_addr <= stack_base) { size_t stack_diff = stack_base - stack_addr; if (stack_diff <= stack_size) { /* The only way we can be closer than the allocated */ /* stack size is if we are running on this thread. */ KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); return i; } } } /* get specific to try and determine our gtid */ KA_TRACE(1000, ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " "thread, using TLS\n")); i = __kmp_gtid_get_specific(); /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ /* if we havn't been assigned a gtid, then return code */ if (i < 0) return i; /* dynamically updated stack window for uber threads to avoid get_specific call */ if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { KMP_FATAL(StackOverflow, i); } stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; if (stack_addr > stack_base) { TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base); } else { TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr); } /* Reprint stack bounds for ubermaster since they have been refined */ if (__kmp_storage_map) { char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; __kmp_print_storage_map_gtid(i, stack_beg, stack_end, other_threads[i]->th.th_info.ds.ds_stacksize, "th_%d stack (refinement)", i); } return i; } int __kmp_get_global_thread_id_reg() { int gtid; if (!__kmp_init_serial) { gtid = KMP_GTID_DNE; } else #ifdef KMP_TDATA_GTID if (TCR_4(__kmp_gtid_mode) >= 3) { KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); gtid = __kmp_gtid; } else #endif if (TCR_4(__kmp_gtid_mode) >= 2) { KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); gtid = __kmp_gtid_get_specific(); } else { KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); gtid = __kmp_get_global_thread_id(); } /* we must be a new uber master sibling thread */ if (gtid == KMP_GTID_DNE) { KA_TRACE(10, ("__kmp_get_global_thread_id_reg: Encountered new root thread. 
" "Registering a new gtid.\n")); __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); if (!__kmp_init_serial) { __kmp_do_serial_initialize(); gtid = __kmp_gtid_get_specific(); } else { gtid = __kmp_register_root(FALSE); } __kmp_release_bootstrap_lock(&__kmp_initz_lock); /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ } KMP_DEBUG_ASSERT(gtid >= 0); return gtid; } /* caller must hold forkjoin_lock */ void __kmp_check_stack_overlap(kmp_info_t *th) { int f; char *stack_beg = NULL; char *stack_end = NULL; int gtid; KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); if (__kmp_storage_map) { stack_end = (char *)th->th.th_info.ds.ds_stackbase; stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; gtid = __kmp_gtid_from_thread(th); if (gtid == KMP_GTID_MONITOR) { __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, "th_%s stack (%s)", "mon", (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); } else { __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, "th_%d stack (%s)", gtid, (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); } } /* No point in checking ubermaster threads since they use refinement and * cannot overlap */ gtid = __kmp_gtid_from_thread(th); if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { KA_TRACE(10, ("__kmp_check_stack_overlap: performing extensive checking\n")); if (stack_beg == NULL) { stack_end = (char *)th->th.th_info.ds.ds_stackbase; stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; } for (f = 0; f < __kmp_threads_capacity; f++) { kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); if (f_th && f_th != th) { char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); char *other_stack_beg = other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || (stack_end > other_stack_beg && stack_end < other_stack_end)) { /* Print the other stack values before the abort */ if (__kmp_storage_map) __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end, (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), __kmp_msg_null); } } } } KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); } /* ------------------------------------------------------------------------ */ void __kmp_infinite_loop(void) { static int done = FALSE; while (!done) { KMP_YIELD(TRUE); } } #define MAX_MESSAGE 512 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, char const *format, ...) 
{ char buffer[MAX_MESSAGE]; va_list ap; va_start(ap, format); KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long)size, format); __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); __kmp_vprintf(kmp_err, buffer, ap); #if KMP_PRINT_DATA_PLACEMENT int node; if (gtid >= 0) { if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { if (__kmp_storage_map_verbose) { node = __kmp_get_host_node(p1); if (node < 0) /* doesn't work, so don't try this next time */ __kmp_storage_map_verbose = FALSE; else { char *last; int lastNode; int localProc = __kmp_get_cpu_from_gtid(gtid); const int page_size = KMP_GET_PAGE_SIZE(); p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); if (localProc >= 0) __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc >> 1); else __kmp_printf_no_lock(" GTID %d\n", gtid); #if KMP_USE_PRCTL /* The more elaborate format is disabled for now because of the prctl * hanging bug. */ do { last = p1; lastNode = node; /* This loop collates adjacent pages with the same host node. */ do { (char *)p1 += page_size; } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, lastNode); } while (p1 <= p2); #else __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, (char *)p1 + (page_size - 1), __kmp_get_host_node(p1)); if (p1 < p2) { __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, (char *)p2 + (page_size - 1), __kmp_get_host_node(p2)); } #endif } } } else __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); } #endif /* KMP_PRINT_DATA_PLACEMENT */ __kmp_release_bootstrap_lock(&__kmp_stdio_lock); } void __kmp_warn(char const *format, ...) 
{ char buffer[MAX_MESSAGE]; va_list ap; if (__kmp_generate_warnings == kmp_warnings_off) { return; } va_start(ap, format); KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); __kmp_vprintf(kmp_err, buffer, ap); __kmp_release_bootstrap_lock(&__kmp_stdio_lock); va_end(ap); } void __kmp_abort_process() { // Later threads may stall here, but that's ok because abort() will kill them. __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); if (__kmp_debug_buf) { __kmp_dump_debug_buffer(); } if (KMP_OS_WINDOWS) { // Let other threads know of abnormal termination and prevent deadlock // if abort happened during library initialization or shutdown __kmp_global.g.g_abort = SIGABRT; /* On Windows* OS by default abort() causes pop-up error box, which stalls nightly testing. Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior() works well, but this function is not available in VS7 (this is not problem for DLL, but it is a problem for static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not help, at least in some versions of MS C RTL. It seems following sequence is the only way to simulate abort() and avoid pop-up error box. */ raise(SIGABRT); _exit(3); // Just in case, if signal ignored, exit anyway. } else { abort(); } __kmp_infinite_loop(); __kmp_release_bootstrap_lock(&__kmp_exit_lock); } // __kmp_abort_process void __kmp_abort_thread(void) { // TODO: Eliminate g_abort global variable and this function. // In case of abort just call abort(), it will kill all the threads. __kmp_infinite_loop(); } // __kmp_abort_thread /* Print out the storage map for the major kmp_info_t thread data structures that are allocated together. 
*/ static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid); __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t), "th_%d.th_info", gtid); __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t), "th_%d.th_local", gtid); __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], &thr->th.th_bar[bs_plain_barrier + 1], sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid); __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], &thr->th.th_bar[bs_forkjoin_barrier + 1], sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid); #if KMP_FAST_REDUCTION_BARRIER __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], &thr->th.th_bar[bs_reduction_barrier + 1], sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid); #endif // KMP_FAST_REDUCTION_BARRIER } /* Print out the storage map for the major kmp_team_t team data structures that are allocated together. */ static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, int team_id, int num_thr) { int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", header, team_id); __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier], sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id); __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier + 1], sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id); __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier + 1], sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id); #if KMP_FAST_REDUCTION_BARRIER __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier + 1], sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id); #endif // KMP_FAST_REDUCTION_BARRIER __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff], sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer", header, team_id); } static void __kmp_init_allocator() { __kmp_init_memkind(); } static void __kmp_fini_allocator() { __kmp_fini_memkind(); } /* ------------------------------------------------------------------------ */ #if KMP_DYNAMIC_LIB #if KMP_OS_WINDOWS static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) { // TODO: Change to __kmp_break_bootstrap_lock(). __kmp_init_bootstrap_lock(lck); // make the lock released } static void __kmp_reset_locks_on_process_detach(int gtid_req) { int i; int thread_count; // PROCESS_DETACH is expected to be called by a thread that executes // ProcessExit() or FreeLibrary(). 
OS terminates other threads (except the one // calling ProcessExit or FreeLibrary). So, it might be safe to access the // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some // threads can be still alive here, although being about to be terminated. The // threads in the array with ds_thread==0 are most suspicious. Actually, it // can be not safe to access the __kmp_threads[]. // TODO: does it make sense to check __kmp_roots[] ? // Let's check that there are no other alive threads registered with the OMP // lib. while (1) { thread_count = 0; for (i = 0; i < __kmp_threads_capacity; ++i) { if (!__kmp_threads) continue; kmp_info_t *th = __kmp_threads[i]; if (th == NULL) continue; int gtid = th->th.th_info.ds.ds_gtid; if (gtid == gtid_req) continue; if (gtid < 0) continue; DWORD exit_val; int alive = __kmp_is_thread_alive(th, &exit_val); if (alive) { ++thread_count; } } if (thread_count == 0) break; // success } // Assume that I'm alone. Now it might be safe to check and reset locks. // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset. __kmp_reset_lock(&__kmp_forkjoin_lock); #ifdef KMP_DEBUG __kmp_reset_lock(&__kmp_stdio_lock); #endif // KMP_DEBUG } BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); switch (fdwReason) { case DLL_PROCESS_ATTACH: KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); return TRUE; case DLL_PROCESS_DETACH: KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); if (lpReserved != NULL) { // lpReserved is used for telling the difference: // lpReserved == NULL when FreeLibrary() was called, // lpReserved != NULL when the process terminates. // When FreeLibrary() is called, worker threads remain alive. So they will // release the forkjoin lock by themselves. When the process terminates, // worker threads disappear triggering the problem of unreleased forkjoin // lock as described below. 
// A worker thread can take the forkjoin lock. The problem comes up if // that worker thread becomes dead before it releases the forkjoin lock. // The forkjoin lock remains taken, while the thread executing // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try // to take the forkjoin lock and will always fail, so that the application // will never finish [normally]. This scenario is possible if // __kmpc_end() has not been executed. It looks like it's not a corner // case, but common cases: // - the main function was compiled by an alternative compiler; // - the main function was compiled by icl but without /Qopenmp // (application with plugins); // - application terminates by calling C exit(), Fortran CALL EXIT() or // Fortran STOP. // - alive foreign thread prevented __kmpc_end from doing cleanup. // // This is a hack to work around the problem. // TODO: !!! figure out something better. __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific()); } __kmp_internal_end_library(__kmp_gtid_get_specific()); return TRUE; case DLL_THREAD_ATTACH: KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); /* if we want to register new siblings all the time here call * __kmp_get_gtid(); */ return TRUE; case DLL_THREAD_DETACH: KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); __kmp_internal_end_thread(__kmp_gtid_get_specific()); return TRUE; } return TRUE; } #endif /* KMP_OS_WINDOWS */ #endif /* KMP_DYNAMIC_LIB */ /* __kmp_parallel_deo -- Wait until it's our turn. 
*/ void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { int gtid = *gtid_ref; #ifdef BUILD_PARALLEL_ORDERED kmp_team_t *team = __kmp_team_from_gtid(gtid); #endif /* BUILD_PARALLEL_ORDERED */ if (__kmp_env_consistency_check) { if (__kmp_threads[gtid]->th.th_root->r.r_active) #if KMP_USE_DYNAMIC_LOCK __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); #else __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); #endif } #ifdef BUILD_PARALLEL_ORDERED if (!team->t.t_serialized) { KMP_MB(); KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, NULL); KMP_MB(); } #endif /* BUILD_PARALLEL_ORDERED */ } /* __kmp_parallel_dxo -- Signal the next task. */ void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { int gtid = *gtid_ref; #ifdef BUILD_PARALLEL_ORDERED int tid = __kmp_tid_from_gtid(gtid); kmp_team_t *team = __kmp_team_from_gtid(gtid); #endif /* BUILD_PARALLEL_ORDERED */ if (__kmp_env_consistency_check) { if (__kmp_threads[gtid]->th.th_root->r.r_active) __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); } #ifdef BUILD_PARALLEL_ORDERED if (!team->t.t_serialized) { KMP_MB(); /* Flush all pending memory write invalidates. */ /* use the tid of the next thread in this team */ /* TODO replace with general release procedure */ team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); KMP_MB(); /* Flush all pending memory write invalidates. 
*/ } #endif /* BUILD_PARALLEL_ORDERED */ } /* ------------------------------------------------------------------------ */ /* The BARRIER for a SINGLE process section is always explicit */ int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { int status; kmp_info_t *th; kmp_team_t *team; if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); __kmp_resume_if_soft_paused(); th = __kmp_threads[gtid]; team = th->th.th_team; status = 0; th->th.th_ident = id_ref; if (team->t.t_serialized) { status = 1; } else { kmp_int32 old_this = th->th.th_local.this_construct; ++th->th.th_local.this_construct; /* try to set team count to thread count--success means thread got the single block */ /* TODO: Should this be acquire or release? */ if (team->t.t_construct == old_this) { status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, th->th.th_local.this_construct); } #if USE_ITT_BUILD if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && team->t.t_active_level == 1) { // Only report metadata by master of active team at level 1 __kmp_itt_metadata_single(id_ref); } #endif /* USE_ITT_BUILD */ } if (__kmp_env_consistency_check) { if (status && push_ws) { __kmp_push_workshare(gtid, ct_psingle, id_ref); } else { __kmp_check_workshare(gtid, ct_psingle, id_ref); } } #if USE_ITT_BUILD if (status) { __kmp_itt_single_start(gtid); } #endif /* USE_ITT_BUILD */ return status; } void __kmp_exit_single(int gtid) { #if USE_ITT_BUILD __kmp_itt_single_end(gtid); #endif /* USE_ITT_BUILD */ if (__kmp_env_consistency_check) __kmp_pop_workshare(gtid, ct_psingle, NULL); } /* determine if we can go parallel or must use a serialized parallel region and * how many threads we can use * set_nproc is the number of threads requested for the team * returns 0 if we should serialize or only use one thread, * otherwise the number of threads to use * The forkjoin lock is held by the caller. 
*/ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, int master_tid, int set_nthreads, int enter_teams) { int capacity; int new_nthreads; KMP_DEBUG_ASSERT(__kmp_init_serial); KMP_DEBUG_ASSERT(root && parent_team); kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; // If dyn-var is set, dynamically adjust the number of desired threads, // according to the method specified by dynamic_mode. new_nthreads = set_nthreads; if (!get__dynamic_2(parent_team, master_tid)) { ; } #ifdef USE_LOAD_BALANCE else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); if (new_nthreads == 1) { KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " "reservation to 1 thread\n", master_tid)); return 1; } if (new_nthreads < set_nthreads) { KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " "reservation to %d threads\n", master_tid, new_nthreads)); } } #endif /* USE_LOAD_BALANCE */ else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); if (new_nthreads <= 1) { KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " "reservation to 1 thread\n", master_tid)); return 1; } if (new_nthreads < set_nthreads) { KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " "reservation to %d threads\n", master_tid, new_nthreads)); } else { new_nthreads = set_nthreads; } } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { if (set_nthreads > 2) { new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); new_nthreads = (new_nthreads % set_nthreads) + 1; if (new_nthreads == 1) { KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " "reservation to 1 thread\n", master_tid)); return 1; } if (new_nthreads < set_nthreads) { KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " "reservation to %d threads\n", master_tid, new_nthreads)); } } } else { KMP_ASSERT(0); } // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. if (__kmp_nth + new_nthreads - (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > __kmp_max_nth) { int tl_nthreads = __kmp_max_nth - __kmp_nth + (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); if (tl_nthreads <= 0) { tl_nthreads = 1; } // If dyn-var is false, emit a 1-time warning. 
if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { __kmp_reserve_warn = 1; __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); } if (tl_nthreads == 1) { KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " "reduced reservation to 1 thread\n", master_tid)); return 1; } KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " "reservation to %d threads\n", master_tid, tl_nthreads)); new_nthreads = tl_nthreads; } // Respect OMP_THREAD_LIMIT int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; if (cg_nthreads + new_nthreads - (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > max_cg_threads) { int tl_nthreads = max_cg_threads - cg_nthreads + (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); if (tl_nthreads <= 0) { tl_nthreads = 1; } // If dyn-var is false, emit a 1-time warning. if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { __kmp_reserve_warn = 1; __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); } if (tl_nthreads == 1) { KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " "reduced reservation to 1 thread\n", master_tid)); return 1; } KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " "reservation to %d threads\n", master_tid, tl_nthreads)); new_nthreads = tl_nthreads; } // Check if the threads array is large enough, or needs expanding. // See comment in __kmp_register_root() about the adjustment if // __kmp_threads[0] == NULL. capacity = __kmp_threads_capacity; if (TCR_PTR(__kmp_threads[0]) == NULL) { --capacity; } if (__kmp_nth + new_nthreads - (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > capacity) { // Expand the threads array. int slotsRequired = __kmp_nth + new_nthreads - (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc) - capacity; int slotsAdded = __kmp_expand_threads(slotsRequired); if (slotsAdded < slotsRequired) { // The threads array was not expanded enough. new_nthreads -= (slotsRequired - slotsAdded); KMP_ASSERT(new_nthreads >= 1); // If dyn-var is false, emit a 1-time warning. if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { __kmp_reserve_warn = 1; if (__kmp_tp_cached) { __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); } else { __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); } } } } #ifdef KMP_DEBUG if (new_nthreads == 1) { KC_TRACE(10, ("__kmp_reserve_threads: T#%d serializing team after reclaiming " "dead roots and rechecking; requested %d threads\n", __kmp_get_gtid(), set_nthreads)); } else { KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" " %d threads\n", __kmp_get_gtid(), new_nthreads, set_nthreads)); } #endif // KMP_DEBUG return new_nthreads; } /* Allocate threads from the thread pool and assign them to the new team. 
We are assured that there are enough threads available, because we checked on that earlier within critical section forkjoin */ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, kmp_info_t *master_th, int master_gtid) { int i; int use_hot_team; KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); KMP_MB(); /* first, let's setup the master thread */ master_th->th.th_info.ds.ds_tid = 0; master_th->th.th_team = team; master_th->th.th_team_nproc = team->t.t_nproc; master_th->th.th_team_master = master_th; master_th->th.th_team_serialized = FALSE; master_th->th.th_dispatch = &team->t.t_dispatch[0]; /* make sure we are not the optimized hot team */ #if KMP_NESTED_HOT_TEAMS use_hot_team = 0; kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; if (hot_teams) { // hot teams array is not allocated if // KMP_HOT_TEAMS_MAX_LEVEL=0 int level = team->t.t_active_level - 1; // index in array of hot teams if (master_th->th.th_teams_microtask) { // are we inside the teams? 
if (master_th->th.th_teams_size.nteams > 1) { ++level; // level was not increased in teams construct for // team_of_masters } if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && master_th->th.th_teams_level == team->t.t_level) { ++level; // level was not increased in teams construct for // team_of_workers before the parallel } // team->t.t_level will be increased inside parallel } if (level < __kmp_hot_teams_max_level) { if (hot_teams[level].hot_team) { // hot team has already been allocated for given level KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); use_hot_team = 1; // the team is ready to use } else { use_hot_team = 0; // AC: threads are not allocated yet hot_teams[level].hot_team = team; // remember new hot team hot_teams[level].hot_team_nth = team->t.t_nproc; } } else { use_hot_team = 0; } } #else use_hot_team = team == root->r.r_hot_team; #endif if (!use_hot_team) { /* install the master thread */ team->t.t_threads[0] = master_th; __kmp_initialize_info(master_th, team, 0, master_gtid); /* now, install the worker threads */ for (i = 1; i < team->t.t_nproc; i++) { /* fork or reallocate a new thread and install it in team */ kmp_info_t *thr = __kmp_allocate_thread(root, team, i); team->t.t_threads[i] = thr; KMP_DEBUG_ASSERT(thr); KMP_DEBUG_ASSERT(thr->th.th_team == team); /* align team and thread arrived states */ KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " "T#%d(%d:%d) join =%llu, plain=%llu\n", __kmp_gtid_from_tid(0, team), team->t.t_id, 0, __kmp_gtid_from_tid(i, team), team->t.t_id, i, team->t.t_bar[bs_forkjoin_barrier].b_arrived, team->t.t_bar[bs_plain_barrier].b_arrived)); thr->th.th_teams_microtask = master_th->th.th_teams_microtask; thr->th.th_teams_level = master_th->th.th_teams_level; thr->th.th_teams_size = master_th->th.th_teams_size; { // Initialize threads' barrier data. 
int b; kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; for (b = 0; b < bs_last_barrier; ++b) { balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); #if USE_DEBUGGER balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; #endif } } } #if KMP_AFFINITY_SUPPORTED __kmp_partition_places(team); #endif } if (__kmp_display_affinity && team->t.t_display_affinity != 1) { for (i = 0; i < team->t.t_nproc; i++) { kmp_info_t *thr = team->t.t_threads[i]; if (thr->th.th_prev_num_threads != team->t.t_nproc || thr->th.th_prev_level != team->t.t_level) { team->t.t_display_affinity = 1; break; } } } KMP_MB(); } #if KMP_ARCH_X86 || KMP_ARCH_X86_64 // Propagate any changes to the floating point control registers out to the team // We try to avoid unnecessary writes to the relevant cache line in the team // structure, so we don't make changes unless they are needed. inline static void propagateFPControl(kmp_team_t *team) { if (__kmp_inherit_fp_control) { kmp_int16 x87_fpu_control_word; kmp_uint32 mxcsr; // Get master values of FPU control flags (both X87 and vector) __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); __kmp_store_mxcsr(&mxcsr); mxcsr &= KMP_X86_MXCSR_MASK; // There is no point looking at t_fp_control_saved here. // If it is TRUE, we still have to update the values if they are different // from those we now have. If it is FALSE we didn't save anything yet, but // our objective is the same. We have to ensure that the values in the team // are the same as those we have. // So, this code achieves what we need whether or not t_fp_control_saved is // true. By checking whether the value needs updating we avoid unnecessary // writes that would put the cache-line into a written state, causing all // threads in the team to have to read it again. 
KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); // Although we don't use this value, other code in the runtime wants to know // whether it should restore them. So we must ensure it is correct. KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); } else { // Similarly here. Don't write to this cache-line in the team structure // unless we have to. KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); } } // Do the opposite, setting the hardware registers to the updated values from // the team. inline static void updateHWFPControl(kmp_team_t *team) { if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { // Only reset the fp control regs if they have been changed in the team. // the parallel region that we are exiting. kmp_int16 x87_fpu_control_word; kmp_uint32 mxcsr; __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); __kmp_store_mxcsr(&mxcsr); mxcsr &= KMP_X86_MXCSR_MASK; if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { __kmp_clear_x87_fpu_status_word(); __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); } if (team->t.t_mxcsr != mxcsr) { __kmp_load_mxcsr(&team->t.t_mxcsr); } } } #else #define propagateFPControl(x) ((void)0) #define updateHWFPControl(x) ((void)0) #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc); // forward declaration /* Run a parallel region that has been serialized, so runs only in a team of the single master thread. 
*/ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { kmp_info_t *this_thr; kmp_team_t *serial_team; KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); /* Skip all this code for autopar serialized loops since it results in unacceptable overhead */ if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) return; if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); __kmp_resume_if_soft_paused(); this_thr = __kmp_threads[global_tid]; serial_team = this_thr->th.th_serial_team; /* utilize the serialized team held by this thread */ KMP_DEBUG_ASSERT(serial_team); KMP_MB(); if (__kmp_tasking_mode != tskm_immediate_exec) { KMP_DEBUG_ASSERT( this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL); KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " "team %p, new task_team = NULL\n", global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); this_thr->th.th_task_team = NULL; } kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { proc_bind = proc_bind_false; } else if (proc_bind == proc_bind_default) { // No proc_bind clause was specified, so use the current value // of proc-bind-var for this parallel region. 
proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; } // Reset for next parallel region this_thr->th.th_set_proc_bind = proc_bind_default; #if OMPT_SUPPORT ompt_data_t ompt_parallel_data = ompt_data_none; ompt_data_t *implicit_task_data; void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); if (ompt_enabled.enabled && this_thr->th.ompt_thread_info.state != ompt_state_overhead) { ompt_task_info_t *parent_task_info; parent_task_info = OMPT_CUR_TASK_INFO(this_thr); parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); if (ompt_enabled.ompt_callback_parallel_begin) { int team_size = 1; ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( &(parent_task_info->task_data), &(parent_task_info->frame), - &ompt_parallel_data, team_size, ompt_parallel_invoker_program, - codeptr); + &ompt_parallel_data, team_size, + ompt_parallel_invoker_program | ompt_parallel_team, codeptr); } } #endif // OMPT_SUPPORT if (this_thr->th.th_team != serial_team) { // Nested level will be an index in the nested nthreads array int level = this_thr->th.th_team->t.t_level; if (serial_team->t.t_serialized) { /* this serial team was already used TODO increase performance by making this locks more specific */ kmp_team_t *new_team; __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1, #if OMPT_SUPPORT ompt_parallel_data, #endif proc_bind, &this_thr->th.th_current_task->td_icvs, 0 USE_NESTED_HOT_ARG(NULL)); __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); KMP_ASSERT(new_team); /* setup new serialized team and install it */ new_team->t.t_threads[0] = this_thr; new_team->t.t_parent = this_thr->th.th_team; serial_team = new_team; this_thr->th.th_serial_team = serial_team; KF_TRACE( 10, ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", global_tid, serial_team)); /* TODO the above breaks the requirement that if we run out of resources, then we can still guarantee that serialized teams are ok, since we 
may need to allocate a new one */ } else { KF_TRACE( 10, ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", global_tid, serial_team)); } /* we have to initialize this serial team */ KMP_DEBUG_ASSERT(serial_team->t.t_threads); KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); serial_team->t.t_ident = loc; serial_team->t.t_serialized = 1; serial_team->t.t_nproc = 1; serial_team->t.t_parent = this_thr->th.th_team; serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; this_thr->th.th_team = serial_team; serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, this_thr->th.th_current_task)); KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); this_thr->th.th_current_task->td_flags.executing = 0; __kmp_push_current_task_to_thread(this_thr, serial_team, 0); /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an implicit task for each serialized task represented by team->t.t_serialized? */ copy_icvs(&this_thr->th.th_current_task->td_icvs, &this_thr->th.th_current_task->td_parent->td_icvs); // Thread value exists in the nested nthreads array for the next nested // level if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[level + 1]; } if (__kmp_nested_proc_bind.used && (level + 1 < __kmp_nested_proc_bind.used)) { this_thr->th.th_current_task->td_icvs.proc_bind = __kmp_nested_proc_bind.bind_types[level + 1]; } #if USE_DEBUGGER serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
#endif this_thr->th.th_info.ds.ds_tid = 0; /* set thread cache values */ this_thr->th.th_team_nproc = 1; this_thr->th.th_team_master = this_thr; this_thr->th.th_team_serialized = 1; serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save propagateFPControl(serial_team); /* check if we need to allocate dispatch buffers stack */ KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); if (!serial_team->t.t_dispatch->th_disp_buffer) { serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)__kmp_allocate( sizeof(dispatch_private_info_t)); } this_thr->th.th_dispatch = serial_team->t.t_dispatch; KMP_MB(); } else { /* this serialized team is already being used, * that's fine, just add another nested level */ KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); KMP_DEBUG_ASSERT(serial_team->t.t_threads); KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); ++serial_team->t.t_serialized; this_thr->th.th_team_serialized = serial_team->t.t_serialized; // Nested level will be an index in the nested nthreads array int level = this_thr->th.th_team->t.t_level; // Thread value exists in the nested nthreads array for the next nested // level if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[level + 1]; } serial_team->t.t_level++; KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " "of serial team %p to %d\n", global_tid, serial_team, serial_team->t.t_level)); /* allocate/push dispatch buffers stack */ KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); { dispatch_private_info_t *disp_buffer = (dispatch_private_info_t *)__kmp_allocate( sizeof(dispatch_private_info_t)); disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; } this_thr->th.th_dispatch = 
serial_team->t.t_dispatch; KMP_MB(); } KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); // Perform the display affinity functionality for // serialized parallel regions if (__kmp_display_affinity) { if (this_thr->th.th_prev_level != serial_team->t.t_level || this_thr->th.th_prev_num_threads != 1) { // NULL means use the affinity-format-var ICV __kmp_aux_display_affinity(global_tid, NULL); this_thr->th.th_prev_level = serial_team->t.t_level; this_thr->th.th_prev_num_threads = 1; } } if (__kmp_env_consistency_check) __kmp_push_parallel(global_tid, NULL); #if OMPT_SUPPORT serial_team->t.ompt_team_info.master_return_address = codeptr; if (ompt_enabled.enabled && this_thr->th.ompt_thread_info.state != ompt_state_overhead) { OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); ompt_lw_taskteam_t lw_taskteam; __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, &ompt_parallel_data, codeptr); __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); // don't use lw_taskteam after linking. content was swaped /* OMPT implicit task begin */ implicit_task_data = OMPT_CUR_TASK_DATA(this_thr); if (ompt_enabled.ompt_callback_implicit_task) { ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? OMPT_CUR_TASK_INFO(this_thr) ->thread_num = __kmp_tid_from_gtid(global_tid); } /* OMPT state */ this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); } #endif } /* most of the work for a fork */ /* return true if we really went parallel, false if serialized */ int __kmp_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context, // Intel, GNU, ... 
kmp_int32 argc, microtask_t microtask, launch_t invoker, /* TODO: revert workaround for Intel(R) 64 tracker #96 */ #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX va_list *ap #else va_list ap #endif ) { void **argv; int i; int master_tid; int master_this_cons; kmp_team_t *team; kmp_team_t *parent_team; kmp_info_t *master_th; kmp_root_t *root; int nthreads; int master_active; int master_set_numthreads; int level; int active_level; int teams_level; #if KMP_NESTED_HOT_TEAMS kmp_hot_team_ptr_t **p_hot_teams; #endif { // KMP_TIME_BLOCK KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { /* Some systems prefer the stack for the root thread(s) to start with */ /* some gap from the parent stack to prevent false sharing. */ void *dummy = KMP_ALLOCA(__kmp_stkpadding); /* These 2 lines below are so this does not get optimized out */ if (__kmp_stkpadding > KMP_MAX_STKPADDING) __kmp_stkpadding += (short)((kmp_int64)dummy); } /* initialize if needed */ KMP_DEBUG_ASSERT( __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown if (!TCR_4(__kmp_init_parallel)) __kmp_parallel_initialize(); __kmp_resume_if_soft_paused(); /* setup current data */ master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with // shutdown parent_team = master_th->th.th_team; master_tid = master_th->th.th_info.ds.ds_tid; master_this_cons = master_th->th.th_local.this_construct; root = master_th->th.th_root; master_active = root->r.r_active; master_set_numthreads = master_th->th.th_set_nproc; #if OMPT_SUPPORT ompt_data_t ompt_parallel_data = ompt_data_none; ompt_data_t *parent_task_data; ompt_frame_t *ompt_frame; ompt_data_t *implicit_task_data; void *return_address = NULL; if (ompt_enabled.enabled) { __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, NULL, NULL); return_address 
= OMPT_LOAD_RETURN_ADDRESS(gtid); } #endif // Nested level will be an index in the nested nthreads array level = parent_team->t.t_level; // used to launch non-serial teams even if nested is not allowed active_level = parent_team->t.t_active_level; // needed to check nesting inside the teams teams_level = master_th->th.th_teams_level; #if KMP_NESTED_HOT_TEAMS p_hot_teams = &master_th->th.th_hot_teams; if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); (*p_hot_teams)[0].hot_team = root->r.r_hot_team; // it is either actual or not needed (when active_level > 0) (*p_hot_teams)[0].hot_team_nth = 1; } #endif #if OMPT_SUPPORT if (ompt_enabled.enabled) { if (ompt_enabled.ompt_callback_parallel_begin) { int team_size = master_set_numthreads ? master_set_numthreads : get__nproc_2(parent_team, master_tid); + int flags = OMPT_INVOKER(call_context) | + ((microtask == (microtask_t)__kmp_teams_master) + ? ompt_parallel_league + : ompt_parallel_team); ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( - parent_task_data, ompt_frame, &ompt_parallel_data, team_size, - OMPT_INVOKER(call_context), return_address); + parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, + return_address); } master_th->th.ompt_thread_info.state = ompt_state_overhead; } #endif master_th->th.th_ident = loc; if (master_th->th.th_teams_microtask && ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level) { // AC: This is start of parallel that is nested inside teams construct. // The team is actual (hot), all workers are ready at the fork barrier. // No lock needed to initialize the team a bit, then free workers. 
parent_team->t.t_ident = loc; __kmp_alloc_argv_entries(argc, parent_team, TRUE); parent_team->t.t_argc = argc; argv = (void **)parent_team->t.t_argv; for (i = argc - 1; i >= 0; --i) /* TODO: revert workaround for Intel(R) 64 tracker #96 */ #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX *argv++ = va_arg(*ap, void *); #else *argv++ = va_arg(ap, void *); #endif // Increment our nested depth levels, but not increase the serialization if (parent_team == master_th->th.th_serial_team) { // AC: we are in serialized parallel __kmpc_serialized_parallel(loc, gtid); KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); - // AC: need this in order enquiry functions work - // correctly, will restore at join time - parent_team->t.t_serialized--; + #if OMPT_SUPPORT void *dummy; - void **exit_runtime_p; + void **exit_frame_p; ompt_lw_taskteam_t lw_taskteam; if (ompt_enabled.enabled) { __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data, return_address); - exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); + exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); // don't use lw_taskteam after linking. content was swaped /* OMPT implicit task begin */ implicit_task_data = OMPT_CUR_TASK_DATA(master_th); if (ompt_enabled.ompt_callback_implicit_task) { - ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( - ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), - implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
OMPT_CUR_TASK_INFO(master_th) ->thread_num = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), + implicit_task_data, 1, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); } /* OMPT state */ master_th->th.ompt_thread_info.state = ompt_state_work_parallel; } else { - exit_runtime_p = &dummy; + exit_frame_p = &dummy; } #endif + // AC: need to decrement t_serialized for enquiry functions to work + // correctly, will restore at join time + parent_team->t.t_serialized--; { KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv #if OMPT_SUPPORT , - exit_runtime_p + exit_frame_p #endif ); } #if OMPT_SUPPORT - *exit_runtime_p = NULL; if (ompt_enabled.enabled) { + *exit_frame_p = NULL; OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; if (ompt_enabled.ompt_callback_implicit_task) { ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( ompt_scope_end, NULL, implicit_task_data, 1, - OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
+ OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); } + ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); __ompt_lw_taskteam_unlink(master_th); - if (ompt_enabled.ompt_callback_parallel_end) { ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( - OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th), - OMPT_INVOKER(call_context), return_address); + &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), + OMPT_INVOKER(call_context) | ompt_parallel_team, + return_address); } master_th->th.ompt_thread_info.state = ompt_state_overhead; } #endif return TRUE; } parent_team->t.t_pkfn = microtask; parent_team->t.t_invoke = invoker; KMP_ATOMIC_INC(&root->r.r_in_parallel); parent_team->t.t_active_level++; parent_team->t.t_level++; parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_lw_taskteam_t lw_taskteam; + __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, + &ompt_parallel_data, return_address); + __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); + } +#endif + /* Change number of threads in the team if requested */ if (master_set_numthreads) { // The parallel has num_threads clause if (master_set_numthreads < master_th->th.th_teams_size.nth) { // AC: only can reduce number of threads dynamically, can't increase kmp_info_t **other_threads = parent_team->t.t_threads; parent_team->t.t_nproc = master_set_numthreads; for (i = 0; i < master_set_numthreads; ++i) { other_threads[i]->th.th_team_nproc = master_set_numthreads; } // Keep extra threads hot in the team for possible next parallels } master_th->th.th_set_nproc = 0; } #if USE_DEBUGGER if (__kmp_debugging) { // Let debugger override number of threads. 
int nth = __kmp_omp_num_threads(loc); if (nth > 0) { // 0 means debugger doesn't want to change num threads master_set_numthreads = nth; } } #endif KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " "master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid)); __kmp_internal_fork(loc, gtid, parent_team); KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " "master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid)); /* Invoke microtask for MASTER thread */ KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, parent_team->t.t_id, parent_team->t.t_pkfn)); if (!parent_team->t.t_invoke(gtid)) { KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); } KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, parent_team->t.t_id, parent_team->t.t_pkfn)); KMP_MB(); /* Flush all pending memory write invalidates. */ KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); return TRUE; } // Parallel closely nested in teams construct #if KMP_DEBUG if (__kmp_tasking_mode != tskm_immediate_exec) { KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]); } #endif if (parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels) { nthreads = 1; } else { int enter_teams = ((ap == NULL && active_level == 0) || (ap && teams_level > 0 && teams_level == level)); nthreads = master_set_numthreads ? master_set_numthreads : get__nproc_2( parent_team, master_tid); // TODO: get nproc directly from current task // Check if we need to take forkjoin lock? (no need for serialized // parallel out of teams construct). This code moved here from // __kmp_reserve_threads() to speedup nested serialized parallels. 
if (nthreads > 1) { if ((get__max_active_levels(master_th) == 1 && (root->r.r_in_parallel && !enter_teams)) || (__kmp_library == library_serial)) { KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" " threads\n", gtid, nthreads)); nthreads = 1; } } if (nthreads > 1) { /* determine how many new threads we can use */ __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); /* AC: If we execute teams from parallel region (on host), then teams should be created but each can only have 1 thread if nesting is disabled. If teams called from serial region, then teams and their threads should be created regardless of the nesting setting. */ nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads, enter_teams); if (nthreads == 1) { // Free lock for single thread execution here; for multi-thread // execution it will be freed later after team of threads created // and initialized __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); } } } KMP_DEBUG_ASSERT(nthreads > 0); // If we temporarily changed the set number of threads then restore it now master_th->th.th_set_nproc = 0; /* create a serialized parallel region? */ if (nthreads == 1) { /* josh todo: hypothetical question: what do we do for OS X*? */ #if KMP_OS_LINUX && \ (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) void *args[argc]; #else void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ KMP_ARCH_AARCH64) */ KA_TRACE(20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); __kmpc_serialized_parallel(loc, gtid); if (call_context == fork_context_intel) { /* TODO this sucks, use the compiler itself to pass args! 
:) */ master_th->th.th_serial_team->t.t_ident = loc; if (!ap) { // revert change made in __kmpc_serialized_parallel() master_th->th.th_serial_team->t.t_level--; // Get args from parent team for teams construct #if OMPT_SUPPORT void *dummy; - void **exit_runtime_p; + void **exit_frame_p; ompt_task_info_t *task_info; ompt_lw_taskteam_t lw_taskteam; if (ompt_enabled.enabled) { __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data, return_address); __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); // don't use lw_taskteam after linking. content was swaped task_info = OMPT_CUR_TASK_INFO(master_th); - exit_runtime_p = &(task_info->frame.exit_frame.ptr); + exit_frame_p = &(task_info->frame.exit_frame.ptr); if (ompt_enabled.ompt_callback_implicit_task) { - ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( - ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), - &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
OMPT_CUR_TASK_INFO(master_th) ->thread_num = __kmp_tid_from_gtid(gtid); + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), + &(task_info->task_data), 1, + OMPT_CUR_TASK_INFO(master_th)->thread_num, + ompt_task_implicit); } /* OMPT state */ master_th->th.ompt_thread_info.state = ompt_state_work_parallel; } else { - exit_runtime_p = &dummy; + exit_frame_p = &dummy; } #endif { KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv #if OMPT_SUPPORT , - exit_runtime_p + exit_frame_p #endif ); } #if OMPT_SUPPORT if (ompt_enabled.enabled) { - exit_runtime_p = NULL; + *exit_frame_p = NULL; if (ompt_enabled.ompt_callback_implicit_task) { ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( ompt_scope_end, NULL, &(task_info->task_data), 1, - OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
+ OMPT_CUR_TASK_INFO(master_th)->thread_num, + ompt_task_implicit); } - + ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); __ompt_lw_taskteam_unlink(master_th); if (ompt_enabled.ompt_callback_parallel_end) { ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( - OMPT_CUR_TEAM_DATA(master_th), parent_task_data, - OMPT_INVOKER(call_context), return_address); + &ompt_parallel_data, parent_task_data, + OMPT_INVOKER(call_context) | ompt_parallel_team, + return_address); } master_th->th.ompt_thread_info.state = ompt_state_overhead; } #endif } else if (microtask == (microtask_t)__kmp_teams_master) { KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team); team = master_th->th.th_team; // team->t.t_pkfn = microtask; team->t.t_invoke = invoker; __kmp_alloc_argv_entries(argc, team, TRUE); team->t.t_argc = argc; argv = (void **)team->t.t_argv; if (ap) { for (i = argc - 1; i >= 0; --i) // TODO: revert workaround for Intel(R) 64 tracker #96 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX *argv++ = va_arg(*ap, void *); #else *argv++ = va_arg(ap, void *); #endif } else { for (i = 0; i < argc; ++i) // Get args from parent team for teams construct argv[i] = parent_team->t.t_argv[i]; } // AC: revert change made in __kmpc_serialized_parallel() // because initial code in teams should have level=0 team->t.t_level--; // AC: call special invoker for outer "parallel" of teams construct invoker(gtid); +#if OMPT_SUPPORT + if (ompt_enabled.enabled) { + ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, &(task_info->task_data), 0, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); + } + if (ompt_enabled.ompt_callback_parallel_end) { + ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( + &ompt_parallel_data, parent_task_data, + OMPT_INVOKER(call_context) | ompt_parallel_league, + 
return_address); + } + master_th->th.ompt_thread_info.state = ompt_state_overhead; + } +#endif } else { argv = args; for (i = argc - 1; i >= 0; --i) // TODO: revert workaround for Intel(R) 64 tracker #96 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX *argv++ = va_arg(*ap, void *); #else *argv++ = va_arg(ap, void *); #endif KMP_MB(); #if OMPT_SUPPORT void *dummy; - void **exit_runtime_p; + void **exit_frame_p; ompt_task_info_t *task_info; ompt_lw_taskteam_t lw_taskteam; if (ompt_enabled.enabled) { __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data, return_address); __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); // don't use lw_taskteam after linking. content was swaped task_info = OMPT_CUR_TASK_INFO(master_th); - exit_runtime_p = &(task_info->frame.exit_frame.ptr); + exit_frame_p = &(task_info->frame.exit_frame.ptr); /* OMPT implicit task begin */ implicit_task_data = OMPT_CUR_TASK_DATA(master_th); if (ompt_enabled.ompt_callback_implicit_task) { ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), - implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
+ implicit_task_data, 1, __kmp_tid_from_gtid(gtid), + ompt_task_implicit); OMPT_CUR_TASK_INFO(master_th) ->thread_num = __kmp_tid_from_gtid(gtid); } /* OMPT state */ master_th->th.ompt_thread_info.state = ompt_state_work_parallel; } else { - exit_runtime_p = &dummy; + exit_frame_p = &dummy; } #endif { KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); __kmp_invoke_microtask(microtask, gtid, 0, argc, args #if OMPT_SUPPORT , - exit_runtime_p + exit_frame_p #endif ); } #if OMPT_SUPPORT if (ompt_enabled.enabled) { - *exit_runtime_p = NULL; + *exit_frame_p = NULL; if (ompt_enabled.ompt_callback_implicit_task) { ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( ompt_scope_end, NULL, &(task_info->task_data), 1, - OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial? + OMPT_CUR_TASK_INFO(master_th)->thread_num, + ompt_task_implicit); } ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); __ompt_lw_taskteam_unlink(master_th); if (ompt_enabled.ompt_callback_parallel_end) { ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( &ompt_parallel_data, parent_task_data, - OMPT_INVOKER(call_context), return_address); + OMPT_INVOKER(call_context) | ompt_parallel_team, + return_address); } master_th->th.ompt_thread_info.state = ompt_state_overhead; } #endif } } else if (call_context == fork_context_gnu) { #if OMPT_SUPPORT ompt_lw_taskteam_t lwt; __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, return_address); lwt.ompt_task_info.frame.exit_frame = ompt_data_none; __ompt_lw_taskteam_link(&lwt, master_th, 1); // don't use lw_taskteam after linking. 
content was swaped #endif // we were called from GNU native code KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); return FALSE; } else { KMP_ASSERT2(call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter"); } KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); KMP_MB(); return FALSE; } // if (nthreads == 1) // GEH: only modify the executing flag in the case when not serialized // serialized case is handled in kmpc_serialized_parallel KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " "curtask=%p, curtask_max_aclevel=%d\n", parent_team->t.t_active_level, master_th, master_th->th.th_current_task, master_th->th.th_current_task->td_icvs.max_active_levels)); // TODO: GEH - cannot do this assertion because root thread not set up as // executing // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); master_th->th.th_current_task->td_flags.executing = 0; if (!master_th->th.th_teams_microtask || level > teams_level) { /* Increment our nested depth level */ KMP_ATOMIC_INC(&root->r.r_in_parallel); } // See if we need to make a copy of the ICVs. int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; if ((level + 1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { nthreads_icv = __kmp_nested_nth.nth[level + 1]; } else { nthreads_icv = 0; // don't update } // Figure out the proc_bind_policy for the new team. kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { proc_bind = proc_bind_false; } else { if (proc_bind == proc_bind_default) { // No proc_bind clause specified; use current proc-bind-var for this // parallel region proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; } /* else: The proc_bind policy was specified explicitly on parallel clause. 
This overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */ // Figure the value of proc-bind-var for the child threads. if ((level + 1 < __kmp_nested_proc_bind.used) && (__kmp_nested_proc_bind.bind_types[level + 1] != master_th->th.th_current_task->td_icvs.proc_bind)) { proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; } } // Reset for next parallel region master_th->th.th_set_proc_bind = proc_bind_default; if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { kmp_internal_control_t new_icvs; copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); new_icvs.next = NULL; if (nthreads_icv > 0) { new_icvs.nproc = nthreads_icv; } if (proc_bind_icv != proc_bind_default) { new_icvs.proc_bind = proc_bind_icv; } /* allocate a new parallel team */ KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); team = __kmp_allocate_team(root, nthreads, nthreads, #if OMPT_SUPPORT ompt_parallel_data, #endif proc_bind, &new_icvs, argc USE_NESTED_HOT_ARG(master_th)); } else { /* allocate a new parallel team */ KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); team = __kmp_allocate_team(root, nthreads, nthreads, #if OMPT_SUPPORT ompt_parallel_data, #endif proc_bind, &master_th->th.th_current_task->td_icvs, argc USE_NESTED_HOT_ARG(master_th)); } KF_TRACE( 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); /* setup the new team */ KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); KMP_CHECK_UPDATE(team->t.t_ident, loc); KMP_CHECK_UPDATE(team->t.t_parent, parent_team); KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); #if OMPT_SUPPORT KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, return_address); #endif KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe // TODO: parent_team->t.t_level == INT_MAX ??? 
if (!master_th->th.th_teams_microtask || level > teams_level) { int new_level = parent_team->t.t_level + 1; KMP_CHECK_UPDATE(team->t.t_level, new_level); new_level = parent_team->t.t_active_level + 1; KMP_CHECK_UPDATE(team->t.t_active_level, new_level); } else { // AC: Do not increase parallel level at start of the teams construct int new_level = parent_team->t.t_level; KMP_CHECK_UPDATE(team->t.t_level, new_level); new_level = parent_team->t.t_active_level; KMP_CHECK_UPDATE(team->t.t_active_level, new_level); } kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); // set master's schedule as new run-time schedule KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); // Update the floating point rounding in the team if required. propagateFPControl(team); if (__kmp_tasking_mode != tskm_immediate_exec) { // Set master's task team to team's task team. Unless this is hot team, it // should be NULL. 
KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]); KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " "%p, new task_team %p / team %p\n", __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, parent_team, team->t.t_task_team[master_th->th.th_task_state], team)); if (active_level || master_th->th.th_task_team) { // Take a memo of master's task_state KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; kmp_uint8 *old_stack, *new_stack; kmp_uint32 i; new_stack = (kmp_uint8 *)__kmp_allocate(new_size); for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { new_stack[i] = master_th->th.th_task_state_memo_stack[i]; } for (i = master_th->th.th_task_state_stack_sz; i < new_size; ++i) { // zero-init rest of stack new_stack[i] = 0; } old_stack = master_th->th.th_task_state_memo_stack; master_th->th.th_task_state_memo_stack = new_stack; master_th->th.th_task_state_stack_sz = new_size; __kmp_free(old_stack); } // Store master's task_state on stack master_th->th .th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state; master_th->th.th_task_state_top++; #if KMP_NESTED_HOT_TEAMS if (master_th->th.th_hot_teams && active_level < __kmp_hot_teams_max_level && team == master_th->th.th_hot_teams[active_level].hot_team) { // Restore master's nested state if nested hot team master_th->th.th_task_state = master_th->th .th_task_state_memo_stack[master_th->th.th_task_state_top]; } else { #endif master_th->th.th_task_state = 0; #if KMP_NESTED_HOT_TEAMS } #endif } #if !KMP_NESTED_HOT_TEAMS KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team)); #endif } KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", gtid, parent_team->t.t_id, 
team->t.t_master_tid, team->t.t_id, team->t.t_nproc)); KMP_DEBUG_ASSERT(team != root->r.r_hot_team || (team->t.t_master_tid == 0 && (team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized))); KMP_MB(); /* now, setup the arguments */ argv = (void **)team->t.t_argv; if (ap) { for (i = argc - 1; i >= 0; --i) { // TODO: revert workaround for Intel(R) 64 tracker #96 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX void *new_argv = va_arg(*ap, void *); #else void *new_argv = va_arg(ap, void *); #endif KMP_CHECK_UPDATE(*argv, new_argv); argv++; } } else { for (i = 0; i < argc; ++i) { // Get args from parent team for teams construct KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); } } /* now actually fork the threads */ KMP_CHECK_UPDATE(team->t.t_master_active, master_active); if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong root->r.r_active = TRUE; __kmp_fork_team_threads(root, team, master_th, gtid); __kmp_setup_icv_copy(team, nthreads, &master_th->th.th_current_task->td_icvs, loc); #if OMPT_SUPPORT master_th->th.ompt_thread_info.state = ompt_state_work_parallel; #endif __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); #if USE_ITT_BUILD if (team->t.t_active_level == 1 // only report frames at level 1 && !master_th->th.th_teams_microtask) { // not in teams construct #if USE_ITT_NOTIFY if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1)) { kmp_uint64 tmp_time = 0; if (__itt_get_timestamp_ptr) tmp_time = __itt_get_timestamp(); // Internal fork - report frame begin master_th->th.th_frame_time = tmp_time; if (__kmp_forkjoin_frames_mode == 3) team->t.t_region_time = tmp_time; } else // only one notification scheme (either "submit" or "forking/joined", not both) #endif /* USE_ITT_NOTIFY */ if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { // Mark start of "parallel" 
region for Intel(R) VTune(TM) analyzer. __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); } } #endif /* USE_ITT_BUILD */ /* now go on and do the work */ KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); KMP_MB(); KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", root, team, master_th, gtid)); #if USE_ITT_BUILD if (__itt_stack_caller_create_ptr) { team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id // before entering fork barrier } #endif /* USE_ITT_BUILD */ // AC: skip __kmp_internal_fork at teams construct, let only master // threads execute if (ap) { __kmp_internal_fork(loc, gtid, team); KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " "master_th=%p, gtid=%d\n", root, team, master_th, gtid)); } if (call_context == fork_context_gnu) { KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); return TRUE; } /* Invoke microtask for MASTER thread */ KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, team->t.t_id, team->t.t_pkfn)); } // END of timer KMP_fork_call block #if KMP_STATS_ENABLED // If beginning a teams construct, then change thread state stats_state_e previous_state = KMP_GET_THREAD_STATE(); if (!ap) { KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); } #endif if (!team->t.t_invoke(gtid)) { KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); } #if KMP_STATS_ENABLED // If was beginning of a teams construct, then reset thread state if (!ap) { KMP_SET_THREAD_STATE(previous_state); } #endif KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, team->t.t_id, team->t.t_pkfn)); KMP_MB(); /* Flush all pending memory write invalidates. 
*/ KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); #if OMPT_SUPPORT if (ompt_enabled.enabled) { master_th->th.ompt_thread_info.state = ompt_state_overhead; } #endif return TRUE; } #if OMPT_SUPPORT static inline void __kmp_join_restore_state(kmp_info_t *thread, kmp_team_t *team) { // restore state outside the region thread->th.ompt_thread_info.state = ((team->t.t_serialized) ? ompt_state_work_serial : ompt_state_work_parallel); } static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, kmp_team_t *team, ompt_data_t *parallel_data, - fork_context_e fork_context, void *codeptr) { + int flags, void *codeptr) { ompt_task_info_t *task_info = __ompt_get_task_info_object(0); if (ompt_enabled.ompt_callback_parallel_end) { ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( - parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context), - codeptr); + parallel_data, &(task_info->task_data), flags, codeptr); } task_info->frame.enter_frame = ompt_data_none; __kmp_join_restore_state(thread, team); } #endif void __kmp_join_call(ident_t *loc, int gtid #if OMPT_SUPPORT , enum fork_context_e fork_context #endif , int exit_teams) { KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); kmp_team_t *team; kmp_team_t *parent_team; kmp_info_t *master_th; kmp_root_t *root; int master_active; KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); /* setup current data */ master_th = __kmp_threads[gtid]; root = master_th->th.th_root; team = master_th->th.th_team; parent_team = team->t.t_parent; master_th->th.th_ident = loc; #if OMPT_SUPPORT + void *team_microtask = (void *)team->t.t_pkfn; if (ompt_enabled.enabled) { master_th->th.ompt_thread_info.state = ompt_state_overhead; } #endif #if KMP_DEBUG if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " "th_task_team = %p\n", __kmp_gtid_from_thread(master_th), team, team->t.t_task_team[master_th->th.th_task_state], 
master_th->th.th_task_team)); KMP_DEBUG_ASSERT(master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state]); } #endif if (team->t.t_serialized) { if (master_th->th.th_teams_microtask) { // We are in teams construct int level = team->t.t_level; int tlevel = master_th->th.th_teams_level; if (level == tlevel) { // AC: we haven't incremented it earlier at start of teams construct, // so do it here - at the end of teams construct team->t.t_level++; } else if (level == tlevel + 1) { // AC: we are exiting parallel inside teams, need to increment // serialization in order to restore it in the next call to // __kmpc_end_serialized_parallel team->t.t_serialized++; } } __kmpc_end_serialized_parallel(loc, gtid); #if OMPT_SUPPORT if (ompt_enabled.enabled) { __kmp_join_restore_state(master_th, parent_team); } #endif return; } master_active = team->t.t_master_active; if (!exit_teams) { // AC: No barrier for internal teams at exit from teams construct. // But there is barrier for external team (league). __kmp_internal_join(loc, gtid, team); } else { master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel) } KMP_MB(); #if OMPT_SUPPORT ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); void *codeptr = team->t.ompt_team_info.master_return_address; #endif #if USE_ITT_BUILD if (__itt_stack_caller_create_ptr) { __kmp_itt_stack_caller_destroy( (__itt_caller)team->t .t_stack_id); // destroy the stack stitching id after join barrier } // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
if (team->t.t_active_level == 1 && !master_th->th.th_teams_microtask) { /* not in teams construct */ master_th->th.th_ident = loc; // only one notification scheme (either "submit" or "forking/joined", not // both) if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode == 3) __kmp_itt_frame_submit(gtid, team->t.t_region_time, master_th->th.th_frame_time, 0, loc, master_th->th.th_team_nproc, 1); else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) __kmp_itt_region_joined(gtid); } // active_level == 1 #endif /* USE_ITT_BUILD */ if (master_th->th.th_teams_microtask && !exit_teams && team->t.t_pkfn != (microtask_t)__kmp_teams_master && team->t.t_level == master_th->th.th_teams_level + 1) { - // AC: We need to leave the team structure intact at the end of parallel - // inside the teams construct, so that at the next parallel same (hot) team - // works, only adjust nesting levels - +// AC: We need to leave the team structure intact at the end of parallel +// inside the teams construct, so that at the next parallel same (hot) team +// works, only adjust nesting levels +#if OMPT_SUPPORT + ompt_data_t ompt_parallel_data = ompt_data_none; + if (ompt_enabled.enabled) { + ompt_task_info_t *task_info = __ompt_get_task_info_object(0); + if (ompt_enabled.ompt_callback_implicit_task) { + int ompt_team_size = team->t.t_nproc; + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, + OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); + } + task_info->frame.exit_frame = ompt_data_none; + task_info->task_data = ompt_data_none; + ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); + __ompt_lw_taskteam_unlink(master_th); + } +#endif /* Decrement our nested depth level */ team->t.t_level--; team->t.t_active_level--; KMP_ATOMIC_DEC(&root->r.r_in_parallel); // Restore number of threads in the team if needed. 
This code relies on // the proper adjustment of th_teams_size.nth after the fork in // __kmp_teams_master on each teams master in the case that // __kmp_reserve_threads reduced it. if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { int old_num = master_th->th.th_team_nproc; int new_num = master_th->th.th_teams_size.nth; kmp_info_t **other_threads = team->t.t_threads; team->t.t_nproc = new_num; for (int i = 0; i < old_num; ++i) { other_threads[i]->th.th_team_nproc = new_num; } // Adjust states of non-used threads of the team for (int i = old_num; i < new_num; ++i) { // Re-initialize thread's barrier data. KMP_DEBUG_ASSERT(other_threads[i]); kmp_balign_t *balign = other_threads[i]->th.th_bar; for (int b = 0; b < bs_last_barrier; ++b) { balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); #if USE_DEBUGGER balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; #endif } if (__kmp_tasking_mode != tskm_immediate_exec) { // Synchronize thread's task state other_threads[i]->th.th_task_state = master_th->th.th_task_state; } } } #if OMPT_SUPPORT if (ompt_enabled.enabled) { - __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context, - codeptr); + __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, + OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); } #endif return; } /* do cleanup and restore the parent team */ master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; master_th->th.th_local.this_construct = team->t.t_master_this_cons; master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; /* jc: The following lock has instructions with REL and ACQ semantics, separating the parallel user code called in this parallel region from the serial user code called after this function returns. 
*/ __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); if (!master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level) { /* Decrement our nested depth level */ KMP_ATOMIC_DEC(&root->r.r_in_parallel); } KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); #if OMPT_SUPPORT if (ompt_enabled.enabled) { ompt_task_info_t *task_info = __ompt_get_task_info_object(0); if (ompt_enabled.ompt_callback_implicit_task) { - int ompt_team_size = team->t.t_nproc; + int flags = (team_microtask == (void *)__kmp_teams_master) + ? ompt_task_initial + : ompt_task_implicit; + int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, - OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial? + OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); } - task_info->frame.exit_frame = ompt_data_none; task_info->task_data = ompt_data_none; } #endif KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, master_th, team)); __kmp_pop_current_task_from_thread(master_th); #if KMP_AFFINITY_SUPPORTED // Restore master thread's partition. master_th->th.th_first_place = team->t.t_first_place; master_th->th.th_last_place = team->t.t_last_place; #endif // KMP_AFFINITY_SUPPORTED master_th->th.th_def_allocator = team->t.t_def_allocator; updateHWFPControl(team); if (root->r.r_active != master_active) root->r.r_active = master_active; __kmp_free_team(root, team USE_NESTED_HOT_ARG( master_th)); // this will free worker threads /* this race was fun to find. make sure the following is in the critical region otherwise assertions may fail occasionally since the old team may be reallocated and the hierarchy appears inconsistent. it is actually safe to run and won't cause any bugs, but will cause those assertion failures. 
it's only one deref&assign so might as well put this in the critical region */ master_th->th.th_team = parent_team; master_th->th.th_team_nproc = parent_team->t.t_nproc; master_th->th.th_team_master = parent_team->t.t_threads[0]; master_th->th.th_team_serialized = parent_team->t.t_serialized; /* restore serialized team, if need be */ if (parent_team->t.t_serialized && parent_team != master_th->th.th_serial_team && parent_team != root->r.r_root_team) { __kmp_free_team(root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); master_th->th.th_serial_team = parent_team; } if (__kmp_tasking_mode != tskm_immediate_exec) { if (master_th->th.th_task_state_top > 0) { // Restore task state from memo stack KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); // Remember master's state if we re-use this nested hot team master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state; --master_th->th.th_task_state_top; // pop // Now restore state at this level master_th->th.th_task_state = master_th->th .th_task_state_memo_stack[master_th->th.th_task_state_top]; } // Copy the task team from the parent team to the master thread master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state]; KA_TRACE(20, ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, parent_team)); } // TODO: GEH - cannot do this assertion because root thread not set up as // executing // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); master_th->th.th_current_task->td_flags.executing = 1; __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); #if OMPT_SUPPORT + int flags = + OMPT_INVOKER(fork_context) | + ((team_microtask == (void *)__kmp_teams_master) ? 
ompt_parallel_league + : ompt_parallel_team); if (ompt_enabled.enabled) { - __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context, + __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, codeptr); } #endif KMP_MB(); KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); } /* Check whether we should push an internal control record onto the serial team stack. If so, do it. */ void __kmp_save_internal_controls(kmp_info_t *thread) { if (thread->th.th_team != thread->th.th_serial_team) { return; } if (thread->th.th_team->t.t_serialized > 1) { int push = 0; if (thread->th.th_team->t.t_control_stack_top == NULL) { push = 1; } else { if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != thread->th.th_team->t.t_serialized) { push = 1; } } if (push) { /* push a record on the serial team's stack */ kmp_internal_control_t *control = (kmp_internal_control_t *)__kmp_allocate( sizeof(kmp_internal_control_t)); copy_icvs(control, &thread->th.th_current_task->td_icvs); control->serial_nesting_level = thread->th.th_team->t.t_serialized; control->next = thread->th.th_team->t.t_control_stack_top; thread->th.th_team->t.t_control_stack_top = control; } } } /* Changes set_nproc */ void __kmp_set_num_threads(int new_nth, int gtid) { kmp_info_t *thread; kmp_root_t *root; KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); KMP_DEBUG_ASSERT(__kmp_init_serial); if (new_nth < 1) new_nth = 1; else if (new_nth > __kmp_max_nth) new_nth = __kmp_max_nth; KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); thread = __kmp_threads[gtid]; if (thread->th.th_current_task->td_icvs.nproc == new_nth) return; // nothing to do __kmp_save_internal_controls(thread); set__nproc(thread, new_nth); // If this omp_set_num_threads() call will cause the hot team size to be // reduced (in the absence of a num_threads clause), then reduce it now, // rather than waiting for the next parallel region. 
root = thread->th.th_root; if (__kmp_init_parallel && (!root->r.r_active) && (root->r.r_hot_team->t.t_nproc > new_nth) #if KMP_NESTED_HOT_TEAMS && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode #endif ) { kmp_team_t *hot_team = root->r.r_hot_team; int f; __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); // Release the extra threads we don't need any more. for (f = new_nth; f < hot_team->t.t_nproc; f++) { KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); if (__kmp_tasking_mode != tskm_immediate_exec) { // When decreasing team size, threads no longer in the team should unref // task team. hot_team->t.t_threads[f]->th.th_task_team = NULL; } __kmp_free_thread(hot_team->t.t_threads[f]); hot_team->t.t_threads[f] = NULL; } hot_team->t.t_nproc = new_nth; #if KMP_NESTED_HOT_TEAMS if (thread->th.th_hot_teams) { KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); thread->th.th_hot_teams[0].hot_team_nth = new_nth; } #endif __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); // Update the t_nproc field in the threads that are still active. for (f = 0; f < new_nth; f++) { KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; } // Special flag in case omp_set_num_threads() call hot_team->t.t_size_changed = -1; } } /* Changes max_active_levels */ void __kmp_set_max_active_levels(int gtid, int max_active_levels) { kmp_info_t *thread; KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " "%d = (%d)\n", gtid, max_active_levels)); KMP_DEBUG_ASSERT(__kmp_init_serial); // validate max_active_levels if (max_active_levels < 0) { KMP_WARNING(ActiveLevelsNegative, max_active_levels); // We ignore this call if the user has specified a negative value. // The current setting won't be changed. The last valid setting will be // used. A warning will be issued (if warnings are allowed as controlled by // the KMP_WARNINGS env var). 
KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " "max_active_levels for thread %d = (%d)\n", gtid, max_active_levels)); return; } if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { // it's OK, the max_active_levels is within the valid range: [ 0; // KMP_MAX_ACTIVE_LEVELS_LIMIT ] // We allow a zero value. (implementation defined behavior) } else { KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT); max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; // Current upper limit is MAX_INT. (implementation defined behavior) // If the input exceeds the upper limit, we correct the input to be the // upper limit. (implementation defined behavior) // Actually, the flow should never get here until we use MAX_INT limit. } KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " "max_active_levels for thread %d = (%d)\n", gtid, max_active_levels)); thread = __kmp_threads[gtid]; __kmp_save_internal_controls(thread); set__max_active_levels(thread, max_active_levels); } /* Gets max_active_levels */ int __kmp_get_max_active_levels(int gtid) { kmp_info_t *thread; KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); KMP_DEBUG_ASSERT(__kmp_init_serial); thread = __kmp_threads[gtid]; KMP_DEBUG_ASSERT(thread->th.th_current_task); KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " "curtask_maxaclevel=%d\n", gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels)); return thread->th.th_current_task->td_icvs.max_active_levels; } KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { kmp_info_t *thread; kmp_sched_t orig_kind; // kmp_team_t *team; KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk)); 
KMP_DEBUG_ASSERT(__kmp_init_serial); // Check if the kind parameter is valid, correct if needed. // Valid parameters should fit in one of two intervals - standard or extended: // , , , , , // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 orig_kind = kind; kind = __kmp_sched_without_mods(kind); if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { // TODO: Hint needs attention in case we change the default schedule. __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), __kmp_msg_null); kind = kmp_sched_default; chunk = 0; // ignore chunk value in case of bad kind } thread = __kmp_threads[gtid]; __kmp_save_internal_controls(thread); if (kind < kmp_sched_upper_std) { if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { // differ static chunked vs. unchunked: chunk should be invalid to // indicate unchunked schedule (which is the default) thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; } else { thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[kind - kmp_sched_lower - 1]; } } else { // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - // kmp_sched_lower - 2 ]; thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2]; } __kmp_sched_apply_mods_intkind( orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); if (kind == kmp_sched_auto || chunk < 1) { // ignore parameter chunk for schedule auto thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; } else { thread->th.th_current_task->td_icvs.sched.chunk = chunk; } } /* Gets def_sched_var ICV values */ void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { kmp_info_t *thread; enum sched_type th_type; KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); KMP_DEBUG_ASSERT(__kmp_init_serial); thread 
= __kmp_threads[gtid]; th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { case kmp_sch_static: case kmp_sch_static_greedy: case kmp_sch_static_balanced: *kind = kmp_sched_static; __kmp_sched_apply_mods_stdkind(kind, th_type); *chunk = 0; // chunk was not set, try to show this fact via zero value return; case kmp_sch_static_chunked: *kind = kmp_sched_static; break; case kmp_sch_dynamic_chunked: *kind = kmp_sched_dynamic; break; case kmp_sch_guided_chunked: case kmp_sch_guided_iterative_chunked: case kmp_sch_guided_analytical_chunked: *kind = kmp_sched_guided; break; case kmp_sch_auto: *kind = kmp_sched_auto; break; case kmp_sch_trapezoidal: *kind = kmp_sched_trapezoidal; break; #if KMP_STATIC_STEAL_ENABLED case kmp_sch_static_steal: *kind = kmp_sched_static_steal; break; #endif default: KMP_FATAL(UnknownSchedulingType, th_type); } __kmp_sched_apply_mods_stdkind(kind, th_type); *chunk = thread->th.th_current_task->td_icvs.sched.chunk; } int __kmp_get_ancestor_thread_num(int gtid, int level) { int ii, dd; kmp_team_t *team; kmp_info_t *thr; KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); KMP_DEBUG_ASSERT(__kmp_init_serial); // validate level if (level == 0) return 0; if (level < 0) return -1; thr = __kmp_threads[gtid]; team = thr->th.th_team; ii = team->t.t_level; if (level > ii) return -1; if (thr->th.th_teams_microtask) { // AC: we are in teams region where multiple nested teams have same level int tlevel = thr->th.th_teams_level; // the level of the teams construct if (level <= tlevel) { // otherwise usual algorithm works (will not touch the teams) KMP_DEBUG_ASSERT(ii >= tlevel); // AC: As we need to pass by the teams league, we need to artificially // increase ii if (ii == tlevel) { ii += 2; // three teams have same level } else { ii++; // two teams have same level } } } if (ii == level) return __kmp_tid_from_gtid(gtid); dd = team->t.t_serialized; level++; while (ii > level) 
{ for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { } if ((team->t.t_serialized) && (!dd)) { team = team->t.t_parent; continue; } if (ii > level) { team = team->t.t_parent; dd = team->t.t_serialized; ii--; } } return (dd > 1) ? (0) : (team->t.t_master_tid); } int __kmp_get_team_size(int gtid, int level) { int ii, dd; kmp_team_t *team; kmp_info_t *thr; KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); KMP_DEBUG_ASSERT(__kmp_init_serial); // validate level if (level == 0) return 1; if (level < 0) return -1; thr = __kmp_threads[gtid]; team = thr->th.th_team; ii = team->t.t_level; if (level > ii) return -1; if (thr->th.th_teams_microtask) { // AC: we are in teams region where multiple nested teams have same level int tlevel = thr->th.th_teams_level; // the level of the teams construct if (level <= tlevel) { // otherwise usual algorithm works (will not touch the teams) KMP_DEBUG_ASSERT(ii >= tlevel); // AC: As we need to pass by the teams league, we need to artificially // increase ii if (ii == tlevel) { ii += 2; // three teams have same level } else { ii++; // two teams have same level } } } while (ii > level) { for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { } if (team->t.t_serialized && (!dd)) { team = team->t.t_parent; continue; } if (ii > level) { team = team->t.t_parent; ii--; } } return team->t.t_nproc; } kmp_r_sched_t __kmp_get_schedule_global() { // This routine created because pairs (__kmp_sched, __kmp_chunk) and // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults // independently. So one can get the updated schedule here. kmp_r_sched_t r_sched; // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, // __kmp_guided. 
__kmp_sched should keep original value, so that user can set // KMP_SCHEDULE multiple times, and thus have different run-time schedules in // different roots (even in OMP 2.5) enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); if (s == kmp_sch_static) { // replace STATIC with more detailed schedule (balanced or greedy) r_sched.r_sched_type = __kmp_static; } else if (s == kmp_sch_guided_chunked) { // replace GUIDED with more detailed schedule (iterative or analytical) r_sched.r_sched_type = __kmp_guided; } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other r_sched.r_sched_type = __kmp_sched; } SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); if (__kmp_chunk < KMP_DEFAULT_CHUNK) { // __kmp_chunk may be wrong here (if it was not ever set) r_sched.chunk = KMP_DEFAULT_CHUNK; } else { r_sched.chunk = __kmp_chunk; } return r_sched; } /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) at least argc number of *t_argv entries for the requested team. */ static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { KMP_DEBUG_ASSERT(team); if (!realloc || argc > team->t.t_max_argc) { KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " "current entries=%d\n", team->t.t_id, argc, (realloc) ? 
team->t.t_max_argc : 0));

    /* if previously allocated heap space for args, free them */
    if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
      __kmp_free((void *)team->t.t_argv);

    if (argc <= KMP_INLINE_ARGV_ENTRIES) {
      /* use unused space in the cache line for arguments */
      team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
                     "argv entries\n",
                     team->t.t_id, team->t.t_max_argc));
      team->t.t_argv = &team->t.t_inline_argv[0];
      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            -1, &team->t.t_inline_argv[0],
            &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
            (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
            team->t.t_id);
      }
    } else {
      /* allocate space for arguments in the heap */
      // Capacity policy: at least KMP_MIN_MALLOC_ARGV_ENTRIES, else double
      // argc to amortize future growth.
      team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
                               ? KMP_MIN_MALLOC_ARGV_ENTRIES
                               : 2 * argc;
      KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
                     "argv entries\n",
                     team->t.t_id, team->t.t_max_argc));
      team->t.t_argv =
          (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
                                     &team->t.t_argv[team->t.t_max_argc],
                                     sizeof(void *) * team->t.t_max_argc,
                                     "team_%d.t_argv", team->t.t_id);
      }
    }
  }
}

// Allocates the per-team arrays (thread pointers, dispatch buffers, implicit
// task data) sized for max_nth threads and initializes dispatch buffer
// indices. A single-thread team gets only 2 dispatch buffers.
static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
  int i;
  int num_disp_buff = max_nth > 1 ?
__kmp_dispatch_num_buffers : 2;
  team->t.t_threads =
      (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
      sizeof(dispatch_shared_info_t) * num_disp_buff);
  team->t.t_dispatch =
      (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
  team->t.t_implicit_task_taskdata =
      (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
  team->t.t_max_nproc = max_nth;

  /* setup dispatch buffers */
  for (i = 0; i < num_disp_buff; ++i) {
    team->t.t_disp_buffer[i].buffer_index = i;
    team->t.t_disp_buffer[i].doacross_buf_idx = i;
  }
}

// Frees the per-team arrays allocated by __kmp_allocate_team_arrays, plus any
// per-thread dispatch private buffers hanging off t_dispatch.
static void __kmp_free_team_arrays(kmp_team_t *team) {
  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
  int i;
  for (i = 0; i < team->t.t_max_nproc; ++i) {
    if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
      __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
      team->t.t_dispatch[i].th_disp_buffer = NULL;
    }
  }
#if KMP_USE_HIER_SCHED
  __kmp_dispatch_free_hierarchies(team);
#endif
  __kmp_free(team->t.t_threads);
  __kmp_free(team->t.t_disp_buffer);
  __kmp_free(team->t.t_dispatch);
  __kmp_free(team->t.t_implicit_task_taskdata);
  team->t.t_threads = NULL;
  team->t.t_disp_buffer = NULL;
  team->t.t_dispatch = NULL;
  team->t.t_implicit_task_taskdata = 0;
}

// Grows the per-team arrays to hold max_nth threads, preserving the existing
// t_threads entries (the other arrays are reallocated without copying).
static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
  kmp_info_t **oldThreads = team->t.t_threads;

  __kmp_free(team->t.t_disp_buffer);
  __kmp_free(team->t.t_dispatch);
  __kmp_free(team->t.t_implicit_task_taskdata);
  __kmp_allocate_team_arrays(team, max_nth);

  // Only the thread pointers survive the reallocation.
  KMP_MEMCPY(team->t.t_threads, oldThreads,
             team->t.t_nproc * sizeof(kmp_info_t *));

  __kmp_free(oldThreads);
}

// Snapshot of the global internal control variables (ICVs), used to seed new
// root/serial teams. NOTE: the initializer below is positional — the field
// order must match kmp_internal_control_t exactly.
static kmp_internal_control_t __kmp_get_global_icvs(void) {

  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals

  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);

  kmp_internal_control_t g_icvs = {
      0, // int serial_nesting_level; //corresponds to value of th_team_serialized
      (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
      // adjustment of threads (per thread)
      (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
      // whether blocktime is explicitly set
      __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
#if KMP_USE_MONITOR
      __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
// intervals
#endif
      __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
      // next parallel region (per thread)
      // (use a max ub on value if __kmp_parallel_initialize not called yet)
      __kmp_cg_max_nth, // int thread_limit;
      __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
      // for max_active_levels
      r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
      // {sched,chunk} pair
      __kmp_nested_proc_bind.bind_types[0],
      __kmp_default_device,
      NULL // struct kmp_internal_control *next;
  };

  return g_icvs;
}

// ICV snapshot taken from an existing team's master thread (rather than from
// the globals); used when spawning from within that team.
static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {

  kmp_internal_control_t gx_icvs;
  gx_icvs.serial_nesting_level =
      0; // probably =team->t.t_serial like in save_inter_controls
  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
  gx_icvs.next = NULL;

  return gx_icvs;
}

// One-time setup of a root structure: its lock/state fields, its root team
// (always serialized, one thread) and its hot team (reused across parallel
// regions, pre-sized for __kmp_dflt_team_nth_ub * 2 threads).
static void __kmp_initialize_root(kmp_root_t *root) {
  int f;
  kmp_team_t *root_team;
  kmp_team_t *hot_team;
  int hot_team_max_nth;
  kmp_r_sched_t r_sched =
      __kmp_get_schedule_global(); // get current state of scheduling globals
  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
  KMP_DEBUG_ASSERT(root);
  KMP_ASSERT(!root->r.r_begin);

  /* setup the root state structure */
  __kmp_init_lock(&root->r.r_begin_lock);
  root->r.r_begin = FALSE;
  root->r.r_active = FALSE;
  root->r.r_in_parallel = 0;
  root->r.r_blocktime = __kmp_dflt_blocktime;

  /* setup the root team for this task */
  /* allocate the root team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));

  root_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          1, //
max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
      );
#if USE_DEBUGGER
  // Non-NULL value should be assigned to make the debugger display the root
  // team.
  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
#endif

  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));

  root->r.r_root_team = root_team;
  root_team->t.t_control_stack_top = NULL;

  /* initialize root team */
  root_team->t.t_threads[0] = NULL;
  root_team->t.t_nproc = 1;
  root_team->t.t_serialized = 1;
  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  root_team->t.t_sched.sched = r_sched.sched;
  KA_TRACE(
      20,
      ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
       root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  /* setup the hot team for this task */
  /* allocate the hot team structure */
  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));

  hot_team =
      __kmp_allocate_team(root,
                          1, // new_nproc
                          __kmp_dflt_team_nth_ub * 2, // max_nproc
#if OMPT_SUPPORT
                          ompt_data_none, // root parallel id
#endif
                          __kmp_nested_proc_bind.bind_types[0], &r_icvs,
                          0 // argc
                          USE_NESTED_HOT_ARG(NULL) // master thread is unknown
      );
  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));

  root->r.r_hot_team = hot_team;
  root_team->t.t_control_stack_top = NULL;

  /* first-time initialization */
  hot_team->t.t_parent = root_team;

  /* initialize hot team */
  hot_team_max_nth = hot_team->t.t_max_nproc;
  for (f = 0; f < hot_team_max_nth; ++f) {
    hot_team->t.t_threads[f] = NULL;
  }
  hot_team->t.t_nproc = 1;
  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
  hot_team->t.t_sched.sched = r_sched.sched;
  hot_team->t.t_size_changed = 0;
}

#ifdef KMP_DEBUG

// Singly-linked, id-sorted list node used by __kmp_print_structure to collect
// the set of live teams (debug-only).
typedef struct kmp_team_list_item {
  kmp_team_p const *entry;
  struct kmp_team_list_item *next;
} kmp_team_list_item_t;
typedef kmp_team_list_item_t
*kmp_team_list_t;

static void __kmp_print_structure_team_accum( // Add team to list of teams.
    kmp_team_list_t list, // List of teams.
    kmp_team_p const *team // Team to add.
) {

  // List must terminate with item where both entry and next are NULL.
  // Team is added to the list only once.
  // List is sorted in ascending order by team id.
  // Team id is *not* a key.

  kmp_team_list_t l;

  KMP_DEBUG_ASSERT(list != NULL);
  if (team == NULL) {
    return;
  }

  // Recursively accumulate ancestors and pool successors first.
  __kmp_print_structure_team_accum(list, team->t.t_parent);
  __kmp_print_structure_team_accum(list, team->t.t_next_pool);

  // Search list for the team.
  l = list;
  while (l->next != NULL && l->entry != team) {
    l = l->next;
  }
  if (l->next != NULL) {
    return; // Team has been added before, exit.
  }

  // Team is not found. Search list again for insertion point.
  l = list;
  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
    l = l->next;
  }

  // Insert team.
  {
    kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
        sizeof(kmp_team_list_item_t));
    // Shift the current node's contents into the new node and reuse the
    // current node for the inserted team (insertion before a sentinel-
    // terminated node without back links).
    *item = *l;
    l->entry = team;
    l->next = item;
  }
}

// Prints "<title><team id> <team ptr>" or a nil marker (debug helper).
static void __kmp_print_structure_team(char const *title, kmp_team_p const *team

) {
  __kmp_printf("%s", title);
  if (team != NULL) {
    __kmp_printf("%2x %p\n", team->t.t_id, team);
  } else {
    __kmp_printf(" - (nil)\n");
  }
}

// Prints "<title><gtid> <thread ptr>" or a nil marker (debug helper).
static void __kmp_print_structure_thread(char const *title,
                                         kmp_info_p const *thread) {
  __kmp_printf("%s", title);
  if (thread != NULL) {
    __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
  } else {
    __kmp_printf(" - (nil)\n");
  }
}

// Debug-only dump of the whole runtime structure: thread table, threads,
// roots, teams and the thread/team pools.
void __kmp_print_structure(void) {

  kmp_team_list_t list;

  // Initialize list of teams.
  // Sentinel node: entry == NULL && next == NULL terminates the list.
  list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
      sizeof(kmp_team_list_item_t));
  list->entry = NULL;
  list->next = NULL;

  __kmp_printf("\n------------------------------\nGlobal Thread "
               "Table\n------------------------------\n");
  {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      __kmp_printf("%2d", gtid);
      if (__kmp_threads != NULL) {
        __kmp_printf(" %p", __kmp_threads[gtid]);
      }
      if (__kmp_root != NULL) {
        __kmp_printf(" %p", __kmp_root[gtid]);
      }
      __kmp_printf("\n");
    }
  }

  // Print out __kmp_threads array.
  __kmp_printf("\n------------------------------\nThreads\n--------------------"
               "----------\n");
  if (__kmp_threads != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t const *thread = __kmp_threads[gtid];
      if (thread != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, thread);
        __kmp_printf(" Our Root: %p\n", thread->th.th_root);
        __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
        __kmp_print_structure_team(" Serial Team: ",
                                   thread->th.th_serial_team);
        __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
        __kmp_print_structure_thread(" Master: ",
                                     thread->th.th_team_master);
        __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
        __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
        __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
        __kmp_print_structure_thread(" Next in pool: ",
                                     thread->th.th_next_pool);
        __kmp_printf("\n");
        // Also collect this thread's teams for the "Teams" section below.
        __kmp_print_structure_team_accum(list, thread->th.th_team);
        __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
      }
    }
  } else {
    __kmp_printf("Threads array is not allocated.\n");
  }

  // Print out __kmp_root array.
  __kmp_printf("\n------------------------------\nUbers\n----------------------"
               "--------\n");
  if (__kmp_root != NULL) {
    int gtid;
    for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_root_t const *root = __kmp_root[gtid];
      if (root != NULL) {
        __kmp_printf("GTID %2d %p:\n", gtid, root);
        __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
        __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
        __kmp_print_structure_thread(" Uber Thread: ",
                                     root->r.r_uber_thread);
        __kmp_printf(" Active?: %2d\n", root->r.r_active);
        __kmp_printf(" In Parallel: %2d\n",
                     KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
        __kmp_printf("\n");
        __kmp_print_structure_team_accum(list, root->r.r_root_team);
        __kmp_print_structure_team_accum(list, root->r.r_hot_team);
      }
    }
  } else {
    __kmp_printf("Ubers array is not allocated.\n");
  }

  __kmp_printf("\n------------------------------\nTeams\n----------------------"
               "--------\n");
  // Walk the accumulated, id-sorted team list (sentinel has next == NULL).
  while (list->next != NULL) {
    kmp_team_p const *team = list->entry;
    int i;
    __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
    __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
    __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
    __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
    __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
    __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
    for (i = 0; i < team->t.t_nproc; ++i) {
      __kmp_printf(" Thread %2d: ", i);
      __kmp_print_structure_thread("", team->t.t_threads[i]);
    }
    __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
    __kmp_printf("\n");
    list = list->next;
  }

  // Print out __kmp_thread_pool and __kmp_team_pool.
  __kmp_printf("\n------------------------------\nPools\n----------------------"
               "--------\n");
  __kmp_print_structure_thread("Thread pool: ",
                               CCAST(kmp_info_t *, __kmp_thread_pool));
  __kmp_print_structure_team("Team pool: ",
                             CCAST(kmp_team_t *, __kmp_team_pool));
  __kmp_printf("\n");

  // Free team list.
  // Release every node of the accumulated team list, including the sentinel.
  while (list != NULL) {
    kmp_team_list_item_t *item = list;
    list = list->next;
    KMP_INTERNAL_FREE(item);
  }
}

#endif

//---------------------------------------------------------------------------
//  Stuff for per-thread fast random number generator
//  Table of primes
// Each thread picks one of these as its LCG multiplier (see
// __kmp_init_random below), so different threads get different streams.
static const unsigned __kmp_primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
    0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
    0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
    0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
    0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
    0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
    0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
    0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
    0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};

//---------------------------------------------------------------------------
// __kmp_get_random: Get a random number using a linear congruential method.
unsigned short __kmp_get_random(kmp_info_t *thread) { unsigned x = thread->th.th_x; unsigned short r = x >> 16; thread->th.th_x = x * thread->th.th_a + 1; KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", thread->th.th_info.ds.ds_tid, r)); return r; } //-------------------------------------------------------- // __kmp_init_random: Initialize a random number generator void __kmp_init_random(kmp_info_t *thread) { unsigned seed = thread->th.th_info.ds.ds_tid; thread->th.th_a = __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; thread->th.th_x = (seed + 1) * thread->th.th_a + 1; KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); } #if KMP_OS_WINDOWS /* reclaim array entries for root threads that are already dead, returns number * reclaimed */ static int __kmp_reclaim_dead_roots(void) { int i, r = 0; for (i = 0; i < __kmp_threads_capacity; ++i) { if (KMP_UBER_GTID(i) && !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && !__kmp_root[i] ->r.r_active) { // AC: reclaim only roots died in non-active state r += __kmp_unregister_root_other_thread(i); } } return r; } #endif /* This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of free entries generated. For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are already dead. On all platforms, expansion is attempted on the arrays __kmp_threads_ and __kmp_root, with appropriate update to __kmp_threads_capacity. Array capacity is increased by doubling with clipping to __kmp_tp_capacity, if threadprivate cache array has been created. Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. After any dead root reclamation, if the clipping value allows array expansion to result in the generation of a total of nNeed free slots, the function does that expansion. 
   If not, nothing is done beyond the possible initial root thread reclamation.

   If any argument is negative, the behavior is undefined. */
static int __kmp_expand_threads(int nNeed) {
  int added = 0;
  int minimumRequiredCapacity;
  int newCapacity;
  kmp_info_t **newThreads;
  kmp_root_t **newRoot;

  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
  // resizing __kmp_threads does not need additional protection if foreign
  // threads are present

#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
  /* only for Windows static library */
  /* reclaim array entries for root threads that are already dead */
  added = __kmp_reclaim_dead_roots();

  // Dead-root reclamation may already have produced some of the needed slots.
  if (nNeed) {
    nNeed -= added;
    if (nNeed < 0)
      nNeed = 0;
  }
#endif
  if (nNeed <= 0)
    return added;

  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
  // > __kmp_max_nth in one of two ways:
  //
  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
  // may not be reused by another thread, so we may need to increase
  // __kmp_threads_capacity to __kmp_max_nth + 1.
  //
  // 2) New foreign root(s) are encountered. We always register new foreign
  // roots. This may cause a smaller # of threads to be allocated at
  // subsequent parallel regions, but the worker threads hang around (and
  // eventually go to sleep) and need slots in the __kmp_threads[] array.
  //
  // Anyway, that is the reason for moving the check to see if
  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
  // instead of having it performed here. -BB

  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);

  /* compute expansion headroom to check if we can expand */
  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
    /* possible expansion too small -- give up */
    return added;
  }
  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;

  // Double the capacity until it suffices, clipping at __kmp_sys_max_nth.
  newCapacity = __kmp_threads_capacity;
  do {
    newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
                                                          : __kmp_sys_max_nth;
  } while (newCapacity < minimumRequiredCapacity);
  // Both arrays live in one allocation: threads first, then roots.
  newThreads = (kmp_info_t **)__kmp_allocate(
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
  newRoot =
      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  KMP_MEMCPY(newThreads, __kmp_threads,
             __kmp_threads_capacity * sizeof(kmp_info_t *));
  KMP_MEMCPY(newRoot, __kmp_root,
             __kmp_threads_capacity * sizeof(kmp_root_t *));
  // Publish the new arrays via volatile stores before freeing the old one;
  // capacity is published last so concurrent readers never index past the
  // array they can see.
  kmp_info_t **temp_threads = __kmp_threads;
  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
  __kmp_free(temp_threads);
  added += newCapacity - __kmp_threads_capacity;
  *(volatile int *)&__kmp_threads_capacity = newCapacity;

  if (newCapacity > __kmp_tp_capacity) {
    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      __kmp_threadprivate_resize_cache(newCapacity);
    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
      *(volatile int *)&__kmp_tp_capacity = newCapacity;
    }
    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
  }

  return added;
}

/* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point.
   Argument TRUE only if are the thread that calls from
   __kmp_do_serial_initialize() */
int __kmp_register_root(int initial_thread) {
  kmp_info_t *root_thread;
  kmp_root_t *root;
  int gtid;
  int capacity;
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  KA_TRACE(20, ("__kmp_register_root: entered\n"));
  KMP_MB();

  /* 2007-03-02:
     If initial thread did not invoke OpenMP RTL yet, and this thread is not an
     initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
     work as expected -- it may return false (that means there is at least one
     empty slot in __kmp_threads array), but it is possible the only free slot
     is #0, which is reserved for initial thread and so cannot be used for this
     one. Following code workarounds this bug.

     However, right solution seems to be not reserving slot #0 for initial
     thread because:
     (1) there is no magic in slot #0,
     (2) we cannot detect initial thread reliably (the first thread which does
     serial initialization may be not a real initial thread). */
  capacity = __kmp_threads_capacity;
  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }

  /* see if there are too many threads */
  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
    if (__kmp_tp_cached) {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
    } else {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
                  __kmp_msg_null);
    }
  }

  /* find an available thread slot */
  /* Don't reassign the zero slot since we need that to only be used by initial
     thread */
  for (gtid = (initial_thread ? 0 : 1);
       TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
    ;
  KA_TRACE(1,
           ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
  KMP_ASSERT(gtid < __kmp_threads_capacity);

  /* update global accounting */
  __kmp_all_nth++;
  TCW_4(__kmp_nth, __kmp_nth + 1);

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* setup this new hierarchy */
  // Allocate the root structure for this slot on first use; it is reused if
  // a root lived in this slot before.
  if (!(root = __kmp_root[gtid])) {
    root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
    KMP_DEBUG_ASSERT(!root->r.r_root_team);
  }

#if KMP_STATS_ENABLED
  // Initialize stats as soon as possible (right after gtid assignment).
  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
  __kmp_stats_thread_ptr->startLife();
  KMP_SET_THREAD_STATE(SERIAL_REGION);
  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
#endif
  __kmp_initialize_root(root);

  /* setup new root thread structure */
  if (root->r.r_uber_thread) {
    root_thread = root->r.r_uber_thread;
  } else {
    root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
    if (__kmp_storage_map) {
      __kmp_print_thread_storage_map(root_thread, gtid);
    }
    root_thread->th.th_info.ds.ds_gtid = gtid;
#if OMPT_SUPPORT
    root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
#endif
    root_thread->th.th_root = root;
    if (__kmp_env_consistency_check) {
      root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
    }
#if USE_FAST_MEMORY
    __kmp_initialize_fast_memory(root_thread);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
    KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
    __kmp_initialize_bget(root_thread);
#endif
    __kmp_init_random(root_thread); // Initialize random number generator
  }

  /* setup the serial team held in reserve by the root thread */
  if (!root_thread->th.th_serial_team) {
    kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
    KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
    root_thread->th.th_serial_team = __kmp_allocate_team(
        root, 1, 1,
#if OMPT_SUPPORT
        ompt_data_none, // root parallel id
#endif
        proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(root_thread->th.th_serial_team);
  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
                root_thread->th.th_serial_team));

  /* drop root_thread into place */
  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);

  root->r.r_root_team->t.t_threads[0] = root_thread;
  root->r.r_hot_team->t.t_threads[0] = root_thread;
  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
  // AC: the team created in reserve, not for execution (it is unused for now).
  root_thread->th.th_serial_team->t.t_serialized = 0;
  root->r.r_uber_thread = root_thread;

  /* initialize the thread, get it ready to go */
  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
  TCW_4(__kmp_init_gtid, TRUE);

  /* prepare the master thread for get_gtid() */
  __kmp_gtid_set_specific(gtid);

#if USE_ITT_BUILD
  __kmp_itt_thread_name(gtid);
#endif /* USE_ITT_BUILD */

#ifdef KMP_TDATA_GTID
  __kmp_gtid = gtid;
#endif
  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);

  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
                "plain=%u\n",
                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
                KMP_INIT_BARRIER_STATE));
  { // Initialize barrier data.
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
#endif
    }
  }
  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
                   KMP_INIT_BARRIER_STATE);

#if KMP_AFFINITY_SUPPORTED
  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
  if (TCR_4(__kmp_init_middle)) {
    __kmp_affinity_set_init_mask(gtid, TRUE);
  }
#endif /* KMP_AFFINITY_SUPPORTED */
  root_thread->th.th_def_allocator = __kmp_def_allocator;
  root_thread->th.th_prev_level = 0;
  root_thread->th.th_prev_num_threads = 1;

  // Every root starts its own contention group.
  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
  tmp->cg_root = root_thread;
  tmp->cg_thread_limit = __kmp_cg_max_nth;
  tmp->cg_nthreads = 1;
  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
                 " cg_nthreads init to 1\n",
                 root_thread, tmp));
  tmp->up = NULL;
  root_thread->th.th_cg_roots = tmp;

  __kmp_root_counter++;

#if OMPT_SUPPORT
  if (!initial_thread && ompt_enabled.enabled) {
    // Report thread-begin and initial implicit task to the OMPT tool.
    kmp_info_t *root_thread = ompt_get_thread();

    ompt_set_thread_state(root_thread, ompt_state_overhead);

    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_initial, __ompt_get_thread_data_internal());
    }
    ompt_data_t *task_data;
    ompt_data_t *parallel_data;
    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                  NULL);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
    }

    ompt_set_thread_state(root_thread, ompt_state_work_serial);
  }
#endif

  KMP_MB();
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

  return gtid;
}

#if KMP_NESTED_HOT_TEAMS
// Recursively frees the nested hot teams of `thr` from `level` downward.
// Returns the number of threads released (the master at each level is not
// counted since it is not freed).
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  int i, n, nth;
  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
  if (!hot_teams || !hot_teams[level].hot_team) {
    return 0;
  }
  KMP_DEBUG_ASSERT(level < max_level);
  kmp_team_t *team = hot_teams[level].hot_team;
  nth = hot_teams[level].hot_team_nth;
  n = nth - 1; // master is not freed
  if (level < max_level - 1) {
    for (i = 0; i < nth; ++i) {
      kmp_info_t *th = team->t.t_threads[i];
      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
      if (i > 0 && th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
  __kmp_free_team(root, team, NULL);
  return n;
}
#endif

// Resets a root thread and clear its root and hot teams.
// Returns the number of __kmp_threads entries directly and indirectly freed.
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
  kmp_team_t *root_team = root->r.r_root_team;
  kmp_team_t *hot_team = root->r.r_hot_team;
  int n = hot_team->t.t_nproc;
  int i;

  KMP_DEBUG_ASSERT(!root->r.r_active);

  root->r.r_root_team = NULL;
  root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before call to __kmp_free_team().
  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
#if KMP_NESTED_HOT_TEAMS
  if (__kmp_hot_teams_max_level >
      0) { // need to free nested hot teams and their threads if any
    for (i = 0; i < hot_team->t.t_nproc; ++i) {
      kmp_info_t *th = hot_team->t.t_threads[i];
      if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
      }
      if (th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
#endif
  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));

  // Before we can reap the thread, we need to make certain that all other
  // threads in the teams that had this root as ancestor have stopped trying to
  // steal tasks.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_wait_to_unref_task_teams();
  }

#if KMP_OS_WINDOWS
  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
  KA_TRACE(
      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
           "\n",
           (LPVOID) & (root->r.r_uber_thread->th),
           root->r.r_uber_thread->th.th_info.ds.ds_thread));
  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif /* KMP_OS_WINDOWS */

#if OMPT_SUPPORT
  // Report implicit-task end and thread end to the OMPT tool.
  ompt_data_t *task_data;
  ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                NULL);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
  }
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
  }
#endif

  TCW_4(__kmp_nth,
        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
  // Leave this root's contention group; free it when we were the last member
  // (the post-decrement yields the pre-decrement count).
  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
                 " to %d\n",
                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
  if (i == 1) {
    // need to free contention group structure
    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
    root->r.r_uber_thread->th.th_cg_roots = NULL;
  }
  __kmp_reap_thread(root->r.r_uber_thread, 1);

  // We cannot put root thread to __kmp_thread_pool, so we have to reap it
  // instead of freeing.
  root->r.r_uber_thread = NULL;
  /* mark root as no longer in use */
  root->r.r_begin = FALSE;

  return n;
}

// Unregisters the calling root thread: waits out proxy tasks, resets the
// root, and releases this thread's gtid slot.
void __kmp_unregister_root_current_thread(int gtid) {
  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
  /* this lock should be ok, since unregister_root_current_thread is never
     called during an abort, only during a normal close.
     furthermore, if you have the forkjoin lock, you should never try to get
     the initz lock */
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
    KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
                  "exiting T#%d\n",
                  gtid));
    __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
    return;
  }
  kmp_root_t *root = __kmp_root[gtid];

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  KMP_MB();

  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_task_team_t *task_team = thread->th.th_task_team;

  // we need to wait for the proxy tasks before finishing the thread
  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
#if OMPT_SUPPORT
    // the runtime is shutting down so we won't report any events
    thread->th.ompt_thread_info.state = ompt_state_undefined;
#endif
    __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
  }

  __kmp_reset_root(gtid, root);

  /* free up this thread slot */
  __kmp_gtid_set_specific(KMP_GTID_DNE);
#ifdef KMP_TDATA_GTID
  __kmp_gtid = KMP_GTID_DNE;
#endif

  KMP_MB();
  KC_TRACE(10,
           ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));

  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
}

#if KMP_OS_WINDOWS
/* __kmp_forkjoin_lock must be already held
   Unregisters a root thread that is not the current thread. Returns the number
   of __kmp_threads entries freed as a result.
 */
// Unregister a root thread other than the caller.  Caller must hold
// __kmp_forkjoin_lock (see comment above).  Returns the number of
// __kmp_threads entries freed, as reported by __kmp_reset_root().
static int __kmp_unregister_root_other_thread(int gtid) {
  kmp_root_t *root = __kmp_root[gtid];
  int r;

  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
  KMP_ASSERT(root->r.r_active == FALSE);

  r = __kmp_reset_root(gtid, root);
  KC_TRACE(10,
           ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
  return r;
}
#endif

#if KMP_DEBUG
// Debug-only helper: print the calling thread's team / serial-team /
// current-task pointers for inspection.
void __kmp_task_info() {

  kmp_int32 gtid = __kmp_entry_gtid();
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *steam = this_thr->th.th_serial_team;
  kmp_team_t *team = this_thr->th.th_team;

  __kmp_printf(
      "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
      "ptask=%p\n",
      gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
      team->t.t_implicit_task_taskdata[tid].td_parent);
}
#endif // KMP_DEBUG

/* TODO optimize with one big memclr, take out what isn't needed, split
   responsibility to workers as much as possible, and delay initialization of
   features as much as possible */
// Initialize per-thread state (team cache, implicit task, dispatch buffers,
// contention-group linkage) for thread `this_thr` joining `team` at
// team-local id `tid`, global id `gtid`.
static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
                                  int tid, int gtid) {
  /* this_thr->th.th_info.ds.ds_gtid is setup in
     kmp_allocate_thread/create_worker.
this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
  kmp_info_t *master = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(this_thr != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  KMP_DEBUG_ASSERT(master);
  KMP_DEBUG_ASSERT(master->th.th_root);

  KMP_MB();

  TCW_SYNC_PTR(this_thr->th.th_team, team);

  this_thr->th.th_info.ds.ds_tid = tid;
  this_thr->th.th_set_nproc = 0;
  if (__kmp_tasking_mode != tskm_immediate_exec)
    // When tasking is possible, threads are not safe to reap until they are
    // done tasking; this will be set when tasking code is exited in wait
    this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
  else // no tasking --> always safe to reap
    this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
  this_thr->th.th_set_proc_bind = proc_bind_default;
#if KMP_AFFINITY_SUPPORTED
  this_thr->th.th_new_place = this_thr->th.th_current_place;
#endif
  this_thr->th.th_root = master->th.th_root;

  /* setup the thread's cache of the team structure */
  this_thr->th.th_team_nproc = team->t.t_nproc;
  this_thr->th.th_team_master = master;
  this_thr->th.th_team_serialized = team->t.t_serialized;
  TCW_PTR(this_thr->th.th_sleep_loc, NULL);

  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);

  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));

  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
                           team, tid, TRUE);

  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
                tid, gtid, this_thr, this_thr->th.th_current_task));
  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
  // __kmp_initialize_team()?

  /* TODO no worksharing in speculative threads */
  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];

  this_thr->th.th_local.this_construct = 0;

  if (!this_thr->th.th_pri_common) {
    // Lazily allocate the thread's private common-block table (used by the
    // threadprivate machinery).
    this_thr->th.th_pri_common =
        (struct common_table *)__kmp_allocate(sizeof(struct common_table));
    if (__kmp_storage_map) {
      __kmp_print_storage_map_gtid(
          gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
          sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
    }
    this_thr->th.th_pri_head = NULL;
  }

  if (this_thr != master && // Master's CG root is initialized elsewhere
      this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
    // Make new thread's CG root same as master's
    KMP_DEBUG_ASSERT(master->th.th_cg_roots);
    kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
    if (tmp) {
      // worker changes CG, need to check if old CG should be freed
      int i = tmp->cg_nthreads--;
      KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
                     " on node %p of thread %p to %d\n",
                     this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
      if (i == 1) {
        __kmp_free(tmp); // last thread left CG --> free it
      }
    }
    this_thr->th.th_cg_roots = master->th.th_cg_roots;
    // Increment new thread's CG root's counter to add the new thread
    this_thr->th.th_cg_roots->cg_nthreads++;
    KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
                   " node %p of thread %p to %d\n",
                   this_thr, this_thr->th.th_cg_roots,
                   this_thr->th.th_cg_roots->cg_root,
                   this_thr->th.th_cg_roots->cg_nthreads));
    this_thr->th.th_current_task->td_icvs.thread_limit =
        this_thr->th.th_cg_roots->cg_thread_limit;
  }

  /* Initialize dynamic dispatch */
  {
    volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
    // Use team max_nproc since this will never change for the team.
    size_t disp_size =
        sizeof(dispatch_private_info_t) *
        (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
    KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
                  team->t.t_max_nproc));
    KMP_ASSERT(dispatch);
    KMP_DEBUG_ASSERT(team->t.t_dispatch);
    KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);

    dispatch->th_disp_index = 0;
    dispatch->th_doacross_buf_idx = 0;
    if (!dispatch->th_disp_buffer) {
      dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(disp_size);

      if (__kmp_storage_map) {
        __kmp_print_storage_map_gtid(
            gtid, &dispatch->th_disp_buffer[0],
            &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
                                          ? 1
                                          : __kmp_dispatch_num_buffers],
            disp_size, "th_%d.th_dispatch.th_disp_buffer "
                       "(team_%d.t_dispatch[%d].th_disp_buffer)",
            gtid, team->t.t_id, gtid);
      }
    } else {
      // Buffer already exists (thread reuse): just clear it.
      memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
    }

    dispatch->th_dispatch_pr_current = 0;
    dispatch->th_dispatch_sh_current = 0;

    dispatch->th_deo_fcn = 0; /* ORDERED     */
    dispatch->th_dxo_fcn = 0; /* END ORDERED */
  }

  this_thr->th.th_next_pool = NULL;

  if (!this_thr->th.th_task_state_memo_stack) {
    size_t i;
    this_thr->th.th_task_state_memo_stack =
        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
    this_thr->th.th_task_state_top = 0;
    this_thr->th.th_task_state_stack_sz = 4;
    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
         ++i) // zero init the stack
      this_thr->th.th_task_state_memo_stack[i] = 0;
  }

  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);

  KMP_MB();
}

/* allocate a new thread for the requesting team. this is only called from
   within a forkjoin critical section. we will first try to get an available
   thread from the thread pool. if none is available, we will fork a new one
   assuming we are able to create a new one. this should be assured, as the
   caller should check on this first.
 */
// Obtain a worker for (root, team) at team-local id new_tid: reuse a pooled
// thread if the thread pool is non-empty, otherwise allocate a kmp_info_t,
// assign the first free gtid slot, and fork a brand-new worker thread.
kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
                                  int new_tid) {
  kmp_team_t *serial_team;
  kmp_info_t *new_thr;
  int new_gtid;

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
  KMP_DEBUG_ASSERT(root && team);
#if !KMP_NESTED_HOT_TEAMS
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
#endif
  KMP_MB();

  /* first, try to get one from the thread pool */
  if (__kmp_thread_pool) {
    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
    if (new_thr == __kmp_thread_pool_insert_pt) {
      __kmp_thread_pool_insert_pt = NULL;
    }
    TCW_4(new_thr->th.th_in_pool, FALSE);
    __kmp_suspend_initialize_thread(new_thr);
    __kmp_lock_suspend_mx(new_thr);
    if (new_thr->th.th_active_in_pool == TRUE) {
      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      new_thr->th.th_active_in_pool = FALSE;
    }
    __kmp_unlock_suspend_mx(new_thr);

    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
    KMP_ASSERT(!new_thr->th.th_team);
    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);

    /* setup the thread structure */
    __kmp_initialize_info(new_thr, team, new_tid,
                          new_thr->th.th_info.ds.ds_gtid);
    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);

    TCW_4(__kmp_nth, __kmp_nth + 1);

    new_thr->th.th_task_state = 0;
    new_thr->th.th_task_state_top = 0;
    new_thr->th.th_task_state_stack_sz = 4;

#ifdef KMP_ADJUST_BLOCKTIME
    /* Adjust blocktime back to zero if necessary */
    /* Middle initialization might not have occurred yet */
    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
      if (__kmp_nth > __kmp_avail_proc) {
        __kmp_zero_bt = TRUE;
      }
    }
#endif /* KMP_ADJUST_BLOCKTIME */

#if KMP_DEBUG
    // If thread entered pool via __kmp_free_thread, wait_flag should !=
    // KMP_BARRIER_PARENT_FLAG.
    int b;
    kmp_balign_t *balign = new_thr->th.th_bar;
    for (b = 0; b < bs_last_barrier; ++b)
      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#endif

    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));

    KMP_MB();
    return new_thr;
  }

  /* no, well fork a new one */
  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);

#if KMP_USE_MONITOR
  // If this is the first worker thread the RTL is creating, then also
  // launch the monitor thread.  We try to do this as early as possible.
  if (!TCR_4(__kmp_init_monitor)) {
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (!TCR_4(__kmp_init_monitor)) {
      KF_TRACE(10, ("before __kmp_create_monitor\n"));
      TCW_4(__kmp_init_monitor, 1);
      __kmp_create_monitor(&__kmp_monitor);
      KF_TRACE(10, ("after __kmp_create_monitor\n"));
#if KMP_OS_WINDOWS
      // AC: wait until monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is high probability that
      // monitor thread started after the library shutdown. At shutdown it is
      // too late to cope with the problem, because when the master is in
      // DllMain (process detach) the monitor has no chances to start (it is
      // blocked), and master has no means to inform the monitor that the
      // library has gone, because all the memory which the monitor can access
      // is going to be released/reset.
      while (TCR_4(__kmp_init_monitor) < 2) {
        KMP_YIELD(TRUE);
      }
      KF_TRACE(10, ("after monitor thread has started\n"));
#endif
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
  }
#endif

  KMP_MB();
  // Find the first free gtid slot; slot 0 belongs to the initial thread.
  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
    KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
  }

  /* allocate space for it. */
  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));

  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);

  if (__kmp_storage_map) {
    __kmp_print_thread_storage_map(new_thr, new_gtid);
  }

  // add the reserve serialized team, initialized from the team's master thread
  {
    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
    new_thr->th.th_serial_team = serial_team =
        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
#if OMPT_SUPPORT
                                          ompt_data_none, // root parallel id
#endif
                                          proc_bind_default, &r_icvs,
                                          0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
  // execution (it is unused for now).
  serial_team->t.t_threads[0] = new_thr;
  KF_TRACE(10,
           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
            new_thr));

  /* setup the thread structures */
  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);

#if USE_FAST_MEMORY
  __kmp_initialize_fast_memory(new_thr);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
  __kmp_initialize_bget(new_thr);
#endif

  __kmp_init_random(new_thr); // Initialize random number generator

  /* Initialize these only once when thread is grabbed for a team allocation */
  KA_TRACE(20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
                __kmp_get_gtid(), KMP_INIT_BARRIER_STATE,
                KMP_INIT_BARRIER_STATE));

  int b;
  kmp_balign_t *balign = new_thr->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
    balign[b].bb.team = NULL;
    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
    balign[b].bb.use_oncore_barrier = 0;
  }

  new_thr->th.th_spin_here = FALSE;
  new_thr->th.th_next_waiting = 0;
#if KMP_OS_UNIX
  new_thr->th.th_blocking = false;
#endif

#if KMP_AFFINITY_SUPPORTED
  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif
  new_thr->th.th_def_allocator = __kmp_def_allocator;
  new_thr->th.th_prev_level = 0;
  new_thr->th.th_prev_num_threads = 1;

  TCW_4(new_thr->th.th_in_pool, FALSE);
  new_thr->th.th_active_in_pool = FALSE;
  TCW_4(new_thr->th.th_active, TRUE);

  /* adjust the global counters */
  __kmp_all_nth++;
  __kmp_nth++;

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to zero if necessary */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* actually fork it and create the new worker thread */
  KF_TRACE(
      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
  KF_TRACE(10,
           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
                new_gtid));
  KMP_MB();
  return new_thr;
}

/* Reinitialize team for reuse.

   The hot team code calls this case at every fork barrier, so EPCC barrier
   test are extremely sensitive to changes in it, esp. writes to the team
   struct, which cause a cache invalidation in all threads.

   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
 */
// Re-prime a team structure for reuse: update ident/id, and copy the new
// ICVs into the master thread's implicit task.  Deliberately writes as few
// team-struct fields as possible (see performance warning above).
static void __kmp_reinitialize_team(kmp_team_t *team,
                                    kmp_internal_control_t *new_icvs,
                                    ident_t *loc) {
  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
                team->t.t_threads[0], team));
  KMP_DEBUG_ASSERT(team && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
  KMP_CHECK_UPDATE(team->t.t_ident, loc);

  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
  // Copy ICVs to the master thread's implicit taskdata
  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);

  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
                team->t.t_threads[0], team));
}

/* Initialize the team data structure.
   This assumes the t_threads and t_max_nproc are already set.
   Also, we don't touch the arguments */
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc) {
  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));

  /* verify */
  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_MB();

  team->t.t_master_tid = 0; /* not needed */
  /* team->t.t_master_bar;        not needed */
  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
  team->t.t_nproc = new_nproc;

  /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
  team->t.t_next_pool = NULL;
  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
   * up hot team */

  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
  team->t.t_invoke = NULL; /* not needed */

  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
  team->t.t_sched.sched = new_icvs->sched.sched;

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  team->t.t_fp_control_saved = FALSE; /* not needed */
  team->t.t_x87_fpu_control_word = 0; /* not needed */
  team->t.t_mxcsr = 0; /* not needed */
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  team->t.t_construct = 0;

  team->t.t_ordered.dt.t_value = 0;
  team->t.t_master_active = FALSE;

#ifdef KMP_DEBUG
  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
#endif
#if KMP_OS_WINDOWS
  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
#endif

  team->t.t_control_stack_top = NULL;

  __kmp_reinitialize_team(team, new_icvs, loc);

  KMP_MB();
  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
}

-#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
+#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
/* Sets full mask for thread and returns old mask, no changes to structures. */
static void __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
  if (KMP_AFFINITY_CAPABLE()) {
    int status;
    if (old_mask != NULL) {
      // Save the caller's current mask so it can be restored later.
      status = __kmp_get_system_affinity(old_mask, TRUE);
      int error = errno;
      if (status != 0) {
        __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
                    __kmp_msg_null);
      }
    }
    __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
  }
}
#endif

#if KMP_AFFINITY_SUPPORTED

// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a thread in their partition.
// The master thread's partition should already include its current binding.
// NOTE(review): callers in this file invoke __kmp_partition_places(team) with
// a single argument, so update_master_only presumably has a default of 0 in
// the real declaration; the "= 0" appears to have been lost in extraction —
// verify against upstream.
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Copy the master thread's place partition to the team struct
  kmp_info_t *master_th = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master_th != NULL);
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  int first_place = master_th->th.th_first_place;
  int last_place = master_th->th.th_last_place;
  int masters_place = master_th->th.th_current_place;
  team->t.t_first_place = first_place;
  team->t.t_last_place = last_place;

  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
                "bound to place %d partition = [%d,%d]\n",
                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
                team->t.t_id, masters_place, first_place, last_place));

  switch (proc_bind) {

  case proc_bind_default:
    // serial teams might have the proc_bind policy set to proc_bind_default. It
    // doesn't matter, as we don't rebind master thread for any proc_bind policy
    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
    break;

  case proc_bind_master: {
    // All workers share the master's partition and are bound to the master's
    // place.
    int f;
    int n_th = team->t.t_nproc;
    for (f = 1; f < n_th; f++) {
      kmp_info_t *th = team->t.t_threads[f];
      KMP_DEBUG_ASSERT(th != NULL);
      th->th.th_first_place = first_place;
      th->th.th_last_place = last_place;
      th->th.th_new_place = masters_place;
      if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
          team->t.t_display_affinity != 1) {
        team->t.t_display_affinity = 1;
      }

      KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
                     "partition = [%d,%d]\n",
                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                     f, masters_place, first_place, last_place));
    }
  } break;

  case proc_bind_close: {
    // Pack threads onto consecutive places starting at the master's place.
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      // Partition wraps around the end of the place list.
      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      // One place per thread (or fewer threads than places).
      int place = masters_place;
      for (f = 1; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        // Advance to the next place, wrapping within the partition.
        if (place == last_place) {
          place = first_place;
        } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
          place = 0;
        } else {
          place++;
        }
        th->th.th_first_place = first_place;
        th->th.th_last_place = last_place;
        th->th.th_new_place = place;
        if (__kmp_display_affinity && place != th->th.th_current_place &&
            team->t.t_display_affinity != 1) {
          team->t.t_display_affinity = 1;
        }

        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, place, first_place, last_place));
      }
    } else {
      // More threads than places: distribute S (or S+1) threads per place,
      // spreading the `rem` extras with stride `gap`.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      for (f = 0; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        th->th.th_first_place = first_place;
        th->th.th_last_place = last_place;
        th->th.th_new_place = place;
        if (__kmp_display_affinity && place != th->th.th_current_place &&
            team->t.t_display_affinity != 1) {
          team->t.t_display_affinity = 1;
        }
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place full; don't add extra
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100,
                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                  "partition = [%d,%d]\n",
                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
                  th->th.th_new_place, first_place, last_place));
      }
      KMP_DEBUG_ASSERT(place == masters_place);
    }
  } break;

  case proc_bind_spread: {
    // Spread threads as evenly as possible over the partition, giving each
    // thread its own sub-partition [th_first_place, th_last_place].
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    int thidx;
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      int place = -1;

      // NOTE(review): the static_cast below is missing its template argument
      // (likely <int>) — appears to be text-extraction garbling; verify
      // against upstream before building.
      if (n_places != static_cast(__kmp_affinity_num_masks)) {
        int S = n_places / n_th;
        int s_count, rem, gap, gap_ct;

        place = masters_place;
        rem = n_places - n_th * S;
        gap = rem ? n_th / rem : 1;
        gap_ct = gap;
        thidx = n_th;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          kmp_info_t *th = team->t.t_threads[f];
          KMP_DEBUG_ASSERT(th != NULL);

          th->th.th_first_place = place;
          th->th.th_new_place = place;
          if (__kmp_display_affinity && place != th->th.th_current_place &&
              team->t.t_display_affinity != 1) {
            team->t.t_display_affinity = 1;
          }
          s_count = 1;
          while (s_count < S) {
            if (place == last_place) {
              place = first_place;
            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
              place = 0;
            } else {
              place++;
            }
            s_count++;
          }
          if (rem && (gap_ct == gap)) {
            if (place == last_place) {
              place = first_place;
            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
              place = 0;
            } else {
              place++;
            }
            rem--;
            gap_ct = 0;
          }
          th->th.th_last_place = place;
          gap_ct++;

          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }

          KA_TRACE(100,
                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                    "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                    f, th->th.th_new_place, th->th.th_first_place,
                    th->th.th_last_place, __kmp_affinity_num_masks));
        }
      } else {
        /* Having uniform space of available computation places I can create
           T partitions of round(P/T) size and put threads into the first
           place of each partition. */
        // NOTE(review): each static_cast below is missing its template
        // argument (likely <double> / <int>) — text-extraction garbling;
        // verify against upstream.
        double current = static_cast(masters_place);
        double spacing =
            (static_cast(n_places + 1) / static_cast(n_th));
        int first, last;
        kmp_info_t *th;

        thidx = n_th + 1;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          first = static_cast(current);
          last = static_cast(current + spacing) - 1;
          KMP_DEBUG_ASSERT(last >= first);
          if (first >= n_places) {
            // Sub-partition wrapped past the end of the place list.
            if (masters_place) {
              first -= n_places;
              last -= n_places;
              if (first == (masters_place + 1)) {
                KMP_DEBUG_ASSERT(f == n_th);
                first--;
              }
              if (last == masters_place) {
                KMP_DEBUG_ASSERT(f == (n_th - 1));
                last--;
              }
            } else {
              KMP_DEBUG_ASSERT(f == n_th);
              first = 0;
              last = 0;
            }
          }
          if (last >= n_places) {
            last = (n_places - 1);
          }
          place = first;
          current += spacing;
          if (f < n_th) {
            KMP_DEBUG_ASSERT(0 <= first);
            KMP_DEBUG_ASSERT(n_places > first);
            KMP_DEBUG_ASSERT(0 <= last);
            KMP_DEBUG_ASSERT(n_places > last);
            KMP_DEBUG_ASSERT(last_place >= first_place);
            th = team->t.t_threads[f];
            KMP_DEBUG_ASSERT(th);
            th->th.th_first_place = first;
            th->th.th_new_place = place;
            th->th.th_last_place = last;
            if (__kmp_display_affinity && place != th->th.th_current_place &&
                team->t.t_display_affinity != 1) {
              team->t.t_display_affinity = 1;
            }
            KA_TRACE(100,
                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                      "partition = [%d,%d], spacing = %.4f\n",
                      __kmp_gtid_from_thread(team->t.t_threads[f]),
                      team->t.t_id, f, th->th.th_new_place,
                      th->th.th_first_place, th->th.th_last_place, spacing));
          }
        }
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    } else {
      // More threads than places: same S / S+1 distribution as the
      // proc_bind_close overflow case, but each thread's partition is the
      // single place it lands on.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      thidx = n_th;
      if (update_master_only == 1)
        thidx = 1;
      for (f = 0; f < thidx; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        th->th.th_first_place = place;
        th->th.th_last_place = place;
        th->th.th_new_place = place;
        if (__kmp_display_affinity && place != th->th.th_current_place &&
            team->t.t_display_affinity != 1) {
          team->t.t_display_affinity = 1;
        }
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move on to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place is full; don't add extra thread
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, th->th.th_new_place,
                       th->th.th_first_place, th->th.th_last_place));
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    }
  } break;

  default:
    break;
  }

  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
}

#endif // KMP_AFFINITY_SUPPORTED

/* allocate a new team data structure to use.
take one off of the free pool if available */ kmp_team_t * __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, #if OMPT_SUPPORT ompt_data_t ompt_parallel_data, #endif kmp_proc_bind_t new_proc_bind, kmp_internal_control_t *new_icvs, int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); int f; kmp_team_t *team; int use_hot_team = !root->r.r_active; int level = 0; KA_TRACE(20, ("__kmp_allocate_team: called\n")); KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); KMP_DEBUG_ASSERT(max_nproc >= new_nproc); KMP_MB(); #if KMP_NESTED_HOT_TEAMS kmp_hot_team_ptr_t *hot_teams; if (master) { team = master->th.th_team; level = team->t.t_active_level; if (master->th.th_teams_microtask) { // in teams construct? if (master->th.th_teams_size.nteams > 1 && ( // #teams > 1 team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams master->th.th_teams_level < team->t.t_level)) { // or nested parallel inside the teams ++level; // not increment if #teams==1, or for outer fork of the teams; // increment otherwise } } hot_teams = master->th.th_hot_teams; if (level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level] .hot_team) { // hot team has already been allocated for given level use_hot_team = 1; } else { use_hot_team = 0; } } #endif // Optimization to use a "hot" team if (use_hot_team && new_nproc > 1) { KMP_DEBUG_ASSERT(new_nproc <= max_nproc); #if KMP_NESTED_HOT_TEAMS team = hot_teams[level].hot_team; #else team = root->r.r_hot_team; #endif #if KMP_DEBUG if (__kmp_tasking_mode != tskm_immediate_exec) { KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " "task_team[1] = %p before reinit\n", team->t.t_task_team[0], team->t.t_task_team[1])); } #endif // Has the number of threads changed? /* Let's assume the most common case is that the number of threads is unchanged, and put that case first. 
*/ if (team->t.t_nproc == new_nproc) { // Check changes in number of threads KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); // This case can mean that omp_set_num_threads() was called and the hot // team size was already reduced, so we check the special flag if (team->t.t_size_changed == -1) { team->t.t_size_changed = 1; } else { KMP_CHECK_UPDATE(team->t.t_size_changed, 0); } // TODO???: team->t.t_max_active_levels = new_max_active_levels; kmp_r_sched_t new_sched = new_icvs->sched; // set master's schedule as new run-time schedule KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); __kmp_reinitialize_team(team, new_icvs, root->r.r_uber_thread->th.th_ident); KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, team->t.t_threads[0], team)); __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); #if KMP_AFFINITY_SUPPORTED if ((team->t.t_size_changed == 0) && (team->t.t_proc_bind == new_proc_bind)) { if (new_proc_bind == proc_bind_spread) { __kmp_partition_places( team, 1); // add flag to update only master for spread } KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " "proc_bind = %d, partition = [%d,%d]\n", team->t.t_id, new_proc_bind, team->t.t_first_place, team->t.t_last_place)); } else { KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); __kmp_partition_places(team); } #else KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); #endif /* KMP_AFFINITY_SUPPORTED */ } else if (team->t.t_nproc > new_nproc) { KA_TRACE(20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc)); team->t.t_size_changed = 1; #if KMP_NESTED_HOT_TEAMS if (__kmp_hot_teams_mode == 0) { // AC: saved number of threads should correspond to team's value in this // mode, can be bigger in mode 1, when hot team has threads in reserve KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); hot_teams[level].hot_team_nth = new_nproc; #endif // KMP_NESTED_HOT_TEAMS /* release the extra threads we 
don't need any more */ for (f = new_nproc; f < team->t.t_nproc; f++) { KMP_DEBUG_ASSERT(team->t.t_threads[f]); if (__kmp_tasking_mode != tskm_immediate_exec) { // When decreasing team size, threads no longer in the team should // unref task team. team->t.t_threads[f]->th.th_task_team = NULL; } __kmp_free_thread(team->t.t_threads[f]); team->t.t_threads[f] = NULL; } #if KMP_NESTED_HOT_TEAMS } // (__kmp_hot_teams_mode == 0) else { // When keeping extra threads in team, switch threads to wait on own // b_go flag for (f = new_nproc; f < team->t.t_nproc; ++f) { KMP_DEBUG_ASSERT(team->t.t_threads[f]); kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; for (int b = 0; b < bs_last_barrier; ++b) { if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; } KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); } } } #endif // KMP_NESTED_HOT_TEAMS team->t.t_nproc = new_nproc; // TODO???: team->t.t_max_active_levels = new_max_active_levels; KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); __kmp_reinitialize_team(team, new_icvs, root->r.r_uber_thread->th.th_ident); // Update remaining threads for (f = 0; f < new_nproc; ++f) { team->t.t_threads[f]->th.th_team_nproc = new_nproc; } // restore the current task state of the master thread: should be the // implicit task KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, team->t.t_threads[0], team)); __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); #ifdef KMP_DEBUG for (f = 0; f < team->t.t_nproc; f++) { KMP_DEBUG_ASSERT(team->t.t_threads[f] && team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); } #endif KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); #if KMP_AFFINITY_SUPPORTED __kmp_partition_places(team); #endif } else { // team->t.t_nproc < new_nproc -#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED +#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED kmp_affin_mask_t *old_mask; if (KMP_AFFINITY_CAPABLE()) { 
KMP_CPU_ALLOC(old_mask); } #endif KA_TRACE(20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc)); team->t.t_size_changed = 1; #if KMP_NESTED_HOT_TEAMS int avail_threads = hot_teams[level].hot_team_nth; if (new_nproc < avail_threads) avail_threads = new_nproc; kmp_info_t **other_threads = team->t.t_threads; for (f = team->t.t_nproc; f < avail_threads; ++f) { // Adjust barrier data of reserved threads (if any) of the team // Other data will be set in __kmp_initialize_info() below. int b; kmp_balign_t *balign = other_threads[f]->th.th_bar; for (b = 0; b < bs_last_barrier; ++b) { balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); #if USE_DEBUGGER balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; #endif } } if (hot_teams[level].hot_team_nth >= new_nproc) { // we have all needed threads in reserve, no need to allocate any // this only possible in mode 1, cannot have reserved threads in mode 0 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); team->t.t_nproc = new_nproc; // just get reserved threads involved } else { // we may have some threads in reserve, but not enough team->t.t_nproc = hot_teams[level] .hot_team_nth; // get reserved threads involved if any hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size #endif // KMP_NESTED_HOT_TEAMS if (team->t.t_max_nproc < new_nproc) { /* reallocate larger arrays */ __kmp_reallocate_team_arrays(team, new_nproc); __kmp_reinitialize_team(team, new_icvs, NULL); } -#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED +#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED /* Temporarily set full mask for master thread before creation of workers. The reason is that workers inherit the affinity from master, so if a lot of workers are created on the single core quickly, they don't get a chance to set their own affinity for a long time. 
*/ __kmp_set_thread_affinity_mask_full_tmp(old_mask); #endif /* allocate new threads for the hot team */ for (f = team->t.t_nproc; f < new_nproc; f++) { kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); KMP_DEBUG_ASSERT(new_worker); team->t.t_threads[f] = new_worker; KA_TRACE(20, ("__kmp_allocate_team: team %d init T#%d arrived: " "join=%llu, plain=%llu\n", team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, team->t.t_bar[bs_forkjoin_barrier].b_arrived, team->t.t_bar[bs_plain_barrier].b_arrived)); { // Initialize barrier data for new threads. int b; kmp_balign_t *balign = new_worker->th.th_bar; for (b = 0; b < bs_last_barrier; ++b) { balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); #if USE_DEBUGGER balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; #endif } } } -#if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED +#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED if (KMP_AFFINITY_CAPABLE()) { /* Restore initial master thread's affinity mask */ __kmp_set_system_affinity(old_mask, TRUE); KMP_CPU_FREE(old_mask); } #endif #if KMP_NESTED_HOT_TEAMS } // end of check of t_nproc vs. new_nproc vs. hot_team_nth #endif // KMP_NESTED_HOT_TEAMS /* make sure everyone is syncronized */ int old_nproc = team->t.t_nproc; // save old value and use to update only // new threads below __kmp_initialize_team(team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident); /* reinitialize the threads */ KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); for (f = 0; f < team->t.t_nproc; ++f) __kmp_initialize_info(team->t.t_threads[f], team, f, __kmp_gtid_from_tid(f, team)); if (level) { // set th_task_state for new threads in nested hot team // __kmp_initialize_info() no longer zeroes th_task_state, so we should // only need to set the th_task_state for the new threads. 
th_task_state // for master thread will not be accurate until after this in // __kmp_fork_call(), so we look to the master's memo_stack to get the // correct value. for (f = old_nproc; f < team->t.t_nproc; ++f) team->t.t_threads[f]->th.th_task_state = team->t.t_threads[0]->th.th_task_state_memo_stack[level]; } else { // set th_task_state for new threads in non-nested hot team int old_state = team->t.t_threads[0]->th.th_task_state; // copy master's state for (f = old_nproc; f < team->t.t_nproc; ++f) team->t.t_threads[f]->th.th_task_state = old_state; } #ifdef KMP_DEBUG for (f = 0; f < team->t.t_nproc; ++f) { KMP_DEBUG_ASSERT(team->t.t_threads[f] && team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); } #endif KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); #if KMP_AFFINITY_SUPPORTED __kmp_partition_places(team); #endif } // Check changes in number of threads kmp_info_t *master = team->t.t_threads[0]; if (master->th.th_teams_microtask) { for (f = 1; f < new_nproc; ++f) { // propagate teams construct specific info to workers kmp_info_t *thr = team->t.t_threads[f]; thr->th.th_teams_microtask = master->th.th_teams_microtask; thr->th.th_teams_level = master->th.th_teams_level; thr->th.th_teams_size = master->th.th_teams_size; } } #if KMP_NESTED_HOT_TEAMS if (level) { // Sync barrier state for nested hot teams, not needed for outermost hot // team. 
for (f = 1; f < new_nproc; ++f) { kmp_info_t *thr = team->t.t_threads[f]; int b; kmp_balign_t *balign = thr->th.th_bar; for (b = 0; b < bs_last_barrier; ++b) { balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); #if USE_DEBUGGER balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; #endif } } } #endif // KMP_NESTED_HOT_TEAMS /* reallocate space for arguments if necessary */ __kmp_alloc_argv_entries(argc, team, TRUE); KMP_CHECK_UPDATE(team->t.t_argc, argc); // The hot team re-uses the previous task team, // if untouched during the previous release->gather phase. KF_TRACE(10, (" hot_team = %p\n", team)); #if KMP_DEBUG if (__kmp_tasking_mode != tskm_immediate_exec) { KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " "task_team[1] = %p after reinit\n", team->t.t_task_team[0], team->t.t_task_team[1])); } #endif #if OMPT_SUPPORT __ompt_team_assign_id(team, ompt_parallel_data); #endif KMP_MB(); return team; } /* next, let's try to take one from the team pool */ KMP_MB(); for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */ if (team->t.t_max_nproc >= max_nproc) { /* take this team from the team pool */ __kmp_team_pool = team->t.t_next_pool; /* setup the team for fresh use */ __kmp_initialize_team(team, new_nproc, new_icvs, NULL); KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " "task_team[1] %p to NULL\n", &team->t.t_task_team[0], &team->t.t_task_team[1])); team->t.t_task_team[0] = NULL; team->t.t_task_team[1] = NULL; /* reallocate space for arguments if necessary */ __kmp_alloc_argv_entries(argc, team, TRUE); KMP_CHECK_UPDATE(team->t.t_argc, argc); KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); { // Initialize barrier data. 
int b; for (b = 0; b < bs_last_barrier; ++b) { team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; #if USE_DEBUGGER team->t.t_bar[b].b_master_arrived = 0; team->t.t_bar[b].b_team_arrived = 0; #endif } } team->t.t_proc_bind = new_proc_bind; KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id)); #if OMPT_SUPPORT __ompt_team_assign_id(team, ompt_parallel_data); #endif KMP_MB(); return team; } /* reap team if it is too small, then loop back and check the next one */ // not sure if this is wise, but, will be redone during the hot-teams // rewrite. /* TODO: Use technique to find the right size hot-team, don't reap them */ team = __kmp_reap_team(team); __kmp_team_pool = team; } /* nothing available in the pool, no matter, make a new team! */ KMP_MB(); team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); /* and set it up */ team->t.t_max_nproc = max_nproc; /* NOTE well, for some reason allocating one big buffer and dividing it up seems to really hurt performance a lot on the P4, so, let's not use this */ __kmp_allocate_team_arrays(team, max_nproc); KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); __kmp_initialize_team(team, new_nproc, new_icvs, NULL); KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " "%p to NULL\n", &team->t.t_task_team[0], &team->t.t_task_team[1])); team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes // memory, no need to duplicate team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes // memory, no need to duplicate if (__kmp_storage_map) { __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); } /* allocate space for arguments */ __kmp_alloc_argv_entries(argc, team, FALSE); team->t.t_argc = argc; KA_TRACE(20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); { // Initialize barrier data. 
int b; for (b = 0; b < bs_last_barrier; ++b) { team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; #if USE_DEBUGGER team->t.t_bar[b].b_master_arrived = 0; team->t.t_bar[b].b_team_arrived = 0; #endif } } team->t.t_proc_bind = new_proc_bind; #if OMPT_SUPPORT __ompt_team_assign_id(team, ompt_parallel_data); team->t.ompt_serialized_team_info = NULL; #endif KMP_MB(); KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id)); return team; } /* TODO implement hot-teams at all levels */ /* TODO implement lazy thread release on demand (disband request) */ /* free the team. return it to the team pool. release all the threads * associated with it */ void __kmp_free_team(kmp_root_t *root, kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { int f; KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id)); /* verify state */ KMP_DEBUG_ASSERT(root); KMP_DEBUG_ASSERT(team); KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); KMP_DEBUG_ASSERT(team->t.t_threads); int use_hot_team = team == root->r.r_hot_team; #if KMP_NESTED_HOT_TEAMS int level; kmp_hot_team_ptr_t *hot_teams; if (master) { level = team->t.t_active_level - 1; if (master->th.th_teams_microtask) { // in teams construct? if (master->th.th_teams_size.nteams > 1) { ++level; // level was not increased in teams construct for // team_of_masters } if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && master->th.th_teams_level == team->t.t_level) { ++level; // level was not increased in teams construct for // team_of_workers before the parallel } // team->t.t_level will be increased inside parallel } hot_teams = master->th.th_hot_teams; if (level < __kmp_hot_teams_max_level) { KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); use_hot_team = 1; } } #endif // KMP_NESTED_HOT_TEAMS /* team is done working */ TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library. 
#if KMP_OS_WINDOWS team->t.t_copyin_counter = 0; // init counter for possible reuse #endif // Do not reset pointer to parent team to NULL for hot teams. /* if we are non-hot team, release our threads */ if (!use_hot_team) { if (__kmp_tasking_mode != tskm_immediate_exec) { // Wait for threads to reach reapable state for (f = 1; f < team->t.t_nproc; ++f) { KMP_DEBUG_ASSERT(team->t.t_threads[f]); kmp_info_t *th = team->t.t_threads[f]; volatile kmp_uint32 *state = &th->th.th_reap_state; while (*state != KMP_SAFE_TO_REAP) { #if KMP_OS_WINDOWS // On Windows a thread can be killed at any time, check this DWORD ecode; if (!__kmp_is_thread_alive(th, &ecode)) { *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread break; } #endif // first check if thread is sleeping kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); if (fl.is_sleeping()) fl.resume(__kmp_gtid_from_thread(th)); KMP_CPU_PAUSE(); } } // Delete task teams int tt_idx; for (tt_idx = 0; tt_idx < 2; ++tt_idx) { kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; if (task_team != NULL) { for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams KMP_DEBUG_ASSERT(team->t.t_threads[f]); team->t.t_threads[f]->th.th_task_team = NULL; } KA_TRACE( 20, ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", __kmp_get_gtid(), task_team, team->t.t_id)); #if KMP_NESTED_HOT_TEAMS __kmp_free_task_team(master, task_team); #endif team->t.t_task_team[tt_idx] = NULL; } } } // Reset pointer to parent team only for non-hot teams. 
team->t.t_parent = NULL; team->t.t_level = 0; team->t.t_active_level = 0; /* free the worker threads */ for (f = 1; f < team->t.t_nproc; ++f) { KMP_DEBUG_ASSERT(team->t.t_threads[f]); __kmp_free_thread(team->t.t_threads[f]); team->t.t_threads[f] = NULL; } /* put the team back in the team pool */ /* TODO limit size of team pool, call reap_team if pool too large */ team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); __kmp_team_pool = (volatile kmp_team_t *)team; } else { // Check if team was created for the masters in a teams construct // See if first worker is a CG root KMP_DEBUG_ASSERT(team->t.t_threads[1] && team->t.t_threads[1]->th.th_cg_roots); if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { // Clean up the CG root nodes on workers so that this team can be re-used for (f = 1; f < team->t.t_nproc; ++f) { kmp_info_t *thr = team->t.t_threads[f]; KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && thr->th.th_cg_roots->cg_root == thr); // Pop current CG root off list kmp_cg_root_t *tmp = thr->th.th_cg_roots; thr->th.th_cg_roots = tmp->up; KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" " up to node %p. cg_nthreads was %d\n", thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); int i = tmp->cg_nthreads--; if (i == 1) { __kmp_free(tmp); // free CG if we are the last thread in it } // Restore current task's thread_limit from CG root if (thr->th.th_cg_roots) thr->th.th_current_task->td_icvs.thread_limit = thr->th.th_cg_roots->cg_thread_limit; } } } KMP_MB(); } /* reap the team. destroy it, reclaim all its resources and free its memory */ kmp_team_t *__kmp_reap_team(kmp_team_t *team) { kmp_team_t *next_pool = team->t.t_next_pool; KMP_DEBUG_ASSERT(team); KMP_DEBUG_ASSERT(team->t.t_dispatch); KMP_DEBUG_ASSERT(team->t.t_disp_buffer); KMP_DEBUG_ASSERT(team->t.t_threads); KMP_DEBUG_ASSERT(team->t.t_argv); /* TODO clean the threads that are a part of this? 
  */

  /* free stuff */
  __kmp_free_team_arrays(team);
  if (team->t.t_argv != &team->t.t_inline_argv[0])
    __kmp_free((void *)team->t.t_argv);
  __kmp_free(team);

  KMP_MB();
  return next_pool;
}

// Free the thread. Don't reap it, just place it on the pool of available
// threads.
//
// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
// binding for the affinity mechanism to be useful.
//
// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
// However, we want to avoid a potential performance problem by always
// scanning through the list to find the correct point at which to insert
// the thread (potential N**2 behavior). To do this we keep track of the
// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
// With single-level parallelism, threads will always be added to the tail
// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
// parallelism, all bets are off and we may need to scan through the entire
// free list.
//
// This change also has a potentially large performance benefit, for some
// applications. Previously, as threads were freed from the hot team, they
// would be placed back on the free list in inverse order. If the hot team
// grew back to its original size, then the freed thread would be placed
// back on the hot team in reverse order. This could cause bad cache
// locality problems on programs where the size of the hot team regularly
// grew and shrunk.
//
// Now, for single-level parallelism, the OMP tid is always == gtid.
void __kmp_free_thread(kmp_info_t *this_th) {
  int gtid;
  kmp_info_t **scan;

  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));

  KMP_DEBUG_ASSERT(this_th);

  // When moving thread to pool, switch thread to wait on own b_go flag, and
  // uninitialized (NULL team).
int b; kmp_balign_t *balign = this_th->th.th_bar; for (b = 0; b < bs_last_barrier; ++b) { if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; balign[b].bb.team = NULL; balign[b].bb.leaf_kids = 0; } this_th->th.th_task_state = 0; this_th->th.th_reap_state = KMP_SAFE_TO_REAP; /* put thread back on the free pool */ TCW_PTR(this_th->th.th_team, NULL); TCW_PTR(this_th->th.th_root, NULL); TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ while (this_th->th.th_cg_roots) { this_th->th.th_cg_roots->cg_nthreads--; KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" " %p of thread %p to %d\n", this_th, this_th->th.th_cg_roots, this_th->th.th_cg_roots->cg_root, this_th->th.th_cg_roots->cg_nthreads)); kmp_cg_root_t *tmp = this_th->th.th_cg_roots; if (tmp->cg_root == this_th) { // Thread is a cg_root KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); KA_TRACE( 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); this_th->th.th_cg_roots = tmp->up; __kmp_free(tmp); } else { // Worker thread if (tmp->cg_nthreads == 0) { // last thread leaves contention group __kmp_free(tmp); } this_th->th.th_cg_roots = NULL; break; } } /* If the implicit task assigned to this thread can be used by other threads * -> multiple threads can share the data and try to free the task at * __kmp_reap_thread at exit. This duplicate use of the task data can happen * with higher probability when hot team is disabled but can occurs even when * the hot team is enabled */ __kmp_free_implicit_task(this_th); this_th->th.th_current_task = NULL; // If the __kmp_thread_pool_insert_pt is already past the new insert // point, then we need to re-scan the entire list. 
  gtid = this_th->th.th_info.ds.ds_gtid;
  if (__kmp_thread_pool_insert_pt != NULL) {
    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
      __kmp_thread_pool_insert_pt = NULL;
    }
  }

  // Scan down the list to find the place to insert the thread.
  // scan is the address of a link in the list, possibly the address of
  // __kmp_thread_pool itself.
  //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
  if (__kmp_thread_pool_insert_pt != NULL) {
    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
  } else {
    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
  }
  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
       scan = &((*scan)->th.th_next_pool))
    ;

  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
  // to its address.
  TCW_PTR(this_th->th.th_next_pool, *scan);
  __kmp_thread_pool_insert_pt = *scan = this_th;
  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
                   (this_th->th.th_info.ds.ds_gtid <
                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
  TCW_4(this_th->th.th_in_pool, TRUE);
  __kmp_suspend_initialize_thread(this_th);
  __kmp_lock_suspend_mx(this_th);
  if (this_th->th.th_active == TRUE) {
    KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
    this_th->th.th_active_in_pool = TRUE;
  }
#if KMP_DEBUG
  else {
    KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
  }
#endif
  __kmp_unlock_suspend_mx(this_th);

  TCW_4(__kmp_nth, __kmp_nth - 1);

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  KMP_MB();
}

/* ------------------------------------------------------------------------ */

void *__kmp_launch_thread(kmp_info_t *this_thr) {
  int gtid = this_thr->th.th_info.ds.ds_gtid;
  /* void *stack_data;*/
- kmp_team_t *(*volatile pteam); + kmp_team_t **volatile pteam; KMP_MB(); KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); if (__kmp_env_consistency_check) { this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? } #if OMPT_SUPPORT ompt_data_t *thread_data; if (ompt_enabled.enabled) { thread_data = &(this_thr->th.ompt_thread_info.thread_data); *thread_data = ompt_data_none; this_thr->th.ompt_thread_info.state = ompt_state_overhead; this_thr->th.ompt_thread_info.wait_id = 0; this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); + this_thr->th.ompt_thread_info.parallel_flags = 0; if (ompt_enabled.ompt_callback_thread_begin) { ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( ompt_thread_worker, thread_data); } - } -#endif - -#if OMPT_SUPPORT - if (ompt_enabled.enabled) { this_thr->th.ompt_thread_info.state = ompt_state_idle; } #endif + /* This is the place where threads wait for work */ while (!TCR_4(__kmp_global.g.g_done)) { KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); KMP_MB(); /* wait for work to do */ KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); /* No tid yet since not part of a team */ __kmp_fork_barrier(gtid, KMP_GTID_DNE); #if OMPT_SUPPORT if (ompt_enabled.enabled) { this_thr->th.ompt_thread_info.state = ompt_state_overhead; } #endif - pteam = (kmp_team_t * (*))(&this_thr->th.th_team); + pteam = &this_thr->th.th_team; /* have we been allocated? 
*/ if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { /* we were just woken up, so run our new task */ if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { int rc; KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn)); updateHWFPControl(*pteam); #if OMPT_SUPPORT if (ompt_enabled.enabled) { this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; } #endif rc = (*pteam)->t.t_invoke(gtid); KMP_ASSERT(rc); KMP_MB(); KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn)); } #if OMPT_SUPPORT if (ompt_enabled.enabled) { /* no frame set while outside task */ __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; this_thr->th.ompt_thread_info.state = ompt_state_overhead; } #endif /* join barrier after parallel region */ __kmp_join_barrier(gtid); } } TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); #if OMPT_SUPPORT if (ompt_enabled.ompt_callback_thread_end) { ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); } #endif this_thr->th.th_task_team = NULL; /* run the destructors for the threadprivate data for this thread */ __kmp_common_destroy_gtid(gtid); KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); KMP_MB(); return this_thr; } /* ------------------------------------------------------------------------ */ void __kmp_internal_end_dest(void *specific_gtid) { #if KMP_COMPILER_ICC #pragma warning(push) #pragma warning(disable : 810) // conversion from "void *" to "int" may lose // significant bits #endif // Make sure no significant bits are lost int gtid = (kmp_intptr_t)specific_gtid - 1; #if KMP_COMPILER_ICC #pragma warning(pop) #endif KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage * this is because 0 is reserved for the nothing-stored case */ /* josh: One reason for setting 
the gtid specific data even when it is being destroyed by pthread is to allow gtid lookup through thread specific data (__kmp_gtid_get_specific). Some of the code, especially stat code, that gets executed in the call to __kmp_internal_end_thread, actually gets the gtid through the thread specific data. Setting it here seems rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread to run smoothly. todo: get rid of this after we remove the dependence on __kmp_gtid_get_specific */ if (gtid >= 0 && KMP_UBER_GTID(gtid)) __kmp_gtid_set_specific(gtid); #ifdef KMP_TDATA_GTID __kmp_gtid = gtid; #endif __kmp_internal_end_thread(gtid); } #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases // destructors work perfectly, but in real libomp.so I have no evidence it is // ever called. However, -fini linker option in makefile.mk works fine. __attribute__((destructor)) void __kmp_internal_end_dtor(void) { __kmp_internal_end_atexit(); } void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); } #endif /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */ void __kmp_internal_end_atexit(void) { KA_TRACE(30, ("__kmp_internal_end_atexit\n")); /* [Windows] josh: ideally, we want to completely shutdown the library in this atexit handler, but stat code that depends on thread specific data for gtid fails because that data becomes unavailable at some point during the shutdown, so we call __kmp_internal_end_thread instead. We should eventually remove the dependency on __kmp_get_specific_gtid in the stat code and use __kmp_internal_end_library to cleanly shutdown the library. // TODO: Can some of this comment about GVS be removed? 
I suspect that the offending stat code is executed when the calling thread tries to clean up a dead root thread's data structures, resulting in GVS code trying to close the GVS structures for that thread, but since the stat code uses __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is cleaning up itself instead of another thread, it get confused. This happens because allowing a thread to unregister and cleanup another thread is a recent modification for addressing an issue. Based on the current design (20050722), a thread may end up trying to unregister another thread only if thread death does not trigger the calling of __kmp_internal_end_thread. For Linux* OS, there is the thread specific data destructor function to detect thread death. For Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there is nothing. Thus, the workaround is applicable only for Windows static stat library. */ __kmp_internal_end_library(-1); #if KMP_OS_WINDOWS __kmp_close_console(); #endif } static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { // It is assumed __kmp_forkjoin_lock is acquired. int gtid; KMP_DEBUG_ASSERT(thread != NULL); gtid = thread->th.th_info.ds.ds_gtid; if (!is_root) { if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { /* Assume the threads are at the fork barrier here */ KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid)); /* Need release fence here to prevent seg faults for tree forkjoin barrier * (GEH) */ ANNOTATE_HAPPENS_BEFORE(thread); kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); __kmp_release_64(&flag); } // Terminate OS thread. __kmp_reap_worker(thread); // The thread was killed asynchronously. If it was actively // spinning in the thread pool, decrement the global count. 
// // There is a small timing hole here - if the worker thread was just waking // up after sleeping in the pool, had reset it's th_active_in_pool flag but // not decremented the global counter __kmp_thread_pool_active_nth yet, then // the global counter might not get updated. // // Currently, this can only happen as the library is unloaded, // so there are no harmful side effects. if (thread->th.th_active_in_pool) { thread->th.th_active_in_pool = FALSE; KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); } } __kmp_free_implicit_task(thread); // Free the fast memory for tasking #if USE_FAST_MEMORY __kmp_free_fast_memory(thread); #endif /* USE_FAST_MEMORY */ __kmp_suspend_uninitialize_thread(thread); KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); TCW_SYNC_PTR(__kmp_threads[gtid], NULL); --__kmp_all_nth; // __kmp_nth was decremented when thread is added to the pool. #ifdef KMP_ADJUST_BLOCKTIME /* Adjust blocktime back to user setting or default if necessary */ /* Middle initialization might never have occurred */ if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); if (__kmp_nth <= __kmp_avail_proc) { __kmp_zero_bt = FALSE; } } #endif /* KMP_ADJUST_BLOCKTIME */ /* free the memory being used */ if (__kmp_env_consistency_check) { if (thread->th.th_cons) { __kmp_free_cons_stack(thread->th.th_cons); thread->th.th_cons = NULL; } } if (thread->th.th_pri_common != NULL) { __kmp_free(thread->th.th_pri_common); thread->th.th_pri_common = NULL; } if (thread->th.th_task_state_memo_stack != NULL) { __kmp_free(thread->th.th_task_state_memo_stack); thread->th.th_task_state_memo_stack = NULL; } #if KMP_USE_BGET if (thread->th.th_local.bget_data != NULL) { __kmp_finalize_bget(thread); } #endif #if KMP_AFFINITY_SUPPORTED if (thread->th.th_affin_mask != NULL) { KMP_CPU_FREE(thread->th.th_affin_mask); thread->th.th_affin_mask = NULL; } #endif /* KMP_AFFINITY_SUPPORTED */ #if KMP_USE_HIER_SCHED if 
  (thread->th.th_hier_bar_data != NULL) {
    __kmp_free(thread->th.th_hier_bar_data);
    thread->th.th_hier_bar_data = NULL;
  }
#endif

  __kmp_reap_team(thread->th.th_serial_team);
  thread->th.th_serial_team = NULL;
  __kmp_free(thread);

  KMP_MB();

} // __kmp_reap_thread

static void __kmp_internal_end(void) {
  int i;

  /* First, unregister the library */
  __kmp_unregister_library();

#if KMP_OS_WINDOWS
  /* In Win static library, we can't tell when a root actually dies, so we
     reclaim the data structures for any root threads that have died but not
     unregistered themselves, in order to shut down cleanly.
     In Win dynamic library we also can't tell when a thread dies. */
  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
// dead roots
#endif

  for (i = 0; i < __kmp_threads_capacity; i++)
    if (__kmp_root[i])
      if (__kmp_root[i]->r.r_active)
        break;
  KMP_MB(); /* Flush all pending memory write invalidates. */
  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);

  if (i < __kmp_threads_capacity) {
#if KMP_USE_MONITOR
    // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
    KMP_MB(); /* Flush all pending memory write invalidates. */

    // Need to check that monitor was initialized before reaping it. If we are
    // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
    // __kmp_monitor will appear to contain valid data, but it is only valid in
    // the parent process, not the child.
    // New behavior (201008): instead of keying off of the flag
    // __kmp_init_parallel, the monitor thread creation is keyed off
    // of the new flag __kmp_init_monitor.
__kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); if (TCR_4(__kmp_init_monitor)) { __kmp_reap_monitor(&__kmp_monitor); TCW_4(__kmp_init_monitor, 0); } __kmp_release_bootstrap_lock(&__kmp_monitor_lock); KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); #endif // KMP_USE_MONITOR } else { /* TODO move this to cleanup code */ #ifdef KMP_DEBUG /* make sure that everything has properly ended */ for (i = 0; i < __kmp_threads_capacity; i++) { if (__kmp_root[i]) { // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: // there can be uber threads alive here KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? } } #endif KMP_MB(); // Reap the worker threads. // This is valid for now, but be careful if threads are reaped sooner. while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. // Get the next thread from the pool. kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); __kmp_thread_pool = thread->th.th_next_pool; // Reap it. KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); thread->th.th_next_pool = NULL; thread->th.th_in_pool = FALSE; __kmp_reap_thread(thread, 0); } __kmp_thread_pool_insert_pt = NULL; // Reap teams. while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. // Get the next team from the pool. kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); __kmp_team_pool = team->t.t_next_pool; // Reap it. team->t.t_next_pool = NULL; __kmp_reap_team(team); } __kmp_reap_task_teams(); #if KMP_OS_UNIX // Threads that are not reaped should not access any resources since they // are going to be deallocated soon, so the shutdown sequence should wait // until all threads either exit the final spin-waiting loop or begin // sleeping after the given blocktime. for (i = 0; i < __kmp_threads_capacity; i++) { kmp_info_t *thr = __kmp_threads[i]; while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) KMP_CPU_PAUSE(); } #endif for (i = 0; i < __kmp_threads_capacity; ++i) { // TBD: Add some checking... 
// Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); } /* Make sure all threadprivate destructors get run by joining with all worker threads before resetting this flag */ TCW_SYNC_4(__kmp_init_common, FALSE); KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); KMP_MB(); #if KMP_USE_MONITOR // See note above: One of the possible fixes for CQ138434 / CQ140126 // // FIXME: push both code fragments down and CSE them? // push them into __kmp_cleanup() ? __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); if (TCR_4(__kmp_init_monitor)) { __kmp_reap_monitor(&__kmp_monitor); TCW_4(__kmp_init_monitor, 0); } __kmp_release_bootstrap_lock(&__kmp_monitor_lock); KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); #endif } /* else !__kmp_global.t_active */ TCW_4(__kmp_init_gtid, FALSE); KMP_MB(); /* Flush all pending memory write invalidates. */ __kmp_cleanup(); #if OMPT_SUPPORT ompt_fini(); #endif } void __kmp_internal_end_library(int gtid_req) { /* if we have already cleaned up, don't try again, it wouldn't be pretty */ /* this shouldn't be a race condition because __kmp_internal_end() is the only place to clear __kmp_serial_init */ /* we'll check this later too, after we get the lock */ // 2009-09-06: We do not set g_abort without setting g_done. This check looks // redundaant, because the next check will work in any case. if (__kmp_global.g.g_abort) { KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); /* TODO abort? */ return; } if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); return; } KMP_MB(); /* Flush all pending memory write invalidates. */ /* find out who we are and what we should do */ { int gtid = (gtid_req >= 0) ? 
gtid_req : __kmp_gtid_get_specific(); KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); if (gtid == KMP_GTID_SHUTDOWN) { KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " "already shutdown\n")); return; } else if (gtid == KMP_GTID_MONITOR) { KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " "registered, or system shutdown\n")); return; } else if (gtid == KMP_GTID_DNE) { KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " "shutdown\n")); /* we don't know who we are, but we may still shutdown the library */ } else if (KMP_UBER_GTID(gtid)) { /* unregister ourselves as an uber thread. gtid is no longer valid */ if (__kmp_root[gtid]->r.r_active) { __kmp_global.g.g_abort = -1; TCW_SYNC_4(__kmp_global.g.g_done, TRUE); KA_TRACE(10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid)); return; } else { KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); __kmp_unregister_root_current_thread(gtid); } } else { /* worker threads may call this function through the atexit handler, if they * call exit() */ /* For now, skip the usual subsequent processing and just dump the debug buffer. TODO: do a thorough shutdown instead */ #ifdef DUMP_DEBUG_ON_EXIT if (__kmp_debug_buf) __kmp_dump_debug_buffer(); #endif return; } } /* synchronize the termination process */ __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); /* have we already finished */ if (__kmp_global.g.g_abort) { KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); /* TODO abort? */ __kmp_release_bootstrap_lock(&__kmp_initz_lock); return; } if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { __kmp_release_bootstrap_lock(&__kmp_initz_lock); return; } /* We need this lock to enforce mutex between this reading of __kmp_threads_capacity and the writing by __kmp_register_root. 
Alternatively, we can use a counter of roots that is atomically updated by __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and __kmp_internal_end_*. */ __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); /* now we can safely conduct the actual termination */ __kmp_internal_end(); __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); __kmp_release_bootstrap_lock(&__kmp_initz_lock); KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); #ifdef DUMP_DEBUG_ON_EXIT if (__kmp_debug_buf) __kmp_dump_debug_buffer(); #endif #if KMP_OS_WINDOWS __kmp_close_console(); #endif __kmp_fini_allocator(); } // __kmp_internal_end_library void __kmp_internal_end_thread(int gtid_req) { int i; /* if we have already cleaned up, don't try again, it wouldn't be pretty */ /* this shouldn't be a race condition because __kmp_internal_end() is the * only place to clear __kmp_serial_init */ /* we'll check this later too, after we get the lock */ // 2009-09-06: We do not set g_abort without setting g_done. This check looks // redundant, because the next check will work in any case. if (__kmp_global.g.g_abort) { KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); /* TODO abort? */ return; } if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); return; } KMP_MB(); /* Flush all pending memory write invalidates. */ /* find out who we are and what we should do */ { int gtid = (gtid_req >= 0) ? 
gtid_req : __kmp_gtid_get_specific(); KA_TRACE(10, ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); if (gtid == KMP_GTID_SHUTDOWN) { KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " "already shutdown\n")); return; } else if (gtid == KMP_GTID_MONITOR) { KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " "registered, or system shutdown\n")); return; } else if (gtid == KMP_GTID_DNE) { KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " "shutdown\n")); return; /* we don't know who we are */ } else if (KMP_UBER_GTID(gtid)) { /* unregister ourselves as an uber thread. gtid is no longer valid */ if (__kmp_root[gtid]->r.r_active) { __kmp_global.g.g_abort = -1; TCW_SYNC_4(__kmp_global.g.g_done, TRUE); KA_TRACE(10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid)); return; } else { KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid)); __kmp_unregister_root_current_thread(gtid); } } else { /* just a worker thread, let's leave */ KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); if (gtid >= 0) { __kmp_threads[gtid]->th.th_task_team = NULL; } KA_TRACE(10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid)); return; } } #if KMP_DYNAMIC_LIB if (__kmp_pause_status != kmp_hard_paused) // AC: lets not shutdown the dynamic library at the exit of uber thread, // because we will better shutdown later in the library destructor. { KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); return; } #endif /* synchronize the termination process */ __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); /* have we already finished */ if (__kmp_global.g.g_abort) { KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); /* TODO abort? 
*/ __kmp_release_bootstrap_lock(&__kmp_initz_lock); return; } if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { __kmp_release_bootstrap_lock(&__kmp_initz_lock); return; } /* We need this lock to enforce mutex between this reading of __kmp_threads_capacity and the writing by __kmp_register_root. Alternatively, we can use a counter of roots that is atomically updated by __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and __kmp_internal_end_*. */ /* should we finish the run-time? are all siblings done? */ __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); for (i = 0; i < __kmp_threads_capacity; ++i) { if (KMP_UBER_GTID(i)) { KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); __kmp_release_bootstrap_lock(&__kmp_initz_lock); return; } } /* now we can safely conduct the actual termination */ __kmp_internal_end(); __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); __kmp_release_bootstrap_lock(&__kmp_initz_lock); KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); #ifdef DUMP_DEBUG_ON_EXIT if (__kmp_debug_buf) __kmp_dump_debug_buffer(); #endif } // __kmp_internal_end_thread // ----------------------------------------------------------------------------- // Library registration stuff. static long __kmp_registration_flag = 0; // Random value used to indicate library initialization. static char *__kmp_registration_str = NULL; // Value to be saved in env var __KMP_REGISTERED_LIB_. static inline char *__kmp_reg_status_name() { /* On RHEL 3u5 if linked statically, getpid() returns different values in each thread. If registration and unregistration go in different threads (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env env var can not be found, because the name will contain different pid. 
*/ return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); } // __kmp_reg_status_get void __kmp_register_library_startup(void) { char *name = __kmp_reg_status_name(); // Name of the environment variable. int done = 0; union { double dtime; long ltime; } time; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 __kmp_initialize_system_tick(); #endif __kmp_read_system_time(&time.dtime); __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); __kmp_registration_str = __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, __kmp_registration_flag, KMP_LIBRARY_FILE); KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str)); while (!done) { char *value = NULL; // Actual value of the environment variable. // Set environment variable, but do not overwrite if it is exist. __kmp_env_set(name, __kmp_registration_str, 0); // Check the variable is written. value = __kmp_env_get(name); if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { done = 1; // Ok, environment variable set successfully, exit the loop. } else { // Oops. Write failed. Another copy of OpenMP RTL is in memory. // Check whether it alive or dead. int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. char *tail = value; char *flag_addr_str = NULL; char *flag_val_str = NULL; char const *file_name = NULL; __kmp_str_split(tail, '-', &flag_addr_str, &tail); __kmp_str_split(tail, '-', &flag_val_str, &tail); file_name = tail; if (tail != NULL) { long *flag_addr = 0; long flag_val = 0; KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr)); KMP_SSCANF(flag_val_str, "%lx", &flag_val); if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { // First, check whether environment-encoded address is mapped into // addr space. // If so, dereference it to see if it still has the right value. 
if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
  // The encoded address is still mapped and still holds the expected
  // random flag value: the other copy of the runtime is alive.
  neighbor = 1;
} else {
  // If not, then we know the other copy of the library is no longer
  // running.
  neighbor = 2;
}
}
}
switch (neighbor) {
case 0: // Cannot parse environment variable -- neighbor status unknown.
  // Assume it is the incompatible format of future version of the
  // library. Assume the other library is alive.
  // WARN( ... ); // TODO: Issue a warning.
  file_name = "unknown library";
  KMP_FALLTHROUGH(); // Attention! Falling to the next case. That's intentional.
case 1: { // Neighbor is alive.
  // Check it is allowed.
  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
  if (!__kmp_str_match_true(duplicate_ok)) {
    // That's not allowed. Issue fatal error.
    __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
                KMP_HNT(DuplicateLibrary), __kmp_msg_null);
  }
  KMP_INTERNAL_FREE(duplicate_ok);
  __kmp_duplicate_library_ok = 1;
  done = 1; // Exit the loop.
} break;
case 2: { // Neighbor is dead.
  // Clear the variable and try to register library again.
  __kmp_env_unset(name);
} break;
default: { KMP_DEBUG_ASSERT(0); } break;
}
}
KMP_INTERNAL_FREE((void *)value);
}
KMP_INTERNAL_FREE((void *)name);
} // func __kmp_register_library_startup

// Undo __kmp_register_library_startup: delete the __KMP_REGISTERED_LIB_<pid>
// environment variable, but only if it still holds OUR registration string
// (the strcmp below) -- otherwise it belongs to another runtime copy and we
// must leave it alone. Frees the cached name/value strings and resets the
// registration globals.
void __kmp_unregister_library(void) {
  char *name = __kmp_reg_status_name(); // env var name, derived from getpid()
  char *value = __kmp_env_get(name); // current env var value, may be NULL

  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
    // Ok, this is our variable. Delete it.
    __kmp_env_unset(name);
  }

  KMP_INTERNAL_FREE(__kmp_registration_str);
  KMP_INTERNAL_FREE(value);
  KMP_INTERNAL_FREE(name);

  __kmp_registration_flag = 0;
  __kmp_registration_str = NULL;
} // __kmp_unregister_library

// End of Library registration stuff.
// ----------------------------------------------------------------------------- #if KMP_MIC_SUPPORTED static void __kmp_check_mic_type() { kmp_cpuid_t cpuid_state = {0}; kmp_cpuid_t *cs_p = &cpuid_state; __kmp_x86_cpuid(1, 0, cs_p); // We don't support mic1 at the moment if ((cs_p->eax & 0xff0) == 0xB10) { __kmp_mic_type = mic2; } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { __kmp_mic_type = mic3; } else { __kmp_mic_type = non_mic; } } #endif /* KMP_MIC_SUPPORTED */ static void __kmp_do_serial_initialize(void) { int i, gtid; int size; KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); #if OMPT_SUPPORT ompt_pre_init(); #endif __kmp_validate_locks(); /* Initialize internal memory allocator */ __kmp_init_allocator(); /* Register the library startup via an environment variable and check to see whether another copy of the library is already registered. 
*/ __kmp_register_library_startup(); /* TODO reinitialization of library */ if (TCR_4(__kmp_global.g.g_done)) { KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); } __kmp_global.g.g_abort = 0; TCW_SYNC_4(__kmp_global.g.g_done, FALSE); /* initialize the locks */ #if KMP_USE_ADAPTIVE_LOCKS #if KMP_DEBUG_ADAPTIVE_LOCKS __kmp_init_speculative_stats(); #endif #endif #if KMP_STATS_ENABLED __kmp_stats_init(); #endif __kmp_init_lock(&__kmp_global_lock); __kmp_init_queuing_lock(&__kmp_dispatch_lock); __kmp_init_lock(&__kmp_debug_lock); __kmp_init_atomic_lock(&__kmp_atomic_lock); __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); __kmp_init_bootstrap_lock(&__kmp_exit_lock); #if KMP_USE_MONITOR __kmp_init_bootstrap_lock(&__kmp_monitor_lock); #endif __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); /* conduct initialization and initial setup of configuration */ __kmp_runtime_initialize(); #if KMP_MIC_SUPPORTED __kmp_check_mic_type(); #endif // Some global variable initialization moved here from kmp_env_initialize() #ifdef KMP_DEBUG kmp_diag = 0; #endif __kmp_abort_delay = 0; // From __kmp_init_dflt_team_nth() /* assume the entire machine will be used */ __kmp_dflt_team_nth_ub = __kmp_xproc; if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { __kmp_dflt_team_nth_ub = KMP_MIN_NTH; } if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; } __kmp_max_nth = 
__kmp_sys_max_nth; __kmp_cg_max_nth = __kmp_sys_max_nth; __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default if (__kmp_teams_max_nth > __kmp_sys_max_nth) { __kmp_teams_max_nth = __kmp_sys_max_nth; } // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" // part __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; #if KMP_USE_MONITOR __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); #endif // From "KMP_LIBRARY" part of __kmp_env_initialize() __kmp_library = library_throughput; // From KMP_SCHEDULE initialization __kmp_static = kmp_sch_static_balanced; // AC: do not use analytical here, because it is non-monotonous //__kmp_guided = kmp_sch_guided_iterative_chunked; //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no // need to repeat assignment // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch // bit control and barrier method control parts #if KMP_FAST_REDUCTION_BARRIER #define kmp_reduction_barrier_gather_bb ((int)1) #define kmp_reduction_barrier_release_bb ((int)1) #define kmp_reduction_barrier_gather_pat bp_hyper_bar #define kmp_reduction_barrier_release_pat bp_hyper_bar #endif // KMP_FAST_REDUCTION_BARRIER for (i = bs_plain_barrier; i < bs_last_barrier; i++) { __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; #if KMP_FAST_REDUCTION_BARRIER if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( // lin_64 ): hyper,1 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; __kmp_barrier_gather_pattern[i] = 
kmp_reduction_barrier_gather_pat; __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; } #endif // KMP_FAST_REDUCTION_BARRIER } #if KMP_FAST_REDUCTION_BARRIER #undef kmp_reduction_barrier_release_pat #undef kmp_reduction_barrier_gather_pat #undef kmp_reduction_barrier_release_bb #undef kmp_reduction_barrier_gather_bb #endif // KMP_FAST_REDUCTION_BARRIER #if KMP_MIC_SUPPORTED if (__kmp_mic_type == mic2) { // KNC // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 1; // forkjoin release __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; } #if KMP_FAST_REDUCTION_BARRIER if (__kmp_mic_type == mic2) { // KNC __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; } #endif // KMP_FAST_REDUCTION_BARRIER #endif // KMP_MIC_SUPPORTED // From KMP_CHECKS initialization #ifdef KMP_DEBUG __kmp_env_checks = TRUE; /* development versions have the extra checks */ #else __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ #endif // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization __kmp_foreign_tp = TRUE; __kmp_global.g.g_dynamic = FALSE; __kmp_global.g.g_dynamic_mode = dynamic_default; __kmp_env_initialize(NULL); // Print all messages in message catalog for testing purposes. 
#ifdef KMP_DEBUG char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); if (__kmp_str_match_true(val)) { kmp_str_buf_t buffer; __kmp_str_buf_init(&buffer); __kmp_i18n_dump_catalog(&buffer); __kmp_printf("%s", buffer.str); __kmp_str_buf_free(&buffer); } __kmp_env_free(&val); #endif __kmp_threads_capacity = __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part __kmp_tp_capacity = __kmp_default_tp_capacity( __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); // If the library is shut down properly, both pools must be NULL. Just in // case, set them to NULL -- some memory may leak, but subsequent code will // work even if pools are not freed. KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); __kmp_thread_pool = NULL; __kmp_thread_pool_insert_pt = NULL; __kmp_team_pool = NULL; /* Allocate all of the variable sized records */ /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are * expandable */ /* Since allocation is cache-aligned, just add extra padding at the end */ size = (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + CACHE_LINE; __kmp_threads = (kmp_info_t **)__kmp_allocate(size); __kmp_root = (kmp_root_t **)((char *)__kmp_threads + sizeof(kmp_info_t *) * __kmp_threads_capacity); /* init thread counts */ KMP_DEBUG_ASSERT(__kmp_all_nth == 0); // Asserts fail if the library is reinitializing and KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. __kmp_all_nth = 0; __kmp_nth = 0; /* setup the uber master thread and hierarchy */ gtid = __kmp_register_root(TRUE); KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); KMP_ASSERT(KMP_UBER_GTID(gtid)); KMP_ASSERT(KMP_INITIAL_GTID(gtid)); KMP_MB(); /* Flush all pending memory write invalidates. 
*/
  __kmp_common_initialize();

#if KMP_OS_UNIX
  /* invoke the child fork handler */
  __kmp_register_atfork();
#endif

#if !KMP_DYNAMIC_LIB
  {
    /* Invoke the exit handler when the program finishes, only for static
       library. For dynamic library, we already have _fini and DllMain. */
    int rc = atexit(__kmp_internal_end_atexit);
    if (rc != 0) {
      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
                  __kmp_msg_null);
    }
  }
#endif

#if KMP_HANDLE_SIGNALS
#if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. this way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
  __kmp_install_signals(FALSE);
#endif /* KMP_OS_UNIX */
#if KMP_OS_WINDOWS
  __kmp_install_signals(TRUE);
#endif /* KMP_OS_WINDOWS */
#endif

  /* we have finished the serial initialization */
  __kmp_init_counter++;

  __kmp_init_serial = TRUE;

  if (__kmp_settings) {
    __kmp_env_print();
  }

  if (__kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print_2();
  }

#if OMPT_SUPPORT
  ompt_post_init();
#endif

  KMP_MB();

  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
}

// Public entry point for serial initialization. Double-checked test of
// __kmp_init_serial around __kmp_initz_lock so concurrent callers initialize
// exactly once; the real work happens in __kmp_do_serial_initialize().
void __kmp_serial_initialize(void) {
  if (__kmp_init_serial) {
    return; // fast path: already initialized
  }
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial) {
    // Another thread finished initialization while we waited for the lock.
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  __kmp_do_serial_initialize();
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

// Middle-phase initialization: affinity setup and default team-size
// calculation. Runs the serial phase first if it has not happened yet.
// Called with __kmp_initz_lock held -- see __kmp_middle_initialize().
static void __kmp_do_middle_initialize(void) {
  int i, j;
  int prev_dflt_team_nth;

  if (!__kmp_init_serial) {
    __kmp_do_serial_initialize();
  }

  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));

  // Save the previous value for the __kmp_dflt_team_nth so that
  // we can avoid some reinitialization if it hasn't changed.
  prev_dflt_team_nth = __kmp_dflt_team_nth;

#if KMP_AFFINITY_SUPPORTED
  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
  // number of cores on the machine.
__kmp_affinity_initialize(); // Run through the __kmp_threads array and set the affinity mask // for each root thread that is currently registered with the RTL. for (i = 0; i < __kmp_threads_capacity; i++) { if (TCR_PTR(__kmp_threads[i]) != NULL) { __kmp_affinity_set_init_mask(i, TRUE); } } #endif /* KMP_AFFINITY_SUPPORTED */ KMP_ASSERT(__kmp_xproc > 0); if (__kmp_avail_proc == 0) { __kmp_avail_proc = __kmp_xproc; } // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), // correct them now j = 0; while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc; j++; } if (__kmp_dflt_team_nth == 0) { #ifdef KMP_DFLT_NTH_CORES // Default #threads = #cores __kmp_dflt_team_nth = __kmp_ncores; KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " "__kmp_ncores (%d)\n", __kmp_dflt_team_nth)); #else // Default #threads = #available OS procs __kmp_dflt_team_nth = __kmp_avail_proc; KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " "__kmp_avail_proc(%d)\n", __kmp_dflt_team_nth)); #endif /* KMP_DFLT_NTH_CORES */ } if (__kmp_dflt_team_nth < KMP_MIN_NTH) { __kmp_dflt_team_nth = KMP_MIN_NTH; } if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { __kmp_dflt_team_nth = __kmp_sys_max_nth; } // There's no harm in continuing if the following check fails, // but it indicates an error in the previous logic. KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); if (__kmp_dflt_team_nth != prev_dflt_team_nth) { // Run through the __kmp_threads array and set the num threads icv for each // root thread that is currently registered with the RTL (which has not // already explicitly set its nthreads-var with a call to // omp_set_num_threads()). 
for (i = 0; i < __kmp_threads_capacity; i++) {
  kmp_info_t *thread = __kmp_threads[i];
  if (thread == NULL)
    continue;
  // Skip roots whose nthreads-var was already set explicitly (nonzero).
  if (thread->th.th_current_task->td_icvs.nproc != 0)
    continue;

  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
}
}
KA_TRACE(
    20,
    ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
     __kmp_dflt_team_nth));

#ifdef KMP_ADJUST_BLOCKTIME
/* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
  if (__kmp_nth > __kmp_avail_proc) {
    __kmp_zero_bt = TRUE;
  }
}
#endif /* KMP_ADJUST_BLOCKTIME */

/* we have finished middle initialization */
TCW_SYNC_4(__kmp_init_middle, TRUE);

KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
}

// Public entry point for middle initialization. Same double-checked pattern
// as __kmp_serial_initialize: test __kmp_init_middle, take __kmp_initz_lock,
// re-test, then delegate to __kmp_do_middle_initialize().
void __kmp_middle_initialize(void) {
  if (__kmp_init_middle) {
    return; // fast path: already initialized
  }
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_middle) {
    // Lost the race; another thread completed middle initialization.
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  __kmp_do_middle_initialize();
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

// Final (parallel-phase) initialization, guarded by __kmp_init_parallel with
// the same check/lock/re-check discipline. Refuses to run during shutdown
// (spins in __kmp_infinite_loop) and completes the earlier phases directly
// because __kmp_initz_lock is already held here.
void __kmp_parallel_initialize(void) {
  int gtid = __kmp_entry_gtid(); // this might be a new root

  /* synchronize parallel initialization (for sibling) */
  if (TCR_4(__kmp_init_parallel))
    return;
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (TCR_4(__kmp_init_parallel)) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* TODO reinitialization after we have already shut down */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(
        10,
        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
    __kmp_infinite_loop();
  }

  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_serial_initialize would cause a deadlock. So we call
     __kmp_do_serial_initialize directly.
*/ if (!__kmp_init_middle) { __kmp_do_middle_initialize(); } __kmp_resume_if_hard_paused(); /* begin initialization */ KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); KMP_ASSERT(KMP_UBER_GTID(gtid)); #if KMP_ARCH_X86 || KMP_ARCH_X86_64 // Save the FP control regs. // Worker threads will set theirs to these values at thread startup. __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); __kmp_store_mxcsr(&__kmp_init_mxcsr); __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ #if KMP_OS_UNIX #if KMP_HANDLE_SIGNALS /* must be after __kmp_serial_initialize */ __kmp_install_signals(TRUE); #endif #endif __kmp_suspend_initialize(); #if defined(USE_LOAD_BALANCE) if (__kmp_global.g.g_dynamic_mode == dynamic_default) { __kmp_global.g.g_dynamic_mode = dynamic_load_balance; } #else if (__kmp_global.g.g_dynamic_mode == dynamic_default) { __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; } #endif if (__kmp_version) { __kmp_print_version_2(); } /* we have finished parallel initialization */ TCW_SYNC_4(__kmp_init_parallel, TRUE); KMP_MB(); KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); __kmp_release_bootstrap_lock(&__kmp_initz_lock); } /* ------------------------------------------------------------------------ */ void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, kmp_team_t *team) { kmp_disp_t *dispatch; KMP_MB(); /* none of the threads have encountered any constructs, yet. 
*/ this_thr->th.th_local.this_construct = 0; #if KMP_CACHE_MANAGE KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); #endif /* KMP_CACHE_MANAGE */ dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); KMP_DEBUG_ASSERT(dispatch); KMP_DEBUG_ASSERT(team->t.t_dispatch); // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ // this_thr->th.th_info.ds.ds_tid ] ); dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter if (__kmp_env_consistency_check) __kmp_push_parallel(gtid, team->t.t_ident); KMP_MB(); /* Flush all pending memory write invalidates. */ } void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, kmp_team_t *team) { if (__kmp_env_consistency_check) __kmp_pop_parallel(gtid, team->t.t_ident); __kmp_finish_implicit_task(this_thr); } int __kmp_invoke_task_func(int gtid) { int rc; int tid = __kmp_tid_from_gtid(gtid); kmp_info_t *this_thr = __kmp_threads[gtid]; kmp_team_t *team = this_thr->th.th_team; __kmp_run_before_invoked_task(gtid, tid, this_thr, team); #if USE_ITT_BUILD if (__itt_stack_caller_create_ptr) { __kmp_itt_stack_callee_enter( (__itt_caller) team->t.t_stack_id); // inform ittnotify about entering user's code } #endif /* USE_ITT_BUILD */ #if INCLUDE_SSC_MARKS SSC_MARK_INVOKING(); #endif #if OMPT_SUPPORT void *dummy; - void **exit_runtime_p; + void **exit_frame_p; ompt_data_t *my_task_data; ompt_data_t *my_parallel_data; int ompt_team_size; if (ompt_enabled.enabled) { - exit_runtime_p = &( + exit_frame_p = &( team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr); } else { - exit_runtime_p = &dummy; + exit_frame_p = &dummy; } my_task_data = &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); my_parallel_data = &(team->t.ompt_team_info.parallel_data); if (ompt_enabled.ompt_callback_implicit_task) { ompt_team_size = team->t.t_nproc; 
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, - __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? + __kmp_tid_from_gtid(gtid), ompt_task_implicit); OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); } #endif #if KMP_STATS_ENABLED stats_state_e previous_state = KMP_GET_THREAD_STATE(); if (previous_state == stats_state_e::TEAMS_REGION) { KMP_PUSH_PARTITIONED_TIMER(OMP_teams); } else { KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); } KMP_SET_THREAD_STATE(IMPLICIT_TASK); #endif rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, tid, (int)team->t.t_argc, (void **)team->t.t_argv #if OMPT_SUPPORT , - exit_runtime_p + exit_frame_p #endif ); #if OMPT_SUPPORT - *exit_runtime_p = NULL; + *exit_frame_p = NULL; + this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; #endif #if KMP_STATS_ENABLED if (previous_state == stats_state_e::TEAMS_REGION) { KMP_SET_THREAD_STATE(previous_state); } KMP_POP_PARTITIONED_TIMER(); #endif #if USE_ITT_BUILD if (__itt_stack_caller_create_ptr) { __kmp_itt_stack_callee_leave( (__itt_caller) team->t.t_stack_id); // inform ittnotify about leaving user's code } #endif /* USE_ITT_BUILD */ __kmp_run_after_invoked_task(gtid, tid, this_thr, team); return rc; } void __kmp_teams_master(int gtid) { // This routine is called by all master threads in teams construct kmp_info_t *thr = __kmp_threads[gtid]; kmp_team_t *team = thr->th.th_team; ident_t *loc = team->t.t_ident; thr->th.th_set_nproc = thr->th.th_teams_size.nth; KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); KMP_DEBUG_ASSERT(thr->th.th_set_nproc); KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); // This thread is a new CG root. Set up the proper variables. 
kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); tmp->cg_root = thr; // Make thr the CG root // Init to thread limit that was stored when league masters were forked tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; tmp->cg_nthreads = 1; // Init counter to one active thread, this one KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" " cg_nthreads to 1\n", thr, tmp)); tmp->up = thr->th.th_cg_roots; thr->th.th_cg_roots = tmp; // Launch league of teams now, but not let workers execute // (they hang on fork barrier until next parallel) #if INCLUDE_SSC_MARKS SSC_MARK_FORKING(); #endif __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, (microtask_t)thr->th.th_teams_microtask, // "wrapped" task VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); #if INCLUDE_SSC_MARKS SSC_MARK_JOINING(); #endif // If the team size was reduced from the limit, set it to the new size if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) thr->th.th_teams_size.nth = thr->th.th_team_nproc; // AC: last parameter "1" eliminates join barrier which won't work because // worker threads are in a fork barrier waiting for more parallel regions __kmp_join_call(loc, gtid #if OMPT_SUPPORT , fork_context_intel #endif , 1); } int __kmp_invoke_teams_master(int gtid) { kmp_info_t *this_thr = __kmp_threads[gtid]; kmp_team_t *team = this_thr->th.th_team; #if KMP_DEBUG if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == (void *)__kmp_teams_master); #endif __kmp_run_before_invoked_task(gtid, 0, this_thr, team); +#if OMPT_SUPPORT + int tid = __kmp_tid_from_gtid(gtid); + ompt_data_t *task_data = + &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; + ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; + if (ompt_enabled.ompt_callback_implicit_task) { + ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( + ompt_scope_begin, 
parallel_data, task_data, team->t.t_nproc, tid, + ompt_task_initial); + OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; + } +#endif __kmp_teams_master(gtid); +#if OMPT_SUPPORT + this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; +#endif __kmp_run_after_invoked_task(gtid, 0, this_thr, team); return 1; } /* this sets the requested number of threads for the next parallel region encountered by this team. since this should be enclosed in the forkjoin critical section it should avoid race conditions with assymmetrical nested parallelism */ void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { kmp_info_t *thr = __kmp_threads[gtid]; if (num_threads > 0) thr->th.th_set_nproc = num_threads; } /* this sets the requested number of teams for the teams region and/or the number of threads for the next parallel region encountered */ void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, int num_threads) { kmp_info_t *thr = __kmp_threads[gtid]; KMP_DEBUG_ASSERT(num_teams >= 0); KMP_DEBUG_ASSERT(num_threads >= 0); if (num_teams == 0) num_teams = 1; // default number of teams is 1. if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
if (!__kmp_reserve_warn) { __kmp_reserve_warn = 1; __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); } num_teams = __kmp_teams_max_nth; } // Set number of teams (number of threads in the outer "parallel" of the // teams) thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; // Remember the number of threads for inner parallel regions + if (!TCR_4(__kmp_init_middle)) + __kmp_middle_initialize(); // get internal globals calculated + KMP_DEBUG_ASSERT(__kmp_avail_proc); + KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); if (num_threads == 0) { - if (!TCR_4(__kmp_init_middle)) - __kmp_middle_initialize(); // get __kmp_avail_proc calculated num_threads = __kmp_avail_proc / num_teams; + // adjust num_threads w/o warning as it is not user setting + // num_threads = min(num_threads, nthreads-var, thread-limit-var) + // no thread_limit clause specified - do not change thread-limit-var ICV + if (num_threads > __kmp_dflt_team_nth) { + num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV + } + if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { + num_threads = thr->th.th_current_task->td_icvs.thread_limit; + } // prevent team size to exceed thread-limit-var if (num_teams * num_threads > __kmp_teams_max_nth) { - // adjust num_threads w/o warning as it is not user setting num_threads = __kmp_teams_max_nth / num_teams; } } else { // This thread will be the master of the league masters // Store new thread limit; old limit is saved in th_cg_roots list thr->th.th_current_task->td_icvs.thread_limit = num_threads; - + // num_threads = min(num_threads, nthreads-var) + if (num_threads > __kmp_dflt_team_nth) { + num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV + } if (num_teams * num_threads > __kmp_teams_max_nth) { int new_threads = __kmp_teams_max_nth / num_teams; if (!__kmp_reserve_warn) { // user asked for too many threads __kmp_reserve_warn = 1; // conflicts with 
KMP_TEAMS_THREAD_LIMIT __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, new_threads), KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); } num_threads = new_threads; } } thr->th.th_teams_size.nth = num_threads; } // Set the proc_bind var to use in the following parallel region. void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { kmp_info_t *thr = __kmp_threads[gtid]; thr->th.th_set_proc_bind = proc_bind; } /* Launch the worker threads into the microtask. */ void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { kmp_info_t *this_thr = __kmp_threads[gtid]; #ifdef KMP_DEBUG int f; #endif /* KMP_DEBUG */ KMP_DEBUG_ASSERT(team); KMP_DEBUG_ASSERT(this_thr->th.th_team == team); KMP_ASSERT(KMP_MASTER_GTID(gtid)); KMP_MB(); /* Flush all pending memory write invalidates. */ team->t.t_construct = 0; /* no single directives seen yet */ team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */ /* Reset the identifiers on the dispatch buffer */ KMP_DEBUG_ASSERT(team->t.t_disp_buffer); if (team->t.t_max_nproc > 1) { int i; for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { team->t.t_disp_buffer[i].buffer_index = i; team->t.t_disp_buffer[i].doacross_buf_idx = i; } } else { team->t.t_disp_buffer[0].buffer_index = 0; team->t.t_disp_buffer[0].doacross_buf_idx = 0; } KMP_MB(); /* Flush all pending memory write invalidates. */ KMP_ASSERT(this_thr->th.th_team == team); #ifdef KMP_DEBUG for (f = 0; f < team->t.t_nproc; f++) { KMP_DEBUG_ASSERT(team->t.t_threads[f] && team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); } #endif /* KMP_DEBUG */ /* release the worker threads so they may begin working */ __kmp_fork_barrier(gtid, 0); } void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { kmp_info_t *this_thr = __kmp_threads[gtid]; KMP_DEBUG_ASSERT(team); KMP_DEBUG_ASSERT(this_thr->th.th_team == team); KMP_ASSERT(KMP_MASTER_GTID(gtid)); KMP_MB(); /* Flush all pending memory write invalidates. 
*/ /* Join barrier after fork */ #ifdef KMP_DEBUG if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, __kmp_threads[gtid]); __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " "team->t.t_nproc=%d\n", gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc); __kmp_print_structure(); } KMP_DEBUG_ASSERT(__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); #endif /* KMP_DEBUG */ __kmp_join_barrier(gtid); /* wait for everyone */ #if OMPT_SUPPORT if (ompt_enabled.enabled && this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { int ds_tid = this_thr->th.th_info.ds.ds_tid; ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); this_thr->th.ompt_thread_info.state = ompt_state_overhead; #if OMPT_OPTIONAL void *codeptr = NULL; if (KMP_MASTER_TID(ds_tid) && (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || ompt_callbacks.ompt_callback(ompt_callback_sync_region))) codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; if (ompt_enabled.ompt_callback_sync_region_wait) { ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, codeptr); } if (ompt_enabled.ompt_callback_sync_region) { ompt_callbacks.ompt_callback(ompt_callback_sync_region)( ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, codeptr); } #endif if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial? } } #endif KMP_MB(); /* Flush all pending memory write invalidates. 
*/ KMP_ASSERT(this_thr->th.th_team == team); } /* ------------------------------------------------------------------------ */ #ifdef USE_LOAD_BALANCE // Return the worker threads actively spinning in the hot team, if we // are at the outermost level of parallelism. Otherwise, return 0. static int __kmp_active_hot_team_nproc(kmp_root_t *root) { int i; int retval; kmp_team_t *hot_team; if (root->r.r_active) { return 0; } hot_team = root->r.r_hot_team; if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { return hot_team->t.t_nproc - 1; // Don't count master thread } // Skip the master thread - it is accounted for elsewhere. retval = 0; for (i = 1; i < hot_team->t.t_nproc; i++) { if (hot_team->t.t_threads[i]->th.th_active) { retval++; } } return retval; } // Perform an automatic adjustment to the number of // threads used by the next parallel region. static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { int retval; int pool_active; int hot_team_active; int team_curr_active; int system_active; KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, set_nproc)); KMP_DEBUG_ASSERT(root); KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] ->th.th_current_task->td_icvs.dynamic == TRUE); KMP_DEBUG_ASSERT(set_nproc > 1); if (set_nproc == 1) { KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); return 1; } // Threads that are active in the thread pool, active in the hot team for this // particular root (if we are at the outer par level), and the currently // executing thread (to become the master) are available to add to the new // team, but are currently contributing to the system load, and must be // accounted for. pool_active = __kmp_thread_pool_active_nth; hot_team_active = __kmp_active_hot_team_nproc(root); team_curr_active = pool_active + hot_team_active + 1; // Check the system load. 
system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " "hot team active = %d\n", system_active, pool_active, hot_team_active)); if (system_active < 0) { // There was an error reading the necessary info from /proc, so use the // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode // = dynamic_thread_limit, we shouldn't wind up getting back here. __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); // Make this call behave like the thread limit algorithm. retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); if (retval > set_nproc) { retval = set_nproc; } if (retval < KMP_MIN_NTH) { retval = KMP_MIN_NTH; } KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval)); return retval; } // There is a slight delay in the load balance algorithm in detecting new // running procs. The real system load at this instant should be at least as // large as the #active omp thread that are available to add to the team. if (system_active < team_curr_active) { system_active = team_curr_active; } retval = __kmp_avail_proc - system_active + team_curr_active; if (retval > set_nproc) { retval = set_nproc; } if (retval < KMP_MIN_NTH) { retval = KMP_MIN_NTH; } KB_TRACE(20, ("__kmp_load_balance_nproc: exit. 
retval:%d\n", retval)); return retval; } // __kmp_load_balance_nproc() #endif /* USE_LOAD_BALANCE */ /* ------------------------------------------------------------------------ */ /* NOTE: this is called with the __kmp_init_lock held */ void __kmp_cleanup(void) { int f; KA_TRACE(10, ("__kmp_cleanup: enter\n")); if (TCR_4(__kmp_init_parallel)) { #if KMP_HANDLE_SIGNALS __kmp_remove_signals(); #endif TCW_4(__kmp_init_parallel, FALSE); } if (TCR_4(__kmp_init_middle)) { #if KMP_AFFINITY_SUPPORTED __kmp_affinity_uninitialize(); #endif /* KMP_AFFINITY_SUPPORTED */ __kmp_cleanup_hierarchy(); TCW_4(__kmp_init_middle, FALSE); } KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); if (__kmp_init_serial) { __kmp_runtime_destroy(); __kmp_init_serial = FALSE; } __kmp_cleanup_threadprivate_caches(); for (f = 0; f < __kmp_threads_capacity; f++) { if (__kmp_root[f] != NULL) { __kmp_free(__kmp_root[f]); __kmp_root[f] = NULL; } } __kmp_free(__kmp_threads); // __kmp_threads and __kmp_root were allocated at once, as single block, so // there is no need in freeing __kmp_root. 
__kmp_threads = NULL; __kmp_root = NULL; __kmp_threads_capacity = 0; #if KMP_USE_DYNAMIC_LOCK __kmp_cleanup_indirect_user_locks(); #else __kmp_cleanup_user_locks(); #endif #if KMP_AFFINITY_SUPPORTED KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); __kmp_cpuinfo_file = NULL; #endif /* KMP_AFFINITY_SUPPORTED */ #if KMP_USE_ADAPTIVE_LOCKS #if KMP_DEBUG_ADAPTIVE_LOCKS __kmp_print_speculative_stats(); #endif #endif KMP_INTERNAL_FREE(__kmp_nested_nth.nth); __kmp_nested_nth.nth = NULL; __kmp_nested_nth.size = 0; __kmp_nested_nth.used = 0; KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); __kmp_nested_proc_bind.bind_types = NULL; __kmp_nested_proc_bind.size = 0; __kmp_nested_proc_bind.used = 0; if (__kmp_affinity_format) { KMP_INTERNAL_FREE(__kmp_affinity_format); __kmp_affinity_format = NULL; } __kmp_i18n_catclose(); #if KMP_USE_HIER_SCHED __kmp_hier_scheds.deallocate(); #endif #if KMP_STATS_ENABLED __kmp_stats_fini(); #endif KA_TRACE(10, ("__kmp_cleanup: exit\n")); } /* ------------------------------------------------------------------------ */ int __kmp_ignore_mppbeg(void) { char *env; if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { if (__kmp_str_match_false(env)) return FALSE; } // By default __kmpc_begin() is no-op. return TRUE; } int __kmp_ignore_mppend(void) { char *env; if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { if (__kmp_str_match_false(env)) return FALSE; } // By default __kmpc_end() is no-op. 
return TRUE; } void __kmp_internal_begin(void) { int gtid; kmp_root_t *root; /* this is a very important step as it will register new sibling threads and assign these new uber threads a new gtid */ gtid = __kmp_entry_gtid(); root = __kmp_threads[gtid]->th.th_root; KMP_ASSERT(KMP_UBER_GTID(gtid)); if (root->r.r_begin) return; __kmp_acquire_lock(&root->r.r_begin_lock, gtid); if (root->r.r_begin) { __kmp_release_lock(&root->r.r_begin_lock, gtid); return; } root->r.r_begin = TRUE; __kmp_release_lock(&root->r.r_begin_lock, gtid); } /* ------------------------------------------------------------------------ */ void __kmp_user_set_library(enum library_type arg) { int gtid; kmp_root_t *root; kmp_info_t *thread; /* first, make sure we are initialized so we can get our gtid */ gtid = __kmp_entry_gtid(); thread = __kmp_threads[gtid]; root = thread->th.th_root; KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial)); if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */ KMP_WARNING(SetLibraryIncorrectCall); return; } switch (arg) { case library_serial: thread->th.th_set_nproc = 0; set__nproc(thread, 1); break; case library_turnaround: thread->th.th_set_nproc = 0; set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub); break; case library_throughput: thread->th.th_set_nproc = 0; set__nproc(thread, __kmp_dflt_team_nth ? 
__kmp_dflt_team_nth : __kmp_dflt_team_nth_ub); break; default: KMP_FATAL(UnknownLibraryType, arg); } __kmp_aux_set_library(arg); } void __kmp_aux_set_stacksize(size_t arg) { if (!__kmp_init_serial) __kmp_serial_initialize(); #if KMP_OS_DARWIN if (arg & (0x1000 - 1)) { arg &= ~(0x1000 - 1); if (arg + 0x1000) /* check for overflow if we round up */ arg += 0x1000; } #endif __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); /* only change the default stacksize before the first parallel region */ if (!TCR_4(__kmp_init_parallel)) { size_t value = arg; /* argument is in bytes */ if (value < __kmp_sys_min_stksize) value = __kmp_sys_min_stksize; else if (value > KMP_MAX_STKSIZE) value = KMP_MAX_STKSIZE; __kmp_stksize = value; __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ } __kmp_release_bootstrap_lock(&__kmp_initz_lock); } /* set the behaviour of the runtime library */ /* TODO this can cause some odd behaviour with sibling parallelism... */ void __kmp_aux_set_library(enum library_type arg) { __kmp_library = arg; switch (__kmp_library) { case library_serial: { KMP_INFORM(LibraryIsSerial); } break; case library_turnaround: if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) __kmp_use_yield = 2; // only yield when oversubscribed break; case library_throughput: if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) __kmp_dflt_blocktime = 200; break; default: KMP_FATAL(UnknownLibraryType, arg); } } /* Getting team information common for all team API */ // Returns NULL if not in teams construct static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { kmp_info_t *thr = __kmp_entry_thread(); teams_serialized = 0; if (thr->th.th_teams_microtask) { kmp_team_t *team = thr->th.th_team; int tlevel = thr->th.th_teams_level; // the level of the teams construct int ii = team->t.t_level; teams_serialized = team->t.t_serialized; int level = tlevel + 1; KMP_DEBUG_ASSERT(ii >= tlevel); while (ii > level) { for (teams_serialized = team->t.t_serialized; (teams_serialized > 0) 
&& (ii > level); teams_serialized--, ii--) { } if (team->t.t_serialized && (!teams_serialized)) { team = team->t.t_parent; continue; } if (ii > level) { team = team->t.t_parent; ii--; } } return team; } return NULL; } int __kmp_aux_get_team_num() { int serialized; kmp_team_t *team = __kmp_aux_get_team_info(serialized); if (team) { if (serialized > 1) { return 0; // teams region is serialized ( 1 team of 1 thread ). } else { return team->t.t_master_tid; } } return 0; } int __kmp_aux_get_num_teams() { int serialized; kmp_team_t *team = __kmp_aux_get_team_info(serialized); if (team) { if (serialized > 1) { return 1; } else { return team->t.t_parent->t.t_nproc; } } return 1; } /* ------------------------------------------------------------------------ */ /* * Affinity Format Parser * * Field is in form of: %[[[0].]size]type * % and type are required (%% means print a literal '%') * type is either single char or long name surrounded by {}, * e.g., N or {num_threads} * 0 => leading zeros * . => right justified when size is specified * by default output is left justified * size is the *minimum* field length * All other characters are printed as is * * Available field types: * L {thread_level} - omp_get_level() * n {thread_num} - omp_get_thread_num() * h {host} - name of host machine * P {process_id} - process id (integer) * T {thread_identifier} - native thread identifier (integer) * N {num_threads} - omp_get_num_threads() * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) * a {thread_affinity} - comma separated list of integers or integer ranges * (values of affinity mask) * * Implementation-specific field types can be added * If a type is unknown, print "undefined" */ // Structure holding the short name, long name, and corresponding data type // for snprintf. A table of these will represent the entire valid keyword // field types. 
typedef struct kmp_affinity_format_field_t { char short_name; // from spec e.g., L -> thread level const char *long_name; // from spec thread_level -> thread level char field_format; // data type for snprintf (typically 'd' or 's' // for integer or string) } kmp_affinity_format_field_t; static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { #if KMP_AFFINITY_SUPPORTED {'A', "thread_affinity", 's'}, #endif {'t', "team_num", 'd'}, {'T', "num_teams", 'd'}, {'L', "nesting_level", 'd'}, {'n', "thread_num", 'd'}, {'N', "num_threads", 'd'}, {'a', "ancestor_tnum", 'd'}, {'H', "host", 's'}, {'P', "process_id", 'd'}, {'i', "native_thread_id", 'd'}}; // Return the number of characters it takes to hold field static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, const char **ptr, kmp_str_buf_t *field_buffer) { int rc, format_index, field_value; const char *width_left, *width_right; bool pad_zeros, right_justify, parse_long_name, found_valid_name; static const int FORMAT_SIZE = 20; char format[FORMAT_SIZE] = {0}; char absolute_short_name = 0; KMP_DEBUG_ASSERT(gtid >= 0); KMP_DEBUG_ASSERT(th); KMP_DEBUG_ASSERT(**ptr == '%'); KMP_DEBUG_ASSERT(field_buffer); __kmp_str_buf_clear(field_buffer); // Skip the initial % (*ptr)++; // Check for %% first if (**ptr == '%') { __kmp_str_buf_cat(field_buffer, "%", 1); (*ptr)++; // skip over the second % return 1; } // Parse field modifiers if they are present pad_zeros = false; if (**ptr == '0') { pad_zeros = true; (*ptr)++; // skip over 0 } right_justify = false; if (**ptr == '.') { right_justify = true; (*ptr)++; // skip over . 
} // Parse width of field: [width_left, width_right) width_left = width_right = NULL; if (**ptr >= '0' && **ptr <= '9') { width_left = *ptr; SKIP_DIGITS(*ptr); width_right = *ptr; } // Create the format for KMP_SNPRINTF based on flags parsed above format_index = 0; format[format_index++] = '%'; if (!right_justify) format[format_index++] = '-'; if (pad_zeros) format[format_index++] = '0'; if (width_left && width_right) { int i = 0; // Only allow 8 digit number widths. // This also prevents overflowing format variable while (i < 8 && width_left < width_right) { format[format_index++] = *width_left; width_left++; i++; } } // Parse a name (long or short) // Canonicalize the name into absolute_short_name found_valid_name = false; parse_long_name = (**ptr == '{'); if (parse_long_name) (*ptr)++; // skip initial left brace for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / sizeof(__kmp_affinity_format_table[0]); ++i) { char short_name = __kmp_affinity_format_table[i].short_name; const char *long_name = __kmp_affinity_format_table[i].long_name; char field_format = __kmp_affinity_format_table[i].field_format; if (parse_long_name) { int length = KMP_STRLEN(long_name); if (strncmp(*ptr, long_name, length) == 0) { found_valid_name = true; (*ptr) += length; // skip the long name } } else if (**ptr == short_name) { found_valid_name = true; (*ptr)++; // skip the short name } if (found_valid_name) { format[format_index++] = field_format; format[format_index++] = '\0'; absolute_short_name = short_name; break; } } if (parse_long_name) { if (**ptr != '}') { absolute_short_name = 0; } else { (*ptr)++; // skip over the right brace } } // Attempt to fill the buffer with the requested // value using snprintf within __kmp_str_buf_print() switch (absolute_short_name) { case 't': rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); break; case 'T': rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); break; case 'L': rc = 
__kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); break; case 'n': rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); break; case 'H': { static const int BUFFER_SIZE = 256; char buf[BUFFER_SIZE]; __kmp_expand_host_name(buf, BUFFER_SIZE); rc = __kmp_str_buf_print(field_buffer, format, buf); } break; case 'P': rc = __kmp_str_buf_print(field_buffer, format, getpid()); break; case 'i': rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); break; case 'N': rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); break; case 'a': field_value = __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); rc = __kmp_str_buf_print(field_buffer, format, field_value); break; #if KMP_AFFINITY_SUPPORTED case 'A': { kmp_str_buf_t buf; __kmp_str_buf_init(&buf); __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); rc = __kmp_str_buf_print(field_buffer, format, buf.str); __kmp_str_buf_free(&buf); } break; #endif default: // According to spec, If an implementation does not have info for field // type, then "undefined" is printed rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); // Skip the field if (parse_long_name) { SKIP_TOKEN(*ptr); if (**ptr == '}') (*ptr)++; } else { (*ptr)++; } } KMP_ASSERT(format_index <= FORMAT_SIZE); return rc; } /* * Return number of characters needed to hold the affinity string * (not including null byte character) * The resultant string is printed to buffer, which the caller can then * handle afterwards */ size_t __kmp_aux_capture_affinity(int gtid, const char *format, kmp_str_buf_t *buffer) { const char *parse_ptr; size_t retval; const kmp_info_t *th; kmp_str_buf_t field; KMP_DEBUG_ASSERT(buffer); KMP_DEBUG_ASSERT(gtid >= 0); __kmp_str_buf_init(&field); __kmp_str_buf_clear(buffer); th = __kmp_threads[gtid]; retval = 0; // If format is NULL or zero-length string, then we use // affinity-format-var ICV parse_ptr = format; if (parse_ptr == NULL || *parse_ptr == 
'\0') { parse_ptr = __kmp_affinity_format; } KMP_DEBUG_ASSERT(parse_ptr); while (*parse_ptr != '\0') { // Parse a field if (*parse_ptr == '%') { // Put field in the buffer int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); __kmp_str_buf_catbuf(buffer, &field); retval += rc; } else { // Put literal character in buffer __kmp_str_buf_cat(buffer, parse_ptr, 1); retval++; parse_ptr++; } } __kmp_str_buf_free(&field); return retval; } // Displays the affinity string to stdout void __kmp_aux_display_affinity(int gtid, const char *format) { kmp_str_buf_t buf; __kmp_str_buf_init(&buf); __kmp_aux_capture_affinity(gtid, format, &buf); __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); __kmp_str_buf_free(&buf); } /* ------------------------------------------------------------------------ */ void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { int blocktime = arg; /* argument is in milliseconds */ #if KMP_USE_MONITOR int bt_intervals; #endif int bt_set; __kmp_save_internal_controls(thread); /* Normalize and set blocktime for the teams */ if (blocktime < KMP_MIN_BLOCKTIME) blocktime = KMP_MIN_BLOCKTIME; else if (blocktime > KMP_MAX_BLOCKTIME) blocktime = KMP_MAX_BLOCKTIME; set__blocktime_team(thread->th.th_team, tid, blocktime); set__blocktime_team(thread->th.th_serial_team, 0, blocktime); #if KMP_USE_MONITOR /* Calculate and set blocktime intervals for the teams */ bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); #endif /* Set whether blocktime has been set to "TRUE" */ bt_set = TRUE; set__bt_set_team(thread->th.th_team, tid, bt_set); set__bt_set_team(thread->th.th_serial_team, 0, bt_set); #if KMP_USE_MONITOR KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " "bt_intervals=%d, monitor_updates=%d\n", __kmp_gtid_from_tid(tid, thread->th.th_team), 
thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups)); #else KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", __kmp_gtid_from_tid(tid, thread->th.th_team), thread->th.th_team->t.t_id, tid, blocktime)); #endif } void __kmp_aux_set_defaults(char const *str, int len) { if (!__kmp_init_serial) { __kmp_serial_initialize(); } __kmp_env_initialize(str); if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { __kmp_env_print(); } } // __kmp_aux_set_defaults /* ------------------------------------------------------------------------ */ /* internal fast reduction routines */ PACKED_REDUCTION_METHOD_T __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck) { // Default reduction method: critical construct ( lck != NULL, like in current // PAROPT ) // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method // can be selected by RTL // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method // can be selected by RTL // Finally, it's up to OpenMP RTL to make a decision on which method to select // among generated by PAROPT. 
PACKED_REDUCTION_METHOD_T retval; int team_size; KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)) #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) retval = critical_reduce_block; // another choice of getting a team size (with 1 dynamic deference) is slower team_size = __kmp_get_team_num_threads(global_tid); if (team_size == 1) { retval = empty_reduce_block; } else { int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; -#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 +#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD int teamsize_cutoff = 4; #if KMP_MIC_SUPPORTED if (__kmp_mic_type != non_mic) { teamsize_cutoff = 8; } #endif int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; if (tree_available) { if (team_size <= teamsize_cutoff) { if (atomic_available) { retval = atomic_reduce_block; } } else { retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; } } else if (atomic_available) { retval = atomic_reduce_block; } #else #error "Unknown or unsupported OS" #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD // basic tuning if (atomic_available) { if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 
retval = atomic_reduce_block; } } // otherwise: use critical section #elif KMP_OS_DARWIN int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; if (atomic_available && (num_vars <= 3)) { retval = atomic_reduce_block; } else if (tree_available) { if ((reduce_size > (9 * sizeof(kmp_real64))) && (reduce_size < (2000 * sizeof(kmp_real64)))) { retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; } } // otherwise: use critical section #else #error "Unknown or unsupported OS" #endif #else #error "Unknown or unsupported architecture" #endif } // KMP_FORCE_REDUCTION // If the team is serialized (team_size == 1), ignore the forced reduction // method and stay with the unsynchronized method (empty_reduce_block) if (__kmp_force_reduction_method != reduction_method_not_defined && team_size != 1) { PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; int atomic_available, tree_available; switch ((forced_retval = __kmp_force_reduction_method)) { case critical_reduce_block: KMP_ASSERT(lck); // lck should be != 0 break; case atomic_reduce_block: atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; if (!atomic_available) { KMP_WARNING(RedMethodNotSupported, "atomic"); forced_retval = critical_reduce_block; } break; case tree_reduce_block: tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; if (!tree_available) { KMP_WARNING(RedMethodNotSupported, "tree"); forced_retval = critical_reduce_block; } else { #if KMP_FAST_REDUCTION_BARRIER forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; #endif } break; default: KMP_ASSERT(0); // "unsupported method specified" } retval = forced_retval; } KA_TRACE(10, ("reduction method selected=%08x\n", retval)); #undef FAST_REDUCTION_TREE_METHOD_GENERATED #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED return (retval); } // this function is for testing set/get/determine reduce method kmp_int32 __kmp_get_reduce_method(void) { return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); } // Soft pause sets up threads 
to ignore blocktime and just go to sleep. // Spin-wait code checks __kmp_pause_status and reacts accordingly. void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } // Hard pause shuts down the runtime completely. Resume happens naturally when // OpenMP is used subsequently. void __kmp_hard_pause() { __kmp_pause_status = kmp_hard_paused; __kmp_internal_end_thread(-1); } // Soft resume sets __kmp_pause_status, and wakes up all threads. void __kmp_resume_if_soft_paused() { if (__kmp_pause_status == kmp_soft_paused) { __kmp_pause_status = kmp_not_paused; for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { kmp_info_t *thread = __kmp_threads[gtid]; if (thread) { // Wake it if sleeping kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); if (fl.is_sleeping()) fl.resume(gtid); else if (__kmp_try_suspend_mx(thread)) { // got suspend lock __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep } else { // thread holds the lock and may sleep soon do { // until either the thread sleeps, or we can get the lock if (fl.is_sleeping()) { fl.resume(gtid); break; } else if (__kmp_try_suspend_mx(thread)) { __kmp_unlock_suspend_mx(thread); break; } } while (1); } } } } } // This function is called via __kmpc_pause_resource. Returns 0 if successful. 
// TODO: add warning messages int __kmp_pause_resource(kmp_pause_status_t level) { if (level == kmp_not_paused) { // requesting resume if (__kmp_pause_status == kmp_not_paused) { // error message about runtime not being paused, so can't resume return 1; } else { KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || __kmp_pause_status == kmp_hard_paused); __kmp_pause_status = kmp_not_paused; return 0; } } else if (level == kmp_soft_paused) { // requesting soft pause if (__kmp_pause_status != kmp_not_paused) { // error message about already being paused return 1; } else { __kmp_soft_pause(); return 0; } } else if (level == kmp_hard_paused) { // requesting hard pause if (__kmp_pause_status != kmp_not_paused) { // error message about already being paused return 1; } else { __kmp_hard_pause(); return 0; } } else { // error message about invalid level return 1; } } Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_stub.cpp =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_stub.cpp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_stub.cpp (revision 357059) @@ -1,385 +1,385 @@ /* * kmp_stub.cpp -- stub versions of user-callable OpenMP RT functions. */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include #include #include #define __KMP_IMP #include "omp.h" // omp_* declarations, must be included before "kmp.h" #include "kmp.h" // KMP_DEFAULT_STKSIZE #include "kmp_stub.h" #if KMP_OS_WINDOWS #include #else #include #endif // Moved from omp.h #define omp_set_max_active_levels ompc_set_max_active_levels #define omp_set_schedule ompc_set_schedule #define omp_get_ancestor_thread_num ompc_get_ancestor_thread_num #define omp_get_team_size ompc_get_team_size #define omp_set_num_threads ompc_set_num_threads #define omp_set_dynamic ompc_set_dynamic #define omp_set_nested ompc_set_nested #define omp_set_affinity_format ompc_set_affinity_format #define omp_get_affinity_format ompc_get_affinity_format #define omp_display_affinity ompc_display_affinity #define omp_capture_affinity ompc_capture_affinity #define kmp_set_stacksize kmpc_set_stacksize #define kmp_set_stacksize_s kmpc_set_stacksize_s #define kmp_set_blocktime kmpc_set_blocktime #define kmp_set_library kmpc_set_library #define kmp_set_defaults kmpc_set_defaults #define kmp_set_disp_num_buffers kmpc_set_disp_num_buffers #define kmp_malloc kmpc_malloc #define kmp_aligned_malloc kmpc_aligned_malloc #define kmp_calloc kmpc_calloc #define kmp_realloc kmpc_realloc #define kmp_free kmpc_free #if KMP_OS_WINDOWS static double frequency = 0.0; #endif // Helper functions. static size_t __kmps_init() { static int initialized = 0; static size_t dummy = 0; if (!initialized) { // TODO: Analyze KMP_VERSION environment variable, print // __kmp_version_copyright and __kmp_version_build_time. // WARNING: Do not use "fprintf(stderr, ...)" because it will cause // unresolved "__iob" symbol (see C70080). We need to extract __kmp_printf() // stuff from kmp_runtime.cpp and use it. 
// Trick with dummy variable forces linker to keep __kmp_version_copyright // and __kmp_version_build_time strings in executable file (in case of // static linkage). When KMP_VERSION analysis is implemented, dummy // variable should be deleted, function should return void. dummy = __kmp_version_copyright - __kmp_version_build_time; #if KMP_OS_WINDOWS LARGE_INTEGER freq; BOOL status = QueryPerformanceFrequency(&freq); if (status) { frequency = double(freq.QuadPart); } #endif initialized = 1; } return dummy; } // __kmps_init #define i __kmps_init(); /* set API functions */ void omp_set_num_threads(omp_int_t num_threads) { i; } void omp_set_dynamic(omp_int_t dynamic) { i; __kmps_set_dynamic(dynamic); } void omp_set_nested(omp_int_t nested) { i; __kmps_set_nested(nested); } void omp_set_max_active_levels(omp_int_t max_active_levels) { i; } void omp_set_schedule(omp_sched_t kind, omp_int_t modifier) { i; __kmps_set_schedule((kmp_sched_t)kind, modifier); } int omp_get_ancestor_thread_num(omp_int_t level) { i; return (level) ? (-1) : (0); } int omp_get_team_size(omp_int_t level) { i; return (level) ? (-1) : (1); } int kmpc_set_affinity_mask_proc(int proc, void **mask) { i; return -1; } int kmpc_unset_affinity_mask_proc(int proc, void **mask) { i; return -1; } int kmpc_get_affinity_mask_proc(int proc, void **mask) { i; return -1; } /* kmp API functions */ void kmp_set_stacksize(omp_int_t arg) { i; __kmps_set_stacksize(arg); } void kmp_set_stacksize_s(size_t arg) { i; __kmps_set_stacksize(arg); } void kmp_set_blocktime(omp_int_t arg) { i; __kmps_set_blocktime(arg); } void kmp_set_library(omp_int_t arg) { i; __kmps_set_library(arg); } void kmp_set_defaults(char const *str) { i; } void kmp_set_disp_num_buffers(omp_int_t arg) { i; } /* KMP memory management functions. */ void *kmp_malloc(size_t size) { i; void *res; #if KMP_OS_WINDOWS // If succesfull returns a pointer to the memory block, otherwise returns // NULL. 
// Sets errno to ENOMEM or EINVAL if memory allocation failed or parameter // validation failed. res = _aligned_malloc(size, 1); #else res = malloc(size); #endif return res; } void *kmp_aligned_malloc(size_t sz, size_t a) { i; int err; void *res; #if KMP_OS_WINDOWS res = _aligned_malloc(sz, a); #else - if (err = posix_memalign(&res, a, sz)) { + if ((err = posix_memalign(&res, a, sz))) { errno = err; // can be EINVAL or ENOMEM res = NULL; } #endif return res; } void *kmp_calloc(size_t nelem, size_t elsize) { i; void *res; #if KMP_OS_WINDOWS res = _aligned_recalloc(NULL, nelem, elsize, 1); #else res = calloc(nelem, elsize); #endif return res; } void *kmp_realloc(void *ptr, size_t size) { i; void *res; #if KMP_OS_WINDOWS res = _aligned_realloc(ptr, size, 1); #else res = realloc(ptr, size); #endif return res; } void kmp_free(void *ptr) { i; #if KMP_OS_WINDOWS _aligned_free(ptr); #else free(ptr); #endif } static int __kmps_blocktime = INT_MAX; void __kmps_set_blocktime(int arg) { i; __kmps_blocktime = arg; } // __kmps_set_blocktime int __kmps_get_blocktime(void) { i; return __kmps_blocktime; } // __kmps_get_blocktime static int __kmps_dynamic = 0; void __kmps_set_dynamic(int arg) { i; __kmps_dynamic = arg; } // __kmps_set_dynamic int __kmps_get_dynamic(void) { i; return __kmps_dynamic; } // __kmps_get_dynamic static int __kmps_library = 1000; void __kmps_set_library(int arg) { i; __kmps_library = arg; } // __kmps_set_library int __kmps_get_library(void) { i; return __kmps_library; } // __kmps_get_library static int __kmps_nested = 0; void __kmps_set_nested(int arg) { i; __kmps_nested = arg; } // __kmps_set_nested int __kmps_get_nested(void) { i; return __kmps_nested; } // __kmps_get_nested static size_t __kmps_stacksize = KMP_DEFAULT_STKSIZE; void __kmps_set_stacksize(int arg) { i; __kmps_stacksize = arg; } // __kmps_set_stacksize int __kmps_get_stacksize(void) { i; return __kmps_stacksize; } // __kmps_get_stacksize static kmp_sched_t __kmps_sched_kind = 
kmp_sched_default; static int __kmps_sched_modifier = 0; void __kmps_set_schedule(kmp_sched_t kind, int modifier) { i; __kmps_sched_kind = kind; __kmps_sched_modifier = modifier; } // __kmps_set_schedule void __kmps_get_schedule(kmp_sched_t *kind, int *modifier) { i; *kind = __kmps_sched_kind; *modifier = __kmps_sched_modifier; } // __kmps_get_schedule kmp_proc_bind_t __kmps_get_proc_bind(void) { i; - return 0; + return proc_bind_false; } // __kmps_get_proc_bind double __kmps_get_wtime(void) { // Elapsed wall clock time (in second) from "sometime in the past". double wtime = 0.0; i; #if KMP_OS_WINDOWS if (frequency > 0.0) { LARGE_INTEGER now; BOOL status = QueryPerformanceCounter(&now); if (status) { wtime = double(now.QuadPart) / frequency; } } #else // gettimeofday() returns seconds and microseconds since the Epoch. struct timeval tval; int rc; rc = gettimeofday(&tval, NULL); if (rc == 0) { wtime = (double)(tval.tv_sec) + 1.0E-06 * (double)(tval.tv_usec); } else { // TODO: Assert or abort here. } #endif return wtime; } // __kmps_get_wtime double __kmps_get_wtick(void) { // Number of seconds between successive clock ticks. double wtick = 0.0; i; #if KMP_OS_WINDOWS { DWORD increment; DWORD adjustment; BOOL disabled; BOOL rc; rc = GetSystemTimeAdjustment(&adjustment, &increment, &disabled); if (rc) { wtick = 1.0E-07 * (double)(disabled ? increment : adjustment); } else { // TODO: Assert or abort here. wtick = 1.0E-03; } } #else // TODO: gettimeofday() returns in microseconds, but what the precision? 
wtick = 1.0E-06; #endif return wtick; } // __kmps_get_wtick /* OpenMP 5.0 Memory Management */ #if KMP_OS_WINDOWS omp_allocator_handle_t const omp_null_allocator = 0; omp_allocator_handle_t const omp_default_mem_alloc = (omp_allocator_handle_t const)1; omp_allocator_handle_t const omp_large_cap_mem_alloc = (omp_allocator_handle_t const)2; omp_allocator_handle_t const omp_const_mem_alloc = (omp_allocator_handle_t const)3; omp_allocator_handle_t const omp_high_bw_mem_alloc = (omp_allocator_handle_t const)4; omp_allocator_handle_t const omp_low_lat_mem_alloc = (omp_allocator_handle_t const)5; omp_allocator_handle_t const omp_cgroup_mem_alloc = (omp_allocator_handle_t const)6; omp_allocator_handle_t const omp_pteam_mem_alloc = (omp_allocator_handle_t const)7; omp_allocator_handle_t const omp_thread_mem_alloc = (omp_allocator_handle_t const)8; omp_memspace_handle_t const omp_default_mem_space = (omp_memspace_handle_t const)0; omp_memspace_handle_t const omp_large_cap_mem_space = (omp_memspace_handle_t const)1; omp_memspace_handle_t const omp_const_mem_space = (omp_memspace_handle_t const)2; omp_memspace_handle_t const omp_high_bw_mem_space = (omp_memspace_handle_t const)3; omp_memspace_handle_t const omp_low_lat_mem_space = (omp_memspace_handle_t const)4; #endif /* KMP_OS_WINDOWS */ void *omp_alloc(size_t size, const omp_allocator_handle_t allocator) { i; return malloc(size); } void omp_free(void *ptr, const omp_allocator_handle_t allocator) { i; free(ptr); } /* OpenMP 5.0 Affinity Format */ void omp_set_affinity_format(char const *format) { i; } size_t omp_get_affinity_format(char *buffer, size_t size) { i; return 0; } void omp_display_affinity(char const *format) { i; } size_t omp_capture_affinity(char *buffer, size_t buf_size, char const *format) { i; return 0; } // end of file // Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_taskdeps.cpp =================================================================== --- 
projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_taskdeps.cpp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_taskdeps.cpp (revision 357059) @@ -1,655 +1,710 @@ /* * kmp_taskdeps.cpp */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// //#define KMP_SUPPORT_GRAPH_OUTPUT 1 #include "kmp.h" #include "kmp_io.h" #include "kmp_wait_release.h" #include "kmp_taskdeps.h" #if OMPT_SUPPORT #include "ompt-specific.h" #endif // TODO: Improve memory allocation? keep a list of pre-allocated structures? // allocate in blocks? re-use list finished list entries? // TODO: don't use atomic ref counters for stack-allocated nodes. // TODO: find an alternate to atomic refs for heap-allocated nodes? // TODO: Finish graph output support // TODO: kmp_lock_t seems a tad to big (and heavy weight) for this. Check other // runtime locks // TODO: Any ITT support needed? 
#ifdef KMP_SUPPORT_GRAPH_OUTPUT static std::atomic kmp_node_id_seed = ATOMIC_VAR_INIT(0); #endif static void __kmp_init_node(kmp_depnode_t *node) { node->dn.successors = NULL; node->dn.task = NULL; // will point to the rigth task // once dependences have been processed for (int i = 0; i < MAX_MTX_DEPS; ++i) node->dn.mtx_locks[i] = NULL; node->dn.mtx_num_locks = 0; __kmp_init_lock(&node->dn.lock); KMP_ATOMIC_ST_RLX(&node->dn.nrefs, 1); // init creates the first reference #ifdef KMP_SUPPORT_GRAPH_OUTPUT node->dn.id = KMP_ATOMIC_INC(&kmp_node_id_seed); #endif } static inline kmp_depnode_t *__kmp_node_ref(kmp_depnode_t *node) { KMP_ATOMIC_INC(&node->dn.nrefs); return node; } enum { KMP_DEPHASH_OTHER_SIZE = 97, KMP_DEPHASH_MASTER_SIZE = 997 }; +size_t sizes[] = { 997, 2003, 4001, 8191, 16001, 32003, 64007, 131071, 270029 }; +const size_t MAX_GEN = 8; + static inline kmp_int32 __kmp_dephash_hash(kmp_intptr_t addr, size_t hsize) { // TODO alternate to try: set = (((Addr64)(addrUsefulBits * 9.618)) % // m_num_sets ); return ((addr >> 6) ^ (addr >> 2)) % hsize; } +static kmp_dephash_t *__kmp_dephash_extend(kmp_info_t *thread, + kmp_dephash_t *current_dephash) { + kmp_dephash_t *h; + + size_t gen = current_dephash->generation + 1; + if (gen >= MAX_GEN) + return current_dephash; + size_t new_size = sizes[gen]; + + kmp_int32 size_to_allocate = + new_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t); + +#if USE_FAST_MEMORY + h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size_to_allocate); +#else + h = (kmp_dephash_t *)__kmp_thread_malloc(thread, size_to_allocate); +#endif + + h->size = new_size; + h->nelements = current_dephash->nelements; + h->buckets = (kmp_dephash_entry **)(h + 1); + h->generation = gen; + + // insert existing elements in the new table + for (size_t i = 0; i < current_dephash->size; i++) { + kmp_dephash_entry_t *next; + for (kmp_dephash_entry_t *entry = current_dephash->buckets[i]; entry; entry = next) { + next = entry->next_in_bucket; + // 
Compute the new hash using the new size, and insert the entry in + // the new bucket. + kmp_int32 new_bucket = __kmp_dephash_hash(entry->addr, h->size); + if (entry->next_in_bucket) { + h->nconflicts++; + } + entry->next_in_bucket = h->buckets[new_bucket]; + h->buckets[new_bucket] = entry; + } + } + + // Free old hash table +#if USE_FAST_MEMORY + __kmp_fast_free(thread, current_dephash); +#else + __kmp_thread_free(thread, current_dephash); +#endif + + return h; +} + static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread, kmp_taskdata_t *current_task) { kmp_dephash_t *h; size_t h_size; if (current_task->td_flags.tasktype == TASK_IMPLICIT) h_size = KMP_DEPHASH_MASTER_SIZE; else h_size = KMP_DEPHASH_OTHER_SIZE; kmp_int32 size = h_size * sizeof(kmp_dephash_entry_t *) + sizeof(kmp_dephash_t); #if USE_FAST_MEMORY h = (kmp_dephash_t *)__kmp_fast_allocate(thread, size); #else h = (kmp_dephash_t *)__kmp_thread_malloc(thread, size); #endif h->size = h_size; -#ifdef KMP_DEBUG + h->generation = 0; h->nelements = 0; h->nconflicts = 0; -#endif h->buckets = (kmp_dephash_entry **)(h + 1); for (size_t i = 0; i < h_size; i++) h->buckets[i] = 0; return h; } #define ENTRY_LAST_INS 0 #define ENTRY_LAST_MTXS 1 static kmp_dephash_entry * -__kmp_dephash_find(kmp_info_t *thread, kmp_dephash_t *h, kmp_intptr_t addr) { +__kmp_dephash_find(kmp_info_t *thread, kmp_dephash_t **hash, kmp_intptr_t addr) { + kmp_dephash_t *h = *hash; + if (h->nelements != 0 + && h->nconflicts/h->size >= 1) { + *hash = __kmp_dephash_extend(thread, h); + h = *hash; + } kmp_int32 bucket = __kmp_dephash_hash(addr, h->size); kmp_dephash_entry_t *entry; for (entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket) if (entry->addr == addr) break; if (entry == NULL) { // create entry. 
This is only done by one thread so no locking required #if USE_FAST_MEMORY entry = (kmp_dephash_entry_t *)__kmp_fast_allocate( thread, sizeof(kmp_dephash_entry_t)); #else entry = (kmp_dephash_entry_t *)__kmp_thread_malloc( thread, sizeof(kmp_dephash_entry_t)); #endif entry->addr = addr; entry->last_out = NULL; entry->last_ins = NULL; entry->last_mtxs = NULL; entry->last_flag = ENTRY_LAST_INS; entry->mtx_lock = NULL; entry->next_in_bucket = h->buckets[bucket]; h->buckets[bucket] = entry; -#ifdef KMP_DEBUG h->nelements++; if (entry->next_in_bucket) h->nconflicts++; -#endif } return entry; } static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread, kmp_depnode_list_t *list, kmp_depnode_t *node) { kmp_depnode_list_t *new_head; #if USE_FAST_MEMORY new_head = (kmp_depnode_list_t *)__kmp_fast_allocate( thread, sizeof(kmp_depnode_list_t)); #else new_head = (kmp_depnode_list_t *)__kmp_thread_malloc( thread, sizeof(kmp_depnode_list_t)); #endif new_head->node = __kmp_node_ref(node); new_head->next = list; return new_head; } static inline void __kmp_track_dependence(kmp_depnode_t *source, kmp_depnode_t *sink, kmp_task_t *sink_task) { #ifdef KMP_SUPPORT_GRAPH_OUTPUT kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task); // do not use sink->dn.task as that is only filled after the dependencies // are already processed! 
kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task); __kmp_printf("%d(%s) -> %d(%s)\n", source->dn.id, task_source->td_ident->psource, sink->dn.id, task_sink->td_ident->psource); #endif #if OMPT_SUPPORT && OMPT_OPTIONAL /* OMPT tracks dependences between task (a=source, b=sink) in which task a blocks the execution of b through the ompt_new_dependence_callback */ if (ompt_enabled.ompt_callback_task_dependence) { kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task); kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task); ompt_callbacks.ompt_callback(ompt_callback_task_dependence)( &(task_source->ompt_task_info.task_data), &(task_sink->ompt_task_info.task_data)); } #endif /* OMPT_SUPPORT && OMPT_OPTIONAL */ } static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, kmp_task_t *task, kmp_depnode_t *node, kmp_depnode_list_t *plist) { if (!plist) return 0; kmp_int32 npredecessors = 0; // link node as successor of list elements for (kmp_depnode_list_t *p = plist; p; p = p->next) { kmp_depnode_t *dep = p->node; if (dep->dn.task) { KMP_ACQUIRE_DEPNODE(gtid, dep); if (dep->dn.task) { __kmp_track_dependence(dep, node, task); dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node); KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to " "%p\n", gtid, KMP_TASK_TO_TASKDATA(dep->dn.task), KMP_TASK_TO_TASKDATA(task))); npredecessors++; } KMP_RELEASE_DEPNODE(gtid, dep); } } return npredecessors; } static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread, kmp_task_t *task, kmp_depnode_t *source, kmp_depnode_t *sink) { if (!sink) return 0; kmp_int32 npredecessors = 0; if (sink->dn.task) { // synchronously add source to sink' list of successors KMP_ACQUIRE_DEPNODE(gtid, sink); if (sink->dn.task) { __kmp_track_dependence(sink, source, task); sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source); KA_TRACE(40, ("__kmp_process_deps: T#%d adding 
dependence from %p to " "%p\n", gtid, KMP_TASK_TO_TASKDATA(sink->dn.task), KMP_TASK_TO_TASKDATA(task))); npredecessors++; } KMP_RELEASE_DEPNODE(gtid, sink); } return npredecessors; } template static inline kmp_int32 -__kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash, +__kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash, bool dep_barrier, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_task_t *task) { KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d processing %d dependencies : " "dep_barrier = %d\n", filter, gtid, ndeps, dep_barrier)); kmp_info_t *thread = __kmp_threads[gtid]; kmp_int32 npredecessors = 0; for (kmp_int32 i = 0; i < ndeps; i++) { const kmp_depend_info_t *dep = &dep_list[i]; if (filter && dep->base_addr == 0) continue; // skip filtered entries kmp_dephash_entry_t *info = __kmp_dephash_find(thread, hash, dep->base_addr); kmp_depnode_t *last_out = info->last_out; kmp_depnode_list_t *last_ins = info->last_ins; kmp_depnode_list_t *last_mtxs = info->last_mtxs; if (dep->flags.out) { // out --> clean lists of ins and mtxs if any if (last_ins || last_mtxs) { if (info->last_flag == ENTRY_LAST_INS) { // INS were last npredecessors += __kmp_depnode_link_successor(gtid, thread, task, node, last_ins); } else { // MTXS were last npredecessors += __kmp_depnode_link_successor(gtid, thread, task, node, last_mtxs); } __kmp_depnode_list_free(thread, last_ins); __kmp_depnode_list_free(thread, last_mtxs); info->last_ins = NULL; info->last_mtxs = NULL; } else { npredecessors += __kmp_depnode_link_successor(gtid, thread, task, node, last_out); } __kmp_node_deref(thread, last_out); if (dep_barrier) { // if this is a sync point in the serial sequence, then the previous // outputs are guaranteed to be completed after the execution of this // task so the previous output nodes can be cleared. 
info->last_out = NULL; } else { info->last_out = __kmp_node_ref(node); } } else if (dep->flags.in) { // in --> link node to either last_out or last_mtxs, clean earlier deps if (last_mtxs) { npredecessors += __kmp_depnode_link_successor(gtid, thread, task, node, last_mtxs); __kmp_node_deref(thread, last_out); info->last_out = NULL; if (info->last_flag == ENTRY_LAST_MTXS && last_ins) { // MTXS were last // clean old INS before creating new list __kmp_depnode_list_free(thread, last_ins); info->last_ins = NULL; } } else { // link node as successor of the last_out if any npredecessors += __kmp_depnode_link_successor(gtid, thread, task, node, last_out); } info->last_flag = ENTRY_LAST_INS; info->last_ins = __kmp_add_node(thread, info->last_ins, node); } else { KMP_DEBUG_ASSERT(dep->flags.mtx == 1); // mtx --> link node to either last_out or last_ins, clean earlier deps if (last_ins) { npredecessors += __kmp_depnode_link_successor(gtid, thread, task, node, last_ins); __kmp_node_deref(thread, last_out); info->last_out = NULL; if (info->last_flag == ENTRY_LAST_INS && last_mtxs) { // INS were last // clean old MTXS before creating new list __kmp_depnode_list_free(thread, last_mtxs); info->last_mtxs = NULL; } } else { // link node as successor of the last_out if any npredecessors += __kmp_depnode_link_successor(gtid, thread, task, node, last_out); } info->last_flag = ENTRY_LAST_MTXS; info->last_mtxs = __kmp_add_node(thread, info->last_mtxs, node); if (info->mtx_lock == NULL) { info->mtx_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); __kmp_init_lock(info->mtx_lock); } KMP_DEBUG_ASSERT(node->dn.mtx_num_locks < MAX_MTX_DEPS); kmp_int32 m; // Save lock in node's array for (m = 0; m < MAX_MTX_DEPS; ++m) { // sort pointers in decreasing order to avoid potential livelock if (node->dn.mtx_locks[m] < info->mtx_lock) { KMP_DEBUG_ASSERT(node->dn.mtx_locks[node->dn.mtx_num_locks] == NULL); for (int n = node->dn.mtx_num_locks; n > m; --n) { // shift right all lesser non-NULL 
pointers KMP_DEBUG_ASSERT(node->dn.mtx_locks[n - 1] != NULL); node->dn.mtx_locks[n] = node->dn.mtx_locks[n - 1]; } node->dn.mtx_locks[m] = info->mtx_lock; break; } } KMP_DEBUG_ASSERT(m < MAX_MTX_DEPS); // must break from loop node->dn.mtx_num_locks++; } } KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d found %d predecessors\n", filter, gtid, npredecessors)); return npredecessors; } #define NO_DEP_BARRIER (false) #define DEP_BARRIER (true) // returns true if the task has any outstanding dependence static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node, - kmp_task_t *task, kmp_dephash_t *hash, + kmp_task_t *task, kmp_dephash_t **hash, bool dep_barrier, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) { int i, n_mtxs = 0; #if KMP_DEBUG kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); #endif KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependencies for task %p : %d " "possibly aliased dependencies, %d non-aliased depedencies : " "dep_barrier=%d .\n", gtid, taskdata, ndeps, ndeps_noalias, dep_barrier)); // Filter deps in dep_list // TODO: Different algorithm for large dep_list ( > 10 ? 
) for (i = 0; i < ndeps; i++) { if (dep_list[i].base_addr != 0) { for (int j = i + 1; j < ndeps; j++) { if (dep_list[i].base_addr == dep_list[j].base_addr) { dep_list[i].flags.in |= dep_list[j].flags.in; dep_list[i].flags.out |= (dep_list[j].flags.out || (dep_list[i].flags.in && dep_list[j].flags.mtx) || (dep_list[i].flags.mtx && dep_list[j].flags.in)); dep_list[i].flags.mtx = dep_list[i].flags.mtx | dep_list[j].flags.mtx && !dep_list[i].flags.out; dep_list[j].base_addr = 0; // Mark j element as void } } if (dep_list[i].flags.mtx) { // limit number of mtx deps to MAX_MTX_DEPS per node if (n_mtxs < MAX_MTX_DEPS && task != NULL) { ++n_mtxs; } else { dep_list[i].flags.in = 1; // downgrade mutexinoutset to inout dep_list[i].flags.out = 1; dep_list[i].flags.mtx = 0; } } } } // doesn't need to be atomic as no other thread is going to be accessing this // node just yet. // npredecessors is set -1 to ensure that none of the releasing tasks queues // this task before we have finished processing all the dependencies node->dn.npredecessors = -1; // used to pack all npredecessors additions into a single atomic operation at // the end int npredecessors; npredecessors = __kmp_process_deps(gtid, node, hash, dep_barrier, ndeps, dep_list, task); npredecessors += __kmp_process_deps( gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task); node->dn.task = task; KMP_MB(); // Account for our initial fake value npredecessors++; // Update predecessors and obtain current value to check if there are still // any outstandig dependences (some tasks may have finished while we processed // the dependences) npredecessors = node->dn.npredecessors.fetch_add(npredecessors) + npredecessors; KA_TRACE(20, ("__kmp_check_deps: T#%d found %d predecessors for task %p \n", gtid, npredecessors, taskdata)); // beyond this point the task could be queued (and executed) by a releasing // task... return npredecessors > 0 ? true : false; } /*! 
@ingroup TASKING @param loc_ref location of the original task directive @param gtid Global Thread ID of encountering thread @param new_task task thunk allocated by __kmp_omp_task_alloc() for the ''new task'' @param ndeps Number of depend items with possible aliasing @param dep_list List of depend items with possible aliasing @param ndeps_noalias Number of depend items with no aliasing @param noalias_dep_list List of depend items with no aliasing @return Returns either TASK_CURRENT_NOT_QUEUED if the current task was not suspendend and queued, or TASK_CURRENT_QUEUED if it was suspended and queued Schedule a non-thread-switchable task with dependences for execution */ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) { kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task); KA_TRACE(10, ("__kmpc_omp_task_with_deps(enter): T#%d loc=%p task=%p\n", gtid, loc_ref, new_taskdata)); kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *current_task = thread->th.th_current_task; #if OMPT_SUPPORT if (ompt_enabled.enabled) { OMPT_STORE_RETURN_ADDRESS(gtid); if (!current_task->ompt_task_info.frame.enter_frame.ptr) current_task->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); if (ompt_enabled.ompt_callback_task_create) { ompt_data_t task_data = ompt_data_none; ompt_callbacks.ompt_callback(ompt_callback_task_create)( current_task ? &(current_task->ompt_task_info.task_data) : &task_data, current_task ? 
&(current_task->ompt_task_info.frame) : NULL, &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 1, OMPT_LOAD_RETURN_ADDRESS(gtid)); } new_taskdata->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); } #if OMPT_OPTIONAL /* OMPT grab all dependences if requested by the tool */ if (ndeps + ndeps_noalias > 0 && ompt_enabled.ompt_callback_dependences) { kmp_int32 i; new_taskdata->ompt_task_info.ndeps = ndeps + ndeps_noalias; new_taskdata->ompt_task_info.deps = (ompt_dependence_t *)KMP_OMPT_DEPS_ALLOC( thread, (ndeps + ndeps_noalias) * sizeof(ompt_dependence_t)); KMP_ASSERT(new_taskdata->ompt_task_info.deps != NULL); for (i = 0; i < ndeps; i++) { new_taskdata->ompt_task_info.deps[i].variable.ptr = (void *)dep_list[i].base_addr; if (dep_list[i].flags.in && dep_list[i].flags.out) new_taskdata->ompt_task_info.deps[i].dependence_type = ompt_dependence_type_inout; else if (dep_list[i].flags.out) new_taskdata->ompt_task_info.deps[i].dependence_type = ompt_dependence_type_out; else if (dep_list[i].flags.in) new_taskdata->ompt_task_info.deps[i].dependence_type = ompt_dependence_type_in; } for (i = 0; i < ndeps_noalias; i++) { new_taskdata->ompt_task_info.deps[ndeps + i].variable.ptr = (void *)noalias_dep_list[i].base_addr; if (noalias_dep_list[i].flags.in && noalias_dep_list[i].flags.out) new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = ompt_dependence_type_inout; else if (noalias_dep_list[i].flags.out) new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = ompt_dependence_type_out; else if (noalias_dep_list[i].flags.in) new_taskdata->ompt_task_info.deps[ndeps + i].dependence_type = ompt_dependence_type_in; } ompt_callbacks.ompt_callback(ompt_callback_dependences)( &(new_taskdata->ompt_task_info.task_data), new_taskdata->ompt_task_info.deps, new_taskdata->ompt_task_info.ndeps); /* We can now free the allocated memory for the dependencies */ /* For OMPD we might want to delay the free until 
task_end */ KMP_OMPT_DEPS_FREE(thread, new_taskdata->ompt_task_info.deps); new_taskdata->ompt_task_info.deps = NULL; new_taskdata->ompt_task_info.ndeps = 0; } #endif /* OMPT_OPTIONAL */ #endif /* OMPT_SUPPORT */ bool serial = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final; kmp_task_team_t *task_team = thread->th.th_task_team; serial = serial && !(task_team && task_team->tt.tt_found_proxy_tasks); if (!serial && (ndeps > 0 || ndeps_noalias > 0)) { /* if no dependencies have been tracked yet, create the dependence hash */ if (current_task->td_dephash == NULL) current_task->td_dephash = __kmp_dephash_create(thread, current_task); #if USE_FAST_MEMORY kmp_depnode_t *node = (kmp_depnode_t *)__kmp_fast_allocate(thread, sizeof(kmp_depnode_t)); #else kmp_depnode_t *node = (kmp_depnode_t *)__kmp_thread_malloc(thread, sizeof(kmp_depnode_t)); #endif __kmp_init_node(node); new_taskdata->td_depnode = node; - if (__kmp_check_deps(gtid, node, new_task, current_task->td_dephash, + if (__kmp_check_deps(gtid, node, new_task, ¤t_task->td_dephash, NO_DEP_BARRIER, ndeps, dep_list, ndeps_noalias, noalias_dep_list)) { KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had blocking " "dependencies: " "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", gtid, loc_ref, new_taskdata)); #if OMPT_SUPPORT if (ompt_enabled.enabled) { current_task->ompt_task_info.frame.enter_frame = ompt_data_none; } #endif return TASK_CURRENT_NOT_QUEUED; } } else { KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d ignored dependencies " "for task (serialized)" "loc=%p task=%p\n", gtid, loc_ref, new_taskdata)); } KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking " "dependencies : " "loc=%p task=%p, transferring to __kmp_omp_task\n", gtid, loc_ref, new_taskdata)); kmp_int32 ret = __kmp_omp_task(gtid, new_task, true); #if OMPT_SUPPORT if (ompt_enabled.enabled) { current_task->ompt_task_info.frame.enter_frame = ompt_data_none; 
} #endif return ret; } /*! @ingroup TASKING @param loc_ref location of the original task directive @param gtid Global Thread ID of encountering thread @param ndeps Number of depend items with possible aliasing @param dep_list List of depend items with possible aliasing @param ndeps_noalias Number of depend items with no aliasing @param noalias_dep_list List of depend items with no aliasing Blocks the current task until all specifies dependencies have been fulfilled. */ void __kmpc_omp_wait_deps(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) { KA_TRACE(10, ("__kmpc_omp_wait_deps(enter): T#%d loc=%p\n", gtid, loc_ref)); if (ndeps == 0 && ndeps_noalias == 0) { KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no dependencies to " "wait upon : loc=%p\n", gtid, loc_ref)); return; } kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *current_task = thread->th.th_current_task; // We can return immediately as: // - dependences are not computed in serial teams (except with proxy tasks) // - if the dephash is not yet created it means we have nothing to wait for bool ignore = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final; ignore = ignore && thread->th.th_task_team != NULL && thread->th.th_task_team->tt.tt_found_proxy_tasks == FALSE; ignore = ignore || current_task->td_dephash == NULL; if (ignore) { KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking " "dependencies : loc=%p\n", gtid, loc_ref)); return; } kmp_depnode_t node = {0}; __kmp_init_node(&node); - if (!__kmp_check_deps(gtid, &node, NULL, current_task->td_dephash, + if (!__kmp_check_deps(gtid, &node, NULL, ¤t_task->td_dephash, DEP_BARRIER, ndeps, dep_list, ndeps_noalias, noalias_dep_list)) { KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking " "dependencies : loc=%p\n", gtid, loc_ref)); return; } int thread_finished = FALSE; 
kmp_flag_32 flag((std::atomic *)&node.dn.npredecessors, 0U); while (node.dn.npredecessors > 0) { flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(NULL), __kmp_task_stealing_constraint); } KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d finished waiting : loc=%p\n", gtid, loc_ref)); } Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_wait_release.h =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_wait_release.h (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/kmp_wait_release.h (revision 357059) @@ -1,932 +1,935 @@ /* * kmp_wait_release.h -- Wait/Release implementation */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef KMP_WAIT_RELEASE_H #define KMP_WAIT_RELEASE_H #include "kmp.h" #include "kmp_itt.h" #include "kmp_stats.h" #if OMPT_SUPPORT #include "ompt-specific.h" #endif /*! @defgroup WAIT_RELEASE Wait/Release operations The definitions and functions here implement the lowest level thread synchronizations of suspending a thread and awaking it. They are used to build higher level operations such as barriers and fork/join. */ /*! @ingroup WAIT_RELEASE @{ */ /*! * The flag_type describes the storage used for the flag. */ enum flag_type { flag32, /**< 32 bit flags */ flag64, /**< 64 bit flags */ flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */ }; /*! 
* Base class for wait/release volatile flag */ template class kmp_flag_native { volatile P *loc; flag_type t; public: typedef P flag_t; kmp_flag_native(volatile P *p, flag_type ft) : loc(p), t(ft) {} volatile P *get() { return loc; } void *get_void_p() { return RCAST(void *, CCAST(P *, loc)); } void set(volatile P *new_loc) { loc = new_loc; } flag_type get_type() { return t; } P load() { return *loc; } void store(P val) { *loc = val; } }; /*! * Base class for wait/release atomic flag */ template class kmp_flag { std::atomic

*loc; /**< Pointer to the flag storage that is modified by another thread */ flag_type t; /**< "Type" of the flag in loc */ public: typedef P flag_t; kmp_flag(std::atomic

*p, flag_type ft) : loc(p), t(ft) {} /*! * @result the pointer to the actual flag */ std::atomic

*get() { return loc; } /*! * @result void* pointer to the actual flag */ void *get_void_p() { return RCAST(void *, loc); } /*! * @param new_loc in set loc to point at new_loc */ void set(std::atomic

*new_loc) { loc = new_loc; } /*! * @result the flag_type */ flag_type get_type() { return t; } /*! * @result flag value */ P load() { return loc->load(std::memory_order_acquire); } /*! * @param val the new flag value to be stored */ void store(P val) { loc->store(val, std::memory_order_release); } // Derived classes must provide the following: /* kmp_info_t * get_waiter(kmp_uint32 i); kmp_uint32 get_num_waiters(); bool done_check(); bool done_check_val(P old_loc); bool notdone_check(); P internal_release(); void suspend(int th_gtid); void resume(int th_gtid); P set_sleeping(); P unset_sleeping(); bool is_sleeping(); bool is_any_sleeping(); bool is_sleeping_val(P old_loc); int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained); */ }; #if OMPT_SUPPORT OMPT_NOINLINE static void __ompt_implicit_task_end(kmp_info_t *this_thr, ompt_state_t ompt_state, ompt_data_t *tId) { int ds_tid = this_thr->th.th_info.ds.ds_tid; if (ompt_state == ompt_state_wait_barrier_implicit) { this_thr->th.ompt_thread_info.state = ompt_state_overhead; #if OMPT_OPTIONAL void *codeptr = NULL; if (ompt_enabled.ompt_callback_sync_region_wait) { ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId, codeptr); } if (ompt_enabled.ompt_callback_sync_region) { ompt_callbacks.ompt_callback(ompt_callback_sync_region)( ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, tId, codeptr); } #endif if (!KMP_MASTER_TID(ds_tid)) { if (ompt_enabled.ompt_callback_implicit_task) { + int flags = this_thr->th.ompt_thread_info.parallel_flags; + flags = (flags & ompt_parallel_league) ? 
ompt_task_initial + : ompt_task_implicit; ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( - ompt_scope_end, NULL, tId, 0, ds_tid, ompt_task_implicit); + ompt_scope_end, NULL, tId, 0, ds_tid, flags); } // return to idle state this_thr->th.ompt_thread_info.state = ompt_state_idle; } else { this_thr->th.ompt_thread_info.state = ompt_state_overhead; } } } #endif /* Spin wait loop that first does pause/yield, then sleep. A thread that calls __kmp_wait_* must make certain that another thread calls __kmp_release to wake it back up to prevent deadlocks! NOTE: We may not belong to a team at this point. */ template static inline bool __kmp_wait_template(kmp_info_t *this_thr, C *flag USE_ITT_BUILD_ARG(void *itt_sync_obj)) { #if USE_ITT_BUILD && USE_ITT_NOTIFY volatile void *spin = flag->get(); #endif kmp_uint32 spins; int th_gtid; int tasks_completed = FALSE; int oversubscribed; #if !KMP_USE_MONITOR kmp_uint64 poll_count; kmp_uint64 hibernate_goal; #else kmp_uint32 hibernate; #endif KMP_FSYNC_SPIN_INIT(spin, NULL); if (flag->done_check()) { KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin)); return false; } th_gtid = this_thr->th.th_info.ds.ds_gtid; if (cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == cancel_parallel) return true; } #if KMP_OS_UNIX if (final_spin) KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true); #endif KA_TRACE(20, ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag)); #if KMP_STATS_ENABLED stats_state_e thread_state = KMP_GET_THREAD_STATE(); #endif /* OMPT Behavior: THIS function is called from __kmp_barrier (2 times) (implicit or explicit barrier in parallel regions) these have join / fork behavior In these cases, we don't change the state or trigger events in THIS function. 
Events are triggered in the calling code (__kmp_barrier): state := ompt_state_overhead barrier-begin barrier-wait-begin state := ompt_state_wait_barrier call join-barrier-implementation (finally arrive here) {} call fork-barrier-implementation (finally arrive here) {} state := ompt_state_overhead barrier-wait-end barrier-end state := ompt_state_work_parallel __kmp_fork_barrier (after thread creation, before executing implicit task) call fork-barrier-implementation (finally arrive here) {} // worker arrive here with state = ompt_state_idle __kmp_join_barrier (implicit barrier at end of parallel region) state := ompt_state_barrier_implicit barrier-begin barrier-wait-begin call join-barrier-implementation (finally arrive here final_spin=FALSE) { } __kmp_fork_barrier (implicit barrier at end of parallel region) call fork-barrier-implementation (finally arrive here final_spin=TRUE) Worker after task-team is finished: barrier-wait-end barrier-end implicit-task-end idle-begin state := ompt_state_idle Before leaving, if state = ompt_state_idle idle-end state := ompt_state_overhead */ #if OMPT_SUPPORT ompt_state_t ompt_entry_state; ompt_data_t *tId; if (ompt_enabled.enabled) { ompt_entry_state = this_thr->th.ompt_thread_info.state; if (!final_spin || ompt_entry_state != ompt_state_wait_barrier_implicit || KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)) { ompt_lw_taskteam_t *team = this_thr->th.th_team->t.ompt_serialized_team_info; if (team) { tId = &(team->ompt_task_info.task_data); } else { tId = OMPT_CUR_TASK_DATA(this_thr); } } else { tId = &(this_thr->th.ompt_thread_info.task_data); } if (final_spin && (__kmp_tasking_mode == tskm_immediate_exec || this_thr->th.th_task_team == NULL)) { // implicit task is done. 
Either no taskqueue, or task-team finished __ompt_implicit_task_end(this_thr, ompt_entry_state, tId); } } #endif KMP_INIT_YIELD(spins); // Setup for waiting if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || __kmp_pause_status == kmp_soft_paused) { #if KMP_USE_MONITOR // The worker threads cannot rely on the team struct existing at this point. // Use the bt values cached in the thread struct instead. #ifdef KMP_ADJUST_BLOCKTIME if (__kmp_pause_status == kmp_soft_paused || (__kmp_zero_bt && !this_thr->th.th_team_bt_set)) // Force immediate suspend if not set by user and more threads than // available procs hibernate = 0; else hibernate = this_thr->th.th_team_bt_intervals; #else hibernate = this_thr->th.th_team_bt_intervals; #endif /* KMP_ADJUST_BLOCKTIME */ /* If the blocktime is nonzero, we want to make sure that we spin wait for the entirety of the specified #intervals, plus up to one interval more. This increment make certain that this thread doesn't go to sleep too soon. */ if (hibernate != 0) hibernate++; // Add in the current time value. hibernate += TCR_4(__kmp_global.g.g_time.dt.t_value); KF_TRACE(20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n", th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate, hibernate - __kmp_global.g.g_time.dt.t_value)); #else if (__kmp_pause_status == kmp_soft_paused) { // Force immediate suspend hibernate_goal = KMP_NOW(); } else hibernate_goal = KMP_NOW() + this_thr->th.th_team_bt_intervals; poll_count = 0; #endif // KMP_USE_MONITOR } oversubscribed = (TCR_4(__kmp_nth) > __kmp_avail_proc); KMP_MB(); // Main wait spin loop while (flag->notdone_check()) { kmp_task_team_t *task_team = NULL; if (__kmp_tasking_mode != tskm_immediate_exec) { task_team = this_thr->th.th_task_team; /* If the thread's task team pointer is NULL, it means one of 3 things: 1) A newly-created thread is first being released by __kmp_fork_barrier(), and its task team has not been set up yet. 2) All tasks have been executed to completion. 
3) Tasking is off for this region. This could be because we are in a serialized region (perhaps the outer one), or else tasking was manually disabled (KMP_TASKING=0). */ if (task_team != NULL) { if (TCR_SYNC_4(task_team->tt.tt_active)) { if (KMP_TASKING_ENABLED(task_team)) flag->execute_tasks( this_thr, th_gtid, final_spin, &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0); else this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; } else { KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)); #if OMPT_SUPPORT // task-team is done now, other cases should be catched above if (final_spin && ompt_enabled.enabled) __ompt_implicit_task_end(this_thr, ompt_entry_state, tId); #endif this_thr->th.th_task_team = NULL; this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; } } else { this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; } // if } // if KMP_FSYNC_SPIN_PREPARE(CCAST(void *, spin)); if (TCR_4(__kmp_global.g.g_done)) { if (__kmp_global.g.g_abort) __kmp_abort_thread(); break; } // If we are oversubscribed, or have waited a bit (and // KMP_LIBRARY=throughput), then yield KMP_YIELD_OVERSUB_ELSE_SPIN(spins); #if KMP_STATS_ENABLED // Check if thread has been signalled to idle state // This indicates that the logical "join-barrier" has finished if (this_thr->th.th_stats->isIdle() && KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) { KMP_SET_THREAD_STATE(IDLE); KMP_PUSH_PARTITIONED_TIMER(OMP_idle); } #endif // Check if the barrier surrounding this wait loop has been cancelled if (cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == cancel_parallel) break; } // Don't suspend if KMP_BLOCKTIME is set to "infinite" if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) continue; // Don't suspend if there is a likelihood of new tasks being spawned. 
if ((task_team != NULL) && TCR_4(task_team->tt.tt_found_tasks)) continue; #if KMP_USE_MONITOR // If we have waited a bit more, fall asleep if (TCR_4(__kmp_global.g.g_time.dt.t_value) < hibernate) continue; #else if (KMP_BLOCKING(hibernate_goal, poll_count++)) continue; #endif // Don't suspend if wait loop designated non-sleepable // in template parameters if (!sleepable) continue; if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) continue; KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid)); #if KMP_OS_UNIX if (final_spin) KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); #endif flag->suspend(th_gtid); #if KMP_OS_UNIX if (final_spin) KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, true); #endif if (TCR_4(__kmp_global.g.g_done)) { if (__kmp_global.g.g_abort) __kmp_abort_thread(); break; } else if (__kmp_tasking_mode != tskm_immediate_exec && this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) { this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; } // TODO: If thread is done with work and times out, disband/free } #if OMPT_SUPPORT ompt_state_t ompt_exit_state = this_thr->th.ompt_thread_info.state; if (ompt_enabled.enabled && ompt_exit_state != ompt_state_undefined) { #if OMPT_OPTIONAL if (final_spin) { __ompt_implicit_task_end(this_thr, ompt_exit_state, tId); ompt_exit_state = this_thr->th.ompt_thread_info.state; } #endif if (ompt_exit_state == ompt_state_idle) { this_thr->th.ompt_thread_info.state = ompt_state_overhead; } } #endif #if KMP_STATS_ENABLED // If we were put into idle state, pop that off the state stack if (KMP_GET_THREAD_STATE() == IDLE) { KMP_POP_PARTITIONED_TIMER(); KMP_SET_THREAD_STATE(thread_state); this_thr->th.th_stats->resetIdleFlag(); } #endif #if KMP_OS_UNIX if (final_spin) KMP_ATOMIC_ST_REL(&this_thr->th.th_blocking, false); #endif KMP_FSYNC_SPIN_ACQUIRED(CCAST(void *, spin)); if (cancellable) { kmp_team_t *team = this_thr->th.th_team; if (team && team->t.t_cancel_request == 
cancel_parallel) { if (tasks_completed) { // undo the previous decrement of unfinished_threads so that the // thread can decrement at the join barrier with no problem kmp_task_team_t *task_team = this_thr->th.th_task_team; std::atomic *unfinished_threads = &(task_team->tt.tt_unfinished_threads); KMP_ATOMIC_INC(unfinished_threads); } return true; } } return false; } /* Release any threads specified as waiting on the flag by releasing the flag and resume the waiting thread if indicated by the sleep bit(s). A thread that calls __kmp_wait_template must call this function to wake up the potentially sleeping thread and prevent deadlocks! */ template static inline void __kmp_release_template(C *flag) { #ifdef KMP_DEBUG int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1; #endif KF_TRACE(20, ("__kmp_release: T#%d releasing flag(%x)\n", gtid, flag->get())); KMP_DEBUG_ASSERT(flag->get()); KMP_FSYNC_RELEASING(flag->get_void_p()); flag->internal_release(); KF_TRACE(100, ("__kmp_release: T#%d set new spin=%d\n", gtid, flag->get(), flag->load())); if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Only need to check sleep stuff if infinite block time not set. // Are *any* threads waiting on flag sleeping? 
if (flag->is_any_sleeping()) { for (unsigned int i = 0; i < flag->get_num_waiters(); ++i) { // if sleeping waiter exists at i, sets current_waiter to i inside flag kmp_info_t *waiter = flag->get_waiter(i); if (waiter) { int wait_gtid = waiter->th.th_info.ds.ds_gtid; // Wake up thread if needed KF_TRACE(50, ("__kmp_release: T#%d waking up thread T#%d since sleep " "flag(%p) set\n", gtid, wait_gtid, flag->get())); flag->resume(wait_gtid); // unsets flag's current_waiter when done } } } } } template struct flag_traits {}; template <> struct flag_traits { typedef kmp_uint32 flag_t; static const flag_type t = flag32; static inline flag_t tcr(flag_t f) { return TCR_4(f); } static inline flag_t test_then_add4(volatile flag_t *f) { return KMP_TEST_THEN_ADD4_32(RCAST(volatile kmp_int32 *, f)); } static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_OR32(f, v); } static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_AND32(f, v); } }; template <> struct flag_traits { typedef kmp_uint64 flag_t; static const flag_type t = flag64; static inline flag_t tcr(flag_t f) { return TCR_8(f); } static inline flag_t test_then_add4(volatile flag_t *f) { return KMP_TEST_THEN_ADD4_64(RCAST(volatile kmp_int64 *, f)); } static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_OR64(f, v); } static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_AND64(f, v); } }; // Basic flag that does not use C11 Atomics template class kmp_basic_flag_native : public kmp_flag_native { typedef flag_traits traits_type; FlagType checker; /**< Value to compare flag to to check if flag has been released. */ kmp_info_t *waiting_threads[1]; /**< Array of threads sleeping on this thread. */ kmp_uint32 num_waiting_threads; /**< Number of threads sleeping on this thread. 
*/ public: kmp_basic_flag_native(volatile FlagType *p) : kmp_flag_native(p, traits_type::t), num_waiting_threads(0) {} kmp_basic_flag_native(volatile FlagType *p, kmp_info_t *thr) : kmp_flag_native(p, traits_type::t), num_waiting_threads(1) { waiting_threads[0] = thr; } kmp_basic_flag_native(volatile FlagType *p, FlagType c) : kmp_flag_native(p, traits_type::t), checker(c), num_waiting_threads(0) {} /*! * param i in index into waiting_threads * @result the thread that is waiting at index i */ kmp_info_t *get_waiter(kmp_uint32 i) { KMP_DEBUG_ASSERT(i < num_waiting_threads); return waiting_threads[i]; } /*! * @result num_waiting_threads */ kmp_uint32 get_num_waiters() { return num_waiting_threads; } /*! * @param thr in the thread which is now waiting * * Insert a waiting thread at index 0. */ void set_waiter(kmp_info_t *thr) { waiting_threads[0] = thr; num_waiting_threads = 1; } /*! * @result true if the flag object has been released. */ bool done_check() { return traits_type::tcr(*(this->get())) == checker; } /*! * @param old_loc in old value of flag * @result true if the flag's old value indicates it was released. */ bool done_check_val(FlagType old_loc) { return old_loc == checker; } /*! * @result true if the flag object is not yet released. * Used in __kmp_wait_template like: * @code * while (flag.notdone_check()) { pause(); } * @endcode */ bool notdone_check() { return traits_type::tcr(*(this->get())) != checker; } /*! * @result Actual flag value before release was applied. * Trigger all waiting threads to run by modifying flag to release state. */ void internal_release() { (void)traits_type::test_then_add4((volatile FlagType *)this->get()); } /*! * @result Actual flag value before sleep bit(s) set. * Notes that there is at least one thread sleeping on the flag by setting * sleep bit(s). */ FlagType set_sleeping() { return traits_type::test_then_or((volatile FlagType *)this->get(), KMP_BARRIER_SLEEP_STATE); } /*! 
* @result Actual flag value before sleep bit(s) cleared. * Notes that there are no longer threads sleeping on the flag by clearing * sleep bit(s). */ FlagType unset_sleeping() { return traits_type::test_then_and((volatile FlagType *)this->get(), ~KMP_BARRIER_SLEEP_STATE); } /*! * @param old_loc in old value of flag * Test whether there are threads sleeping on the flag's old value in old_loc. */ bool is_sleeping_val(FlagType old_loc) { return old_loc & KMP_BARRIER_SLEEP_STATE; } /*! * Test whether there are threads sleeping on the flag. */ bool is_sleeping() { return is_sleeping_val(*(this->get())); } bool is_any_sleeping() { return is_sleeping_val(*(this->get())); } kmp_uint8 *get_stolen() { return NULL; } enum barrier_type get_bt() { return bs_last_barrier; } }; template class kmp_basic_flag : public kmp_flag { typedef flag_traits traits_type; FlagType checker; /**< Value to compare flag to to check if flag has been released. */ kmp_info_t *waiting_threads[1]; /**< Array of threads sleeping on this thread. */ kmp_uint32 num_waiting_threads; /**< Number of threads sleeping on this thread. */ public: kmp_basic_flag(std::atomic *p) : kmp_flag(p, traits_type::t), num_waiting_threads(0) {} kmp_basic_flag(std::atomic *p, kmp_info_t *thr) : kmp_flag(p, traits_type::t), num_waiting_threads(1) { waiting_threads[0] = thr; } kmp_basic_flag(std::atomic *p, FlagType c) : kmp_flag(p, traits_type::t), checker(c), num_waiting_threads(0) {} /*! * param i in index into waiting_threads * @result the thread that is waiting at index i */ kmp_info_t *get_waiter(kmp_uint32 i) { KMP_DEBUG_ASSERT(i < num_waiting_threads); return waiting_threads[i]; } /*! * @result num_waiting_threads */ kmp_uint32 get_num_waiters() { return num_waiting_threads; } /*! * @param thr in the thread which is now waiting * * Insert a waiting thread at index 0. */ void set_waiter(kmp_info_t *thr) { waiting_threads[0] = thr; num_waiting_threads = 1; } /*! * @result true if the flag object has been released. 
*/ bool done_check() { return this->load() == checker; } /*! * @param old_loc in old value of flag * @result true if the flag's old value indicates it was released. */ bool done_check_val(FlagType old_loc) { return old_loc == checker; } /*! * @result true if the flag object is not yet released. * Used in __kmp_wait_template like: * @code * while (flag.notdone_check()) { pause(); } * @endcode */ bool notdone_check() { return this->load() != checker; } /*! * @result Actual flag value before release was applied. * Trigger all waiting threads to run by modifying flag to release state. */ void internal_release() { KMP_ATOMIC_ADD(this->get(), 4); } /*! * @result Actual flag value before sleep bit(s) set. * Notes that there is at least one thread sleeping on the flag by setting * sleep bit(s). */ FlagType set_sleeping() { return KMP_ATOMIC_OR(this->get(), KMP_BARRIER_SLEEP_STATE); } /*! * @result Actual flag value before sleep bit(s) cleared. * Notes that there are no longer threads sleeping on the flag by clearing * sleep bit(s). */ FlagType unset_sleeping() { return KMP_ATOMIC_AND(this->get(), ~KMP_BARRIER_SLEEP_STATE); } /*! * @param old_loc in old value of flag * Test whether there are threads sleeping on the flag's old value in old_loc. */ bool is_sleeping_val(FlagType old_loc) { return old_loc & KMP_BARRIER_SLEEP_STATE; } /*! * Test whether there are threads sleeping on the flag. 
*/ bool is_sleeping() { return is_sleeping_val(this->load()); } bool is_any_sleeping() { return is_sleeping_val(this->load()); } kmp_uint8 *get_stolen() { return NULL; } enum barrier_type get_bt() { return bs_last_barrier; } }; class kmp_flag_32 : public kmp_basic_flag { public: kmp_flag_32(std::atomic *p) : kmp_basic_flag(p) {} kmp_flag_32(std::atomic *p, kmp_info_t *thr) : kmp_basic_flag(p, thr) {} kmp_flag_32(std::atomic *p, kmp_uint32 c) : kmp_basic_flag(p, c) {} void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); } void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained) { return __kmp_execute_tasks_32( this_thr, gtid, this, final_spin, thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } void wait(kmp_info_t *this_thr, int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { if (final_spin) __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); else __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); } void release() { __kmp_release_template(this); } flag_type get_ptr_type() { return flag32; } }; class kmp_flag_64 : public kmp_basic_flag_native { public: kmp_flag_64(volatile kmp_uint64 *p) : kmp_basic_flag_native(p) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr) : kmp_basic_flag_native(p, thr) {} kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c) : kmp_basic_flag_native(p, c) {} void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); } void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained) { return __kmp_execute_tasks_64( this_thr, gtid, this, final_spin, thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } void wait(kmp_info_t *this_thr, int final_spin 
USE_ITT_BUILD_ARG(void *itt_sync_obj)) { if (final_spin) __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); else __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); } bool wait_cancellable_nosleep(kmp_info_t *this_thr, int final_spin USE_ITT_BUILD_ARG(void *itt_sync_obj)) { bool retval = false; if (final_spin) retval = __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); else retval = __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); return retval; } void release() { __kmp_release_template(this); } flag_type get_ptr_type() { return flag64; } }; // Hierarchical 64-bit on-core barrier instantiation class kmp_flag_oncore : public kmp_flag_native { kmp_uint64 checker; kmp_info_t *waiting_threads[1]; kmp_uint32 num_waiting_threads; kmp_uint32 offset; /**< Portion of flag that is of interest for an operation. */ bool flag_switch; /**< Indicates a switch in flag location. */ enum barrier_type bt; /**< Barrier type. */ kmp_info_t *this_thr; /**< Thread that may be redirected to different flag location. */ #if USE_ITT_BUILD void * itt_sync_obj; /**< ITT object that must be passed to new flag location. 
*/ #endif unsigned char &byteref(volatile kmp_uint64 *loc, size_t offset) { return (RCAST(unsigned char *, CCAST(kmp_uint64 *, loc)))[offset]; } public: kmp_flag_oncore(volatile kmp_uint64 *p) : kmp_flag_native(p, flag_oncore), num_waiting_threads(0), flag_switch(false) {} kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx) : kmp_flag_native(p, flag_oncore), num_waiting_threads(0), offset(idx), flag_switch(false) {} kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx, enum barrier_type bar_t, kmp_info_t *thr USE_ITT_BUILD_ARG(void *itt)) : kmp_flag_native(p, flag_oncore), checker(c), num_waiting_threads(0), offset(idx), flag_switch(false), bt(bar_t), this_thr(thr) USE_ITT_BUILD_ARG(itt_sync_obj(itt)) {} kmp_info_t *get_waiter(kmp_uint32 i) { KMP_DEBUG_ASSERT(i < num_waiting_threads); return waiting_threads[i]; } kmp_uint32 get_num_waiters() { return num_waiting_threads; } void set_waiter(kmp_info_t *thr) { waiting_threads[0] = thr; num_waiting_threads = 1; } bool done_check_val(kmp_uint64 old_loc) { return byteref(&old_loc, offset) == checker; } bool done_check() { return done_check_val(*get()); } bool notdone_check() { // Calculate flag_switch if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG) flag_switch = true; if (byteref(get(), offset) != 1 && !flag_switch) return true; else if (flag_switch) { this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING; kmp_flag_64 flag(&this_thr->th.th_bar[bt].bb.b_go, (kmp_uint64)KMP_BARRIER_STATE_BUMP); __kmp_wait_64(this_thr, &flag, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); } return false; } void internal_release() { // Other threads can write their own bytes simultaneously. 
if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { byteref(get(), offset) = 1; } else { kmp_uint64 mask = 0; byteref(&mask, offset) = 1; KMP_TEST_THEN_OR64(get(), mask); } } kmp_uint64 set_sleeping() { return KMP_TEST_THEN_OR64(get(), KMP_BARRIER_SLEEP_STATE); } kmp_uint64 unset_sleeping() { return KMP_TEST_THEN_AND64(get(), ~KMP_BARRIER_SLEEP_STATE); } bool is_sleeping_val(kmp_uint64 old_loc) { return old_loc & KMP_BARRIER_SLEEP_STATE; } bool is_sleeping() { return is_sleeping_val(*get()); } bool is_any_sleeping() { return is_sleeping_val(*get()); } void wait(kmp_info_t *this_thr, int final_spin) { if (final_spin) __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); else __kmp_wait_template( this_thr, this USE_ITT_BUILD_ARG(itt_sync_obj)); } void release() { __kmp_release_template(this); } void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); } void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); } int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained) { return __kmp_execute_tasks_oncore( this_thr, gtid, this, final_spin, thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); } kmp_uint8 *get_stolen() { return NULL; } enum barrier_type get_bt() { return bt; } flag_type get_ptr_type() { return flag_oncore; } }; // Used to wake up threads, volatile void* flag is usually the th_sleep_loc // associated with int gtid. static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) { if (!flag) return; switch (RCAST(kmp_flag_64 *, CCAST(void *, flag))->get_type()) { case flag32: __kmp_resume_32(gtid, NULL); break; case flag64: __kmp_resume_64(gtid, NULL); break; case flag_oncore: __kmp_resume_oncore(gtid, NULL); break; } } /*! 
@} */ #endif // KMP_WAIT_RELEASE_H Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-general.cpp =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-general.cpp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-general.cpp (revision 357059) @@ -1,732 +1,729 @@ /* * ompt-general.cpp -- OMPT implementation of interface functions */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// /***************************************************************************** * system include files ****************************************************************************/ #include #include #include #include #include #if KMP_OS_UNIX #include #endif /***************************************************************************** * ompt include files ****************************************************************************/ #include "ompt-specific.cpp" /***************************************************************************** * macros ****************************************************************************/ #define ompt_get_callback_success 1 #define ompt_get_callback_failure 0 #define no_tool_present 0 #define OMPT_API_ROUTINE static #ifndef OMPT_STR_MATCH #define OMPT_STR_MATCH(haystack, needle) (!strcasecmp(haystack, needle)) #endif /***************************************************************************** * types ****************************************************************************/ typedef struct { const char *state_name; ompt_state_t state_id; } ompt_state_info_t; typedef struct { const 
char *name; kmp_mutex_impl_t id; } kmp_mutex_impl_info_t; enum tool_setting_e { omp_tool_error, omp_tool_unset, omp_tool_disabled, omp_tool_enabled }; /***************************************************************************** * global variables ****************************************************************************/ ompt_callbacks_active_t ompt_enabled; ompt_state_info_t ompt_state_info[] = { #define ompt_state_macro(state, code) {#state, state}, FOREACH_OMPT_STATE(ompt_state_macro) #undef ompt_state_macro }; kmp_mutex_impl_info_t kmp_mutex_impl_info[] = { #define kmp_mutex_impl_macro(name, id) {#name, name}, FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro) #undef kmp_mutex_impl_macro }; ompt_callbacks_internal_t ompt_callbacks; static ompt_start_tool_result_t *ompt_start_tool_result = NULL; /***************************************************************************** * forward declarations ****************************************************************************/ static ompt_interface_fn_t ompt_fn_lookup(const char *s); OMPT_API_ROUTINE ompt_data_t *ompt_get_thread_data(void); /***************************************************************************** * initialization and finalization (private operations) ****************************************************************************/ typedef ompt_start_tool_result_t *(*ompt_start_tool_t)(unsigned int, const char *); #if KMP_OS_DARWIN // While Darwin supports weak symbols, the library that wishes to provide a new // implementation has to link against this runtime which defeats the purpose // of having tools that are agnostic of the underlying runtime implementation. // // Fortunately, the linker includes all symbols of an executable in the global // symbol table by default so dlsym() even finds static implementations of // ompt_start_tool. For this to work on Linux, -Wl,--export-dynamic needs to be // passed when building the application which we don't want to rely on. 
static ompt_start_tool_result_t *ompt_tool_darwin(unsigned int omp_version, const char *runtime_version) { ompt_start_tool_result_t *ret = NULL; // Search symbol in the current address space. ompt_start_tool_t start_tool = (ompt_start_tool_t)dlsym(RTLD_DEFAULT, "ompt_start_tool"); if (start_tool) { ret = start_tool(omp_version, runtime_version); } return ret; } #elif OMPT_HAVE_WEAK_ATTRIBUTE // On Unix-like systems that support weak symbols the following implementation // of ompt_start_tool() will be used in case no tool-supplied implementation of // this function is present in the address space of a process. _OMP_EXTERN OMPT_WEAK_ATTRIBUTE ompt_start_tool_result_t * ompt_start_tool(unsigned int omp_version, const char *runtime_version) { ompt_start_tool_result_t *ret = NULL; // Search next symbol in the current address space. This can happen if the // runtime library is linked before the tool. Since glibc 2.2 strong symbols // don't override weak symbols that have been found before unless the user // sets the environment variable LD_DYNAMIC_WEAK. ompt_start_tool_t next_tool = (ompt_start_tool_t)dlsym(RTLD_NEXT, "ompt_start_tool"); if (next_tool) { ret = next_tool(omp_version, runtime_version); } return ret; } #elif OMPT_HAVE_PSAPI // On Windows, the ompt_tool_windows function is used to find the // ompt_start_tool symbol across all modules loaded by a process. If // ompt_start_tool is found, ompt_start_tool's return value is used to // initialize the tool. Otherwise, NULL is returned and OMPT won't be enabled. 
#include #pragma comment(lib, "psapi.lib") // The number of loaded modules to start enumeration with EnumProcessModules() #define NUM_MODULES 128 static ompt_start_tool_result_t * ompt_tool_windows(unsigned int omp_version, const char *runtime_version) { int i; DWORD needed, new_size; HMODULE *modules; HANDLE process = GetCurrentProcess(); modules = (HMODULE *)malloc(NUM_MODULES * sizeof(HMODULE)); ompt_start_tool_t ompt_tool_p = NULL; #if OMPT_DEBUG printf("ompt_tool_windows(): looking for ompt_start_tool\n"); #endif if (!EnumProcessModules(process, modules, NUM_MODULES * sizeof(HMODULE), &needed)) { // Regardless of the error reason use the stub initialization function free(modules); return NULL; } // Check if NUM_MODULES is enough to list all modules new_size = needed / sizeof(HMODULE); if (new_size > NUM_MODULES) { #if OMPT_DEBUG printf("ompt_tool_windows(): resize buffer to %d bytes\n", needed); #endif modules = (HMODULE *)realloc(modules, needed); // If resizing failed use the stub function. if (!EnumProcessModules(process, modules, needed, &needed)) { free(modules); return NULL; } } for (i = 0; i < new_size; ++i) { (FARPROC &)ompt_tool_p = GetProcAddress(modules[i], "ompt_start_tool"); if (ompt_tool_p) { #if OMPT_DEBUG TCHAR modName[MAX_PATH]; if (GetModuleFileName(modules[i], modName, MAX_PATH)) printf("ompt_tool_windows(): ompt_start_tool found in module %s\n", modName); #endif free(modules); return (*ompt_tool_p)(omp_version, runtime_version); } #if OMPT_DEBUG else { TCHAR modName[MAX_PATH]; if (GetModuleFileName(modules[i], modName, MAX_PATH)) printf("ompt_tool_windows(): ompt_start_tool not found in module %s\n", modName); } #endif } free(modules); return NULL; } #else #error Activation of OMPT is not supported on this platform. 
#endif static ompt_start_tool_result_t * ompt_try_start_tool(unsigned int omp_version, const char *runtime_version) { ompt_start_tool_result_t *ret = NULL; ompt_start_tool_t start_tool = NULL; #if KMP_OS_WINDOWS // Cannot use colon to describe a list of absolute paths on Windows const char *sep = ";"; #else const char *sep = ":"; #endif #if KMP_OS_DARWIN // Try in the current address space ret = ompt_tool_darwin(omp_version, runtime_version); #elif OMPT_HAVE_WEAK_ATTRIBUTE ret = ompt_start_tool(omp_version, runtime_version); #elif OMPT_HAVE_PSAPI ret = ompt_tool_windows(omp_version, runtime_version); #else #error Activation of OMPT is not supported on this platform. #endif if (ret) return ret; // Try tool-libraries-var ICV const char *tool_libs = getenv("OMP_TOOL_LIBRARIES"); if (tool_libs) { char *libs = __kmp_str_format("%s", tool_libs); char *buf; char *fname = __kmp_str_token(libs, sep, &buf); while (fname) { #if KMP_OS_UNIX void *h = dlopen(fname, RTLD_LAZY); if (h) { start_tool = (ompt_start_tool_t)dlsym(h, "ompt_start_tool"); #elif KMP_OS_WINDOWS HMODULE h = LoadLibrary(fname); if (h) { start_tool = (ompt_start_tool_t)GetProcAddress(h, "ompt_start_tool"); #else #error Activation of OMPT is not supported on this platform. #endif if (start_tool && (ret = (*start_tool)(omp_version, runtime_version))) break; } fname = __kmp_str_token(NULL, sep, &buf); } __kmp_str_free(&libs); } return ret; } void ompt_pre_init() { //-------------------------------------------------- // Execute the pre-initialization logic only once. //-------------------------------------------------- static int ompt_pre_initialized = 0; if (ompt_pre_initialized) return; ompt_pre_initialized = 1; //-------------------------------------------------- // Use a tool iff a tool is enabled and available. 
//-------------------------------------------------- const char *ompt_env_var = getenv("OMP_TOOL"); tool_setting_e tool_setting = omp_tool_error; if (!ompt_env_var || !strcmp(ompt_env_var, "")) tool_setting = omp_tool_unset; else if (OMPT_STR_MATCH(ompt_env_var, "disabled")) tool_setting = omp_tool_disabled; else if (OMPT_STR_MATCH(ompt_env_var, "enabled")) tool_setting = omp_tool_enabled; #if OMPT_DEBUG printf("ompt_pre_init(): tool_setting = %d\n", tool_setting); #endif switch (tool_setting) { case omp_tool_disabled: break; case omp_tool_unset: case omp_tool_enabled: //-------------------------------------------------- // Load tool iff specified in environment variable //-------------------------------------------------- ompt_start_tool_result = ompt_try_start_tool(__kmp_openmp_version, ompt_get_runtime_version()); memset(&ompt_enabled, 0, sizeof(ompt_enabled)); break; case omp_tool_error: fprintf(stderr, "Warning: OMP_TOOL has invalid value \"%s\".\n" " legal values are (NULL,\"\",\"disabled\"," "\"enabled\").\n", ompt_env_var); break; } #if OMPT_DEBUG printf("ompt_pre_init(): ompt_enabled = %d\n", ompt_enabled); #endif } extern "C" int omp_get_initial_device(void); void ompt_post_init() { //-------------------------------------------------- // Execute the post-initialization logic only once. //-------------------------------------------------- static int ompt_post_initialized = 0; if (ompt_post_initialized) return; ompt_post_initialized = 1; //-------------------------------------------------- // Initialize the tool if so indicated. 
//-------------------------------------------------- if (ompt_start_tool_result) { ompt_enabled.enabled = !!ompt_start_tool_result->initialize( ompt_fn_lookup, omp_get_initial_device(), &(ompt_start_tool_result->tool_data)); if (!ompt_enabled.enabled) { // tool not enabled, zero out the bitmap, and done memset(&ompt_enabled, 0, sizeof(ompt_enabled)); return; } kmp_info_t *root_thread = ompt_get_thread(); ompt_set_thread_state(root_thread, ompt_state_overhead); if (ompt_enabled.ompt_callback_thread_begin) { ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( ompt_thread_initial, __ompt_get_thread_data_internal()); } ompt_data_t *task_data; ompt_data_t *parallel_data; __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, NULL); if (ompt_enabled.ompt_callback_implicit_task) { ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); } ompt_set_thread_state(root_thread, ompt_state_work_serial); } } void ompt_fini() { if (ompt_enabled.enabled) { ompt_start_tool_result->finalize(&(ompt_start_tool_result->tool_data)); } memset(&ompt_enabled, 0, sizeof(ompt_enabled)); } /***************************************************************************** * interface operations ****************************************************************************/ /***************************************************************************** * state ****************************************************************************/ OMPT_API_ROUTINE int ompt_enumerate_states(int current_state, int *next_state, const char **next_state_name) { const static int len = sizeof(ompt_state_info) / sizeof(ompt_state_info_t); int i = 0; for (i = 0; i < len - 1; i++) { if (ompt_state_info[i].state_id == current_state) { *next_state = ompt_state_info[i + 1].state_id; *next_state_name = ompt_state_info[i + 1].state_name; return 1; } } return 0; } OMPT_API_ROUTINE int ompt_enumerate_mutex_impls(int current_impl, int 
*next_impl, const char **next_impl_name) { const static int len = sizeof(kmp_mutex_impl_info) / sizeof(kmp_mutex_impl_info_t); int i = 0; for (i = 0; i < len - 1; i++) { if (kmp_mutex_impl_info[i].id != current_impl) continue; *next_impl = kmp_mutex_impl_info[i + 1].id; *next_impl_name = kmp_mutex_impl_info[i + 1].name; return 1; } return 0; } /***************************************************************************** * callbacks ****************************************************************************/ OMPT_API_ROUTINE ompt_set_result_t ompt_set_callback(ompt_callbacks_t which, ompt_callback_t callback) { switch (which) { #define ompt_event_macro(event_name, callback_type, event_id) \ case event_name: \ - if (ompt_event_implementation_status(event_name)) { \ - ompt_callbacks.ompt_callback(event_name) = (callback_type)callback; \ - ompt_enabled.event_name = (callback != 0); \ - } \ + ompt_callbacks.ompt_callback(event_name) = (callback_type)callback; \ + ompt_enabled.event_name = (callback != 0); \ if (callback) \ return ompt_event_implementation_status(event_name); \ else \ return ompt_set_always; FOREACH_OMPT_EVENT(ompt_event_macro) #undef ompt_event_macro default: return ompt_set_error; } } OMPT_API_ROUTINE int ompt_get_callback(ompt_callbacks_t which, ompt_callback_t *callback) { if (!ompt_enabled.enabled) return ompt_get_callback_failure; switch (which) { #define ompt_event_macro(event_name, callback_type, event_id) \ - case event_name: \ - if (ompt_event_implementation_status(event_name)) { \ - ompt_callback_t mycb = \ - (ompt_callback_t)ompt_callbacks.ompt_callback(event_name); \ - if (ompt_enabled.event_name && mycb) { \ - *callback = mycb; \ - return ompt_get_callback_success; \ - } \ + case event_name: { \ + ompt_callback_t mycb = \ + (ompt_callback_t)ompt_callbacks.ompt_callback(event_name); \ + if (ompt_enabled.event_name && mycb) { \ + *callback = mycb; \ + return ompt_get_callback_success; \ } \ - return ompt_get_callback_failure; + return 
ompt_get_callback_failure; \ + } FOREACH_OMPT_EVENT(ompt_event_macro) #undef ompt_event_macro default: return ompt_get_callback_failure; } } /***************************************************************************** * parallel regions ****************************************************************************/ OMPT_API_ROUTINE int ompt_get_parallel_info(int ancestor_level, ompt_data_t **parallel_data, int *team_size) { if (!ompt_enabled.enabled) return 0; return __ompt_get_parallel_info_internal(ancestor_level, parallel_data, team_size); } OMPT_API_ROUTINE int ompt_get_state(ompt_wait_id_t *wait_id) { if (!ompt_enabled.enabled) return ompt_state_work_serial; int thread_state = __ompt_get_state_internal(wait_id); if (thread_state == ompt_state_undefined) { thread_state = ompt_state_work_serial; } return thread_state; } /***************************************************************************** * tasks ****************************************************************************/ OMPT_API_ROUTINE ompt_data_t *ompt_get_thread_data(void) { if (!ompt_enabled.enabled) return NULL; return __ompt_get_thread_data_internal(); } OMPT_API_ROUTINE int ompt_get_task_info(int ancestor_level, int *type, ompt_data_t **task_data, ompt_frame_t **task_frame, ompt_data_t **parallel_data, int *thread_num) { if (!ompt_enabled.enabled) return 0; return __ompt_get_task_info_internal(ancestor_level, type, task_data, task_frame, parallel_data, thread_num); } OMPT_API_ROUTINE int ompt_get_task_memory(void **addr, size_t *size, int block) { return __ompt_get_task_memory_internal(addr, size, block); } /***************************************************************************** * num_procs ****************************************************************************/ OMPT_API_ROUTINE int ompt_get_num_procs(void) { // copied from kmp_ftn_entry.h (but modified: OMPT can only be called when // runtime is initialized) return __kmp_avail_proc; } 
/***************************************************************************** * places ****************************************************************************/ OMPT_API_ROUTINE int ompt_get_num_places(void) { // copied from kmp_ftn_entry.h (but modified) #if !KMP_AFFINITY_SUPPORTED return 0; #else if (!KMP_AFFINITY_CAPABLE()) return 0; return __kmp_affinity_num_masks; #endif } OMPT_API_ROUTINE int ompt_get_place_proc_ids(int place_num, int ids_size, int *ids) { // copied from kmp_ftn_entry.h (but modified) #if !KMP_AFFINITY_SUPPORTED return 0; #else int i, count; int tmp_ids[ids_size]; if (!KMP_AFFINITY_CAPABLE()) return 0; if (place_num < 0 || place_num >= (int)__kmp_affinity_num_masks) return 0; /* TODO: Is this safe for asynchronous call from signal handler during runtime * shutdown? */ kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, place_num); count = 0; KMP_CPU_SET_ITERATE(i, mask) { if ((!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) || (!KMP_CPU_ISSET(i, mask))) { continue; } if (count < ids_size) tmp_ids[count] = i; count++; } if (ids_size >= count) { for (i = 0; i < count; i++) { ids[i] = tmp_ids[i]; } } return count; #endif } OMPT_API_ROUTINE int ompt_get_place_num(void) { // copied from kmp_ftn_entry.h (but modified) #if !KMP_AFFINITY_SUPPORTED return -1; #else if (!ompt_enabled.enabled || __kmp_get_gtid() < 0) return -1; int gtid; kmp_info_t *thread; if (!KMP_AFFINITY_CAPABLE()) return -1; gtid = __kmp_entry_gtid(); thread = __kmp_thread_from_gtid(gtid); if (thread == NULL || thread->th.th_current_place < 0) return -1; return thread->th.th_current_place; #endif } OMPT_API_ROUTINE int ompt_get_partition_place_nums(int place_nums_size, int *place_nums) { // copied from kmp_ftn_entry.h (but modified) #if !KMP_AFFINITY_SUPPORTED return 0; #else if (!ompt_enabled.enabled || __kmp_get_gtid() < 0) return 0; int i, gtid, place_num, first_place, last_place, start, end; kmp_info_t *thread; if (!KMP_AFFINITY_CAPABLE()) return 0; gtid = 
__kmp_entry_gtid(); thread = __kmp_thread_from_gtid(gtid); if (thread == NULL) return 0; first_place = thread->th.th_first_place; last_place = thread->th.th_last_place; if (first_place < 0 || last_place < 0) return 0; if (first_place <= last_place) { start = first_place; end = last_place; } else { start = last_place; end = first_place; } if (end - start <= place_nums_size) for (i = 0, place_num = start; place_num <= end; ++place_num, ++i) { place_nums[i] = place_num; } return end - start + 1; #endif } /***************************************************************************** * places ****************************************************************************/ OMPT_API_ROUTINE int ompt_get_proc_id(void) { if (!ompt_enabled.enabled || __kmp_get_gtid() < 0) return -1; #if KMP_OS_LINUX return sched_getcpu(); #elif KMP_OS_WINDOWS PROCESSOR_NUMBER pn; GetCurrentProcessorNumberEx(&pn); return 64 * pn.Group + pn.Number; #else return -1; #endif } /***************************************************************************** * compatability ****************************************************************************/ /* * Currently unused function OMPT_API_ROUTINE int ompt_get_ompt_version() { return OMPT_VERSION; } */ /***************************************************************************** * application-facing API ****************************************************************************/ /*---------------------------------------------------------------------------- | control ---------------------------------------------------------------------------*/ int __kmp_control_tool(uint64_t command, uint64_t modifier, void *arg) { if (ompt_enabled.enabled) { if (ompt_enabled.ompt_callback_control_tool) { return ompt_callbacks.ompt_callback(ompt_callback_control_tool)( command, modifier, arg, OMPT_LOAD_RETURN_ADDRESS(__kmp_entry_gtid())); } else { return -1; } } else { return -2; } } /***************************************************************************** * misc 
****************************************************************************/ OMPT_API_ROUTINE uint64_t ompt_get_unique_id(void) { return __ompt_get_unique_id_internal(); } OMPT_API_ROUTINE void ompt_finalize_tool(void) { __kmp_internal_end_atexit(); } /***************************************************************************** * Target ****************************************************************************/ OMPT_API_ROUTINE int ompt_get_target_info(uint64_t *device_num, ompt_id_t *target_id, ompt_id_t *host_op_id) { return 0; // thread is not in a target region } OMPT_API_ROUTINE int ompt_get_num_devices(void) { return 1; // only one device (the current device) is available } /***************************************************************************** * API inquiry for tool ****************************************************************************/ static ompt_interface_fn_t ompt_fn_lookup(const char *s) { #define ompt_interface_fn(fn) \ fn##_t fn##_f = fn; \ if (strcmp(s, #fn) == 0) \ return (ompt_interface_fn_t)fn##_f; FOREACH_OMPT_INQUIRY_FN(ompt_interface_fn) return (ompt_interface_fn_t)0; } Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-internal.h =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-internal.h (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-internal.h (revision 357059) @@ -1,126 +1,127 @@ /* * ompt-internal.h - header of OMPT internal data structures */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef __OMPT_INTERNAL_H__ #define __OMPT_INTERNAL_H__ #include "ompt-event-specific.h" #include "omp-tools.h" #define OMPT_VERSION 1 #define _OMP_EXTERN extern "C" #define OMPT_INVOKER(x) \ ((x == fork_context_gnu) ? ompt_parallel_invoker_program \ : ompt_parallel_invoker_runtime) #define ompt_callback(e) e##_callback typedef struct ompt_callbacks_internal_s { #define ompt_event_macro(event, callback, eventid) \ callback ompt_callback(event); FOREACH_OMPT_EVENT(ompt_event_macro) #undef ompt_event_macro } ompt_callbacks_internal_t; typedef struct ompt_callbacks_active_s { unsigned int enabled : 1; #define ompt_event_macro(event, callback, eventid) unsigned int event : 1; FOREACH_OMPT_EVENT(ompt_event_macro) #undef ompt_event_macro } ompt_callbacks_active_t; #define TASK_TYPE_DETAILS_FORMAT(info) \ ((info->td_flags.task_serial || info->td_flags.tasking_ser) \ ? ompt_task_undeferred \ : 0x0) | \ ((!(info->td_flags.tiedness)) ? ompt_task_untied : 0x0) | \ (info->td_flags.final ? ompt_task_final : 0x0) | \ (info->td_flags.merged_if0 ? 
ompt_task_mergeable : 0x0) typedef struct { ompt_frame_t frame; ompt_data_t task_data; struct kmp_taskdata *scheduling_parent; int thread_num; int ndeps; ompt_dependence_t *deps; } ompt_task_info_t; typedef struct { ompt_data_t parallel_data; void *master_return_address; } ompt_team_info_t; typedef struct ompt_lw_taskteam_s { ompt_team_info_t ompt_team_info; ompt_task_info_t ompt_task_info; int heap; struct ompt_lw_taskteam_s *parent; } ompt_lw_taskteam_t; typedef struct { ompt_data_t thread_data; ompt_data_t task_data; /* stored here from implicit barrier-begin until implicit-task-end */ void *return_address; /* stored here on entry of runtime */ ompt_state_t state; ompt_wait_id_t wait_id; int ompt_task_yielded; + int parallel_flags; // information for the last parallel region invoked void *idle_frame; } ompt_thread_info_t; extern ompt_callbacks_internal_t ompt_callbacks; #if OMPT_SUPPORT && OMPT_OPTIONAL #if USE_FAST_MEMORY #define KMP_OMPT_DEPS_ALLOC __kmp_fast_allocate #define KMP_OMPT_DEPS_FREE __kmp_fast_free #else #define KMP_OMPT_DEPS_ALLOC __kmp_thread_malloc #define KMP_OMPT_DEPS_FREE __kmp_thread_free #endif #endif /* OMPT_SUPPORT && OMPT_OPTIONAL */ #ifdef __cplusplus extern "C" { #endif void ompt_pre_init(void); void ompt_post_init(void); void ompt_fini(void); #define OMPT_GET_RETURN_ADDRESS(level) __builtin_return_address(level) #define OMPT_GET_FRAME_ADDRESS(level) __builtin_frame_address(level) int __kmp_control_tool(uint64_t command, uint64_t modifier, void *arg); extern ompt_callbacks_active_t ompt_enabled; #if KMP_OS_WINDOWS #define UNLIKELY(x) (x) #define OMPT_NOINLINE __declspec(noinline) #else #define UNLIKELY(x) __builtin_expect(!!(x), 0) #define OMPT_NOINLINE __attribute__((noinline)) #endif #ifdef __cplusplus }; #endif #endif Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-specific.cpp =================================================================== --- 
projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-specific.cpp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-specific.cpp (revision 357059) @@ -1,505 +1,506 @@ /* * ompt-specific.cpp -- OMPT internal functions */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// //****************************************************************************** // include files //****************************************************************************** #include "kmp.h" #include "ompt-specific.h" #if KMP_OS_UNIX #include #endif #if KMP_OS_WINDOWS #define THREAD_LOCAL __declspec(thread) #else #define THREAD_LOCAL __thread #endif #define OMPT_WEAK_ATTRIBUTE KMP_WEAK_ATTRIBUTE //****************************************************************************** // macros //****************************************************************************** #define LWT_FROM_TEAM(team) (team)->t.ompt_serialized_team_info #define OMPT_THREAD_ID_BITS 16 //****************************************************************************** // private operations //****************************************************************************** //---------------------------------------------------------- // traverse the team and task hierarchy // note: __ompt_get_teaminfo and __ompt_get_task_info_object // traverse the hierarchy similarly and need to be // kept consistent //---------------------------------------------------------- ompt_team_info_t *__ompt_get_teaminfo(int depth, int *size) { kmp_info_t *thr = ompt_get_thread(); if (thr) { kmp_team *team = thr->th.th_team; if (team == NULL) return NULL; ompt_lw_taskteam_t 
*next_lwt = LWT_FROM_TEAM(team), *lwt = NULL; while (depth > 0) { // next lightweight team (if any) if (lwt) lwt = lwt->parent; // next heavyweight team (if any) after // lightweight teams are exhausted if (!lwt && team) { if (next_lwt) { lwt = next_lwt; next_lwt = NULL; } else { team = team->t.t_parent; if (team) { next_lwt = LWT_FROM_TEAM(team); } } } depth--; } if (lwt) { // lightweight teams have one task if (size) *size = 1; // return team info for lightweight team return &lwt->ompt_team_info; } else if (team) { // extract size from heavyweight team if (size) *size = team->t.t_nproc; // return team info for heavyweight team return &team->t.ompt_team_info; } } return NULL; } ompt_task_info_t *__ompt_get_task_info_object(int depth) { ompt_task_info_t *info = NULL; kmp_info_t *thr = ompt_get_thread(); if (thr) { kmp_taskdata_t *taskdata = thr->th.th_current_task; ompt_lw_taskteam_t *lwt = NULL, *next_lwt = LWT_FROM_TEAM(taskdata->td_team); while (depth > 0) { // next lightweight team (if any) if (lwt) lwt = lwt->parent; // next heavyweight team (if any) after // lightweight teams are exhausted if (!lwt && taskdata) { if (next_lwt) { lwt = next_lwt; next_lwt = NULL; } else { taskdata = taskdata->td_parent; if (taskdata) { next_lwt = LWT_FROM_TEAM(taskdata->td_team); } } } depth--; } if (lwt) { info = &lwt->ompt_task_info; } else if (taskdata) { info = &taskdata->ompt_task_info; } } return info; } ompt_task_info_t *__ompt_get_scheduling_taskinfo(int depth) { ompt_task_info_t *info = NULL; kmp_info_t *thr = ompt_get_thread(); if (thr) { kmp_taskdata_t *taskdata = thr->th.th_current_task; ompt_lw_taskteam_t *lwt = NULL, *next_lwt = LWT_FROM_TEAM(taskdata->td_team); while (depth > 0) { // next lightweight team (if any) if (lwt) lwt = lwt->parent; // next heavyweight team (if any) after // lightweight teams are exhausted if (!lwt && taskdata) { // first try scheduling parent (for explicit task scheduling) if (taskdata->ompt_task_info.scheduling_parent) { taskdata = 
taskdata->ompt_task_info.scheduling_parent; } else if (next_lwt) { lwt = next_lwt; next_lwt = NULL; } else { // then go for implicit tasks taskdata = taskdata->td_parent; if (taskdata) { next_lwt = LWT_FROM_TEAM(taskdata->td_team); } } } depth--; } if (lwt) { info = &lwt->ompt_task_info; } else if (taskdata) { info = &taskdata->ompt_task_info; } } return info; } //****************************************************************************** // interface operations //****************************************************************************** //---------------------------------------------------------- // thread support //---------------------------------------------------------- ompt_data_t *__ompt_get_thread_data_internal() { if (__kmp_get_gtid() >= 0) { kmp_info_t *thread = ompt_get_thread(); if (thread == NULL) return NULL; return &(thread->th.ompt_thread_info.thread_data); } return NULL; } //---------------------------------------------------------- // state support //---------------------------------------------------------- void __ompt_thread_assign_wait_id(void *variable) { kmp_info_t *ti = ompt_get_thread(); if (ti) ti->th.ompt_thread_info.wait_id = (ompt_wait_id_t)(uintptr_t)variable; } int __ompt_get_state_internal(ompt_wait_id_t *omp_wait_id) { kmp_info_t *ti = ompt_get_thread(); if (ti) { if (omp_wait_id) *omp_wait_id = ti->th.ompt_thread_info.wait_id; return ti->th.ompt_thread_info.state; } return ompt_state_undefined; } //---------------------------------------------------------- // parallel region support //---------------------------------------------------------- int __ompt_get_parallel_info_internal(int ancestor_level, ompt_data_t **parallel_data, int *team_size) { if (__kmp_get_gtid() >= 0) { ompt_team_info_t *info; if (team_size) { info = __ompt_get_teaminfo(ancestor_level, team_size); } else { info = __ompt_get_teaminfo(ancestor_level, NULL); } if (parallel_data) { *parallel_data = info ? &(info->parallel_data) : NULL; } return info ? 
2 : 0; } else { return 0; } } //---------------------------------------------------------- // lightweight task team support //---------------------------------------------------------- void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid, ompt_data_t *ompt_pid, void *codeptr) { // initialize parallel_data with input, return address to parallel_data on // exit lwt->ompt_team_info.parallel_data = *ompt_pid; lwt->ompt_team_info.master_return_address = codeptr; lwt->ompt_task_info.task_data.value = 0; lwt->ompt_task_info.frame.enter_frame = ompt_data_none; lwt->ompt_task_info.frame.exit_frame = ompt_data_none; lwt->ompt_task_info.scheduling_parent = NULL; lwt->ompt_task_info.deps = NULL; lwt->ompt_task_info.ndeps = 0; lwt->heap = 0; lwt->parent = 0; } void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, - int on_heap) { + int on_heap, bool always) { ompt_lw_taskteam_t *link_lwt = lwt; - if (thr->th.th_team->t.t_serialized > - 1) { // we already have a team, so link the new team and swap values + if (always || + thr->th.th_team->t.t_serialized > + 1) { // we already have a team, so link the new team and swap values if (on_heap) { // the lw_taskteam cannot stay on stack, allocate it on heap link_lwt = (ompt_lw_taskteam_t *)__kmp_allocate(sizeof(ompt_lw_taskteam_t)); } link_lwt->heap = on_heap; // would be swap in the (on_stack) case. 
ompt_team_info_t tmp_team = lwt->ompt_team_info; link_lwt->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr); *OMPT_CUR_TEAM_INFO(thr) = tmp_team; ompt_task_info_t tmp_task = lwt->ompt_task_info; link_lwt->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); *OMPT_CUR_TASK_INFO(thr) = tmp_task; // link the taskteam into the list of taskteams: ompt_lw_taskteam_t *my_parent = thr->th.th_team->t.ompt_serialized_team_info; link_lwt->parent = my_parent; thr->th.th_team->t.ompt_serialized_team_info = link_lwt; } else { // this is the first serialized team, so we just store the values in the // team and drop the taskteam-object *OMPT_CUR_TEAM_INFO(thr) = lwt->ompt_team_info; *OMPT_CUR_TASK_INFO(thr) = lwt->ompt_task_info; } } void __ompt_lw_taskteam_unlink(kmp_info_t *thr) { ompt_lw_taskteam_t *lwtask = thr->th.th_team->t.ompt_serialized_team_info; if (lwtask) { thr->th.th_team->t.ompt_serialized_team_info = lwtask->parent; ompt_team_info_t tmp_team = lwtask->ompt_team_info; lwtask->ompt_team_info = *OMPT_CUR_TEAM_INFO(thr); *OMPT_CUR_TEAM_INFO(thr) = tmp_team; ompt_task_info_t tmp_task = lwtask->ompt_task_info; lwtask->ompt_task_info = *OMPT_CUR_TASK_INFO(thr); *OMPT_CUR_TASK_INFO(thr) = tmp_task; if (lwtask->heap) { __kmp_free(lwtask); lwtask = NULL; } } // return lwtask; } //---------------------------------------------------------- // task support //---------------------------------------------------------- int __ompt_get_task_info_internal(int ancestor_level, int *type, ompt_data_t **task_data, ompt_frame_t **task_frame, ompt_data_t **parallel_data, int *thread_num) { if (__kmp_get_gtid() < 0) return 0; if (ancestor_level < 0) return 0; // copied from __ompt_get_scheduling_taskinfo ompt_task_info_t *info = NULL; ompt_team_info_t *team_info = NULL; kmp_info_t *thr = ompt_get_thread(); int level = ancestor_level; if (thr) { kmp_taskdata_t *taskdata = thr->th.th_current_task; if (taskdata == NULL) return 0; kmp_team *team = thr->th.th_team, *prev_team = NULL; if (team == NULL) return 0; 
ompt_lw_taskteam_t *lwt = NULL, *next_lwt = LWT_FROM_TEAM(taskdata->td_team), *prev_lwt = NULL; while (ancestor_level > 0) { // needed for thread_num prev_team = team; prev_lwt = lwt; // next lightweight team (if any) if (lwt) lwt = lwt->parent; // next heavyweight team (if any) after // lightweight teams are exhausted if (!lwt && taskdata) { // first try scheduling parent (for explicit task scheduling) if (taskdata->ompt_task_info.scheduling_parent) { taskdata = taskdata->ompt_task_info.scheduling_parent; } else if (next_lwt) { lwt = next_lwt; next_lwt = NULL; } else { // then go for implicit tasks taskdata = taskdata->td_parent; if (team == NULL) return 0; team = team->t.t_parent; if (taskdata) { next_lwt = LWT_FROM_TEAM(taskdata->td_team); } } } ancestor_level--; } if (lwt) { info = &lwt->ompt_task_info; team_info = &lwt->ompt_team_info; if (type) { *type = ompt_task_implicit; } } else if (taskdata) { info = &taskdata->ompt_task_info; team_info = &team->t.ompt_team_info; if (type) { if (taskdata->td_parent) { *type = (taskdata->td_flags.tasktype ? ompt_task_explicit : ompt_task_implicit) | TASK_TYPE_DETAILS_FORMAT(taskdata); } else { *type = ompt_task_initial; } } } if (task_data) { *task_data = info ? &info->task_data : NULL; } if (task_frame) { // OpenMP spec asks for the scheduling task to be returned. *task_frame = info ? &info->frame : NULL; } if (parallel_data) { *parallel_data = team_info ? &(team_info->parallel_data) : NULL; } if (thread_num) { if (level == 0) *thread_num = __kmp_get_tid(); else if (prev_lwt) *thread_num = 0; else *thread_num = prev_team->t.t_master_tid; // *thread_num = team->t.t_master_tid; } return info ? 
2 : 0; } return 0; } int __ompt_get_task_memory_internal(void **addr, size_t *size, int blocknum) { if (blocknum != 0) return 0; // support only a single block kmp_info_t *thr = ompt_get_thread(); if (!thr) return 0; kmp_taskdata_t *taskdata = thr->th.th_current_task; kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata); if (taskdata->td_flags.tasktype != TASK_EXPLICIT) return 0; // support only explicit task void *ret_addr; int64_t ret_size = taskdata->td_size_alloc - sizeof(kmp_taskdata_t); // kmp_task_t->data1 is an optional member if (taskdata->td_flags.destructors_thunk) ret_addr = &task->data1 + 1; else ret_addr = &task->part_id + 1; ret_size -= (char *)(ret_addr) - (char *)(task); if (ret_size < 0) return 0; *addr = ret_addr; *size = ret_size; return 1; } //---------------------------------------------------------- // team support //---------------------------------------------------------- void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid) { team->t.ompt_team_info.parallel_data = ompt_pid; } //---------------------------------------------------------- // misc //---------------------------------------------------------- static uint64_t __ompt_get_unique_id_internal() { static uint64_t thread = 1; static THREAD_LOCAL uint64_t ID = 0; if (ID == 0) { uint64_t new_thread = KMP_TEST_THEN_INC64((kmp_int64 *)&thread); ID = new_thread << (sizeof(uint64_t) * 8 - OMPT_THREAD_ID_BITS); } return ++ID; } ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type bt, kmp_info_t *thr) { if (bt == bs_forkjoin_barrier) return ompt_sync_region_barrier_implicit; if (bt != bs_plain_barrier) return ompt_sync_region_barrier_implementation; if (!thr->th.th_ident) return ompt_sync_region_barrier; kmp_int32 flags = thr->th.th_ident->flags; if ((flags & KMP_IDENT_BARRIER_EXPL) != 0) return ompt_sync_region_barrier_explicit; if ((flags & KMP_IDENT_BARRIER_IMPL) != 0) return ompt_sync_region_barrier_implicit; return ompt_sync_region_barrier_implementation; } Index: 
projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-specific.h =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-specific.h (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/ompt-specific.h (revision 357059) @@ -1,105 +1,105 @@ /* * ompt-specific.h - header of OMPT internal functions implementation */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef OMPT_SPECIFIC_H #define OMPT_SPECIFIC_H #include "kmp.h" /***************************************************************************** * forward declarations ****************************************************************************/ void __ompt_team_assign_id(kmp_team_t *team, ompt_data_t ompt_pid); void __ompt_thread_assign_wait_id(void *variable); void __ompt_lw_taskteam_init(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, int gtid, ompt_data_t *ompt_pid, void *codeptr); void __ompt_lw_taskteam_link(ompt_lw_taskteam_t *lwt, kmp_info_t *thr, - int on_heap); + int on_heap, bool always = false); void __ompt_lw_taskteam_unlink(kmp_info_t *thr); ompt_team_info_t *__ompt_get_teaminfo(int depth, int *size); ompt_task_info_t *__ompt_get_task_info_object(int depth); int __ompt_get_parallel_info_internal(int ancestor_level, ompt_data_t **parallel_data, int *team_size); int __ompt_get_task_info_internal(int ancestor_level, int *type, ompt_data_t **task_data, ompt_frame_t **task_frame, ompt_data_t **parallel_data, int *thread_num); ompt_data_t *__ompt_get_thread_data_internal(); /* * Unused currently static uint64_t __ompt_get_get_unique_id_internal(); */ 
ompt_sync_region_t __ompt_get_barrier_kind(enum barrier_type, kmp_info_t *); /***************************************************************************** * macros ****************************************************************************/ #define OMPT_CUR_TASK_INFO(thr) (&(thr->th.th_current_task->ompt_task_info)) #define OMPT_CUR_TASK_DATA(thr) \ (&(thr->th.th_current_task->ompt_task_info.task_data)) #define OMPT_CUR_TEAM_INFO(thr) (&(thr->th.th_team->t.ompt_team_info)) #define OMPT_CUR_TEAM_DATA(thr) \ (&(thr->th.th_team->t.ompt_team_info.parallel_data)) #define OMPT_HAVE_WEAK_ATTRIBUTE KMP_HAVE_WEAK_ATTRIBUTE #define OMPT_HAVE_PSAPI KMP_HAVE_PSAPI #define OMPT_STR_MATCH(haystack, needle) __kmp_str_match(haystack, 0, needle) inline void *__ompt_load_return_address(int gtid) { kmp_info_t *thr = __kmp_threads[gtid]; void *return_address = thr->th.ompt_thread_info.return_address; thr->th.ompt_thread_info.return_address = NULL; return return_address; } #define OMPT_STORE_RETURN_ADDRESS(gtid) \ if (ompt_enabled.enabled && gtid >= 0 && __kmp_threads[gtid] && \ !__kmp_threads[gtid]->th.ompt_thread_info.return_address) \ __kmp_threads[gtid]->th.ompt_thread_info.return_address = \ __builtin_return_address(0) #define OMPT_LOAD_RETURN_ADDRESS(gtid) __ompt_load_return_address(gtid) //****************************************************************************** // inline functions //****************************************************************************** inline kmp_info_t *ompt_get_thread_gtid(int gtid) { return (gtid >= 0) ? 
__kmp_thread_from_gtid(gtid) : NULL; } inline kmp_info_t *ompt_get_thread() { int gtid = __kmp_get_gtid(); return ompt_get_thread_gtid(gtid); } inline void ompt_set_thread_state(kmp_info_t *thread, ompt_state_t state) { thread->th.ompt_thread_info.state = state; } inline const char *ompt_get_runtime_version() { return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN]; } #endif Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.c =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.c (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.c (nonexistent) @@ -1,1201 +0,0 @@ - -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "kmp_config.h" -#include "ittnotify_config.h" - -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#if defined(__MINGW32__) -#include -#else -#define PATH_MAX 512 -#endif -#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ -#include -#include -#include -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#include -#include -#include -#include - -#define INTEL_NO_MACRO_BODY -#define INTEL_ITTNOTIFY_API_PRIVATE -#include "ittnotify.h" -#include "legacy/ittnotify.h" - -#if KMP_MSVC_COMPAT -#include "disable_warnings.h" -#endif - -static const char api_version[] = API_VERSION "\0\n@(#) $Revision: 481659 $\n"; - -#define _N_(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n) - -#if ITT_OS==ITT_OS_WIN -static const char* ittnotify_lib_name = "libittnotify.dll"; -#elif ITT_OS==ITT_OS_LINUX || ITT_OS==ITT_OS_FREEBSD -static const char* ittnotify_lib_name = "libittnotify.so"; -#elif ITT_OS==ITT_OS_MAC -static const char* ittnotify_lib_name = "libittnotify.dylib"; -#else -#error Unsupported or unknown OS. -#endif - -#ifdef __ANDROID__ -#include -#include -#include -#include -#include -#include -#include - -#ifdef ITT_ANDROID_LOG - #define ITT_ANDROID_LOG_TAG "INTEL_VTUNE_USERAPI" - #define ITT_ANDROID_LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO, ITT_ANDROID_LOG_TAG, __VA_ARGS__)) - #define ITT_ANDROID_LOGW(...) ((void)__android_log_print(ANDROID_LOG_WARN, ITT_ANDROID_LOG_TAG, __VA_ARGS__)) - #define ITT_ANDROID_LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR,ITT_ANDROID_LOG_TAG, __VA_ARGS__)) - #define ITT_ANDROID_LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG,ITT_ANDROID_LOG_TAG, __VA_ARGS__)) -#else - #define ITT_ANDROID_LOGI(...) - #define ITT_ANDROID_LOGW(...) - #define ITT_ANDROID_LOGE(...) - #define ITT_ANDROID_LOGD(...) 
-#endif - -/* default location of userapi collector on Android */ -#define ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(x) "/data/data/com.intel.vtune/perfrun/lib" \ - #x "/runtime/libittnotify.so" - -#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM -#define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(32) -#else -#define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(64) -#endif - -#endif - -#ifndef PATH_MAX -#define PATH_MAX 4096 -#endif - - -#ifndef LIB_VAR_NAME -#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM || ITT_ARCH==ITT_ARCH_MIPS -#define LIB_VAR_NAME INTEL_LIBITTNOTIFY32 -#else -#define LIB_VAR_NAME INTEL_LIBITTNOTIFY64 -#endif -#endif /* LIB_VAR_NAME */ - -#define ITT_MUTEX_INIT_AND_LOCK(p) { \ - if (PTHREAD_SYMBOLS) \ - { \ - if (!p.mutex_initialized) \ - { \ - if (__itt_interlocked_increment(&p.atomic_counter) == 1) \ - { \ - __itt_mutex_init(&p.mutex); \ - p.mutex_initialized = 1; \ - } \ - else \ - while (!p.mutex_initialized) \ - __itt_thread_yield(); \ - } \ - __itt_mutex_lock(&p.mutex); \ - } \ -} - -typedef int (__itt_init_ittlib_t)(const char*, __itt_group_id); - -/* this define used to control initialization function name. */ -#ifndef __itt_init_ittlib_name -ITT_EXTERN_C int _N_(init_ittlib)(const char*, __itt_group_id); -static __itt_init_ittlib_t* __itt_init_ittlib_ptr = _N_(init_ittlib); -#define __itt_init_ittlib_name __itt_init_ittlib_ptr -#endif /* __itt_init_ittlib_name */ - -typedef void (__itt_fini_ittlib_t)(void); - -/* this define used to control finalization function name. 
*/ -#ifndef __itt_fini_ittlib_name -ITT_EXTERN_C void _N_(fini_ittlib)(void); -static __itt_fini_ittlib_t* __itt_fini_ittlib_ptr = _N_(fini_ittlib); -#define __itt_fini_ittlib_name __itt_fini_ittlib_ptr -#endif /* __itt_fini_ittlib_name */ - -/* building pointers to imported funcs */ -#undef ITT_STUBV -#undef ITT_STUB -#define ITT_STUB(api,type,name,args,params,ptr,group,format) \ -static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ -typedef type api ITT_JOIN(_N_(name),_t) args; \ -ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \ -static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \ -{ \ - __itt_init_ittlib_name(NULL, __itt_group_all); \ - if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \ - return ITTNOTIFY_NAME(name) params; \ - else \ - return (type)0; \ -} - -#define ITT_STUBV(api,type,name,args,params,ptr,group,format) \ -static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ -typedef type api ITT_JOIN(_N_(name),_t) args; \ -ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \ -static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \ -{ \ - __itt_init_ittlib_name(NULL, __itt_group_all); \ - if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \ - ITTNOTIFY_NAME(name) params; \ - else \ - return; \ -} - -#undef __ITT_INTERNAL_INIT -#include "ittnotify_static.h" - -#undef ITT_STUB -#undef ITT_STUBV -#define ITT_STUB(api,type,name,args,params,ptr,group,format) \ -static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ -typedef type api ITT_JOIN(_N_(name),_t) args; \ -ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END - -#define ITT_STUBV(api,type,name,args,params,ptr,group,format) \ -static type api 
ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ -typedef type api ITT_JOIN(_N_(name),_t) args; \ -ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END - -#define __ITT_INTERNAL_INIT -#include "ittnotify_static.h" -#undef __ITT_INTERNAL_INIT - -ITT_GROUP_LIST(group_list); - -#pragma pack(push, 8) - -typedef struct ___itt_group_alias -{ - const char* env_var; - __itt_group_id groups; -} __itt_group_alias; - -static __itt_group_alias group_alias[] = { - { "KMP_FOR_TPROFILE", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync | __itt_group_mark) }, - { "KMP_FOR_TCHECK", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync | __itt_group_fsync | __itt_group_mark | __itt_group_suppress) }, - { NULL, (__itt_group_none) }, - { api_version, (__itt_group_none) } /* !!! Just to avoid unused code elimination !!! */ -}; - -#pragma pack(pop) - -#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT -#pragma warning(push) -#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */ -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -static __itt_api_info api_list[] = { -/* Define functions with static implementation */ -#undef ITT_STUB -#undef ITT_STUBV -#define ITT_STUB(api,type,name,args,params,nameindll,group,format) { ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (__itt_group_id)(group)}, -#define ITT_STUBV ITT_STUB -#define __ITT_INTERNAL_INIT -#include "ittnotify_static.h" -#undef __ITT_INTERNAL_INIT -/* Define functions without static implementation */ -#undef ITT_STUB -#undef ITT_STUBV -#define ITT_STUB(api,type,name,args,params,nameindll,group,format) {ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), 
(void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), NULL, (__itt_group_id)(group)}, -#define ITT_STUBV ITT_STUB -#include "ittnotify_static.h" - {NULL, NULL, NULL, NULL, __itt_group_none} -}; - -#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT -#pragma warning(pop) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -static const char dll_path[PATH_MAX] = { 0 }; - -/* static part descriptor which handles. all notification api attributes. */ -__itt_global _N_(_ittapi_global) = { - ITT_MAGIC, /* identification info */ - ITT_MAJOR, ITT_MINOR, API_VERSION_BUILD, /* version info */ - 0, /* api_initialized */ - 0, /* mutex_initialized */ - 0, /* atomic_counter */ - MUTEX_INITIALIZER, /* mutex */ - NULL, /* dynamic library handle */ - NULL, /* error_handler */ - (const char**)&dll_path, /* dll_path_ptr */ - (__itt_api_info*)&api_list, /* api_list_ptr */ - NULL, /* next __itt_global */ - NULL, /* thread_list */ - NULL, /* domain_list */ - NULL, /* string_list */ - __itt_collection_normal, /* collection state */ - NULL /* counter_list */ -}; - -typedef void (__itt_api_init_t)(__itt_global*, __itt_group_id); -typedef void (__itt_api_fini_t)(__itt_global*); - -/* ========================================================================= */ - -#ifdef ITT_NOTIFY_EXT_REPORT -ITT_EXTERN_C void _N_(error_handler)(__itt_error_code, va_list args); -#endif /* ITT_NOTIFY_EXT_REPORT */ - -#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT -#pragma warning(push) -#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */ -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -static void __itt_report_error(unsigned code_arg, ...) -{ - va_list args; - va_start(args, code_arg); - - // We use unsigned for the code argument and explicitly cast it here to the - // right enumerator because variadic functions are not compatible with - // default promotions. 
- __itt_error_code code = (__itt_error_code)code_arg; - - if (_N_(_ittapi_global).error_handler != NULL) - { - __itt_error_handler_t* handler = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler; - handler(code, args); - } -#ifdef ITT_NOTIFY_EXT_REPORT - _N_(error_handler)(code, args); -#endif /* ITT_NOTIFY_EXT_REPORT */ - va_end(args); -} - -#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT -#pragma warning(pop) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -#if ITT_PLATFORM==ITT_PLATFORM_WIN -static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init))(const wchar_t* name) -{ - __itt_domain *h_tail = NULL, *h = NULL; - - if (name == NULL) - { - return NULL; - } - - ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); - if (_N_(_ittapi_global).api_initialized) - { - if (ITTNOTIFY_NAME(domain_createW) && ITTNOTIFY_NAME(domain_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init))) - { - __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(domain_createW)(name); - } - } - for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next) - { - if (h->nameW != NULL && !wcscmp(h->nameW, name)) break; - } - if (h == NULL) - { - NEW_DOMAIN_W(&_N_(_ittapi_global),h,h_tail,name); - } - if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return h; -} - -static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init))(const char* name) -#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ -static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init))(const char* name) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -{ - __itt_domain *h_tail = NULL, *h = NULL; - - if (name == NULL) - { - return NULL; - } - - ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); - if (_N_(_ittapi_global).api_initialized) - { -#if ITT_PLATFORM==ITT_PLATFORM_WIN - if (ITTNOTIFY_NAME(domain_createA) && ITTNOTIFY_NAME(domain_createA) != 
ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init))) - { - __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(domain_createA)(name); - } -#else - if (ITTNOTIFY_NAME(domain_create) && ITTNOTIFY_NAME(domain_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init))) - { - if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(domain_create)(name); - } -#endif - } - for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next) - { - if (h->nameA != NULL && !__itt_fstrcmp(h->nameA, name)) break; - } - if (h == NULL) - { - NEW_DOMAIN_A(&_N_(_ittapi_global),h,h_tail,name); - } - if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return h; -} - -#if ITT_PLATFORM==ITT_PLATFORM_WIN -static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init))(const wchar_t* name) -{ - __itt_string_handle *h_tail = NULL, *h = NULL; - - if (name == NULL) - { - return NULL; - } - - ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); - if (_N_(_ittapi_global).api_initialized) - { - if (ITTNOTIFY_NAME(string_handle_createW) && ITTNOTIFY_NAME(string_handle_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init))) - { - __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(string_handle_createW)(name); - } - } - for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next) - { - if (h->strW != NULL && !wcscmp(h->strW, name)) break; - } - if (h == NULL) - { - NEW_STRING_HANDLE_W(&_N_(_ittapi_global),h,h_tail,name); - } - __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return h; -} - -static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init))(const char* name) -#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ -static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init))(const char* name) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -{ - 
__itt_string_handle *h_tail = NULL, *h = NULL; - - if (name == NULL) - { - return NULL; - } - - ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); - if (_N_(_ittapi_global).api_initialized) - { -#if ITT_PLATFORM==ITT_PLATFORM_WIN - if (ITTNOTIFY_NAME(string_handle_createA) && ITTNOTIFY_NAME(string_handle_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init))) - { - __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(string_handle_createA)(name); - } -#else - if (ITTNOTIFY_NAME(string_handle_create) && ITTNOTIFY_NAME(string_handle_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init))) - { - if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(string_handle_create)(name); - } -#endif - } - for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next) - { - if (h->strA != NULL && !__itt_fstrcmp(h->strA, name)) break; - } - if (h == NULL) - { - NEW_STRING_HANDLE_A(&_N_(_ittapi_global),h,h_tail,name); - } - if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return h; -} - -#if ITT_PLATFORM==ITT_PLATFORM_WIN -static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init))(const wchar_t *name, const wchar_t *domain) -{ - __itt_counter_info_t *h_tail = NULL, *h = NULL; - __itt_metadata_type type = __itt_metadata_u64; - - if (name == NULL) - { - return NULL; - } - - ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); - if (_N_(_ittapi_global).api_initialized) - { - if (ITTNOTIFY_NAME(counter_createW) && ITTNOTIFY_NAME(counter_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init))) - { - __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(counter_createW)(name, domain); - } - } - for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) - { - if (h->nameW != NULL && h->type == type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) || - 
(h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break; - - } - if (h == NULL) - { - NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain,type); - } - __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return (__itt_counter)h; -} - -static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init))(const char *name, const char *domain) -#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ -static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init))(const char *name, const char *domain) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -{ - __itt_counter_info_t *h_tail = NULL, *h = NULL; - __itt_metadata_type type = __itt_metadata_u64; - - if (name == NULL) - { - return NULL; - } - - ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); - if (_N_(_ittapi_global).api_initialized) - { -#if ITT_PLATFORM==ITT_PLATFORM_WIN - if (ITTNOTIFY_NAME(counter_createA) && ITTNOTIFY_NAME(counter_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init))) - { - __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(counter_createA)(name, domain); - } -#else - if (ITTNOTIFY_NAME(counter_create) && ITTNOTIFY_NAME(counter_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init))) - { - if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(counter_create)(name, domain); - } -#endif - } - for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) - { - if (h->nameA != NULL && h->type == type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) || - (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break; - } - if (h == NULL) - { - NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain,type); - } - if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return (__itt_counter)h; -} - -#if ITT_PLATFORM==ITT_PLATFORM_WIN -static __itt_counter ITTAPI 
ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init))(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type) -{ - __itt_counter_info_t *h_tail = NULL, *h = NULL; - - if (name == NULL) - { - return NULL; - } - - ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); - if (_N_(_ittapi_global).api_initialized) - { - if (ITTNOTIFY_NAME(counter_create_typedW) && ITTNOTIFY_NAME(counter_create_typedW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init))) - { - __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(counter_create_typedW)(name, domain, type); - } - } - for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) - { - if (h->nameW != NULL && h->type == type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) || - (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break; - - } - if (h == NULL) - { - NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain,type); - } - __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return (__itt_counter)h; -} - -static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init))(const char *name, const char *domain, __itt_metadata_type type) -#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ -static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init))(const char *name, const char *domain, __itt_metadata_type type) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -{ - __itt_counter_info_t *h_tail = NULL, *h = NULL; - - if (name == NULL) - { - return NULL; - } - - ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); - if (_N_(_ittapi_global).api_initialized) - { -#if ITT_PLATFORM==ITT_PLATFORM_WIN - if (ITTNOTIFY_NAME(counter_create_typedA) && ITTNOTIFY_NAME(counter_create_typedA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init))) - { - __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(counter_create_typedA)(name, domain, type); - } -#else - if 
(ITTNOTIFY_NAME(counter_create_typed) && ITTNOTIFY_NAME(counter_create_typed) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init))) - { - if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return ITTNOTIFY_NAME(counter_create_typed)(name, domain, type); - } -#endif - } - for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) - { - if (h->nameA != NULL && h->type == type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) || - (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break; - } - if (h == NULL) - { - NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain,type); - } - if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - return (__itt_counter)h; -} - -/* -------------------------------------------------------------------------- */ - -static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init))(void) -{ - if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) - { - __itt_init_ittlib_name(NULL, __itt_group_all); - } - if (ITTNOTIFY_NAME(pause) && ITTNOTIFY_NAME(pause) != ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init))) - { - ITTNOTIFY_NAME(pause)(); - } - else - { - _N_(_ittapi_global).state = __itt_collection_paused; - } -} - -static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init))(void) -{ - if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) - { - __itt_init_ittlib_name(NULL, __itt_group_all); - } - if (ITTNOTIFY_NAME(resume) && ITTNOTIFY_NAME(resume) != ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init))) - { - ITTNOTIFY_NAME(resume)(); - } - else - { - _N_(_ittapi_global).state = __itt_collection_normal; - } -} - -#if ITT_PLATFORM==ITT_PLATFORM_WIN -static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(const wchar_t* name) -{ - if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) - { - 
__itt_init_ittlib_name(NULL, __itt_group_all); - } - if (ITTNOTIFY_NAME(thread_set_nameW) && ITTNOTIFY_NAME(thread_set_nameW) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))) - { - ITTNOTIFY_NAME(thread_set_nameW)(name); - } -} - -static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setW),_init))(const wchar_t* name, int namelen) -{ - (void)namelen; - ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(name); - return 0; -} - -static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(const char* name) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(const char* name) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -{ - if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) - { - __itt_init_ittlib_name(NULL, __itt_group_all); - } -#if ITT_PLATFORM==ITT_PLATFORM_WIN - if (ITTNOTIFY_NAME(thread_set_nameA) && ITTNOTIFY_NAME(thread_set_nameA) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))) - { - ITTNOTIFY_NAME(thread_set_nameA)(name); - } -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - if (ITTNOTIFY_NAME(thread_set_name) && ITTNOTIFY_NAME(thread_set_name) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))) - { - ITTNOTIFY_NAME(thread_set_name)(name); - } -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -} - -#if ITT_PLATFORM==ITT_PLATFORM_WIN -static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setA),_init))(const char* name, int namelen) -{ - (void)namelen; - ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(name); - return 0; -} -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_set),_init))(const char* name, int namelen) -{ - (void)namelen; - ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(name); - return 0; -} -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))(void) -{ - if (!_N_(_ittapi_global).api_initialized 
&& _N_(_ittapi_global).thread_list == NULL) - { - __itt_init_ittlib_name(NULL, __itt_group_all); - } - if (ITTNOTIFY_NAME(thread_ignore) && ITTNOTIFY_NAME(thread_ignore) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))) - { - ITTNOTIFY_NAME(thread_ignore)(); - } -} - -static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_ignore),_init))(void) -{ - ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))(); -} - -static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(enable_attach),_init))(void) -{ -#ifdef __ANDROID__ - /* - * if LIB_VAR_NAME env variable were set before then stay previous value - * else set default path - */ - setenv(ITT_TO_STR(LIB_VAR_NAME), ANDROID_ITTNOTIFY_DEFAULT_PATH, 0); -#endif -} - -/* -------------------------------------------------------------------------- */ - -static const char* __itt_fsplit(const char* s, const char* sep, const char** out, int* len) -{ - int i; - int j; - - if (!s || !sep || !out || !len) - return NULL; - - for (i = 0; s[i]; i++) - { - int b = 0; - for (j = 0; sep[j]; j++) - if (s[i] == sep[j]) - { - b = 1; - break; - } - if (!b) - break; - } - - if (!s[i]) - return NULL; - - *len = 0; - *out = &s[i]; - - for (; s[i]; i++, (*len)++) - { - int b = 0; - for (j = 0; sep[j]; j++) - if (s[i] == sep[j]) - { - b = 1; - break; - } - if (b) - break; - } - - for (; s[i]; i++) - { - int b = 0; - for (j = 0; sep[j]; j++) - if (s[i] == sep[j]) - { - b = 1; - break; - } - if (!b) - break; - } - - return &s[i]; -} - -/* This function return value of env variable that placed into static buffer. - * !!! The same static buffer is used for subsequent calls. !!! - * This was done to aviod dynamic allocation for few calls. - * Actually we need this function only four times. 
- */ -static const char* __itt_get_env_var(const char* name) -{ -#define MAX_ENV_VALUE_SIZE 4086 - static char env_buff[MAX_ENV_VALUE_SIZE]; - static char* env_value = (char*)env_buff; - - if (name != NULL) - { -#if ITT_PLATFORM==ITT_PLATFORM_WIN - size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff); - DWORD rc = GetEnvironmentVariableA(name, env_value, (DWORD)max_len); - if (rc >= max_len) - __itt_report_error(__itt_error_env_too_long, name, (size_t)rc - 1, (size_t)(max_len - 1)); - else if (rc > 0) - { - const char* ret = (const char*)env_value; - env_value += rc + 1; - return ret; - } - else - { - /* If environment variable is empty, GetEnvirornmentVariables() - * returns zero (number of characters (not including terminating null), - * and GetLastError() returns ERROR_SUCCESS. */ - DWORD err = GetLastError(); - if (err == ERROR_SUCCESS) - return env_value; - - if (err != ERROR_ENVVAR_NOT_FOUND) - __itt_report_error(__itt_error_cant_read_env, name, (int)err); - } -#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ - char* env = getenv(name); - if (env != NULL) - { - size_t len = __itt_fstrnlen(env, MAX_ENV_VALUE_SIZE); - size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff); - if (len < max_len) - { - const char* ret = (const char*)env_value; - __itt_fstrcpyn(env_value, max_len, env, len + 1); - env_value += len + 1; - return ret; - } else - __itt_report_error(__itt_error_env_too_long, name, (size_t)len, (size_t)(max_len - 1)); - } -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - } - return NULL; -} - -static const char* __itt_get_lib_name(void) -{ - const char* lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME)); - -#ifdef __ANDROID__ - if (lib_name == NULL) - { - -#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM - const char* const marker_filename = "com.intel.itt.collector_lib_32"; -#else - const char* const marker_filename = "com.intel.itt.collector_lib_64"; -#endif - - char system_wide_marker_filename[PATH_MAX] = {0}; - int 
itt_marker_file_fd = -1; - ssize_t res = 0; - - res = snprintf(system_wide_marker_filename, PATH_MAX - 1, "%s%s", "/data/local/tmp/", marker_filename); - if (res < 0) - { - ITT_ANDROID_LOGE("Unable to concatenate marker file string."); - return lib_name; - } - itt_marker_file_fd = open(system_wide_marker_filename, O_RDONLY); - - if (itt_marker_file_fd == -1) - { - const pid_t my_pid = getpid(); - char cmdline_path[PATH_MAX] = {0}; - char package_name[PATH_MAX] = {0}; - char app_sandbox_file[PATH_MAX] = {0}; - int cmdline_fd = 0; - - ITT_ANDROID_LOGI("Unable to open system-wide marker file."); - res = snprintf(cmdline_path, PATH_MAX - 1, "/proc/%d/cmdline", my_pid); - if (res < 0) - { - ITT_ANDROID_LOGE("Unable to get cmdline path string."); - return lib_name; - } - - ITT_ANDROID_LOGI("CMD file: %s\n", cmdline_path); - cmdline_fd = open(cmdline_path, O_RDONLY); - if (cmdline_fd == -1) - { - ITT_ANDROID_LOGE("Unable to open %s file!", cmdline_path); - return lib_name; - } - res = read(cmdline_fd, package_name, PATH_MAX - 1); - if (res == -1) - { - ITT_ANDROID_LOGE("Unable to read %s file!", cmdline_path); - res = close(cmdline_fd); - if (res == -1) - { - ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path); - } - return lib_name; - } - res = close(cmdline_fd); - if (res == -1) - { - ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path); - return lib_name; - } - ITT_ANDROID_LOGI("Package name: %s\n", package_name); - res = snprintf(app_sandbox_file, PATH_MAX - 1, "/data/data/%s/%s", package_name, marker_filename); - if (res < 0) - { - ITT_ANDROID_LOGE("Unable to concatenate marker file string."); - return lib_name; - } - - ITT_ANDROID_LOGI("Lib marker file name: %s\n", app_sandbox_file); - itt_marker_file_fd = open(app_sandbox_file, O_RDONLY); - if (itt_marker_file_fd == -1) - { - ITT_ANDROID_LOGE("Unable to open app marker file!"); - return lib_name; - } - } - - { - char itt_lib_name[PATH_MAX] = {0}; - - res = read(itt_marker_file_fd, itt_lib_name, 
PATH_MAX - 1); - if (res == -1) - { - ITT_ANDROID_LOGE("Unable to read %s file!", itt_marker_file_fd); - res = close(itt_marker_file_fd); - if (res == -1) - { - ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd); - } - return lib_name; - } - ITT_ANDROID_LOGI("ITT Lib path: %s", itt_lib_name); - res = close(itt_marker_file_fd); - if (res == -1) - { - ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd); - return lib_name; - } - ITT_ANDROID_LOGI("Set env %s to %s", ITT_TO_STR(LIB_VAR_NAME), itt_lib_name); - res = setenv(ITT_TO_STR(LIB_VAR_NAME), itt_lib_name, 0); - if (res == -1) - { - ITT_ANDROID_LOGE("Unable to set env var!"); - return lib_name; - } - lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME)); - ITT_ANDROID_LOGI("ITT Lib path from env: %s", lib_name); - } - } -#endif - - return lib_name; -} - -/* Avoid clashes with std::min, reported by tbb team */ -#define __itt_min(a,b) (a) < (b) ? (a) : (b) - -static __itt_group_id __itt_get_groups(void) -{ - int i; - __itt_group_id res = __itt_group_none; - const char* var_name = "INTEL_ITTNOTIFY_GROUPS"; - const char* group_str = __itt_get_env_var(var_name); - - if (group_str != NULL) - { - int len; - char gr[255]; - const char* chunk; - while ((group_str = __itt_fsplit(group_str, ",; ", &chunk, &len)) != NULL) - { - int min_len = __itt_min(len, (int)(sizeof(gr) - 1)); - __itt_fstrcpyn(gr, sizeof(gr) - 1, chunk, min_len); - gr[min_len] = 0; - - for (i = 0; group_list[i].name != NULL; i++) - { - if (!__itt_fstrcmp(gr, group_list[i].name)) - { - res = (__itt_group_id)(res | group_list[i].id); - break; - } - } - } - /* TODO: !!! Workaround for bug with warning for unknown group !!! - * Should be fixed in new initialization scheme. - * Now the following groups should be set always. 
*/ - for (i = 0; group_list[i].id != __itt_group_none; i++) - if (group_list[i].id != __itt_group_all && - group_list[i].id > __itt_group_splitter_min && - group_list[i].id < __itt_group_splitter_max) - res = (__itt_group_id)(res | group_list[i].id); - return res; - } - else - { - for (i = 0; group_alias[i].env_var != NULL; i++) - if (__itt_get_env_var(group_alias[i].env_var) != NULL) - return group_alias[i].groups; - } - - return res; -} - -#undef __itt_min - -static int __itt_lib_version(lib_t lib) -{ - if (lib == NULL) - return 0; - if (__itt_get_proc(lib, "__itt_api_init")) - return 2; - if (__itt_get_proc(lib, "__itt_api_version")) - return 1; - return 0; -} - -/* It's not used right now! Comment it out to avoid warnings. -static void __itt_reinit_all_pointers(void) -{ - int i; - // Fill all pointers with initial stubs - for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) - *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].init_func; -} -*/ - -static void __itt_nullify_all_pointers(void) -{ - int i; - /* Nulify all pointers except domain_create, string_handle_create and counter_create */ - for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) - *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; -} - -#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT -#pragma warning(push) -#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */ -#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */ -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -ITT_EXTERN_C void _N_(fini_ittlib)(void) -{ - __itt_api_fini_t* __itt_api_fini_ptr = NULL; - static volatile TIDT current_thread = 0; - - if (_N_(_ittapi_global).api_initialized) - { - ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); - if (_N_(_ittapi_global).api_initialized) - { - if (current_thread == 
0) - { - if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id(); - if (_N_(_ittapi_global).lib != NULL) - { - __itt_api_fini_ptr = (__itt_api_fini_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_fini"); - } - if (__itt_api_fini_ptr) - { - __itt_api_fini_ptr(&_N_(_ittapi_global)); - } - - __itt_nullify_all_pointers(); - - /* TODO: !!! not safe !!! don't support unload so far. - * if (_N_(_ittapi_global).lib != NULL) - * __itt_unload_lib(_N_(_ittapi_global).lib); - * _N_(_ittapi_global).lib = NULL; - */ - _N_(_ittapi_global).api_initialized = 0; - current_thread = 0; - } - } - if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); - } -} - -ITT_EXTERN_C int _N_(init_ittlib)(const char* lib_name, __itt_group_id init_groups) -{ - int i; - __itt_group_id groups; -#ifdef ITT_COMPLETE_GROUP - __itt_group_id zero_group = __itt_group_none; -#endif /* ITT_COMPLETE_GROUP */ - static volatile TIDT current_thread = 0; - - if (!_N_(_ittapi_global).api_initialized) - { -#ifndef ITT_SIMPLE_INIT - ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); -#endif /* ITT_SIMPLE_INIT */ - - if (!_N_(_ittapi_global).api_initialized) - { - if (current_thread == 0) - { - if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id(); - if (lib_name == NULL) - { - lib_name = __itt_get_lib_name(); - } - groups = __itt_get_groups(); - if (DL_SYMBOLS && (groups != __itt_group_none || lib_name != NULL)) - { - _N_(_ittapi_global).lib = __itt_load_lib((lib_name == NULL) ? 
ittnotify_lib_name : lib_name); - - if (_N_(_ittapi_global).lib != NULL) - { - __itt_api_init_t* __itt_api_init_ptr; - int lib_version = __itt_lib_version(_N_(_ittapi_global).lib); - - switch (lib_version) { - case 0: - groups = __itt_group_legacy; - case 1: - /* Fill all pointers from dynamic library */ - for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) - { - if (_N_(_ittapi_global).api_list_ptr[i].group & groups & init_groups) - { - *_N_(_ittapi_global).api_list_ptr[i].func_ptr = (void*)__itt_get_proc(_N_(_ittapi_global).lib, _N_(_ittapi_global).api_list_ptr[i].name); - if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr == NULL) - { - /* Restore pointers for function with static implementation */ - *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; - __itt_report_error(__itt_error_no_symbol, lib_name, _N_(_ittapi_global).api_list_ptr[i].name); -#ifdef ITT_COMPLETE_GROUP - zero_group = (__itt_group_id)(zero_group | _N_(_ittapi_global).api_list_ptr[i].group); -#endif /* ITT_COMPLETE_GROUP */ - } - } - else - *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; - } - - if (groups == __itt_group_legacy) - { - /* Compatibility with legacy tools */ - ITTNOTIFY_NAME(thread_ignore) = ITTNOTIFY_NAME(thr_ignore); -#if ITT_PLATFORM==ITT_PLATFORM_WIN - ITTNOTIFY_NAME(sync_createA) = ITTNOTIFY_NAME(sync_set_nameA); - ITTNOTIFY_NAME(sync_createW) = ITTNOTIFY_NAME(sync_set_nameW); -#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ - ITTNOTIFY_NAME(sync_create) = ITTNOTIFY_NAME(sync_set_name); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - ITTNOTIFY_NAME(sync_prepare) = ITTNOTIFY_NAME(notify_sync_prepare); - ITTNOTIFY_NAME(sync_cancel) = ITTNOTIFY_NAME(notify_sync_cancel); - ITTNOTIFY_NAME(sync_acquired) = ITTNOTIFY_NAME(notify_sync_acquired); - ITTNOTIFY_NAME(sync_releasing) = ITTNOTIFY_NAME(notify_sync_releasing); - } - -#ifdef ITT_COMPLETE_GROUP - for (i = 0; 
_N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) - if (_N_(_ittapi_global).api_list_ptr[i].group & zero_group) - *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; -#endif /* ITT_COMPLETE_GROUP */ - break; - case 2: - __itt_api_init_ptr = (__itt_api_init_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_init"); - if (__itt_api_init_ptr) - __itt_api_init_ptr(&_N_(_ittapi_global), init_groups); - break; - } - } - else - { - __itt_nullify_all_pointers(); - - __itt_report_error(__itt_error_no_module, lib_name, -#if ITT_PLATFORM==ITT_PLATFORM_WIN - __itt_system_error() -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - dlerror() -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - ); - } - } - else - { - __itt_nullify_all_pointers(); - } - _N_(_ittapi_global).api_initialized = 1; - current_thread = 0; - /* !!! Just to avoid unused code elimination !!! */ - if (__itt_fini_ittlib_ptr == _N_(fini_ittlib)) current_thread = 0; - } - } - -#ifndef ITT_SIMPLE_INIT - if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); -#endif /* ITT_SIMPLE_INIT */ - } - - /* Evaluating if any function ptr is non empty and it's in init_groups */ - for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) - { - if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr != _N_(_ittapi_global).api_list_ptr[i].null_func && - _N_(_ittapi_global).api_list_ptr[i].group & init_groups) - { - return 1; - } - } - return 0; -} - -ITT_EXTERN_C __itt_error_handler_t* _N_(set_error_handler)(__itt_error_handler_t* handler) -{ - __itt_error_handler_t* prev = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler; - _N_(_ittapi_global).error_handler = (void*)(size_t)handler; - return prev; -} - -#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT -#pragma warning(pop) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ Property changes on: 
projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.c ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h (revision 357059) @@ -1,587 +1,595 @@ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef _ITTNOTIFY_CONFIG_H_ #define _ITTNOTIFY_CONFIG_H_ /** @cond exclude_from_documentation */ #ifndef ITT_OS_WIN # define ITT_OS_WIN 1 #endif /* ITT_OS_WIN */ #ifndef ITT_OS_LINUX # define ITT_OS_LINUX 2 #endif /* ITT_OS_LINUX */ #ifndef ITT_OS_MAC # define ITT_OS_MAC 3 #endif /* ITT_OS_MAC */ #ifndef ITT_OS_FREEBSD # define ITT_OS_FREEBSD 4 #endif /* ITT_OS_FREEBSD */ #ifndef ITT_OS # if defined WIN32 || defined _WIN32 # define ITT_OS ITT_OS_WIN # elif defined( __APPLE__ ) && defined( __MACH__ ) # define ITT_OS ITT_OS_MAC # elif defined( __FreeBSD__ ) # define ITT_OS ITT_OS_FREEBSD # else # define ITT_OS ITT_OS_LINUX # endif #endif /* ITT_OS */ #ifndef ITT_PLATFORM_WIN # define ITT_PLATFORM_WIN 1 #endif /* ITT_PLATFORM_WIN */ #ifndef ITT_PLATFORM_POSIX # define ITT_PLATFORM_POSIX 2 #endif /* ITT_PLATFORM_POSIX */ #ifndef ITT_PLATFORM_MAC # define ITT_PLATFORM_MAC 3 #endif /* ITT_PLATFORM_MAC */ #ifndef ITT_PLATFORM_FREEBSD # define ITT_PLATFORM_FREEBSD 4 #endif /* ITT_PLATFORM_FREEBSD */ #ifndef ITT_PLATFORM # if ITT_OS==ITT_OS_WIN # define ITT_PLATFORM ITT_PLATFORM_WIN # elif ITT_OS==ITT_OS_MAC # define ITT_PLATFORM ITT_PLATFORM_MAC # elif ITT_OS==ITT_OS_FREEBSD # define ITT_PLATFORM ITT_PLATFORM_FREEBSD # else # define ITT_PLATFORM ITT_PLATFORM_POSIX # endif #endif /* ITT_PLATFORM */ #if defined(_UNICODE) && !defined(UNICODE) #define UNICODE #endif #include #if ITT_PLATFORM==ITT_PLATFORM_WIN #include #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #include #if defined(UNICODE) || defined(_UNICODE) #include #endif /* UNICODE || _UNICODE */ #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #ifndef ITTAPI_CDECL # if ITT_PLATFORM==ITT_PLATFORM_WIN # define ITTAPI_CDECL __cdecl # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ # if defined _M_IX86 || defined __i386__ # define ITTAPI_CDECL __attribute__ ((cdecl)) # else /* _M_IX86 || 
__i386__ */ # define ITTAPI_CDECL /* actual only on x86 platform */ # endif /* _M_IX86 || __i386__ */ # endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* ITTAPI_CDECL */ #ifndef STDCALL # if ITT_PLATFORM==ITT_PLATFORM_WIN # define STDCALL __stdcall # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ # if defined _M_IX86 || defined __i386__ # define STDCALL __attribute__ ((stdcall)) # else /* _M_IX86 || __i386__ */ # define STDCALL /* supported only on x86 platform */ # endif /* _M_IX86 || __i386__ */ # endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* STDCALL */ #define ITTAPI ITTAPI_CDECL #define LIBITTAPI ITTAPI_CDECL /* TODO: Temporary for compatibility! */ #define ITTAPI_CALL ITTAPI_CDECL #define LIBITTAPI_CALL ITTAPI_CDECL #if ITT_PLATFORM==ITT_PLATFORM_WIN /* use __forceinline (VC++ specific) */ #define ITT_INLINE __forceinline #define ITT_INLINE_ATTRIBUTE /* nothing */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /* * Generally, functions are not inlined unless optimization is specified. * For functions declared inline, this attribute inlines the function even * if no optimization level was specified. */ #ifdef __STRICT_ANSI__ #define ITT_INLINE static #define ITT_INLINE_ATTRIBUTE __attribute__((unused)) #else /* __STRICT_ANSI__ */ #define ITT_INLINE static inline #define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused)) #endif /* __STRICT_ANSI__ */ #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @endcond */ #ifndef ITT_ARCH_IA32 # define ITT_ARCH_IA32 1 #endif /* ITT_ARCH_IA32 */ #ifndef ITT_ARCH_IA32E # define ITT_ARCH_IA32E 2 #endif /* ITT_ARCH_IA32E */ /* Was there a magical reason we didn't have 3 here before? 
*/ #ifndef ITT_ARCH_AARCH64 # define ITT_ARCH_AARCH64 3 #endif /* ITT_ARCH_AARCH64 */ #ifndef ITT_ARCH_ARM # define ITT_ARCH_ARM 4 #endif /* ITT_ARCH_ARM */ #ifndef ITT_ARCH_PPC64 # define ITT_ARCH_PPC64 5 #endif /* ITT_ARCH_PPC64 */ #ifndef ITT_ARCH_MIPS # define ITT_ARCH_MIPS 6 #endif /* ITT_ARCH_MIPS */ #ifndef ITT_ARCH_MIPS64 # define ITT_ARCH_MIPS64 6 #endif /* ITT_ARCH_MIPS64 */ +#ifndef ITT_ARCH_RISCV64 +# define ITT_ARCH_RISCV64 7 +#endif /* ITT_ARCH_RISCV64 */ + #ifndef ITT_ARCH # if defined _M_IX86 || defined __i386__ # define ITT_ARCH ITT_ARCH_IA32 # elif defined _M_X64 || defined _M_AMD64 || defined __x86_64__ # define ITT_ARCH ITT_ARCH_IA32E # elif defined _M_IA64 || defined __ia64__ # define ITT_ARCH ITT_ARCH_IA64 # elif defined _M_ARM || defined __arm__ # define ITT_ARCH ITT_ARCH_ARM # elif defined __powerpc64__ # define ITT_ARCH ITT_ARCH_PPC64 # elif defined __aarch64__ # define ITT_ARCH ITT_ARCH_AARCH64 # elif defined __mips__ && !defined __mips64 # define ITT_ARCH ITT_ARCH_MIPS # elif defined __mips__ && defined __mips64 # define ITT_ARCH ITT_ARCH_MIPS64 +# elif defined __riscv && __riscv_xlen == 64 +# define ITT_ARCH ITT_ARCH_RISCV64 # endif #endif #ifdef __cplusplus # define ITT_EXTERN_C extern "C" # define ITT_EXTERN_C_BEGIN extern "C" { # define ITT_EXTERN_C_END } #else # define ITT_EXTERN_C /* nothing */ # define ITT_EXTERN_C_BEGIN /* nothing */ # define ITT_EXTERN_C_END /* nothing */ #endif /* __cplusplus */ #define ITT_TO_STR_AUX(x) #x #define ITT_TO_STR(x) ITT_TO_STR_AUX(x) #define __ITT_BUILD_ASSERT(expr, suffix) do { \ static char __itt_build_check_##suffix[(expr) ? 1 : -1]; \ __itt_build_check_##suffix[0] = 0; \ } while(0) #define _ITT_BUILD_ASSERT(expr, suffix) __ITT_BUILD_ASSERT((expr), suffix) #define ITT_BUILD_ASSERT(expr) _ITT_BUILD_ASSERT((expr), __LINE__) #define ITT_MAGIC { 0xED, 0xAB, 0xAB, 0xEC, 0x0D, 0xEE, 0xDA, 0x30 } /* Replace with snapshot date YYYYMMDD for promotion build. 
*/ #define API_VERSION_BUILD 20151119 #ifndef API_VERSION_NUM #define API_VERSION_NUM 0.0.0 #endif /* API_VERSION_NUM */ #define API_VERSION "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) \ " (" ITT_TO_STR(API_VERSION_BUILD) ")" /* OS communication functions */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #include typedef HMODULE lib_t; typedef DWORD TIDT; typedef CRITICAL_SECTION mutex_t; #define MUTEX_INITIALIZER { 0 } #define strong_alias(name, aliasname) /* empty for Windows */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #include #if defined(UNICODE) || defined(_UNICODE) #include #endif /* UNICODE */ #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 /* need for PTHREAD_MUTEX_RECURSIVE */ #endif /* _GNU_SOURCE */ #ifndef __USE_UNIX98 #define __USE_UNIX98 1 /* need for PTHREAD_MUTEX_RECURSIVE, on SLES11.1 with gcc 4.3.4 wherein pthread.h missing dependency on __USE_XOPEN2K8 */ #endif /*__USE_UNIX98*/ #include typedef void* lib_t; typedef pthread_t TIDT; typedef pthread_mutex_t mutex_t; #define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define _strong_alias(name, aliasname) \ extern __typeof (name) aliasname __attribute__ ((alias (#name))); #define strong_alias(name, aliasname) _strong_alias(name, aliasname) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_get_proc(lib, name) GetProcAddress(lib, name) #define __itt_mutex_init(mutex) InitializeCriticalSection(mutex) #define __itt_mutex_lock(mutex) EnterCriticalSection(mutex) #define __itt_mutex_unlock(mutex) LeaveCriticalSection(mutex) #define __itt_load_lib(name) LoadLibraryA(name) #define __itt_unload_lib(handle) FreeLibrary(handle) #define __itt_system_error() (int)GetLastError() #define __itt_fstrcmp(s1, s2) lstrcmpA(s1, s2) #define __itt_fstrnlen(s, l) strnlen_s(s, l) #define __itt_fstrcpyn(s1, b, s2, l) strncpy_s(s1, b, s2, l) #define __itt_fstrdup(s) _strdup(s) #define __itt_thread_id() GetCurrentThreadId() #define __itt_thread_yield() SwitchToThread() #ifndef ITT_SIMPLE_INIT 
ITT_INLINE long __itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE; ITT_INLINE long __itt_interlocked_increment(volatile long* ptr) { return InterlockedIncrement(ptr); } #endif /* ITT_SIMPLE_INIT */ #define DL_SYMBOLS (1) #define PTHREAD_SYMBOLS (1) #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ #define __itt_get_proc(lib, name) dlsym(lib, name) #define __itt_mutex_init(mutex) {\ pthread_mutexattr_t mutex_attr; \ int error_code = pthread_mutexattr_init(&mutex_attr); \ if (error_code) \ __itt_report_error(__itt_error_system, "pthread_mutexattr_init", \ error_code); \ error_code = pthread_mutexattr_settype(&mutex_attr, \ PTHREAD_MUTEX_RECURSIVE); \ if (error_code) \ __itt_report_error(__itt_error_system, "pthread_mutexattr_settype", \ error_code); \ error_code = pthread_mutex_init(mutex, &mutex_attr); \ if (error_code) \ __itt_report_error(__itt_error_system, "pthread_mutex_init", \ error_code); \ error_code = pthread_mutexattr_destroy(&mutex_attr); \ if (error_code) \ __itt_report_error(__itt_error_system, "pthread_mutexattr_destroy", \ error_code); \ } #define __itt_mutex_lock(mutex) pthread_mutex_lock(mutex) #define __itt_mutex_unlock(mutex) pthread_mutex_unlock(mutex) #define __itt_load_lib(name) dlopen(name, RTLD_LAZY) #define __itt_unload_lib(handle) dlclose(handle) #define __itt_system_error() errno #define __itt_fstrcmp(s1, s2) strcmp(s1, s2) /* makes customer code define safe APIs for SDL_STRNLEN_S and SDL_STRNCPY_S */ #ifdef SDL_STRNLEN_S #define __itt_fstrnlen(s, l) SDL_STRNLEN_S(s, l) #else #define __itt_fstrnlen(s, l) strlen(s) #endif /* SDL_STRNLEN_S */ #ifdef SDL_STRNCPY_S #define __itt_fstrcpyn(s1, b, s2, l) SDL_STRNCPY_S(s1, b, s2, l) #else #define __itt_fstrcpyn(s1, b, s2, l) strncpy(s1, s2, l) #endif /* SDL_STRNCPY_S */ #define __itt_fstrdup(s) strdup(s) #define __itt_thread_id() pthread_self() #define __itt_thread_yield() sched_yield() #if ITT_ARCH==ITT_ARCH_IA64 #ifdef __INTEL_COMPILER #define __TBB_machine_fetchadd4(addr, val) 
__fetchadd4_acq((void *)addr, val) #else /* __INTEL_COMPILER */ /* TODO: Add Support for not Intel compilers for IA-64 architecture */ #endif /* __INTEL_COMPILER */ #elif ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_IA32E /* ITT_ARCH!=ITT_ARCH_IA64 */ ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend) ITT_INLINE_ATTRIBUTE; ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend) { long result; __asm__ __volatile__("lock\nxadd %0,%1" : "=r"(result),"=m"(*(volatile int*)ptr) : "0"(addend), "m"(*(volatile int*)ptr) : "memory"); return result; } -#elif ITT_ARCH==ITT_ARCH_ARM || ITT_ARCH==ITT_ARCH_PPC64 || ITT_ARCH==ITT_ARCH_AARCH64 || ITT_ARCH==ITT_ARCH_MIPS || ITT_ARCH==ITT_ARCH_MIPS64 +#elif ITT_ARCH == ITT_ARCH_ARM || ITT_ARCH == ITT_ARCH_PPC64 || \ + ITT_ARCH == ITT_ARCH_AARCH64 || ITT_ARCH == ITT_ARCH_MIPS || \ + ITT_ARCH == ITT_ARCH_MIPS64 || ITT_ARCH == ITT_ARCH_RISCV64 #define __TBB_machine_fetchadd4(addr, val) __sync_fetch_and_add(addr, val) #endif /* ITT_ARCH==ITT_ARCH_IA64 */ #ifndef ITT_SIMPLE_INIT ITT_INLINE long __itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE; ITT_INLINE long __itt_interlocked_increment(volatile long* ptr) { return __TBB_machine_fetchadd4(ptr, 1) + 1L; } #endif /* ITT_SIMPLE_INIT */ void* dlopen(const char*, int) __attribute__((weak)); void* dlsym(void*, const char*) __attribute__((weak)); int dlclose(void*) __attribute__((weak)); #define DL_SYMBOLS (dlopen && dlsym && dlclose) int pthread_mutex_init(pthread_mutex_t*, const pthread_mutexattr_t*) __attribute__((weak)); int pthread_mutex_lock(pthread_mutex_t*) __attribute__((weak)); int pthread_mutex_unlock(pthread_mutex_t*) __attribute__((weak)); int pthread_mutex_destroy(pthread_mutex_t*) __attribute__((weak)); int pthread_mutexattr_init(pthread_mutexattr_t*) __attribute__((weak)); int pthread_mutexattr_settype(pthread_mutexattr_t*, int) __attribute__((weak)); int pthread_mutexattr_destroy(pthread_mutexattr_t*) 
__attribute__((weak)); pthread_t pthread_self(void) __attribute__((weak)); #define PTHREAD_SYMBOLS (pthread_mutex_init && pthread_mutex_lock && pthread_mutex_unlock && pthread_mutex_destroy && pthread_mutexattr_init && pthread_mutexattr_settype && pthread_mutexattr_destroy && pthread_self) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ typedef enum { __itt_collection_normal = 0, __itt_collection_paused = 1 } __itt_collection_state; typedef enum { __itt_thread_normal = 0, __itt_thread_ignored = 1 } __itt_thread_state; #pragma pack(push, 8) typedef struct ___itt_thread_info { const char* nameA; /*!< Copy of original name in ASCII. */ #if defined(UNICODE) || defined(_UNICODE) const wchar_t* nameW; /*!< Copy of original name in UNICODE. */ #else /* UNICODE || _UNICODE */ void* nameW; #endif /* UNICODE || _UNICODE */ TIDT tid; __itt_thread_state state; /*!< Thread state (paused or normal) */ int extra1; /*!< Reserved to the runtime */ void* extra2; /*!< Reserved to the runtime */ struct ___itt_thread_info* next; } __itt_thread_info; #include "ittnotify_types.h" /* For __itt_group_id definition */ typedef struct ___itt_api_info_20101001 { const char* name; void** func_ptr; void* init_func; __itt_group_id group; } __itt_api_info_20101001; typedef struct ___itt_api_info { const char* name; void** func_ptr; void* init_func; void* null_func; __itt_group_id group; } __itt_api_info; typedef struct __itt_counter_info { const char* nameA; /*!< Copy of original name in ASCII. */ #if defined(UNICODE) || defined(_UNICODE) const wchar_t* nameW; /*!< Copy of original name in UNICODE. */ #else /* UNICODE || _UNICODE */ void* nameW; #endif /* UNICODE || _UNICODE */ const char* domainA; /*!< Copy of original name in ASCII. */ #if defined(UNICODE) || defined(_UNICODE) const wchar_t* domainW; /*!< Copy of original name in UNICODE. 
*/ #else /* UNICODE || _UNICODE */ void* domainW; #endif /* UNICODE || _UNICODE */ int type; long index; int extra1; /*!< Reserved to the runtime */ void* extra2; /*!< Reserved to the runtime */ struct __itt_counter_info* next; } __itt_counter_info_t; struct ___itt_domain; struct ___itt_string_handle; typedef struct ___itt_global { unsigned char magic[8]; unsigned long version_major; unsigned long version_minor; unsigned long version_build; volatile long api_initialized; volatile long mutex_initialized; volatile long atomic_counter; mutex_t mutex; lib_t lib; void* error_handler; const char** dll_path_ptr; __itt_api_info* api_list_ptr; struct ___itt_global* next; /* Joinable structures below */ __itt_thread_info* thread_list; struct ___itt_domain* domain_list; struct ___itt_string_handle* string_list; __itt_collection_state state; __itt_counter_info_t* counter_list; } __itt_global; #pragma pack(pop) #define NEW_THREAD_INFO_W(gptr,h,h_tail,t,s,n) { \ h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \ if (h != NULL) { \ h->tid = t; \ h->nameA = NULL; \ h->nameW = n ? _wcsdup(n) : NULL; \ h->state = s; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->thread_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_THREAD_INFO_A(gptr,h,h_tail,t,s,n) { \ h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \ if (h != NULL) { \ h->tid = t; \ h->nameA = n ? __itt_fstrdup(n) : NULL; \ h->nameW = NULL; \ h->state = s; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->thread_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_DOMAIN_W(gptr,h,h_tail,name) { \ h = (__itt_domain*)malloc(sizeof(__itt_domain)); \ if (h != NULL) { \ h->flags = 1; /* domain is enabled by default */ \ h->nameA = NULL; \ h->nameW = name ? 
_wcsdup(name) : NULL; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->domain_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_DOMAIN_A(gptr,h,h_tail,name) { \ h = (__itt_domain*)malloc(sizeof(__itt_domain)); \ if (h != NULL) { \ h->flags = 1; /* domain is enabled by default */ \ h->nameA = name ? __itt_fstrdup(name) : NULL; \ h->nameW = NULL; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->domain_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_STRING_HANDLE_W(gptr,h,h_tail,name) { \ h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \ if (h != NULL) { \ h->strA = NULL; \ h->strW = name ? _wcsdup(name) : NULL; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->string_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_STRING_HANDLE_A(gptr,h,h_tail,name) { \ h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \ if (h != NULL) { \ h->strA = name ? __itt_fstrdup(name) : NULL; \ h->strW = NULL; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->string_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_COUNTER_W(gptr,h,h_tail,name,domain,type) { \ h = (__itt_counter_info_t*)malloc(sizeof(__itt_counter_info_t)); \ if (h != NULL) { \ h->nameA = NULL; \ h->nameW = name ? _wcsdup(name) : NULL; \ h->domainA = NULL; \ h->domainW = name ? _wcsdup(domain) : NULL; \ h->type = type; \ h->index = 0; \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->counter_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_COUNTER_A(gptr,h,h_tail,name,domain,type) { \ h = (__itt_counter_info_t*)malloc(sizeof(__itt_counter_info_t)); \ if (h != NULL) { \ h->nameA = name ? __itt_fstrdup(name) : NULL; \ h->nameW = NULL; \ h->domainA = domain ? 
__itt_fstrdup(domain) : NULL; \ h->domainW = NULL; \ h->type = type; \ h->index = 0; \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->counter_list = h; \ else \ h_tail->next = h; \ } \ } #endif /* _ITTNOTIFY_CONFIG_H_ */ Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp (nonexistent) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.cpp (revision 357059) @@ -0,0 +1,1201 @@ + +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kmp_config.h" +#include "kmp_os.h" +#include "ittnotify_config.h" + +#if ITT_PLATFORM==ITT_PLATFORM_WIN +#if defined(__MINGW32__) +#include +#else +#define PATH_MAX 512 +#endif +#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ +#include +#include +#include +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +#include +#include +#include +#include + +#define INTEL_NO_MACRO_BODY +#define INTEL_ITTNOTIFY_API_PRIVATE +#include "ittnotify.h" +#include "legacy/ittnotify.h" + +#if KMP_MSVC_COMPAT +#include "disable_warnings.h" +#endif + +static const char api_version[] = API_VERSION "\0\n@(#) $Revision: 481659 $\n"; + +#define _N_(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n) + +#if ITT_OS==ITT_OS_WIN +static const char* ittnotify_lib_name = "libittnotify.dll"; +#elif ITT_OS==ITT_OS_LINUX || ITT_OS==ITT_OS_FREEBSD +static const char* ittnotify_lib_name = "libittnotify.so"; +#elif ITT_OS==ITT_OS_MAC +static const char* ittnotify_lib_name = 
"libittnotify.dylib"; +#else +#error Unsupported or unknown OS. +#endif + +#ifdef __ANDROID__ +#include +#include +#include +#include +#include +#include +#include + +#ifdef ITT_ANDROID_LOG + #define ITT_ANDROID_LOG_TAG "INTEL_VTUNE_USERAPI" + #define ITT_ANDROID_LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO, ITT_ANDROID_LOG_TAG, __VA_ARGS__)) + #define ITT_ANDROID_LOGW(...) ((void)__android_log_print(ANDROID_LOG_WARN, ITT_ANDROID_LOG_TAG, __VA_ARGS__)) + #define ITT_ANDROID_LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR,ITT_ANDROID_LOG_TAG, __VA_ARGS__)) + #define ITT_ANDROID_LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG,ITT_ANDROID_LOG_TAG, __VA_ARGS__)) +#else + #define ITT_ANDROID_LOGI(...) + #define ITT_ANDROID_LOGW(...) + #define ITT_ANDROID_LOGE(...) + #define ITT_ANDROID_LOGD(...) +#endif + +/* default location of userapi collector on Android */ +#define ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(x) "/data/data/com.intel.vtune/perfrun/lib" \ + #x "/runtime/libittnotify.so" + +#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM +#define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(32) +#else +#define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(64) +#endif + +#endif + +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + + +#ifndef LIB_VAR_NAME +#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM || ITT_ARCH==ITT_ARCH_MIPS +#define LIB_VAR_NAME INTEL_LIBITTNOTIFY32 +#else +#define LIB_VAR_NAME INTEL_LIBITTNOTIFY64 +#endif +#endif /* LIB_VAR_NAME */ + +#define ITT_MUTEX_INIT_AND_LOCK(p) { \ + if (PTHREAD_SYMBOLS) \ + { \ + if (!p.mutex_initialized) \ + { \ + if (__itt_interlocked_increment(&p.atomic_counter) == 1) \ + { \ + __itt_mutex_init(&p.mutex); \ + p.mutex_initialized = 1; \ + } \ + else \ + while (!p.mutex_initialized) \ + __itt_thread_yield(); \ + } \ + __itt_mutex_lock(&p.mutex); \ + } \ +} + +typedef int (__itt_init_ittlib_t)(const char*, __itt_group_id); + +/* this define used to 
control initialization function name. */ +#ifndef __itt_init_ittlib_name +ITT_EXTERN_C int _N_(init_ittlib)(const char*, __itt_group_id); +static __itt_init_ittlib_t* __itt_init_ittlib_ptr = _N_(init_ittlib); +#define __itt_init_ittlib_name __itt_init_ittlib_ptr +#endif /* __itt_init_ittlib_name */ + +typedef void (__itt_fini_ittlib_t)(void); + +/* this define used to control finalization function name. */ +#ifndef __itt_fini_ittlib_name +ITT_EXTERN_C void _N_(fini_ittlib)(void); +static __itt_fini_ittlib_t* __itt_fini_ittlib_ptr = _N_(fini_ittlib); +#define __itt_fini_ittlib_name __itt_fini_ittlib_ptr +#endif /* __itt_fini_ittlib_name */ + +/* building pointers to imported funcs */ +#undef ITT_STUBV +#undef ITT_STUB +#define ITT_STUB(api,type,name,args,params,ptr,group,format) \ +static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ +typedef type api ITT_JOIN(_N_(name),_t) args; \ +ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \ +static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \ +{ \ + __itt_init_ittlib_name(NULL, __itt_group_all); \ + if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \ + return ITTNOTIFY_NAME(name) params; \ + else \ + return (type)0; \ +} + +#define ITT_STUBV(api,type,name,args,params,ptr,group,format) \ +static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ +typedef type api ITT_JOIN(_N_(name),_t) args; \ +ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \ +static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \ +{ \ + __itt_init_ittlib_name(NULL, __itt_group_all); \ + if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \ + ITTNOTIFY_NAME(name) params; \ + else \ + return; \ +} + +#undef __ITT_INTERNAL_INIT +#include "ittnotify_static.h" + +#undef ITT_STUB 
+#undef ITT_STUBV +#define ITT_STUB(api,type,name,args,params,ptr,group,format) \ +static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ +typedef type api ITT_JOIN(_N_(name),_t) args; \ +ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END + +#define ITT_STUBV(api,type,name,args,params,ptr,group,format) \ +static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ +typedef type api ITT_JOIN(_N_(name),_t) args; \ +ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END + +#define __ITT_INTERNAL_INIT +#include "ittnotify_static.h" +#undef __ITT_INTERNAL_INIT + +ITT_GROUP_LIST(group_list); + +#pragma pack(push, 8) + +typedef struct ___itt_group_alias +{ + const char* env_var; + __itt_group_id groups; +} __itt_group_alias; + +static __itt_group_alias group_alias[] = { + { "KMP_FOR_TPROFILE", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync | __itt_group_mark) }, + { "KMP_FOR_TCHECK", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync | __itt_group_fsync | __itt_group_mark | __itt_group_suppress) }, + { NULL, (__itt_group_none) }, + { api_version, (__itt_group_none) } /* !!! Just to avoid unused code elimination !!! 
*/ +}; + +#pragma pack(pop) + +#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT +#pragma warning(push) +#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */ +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +static __itt_api_info api_list[] = { +/* Define functions with static implementation */ +#undef ITT_STUB +#undef ITT_STUBV +#define ITT_STUB(api,type,name,args,params,nameindll,group,format) { ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (__itt_group_id)(group)}, +#define ITT_STUBV ITT_STUB +#define __ITT_INTERNAL_INIT +#include "ittnotify_static.h" +#undef __ITT_INTERNAL_INIT +/* Define functions without static implementation */ +#undef ITT_STUB +#undef ITT_STUBV +#define ITT_STUB(api,type,name,args,params,nameindll,group,format) {ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), NULL, (__itt_group_id)(group)}, +#define ITT_STUBV ITT_STUB +#include "ittnotify_static.h" + {NULL, NULL, NULL, NULL, __itt_group_none} +}; + +#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT +#pragma warning(pop) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +/* static part descriptor which handles. all notification api attributes. 
*/ +__itt_global _N_(_ittapi_global) = { + ITT_MAGIC, /* identification info */ + ITT_MAJOR, ITT_MINOR, API_VERSION_BUILD, /* version info */ + 0, /* api_initialized */ + 0, /* mutex_initialized */ + 0, /* atomic_counter */ + MUTEX_INITIALIZER, /* mutex */ + NULL, /* dynamic library handle */ + NULL, /* error_handler */ + NULL, /* dll_path_ptr */ + (__itt_api_info*)&api_list, /* api_list_ptr */ + NULL, /* next __itt_global */ + NULL, /* thread_list */ + NULL, /* domain_list */ + NULL, /* string_list */ + __itt_collection_normal, /* collection state */ + NULL /* counter_list */ +}; + +typedef void (__itt_api_init_t)(__itt_global*, __itt_group_id); +typedef void (__itt_api_fini_t)(__itt_global*); + +/* ========================================================================= */ + +#ifdef ITT_NOTIFY_EXT_REPORT +ITT_EXTERN_C void _N_(error_handler)(__itt_error_code, va_list args); +#endif /* ITT_NOTIFY_EXT_REPORT */ + +#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT +#pragma warning(push) +#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */ +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +static void __itt_report_error(unsigned code_arg, ...) +{ + va_list args; + va_start(args, code_arg); + + // We use unsigned for the code argument and explicitly cast it here to the + // right enumerator because variadic functions are not compatible with + // default promotions. 
+ __itt_error_code code = (__itt_error_code)code_arg; + + if (_N_(_ittapi_global).error_handler != NULL) + { + __itt_error_handler_t* handler = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler; + handler(code, args); + } +#ifdef ITT_NOTIFY_EXT_REPORT + _N_(error_handler)(code, args); +#endif /* ITT_NOTIFY_EXT_REPORT */ + va_end(args); +} + +#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT +#pragma warning(pop) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +#if ITT_PLATFORM==ITT_PLATFORM_WIN +static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init))(const wchar_t* name) +{ + __itt_domain *h_tail = NULL, *h = NULL; + + if (name == NULL) + { + return NULL; + } + + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { + if (ITTNOTIFY_NAME(domain_createW) && ITTNOTIFY_NAME(domain_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init))) + { + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(domain_createW)(name); + } + } + for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next) + { + if (h->nameW != NULL && !wcscmp(h->nameW, name)) break; + } + if (h == NULL) + { + NEW_DOMAIN_W(&_N_(_ittapi_global),h,h_tail,name); + } + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return h; +} + +static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init))(const char* name) +#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ +static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init))(const char* name) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +{ + __itt_domain *h_tail = NULL, *h = NULL; + + if (name == NULL) + { + return NULL; + } + + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { +#if ITT_PLATFORM==ITT_PLATFORM_WIN + if (ITTNOTIFY_NAME(domain_createA) && ITTNOTIFY_NAME(domain_createA) != 
ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init))) + { + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(domain_createA)(name); + } +#else + if (ITTNOTIFY_NAME(domain_create) && ITTNOTIFY_NAME(domain_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init))) + { + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(domain_create)(name); + } +#endif + } + for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next) + { + if (h->nameA != NULL && !__itt_fstrcmp(h->nameA, name)) break; + } + if (h == NULL) + { + NEW_DOMAIN_A(&_N_(_ittapi_global),h,h_tail,name); + } + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return h; +} + +#if ITT_PLATFORM==ITT_PLATFORM_WIN +static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init))(const wchar_t* name) +{ + __itt_string_handle *h_tail = NULL, *h = NULL; + + if (name == NULL) + { + return NULL; + } + + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { + if (ITTNOTIFY_NAME(string_handle_createW) && ITTNOTIFY_NAME(string_handle_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init))) + { + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(string_handle_createW)(name); + } + } + for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next) + { + if (h->strW != NULL && !wcscmp(h->strW, name)) break; + } + if (h == NULL) + { + NEW_STRING_HANDLE_W(&_N_(_ittapi_global),h,h_tail,name); + } + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return h; +} + +static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init))(const char* name) +#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ +static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init))(const char* name) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +{ + 
__itt_string_handle *h_tail = NULL, *h = NULL; + + if (name == NULL) + { + return NULL; + } + + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { +#if ITT_PLATFORM==ITT_PLATFORM_WIN + if (ITTNOTIFY_NAME(string_handle_createA) && ITTNOTIFY_NAME(string_handle_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init))) + { + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(string_handle_createA)(name); + } +#else + if (ITTNOTIFY_NAME(string_handle_create) && ITTNOTIFY_NAME(string_handle_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init))) + { + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(string_handle_create)(name); + } +#endif + } + for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next) + { + if (h->strA != NULL && !__itt_fstrcmp(h->strA, name)) break; + } + if (h == NULL) + { + NEW_STRING_HANDLE_A(&_N_(_ittapi_global),h,h_tail,name); + } + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return h; +} + +#if ITT_PLATFORM==ITT_PLATFORM_WIN +static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init))(const wchar_t *name, const wchar_t *domain) +{ + __itt_counter_info_t *h_tail = NULL, *h = NULL; + __itt_metadata_type type = __itt_metadata_u64; + + if (name == NULL) + { + return NULL; + } + + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { + if (ITTNOTIFY_NAME(counter_createW) && ITTNOTIFY_NAME(counter_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init))) + { + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(counter_createW)(name, domain); + } + } + for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) + { + if (h->nameW != NULL && h->type == type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) || + 
(h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break; + + } + if (h == NULL) + { + NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain,type); + } + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return (__itt_counter)h; +} + +static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init))(const char *name, const char *domain) +#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ +static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init))(const char *name, const char *domain) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +{ + __itt_counter_info_t *h_tail = NULL, *h = NULL; + __itt_metadata_type type = __itt_metadata_u64; + + if (name == NULL) + { + return NULL; + } + + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { +#if ITT_PLATFORM==ITT_PLATFORM_WIN + if (ITTNOTIFY_NAME(counter_createA) && ITTNOTIFY_NAME(counter_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init))) + { + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(counter_createA)(name, domain); + } +#else + if (ITTNOTIFY_NAME(counter_create) && ITTNOTIFY_NAME(counter_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init))) + { + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(counter_create)(name, domain); + } +#endif + } + for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) + { + if (h->nameA != NULL && h->type == type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) || + (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break; + } + if (h == NULL) + { + NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain,type); + } + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return (__itt_counter)h; +} + +#if ITT_PLATFORM==ITT_PLATFORM_WIN +static __itt_counter ITTAPI 
ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init))(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type) +{ + __itt_counter_info_t *h_tail = NULL, *h = NULL; + + if (name == NULL) + { + return NULL; + } + + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { + if (ITTNOTIFY_NAME(counter_create_typedW) && ITTNOTIFY_NAME(counter_create_typedW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init))) + { + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(counter_create_typedW)(name, domain, type); + } + } + for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) + { + if (h->nameW != NULL && h->type == type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) || + (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break; + + } + if (h == NULL) + { + NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain,type); + } + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return (__itt_counter)h; +} + +static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init))(const char *name, const char *domain, __itt_metadata_type type) +#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ +static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init))(const char *name, const char *domain, __itt_metadata_type type) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +{ + __itt_counter_info_t *h_tail = NULL, *h = NULL; + + if (name == NULL) + { + return NULL; + } + + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { +#if ITT_PLATFORM==ITT_PLATFORM_WIN + if (ITTNOTIFY_NAME(counter_create_typedA) && ITTNOTIFY_NAME(counter_create_typedA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init))) + { + __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(counter_create_typedA)(name, domain, type); + } +#else + if 
(ITTNOTIFY_NAME(counter_create_typed) && ITTNOTIFY_NAME(counter_create_typed) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init))) + { + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return ITTNOTIFY_NAME(counter_create_typed)(name, domain, type); + } +#endif + } + for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) + { + if (h->nameA != NULL && h->type == type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) || + (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break; + } + if (h == NULL) + { + NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain,type); + } + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + return (__itt_counter)h; +} + +/* -------------------------------------------------------------------------- */ + +static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init))(void) +{ + if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) + { + __itt_init_ittlib_name(NULL, __itt_group_all); + } + if (ITTNOTIFY_NAME(pause) && ITTNOTIFY_NAME(pause) != ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init))) + { + ITTNOTIFY_NAME(pause)(); + } + else + { + _N_(_ittapi_global).state = __itt_collection_paused; + } +} + +static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init))(void) +{ + if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) + { + __itt_init_ittlib_name(NULL, __itt_group_all); + } + if (ITTNOTIFY_NAME(resume) && ITTNOTIFY_NAME(resume) != ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init))) + { + ITTNOTIFY_NAME(resume)(); + } + else + { + _N_(_ittapi_global).state = __itt_collection_normal; + } +} + +#if ITT_PLATFORM==ITT_PLATFORM_WIN +static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(const wchar_t* name) +{ + if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) + { + 
__itt_init_ittlib_name(NULL, __itt_group_all); + } + if (ITTNOTIFY_NAME(thread_set_nameW) && ITTNOTIFY_NAME(thread_set_nameW) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))) + { + ITTNOTIFY_NAME(thread_set_nameW)(name); + } +} + +static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setW),_init))(const wchar_t* name, int namelen) +{ + (void)namelen; + ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(name); + return 0; +} + +static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(const char* name) +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(const char* name) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +{ + if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) + { + __itt_init_ittlib_name(NULL, __itt_group_all); + } +#if ITT_PLATFORM==ITT_PLATFORM_WIN + if (ITTNOTIFY_NAME(thread_set_nameA) && ITTNOTIFY_NAME(thread_set_nameA) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))) + { + ITTNOTIFY_NAME(thread_set_nameA)(name); + } +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + if (ITTNOTIFY_NAME(thread_set_name) && ITTNOTIFY_NAME(thread_set_name) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))) + { + ITTNOTIFY_NAME(thread_set_name)(name); + } +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +} + +#if ITT_PLATFORM==ITT_PLATFORM_WIN +static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setA),_init))(const char* name, int namelen) +{ + (void)namelen; + ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(name); + return 0; +} +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ +static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_set),_init))(const char* name, int namelen) +{ + (void)namelen; + ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(name); + return 0; +} +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))(void) +{ + if (!_N_(_ittapi_global).api_initialized 
&& _N_(_ittapi_global).thread_list == NULL) + { + __itt_init_ittlib_name(NULL, __itt_group_all); + } + if (ITTNOTIFY_NAME(thread_ignore) && ITTNOTIFY_NAME(thread_ignore) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))) + { + ITTNOTIFY_NAME(thread_ignore)(); + } +} + +static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_ignore),_init))(void) +{ + ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))(); +} + +static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(enable_attach),_init))(void) +{ +#ifdef __ANDROID__ + /* + * if LIB_VAR_NAME env variable were set before then stay previous value + * else set default path + */ + setenv(ITT_TO_STR(LIB_VAR_NAME), ANDROID_ITTNOTIFY_DEFAULT_PATH, 0); +#endif +} + +/* -------------------------------------------------------------------------- */ + +static const char* __itt_fsplit(const char* s, const char* sep, const char** out, int* len) +{ + int i; + int j; + + if (!s || !sep || !out || !len) + return NULL; + + for (i = 0; s[i]; i++) + { + int b = 0; + for (j = 0; sep[j]; j++) + if (s[i] == sep[j]) + { + b = 1; + break; + } + if (!b) + break; + } + + if (!s[i]) + return NULL; + + *len = 0; + *out = &s[i]; + + for (; s[i]; i++, (*len)++) + { + int b = 0; + for (j = 0; sep[j]; j++) + if (s[i] == sep[j]) + { + b = 1; + break; + } + if (b) + break; + } + + for (; s[i]; i++) + { + int b = 0; + for (j = 0; sep[j]; j++) + if (s[i] == sep[j]) + { + b = 1; + break; + } + if (!b) + break; + } + + return &s[i]; +} + +/* This function return value of env variable that placed into static buffer. + * !!! The same static buffer is used for subsequent calls. !!! + * This was done to aviod dynamic allocation for few calls. + * Actually we need this function only four times. 
+ */ +static const char* __itt_get_env_var(const char* name) +{ +#define MAX_ENV_VALUE_SIZE 4086 + static char env_buff[MAX_ENV_VALUE_SIZE]; + static char* env_value = (char*)env_buff; + + if (name != NULL) + { +#if ITT_PLATFORM==ITT_PLATFORM_WIN + size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff); + DWORD rc = GetEnvironmentVariableA(name, env_value, (DWORD)max_len); + if (rc >= max_len) + __itt_report_error(__itt_error_env_too_long, name, (size_t)rc - 1, (size_t)(max_len - 1)); + else if (rc > 0) + { + const char* ret = (const char*)env_value; + env_value += rc + 1; + return ret; + } + else + { + /* If environment variable is empty, GetEnvirornmentVariables() + * returns zero (number of characters (not including terminating null), + * and GetLastError() returns ERROR_SUCCESS. */ + DWORD err = GetLastError(); + if (err == ERROR_SUCCESS) + return env_value; + + if (err != ERROR_ENVVAR_NOT_FOUND) + __itt_report_error(__itt_error_cant_read_env, name, (int)err); + } +#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ + char* env = getenv(name); + if (env != NULL) + { + size_t len = __itt_fstrnlen(env, MAX_ENV_VALUE_SIZE); + size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff); + if (len < max_len) + { + const char* ret = (const char*)env_value; + __itt_fstrcpyn(env_value, max_len, env, len + 1); + env_value += len + 1; + return ret; + } else + __itt_report_error(__itt_error_env_too_long, name, (size_t)len, (size_t)(max_len - 1)); + } +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + } + return NULL; +} + +static const char* __itt_get_lib_name(void) +{ + const char* lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME)); + +#ifdef __ANDROID__ + if (lib_name == NULL) + { + +#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM + const char* const marker_filename = "com.intel.itt.collector_lib_32"; +#else + const char* const marker_filename = "com.intel.itt.collector_lib_64"; +#endif + + char system_wide_marker_filename[PATH_MAX] = {0}; + int 
itt_marker_file_fd = -1; + ssize_t res = 0; + + res = snprintf(system_wide_marker_filename, PATH_MAX - 1, "%s%s", "/data/local/tmp/", marker_filename); + if (res < 0) + { + ITT_ANDROID_LOGE("Unable to concatenate marker file string."); + return lib_name; + } + itt_marker_file_fd = open(system_wide_marker_filename, O_RDONLY); + + if (itt_marker_file_fd == -1) + { + const pid_t my_pid = getpid(); + char cmdline_path[PATH_MAX] = {0}; + char package_name[PATH_MAX] = {0}; + char app_sandbox_file[PATH_MAX] = {0}; + int cmdline_fd = 0; + + ITT_ANDROID_LOGI("Unable to open system-wide marker file."); + res = snprintf(cmdline_path, PATH_MAX - 1, "/proc/%d/cmdline", my_pid); + if (res < 0) + { + ITT_ANDROID_LOGE("Unable to get cmdline path string."); + return lib_name; + } + + ITT_ANDROID_LOGI("CMD file: %s\n", cmdline_path); + cmdline_fd = open(cmdline_path, O_RDONLY); + if (cmdline_fd == -1) + { + ITT_ANDROID_LOGE("Unable to open %s file!", cmdline_path); + return lib_name; + } + res = read(cmdline_fd, package_name, PATH_MAX - 1); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to read %s file!", cmdline_path); + res = close(cmdline_fd); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path); + } + return lib_name; + } + res = close(cmdline_fd); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path); + return lib_name; + } + ITT_ANDROID_LOGI("Package name: %s\n", package_name); + res = snprintf(app_sandbox_file, PATH_MAX - 1, "/data/data/%s/%s", package_name, marker_filename); + if (res < 0) + { + ITT_ANDROID_LOGE("Unable to concatenate marker file string."); + return lib_name; + } + + ITT_ANDROID_LOGI("Lib marker file name: %s\n", app_sandbox_file); + itt_marker_file_fd = open(app_sandbox_file, O_RDONLY); + if (itt_marker_file_fd == -1) + { + ITT_ANDROID_LOGE("Unable to open app marker file!"); + return lib_name; + } + } + + { + char itt_lib_name[PATH_MAX] = {0}; + + res = read(itt_marker_file_fd, itt_lib_name, 
PATH_MAX - 1); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to read %s file!", itt_marker_file_fd); + res = close(itt_marker_file_fd); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd); + } + return lib_name; + } + ITT_ANDROID_LOGI("ITT Lib path: %s", itt_lib_name); + res = close(itt_marker_file_fd); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd); + return lib_name; + } + ITT_ANDROID_LOGI("Set env %s to %s", ITT_TO_STR(LIB_VAR_NAME), itt_lib_name); + res = setenv(ITT_TO_STR(LIB_VAR_NAME), itt_lib_name, 0); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to set env var!"); + return lib_name; + } + lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME)); + ITT_ANDROID_LOGI("ITT Lib path from env: %s", lib_name); + } + } +#endif + + return lib_name; +} + +/* Avoid clashes with std::min, reported by tbb team */ +#define __itt_min(a,b) (a) < (b) ? (a) : (b) + +static __itt_group_id __itt_get_groups(void) +{ + int i; + __itt_group_id res = __itt_group_none; + const char* var_name = "INTEL_ITTNOTIFY_GROUPS"; + const char* group_str = __itt_get_env_var(var_name); + + if (group_str != NULL) + { + int len; + char gr[255]; + const char* chunk; + while ((group_str = __itt_fsplit(group_str, ",; ", &chunk, &len)) != NULL) + { + int min_len = __itt_min(len, (int)(sizeof(gr) - 1)); + __itt_fstrcpyn(gr, sizeof(gr) - 1, chunk, min_len); + gr[min_len] = 0; + + for (i = 0; group_list[i].name != NULL; i++) + { + if (!__itt_fstrcmp(gr, group_list[i].name)) + { + res = (__itt_group_id)(res | group_list[i].id); + break; + } + } + } + /* TODO: !!! Workaround for bug with warning for unknown group !!! + * Should be fixed in new initialization scheme. + * Now the following groups should be set always. 
*/ + for (i = 0; group_list[i].id != __itt_group_none; i++) + if (group_list[i].id != __itt_group_all && + group_list[i].id > __itt_group_splitter_min && + group_list[i].id < __itt_group_splitter_max) + res = (__itt_group_id)(res | group_list[i].id); + return res; + } + else + { + for (i = 0; group_alias[i].env_var != NULL; i++) + if (__itt_get_env_var(group_alias[i].env_var) != NULL) + return group_alias[i].groups; + } + + return res; +} + +#undef __itt_min + +static int __itt_lib_version(lib_t lib) +{ + if (lib == NULL) + return 0; + if (__itt_get_proc(lib, "__itt_api_init")) + return 2; + if (__itt_get_proc(lib, "__itt_api_version")) + return 1; + return 0; +} + +/* It's not used right now! Comment it out to avoid warnings. +static void __itt_reinit_all_pointers(void) +{ + int i; + // Fill all pointers with initial stubs + for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) + *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].init_func; +} +*/ + +static void __itt_nullify_all_pointers(void) +{ + int i; + /* Nulify all pointers except domain_create, string_handle_create and counter_create */ + for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) + *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; +} + +#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT +#pragma warning(push) +#pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */ +#pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */ +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + +ITT_EXTERN_C void _N_(fini_ittlib)(void) +{ + __itt_api_fini_t* __itt_api_fini_ptr = NULL; + static volatile TIDT current_thread = 0; + + if (_N_(_ittapi_global).api_initialized) + { + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); + if (_N_(_ittapi_global).api_initialized) + { + if (current_thread == 
0) + { + if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id(); + if (_N_(_ittapi_global).lib != NULL) + { + __itt_api_fini_ptr = (__itt_api_fini_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_fini"); + } + if (__itt_api_fini_ptr) + { + __itt_api_fini_ptr(&_N_(_ittapi_global)); + } + + __itt_nullify_all_pointers(); + + /* TODO: !!! not safe !!! don't support unload so far. + * if (_N_(_ittapi_global).lib != NULL) + * __itt_unload_lib(_N_(_ittapi_global).lib); + * _N_(_ittapi_global).lib = NULL; + */ + _N_(_ittapi_global).api_initialized = 0; + current_thread = 0; + } + } + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); + } +} + +ITT_EXTERN_C int _N_(init_ittlib)(const char* lib_name, __itt_group_id init_groups) +{ + int i; + __itt_group_id groups; +#ifdef ITT_COMPLETE_GROUP + __itt_group_id zero_group = __itt_group_none; +#endif /* ITT_COMPLETE_GROUP */ + static volatile TIDT current_thread = 0; + + if (!_N_(_ittapi_global).api_initialized) + { +#ifndef ITT_SIMPLE_INIT + ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); +#endif /* ITT_SIMPLE_INIT */ + + if (!_N_(_ittapi_global).api_initialized) + { + if (current_thread == 0) + { + if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id(); + if (lib_name == NULL) + { + lib_name = __itt_get_lib_name(); + } + groups = __itt_get_groups(); + if (DL_SYMBOLS && (groups != __itt_group_none || lib_name != NULL)) + { + _N_(_ittapi_global).lib = __itt_load_lib((lib_name == NULL) ? 
ittnotify_lib_name : lib_name); + + if (_N_(_ittapi_global).lib != NULL) + { + __itt_api_init_t* __itt_api_init_ptr; + int lib_version = __itt_lib_version(_N_(_ittapi_global).lib); + + switch (lib_version) { + case 0: + groups = __itt_group_legacy; + KMP_FALLTHROUGH(); + case 1: + /* Fill all pointers from dynamic library */ + for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) + { + if (_N_(_ittapi_global).api_list_ptr[i].group & groups & init_groups) + { + *_N_(_ittapi_global).api_list_ptr[i].func_ptr = (void*)__itt_get_proc(_N_(_ittapi_global).lib, _N_(_ittapi_global).api_list_ptr[i].name); + if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr == NULL) + { + /* Restore pointers for function with static implementation */ + *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; + __itt_report_error(__itt_error_no_symbol, lib_name, _N_(_ittapi_global).api_list_ptr[i].name); +#ifdef ITT_COMPLETE_GROUP + zero_group = (__itt_group_id)(zero_group | _N_(_ittapi_global).api_list_ptr[i].group); +#endif /* ITT_COMPLETE_GROUP */ + } + } + else + *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; + } + + if (groups == __itt_group_legacy) + { + /* Compatibility with legacy tools */ + ITTNOTIFY_NAME(thread_ignore) = ITTNOTIFY_NAME(thr_ignore); +#if ITT_PLATFORM==ITT_PLATFORM_WIN + ITTNOTIFY_NAME(sync_createA) = ITTNOTIFY_NAME(sync_set_nameA); + ITTNOTIFY_NAME(sync_createW) = ITTNOTIFY_NAME(sync_set_nameW); +#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ + ITTNOTIFY_NAME(sync_create) = ITTNOTIFY_NAME(sync_set_name); +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + ITTNOTIFY_NAME(sync_prepare) = ITTNOTIFY_NAME(notify_sync_prepare); + ITTNOTIFY_NAME(sync_cancel) = ITTNOTIFY_NAME(notify_sync_cancel); + ITTNOTIFY_NAME(sync_acquired) = ITTNOTIFY_NAME(notify_sync_acquired); + ITTNOTIFY_NAME(sync_releasing) = ITTNOTIFY_NAME(notify_sync_releasing); + } + +#ifdef ITT_COMPLETE_GROUP + for (i = 0; 
_N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) + if (_N_(_ittapi_global).api_list_ptr[i].group & zero_group) + *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; +#endif /* ITT_COMPLETE_GROUP */ + break; + case 2: + __itt_api_init_ptr = (__itt_api_init_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_init"); + if (__itt_api_init_ptr) + __itt_api_init_ptr(&_N_(_ittapi_global), init_groups); + break; + } + } + else + { + __itt_nullify_all_pointers(); + + __itt_report_error(__itt_error_no_module, lib_name, +#if ITT_PLATFORM==ITT_PLATFORM_WIN + __itt_system_error() +#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + dlerror() +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ + ); + } + } + else + { + __itt_nullify_all_pointers(); + } + _N_(_ittapi_global).api_initialized = 1; + current_thread = 0; + /* !!! Just to avoid unused code elimination !!! */ + if (__itt_fini_ittlib_ptr == _N_(fini_ittlib)) current_thread = 0; + } + } + +#ifndef ITT_SIMPLE_INIT + if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); +#endif /* ITT_SIMPLE_INIT */ + } + + /* Evaluating if any function ptr is non empty and it's in init_groups */ + for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) + { + if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr != _N_(_ittapi_global).api_list_ptr[i].null_func && + _N_(_ittapi_global).api_list_ptr[i].group & init_groups) + { + return 1; + } + } + return 0; +} + +ITT_EXTERN_C __itt_error_handler_t* _N_(set_error_handler)(__itt_error_handler_t* handler) +{ + __itt_error_handler_t* prev = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler; + _N_(_ittapi_global).error_handler = (void*)(size_t)handler; + return prev; +} + +#if ITT_PLATFORM==ITT_PLATFORM_WIN && KMP_MSVC_COMPAT +#pragma warning(pop) +#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/z_Linux_asm.S 
=================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/z_Linux_asm.S (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/z_Linux_asm.S (revision 357059) @@ -1,1555 +1,1762 @@ // z_Linux_asm.S: - microtasking routines specifically // written for Intel platforms running Linux* OS // ////===----------------------------------------------------------------------===// //// //// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. //// See https://llvm.org/LICENSE.txt for license information. //// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception //// ////===----------------------------------------------------------------------===// // // ----------------------------------------------------------------------- // macros // ----------------------------------------------------------------------- #include "kmp_config.h" #if KMP_ARCH_X86 || KMP_ARCH_X86_64 # if KMP_MIC // the 'delay r16/r32/r64' should be used instead of the 'pause'. // The delay operation has the effect of removing the current thread from // the round-robin HT mechanism, and therefore speeds up the issue rate of // the other threads on the same core. // // A value of 0 works fine for <= 2 threads per core, but causes the EPCC // barrier time to increase greatly for 3 or more threads per core. // // A value of 100 works pretty well for up to 4 threads per core, but isn't // quite as fast as 0 for 2 threads per core. // // We need to check what happens for oversubscription / > 4 threads per core. // It is possible that we need to pass the delay value in as a parameter // that the caller determines based on the total # threads / # cores. 
// //.macro pause_op // mov $100, %rax // delay %rax //.endm # else # define pause_op .byte 0xf3,0x90 # endif // KMP_MIC # if KMP_OS_DARWIN # define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols # define KMP_LABEL(x) L_##x // form the name of label .macro KMP_CFI_DEF_OFFSET .endmacro .macro KMP_CFI_OFFSET .endmacro .macro KMP_CFI_REGISTER .endmacro .macro KMP_CFI_DEF .endmacro .macro ALIGN .align $0 .endmacro .macro DEBUG_INFO /* Not sure what .size does in icc, not sure if we need to do something similar for OS X*. */ .endmacro .macro PROC ALIGN 4 .globl KMP_PREFIX_UNDERSCORE($0) KMP_PREFIX_UNDERSCORE($0): .endmacro # else // KMP_OS_DARWIN # define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols // Format labels so that they don't override function names in gdb's backtraces // MIC assembler doesn't accept .L syntax, the L works fine there (as well as // on OS X*) # if KMP_MIC # define KMP_LABEL(x) L_##x // local label # else # define KMP_LABEL(x) .L_##x // local label hidden from backtraces # endif // KMP_MIC .macro ALIGN size .align 1<<(\size) .endm .macro DEBUG_INFO proc .cfi_endproc // Not sure why we need .type and .size for the functions .align 16 .type \proc,@function .size \proc,.-\proc .endm .macro PROC proc ALIGN 4 .globl KMP_PREFIX_UNDERSCORE(\proc) KMP_PREFIX_UNDERSCORE(\proc): .cfi_startproc .endm .macro KMP_CFI_DEF_OFFSET sz .cfi_def_cfa_offset \sz .endm .macro KMP_CFI_OFFSET reg, sz .cfi_offset \reg,\sz .endm .macro KMP_CFI_REGISTER reg .cfi_def_cfa_register \reg .endm .macro KMP_CFI_DEF reg, sz .cfi_def_cfa \reg,\sz .endm # endif // KMP_OS_DARWIN #endif // KMP_ARCH_X86 || KMP_ARCH_x86_64 #if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 # if KMP_OS_DARWIN # define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols # define KMP_LABEL(x) L_##x // form the name of label .macro ALIGN .align $0 .endmacro .macro DEBUG_INFO /* Not sure what .size does in icc, not sure if we need to do 
something similar for OS X*. */ .endmacro .macro PROC ALIGN 4 .globl KMP_PREFIX_UNDERSCORE($0) KMP_PREFIX_UNDERSCORE($0): .endmacro # else // KMP_OS_DARWIN # define KMP_PREFIX_UNDERSCORE(x) x // no extra underscore for Linux* OS symbols // Format labels so that they don't override function names in gdb's backtraces # define KMP_LABEL(x) .L_##x // local label hidden from backtraces .macro ALIGN size .align 1<<(\size) .endm .macro DEBUG_INFO proc .cfi_endproc // Not sure why we need .type and .size for the functions ALIGN 2 .type \proc,@function .size \proc,.-\proc .endm .macro PROC proc ALIGN 2 .globl KMP_PREFIX_UNDERSCORE(\proc) KMP_PREFIX_UNDERSCORE(\proc): .cfi_startproc .endm # endif // KMP_OS_DARWIN #endif // (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 // ----------------------------------------------------------------------- // data // ----------------------------------------------------------------------- #ifdef KMP_GOMP_COMPAT // Support for unnamed common blocks. // // Because the symbol ".gomp_critical_user_" contains a ".", we have to // put this stuff in assembly. 
# if KMP_ARCH_X86 # if KMP_OS_DARWIN .data .comm .gomp_critical_user_,32 .data .globl ___kmp_unnamed_critical_addr ___kmp_unnamed_critical_addr: .long .gomp_critical_user_ # else /* Linux* OS */ .data .comm .gomp_critical_user_,32,8 .data ALIGN 4 .global __kmp_unnamed_critical_addr __kmp_unnamed_critical_addr: .4byte .gomp_critical_user_ .type __kmp_unnamed_critical_addr,@object .size __kmp_unnamed_critical_addr,4 # endif /* KMP_OS_DARWIN */ # endif /* KMP_ARCH_X86 */ # if KMP_ARCH_X86_64 # if KMP_OS_DARWIN .data .comm .gomp_critical_user_,32 .data .globl ___kmp_unnamed_critical_addr ___kmp_unnamed_critical_addr: .quad .gomp_critical_user_ # else /* Linux* OS */ .data .comm .gomp_critical_user_,32,8 .data ALIGN 8 .global __kmp_unnamed_critical_addr __kmp_unnamed_critical_addr: .8byte .gomp_critical_user_ .type __kmp_unnamed_critical_addr,@object .size __kmp_unnamed_critical_addr,8 # endif /* KMP_OS_DARWIN */ # endif /* KMP_ARCH_X86_64 */ #endif /* KMP_GOMP_COMPAT */ #if KMP_ARCH_X86 && !KMP_ARCH_PPC64 // ----------------------------------------------------------------------- // microtasking routines specifically written for IA-32 architecture // running Linux* OS // ----------------------------------------------------------------------- .ident "Intel Corporation" .data ALIGN 4 // void // __kmp_x86_pause( void ); .text PROC __kmp_x86_pause pause_op ret DEBUG_INFO __kmp_x86_pause # if !KMP_ASM_INTRINS //------------------------------------------------------------------------ // kmp_int32 // __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d ); PROC __kmp_test_then_add32 movl 4(%esp), %ecx movl 8(%esp), %eax lock xaddl %eax,(%ecx) ret DEBUG_INFO __kmp_test_then_add32 //------------------------------------------------------------------------ // FUNCTION __kmp_xchg_fixed8 // // kmp_int32 // __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d ); // // parameters: // p: 4(%esp) // d: 8(%esp) // // return: %al PROC __kmp_xchg_fixed8 movl 4(%esp), %ecx // "p" movb 
8(%esp), %al // "d" lock xchgb %al,(%ecx) ret DEBUG_INFO __kmp_xchg_fixed8 //------------------------------------------------------------------------ // FUNCTION __kmp_xchg_fixed16 // // kmp_int16 // __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d ); // // parameters: // p: 4(%esp) // d: 8(%esp) // return: %ax PROC __kmp_xchg_fixed16 movl 4(%esp), %ecx // "p" movw 8(%esp), %ax // "d" lock xchgw %ax,(%ecx) ret DEBUG_INFO __kmp_xchg_fixed16 //------------------------------------------------------------------------ // FUNCTION __kmp_xchg_fixed32 // // kmp_int32 // __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d ); // // parameters: // p: 4(%esp) // d: 8(%esp) // // return: %eax PROC __kmp_xchg_fixed32 movl 4(%esp), %ecx // "p" movl 8(%esp), %eax // "d" lock xchgl %eax,(%ecx) ret DEBUG_INFO __kmp_xchg_fixed32 // kmp_int8 // __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv ); PROC __kmp_compare_and_store8 movl 4(%esp), %ecx movb 8(%esp), %al movb 12(%esp), %dl lock cmpxchgb %dl,(%ecx) sete %al // if %al == (%ecx) set %al = 1 else set %al = 0 and $1, %eax // sign extend previous instruction ret DEBUG_INFO __kmp_compare_and_store8 // kmp_int16 // __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv); PROC __kmp_compare_and_store16 movl 4(%esp), %ecx movw 8(%esp), %ax movw 12(%esp), %dx lock cmpxchgw %dx,(%ecx) sete %al // if %ax == (%ecx) set %al = 1 else set %al = 0 and $1, %eax // sign extend previous instruction ret DEBUG_INFO __kmp_compare_and_store16 // kmp_int32 // __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv); PROC __kmp_compare_and_store32 movl 4(%esp), %ecx movl 8(%esp), %eax movl 12(%esp), %edx lock cmpxchgl %edx,(%ecx) sete %al // if %eax == (%ecx) set %al = 1 else set %al = 0 and $1, %eax // sign extend previous instruction ret DEBUG_INFO __kmp_compare_and_store32 // kmp_int32 // __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 s ); PROC 
__kmp_compare_and_store64 pushl %ebp movl %esp, %ebp pushl %ebx pushl %edi movl 8(%ebp), %edi movl 12(%ebp), %eax // "cv" low order word movl 16(%ebp), %edx // "cv" high order word movl 20(%ebp), %ebx // "sv" low order word movl 24(%ebp), %ecx // "sv" high order word lock cmpxchg8b (%edi) sete %al // if %edx:eax == (%edi) set %al = 1 else set %al = 0 and $1, %eax // sign extend previous instruction popl %edi popl %ebx movl %ebp, %esp popl %ebp ret DEBUG_INFO __kmp_compare_and_store64 // kmp_int8 // __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv); PROC __kmp_compare_and_store_ret8 movl 4(%esp), %ecx movb 8(%esp), %al movb 12(%esp), %dl lock cmpxchgb %dl,(%ecx) ret DEBUG_INFO __kmp_compare_and_store_ret8 // kmp_int16 // __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv, // kmp_int16 sv); PROC __kmp_compare_and_store_ret16 movl 4(%esp), %ecx movw 8(%esp), %ax movw 12(%esp), %dx lock cmpxchgw %dx,(%ecx) ret DEBUG_INFO __kmp_compare_and_store_ret16 // kmp_int32 // __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv, // kmp_int32 sv); PROC __kmp_compare_and_store_ret32 movl 4(%esp), %ecx movl 8(%esp), %eax movl 12(%esp), %edx lock cmpxchgl %edx,(%ecx) ret DEBUG_INFO __kmp_compare_and_store_ret32 // kmp_int64 // __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv, // kmp_int64 sv); PROC __kmp_compare_and_store_ret64 pushl %ebp movl %esp, %ebp pushl %ebx pushl %edi movl 8(%ebp), %edi movl 12(%ebp), %eax // "cv" low order word movl 16(%ebp), %edx // "cv" high order word movl 20(%ebp), %ebx // "sv" low order word movl 24(%ebp), %ecx // "sv" high order word lock cmpxchg8b (%edi) popl %edi popl %ebx movl %ebp, %esp popl %ebp ret DEBUG_INFO __kmp_compare_and_store_ret64 //------------------------------------------------------------------------ // FUNCTION __kmp_xchg_real32 // // kmp_real32 // __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data ); // // parameters: // addr: 4(%esp) // data: 
8(%esp) // // return: %eax PROC __kmp_xchg_real32 pushl %ebp movl %esp, %ebp subl $4, %esp pushl %esi movl 4(%ebp), %esi flds (%esi) // load fsts -4(%ebp) // store old value movl 8(%ebp), %eax lock xchgl %eax, (%esi) flds -4(%ebp) // return old value popl %esi movl %ebp, %esp popl %ebp ret DEBUG_INFO __kmp_xchg_real32 # endif /* !KMP_ASM_INTRINS */ //------------------------------------------------------------------------ -// typedef void (*microtask_t)( int *gtid, int *tid, ... ); -// // int -// __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid, -// int argc, void *p_argv[] ) { -// (*pkfn)( & gtid, & gtid, argv[0], ... ); -// return 1; +// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), +// int gtid, int tid, +// int argc, void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)( & gtid, & tid, argv[0], ... ); +// return 1; // } // -- Begin __kmp_invoke_microtask // mark_begin; PROC __kmp_invoke_microtask pushl %ebp KMP_CFI_DEF_OFFSET 8 KMP_CFI_OFFSET ebp,-8 movl %esp,%ebp // establish the base pointer for this routine. KMP_CFI_REGISTER ebp subl $8,%esp // allocate space for two local variables. 
// These varibales are: // argv: -4(%ebp) // temp: -8(%ebp) // pushl %ebx // save %ebx to use during this routine // #if OMPT_SUPPORT movl 28(%ebp),%ebx // get exit_frame address movl %ebp,(%ebx) // save exit_frame #endif movl 20(%ebp),%ebx // Stack alignment - # args addl $2,%ebx // #args +2 Always pass at least 2 args (gtid and tid) shll $2,%ebx // Number of bytes used on stack: (#args+2)*4 movl %esp,%eax // subl %ebx,%eax // %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this movl %eax,%ebx // Save to %ebx andl $0xFFFFFF80,%eax // mask off 7 bits subl %eax,%ebx // Amount to subtract from %esp subl %ebx,%esp // Prepare the stack ptr -- // now it will be aligned on 128-byte boundary at the call movl 24(%ebp),%eax // copy from p_argv[] movl %eax,-4(%ebp) // into the local variable *argv. movl 20(%ebp),%ebx // argc is 20(%ebp) shll $2,%ebx KMP_LABEL(invoke_2): cmpl $0,%ebx jg KMP_LABEL(invoke_4) jmp KMP_LABEL(invoke_3) ALIGN 2 KMP_LABEL(invoke_4): movl -4(%ebp),%eax subl $4,%ebx // decrement argc. addl %ebx,%eax // index into argv. 
movl (%eax),%edx pushl %edx jmp KMP_LABEL(invoke_2) ALIGN 2 KMP_LABEL(invoke_3): leal 16(%ebp),%eax // push & tid pushl %eax leal 12(%ebp),%eax // push & gtid pushl %eax movl 8(%ebp),%ebx call *%ebx // call (*pkfn)(); movl $1,%eax // return 1; movl -12(%ebp),%ebx // restore %ebx leave KMP_CFI_DEF esp,4 ret DEBUG_INFO __kmp_invoke_microtask // -- End __kmp_invoke_microtask // kmp_uint64 // __kmp_hardware_timestamp(void) PROC __kmp_hardware_timestamp rdtsc ret DEBUG_INFO __kmp_hardware_timestamp // -- End __kmp_hardware_timestamp #endif /* KMP_ARCH_X86 */ #if KMP_ARCH_X86_64 // ----------------------------------------------------------------------- // microtasking routines specifically written for IA-32 architecture and // Intel(R) 64 running Linux* OS // ----------------------------------------------------------------------- // -- Machine type P // mark_description "Intel Corporation"; .ident "Intel Corporation" // -- .file "z_Linux_asm.S" .data ALIGN 4 // To prevent getting our code into .data section .text added to every routine // definition for x86_64. 
//------------------------------------------------------------------------ # if !KMP_ASM_INTRINS //------------------------------------------------------------------------ // FUNCTION __kmp_test_then_add32 // // kmp_int32 // __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d ); // // parameters: // p: %rdi // d: %esi // // return: %eax .text PROC __kmp_test_then_add32 movl %esi, %eax // "d" lock xaddl %eax,(%rdi) ret DEBUG_INFO __kmp_test_then_add32 //------------------------------------------------------------------------ // FUNCTION __kmp_test_then_add64 // // kmp_int64 // __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d ); // // parameters: // p: %rdi // d: %rsi // return: %rax .text PROC __kmp_test_then_add64 movq %rsi, %rax // "d" lock xaddq %rax,(%rdi) ret DEBUG_INFO __kmp_test_then_add64 //------------------------------------------------------------------------ // FUNCTION __kmp_xchg_fixed8 // // kmp_int32 // __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d ); // // parameters: // p: %rdi // d: %sil // // return: %al .text PROC __kmp_xchg_fixed8 movb %sil, %al // "d" lock xchgb %al,(%rdi) ret DEBUG_INFO __kmp_xchg_fixed8 //------------------------------------------------------------------------ // FUNCTION __kmp_xchg_fixed16 // // kmp_int16 // __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d ); // // parameters: // p: %rdi // d: %si // return: %ax .text PROC __kmp_xchg_fixed16 movw %si, %ax // "d" lock xchgw %ax,(%rdi) ret DEBUG_INFO __kmp_xchg_fixed16 //------------------------------------------------------------------------ // FUNCTION __kmp_xchg_fixed32 // // kmp_int32 // __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d ); // // parameters: // p: %rdi // d: %esi // // return: %eax .text PROC __kmp_xchg_fixed32 movl %esi, %eax // "d" lock xchgl %eax,(%rdi) ret DEBUG_INFO __kmp_xchg_fixed32 //------------------------------------------------------------------------ // FUNCTION __kmp_xchg_fixed64 // // kmp_int64 // 
__kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d ); // // parameters: // p: %rdi // d: %rsi // return: %rax .text PROC __kmp_xchg_fixed64 movq %rsi, %rax // "d" lock xchgq %rax,(%rdi) ret DEBUG_INFO __kmp_xchg_fixed64 //------------------------------------------------------------------------ // FUNCTION __kmp_compare_and_store8 // // kmp_int8 // __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv ); // // parameters: // p: %rdi // cv: %esi // sv: %edx // // return: %eax .text PROC __kmp_compare_and_store8 movb %sil, %al // "cv" lock cmpxchgb %dl,(%rdi) sete %al // if %al == (%rdi) set %al = 1 else set %al = 0 andq $1, %rax // sign extend previous instruction for return value ret DEBUG_INFO __kmp_compare_and_store8 //------------------------------------------------------------------------ // FUNCTION __kmp_compare_and_store16 // // kmp_int16 // __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv ); // // parameters: // p: %rdi // cv: %si // sv: %dx // // return: %eax .text PROC __kmp_compare_and_store16 movw %si, %ax // "cv" lock cmpxchgw %dx,(%rdi) sete %al // if %ax == (%rdi) set %al = 1 else set %al = 0 andq $1, %rax // sign extend previous instruction for return value ret DEBUG_INFO __kmp_compare_and_store16 //------------------------------------------------------------------------ // FUNCTION __kmp_compare_and_store32 // // kmp_int32 // __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv ); // // parameters: // p: %rdi // cv: %esi // sv: %edx // // return: %eax .text PROC __kmp_compare_and_store32 movl %esi, %eax // "cv" lock cmpxchgl %edx,(%rdi) sete %al // if %eax == (%rdi) set %al = 1 else set %al = 0 andq $1, %rax // sign extend previous instruction for return value ret DEBUG_INFO __kmp_compare_and_store32 //------------------------------------------------------------------------ // FUNCTION __kmp_compare_and_store64 // // kmp_int32 // __kmp_compare_and_store64( volatile kmp_int64 
*p, kmp_int64 cv, kmp_int64 sv ); // // parameters: // p: %rdi // cv: %rsi // sv: %rdx // return: %eax .text PROC __kmp_compare_and_store64 movq %rsi, %rax // "cv" lock cmpxchgq %rdx,(%rdi) sete %al // if %rax == (%rdi) set %al = 1 else set %al = 0 andq $1, %rax // sign extend previous instruction for return value ret DEBUG_INFO __kmp_compare_and_store64 //------------------------------------------------------------------------ // FUNCTION __kmp_compare_and_store_ret8 // // kmp_int8 // __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv ); // // parameters: // p: %rdi // cv: %esi // sv: %edx // // return: %eax .text PROC __kmp_compare_and_store_ret8 movb %sil, %al // "cv" lock cmpxchgb %dl,(%rdi) ret DEBUG_INFO __kmp_compare_and_store_ret8 //------------------------------------------------------------------------ // FUNCTION __kmp_compare_and_store_ret16 // // kmp_int16 // __kmp_compare_and_store16_ret( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv ); // // parameters: // p: %rdi // cv: %si // sv: %dx // // return: %eax .text PROC __kmp_compare_and_store_ret16 movw %si, %ax // "cv" lock cmpxchgw %dx,(%rdi) ret DEBUG_INFO __kmp_compare_and_store_ret16 //------------------------------------------------------------------------ // FUNCTION __kmp_compare_and_store_ret32 // // kmp_int32 // __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv ); // // parameters: // p: %rdi // cv: %esi // sv: %edx // // return: %eax .text PROC __kmp_compare_and_store_ret32 movl %esi, %eax // "cv" lock cmpxchgl %edx,(%rdi) ret DEBUG_INFO __kmp_compare_and_store_ret32 //------------------------------------------------------------------------ // FUNCTION __kmp_compare_and_store_ret64 // // kmp_int64 // __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv ); // // parameters: // p: %rdi // cv: %rsi // sv: %rdx // return: %eax .text PROC __kmp_compare_and_store_ret64 movq %rsi, %rax // "cv" lock cmpxchgq 
%rdx,(%rdi) ret DEBUG_INFO __kmp_compare_and_store_ret64 # endif /* !KMP_ASM_INTRINS */ # if !KMP_MIC # if !KMP_ASM_INTRINS //------------------------------------------------------------------------ // FUNCTION __kmp_xchg_real32 // // kmp_real32 // __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data ); // // parameters: // addr: %rdi // data: %xmm0 (lower 4 bytes) // // return: %xmm0 (lower 4 bytes) .text PROC __kmp_xchg_real32 movd %xmm0, %eax // load "data" to eax lock xchgl %eax, (%rdi) movd %eax, %xmm0 // load old value into return register ret DEBUG_INFO __kmp_xchg_real32 //------------------------------------------------------------------------ // FUNCTION __kmp_xchg_real64 // // kmp_real64 // __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data ); // // parameters: // addr: %rdi // data: %xmm0 (lower 8 bytes) // return: %xmm0 (lower 8 bytes) .text PROC __kmp_xchg_real64 movd %xmm0, %rax // load "data" to rax lock xchgq %rax, (%rdi) movd %rax, %xmm0 // load old value into return register ret DEBUG_INFO __kmp_xchg_real64 # endif /* !KMP_MIC */ # endif /* !KMP_ASM_INTRINS */ //------------------------------------------------------------------------ -// typedef void (*microtask_t)( int *gtid, int *tid, ... ); -// // int // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), -// int gtid, int tid, -// int argc, void *p_argv[] ) { -// (*pkfn)( & gtid, & tid, argv[0], ... ); -// return 1; +// int gtid, int tid, +// int argc, void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)( & gtid, & tid, argv[0], ... 
); +// return 1; // } // // note: at call to pkfn must have %rsp 128-byte aligned for compiler // // parameters: // %rdi: pkfn // %esi: gtid // %edx: tid // %ecx: argc // %r8: p_argv // %r9: &exit_frame // // locals: // __gtid: gtid parm pushed on stack so can pass >id to pkfn // __tid: tid parm pushed on stack so can pass &tid to pkfn // // reg temps: // %rax: used all over the place // %rdx: used in stack pointer alignment calculation // %r11: used to traverse p_argv array // %rsi: used as temporary for stack parameters // used as temporary for number of pkfn parms to push // %rbx: used to hold pkfn address, and zero constant, callee-save // // return: %eax (always 1/TRUE) __gtid = -16 __tid = -24 // -- Begin __kmp_invoke_microtask // mark_begin; .text PROC __kmp_invoke_microtask pushq %rbp // save base pointer KMP_CFI_DEF_OFFSET 16 KMP_CFI_OFFSET rbp,-16 movq %rsp,%rbp // establish the base pointer for this routine. KMP_CFI_REGISTER rbp #if OMPT_SUPPORT movq %rbp, (%r9) // save exit_frame #endif pushq %rbx // %rbx is callee-saved register pushq %rsi // Put gtid on stack so can pass &tgid to pkfn pushq %rdx // Put tid on stack so can pass &tid to pkfn movq %rcx, %rax // Stack alignment calculation begins; argc -> %rax movq $0, %rbx // constant for cmovs later subq $4, %rax // subtract four args passed in registers to pkfn #if KMP_MIC js KMP_LABEL(kmp_0) // jump to movq jmp KMP_LABEL(kmp_0_exit) // jump ahead KMP_LABEL(kmp_0): movq %rbx, %rax // zero negative value in %rax <- max(0, argc-4) KMP_LABEL(kmp_0_exit): #else cmovsq %rbx, %rax // zero negative value in %rax <- max(0, argc-4) #endif // KMP_MIC movq %rax, %rsi // save max(0, argc-4) -> %rsi for later shlq $3, %rax // Number of bytes used on stack: max(0, argc-4)*8 movq %rsp, %rdx // subq %rax, %rdx // %rsp-(max(0,argc-4)*8) -> %rdx -- // without align, stack ptr would be this movq %rdx, %rax // Save to %rax andq $0xFFFFFFFFFFFFFF80, %rax // mask off lower 7 bits (128 bytes align) subq %rax, %rdx // Amount 
to subtract from %rsp subq %rdx, %rsp // Prepare the stack ptr -- // now %rsp will align to 128-byte boundary at call site // setup pkfn parameter reg and stack movq %rcx, %rax // argc -> %rax cmpq $0, %rsi je KMP_LABEL(kmp_invoke_pass_parms) // jump ahead if no parms to push shlq $3, %rcx // argc*8 -> %rcx movq %r8, %rdx // p_argv -> %rdx addq %rcx, %rdx // &p_argv[argc] -> %rdx movq %rsi, %rcx // max (0, argc-4) -> %rcx KMP_LABEL(kmp_invoke_push_parms): // push nth - 7th parms to pkfn on stack subq $8, %rdx // decrement p_argv pointer to previous parm movq (%rdx), %rsi // p_argv[%rcx-1] -> %rsi pushq %rsi // push p_argv[%rcx-1] onto stack (reverse order) subl $1, %ecx // C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e // if the name of the label that is an operand of this jecxz starts with a dot ("."); // Apple's linker does not support 1-byte length relocation; // Resolution: replace all .labelX entries with L_labelX. jecxz KMP_LABEL(kmp_invoke_pass_parms) // stop when four p_argv[] parms left jmp KMP_LABEL(kmp_invoke_push_parms) ALIGN 3 KMP_LABEL(kmp_invoke_pass_parms): // put 1st - 6th parms to pkfn in registers. // order here is important to avoid trashing // registers used for both input and output parms! movq %rdi, %rbx // pkfn -> %rbx leaq __gtid(%rbp), %rdi // >id -> %rdi (store 1st parm to pkfn) leaq __tid(%rbp), %rsi // &tid -> %rsi (store 2nd parm to pkfn) movq %r8, %r11 // p_argv -> %r11 #if KMP_MIC cmpq $4, %rax // argc >= 4? jns KMP_LABEL(kmp_4) // jump to movq jmp KMP_LABEL(kmp_4_exit) // jump ahead KMP_LABEL(kmp_4): movq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn) KMP_LABEL(kmp_4_exit): cmpq $3, %rax // argc >= 3? jns KMP_LABEL(kmp_3) // jump to movq jmp KMP_LABEL(kmp_3_exit) // jump ahead KMP_LABEL(kmp_3): movq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn) KMP_LABEL(kmp_3_exit): cmpq $2, %rax // argc >= 2? 
jns KMP_LABEL(kmp_2) // jump to movq jmp KMP_LABEL(kmp_2_exit) // jump ahead KMP_LABEL(kmp_2): movq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn) KMP_LABEL(kmp_2_exit): cmpq $1, %rax // argc >= 1? jns KMP_LABEL(kmp_1) // jump to movq jmp KMP_LABEL(kmp_1_exit) // jump ahead KMP_LABEL(kmp_1): movq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn) KMP_LABEL(kmp_1_exit): #else cmpq $4, %rax // argc >= 4? cmovnsq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn) cmpq $3, %rax // argc >= 3? cmovnsq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn) cmpq $2, %rax // argc >= 2? cmovnsq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn) cmpq $1, %rax // argc >= 1? cmovnsq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn) #endif // KMP_MIC call *%rbx // call (*pkfn)(); movq $1, %rax // move 1 into return register; movq -8(%rbp), %rbx // restore %rbx using %rbp since %rsp was modified movq %rbp, %rsp // restore stack pointer popq %rbp // restore frame pointer KMP_CFI_DEF rsp,8 ret DEBUG_INFO __kmp_invoke_microtask // -- End __kmp_invoke_microtask // kmp_uint64 // __kmp_hardware_timestamp(void) .text PROC __kmp_hardware_timestamp rdtsc shlq $32, %rdx orq %rdx, %rax ret DEBUG_INFO __kmp_hardware_timestamp // -- End __kmp_hardware_timestamp //------------------------------------------------------------------------ // FUNCTION __kmp_bsr32 // // int // __kmp_bsr32( int ); .text PROC __kmp_bsr32 bsr %edi,%eax ret DEBUG_INFO __kmp_bsr32 // ----------------------------------------------------------------------- #endif /* KMP_ARCH_X86_64 */ // ' #if (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 //------------------------------------------------------------------------ -// -// typedef void (*microtask_t)( int *gtid, int *tid, ... ); -// // int // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), -// int gtid, int tid, -// int argc, void *p_argv[] ) { -// (*pkfn)( & gtid, & tid, argv[0], ... 
); -// return 1; +// int gtid, int tid, +// int argc, void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)( & gtid, & tid, argv[0], ... ); +// +// // FIXME: This is done at call-site and can be removed here. +// #if OMPT_SUPPORT +// *exit_frame_ptr = 0; +// #endif +// +// return 1; // } // // parameters: // x0: pkfn // w1: gtid // w2: tid // w3: argc // x4: p_argv // x5: &exit_frame // // locals: // __gtid: gtid parm pushed on stack so can pass >id to pkfn // __tid: tid parm pushed on stack so can pass &tid to pkfn // // reg temps: // x8: used to hold pkfn address // w9: used as temporary for number of pkfn parms // x10: used to traverse p_argv array // x11: used as temporary for stack placement calculation // x12: used as temporary for stack parameters // x19: used to preserve exit_frame_ptr, callee-save // // return: w0 (always 1/TRUE) // __gtid = 4 __tid = 8 // -- Begin __kmp_invoke_microtask // mark_begin; .text PROC __kmp_invoke_microtask stp x29, x30, [sp, #-16]! # if OMPT_SUPPORT stp x19, x20, [sp, #-16]! # endif mov x29, sp orr w9, wzr, #1 add w9, w9, w3, lsr #1 sub sp, sp, w9, uxtw #4 mov x11, sp mov x8, x0 str w1, [x29, #-__gtid] str w2, [x29, #-__tid] mov w9, w3 mov x10, x4 # if OMPT_SUPPORT mov x19, x5 str x29, [x19] # endif sub x0, x29, #__gtid sub x1, x29, #__tid cbz w9, KMP_LABEL(kmp_1) ldr x2, [x10] sub w9, w9, #1 cbz w9, KMP_LABEL(kmp_1) ldr x3, [x10, #8]! sub w9, w9, #1 cbz w9, KMP_LABEL(kmp_1) ldr x4, [x10, #8]! sub w9, w9, #1 cbz w9, KMP_LABEL(kmp_1) ldr x5, [x10, #8]! sub w9, w9, #1 cbz w9, KMP_LABEL(kmp_1) ldr x6, [x10, #8]! sub w9, w9, #1 cbz w9, KMP_LABEL(kmp_1) ldr x7, [x10, #8]! KMP_LABEL(kmp_0): sub w9, w9, #1 cbz w9, KMP_LABEL(kmp_1) ldr x12, [x10, #8]! 
str x12, [x11], #8 b KMP_LABEL(kmp_0) KMP_LABEL(kmp_1): blr x8 orr w0, wzr, #1 mov sp, x29 # if OMPT_SUPPORT str xzr, [x19] ldp x19, x20, [sp], #16 # endif ldp x29, x30, [sp], #16 ret DEBUG_INFO __kmp_invoke_microtask // -- End __kmp_invoke_microtask #endif /* (KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64 */ #if KMP_ARCH_PPC64 //------------------------------------------------------------------------ -// -// typedef void (*microtask_t)( int *gtid, int *tid, ... ); -// // int // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), -// int gtid, int tid, -// int argc, void *p_argv[] ) { -// (*pkfn)( & gtid, & tid, argv[0], ... ); -// return 1; +// int gtid, int tid, +// int argc, void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)( & gtid, & tid, argv[0], ... ); +// +// // FIXME: This is done at call-site and can be removed here. +// #if OMPT_SUPPORT +// *exit_frame_ptr = 0; +// #endif +// +// return 1; // } // // parameters: // r3: pkfn // r4: gtid // r5: tid // r6: argc // r7: p_argv // r8: &exit_frame // // return: r3 (always 1/TRUE) // .text # if KMP_ARCH_PPC64_ELFv2 .abiversion 2 # endif .globl __kmp_invoke_microtask # if KMP_ARCH_PPC64_ELFv2 .p2align 4 # else .p2align 2 # endif .type __kmp_invoke_microtask,@function # if KMP_ARCH_PPC64_ELFv2 __kmp_invoke_microtask: .Lfunc_begin0: .Lfunc_gep0: addis 2, 12, .TOC.-.Lfunc_gep0@ha addi 2, 2, .TOC.-.Lfunc_gep0@l .Lfunc_lep0: .localentry __kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0 # else .section .opd,"aw",@progbits __kmp_invoke_microtask: .p2align 3 .quad .Lfunc_begin0 .quad .TOC.@tocbase .quad 0 .text .Lfunc_begin0: # endif // -- Begin __kmp_invoke_microtask // mark_begin; // We need to allocate a stack frame large enough to hold all of the parameters // on the stack for the microtask plus what this function needs. 
That's 48 // bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the // parameters to the microtask, plus 8 bytes to store the values of r4 and r5, // and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes // to save r30 to hold a copy of r8. .cfi_startproc mflr 0 std 31, -8(1) std 0, 16(1) // This is unusual because normally we'd set r31 equal to r1 after the stack // frame is established. In this case, however, we need to dynamically compute // the stack frame size, and so we keep a direct copy of r1 to access our // register save areas and restore the r1 value before returning. mr 31, 1 .cfi_def_cfa_register r31 .cfi_offset r31, -8 .cfi_offset lr, 16 // Compute the size necessary for the local stack frame. # if KMP_ARCH_PPC64_ELFv2 li 12, 72 # else li 12, 88 # endif sldi 0, 6, 3 add 12, 0, 12 neg 12, 12 // We need to make sure that the stack frame stays aligned (to 16 bytes, except // under the BG/Q CNK, where it must be to 32 bytes). # if KMP_OS_CNK li 0, -32 # else li 0, -16 # endif and 12, 0, 12 // Establish the local stack frame. stdux 1, 1, 12 # if OMPT_SUPPORT .cfi_offset r30, -16 std 30, -16(31) std 1, 0(8) mr 30, 8 # endif // Store gtid and tid to the stack because they're passed by reference to the microtask. stw 4, -20(31) stw 5, -24(31) mr 12, 6 mr 4, 7 cmpwi 0, 12, 1 blt 0, .Lcall ld 5, 0(4) cmpwi 0, 12, 2 blt 0, .Lcall ld 6, 8(4) cmpwi 0, 12, 3 blt 0, .Lcall ld 7, 16(4) cmpwi 0, 12, 4 blt 0, .Lcall ld 8, 24(4) cmpwi 0, 12, 5 blt 0, .Lcall ld 9, 32(4) cmpwi 0, 12, 6 blt 0, .Lcall ld 10, 40(4) cmpwi 0, 12, 7 blt 0, .Lcall // There are more than 6 microtask parameters, so we need to store the // remainder to the stack. addi 12, 12, -6 mtctr 12 // These are set to 8 bytes before the first desired store address (we're using // pre-increment loads and stores in the loop below). 
The parameter save area // for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and // 32 + 8*8 == 96 bytes above r1 for ELFv2. addi 4, 4, 40 # if KMP_ARCH_PPC64_ELFv2 addi 12, 1, 88 # else addi 12, 1, 104 # endif .Lnext: ldu 0, 8(4) stdu 0, 8(12) bdnz .Lnext .Lcall: # if KMP_ARCH_PPC64_ELFv2 std 2, 24(1) mr 12, 3 #else std 2, 40(1) // For ELFv1, we need to load the actual function address from the function descriptor. ld 12, 0(3) ld 2, 8(3) ld 11, 16(3) #endif addi 3, 31, -20 addi 4, 31, -24 mtctr 12 bctrl # if KMP_ARCH_PPC64_ELFv2 ld 2, 24(1) # else ld 2, 40(1) # endif # if OMPT_SUPPORT li 3, 0 std 3, 0(30) # endif li 3, 1 # if OMPT_SUPPORT ld 30, -16(31) # endif mr 1, 31 ld 0, 16(1) ld 31, -8(1) mtlr 0 blr .long 0 .quad 0 .Lfunc_end0: .size __kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0 .cfi_endproc // -- End __kmp_invoke_microtask #endif /* KMP_ARCH_PPC64 */ +#if KMP_ARCH_RISCV64 + +//------------------------------------------------------------------------ +// +// typedef void (*microtask_t)(int *gtid, int *tid, ...); +// +// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc, +// void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)(>id, &tid, argv[0], ...); +// +// return 1; +// } +// +// Parameters: +// a0: pkfn +// a1: gtid +// a2: tid +// a3: argc +// a4: p_argv +// a5: exit_frame_ptr +// +// Locals: +// __gtid: gtid param pushed on stack so can pass >id to pkfn +// __tid: tid param pushed on stack so can pass &tid to pkfn +// +// Temp. 
registers: +// +// t0: used to calculate the dynamic stack size / used to hold pkfn address +// t1: used as temporary for stack placement calculation +// t2: used as temporary for stack arguments +// t3: used as temporary for number of remaining pkfn parms +// t4: used to traverse p_argv array +// +// return: a0 (always 1/TRUE) +// + +__gtid = -20 +__tid = -24 + +// -- Begin __kmp_invoke_microtask +// mark_begin; + .text + .globl __kmp_invoke_microtask + .p2align 1 + .type __kmp_invoke_microtask,@function +__kmp_invoke_microtask: + .cfi_startproc + + // First, save ra and fp + addi sp, sp, -16 + sd ra, 8(sp) + sd fp, 0(sp) + addi fp, sp, 16 + .cfi_def_cfa fp, 0 + .cfi_offset ra, -8 + .cfi_offset fp, -16 + + // Compute the dynamic stack size: + // + // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by + // reference + // - We need 8 bytes for each argument that cannot be passed to the 'pkfn' + // function by register. Given that we have 8 of such registers (a[0-7]) + // and two + 'argc' arguments (consider >id and &tid), we need to + // reserve max(0, argc - 6)*8 extra bytes + // + // The total number of bytes is then max(0, argc - 6)*8 + 8 + + // Compute max(0, argc - 6) using the following bithack: + // max(0, x) = x - (x & (x >> 31)), where x := argc - 6 + // Source: http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + addi t0, a3, -6 + srai t1, t0, 31 + and t1, t0, t1 + sub t0, t0, t1 + + addi t0, t0, 1 + + slli t0, t0, 3 + sub sp, sp, t0 + + // Align the stack to 16 bytes + andi sp, sp, -16 + + mv t0, a0 + mv t3, a3 + mv t4, a4 + +#if OMPT_SUPPORT + // Save frame pointer into exit_frame + sd fp, 0(a5) +#endif + + // Prepare arguments for the pkfn function (first 8 using a0-a7 registers) + + sw a1, __gtid(fp) + sw a2, __tid(fp) + + addi a0, fp, __gtid + addi a1, fp, __tid + + beqz t3, .L_kmp_3 + ld a2, 0(t4) + + addi t3, t3, -1 + beqz t3, .L_kmp_3 + ld a3, 8(t4) + + addi t3, t3, -1 + beqz t3, .L_kmp_3 + ld a4, 16(t4) + + 
addi t3, t3, -1 + beqz t3, .L_kmp_3 + ld a5, 24(t4) + + addi t3, t3, -1 + beqz t3, .L_kmp_3 + ld a6, 32(t4) + + addi t3, t3, -1 + beqz t3, .L_kmp_3 + ld a7, 40(t4) + + // Prepare any additional argument passed through the stack + addi t4, t4, 48 + mv t1, sp + j .L_kmp_2 +.L_kmp_1: + ld t2, 0(t4) + sd t2, 0(t1) + addi t4, t4, 8 + addi t1, t1, 8 +.L_kmp_2: + addi t3, t3, -1 + bnez t3, .L_kmp_1 + +.L_kmp_3: + // Call pkfn function + jalr t0 + + // Restore stack and return + + addi a0, zero, 1 + + addi sp, fp, -16 + ld fp, 0(sp) + ld ra, 8(sp) + addi sp, sp, 16 + ret +.Lfunc_end0: + .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask + .cfi_endproc + +// -- End __kmp_invoke_microtask + +#endif /* KMP_ARCH_RISCV64 */ + #if KMP_ARCH_ARM || KMP_ARCH_MIPS .data .comm .gomp_critical_user_,32,8 .data .align 4 .global __kmp_unnamed_critical_addr __kmp_unnamed_critical_addr: .4byte .gomp_critical_user_ .size __kmp_unnamed_critical_addr,4 #endif /* KMP_ARCH_ARM */ -#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 +#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 .data .comm .gomp_critical_user_,32,8 .data .align 8 .global __kmp_unnamed_critical_addr __kmp_unnamed_critical_addr: .8byte .gomp_critical_user_ .size __kmp_unnamed_critical_addr,8 -#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 */ +#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || + KMP_ARCH_RISCV64 */ #if KMP_OS_LINUX # if KMP_ARCH_ARM .section .note.GNU-stack,"",%progbits # else .section .note.GNU-stack,"",@progbits # endif #endif Index: projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/z_Linux_util.cpp =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/z_Linux_util.cpp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp/runtime/src/z_Linux_util.cpp (revision 357059) @@ -1,2427 +1,2494 @@ /* * z_Linux_util.cpp -- platform 
specific routines. */ //===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "kmp.h" #include "kmp_affinity.h" #include "kmp_i18n.h" #include "kmp_io.h" #include "kmp_itt.h" #include "kmp_lock.h" #include "kmp_stats.h" #include "kmp_str.h" #include "kmp_wait_release.h" #include "kmp_wrapper_getpid.h" #if !KMP_OS_DRAGONFLY && !KMP_OS_FREEBSD && !KMP_OS_NETBSD && !KMP_OS_OPENBSD #include #endif #include // HUGE_VAL. #include #include #include #include #include #if KMP_OS_LINUX && !KMP_OS_CNK #include #if KMP_USE_FUTEX // We should really include , but that causes compatibility problems on // different Linux* OS distributions that either require that you include (or // break when you try to include) . Since all we need is the two // macros below (which are part of the kernel ABI, so can't change) we just // define the constants here and don't include #ifndef FUTEX_WAIT #define FUTEX_WAIT 0 #endif #ifndef FUTEX_WAKE #define FUTEX_WAKE 1 #endif #endif #elif KMP_OS_DARWIN #include #include #elif KMP_OS_DRAGONFLY || KMP_OS_FREEBSD +#include +#include +#include #include #elif KMP_OS_NETBSD #include #include #endif #include #include #include #include "tsan_annotations.h" struct kmp_sys_timer { struct timespec start; }; // Convert timespec to nanoseconds. 
#define TS2NS(timespec) (((timespec).tv_sec * 1e9) + (timespec).tv_nsec) static struct kmp_sys_timer __kmp_sys_timer_data; #if KMP_HANDLE_SIGNALS typedef void (*sig_func_t)(int); STATIC_EFI2_WORKAROUND struct sigaction __kmp_sighldrs[NSIG]; static sigset_t __kmp_sigset; #endif static int __kmp_init_runtime = FALSE; static int __kmp_fork_count = 0; static pthread_condattr_t __kmp_suspend_cond_attr; static pthread_mutexattr_t __kmp_suspend_mutex_attr; static kmp_cond_align_t __kmp_wait_cv; static kmp_mutex_align_t __kmp_wait_mx; kmp_uint64 __kmp_ticks_per_msec = 1000000; #ifdef DEBUG_SUSPEND static void __kmp_print_cond(char *buffer, kmp_cond_align_t *cond) { KMP_SNPRINTF(buffer, 128, "(cond (lock (%ld, %d)), (descr (%p)))", cond->c_cond.__c_lock.__status, cond->c_cond.__c_lock.__spinlock, cond->c_cond.__c_waiting); } #endif -#if (KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED) +#if ((KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED) /* Affinity support */ void __kmp_affinity_bind_thread(int which) { KMP_ASSERT2(KMP_AFFINITY_CAPABLE(), "Illegal set affinity operation when not capable"); kmp_affin_mask_t *mask; KMP_CPU_ALLOC_ON_STACK(mask); KMP_CPU_ZERO(mask); KMP_CPU_SET(which, mask); __kmp_set_system_affinity(mask, TRUE); KMP_CPU_FREE_FROM_STACK(mask); } /* Determine if we can access affinity functionality on this version of * Linux* OS by checking __NR_sched_{get,set}affinity system calls, and set * __kmp_affin_mask_size to the appropriate value (0 means not capable). */ void __kmp_affinity_determine_capable(const char *env_var) { // Check and see if the OS supports thread affinity. 
+#if KMP_OS_LINUX #define KMP_CPU_SET_SIZE_LIMIT (1024 * 1024) +#elif KMP_OS_FREEBSD +#define KMP_CPU_SET_SIZE_LIMIT (sizeof(cpuset_t)) +#endif - int gCode; - int sCode; - unsigned char *buf; - buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT); +#if KMP_OS_LINUX // If Linux* OS: // If the syscall fails or returns a suggestion for the size, // then we don't have to search for an appropriate size. + int gCode; + int sCode; + unsigned char *buf; + buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT); gCode = syscall(__NR_sched_getaffinity, 0, KMP_CPU_SET_SIZE_LIMIT, buf); KA_TRACE(30, ("__kmp_affinity_determine_capable: " "initial getaffinity call returned %d errno = %d\n", gCode, errno)); // if ((gCode < 0) && (errno == ENOSYS)) if (gCode < 0) { // System call not supported if (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none) && (__kmp_affinity_type != affinity_default) && (__kmp_affinity_type != affinity_disabled))) { int error = errno; kmp_msg_t err_code = KMP_ERR(error); __kmp_msg(kmp_ms_warning, KMP_MSG(GetAffSysCallNotSupported, env_var), err_code, __kmp_msg_null); if (__kmp_generate_warnings == kmp_warnings_off) { __kmp_str_free(&err_code.str); } } KMP_AFFINITY_DISABLE(); KMP_INTERNAL_FREE(buf); return; } if (gCode > 0) { // Linux* OS only // The optimal situation: the OS returns the size of the buffer it expects. // // A verification of correct behavior is that Isetaffinity on a NULL // buffer with the same size fails with errno set to EFAULT. 
sCode = syscall(__NR_sched_setaffinity, 0, gCode, NULL); KA_TRACE(30, ("__kmp_affinity_determine_capable: " "setaffinity for mask size %d returned %d errno = %d\n", gCode, sCode, errno)); if (sCode < 0) { if (errno == ENOSYS) { if (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none) && (__kmp_affinity_type != affinity_default) && (__kmp_affinity_type != affinity_disabled))) { int error = errno; kmp_msg_t err_code = KMP_ERR(error); __kmp_msg(kmp_ms_warning, KMP_MSG(SetAffSysCallNotSupported, env_var), err_code, __kmp_msg_null); if (__kmp_generate_warnings == kmp_warnings_off) { __kmp_str_free(&err_code.str); } } KMP_AFFINITY_DISABLE(); KMP_INTERNAL_FREE(buf); } if (errno == EFAULT) { KMP_AFFINITY_ENABLE(gCode); KA_TRACE(10, ("__kmp_affinity_determine_capable: " "affinity supported (mask size %d)\n", (int)__kmp_affin_mask_size)); KMP_INTERNAL_FREE(buf); return; } } } // Call the getaffinity system call repeatedly with increasing set sizes // until we succeed, or reach an upper bound on the search. 
KA_TRACE(30, ("__kmp_affinity_determine_capable: " "searching for proper set size\n")); int size; for (size = 1; size <= KMP_CPU_SET_SIZE_LIMIT; size *= 2) { gCode = syscall(__NR_sched_getaffinity, 0, size, buf); KA_TRACE(30, ("__kmp_affinity_determine_capable: " "getaffinity for mask size %d returned %d errno = %d\n", size, gCode, errno)); if (gCode < 0) { if (errno == ENOSYS) { // We shouldn't get here KA_TRACE(30, ("__kmp_affinity_determine_capable: " "inconsistent OS call behavior: errno == ENOSYS for mask " "size %d\n", size)); if (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none) && (__kmp_affinity_type != affinity_default) && (__kmp_affinity_type != affinity_disabled))) { int error = errno; kmp_msg_t err_code = KMP_ERR(error); __kmp_msg(kmp_ms_warning, KMP_MSG(GetAffSysCallNotSupported, env_var), err_code, __kmp_msg_null); if (__kmp_generate_warnings == kmp_warnings_off) { __kmp_str_free(&err_code.str); } } KMP_AFFINITY_DISABLE(); KMP_INTERNAL_FREE(buf); return; } continue; } sCode = syscall(__NR_sched_setaffinity, 0, gCode, NULL); KA_TRACE(30, ("__kmp_affinity_determine_capable: " "setaffinity for mask size %d returned %d errno = %d\n", gCode, sCode, errno)); if (sCode < 0) { if (errno == ENOSYS) { // Linux* OS only // We shouldn't get here KA_TRACE(30, ("__kmp_affinity_determine_capable: " "inconsistent OS call behavior: errno == ENOSYS for mask " "size %d\n", size)); if (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none) && (__kmp_affinity_type != affinity_default) && (__kmp_affinity_type != affinity_disabled))) { int error = errno; kmp_msg_t err_code = KMP_ERR(error); __kmp_msg(kmp_ms_warning, KMP_MSG(SetAffSysCallNotSupported, env_var), err_code, __kmp_msg_null); if (__kmp_generate_warnings == kmp_warnings_off) { __kmp_str_free(&err_code.str); } } KMP_AFFINITY_DISABLE(); KMP_INTERNAL_FREE(buf); return; } if (errno == EFAULT) { KMP_AFFINITY_ENABLE(gCode); KA_TRACE(10, 
("__kmp_affinity_determine_capable: " "affinity supported (mask size %d)\n", (int)__kmp_affin_mask_size)); KMP_INTERNAL_FREE(buf); return; } } } +#elif KMP_OS_FREEBSD + int gCode; + unsigned char *buf; + buf = (unsigned char *)KMP_INTERNAL_MALLOC(KMP_CPU_SET_SIZE_LIMIT); + gCode = pthread_getaffinity_np(pthread_self(), KMP_CPU_SET_SIZE_LIMIT, reinterpret_cast(buf)); + KA_TRACE(30, ("__kmp_affinity_determine_capable: " + "initial getaffinity call returned %d errno = %d\n", + gCode, errno)); + if (gCode == 0) { + KMP_AFFINITY_ENABLE(KMP_CPU_SET_SIZE_LIMIT); + KA_TRACE(10, ("__kmp_affinity_determine_capable: " + "affinity supported (mask size %d)\n"< + (int)__kmp_affin_mask_size)); + KMP_INTERNAL_FREE(buf); + return; + } +#endif // save uncaught error code // int error = errno; KMP_INTERNAL_FREE(buf); // restore uncaught error code, will be printed at the next KMP_WARNING below // errno = error; // Affinity is not supported KMP_AFFINITY_DISABLE(); KA_TRACE(10, ("__kmp_affinity_determine_capable: " "cannot determine mask size - affinity not supported\n")); if (__kmp_affinity_verbose || (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none) && (__kmp_affinity_type != affinity_default) && (__kmp_affinity_type != affinity_disabled))) { KMP_WARNING(AffCantGetMaskSize, env_var); } } #endif // KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED #if KMP_USE_FUTEX int __kmp_futex_determine_capable() { int loc = 0; int rc = syscall(__NR_futex, &loc, FUTEX_WAKE, 1, NULL, NULL, 0); int retval = (rc == 0) || (errno != ENOSYS); KA_TRACE(10, ("__kmp_futex_determine_capable: rc = %d errno = %d\n", rc, errno)); KA_TRACE(10, ("__kmp_futex_determine_capable: futex syscall%s supported\n", retval ? 
"" : " not")); return retval; } #endif // KMP_USE_FUTEX #if (KMP_ARCH_X86 || KMP_ARCH_X86_64) && (!KMP_ASM_INTRINS) /* Only 32-bit "add-exchange" instruction on IA-32 architecture causes us to use compare_and_store for these routines */ kmp_int8 __kmp_test_then_or8(volatile kmp_int8 *p, kmp_int8 d) { kmp_int8 old_value, new_value; old_value = TCR_1(*p); new_value = old_value | d; while (!KMP_COMPARE_AND_STORE_REL8(p, old_value, new_value)) { KMP_CPU_PAUSE(); old_value = TCR_1(*p); new_value = old_value | d; } return old_value; } kmp_int8 __kmp_test_then_and8(volatile kmp_int8 *p, kmp_int8 d) { kmp_int8 old_value, new_value; old_value = TCR_1(*p); new_value = old_value & d; while (!KMP_COMPARE_AND_STORE_REL8(p, old_value, new_value)) { KMP_CPU_PAUSE(); old_value = TCR_1(*p); new_value = old_value & d; } return old_value; } kmp_uint32 __kmp_test_then_or32(volatile kmp_uint32 *p, kmp_uint32 d) { kmp_uint32 old_value, new_value; old_value = TCR_4(*p); new_value = old_value | d; while (!KMP_COMPARE_AND_STORE_REL32(p, old_value, new_value)) { KMP_CPU_PAUSE(); old_value = TCR_4(*p); new_value = old_value | d; } return old_value; } kmp_uint32 __kmp_test_then_and32(volatile kmp_uint32 *p, kmp_uint32 d) { kmp_uint32 old_value, new_value; old_value = TCR_4(*p); new_value = old_value & d; while (!KMP_COMPARE_AND_STORE_REL32(p, old_value, new_value)) { KMP_CPU_PAUSE(); old_value = TCR_4(*p); new_value = old_value & d; } return old_value; } #if KMP_ARCH_X86 kmp_int8 __kmp_test_then_add8(volatile kmp_int8 *p, kmp_int8 d) { kmp_int8 old_value, new_value; old_value = TCR_1(*p); new_value = old_value + d; while (!KMP_COMPARE_AND_STORE_REL8(p, old_value, new_value)) { KMP_CPU_PAUSE(); old_value = TCR_1(*p); new_value = old_value + d; } return old_value; } kmp_int64 __kmp_test_then_add64(volatile kmp_int64 *p, kmp_int64 d) { kmp_int64 old_value, new_value; old_value = TCR_8(*p); new_value = old_value + d; while (!KMP_COMPARE_AND_STORE_REL64(p, old_value, new_value)) { KMP_CPU_PAUSE(); 
old_value = TCR_8(*p); new_value = old_value + d; } return old_value; } #endif /* KMP_ARCH_X86 */ kmp_uint64 __kmp_test_then_or64(volatile kmp_uint64 *p, kmp_uint64 d) { kmp_uint64 old_value, new_value; old_value = TCR_8(*p); new_value = old_value | d; while (!KMP_COMPARE_AND_STORE_REL64(p, old_value, new_value)) { KMP_CPU_PAUSE(); old_value = TCR_8(*p); new_value = old_value | d; } return old_value; } kmp_uint64 __kmp_test_then_and64(volatile kmp_uint64 *p, kmp_uint64 d) { kmp_uint64 old_value, new_value; old_value = TCR_8(*p); new_value = old_value & d; while (!KMP_COMPARE_AND_STORE_REL64(p, old_value, new_value)) { KMP_CPU_PAUSE(); old_value = TCR_8(*p); new_value = old_value & d; } return old_value; } #endif /* (KMP_ARCH_X86 || KMP_ARCH_X86_64) && (! KMP_ASM_INTRINS) */ void __kmp_terminate_thread(int gtid) { int status; kmp_info_t *th = __kmp_threads[gtid]; if (!th) return; #ifdef KMP_CANCEL_THREADS KA_TRACE(10, ("__kmp_terminate_thread: kill (%d)\n", gtid)); status = pthread_cancel(th->th.th_info.ds.ds_thread); if (status != 0 && status != ESRCH) { __kmp_fatal(KMP_MSG(CantTerminateWorkerThread), KMP_ERR(status), __kmp_msg_null); } #endif KMP_YIELD(TRUE); } // /* Set thread stack info according to values returned by pthread_getattr_np(). If values are unreasonable, assume call failed and use incremental stack refinement method instead. Returns TRUE if the stack parameters could be determined exactly, FALSE if incremental refinement is necessary. 
*/ static kmp_int32 __kmp_set_stack_info(int gtid, kmp_info_t *th) { int stack_data; #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ KMP_OS_HURD pthread_attr_t attr; int status; size_t size = 0; void *addr = 0; /* Always do incremental stack refinement for ubermaster threads since the initial thread stack range can be reduced by sibling thread creation so pthread_attr_getstack may cause thread gtid aliasing */ if (!KMP_UBER_GTID(gtid)) { /* Fetch the real thread attributes */ status = pthread_attr_init(&attr); KMP_CHECK_SYSFAIL("pthread_attr_init", status); #if KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD status = pthread_attr_get_np(pthread_self(), &attr); KMP_CHECK_SYSFAIL("pthread_attr_get_np", status); #else status = pthread_getattr_np(pthread_self(), &attr); KMP_CHECK_SYSFAIL("pthread_getattr_np", status); #endif status = pthread_attr_getstack(&attr, &addr, &size); KMP_CHECK_SYSFAIL("pthread_attr_getstack", status); KA_TRACE(60, ("__kmp_set_stack_info: T#%d pthread_attr_getstack returned size:" " %lu, low addr: %p\n", gtid, size, addr)); status = pthread_attr_destroy(&attr); KMP_CHECK_SYSFAIL("pthread_attr_destroy", status); } if (size != 0 && addr != 0) { // was stack parameter determination successful? 
/* Store the correct base and size */ TCW_PTR(th->th.th_info.ds.ds_stackbase, (((char *)addr) + size)); TCW_PTR(th->th.th_info.ds.ds_stacksize, size); TCW_4(th->th.th_info.ds.ds_stackgrow, FALSE); return TRUE; } #endif /* KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_HURD */ /* Use incremental refinement starting from initial conservative estimate */ TCW_PTR(th->th.th_info.ds.ds_stacksize, 0); TCW_PTR(th->th.th_info.ds.ds_stackbase, &stack_data); TCW_4(th->th.th_info.ds.ds_stackgrow, TRUE); return FALSE; } static void *__kmp_launch_worker(void *thr) { int status, old_type, old_state; #ifdef KMP_BLOCK_SIGNALS sigset_t new_set, old_set; #endif /* KMP_BLOCK_SIGNALS */ void *exit_val; #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ KMP_OS_OPENBSD || KMP_OS_HURD void *volatile padding = 0; #endif int gtid; gtid = ((kmp_info_t *)thr)->th.th_info.ds.ds_gtid; __kmp_gtid_set_specific(gtid); #ifdef KMP_TDATA_GTID __kmp_gtid = gtid; #endif #if KMP_STATS_ENABLED // set thread local index to point to thread-specific stats __kmp_stats_thread_ptr = ((kmp_info_t *)thr)->th.th_stats; __kmp_stats_thread_ptr->startLife(); KMP_SET_THREAD_STATE(IDLE); KMP_INIT_PARTITIONED_TIMERS(OMP_idle); #endif #if USE_ITT_BUILD __kmp_itt_thread_name(gtid); #endif /* USE_ITT_BUILD */ #if KMP_AFFINITY_SUPPORTED __kmp_affinity_set_init_mask(gtid, FALSE); #endif #ifdef KMP_CANCEL_THREADS status = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old_type); KMP_CHECK_SYSFAIL("pthread_setcanceltype", status); // josh todo: isn't PTHREAD_CANCEL_ENABLE default for newly-created threads? status = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &old_state); KMP_CHECK_SYSFAIL("pthread_setcancelstate", status); #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 // Set FP control regs to be a copy of the parallel initialization thread's. 
__kmp_clear_x87_fpu_status_word(); __kmp_load_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); __kmp_load_mxcsr(&__kmp_init_mxcsr); #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ #ifdef KMP_BLOCK_SIGNALS status = sigfillset(&new_set); KMP_CHECK_SYSFAIL_ERRNO("sigfillset", status); status = pthread_sigmask(SIG_BLOCK, &new_set, &old_set); KMP_CHECK_SYSFAIL("pthread_sigmask", status); #endif /* KMP_BLOCK_SIGNALS */ #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ KMP_OS_OPENBSD if (__kmp_stkoffset > 0 && gtid > 0) { padding = KMP_ALLOCA(gtid * __kmp_stkoffset); } #endif KMP_MB(); __kmp_set_stack_info(gtid, (kmp_info_t *)thr); __kmp_check_stack_overlap((kmp_info_t *)thr); exit_val = __kmp_launch_thread((kmp_info_t *)thr); #ifdef KMP_BLOCK_SIGNALS status = pthread_sigmask(SIG_SETMASK, &old_set, NULL); KMP_CHECK_SYSFAIL("pthread_sigmask", status); #endif /* KMP_BLOCK_SIGNALS */ return exit_val; } #if KMP_USE_MONITOR /* The monitor thread controls all of the threads in the complex */ static void *__kmp_launch_monitor(void *thr) { int status, old_type, old_state; #ifdef KMP_BLOCK_SIGNALS sigset_t new_set; #endif /* KMP_BLOCK_SIGNALS */ struct timespec interval; KMP_MB(); /* Flush all pending memory write invalidates. */ KA_TRACE(10, ("__kmp_launch_monitor: #1 launched\n")); /* register us as the monitor thread */ __kmp_gtid_set_specific(KMP_GTID_MONITOR); #ifdef KMP_TDATA_GTID __kmp_gtid = KMP_GTID_MONITOR; #endif KMP_MB(); #if USE_ITT_BUILD // Instruct Intel(R) Threading Tools to ignore monitor thread. __kmp_itt_thread_ignore(); #endif /* USE_ITT_BUILD */ __kmp_set_stack_info(((kmp_info_t *)thr)->th.th_info.ds.ds_gtid, (kmp_info_t *)thr); __kmp_check_stack_overlap((kmp_info_t *)thr); #ifdef KMP_CANCEL_THREADS status = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old_type); KMP_CHECK_SYSFAIL("pthread_setcanceltype", status); // josh todo: isn't PTHREAD_CANCEL_ENABLE default for newly-created threads? 
status = pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &old_state); KMP_CHECK_SYSFAIL("pthread_setcancelstate", status); #endif #if KMP_REAL_TIME_FIX // This is a potential fix which allows application with real-time scheduling // policy work. However, decision about the fix is not made yet, so it is // disabled by default. { // Are program started with real-time scheduling policy? int sched = sched_getscheduler(0); if (sched == SCHED_FIFO || sched == SCHED_RR) { // Yes, we are a part of real-time application. Try to increase the // priority of the monitor. struct sched_param param; int max_priority = sched_get_priority_max(sched); int rc; KMP_WARNING(RealTimeSchedNotSupported); sched_getparam(0, &param); if (param.sched_priority < max_priority) { param.sched_priority += 1; rc = sched_setscheduler(0, sched, &param); if (rc != 0) { int error = errno; kmp_msg_t err_code = KMP_ERR(error); __kmp_msg(kmp_ms_warning, KMP_MSG(CantChangeMonitorPriority), err_code, KMP_MSG(MonitorWillStarve), __kmp_msg_null); if (__kmp_generate_warnings == kmp_warnings_off) { __kmp_str_free(&err_code.str); } } } else { // We cannot abort here, because number of CPUs may be enough for all // the threads, including the monitor thread, so application could // potentially work... __kmp_msg(kmp_ms_warning, KMP_MSG(RunningAtMaxPriority), KMP_MSG(MonitorWillStarve), KMP_HNT(RunningAtMaxPriority), __kmp_msg_null); } } // AC: free thread that waits for monitor started TCW_4(__kmp_global.g.g_time.dt.t_value, 0); } #endif // KMP_REAL_TIME_FIX KMP_MB(); /* Flush all pending memory write invalidates.
*/ if (__kmp_monitor_wakeups == 1) { interval.tv_sec = 1; interval.tv_nsec = 0; } else { interval.tv_sec = 0; interval.tv_nsec = (KMP_NSEC_PER_SEC / __kmp_monitor_wakeups); } KA_TRACE(10, ("__kmp_launch_monitor: #2 monitor\n")); while (!TCR_4(__kmp_global.g.g_done)) { struct timespec now; struct timeval tval; /* This thread monitors the state of the system */ KA_TRACE(15, ("__kmp_launch_monitor: update\n")); status = gettimeofday(&tval, NULL); KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status); TIMEVAL_TO_TIMESPEC(&tval, &now); now.tv_sec += interval.tv_sec; now.tv_nsec += interval.tv_nsec; if (now.tv_nsec >= KMP_NSEC_PER_SEC) { now.tv_sec += 1; now.tv_nsec -= KMP_NSEC_PER_SEC; } status = pthread_mutex_lock(&__kmp_wait_mx.m_mutex); KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); // AC: the monitor should not fall asleep if g_done has been set if (!TCR_4(__kmp_global.g.g_done)) { // check once more under mutex status = pthread_cond_timedwait(&__kmp_wait_cv.c_cond, &__kmp_wait_mx.m_mutex, &now); if (status != 0) { if (status != ETIMEDOUT && status != EINTR) { KMP_SYSFAIL("pthread_cond_timedwait", status); } } } status = pthread_mutex_unlock(&__kmp_wait_mx.m_mutex); KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); TCW_4(__kmp_global.g.g_time.dt.t_value, TCR_4(__kmp_global.g.g_time.dt.t_value) + 1); KMP_MB(); /* Flush all pending memory write invalidates. 
*/ } KA_TRACE(10, ("__kmp_launch_monitor: #3 cleanup\n")); #ifdef KMP_BLOCK_SIGNALS status = sigfillset(&new_set); KMP_CHECK_SYSFAIL_ERRNO("sigfillset", status); status = pthread_sigmask(SIG_UNBLOCK, &new_set, NULL); KMP_CHECK_SYSFAIL("pthread_sigmask", status); #endif /* KMP_BLOCK_SIGNALS */ KA_TRACE(10, ("__kmp_launch_monitor: #4 finished\n")); if (__kmp_global.g.g_abort != 0) { /* now we need to terminate the worker threads */ /* the value of t_abort is the signal we caught */ int gtid; KA_TRACE(10, ("__kmp_launch_monitor: #5 terminate sig=%d\n", __kmp_global.g.g_abort)); /* terminate the OpenMP worker threads */ /* TODO this is not valid for sibling threads!! * the uber master might not be 0 anymore.. */ for (gtid = 1; gtid < __kmp_threads_capacity; ++gtid) __kmp_terminate_thread(gtid); __kmp_cleanup(); KA_TRACE(10, ("__kmp_launch_monitor: #6 raise sig=%d\n", __kmp_global.g.g_abort)); if (__kmp_global.g.g_abort > 0) raise(__kmp_global.g.g_abort); } KA_TRACE(10, ("__kmp_launch_monitor: #7 exit\n")); return thr; } #endif // KMP_USE_MONITOR void __kmp_create_worker(int gtid, kmp_info_t *th, size_t stack_size) { pthread_t handle; pthread_attr_t thread_attr; int status; th->th.th_info.ds.ds_gtid = gtid; #if KMP_STATS_ENABLED // sets up worker thread stats __kmp_acquire_tas_lock(&__kmp_stats_lock, gtid); // th->th.th_stats is used to transfer thread-specific stats-pointer to // __kmp_launch_worker. So when thread is created (goes into // __kmp_launch_worker) it will set its thread local pointer to // th->th.th_stats if (!KMP_UBER_GTID(gtid)) { th->th.th_stats = __kmp_stats_list->push_back(gtid); } else { // For root threads, __kmp_stats_thread_ptr is set in __kmp_register_root(), // so set the th->th.th_stats field to it. 
th->th.th_stats = __kmp_stats_thread_ptr; } __kmp_release_tas_lock(&__kmp_stats_lock, gtid); #endif // KMP_STATS_ENABLED if (KMP_UBER_GTID(gtid)) { KA_TRACE(10, ("__kmp_create_worker: uber thread (%d)\n", gtid)); th->th.th_info.ds.ds_thread = pthread_self(); __kmp_set_stack_info(gtid, th); __kmp_check_stack_overlap(th); return; } KA_TRACE(10, ("__kmp_create_worker: try to create thread (%d)\n", gtid)); KMP_MB(); /* Flush all pending memory write invalidates. */ #ifdef KMP_THREAD_ATTR status = pthread_attr_init(&thread_attr); if (status != 0) { __kmp_fatal(KMP_MSG(CantInitThreadAttrs), KMP_ERR(status), __kmp_msg_null); } status = pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE); if (status != 0) { __kmp_fatal(KMP_MSG(CantSetWorkerState), KMP_ERR(status), __kmp_msg_null); } /* Set stack size for this thread now. The multiple of 2 is there because on some machines, requesting an unusual stacksize causes the thread to have an offset before the dummy alloca() takes place to create the offset. Since we want the user to have a sufficient stacksize AND support a stack offset, we alloca() twice the offset so that the upcoming alloca() does not eliminate any premade offset, and also gives the user the stack space they requested for all threads */ stack_size += gtid * __kmp_stkoffset * 2; +#if defined(__ANDROID__) && __ANDROID_API__ < 19 + // Round the stack size to a multiple of the page size. Older versions of + // Android (until KitKat) would fail pthread_attr_setstacksize with EINVAL + // if the stack size was not a multiple of the page size. 
+ stack_size = (stack_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); +#endif + KA_TRACE(10, ("__kmp_create_worker: T#%d, default stacksize = %lu bytes, " "__kmp_stksize = %lu bytes, final stacksize = %lu bytes\n", gtid, KMP_DEFAULT_STKSIZE, __kmp_stksize, stack_size)); #ifdef _POSIX_THREAD_ATTR_STACKSIZE status = pthread_attr_setstacksize(&thread_attr, stack_size); #ifdef KMP_BACKUP_STKSIZE if (status != 0) { if (!__kmp_env_stksize) { stack_size = KMP_BACKUP_STKSIZE + gtid * __kmp_stkoffset; __kmp_stksize = KMP_BACKUP_STKSIZE; KA_TRACE(10, ("__kmp_create_worker: T#%d, default stacksize = %lu bytes, " "__kmp_stksize = %lu bytes, (backup) final stacksize = %lu " "bytes\n", gtid, KMP_DEFAULT_STKSIZE, __kmp_stksize, stack_size)); status = pthread_attr_setstacksize(&thread_attr, stack_size); } } #endif /* KMP_BACKUP_STKSIZE */ if (status != 0) { __kmp_fatal(KMP_MSG(CantSetWorkerStackSize, stack_size), KMP_ERR(status), KMP_HNT(ChangeWorkerStackSize), __kmp_msg_null); } #endif /* _POSIX_THREAD_ATTR_STACKSIZE */ #endif /* KMP_THREAD_ATTR */ status = pthread_create(&handle, &thread_attr, __kmp_launch_worker, (void *)th); if (status != 0 || !handle) { // ??? Why do we check handle?? 
#ifdef _POSIX_THREAD_ATTR_STACKSIZE if (status == EINVAL) { __kmp_fatal(KMP_MSG(CantSetWorkerStackSize, stack_size), KMP_ERR(status), KMP_HNT(IncreaseWorkerStackSize), __kmp_msg_null); } if (status == ENOMEM) { __kmp_fatal(KMP_MSG(CantSetWorkerStackSize, stack_size), KMP_ERR(status), KMP_HNT(DecreaseWorkerStackSize), __kmp_msg_null); } #endif /* _POSIX_THREAD_ATTR_STACKSIZE */ if (status == EAGAIN) { __kmp_fatal(KMP_MSG(NoResourcesForWorkerThread), KMP_ERR(status), KMP_HNT(Decrease_NUM_THREADS), __kmp_msg_null); } KMP_SYSFAIL("pthread_create", status); } th->th.th_info.ds.ds_thread = handle; #ifdef KMP_THREAD_ATTR status = pthread_attr_destroy(&thread_attr); if (status) { kmp_msg_t err_code = KMP_ERR(status); __kmp_msg(kmp_ms_warning, KMP_MSG(CantDestroyThreadAttrs), err_code, __kmp_msg_null); if (__kmp_generate_warnings == kmp_warnings_off) { __kmp_str_free(&err_code.str); } } #endif /* KMP_THREAD_ATTR */ KMP_MB(); /* Flush all pending memory write invalidates. */ KA_TRACE(10, ("__kmp_create_worker: done creating thread (%d)\n", gtid)); } // __kmp_create_worker #if KMP_USE_MONITOR void __kmp_create_monitor(kmp_info_t *th) { pthread_t handle; pthread_attr_t thread_attr; size_t size; int status; int auto_adj_size = FALSE; if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { // We don't need monitor thread in case of MAX_BLOCKTIME KA_TRACE(10, ("__kmp_create_monitor: skipping monitor thread because of " "MAX blocktime\n")); th->th.th_info.ds.ds_tid = 0; // this makes reap_monitor no-op th->th.th_info.ds.ds_gtid = 0; return; } KA_TRACE(10, ("__kmp_create_monitor: try to create monitor\n")); KMP_MB(); /* Flush all pending memory write invalidates. */ th->th.th_info.ds.ds_tid = KMP_GTID_MONITOR; th->th.th_info.ds.ds_gtid = KMP_GTID_MONITOR; #if KMP_REAL_TIME_FIX TCW_4(__kmp_global.g.g_time.dt.t_value, -1); // Will use it for synchronization a bit later. 
#else TCW_4(__kmp_global.g.g_time.dt.t_value, 0); #endif // KMP_REAL_TIME_FIX #ifdef KMP_THREAD_ATTR if (__kmp_monitor_stksize == 0) { __kmp_monitor_stksize = KMP_DEFAULT_MONITOR_STKSIZE; auto_adj_size = TRUE; } status = pthread_attr_init(&thread_attr); if (status != 0) { __kmp_fatal(KMP_MSG(CantInitThreadAttrs), KMP_ERR(status), __kmp_msg_null); } status = pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE); if (status != 0) { __kmp_fatal(KMP_MSG(CantSetMonitorState), KMP_ERR(status), __kmp_msg_null); } #ifdef _POSIX_THREAD_ATTR_STACKSIZE status = pthread_attr_getstacksize(&thread_attr, &size); KMP_CHECK_SYSFAIL("pthread_attr_getstacksize", status); #else size = __kmp_sys_min_stksize; #endif /* _POSIX_THREAD_ATTR_STACKSIZE */ #endif /* KMP_THREAD_ATTR */ if (__kmp_monitor_stksize == 0) { __kmp_monitor_stksize = KMP_DEFAULT_MONITOR_STKSIZE; } if (__kmp_monitor_stksize < __kmp_sys_min_stksize) { __kmp_monitor_stksize = __kmp_sys_min_stksize; } KA_TRACE(10, ("__kmp_create_monitor: default stacksize = %lu bytes," "requested stacksize = %lu bytes\n", size, __kmp_monitor_stksize)); retry: /* Set stack size for this thread now. */ #ifdef _POSIX_THREAD_ATTR_STACKSIZE KA_TRACE(10, ("__kmp_create_monitor: setting stacksize = %lu bytes,", __kmp_monitor_stksize)); status = pthread_attr_setstacksize(&thread_attr, __kmp_monitor_stksize); if (status != 0) { if (auto_adj_size) { __kmp_monitor_stksize *= 2; goto retry; } kmp_msg_t err_code = KMP_ERR(status); __kmp_msg(kmp_ms_warning, // should this be fatal? 
BB KMP_MSG(CantSetMonitorStackSize, (long int)__kmp_monitor_stksize), err_code, KMP_HNT(ChangeMonitorStackSize), __kmp_msg_null); if (__kmp_generate_warnings == kmp_warnings_off) { __kmp_str_free(&err_code.str); } } #endif /* _POSIX_THREAD_ATTR_STACKSIZE */ status = pthread_create(&handle, &thread_attr, __kmp_launch_monitor, (void *)th); if (status != 0) { #ifdef _POSIX_THREAD_ATTR_STACKSIZE if (status == EINVAL) { if (auto_adj_size && (__kmp_monitor_stksize < (size_t)0x40000000)) { __kmp_monitor_stksize *= 2; goto retry; } __kmp_fatal(KMP_MSG(CantSetMonitorStackSize, __kmp_monitor_stksize), KMP_ERR(status), KMP_HNT(IncreaseMonitorStackSize), __kmp_msg_null); } if (status == ENOMEM) { __kmp_fatal(KMP_MSG(CantSetMonitorStackSize, __kmp_monitor_stksize), KMP_ERR(status), KMP_HNT(DecreaseMonitorStackSize), __kmp_msg_null); } #endif /* _POSIX_THREAD_ATTR_STACKSIZE */ if (status == EAGAIN) { __kmp_fatal(KMP_MSG(NoResourcesForMonitorThread), KMP_ERR(status), KMP_HNT(DecreaseNumberOfThreadsInUse), __kmp_msg_null); } KMP_SYSFAIL("pthread_create", status); } th->th.th_info.ds.ds_thread = handle; #if KMP_REAL_TIME_FIX // Wait for the monitor thread is really started and set its *priority*. KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == sizeof(__kmp_global.g.g_time.dt.t_value)); __kmp_wait_4((kmp_uint32 volatile *)&__kmp_global.g.g_time.dt.t_value, -1, &__kmp_neq_4, NULL); #endif // KMP_REAL_TIME_FIX #ifdef KMP_THREAD_ATTR status = pthread_attr_destroy(&thread_attr); if (status != 0) { kmp_msg_t err_code = KMP_ERR(status); __kmp_msg(kmp_ms_warning, KMP_MSG(CantDestroyThreadAttrs), err_code, __kmp_msg_null); if (__kmp_generate_warnings == kmp_warnings_off) { __kmp_str_free(&err_code.str); } } #endif KMP_MB(); /* Flush all pending memory write invalidates. 
*/ KA_TRACE(10, ("__kmp_create_monitor: monitor created %#.8lx\n", th->th.th_info.ds.ds_thread)); } // __kmp_create_monitor #endif // KMP_USE_MONITOR void __kmp_exit_thread(int exit_status) { pthread_exit((void *)(intptr_t)exit_status); } // __kmp_exit_thread #if KMP_USE_MONITOR void __kmp_resume_monitor(); void __kmp_reap_monitor(kmp_info_t *th) { int status; void *exit_val; KA_TRACE(10, ("__kmp_reap_monitor: try to reap monitor thread with handle" " %#.8lx\n", th->th.th_info.ds.ds_thread)); // If monitor has been created, its tid and gtid should be KMP_GTID_MONITOR. // If both tid and gtid are 0, it means the monitor did not ever start. // If both tid and gtid are KMP_GTID_DNE, the monitor has been shut down. KMP_DEBUG_ASSERT(th->th.th_info.ds.ds_tid == th->th.th_info.ds.ds_gtid); if (th->th.th_info.ds.ds_gtid != KMP_GTID_MONITOR) { KA_TRACE(10, ("__kmp_reap_monitor: monitor did not start, returning\n")); return; } KMP_MB(); /* Flush all pending memory write invalidates. */ /* First, check to see whether the monitor thread exists to wake it up. This is to avoid performance problem when the monitor sleeps during blocktime-size interval */ status = pthread_kill(th->th.th_info.ds.ds_thread, 0); if (status != ESRCH) { __kmp_resume_monitor(); // Wake up the monitor thread } KA_TRACE(10, ("__kmp_reap_monitor: try to join with monitor\n")); status = pthread_join(th->th.th_info.ds.ds_thread, &exit_val); if (exit_val != th) { __kmp_fatal(KMP_MSG(ReapMonitorError), KMP_ERR(status), __kmp_msg_null); } th->th.th_info.ds.ds_tid = KMP_GTID_DNE; th->th.th_info.ds.ds_gtid = KMP_GTID_DNE; KA_TRACE(10, ("__kmp_reap_monitor: done reaping monitor thread with handle" " %#.8lx\n", th->th.th_info.ds.ds_thread)); KMP_MB(); /* Flush all pending memory write invalidates. */ } #endif // KMP_USE_MONITOR void __kmp_reap_worker(kmp_info_t *th) { int status; void *exit_val; KMP_MB(); /* Flush all pending memory write invalidates. 
*/
  KA_TRACE(
      10, ("__kmp_reap_worker: try to reap T#%d\n", th->th.th_info.ds.ds_gtid));

  status = pthread_join(th->th.th_info.ds.ds_thread, &exit_val);
#ifdef KMP_DEBUG
  /* Don't expose these to the user until we understand when they trigger */
  if (status != 0) {
    __kmp_fatal(KMP_MSG(ReapWorkerError), KMP_ERR(status), __kmp_msg_null);
  }
  if (exit_val != th) {
    KA_TRACE(10, ("__kmp_reap_worker: worker T#%d did not reap properly, "
                  "exit_val = %p\n",
                  th->th.th_info.ds.ds_gtid, exit_val));
  }
#endif /* KMP_DEBUG */

  KA_TRACE(10, ("__kmp_reap_worker: done reaping T#%d\n",
                th->th.th_info.ds.ds_gtid));

  KMP_MB(); /* Flush all pending memory write invalidates. */
}

#if KMP_HANDLE_SIGNALS

static void __kmp_null_handler(int signo) {
  // Do nothing, for doing SIG_IGN-type actions.
} // __kmp_null_handler

// Stage-1 signal handler: on a fatal signal, request a global shutdown by
// setting g_abort/g_done; worker threads poll these flags and exit.
static void __kmp_team_handler(int signo) {
  if (__kmp_global.g.g_abort == 0) {
/* Stage 1 signal handler, let's shut down all of the threads */
#ifdef KMP_DEBUG
    __kmp_debug_printf("__kmp_team_handler: caught signal = %d\n", signo);
#endif
    switch (signo) {
    case SIGHUP:
    case SIGINT:
    case SIGQUIT:
    case SIGILL:
    case SIGABRT:
    case SIGFPE:
    case SIGBUS:
    case SIGSEGV:
#ifdef SIGSYS
    case SIGSYS:
#endif
    case SIGTERM:
      if (__kmp_debug_buf) {
        __kmp_dump_debug_buffer();
      }
      KMP_MB(); // Flush all pending memory write invalidates.
      TCW_4(__kmp_global.g.g_abort, signo);
      KMP_MB(); // Flush all pending memory write invalidates.
      TCW_4(__kmp_global.g.g_done, TRUE);
      KMP_MB(); // Flush all pending memory write invalidates.
      break;
    default:
#ifdef KMP_DEBUG
      __kmp_debug_printf("__kmp_team_handler: unknown signal type");
#endif
      break;
    }
  }
} // __kmp_team_handler

// Thin wrapper over sigaction() that aborts the process on failure.
static void __kmp_sigaction(int signum, const struct sigaction *act,
                            struct sigaction *oldact) {
  int rc = sigaction(signum, act, oldact);
  KMP_CHECK_SYSFAIL_ERRNO("sigaction", rc);
}

// Install handler_func for `sig` (parallel_init != 0), or just record the
// pre-existing handler in __kmp_sighldrs (parallel_init == 0).  A user's own
// handler, if already installed, is left in place.
static void __kmp_install_one_handler(int sig, sig_func_t handler_func,
                                      int parallel_init) {
  KMP_MB(); // Flush all pending memory write invalidates.
  KB_TRACE(60,
           ("__kmp_install_one_handler( %d, ..., %d )\n", sig, parallel_init));
  if (parallel_init) {
    struct sigaction new_action;
    struct sigaction old_action;
    new_action.sa_handler = handler_func;
    new_action.sa_flags = 0;
    sigfillset(&new_action.sa_mask);
    __kmp_sigaction(sig, &new_action, &old_action);
    if (old_action.sa_handler == __kmp_sighldrs[sig].sa_handler) {
      sigaddset(&__kmp_sigset, sig);
    } else {
      // Restore/keep user's handler if one previously installed.
      __kmp_sigaction(sig, &old_action, NULL);
    }
  } else {
    // Save initial/system signal handlers to see if user handlers installed.
    __kmp_sigaction(sig, NULL, &__kmp_sighldrs[sig]);
  }
  KMP_MB(); // Flush all pending memory write invalidates.
} // __kmp_install_one_handler

// Undo __kmp_install_one_handler for `sig`, restoring the saved handler
// unless the user replaced ours in the meantime.
static void __kmp_remove_one_handler(int sig) {
  KB_TRACE(60, ("__kmp_remove_one_handler( %d )\n", sig));
  if (sigismember(&__kmp_sigset, sig)) {
    struct sigaction old;
    KMP_MB(); // Flush all pending memory write invalidates.
    __kmp_sigaction(sig, &__kmp_sighldrs[sig], &old);
    if ((old.sa_handler != __kmp_team_handler) &&
        (old.sa_handler != __kmp_null_handler)) {
      // Restore the users signal handler.
      KB_TRACE(10, ("__kmp_remove_one_handler: oops, not our handler, "
                    "restoring: sig=%d\n",
                    sig));
      __kmp_sigaction(sig, &old, NULL);
    }
    sigdelset(&__kmp_sigset, sig);
    KMP_MB(); // Flush all pending memory write invalidates.
  }
} // __kmp_remove_one_handler

void __kmp_install_signals(int parallel_init) {
  KB_TRACE(10, ("__kmp_install_signals( %d )\n", parallel_init));
  if (__kmp_handle_signals || !parallel_init) {
    // If ! parallel_init, we do not install handlers, just save original
    // handlers. Let us do it even __handle_signals is 0.
    sigemptyset(&__kmp_sigset);
    __kmp_install_one_handler(SIGHUP, __kmp_team_handler, parallel_init);
    __kmp_install_one_handler(SIGINT, __kmp_team_handler, parallel_init);
    __kmp_install_one_handler(SIGQUIT, __kmp_team_handler, parallel_init);
    __kmp_install_one_handler(SIGILL, __kmp_team_handler, parallel_init);
    __kmp_install_one_handler(SIGABRT, __kmp_team_handler, parallel_init);
    __kmp_install_one_handler(SIGFPE, __kmp_team_handler, parallel_init);
    __kmp_install_one_handler(SIGBUS, __kmp_team_handler, parallel_init);
    __kmp_install_one_handler(SIGSEGV, __kmp_team_handler, parallel_init);
#ifdef SIGSYS
    __kmp_install_one_handler(SIGSYS, __kmp_team_handler, parallel_init);
#endif // SIGSYS
    __kmp_install_one_handler(SIGTERM, __kmp_team_handler, parallel_init);
#ifdef SIGPIPE
    __kmp_install_one_handler(SIGPIPE, __kmp_team_handler, parallel_init);
#endif // SIGPIPE
  }
} // __kmp_install_signals

void __kmp_remove_signals(void) {
  int sig;
  KB_TRACE(10, ("__kmp_remove_signals()\n"));
  for (sig = 1; sig < NSIG; ++sig) {
    __kmp_remove_one_handler(sig);
  }
} // __kmp_remove_signals

#endif // KMP_HANDLE_SIGNALS

// Re-enable pthread cancellation (expects it was previously disabled).
void __kmp_enable(int new_state) {
#ifdef KMP_CANCEL_THREADS
  int status, old_state;
  status = pthread_setcancelstate(new_state, &old_state);
  KMP_CHECK_SYSFAIL("pthread_setcancelstate", status);
  KMP_DEBUG_ASSERT(old_state == PTHREAD_CANCEL_DISABLE);
#endif
}

// Disable pthread cancellation; previous state is returned via *old_state.
void __kmp_disable(int *old_state) {
#ifdef KMP_CANCEL_THREADS
  int status;
  status = pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, old_state);
  KMP_CHECK_SYSFAIL("pthread_setcancelstate", status);
#endif
}

// pthread_atfork pre-fork hook: hold both bootstrap locks across the fork.
static void __kmp_atfork_prepare(void) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
}

// pthread_atfork parent hook: release the locks taken in prepare.
static void __kmp_atfork_parent(void) {
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
}

/* Reset the library so execution in the child starts "all over again" with
   clean data structures in initial states.
Don't worry about freeing memory allocated by parent, just abandon it to be
safe. */
static void __kmp_atfork_child(void) {
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
  /* TODO make sure this is done right for nested/sibling */
  // ATT:  Memory leaks are here? TODO: Check it and fix.
  /* KMP_ASSERT( 0 ); */

  ++__kmp_fork_count;

#if KMP_AFFINITY_SUPPORTED
#if KMP_OS_LINUX
  // reset the affinity in the child to the initial thread
  // affinity in the parent
  kmp_set_thread_affinity_mask_initial();
#endif
  // Set default not to bind threads tightly in the child (we're expecting
  // over-subscription after the fork and this can improve things for
  // scripting languages that use OpenMP inside process-parallel code).
  __kmp_affinity_type = affinity_none;
  if (__kmp_nested_proc_bind.bind_types != NULL) {
    __kmp_nested_proc_bind.bind_types[0] = proc_bind_false;
  }
#endif // KMP_AFFINITY_SUPPORTED

  // Drop every "initialized" flag so the child re-initializes lazily.
  __kmp_init_runtime = FALSE;
#if KMP_USE_MONITOR
  __kmp_init_monitor = 0;
#endif
  __kmp_init_parallel = FALSE;
  __kmp_init_middle = FALSE;
  __kmp_init_serial = FALSE;
  TCW_4(__kmp_init_gtid, FALSE);
  __kmp_init_common = FALSE;

  TCW_4(__kmp_init_user_locks, FALSE);
#if !KMP_USE_DYNAMIC_LOCK
  __kmp_user_lock_table.used = 1;
  __kmp_user_lock_table.allocated = 0;
  __kmp_user_lock_table.table = NULL;
  __kmp_lock_blocks = NULL;
#endif

  __kmp_all_nth = 0;
  TCW_4(__kmp_nth, 0);

  // Abandon the parent's thread/team pools (memory is deliberately leaked).
  __kmp_thread_pool = NULL;
  __kmp_thread_pool_insert_pt = NULL;
  __kmp_team_pool = NULL;

  /* Must actually zero all the *cache arguments passed to __kmpc_threadprivate
     here so threadprivate doesn't use stale data */
  KA_TRACE(10, ("__kmp_atfork_child: checking cache address list %p\n",
                __kmp_threadpriv_cache_list));

  while (__kmp_threadpriv_cache_list != NULL) {

    if (*__kmp_threadpriv_cache_list->addr != NULL) {
      KC_TRACE(50, ("__kmp_atfork_child: zeroing cache at address %p\n",
                    &(*__kmp_threadpriv_cache_list->addr)));

      *__kmp_threadpriv_cache_list->addr = NULL;
    }
    __kmp_threadpriv_cache_list = __kmp_threadpriv_cache_list->next;
  }

  // NOTE(review): __kmp_init_runtime was already cleared above; this repeat
  // appears redundant but is kept as-is.
  __kmp_init_runtime = FALSE;

  /* reset statically initialized locks */
  __kmp_init_bootstrap_lock(&__kmp_initz_lock);
  __kmp_init_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_init_bootstrap_lock(&__kmp_console_lock);
  __kmp_init_bootstrap_lock(&__kmp_task_team_lock);

#if USE_ITT_BUILD
  __kmp_itt_reset(); // reset ITT's global state
#endif /* USE_ITT_BUILD */

  /* This is necessary to make sure no stale data is left around */
  /* AC: customers complain that we use unsafe routines in the atfork
     handler. Mathworks: dlsym() is unsafe. We call dlsym and dlopen in
     dynamic_link when check the presence of shared tbbmalloc library.
     Suggestion is to make the library initialization lazier, similar
     to what done for __kmpc_begin(). */
  // TODO: synchronize all static initializations with regular library
  // startup; look at kmp_global.cpp and etc.
  //__kmp_internal_begin ();
}

// Register the three fork handlers exactly once per process.
void __kmp_register_atfork(void) {
  if (__kmp_need_register_atfork) {
    int status = pthread_atfork(__kmp_atfork_prepare, __kmp_atfork_parent,
                                __kmp_atfork_child);
    KMP_CHECK_SYSFAIL("pthread_atfork", status);
    __kmp_need_register_atfork = FALSE;
  }
}

// One-time creation of the mutex/condvar attribute objects shared by all
// per-thread suspend/resume primitives.
void __kmp_suspend_initialize(void) {
  int status;
  status = pthread_mutexattr_init(&__kmp_suspend_mutex_attr);
  KMP_CHECK_SYSFAIL("pthread_mutexattr_init", status);
  status = pthread_condattr_init(&__kmp_suspend_cond_attr);
  KMP_CHECK_SYSFAIL("pthread_condattr_init", status);
}

// Lazily (re-)initialize th's suspend mutex/condvar for the current fork
// generation.  th_suspend_init_count acts as a state word: -1 means "another
// thread is initializing right now"; __kmp_fork_count + 1 means "done".
void __kmp_suspend_initialize_thread(kmp_info_t *th) {
  ANNOTATE_HAPPENS_AFTER(&th->th.th_suspend_init_count);
  int old_value = KMP_ATOMIC_LD_RLX(&th->th.th_suspend_init_count);
  int new_value = __kmp_fork_count + 1;
  // Return if already initialized
  if (old_value == new_value)
    return;
  // Wait, then return if being initialized
  if (old_value == -1 || !__kmp_atomic_compare_store(
                             &th->th.th_suspend_init_count, old_value, -1)) {
    while (KMP_ATOMIC_LD_ACQ(&th->th.th_suspend_init_count) != new_value) {
      KMP_CPU_PAUSE();
    }
  } else {
    // Claim to be the initializer and do initializations
    int status;
    status = pthread_cond_init(&th->th.th_suspend_cv.c_cond,
                               &__kmp_suspend_cond_attr);
    KMP_CHECK_SYSFAIL("pthread_cond_init", status);
    status = pthread_mutex_init(&th->th.th_suspend_mx.m_mutex,
                                &__kmp_suspend_mutex_attr);
    KMP_CHECK_SYSFAIL("pthread_mutex_init", status);
    KMP_ATOMIC_ST_REL(&th->th.th_suspend_init_count, new_value);
    ANNOTATE_HAPPENS_BEFORE(&th->th.th_suspend_init_count);
  }
}

// Destroy th's suspend primitives if they were initialized in this process
// instance (EBUSY is tolerated: the object may still be in use elsewhere).
void __kmp_suspend_uninitialize_thread(kmp_info_t *th) {
  if (KMP_ATOMIC_LD_ACQ(&th->th.th_suspend_init_count) > __kmp_fork_count) {
    /* this means we have initialize the suspension pthread objects for this
       thread in this instance of the process */
    int status;

    status = pthread_cond_destroy(&th->th.th_suspend_cv.c_cond);
    if (status != 0 && status != EBUSY) {
      KMP_SYSFAIL("pthread_cond_destroy", status);
    }
    status = pthread_mutex_destroy(&th->th.th_suspend_mx.m_mutex);
    if (status != 0 && status != EBUSY) {
      KMP_SYSFAIL("pthread_mutex_destroy", status);
    }
    --th->th.th_suspend_init_count;
    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&th->th.th_suspend_init_count) ==
                     __kmp_fork_count);
  }
}

// return true if lock obtained, false otherwise
int __kmp_try_suspend_mx(kmp_info_t *th) {
  return (pthread_mutex_trylock(&th->th.th_suspend_mx.m_mutex) == 0);
}

void __kmp_lock_suspend_mx(kmp_info_t *th) {
  int status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex);
  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
}

void __kmp_unlock_suspend_mx(kmp_info_t *th) {
  int status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex);
  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
}

/* This routine puts the calling thread to sleep after setting the sleep bit
   for the indicated flag variable to true.
*/ template static inline void __kmp_suspend_template(int th_gtid, C *flag) { KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_suspend); kmp_info_t *th = __kmp_threads[th_gtid]; int status; typename C::flag_t old_spin; KF_TRACE(30, ("__kmp_suspend_template: T#%d enter for flag = %p\n", th_gtid, flag->get())); __kmp_suspend_initialize_thread(th); status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex); KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); KF_TRACE(10, ("__kmp_suspend_template: T#%d setting sleep bit for spin(%p)\n", th_gtid, flag->get())); /* TODO: shouldn't this use release semantics to ensure that __kmp_suspend_initialize_thread gets called first? */ old_spin = flag->set_sleeping(); if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && __kmp_pause_status != kmp_soft_paused) { flag->unset_sleeping(); status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); return; } KF_TRACE(5, ("__kmp_suspend_template: T#%d set sleep bit for spin(%p)==%x," " was %x\n", th_gtid, flag->get(), flag->load(), old_spin)); if (flag->done_check_val(old_spin)) { old_spin = flag->unset_sleeping(); KF_TRACE(5, ("__kmp_suspend_template: T#%d false alarm, reset sleep bit " "for spin(%p)\n", th_gtid, flag->get())); } else { /* Encapsulate in a loop as the documentation states that this may "with low probability" return when the condition variable has not been signaled or broadcast */ int deactivated = FALSE; TCW_PTR(th->th.th_sleep_loc, (void *)flag); while (flag->is_sleeping()) { #ifdef DEBUG_SUSPEND char buffer[128]; __kmp_suspend_count++; __kmp_print_cond(buffer, &th->th.th_suspend_cv); __kmp_printf("__kmp_suspend_template: suspending T#%d: %s\n", th_gtid, buffer); #endif // Mark the thread as no longer active (only in the first iteration of the // loop). 
if (!deactivated) { th->th.th_active = FALSE; if (th->th.th_active_in_pool) { th->th.th_active_in_pool = FALSE; KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); } deactivated = TRUE; } #if USE_SUSPEND_TIMEOUT struct timespec now; struct timeval tval; int msecs; status = gettimeofday(&tval, NULL); KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status); TIMEVAL_TO_TIMESPEC(&tval, &now); msecs = (4 * __kmp_dflt_blocktime) + 200; now.tv_sec += msecs / 1000; now.tv_nsec += (msecs % 1000) * 1000; KF_TRACE(15, ("__kmp_suspend_template: T#%d about to perform " "pthread_cond_timedwait\n", th_gtid)); status = pthread_cond_timedwait(&th->th.th_suspend_cv.c_cond, &th->th.th_suspend_mx.m_mutex, &now); #else KF_TRACE(15, ("__kmp_suspend_template: T#%d about to perform" " pthread_cond_wait\n", th_gtid)); status = pthread_cond_wait(&th->th.th_suspend_cv.c_cond, &th->th.th_suspend_mx.m_mutex); #endif if ((status != 0) && (status != EINTR) && (status != ETIMEDOUT)) { KMP_SYSFAIL("pthread_cond_wait", status); } #ifdef KMP_DEBUG if (status == ETIMEDOUT) { if (flag->is_sleeping()) { KF_TRACE(100, ("__kmp_suspend_template: T#%d timeout wakeup\n", th_gtid)); } else { KF_TRACE(2, ("__kmp_suspend_template: T#%d timeout wakeup, sleep bit " "not set!\n", th_gtid)); } } else if (flag->is_sleeping()) { KF_TRACE(100, ("__kmp_suspend_template: T#%d spurious wakeup\n", th_gtid)); } #endif } // while // Mark the thread as active again (if it was previous marked as inactive) if (deactivated) { th->th.th_active = TRUE; if (TCR_4(th->th.th_in_pool)) { KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); th->th.th_active_in_pool = TRUE; } } } #ifdef DEBUG_SUSPEND { char buffer[128]; __kmp_print_cond(buffer, &th->th.th_suspend_cv); __kmp_printf("__kmp_suspend_template: T#%d has awakened: %s\n", th_gtid, buffer); } #endif status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex); KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status); KF_TRACE(30, 
("__kmp_suspend_template: T#%d exit\n", th_gtid)); } void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { __kmp_suspend_template(th_gtid, flag); } void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { __kmp_suspend_template(th_gtid, flag); } void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { __kmp_suspend_template(th_gtid, flag); } /* This routine signals the thread specified by target_gtid to wake up after setting the sleep bit indicated by the flag argument to FALSE. The target thread must already have called __kmp_suspend_template() */ template static inline void __kmp_resume_template(int target_gtid, C *flag) { KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_resume); kmp_info_t *th = __kmp_threads[target_gtid]; int status; #ifdef KMP_DEBUG int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1; #endif KF_TRACE(30, ("__kmp_resume_template: T#%d wants to wakeup T#%d enter\n", gtid, target_gtid)); KMP_DEBUG_ASSERT(gtid != target_gtid); __kmp_suspend_initialize_thread(th); status = pthread_mutex_lock(&th->th.th_suspend_mx.m_mutex); KMP_CHECK_SYSFAIL("pthread_mutex_lock", status); if (!flag) { // coming from __kmp_null_resume_wrapper flag = (C *)CCAST(void *, th->th.th_sleep_loc); } // First, check if the flag is null or its type has changed. If so, someone // else woke it up. 
// (continuation of __kmp_resume_template)
  if (!flag || flag->get_type() != flag->get_ptr_type()) { // get_ptr_type
    // simply shows what flag was cast to
    KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already "
                 "awake: flag(%p)\n",
                 gtid, target_gtid, NULL));
    status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex);
    KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
    return;
  } else { // if multiple threads are sleeping, flag should be internally
    // referring to a specific thread here
    typename C::flag_t old_spin = flag->unset_sleeping();
    if (!flag->is_sleeping_val(old_spin)) {
      // Sleep bit was already clear: nothing to do, the target is awake.
      KF_TRACE(5, ("__kmp_resume_template: T#%d exiting, thread T#%d already "
                   "awake: flag(%p): "
                   "%u => %u\n",
                   gtid, target_gtid, flag->get(), old_spin, flag->load()));
      status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex);
      KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
      return;
    }
    KF_TRACE(5, ("__kmp_resume_template: T#%d about to wakeup T#%d, reset "
                 "sleep bit for flag's loc(%p): "
                 "%u => %u\n",
                 gtid, target_gtid, flag->get(), old_spin, flag->load()));
  }
  TCW_PTR(th->th.th_sleep_loc, NULL);

#ifdef DEBUG_SUSPEND
  {
    char buffer[128];
    __kmp_print_cond(buffer, &th->th.th_suspend_cv);
    __kmp_printf("__kmp_resume_template: T#%d resuming T#%d: %s\n", gtid,
                 target_gtid, buffer);
  }
#endif
  status = pthread_cond_signal(&th->th.th_suspend_cv.c_cond);
  KMP_CHECK_SYSFAIL("pthread_cond_signal", status);
  status = pthread_mutex_unlock(&th->th.th_suspend_mx.m_mutex);
  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
  KF_TRACE(30, ("__kmp_resume_template: T#%d exiting after signaling wake up"
                " for T#%d\n",
                gtid, target_gtid));
}

void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) {
  __kmp_resume_template(target_gtid, flag);
}
void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) {
  __kmp_resume_template(target_gtid, flag);
}
void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) {
  __kmp_resume_template(target_gtid, flag);
}

#if KMP_USE_MONITOR
// Signal the monitor thread's condition variable under __kmp_wait_mx.
void __kmp_resume_monitor() {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_resume);
  int status;
#ifdef KMP_DEBUG
  int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
  KF_TRACE(30, ("__kmp_resume_monitor: T#%d wants to wakeup T#%d enter\n", gtid,
                KMP_GTID_MONITOR));
  KMP_DEBUG_ASSERT(gtid != KMP_GTID_MONITOR);
#endif
  status = pthread_mutex_lock(&__kmp_wait_mx.m_mutex);
  KMP_CHECK_SYSFAIL("pthread_mutex_lock", status);
#ifdef DEBUG_SUSPEND
  {
    char buffer[128];
    __kmp_print_cond(buffer, &__kmp_wait_cv.c_cond);
    __kmp_printf("__kmp_resume_monitor: T#%d resuming T#%d: %s\n", gtid,
                 KMP_GTID_MONITOR, buffer);
  }
#endif
  status = pthread_cond_signal(&__kmp_wait_cv.c_cond);
  KMP_CHECK_SYSFAIL("pthread_cond_signal", status);
  status = pthread_mutex_unlock(&__kmp_wait_mx.m_mutex);
  KMP_CHECK_SYSFAIL("pthread_mutex_unlock", status);
  KF_TRACE(30, ("__kmp_resume_monitor: T#%d exiting after signaling wake up"
                " for T#%d\n",
                gtid, KMP_GTID_MONITOR));
}
#endif // KMP_USE_MONITOR

// Yield the processor to another runnable thread.
void __kmp_yield() { sched_yield(); }

// Store gtid+1 in thread-local storage (0 is reserved to mean "not set").
void __kmp_gtid_set_specific(int gtid) {
  if (__kmp_init_gtid) {
    int status;
    status = pthread_setspecific(__kmp_gtid_threadprivate_key,
                                 (void *)(intptr_t)(gtid + 1));
    KMP_CHECK_SYSFAIL("pthread_setspecific", status);
  } else {
    KA_TRACE(50, ("__kmp_gtid_set_specific: runtime shutdown, returning\n"));
  }
}

// Fetch the gtid stored by __kmp_gtid_set_specific; KMP_GTID_DNE if unset,
// KMP_GTID_SHUTDOWN once the runtime is shut down.
int __kmp_gtid_get_specific() {
  int gtid;
  if (!__kmp_init_gtid) {
    KA_TRACE(50, ("__kmp_gtid_get_specific: runtime shutdown, returning "
                  "KMP_GTID_SHUTDOWN\n"));
    return KMP_GTID_SHUTDOWN;
  }
  gtid = (int)(size_t)pthread_getspecific(__kmp_gtid_threadprivate_key);
  if (gtid == 0) {
    gtid = KMP_GTID_DNE;
  } else {
    gtid--;
  }
  KA_TRACE(50, ("__kmp_gtid_get_specific: key:%d gtid:%d\n",
                __kmp_gtid_threadprivate_key, gtid));
  return gtid;
}

// CPU time (user + children's user) consumed by this process, in seconds.
double __kmp_read_cpu_time(void) {
  /*clock_t t;*/
  struct tms buffer;

  /*t =*/times(&buffer);

  return (buffer.tms_utime + buffer.tms_cutime) / (double)CLOCKS_PER_SEC;
}

// Fill *info from getrusage(RUSAGE_SELF); returns nonzero on failure.
int __kmp_read_system_info(struct kmp_sys_info *info) {
  int status;
  struct rusage r_usage;

  memset(info, 0, sizeof(*info));

  status = getrusage(RUSAGE_SELF, &r_usage);
  KMP_CHECK_SYSFAIL_ERRNO("getrusage", status);

  // The maximum resident set size utilized (in kilobytes)
  info->maxrss = r_usage.ru_maxrss;
  // The number of page faults serviced without any I/O
  info->minflt = r_usage.ru_minflt;
  // The number of page faults serviced that required I/O
  info->majflt = r_usage.ru_majflt;
  // The number of times a process was "swapped" out of memory
  info->nswap = r_usage.ru_nswap;
  // The number of times the file system had to perform input
  info->inblock = r_usage.ru_inblock;
  // The number of times the file system had to perform output
  info->oublock = r_usage.ru_oublock;
  // The number of times a context switch was voluntarily
  info->nvcsw = r_usage.ru_nvcsw;
  // The number of times a context switch was forced
  info->nivcsw = r_usage.ru_nivcsw;

  return (status != 0);
}

// Elapsed wall-clock seconds since __kmp_clear_system_time() was last called.
void __kmp_read_system_time(double *delta) {
  double t_ns;
  struct timeval tval;
  struct timespec stop;
  int status;

  status = gettimeofday(&tval, NULL);
  KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status);
  TIMEVAL_TO_TIMESPEC(&tval, &stop);
  t_ns = TS2NS(stop) - TS2NS(__kmp_sys_timer_data.start);
  *delta = (t_ns * 1e-9);
}

// Record "now" as the reference point for __kmp_read_system_time().
void __kmp_clear_system_time(void) {
  struct timeval tval;
  int status;
  status = gettimeofday(&tval, NULL);
  KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status);
  TIMEVAL_TO_TIMESPEC(&tval, &__kmp_sys_timer_data.start);
}

// Number of online processors, via the appropriate OS-specific query.
static int __kmp_get_xproc(void) {
  int r = 0;

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_HURD

  r = sysconf(_SC_NPROCESSORS_ONLN);

#elif KMP_OS_DARWIN

  // Bug C77011 High "OpenMP Threads and number of active cores".

  // Find the number of available CPUs.
  kern_return_t rc;
  host_basic_info_data_t info;
  mach_msg_type_number_t num = HOST_BASIC_INFO_COUNT;
  rc = host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&info, &num);
  if (rc == 0 && num == HOST_BASIC_INFO_COUNT) {
    // Cannot use KA_TRACE() here because this code works before trace support
    // is initialized.
r = info.avail_cpus; } else { KMP_WARNING(CantGetNumAvailCPU); KMP_INFORM(AssumedNumCPU); } #else #error "Unknown or unsupported OS." #endif return r > 0 ? r : 2; /* guess value of 2 if OS told us 0 */ } // __kmp_get_xproc int __kmp_read_from_file(char const *path, char const *format, ...) { int result; va_list args; va_start(args, format); FILE *f = fopen(path, "rb"); if (f == NULL) return 0; result = vfscanf(f, format, args); fclose(f); return result; } void __kmp_runtime_initialize(void) { int status; pthread_mutexattr_t mutex_attr; pthread_condattr_t cond_attr; if (__kmp_init_runtime) { return; } #if (KMP_ARCH_X86 || KMP_ARCH_X86_64) if (!__kmp_cpuinfo.initialized) { __kmp_query_cpuid(&__kmp_cpuinfo); } #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ __kmp_xproc = __kmp_get_xproc(); #if ! KMP_32_BIT_ARCH struct rlimit rlim; // read stack size of calling thread, save it as default for worker threads; // this should be done before reading environment variables status = getrlimit(RLIMIT_STACK, &rlim); if (status == 0) { // success? 
__kmp_stksize = rlim.rlim_cur; __kmp_check_stksize(&__kmp_stksize); // check value and adjust if needed } #endif /* KMP_32_BIT_ARCH */ if (sysconf(_SC_THREADS)) { /* Query the maximum number of threads */ __kmp_sys_max_nth = sysconf(_SC_THREAD_THREADS_MAX); if (__kmp_sys_max_nth == -1) { /* Unlimited threads for NPTL */ __kmp_sys_max_nth = INT_MAX; } else if (__kmp_sys_max_nth <= 1) { /* Can't tell, just use PTHREAD_THREADS_MAX */ __kmp_sys_max_nth = KMP_MAX_NTH; } /* Query the minimum stack size */ __kmp_sys_min_stksize = sysconf(_SC_THREAD_STACK_MIN); if (__kmp_sys_min_stksize <= 1) { __kmp_sys_min_stksize = KMP_MIN_STKSIZE; } } /* Set up minimum number of threads to switch to TLS gtid */ __kmp_tls_gtid_min = KMP_TLS_GTID_MIN; status = pthread_key_create(&__kmp_gtid_threadprivate_key, __kmp_internal_end_dest); KMP_CHECK_SYSFAIL("pthread_key_create", status); status = pthread_mutexattr_init(&mutex_attr); KMP_CHECK_SYSFAIL("pthread_mutexattr_init", status); status = pthread_mutex_init(&__kmp_wait_mx.m_mutex, &mutex_attr); KMP_CHECK_SYSFAIL("pthread_mutex_init", status); status = pthread_condattr_init(&cond_attr); KMP_CHECK_SYSFAIL("pthread_condattr_init", status); status = pthread_cond_init(&__kmp_wait_cv.c_cond, &cond_attr); KMP_CHECK_SYSFAIL("pthread_cond_init", status); #if USE_ITT_BUILD __kmp_itt_initialize(); #endif /* USE_ITT_BUILD */ __kmp_init_runtime = TRUE; } void __kmp_runtime_destroy(void) { int status; if (!__kmp_init_runtime) { return; // Nothing to do. 
} #if USE_ITT_BUILD __kmp_itt_destroy(); #endif /* USE_ITT_BUILD */ status = pthread_key_delete(__kmp_gtid_threadprivate_key); KMP_CHECK_SYSFAIL("pthread_key_delete", status); status = pthread_mutex_destroy(&__kmp_wait_mx.m_mutex); if (status != 0 && status != EBUSY) { KMP_SYSFAIL("pthread_mutex_destroy", status); } status = pthread_cond_destroy(&__kmp_wait_cv.c_cond); if (status != 0 && status != EBUSY) { KMP_SYSFAIL("pthread_cond_destroy", status); } #if KMP_AFFINITY_SUPPORTED __kmp_affinity_uninitialize(); #endif __kmp_init_runtime = FALSE; } /* Put the thread to sleep for a time period */ /* NOTE: not currently used anywhere */ void __kmp_thread_sleep(int millis) { sleep((millis + 500) / 1000); } /* Calculate the elapsed wall clock time for the user */ void __kmp_elapsed(double *t) { int status; #ifdef FIX_SGI_CLOCK struct timespec ts; status = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); KMP_CHECK_SYSFAIL_ERRNO("clock_gettime", status); *t = (double)ts.tv_nsec * (1.0 / (double)KMP_NSEC_PER_SEC) + (double)ts.tv_sec; #else struct timeval tv; status = gettimeofday(&tv, NULL); KMP_CHECK_SYSFAIL_ERRNO("gettimeofday", status); *t = (double)tv.tv_usec * (1.0 / (double)KMP_USEC_PER_SEC) + (double)tv.tv_sec; #endif } /* Calculate the elapsed wall clock tick for the user */ void __kmp_elapsed_tick(double *t) { *t = 1 / (double)CLOCKS_PER_SEC; } /* Return the current time stamp in nsec */ kmp_uint64 __kmp_now_nsec() { struct timeval t; gettimeofday(&t, NULL); kmp_uint64 nsec = (kmp_uint64)KMP_NSEC_PER_SEC * (kmp_uint64)t.tv_sec + (kmp_uint64)1000 * (kmp_uint64)t.tv_usec; return nsec; } #if KMP_ARCH_X86 || KMP_ARCH_X86_64 /* Measure clock ticks per millisecond */ void __kmp_initialize_system_tick() { kmp_uint64 now, nsec2, diff; kmp_uint64 delay = 100000; // 50~100 usec on most machines. 
kmp_uint64 nsec = __kmp_now_nsec(); kmp_uint64 goal = __kmp_hardware_timestamp() + delay; while ((now = __kmp_hardware_timestamp()) < goal) ; nsec2 = __kmp_now_nsec(); diff = nsec2 - nsec; if (diff > 0) { kmp_uint64 tpms = (kmp_uint64)(1e6 * (delay + (now - goal)) / diff); if (tpms > 0) __kmp_ticks_per_msec = tpms; } } #endif /* Determine whether the given address is mapped into the current address space. */ int __kmp_is_address_mapped(void *addr) { int found = 0; int rc; -#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_HURD +#if KMP_OS_LINUX || KMP_OS_HURD /* On GNUish OSes, read the /proc//maps pseudo-file to get all the address ranges mapped into the address space. */ char *name = __kmp_str_format("/proc/%d/maps", getpid()); FILE *file = NULL; file = fopen(name, "r"); KMP_ASSERT(file != NULL); for (;;) { void *beginning = NULL; void *ending = NULL; char perms[5]; rc = fscanf(file, "%p-%p %4s %*[^\n]\n", &beginning, &ending, perms); if (rc == EOF) { break; } KMP_ASSERT(rc == 3 && KMP_STRLEN(perms) == 4); // Make sure all fields are read. // Ending address is not included in the region, but beginning is. if ((addr >= beginning) && (addr < ending)) { perms[2] = 0; // 3th and 4th character does not matter. if (strcmp(perms, "rw") == 0) { // Memory we are looking for should be readable and writable. found = 1; } break; } } // Free resources. fclose(file); KMP_INTERNAL_FREE(name); +#elif KMP_OS_FREEBSD + char *buf; + size_t lstsz; + int mib[] = {CTL_KERN, KERN_PROC, KERN_PROC_VMMAP, getpid()}; + rc = sysctl(mib, 4, NULL, &lstsz, NULL, 0); + if (rc < 0) + return 0; + // We pass from number of vm entry's semantic + // to size of whole entry map list. 
+ lstsz = lstsz * 4 / 3; + buf = reinterpret_cast<char *>(kmpc_malloc(lstsz)); + rc = sysctl(mib, 4, buf, &lstsz, NULL, 0); + if (rc < 0) { + kmpc_free(buf); + return 0; + } + char *lw = buf; + char *up = buf + lstsz; + + while (lw < up) { + struct kinfo_vmentry *cur = reinterpret_cast<struct kinfo_vmentry *>(lw); + size_t cursz = cur->kve_structsize; + if (cursz == 0) + break; + void *start = reinterpret_cast<void *>(cur->kve_start); + void *end = reinterpret_cast<void *>(cur->kve_end); + // Readable/Writable addresses within current map entry + if ((addr >= start) && (addr < end)) { + if ((cur->kve_protection & KVME_PROT_READ) != 0 && + (cur->kve_protection & KVME_PROT_WRITE) != 0) { + found = 1; + break; + } + } + lw += cursz; + } + kmpc_free(buf); + #elif KMP_OS_DARWIN /* On OS X*, /proc pseudo filesystem is not available. Try to read memory using vm interface. */ int buffer; vm_size_t count; rc = vm_read_overwrite( mach_task_self(), // Task to read memory of. (vm_address_t)(addr), // Address to read from. 1, // Number of bytes to be read. (vm_address_t)(&buffer), // Address of buffer to save read bytes in. &count // Address of var to save number of read bytes in. ); if (rc == 0) { // Memory successfully read. 
found = 1; } #elif KMP_OS_NETBSD int mib[5]; mib[0] = CTL_VM; mib[1] = VM_PROC; mib[2] = VM_PROC_MAP; mib[3] = getpid(); mib[4] = sizeof(struct kinfo_vmentry); size_t size; rc = sysctl(mib, __arraycount(mib), NULL, &size, NULL, 0); KMP_ASSERT(!rc); KMP_ASSERT(size); size = size * 4 / 3; struct kinfo_vmentry *kiv = (struct kinfo_vmentry *)KMP_INTERNAL_MALLOC(size); KMP_ASSERT(kiv); rc = sysctl(mib, __arraycount(mib), kiv, &size, NULL, 0); KMP_ASSERT(!rc); KMP_ASSERT(size); for (size_t i = 0; i < size; i++) { if (kiv[i].kve_start >= (uint64_t)addr && kiv[i].kve_end <= (uint64_t)addr) { found = 1; break; } } KMP_INTERNAL_FREE(kiv); #elif KMP_OS_DRAGONFLY || KMP_OS_OPENBSD // FIXME(DragonFly, OpenBSD): Implement this found = 1; #else #error "Unknown or unsupported OS" #endif return found; } // __kmp_is_address_mapped #ifdef USE_LOAD_BALANCE #if KMP_OS_DARWIN || KMP_OS_NETBSD // The function returns the rounded value of the system load average // during given time interval which depends on the value of // __kmp_load_balance_interval variable (default is 60 sec, other values // may be 300 sec or 900 sec). // It returns -1 in case of error. int __kmp_get_load_balance(int max) { double averages[3]; int ret_avg = 0; int res = getloadavg(averages, 3); // Check __kmp_load_balance_interval to determine which of averages to use. // getloadavg() may return the number of samples less than requested that is // less than 3. if (__kmp_load_balance_interval < 180 && (res >= 1)) { ret_avg = averages[0]; // 1 min } else if ((__kmp_load_balance_interval >= 180 && __kmp_load_balance_interval < 600) && (res >= 2)) { ret_avg = averages[1]; // 5 min } else if ((__kmp_load_balance_interval >= 600) && (res == 3)) { ret_avg = averages[2]; // 15 min } else { // Error occurred return -1; } return ret_avg; } #else // Linux* OS // The fuction returns number of running (not sleeping) threads, or -1 in case // of error. 
Error could be reported if Linux* OS kernel too old (without // "/proc" support). Counting running threads stops if max running threads // encountered. int __kmp_get_load_balance(int max) { static int permanent_error = 0; static int glb_running_threads = 0; // Saved count of the running threads for // the thread balance algortihm static double glb_call_time = 0; /* Thread balance algorithm call time */ int running_threads = 0; // Number of running threads in the system. DIR *proc_dir = NULL; // Handle of "/proc/" directory. struct dirent *proc_entry = NULL; kmp_str_buf_t task_path; // "/proc//task//" path. DIR *task_dir = NULL; // Handle of "/proc//task//" directory. struct dirent *task_entry = NULL; int task_path_fixed_len; kmp_str_buf_t stat_path; // "/proc//task//stat" path. int stat_file = -1; int stat_path_fixed_len; int total_processes = 0; // Total number of processes in system. int total_threads = 0; // Total number of threads in system. double call_time = 0.0; __kmp_str_buf_init(&task_path); __kmp_str_buf_init(&stat_path); __kmp_elapsed(&call_time); if (glb_call_time && (call_time - glb_call_time < __kmp_load_balance_interval)) { running_threads = glb_running_threads; goto finish; } glb_call_time = call_time; // Do not spend time on scanning "/proc/" if we have a permanent error. if (permanent_error) { running_threads = -1; goto finish; } if (max <= 0) { max = INT_MAX; } // Open "/proc/" directory. proc_dir = opendir("/proc"); if (proc_dir == NULL) { // Cannot open "/prroc/". Probably the kernel does not support it. Return an // error now and in subsequent calls. running_threads = -1; permanent_error = 1; goto finish; } // Initialize fixed part of task_path. This part will not change. __kmp_str_buf_cat(&task_path, "/proc/", 6); task_path_fixed_len = task_path.used; // Remember number of used characters. proc_entry = readdir(proc_dir); while (proc_entry != NULL) { // Proc entry is a directory and name starts with a digit. 
Assume it is a // process' directory. if (proc_entry->d_type == DT_DIR && isdigit(proc_entry->d_name[0])) { ++total_processes; // Make sure init process is the very first in "/proc", so we can replace // strcmp( proc_entry->d_name, "1" ) == 0 with simpler total_processes == // 1. We are going to check that total_processes == 1 => d_name == "1" is // true (where "=>" is implication). Since C++ does not have => operator, // let us replace it with its equivalent: a => b == ! a || b. KMP_DEBUG_ASSERT(total_processes != 1 || strcmp(proc_entry->d_name, "1") == 0); // Construct task_path. task_path.used = task_path_fixed_len; // Reset task_path to "/proc/". __kmp_str_buf_cat(&task_path, proc_entry->d_name, KMP_STRLEN(proc_entry->d_name)); __kmp_str_buf_cat(&task_path, "/task", 5); task_dir = opendir(task_path.str); if (task_dir == NULL) { // Process can finish between reading "/proc/" directory entry and // opening process' "task/" directory. So, in general case we should not // complain, but have to skip this process and read the next one. But on // systems with no "task/" support we will spend lot of time to scan // "/proc/" tree again and again without any benefit. "init" process // (its pid is 1) should exist always, so, if we cannot open // "/proc/1/task/" directory, it means "task/" is not supported by // kernel. Report an error now and in the future. if (strcmp(proc_entry->d_name, "1") == 0) { running_threads = -1; permanent_error = 1; goto finish; } } else { // Construct fixed part of stat file path. __kmp_str_buf_clear(&stat_path); __kmp_str_buf_cat(&stat_path, task_path.str, task_path.used); __kmp_str_buf_cat(&stat_path, "/", 1); stat_path_fixed_len = stat_path.used; task_entry = readdir(task_dir); while (task_entry != NULL) { // It is a directory and name starts with a digit. if (proc_entry->d_type == DT_DIR && isdigit(task_entry->d_name[0])) { ++total_threads; // Consruct complete stat file path. 
Easiest way would be: // __kmp_str_buf_print( & stat_path, "%s/%s/stat", task_path.str, // task_entry->d_name ); // but seriae of __kmp_str_buf_cat works a bit faster. stat_path.used = stat_path_fixed_len; // Reset stat path to its fixed part. __kmp_str_buf_cat(&stat_path, task_entry->d_name, KMP_STRLEN(task_entry->d_name)); __kmp_str_buf_cat(&stat_path, "/stat", 5); // Note: Low-level API (open/read/close) is used. High-level API // (fopen/fclose) works ~ 30 % slower. stat_file = open(stat_path.str, O_RDONLY); if (stat_file == -1) { // We cannot report an error because task (thread) can terminate // just before reading this file. } else { /* Content of "stat" file looks like: 24285 (program) S ... It is a single line (if program name does not include funny symbols). First number is a thread id, then name of executable file name in paretheses, then state of the thread. We need just thread state. Good news: Length of program name is 15 characters max. Longer names are truncated. Thus, we need rather short buffer: 15 chars for program name + 2 parenthesis, + 3 spaces + ~7 digits of pid = 37. Bad news: Program name may contain special symbols like space, closing parenthesis, or even new line. This makes parsing "stat" file not 100 % reliable. In case of fanny program names parsing may fail (report incorrect thread state). Parsing "status" file looks more promissing (due to different file structure and escaping special symbols) but reading and parsing of "status" file works slower. -- ln */ char buffer[65]; int len; len = read(stat_file, buffer, sizeof(buffer) - 1); if (len >= 0) { buffer[len] = 0; // Using scanf: // sscanf( buffer, "%*d (%*s) %c ", & state ); // looks very nice, but searching for a closing parenthesis // works a bit faster. 
char *close_parent = strstr(buffer, ") "); if (close_parent != NULL) { char state = *(close_parent + 2); if (state == 'R') { ++running_threads; if (running_threads >= max) { goto finish; } } } } close(stat_file); stat_file = -1; } } task_entry = readdir(task_dir); } closedir(task_dir); task_dir = NULL; } } proc_entry = readdir(proc_dir); } // There _might_ be a timing hole where the thread executing this // code get skipped in the load balance, and running_threads is 0. // Assert in the debug builds only!!! KMP_DEBUG_ASSERT(running_threads > 0); if (running_threads <= 0) { running_threads = 1; } finish: // Clean up and exit. if (proc_dir != NULL) { closedir(proc_dir); } __kmp_str_buf_free(&task_path); if (task_dir != NULL) { closedir(task_dir); } __kmp_str_buf_free(&stat_path); if (stat_file != -1) { close(stat_file); } glb_running_threads = running_threads; return running_threads; } // __kmp_get_load_balance #endif // KMP_OS_DARWIN #endif // USE_LOAD_BALANCE #if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \ - ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || KMP_ARCH_PPC64) + ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \ + KMP_ARCH_PPC64 || KMP_ARCH_RISCV64) // we really only need the case with 1 argument, because CLANG always build // a struct of pointers to shared variables referenced in the outlined function int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc, void *p_argv[] #if OMPT_SUPPORT , void **exit_frame_ptr #endif ) { #if OMPT_SUPPORT *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); #endif switch (argc) { default: fprintf(stderr, "Too many args to microtask: %d!\n", argc); fflush(stderr); exit(-1); case 0: (*pkfn)(&gtid, &tid); break; case 1: (*pkfn)(&gtid, &tid, p_argv[0]); break; case 2: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]); break; case 3: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2]); break; case 4: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3]); break; case 5: (*pkfn)(&gtid, &tid, 
p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4]); break; case 6: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], p_argv[5]); break; case 7: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], p_argv[5], p_argv[6]); break; case 8: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7]); break; case 9: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7], p_argv[8]); break; case 10: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9]); break; case 11: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10]); break; case 12: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], p_argv[11]); break; case 13: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], p_argv[11], p_argv[12]); break; case 14: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], p_argv[11], p_argv[12], p_argv[13]); break; case 15: (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1], p_argv[2], p_argv[3], p_argv[4], p_argv[5], p_argv[6], p_argv[7], p_argv[8], p_argv[9], p_argv[10], p_argv[11], p_argv[12], p_argv[13], p_argv[14]); break; } - -#if OMPT_SUPPORT - *exit_frame_ptr = 0; -#endif return 1; } #endif // end of file // Index: projects/clang1000-import/contrib/llvm-project/openmp =================================================================== --- projects/clang1000-import/contrib/llvm-project/openmp (revision 357058) +++ projects/clang1000-import/contrib/llvm-project/openmp (revision 357059) Property changes on: 
projects/clang1000-import/contrib/llvm-project/openmp ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor/llvm-openmp/dist:r353954-357044