diff --git a/lib/libsys/_umtx_op.2 b/lib/libsys/_umtx_op.2 index 974850fb8425..c590f8e8e0c8 100644 --- a/lib/libsys/_umtx_op.2 +++ b/lib/libsys/_umtx_op.2 @@ -1,1539 +1,1541 @@ .\" Copyright (c) 2016 The FreeBSD Foundation .\" .\" This documentation was written by .\" Konstantin Belousov under sponsorship .\" from the FreeBSD Foundation. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .Dd November 23, 2020 .Dt _UMTX_OP 2 .Os .Sh NAME .Nm _umtx_op .Nd interface for implementation of userspace threading synchronization primitives .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In sys/types.h .In sys/umtx.h .Ft int .Fn _umtx_op "void *obj" "int op" "u_long val" "void *uaddr" "void *uaddr2" .Sh DESCRIPTION The .Fn _umtx_op system call provides kernel support for userspace implementation of the threading synchronization primitives. The .Lb libthr uses the syscall to implement .St -p1003.1-2001 pthread locks, like mutexes, condition variables and so on. .Ss STRUCTURES The operations, performed by the .Fn _umtx_op syscall, operate on userspace objects which are described by the following structures. Reserved fields and paddings are omitted. All objects require ABI-mandated alignment, but this is not currently enforced consistently on all architectures. .Pp The following flags are defined for flag fields of all structures: .Bl -tag -width indent .It Dv USYNC_PROCESS_SHARED Allow selection of the process-shared sleep queue for the thread sleep container, when the lock ownership cannot be granted immediately, and the operation must sleep. The process-shared or process-private sleep queue is selected based on the attributes of the memory mapping which contains the first byte of the structure, see .Xr mmap 2 . Otherwise, if the flag is not specified, the process-private sleep queue is selected regardless of the memory mapping attributes, as an optimization. .Pp See the .Sx SLEEP QUEUES subsection below for more details on sleep queues. .El .Bl -hang -offset indent .It Sy Mutex .Bd -literal struct umutex { volatile lwpid_t m_owner; uint32_t m_flags; uint32_t m_ceilings[2]; uintptr_t m_rb_lnk; }; .Ed .Pp The .Dv m_owner field is the actual lock. It contains either the thread identifier of the lock owner in the locked state, or zero when the lock is unowned. 
The highest bit set indicates that there is contention on the lock. Constants are defined for the following special values: .Bl -tag -width indent .It Dv UMUTEX_UNOWNED Zero, the value stored in the unowned lock. .It Dv UMUTEX_CONTESTED The contention indicator. .It Dv UMUTEX_RB_OWNERDEAD A thread owning the robust mutex terminated. The mutex is in the unlocked state. .It Dv UMUTEX_RB_NOTRECOV The robust mutex is in a non-recoverable state. It cannot be locked until reinitialized. .El .Pp The .Dv m_flags field may contain the following umutex-specific flags, in addition to the common flags: .Bl -tag -width indent .It Dv UMUTEX_PRIO_INHERIT Mutex implements the .Em Priority Inheritance protocol. .It Dv UMUTEX_PRIO_PROTECT Mutex implements the .Em Priority Protection protocol. .It Dv UMUTEX_ROBUST Mutex is robust, as described in the .Sx ROBUST UMUTEXES section below. .It Dv UMUTEX_NONCONSISTENT Robust mutex is in a transient non-consistent state. Not used by the kernel. .El .Pp In this manual page, mutexes not having the .Dv UMUTEX_PRIO_INHERIT and .Dv UMUTEX_PRIO_PROTECT flags set are called normal mutexes. Each type of mutex .Pq normal, priority-inherited, and priority-protected has a separate sleep queue associated with the given key. .Pp For priority protected mutexes, the .Dv m_ceilings array contains priority ceiling values. The .Dv m_ceilings[0] is the ceiling value for the mutex, as specified by .St -p1003.1-2008 for the .Em Priority Protected mutex protocol. The .Dv m_ceilings[1] is used only for the unlock of a priority protected mutex, when unlock is done in an order other than the reversed lock order. In this case, .Dv m_ceilings[1] must contain the ceiling value for the last locked priority protected mutex, for proper priority reassignment. If, instead, the unlocking mutex was the last priority propagated mutex locked by the thread, .Dv m_ceilings[1] should contain \-1. This is required because the kernel does not maintain an ordered lock list. .It Sy Condition variable .Bd -literal struct ucond { volatile uint32_t c_has_waiters; uint32_t c_flags; uint32_t c_clockid; }; .Ed .Pp A non-zero .Dv c_has_waiters value indicates that there are in-kernel waiters for the condition, executing the .Dv UMTX_OP_CV_WAIT request. .Pp The .Dv c_flags field contains flags. Only the common flags .Pq Dv USYNC_PROCESS_SHARED are defined for ucond. .Pp The .Dv c_clockid member provides the clock identifier to use for the timeout, when the .Dv UMTX_OP_CV_WAIT request has both the .Dv CVWAIT_CLOCKID flag and the timeout specified. Valid clock identifiers are a subset of those for .Xr clock_gettime 2 : .Bl -bullet -compact .It .Dv CLOCK_MONOTONIC .It .Dv CLOCK_MONOTONIC_FAST .It .Dv CLOCK_MONOTONIC_PRECISE .It .Dv CLOCK_PROF .It .Dv CLOCK_REALTIME .It .Dv CLOCK_REALTIME_FAST .It .Dv CLOCK_REALTIME_PRECISE .It .Dv CLOCK_SECOND .It +.Dv CLOCK_TAI +.It .Dv CLOCK_UPTIME .It .Dv CLOCK_UPTIME_FAST .It .Dv CLOCK_UPTIME_PRECISE .It .Dv CLOCK_VIRTUAL .El .It Sy Reader/writer lock .Bd -literal struct urwlock { volatile int32_t rw_state; uint32_t rw_flags; uint32_t rw_blocked_readers; uint32_t rw_blocked_writers; }; .Ed .Pp The .Dv rw_state field is the actual lock. It contains both the flags and the counter of granted read locks. The .Dv rw_state bits are named as follows: .Bl -tag -width indent .It Dv URWLOCK_WRITE_OWNER Write lock was granted. .It Dv URWLOCK_WRITE_WAITERS There are write lock waiters. .It Dv URWLOCK_READ_WAITERS There are read lock waiters.
.It Dv URWLOCK_READER_COUNT(c) Returns the count of currently granted read locks. .El .Pp At any given time there may be only one thread to which the write lock is granted on the .Vt struct urwlock , while no thread is granted the read lock. Alternatively, up to .Dv URWLOCK_MAX_READERS threads may be granted the read lock simultaneously, while the write lock is not granted to any thread. .Pp The following flags for the .Dv rw_flags member of .Vt struct urwlock are defined, in addition to the common flags: .Bl -tag -width indent .It Dv URWLOCK_PREFER_READER If specified, immediately grant read lock requests when the .Vt struct urwlock is already read-locked, even in the presence of unsatisfied write lock requests. By default, if there is a write lock waiter, further read requests are not granted, to prevent unfair write lock waiter starvation. .El .Pp The .Dv rw_blocked_readers and .Dv rw_blocked_writers members contain the count of threads which are sleeping in the kernel, waiting for the associated request type to be granted. The fields are used by the kernel to update the .Dv URWLOCK_READ_WAITERS and .Dv URWLOCK_WRITE_WAITERS flags of the .Dv rw_state lock after the requesting thread was woken up. .It Sy Semaphore .Bd -literal struct _usem2 { volatile uint32_t _count; uint32_t _flags; }; .Ed .Pp The .Dv _count word represents a counting semaphore. A non-zero value indicates an unlocked (posted) semaphore, while zero represents the locked state. The maximal supported semaphore count is .Dv USEM_MAX_COUNT . .Pp The .Dv _count word, besides the counter of posts (unlocks), also contains the .Dv USEM_HAS_WAITERS bit, which indicates that the locked semaphore has waiting threads. .Pp The .Dv USEM_COUNT() macro, applied to the .Dv _count word, returns the current semaphore counter, which is the number of posts issued on the semaphore. .Pp The following bits for the .Dv _flags member of .Vt struct _usem2 are defined, in addition to the common flags: .Bl -tag -width indent .It Dv USEM_NAMED The flag is ignored by the kernel. .El .It Sy Timeout parameter .Bd -literal struct _umtx_time { struct timespec _timeout; uint32_t _flags; uint32_t _clockid; }; .Ed .Pp Several .Fn _umtx_op operations allow the blocking time to be limited, failing the request if it cannot be satisfied in the specified time period. The timeout is specified by passing either the address of .Vt struct timespec , or its extended variant, .Vt struct _umtx_time , as the .Fa uaddr2 argument of .Fn _umtx_op . They are distinguished by the .Fa uaddr value, which must be equal to the size of the structure pointed to by .Fa uaddr2 , cast to .Vt uintptr_t . .Pp The .Dv _timeout member specifies the time when the timeout should occur. Legal values for the clock identifier .Dv _clockid are shared with the .Fa clock_id argument to the .Xr clock_gettime 2 function, and use the same underlying clocks. The specified clock is used to obtain the current time value. Interval counting is always performed by the monotonic wall clock. .Pp The .Dv _flags member allows the following flags to further define the timeout behaviour: .Bl -tag -width indent .It Dv UMTX_ABSTIME The .Dv _timeout value is the absolute time. The thread will be unblocked and the request failed when the specified clock value equals or exceeds the .Dv _timeout value. .Pp If the flag is absent, the timeout value is relative, that is, the amount of time measured by the monotonic wall clock from the moment the request starts.
.El .El .Ss SLEEP QUEUES When a locking request cannot be immediately satisfied, the thread is typically put to .Em sleep , which is a non-runnable state terminated by the .Em wake operation. Lock operations include a .Em try variant which returns an error rather than sleeping if the lock cannot be obtained. Also, .Fn _umtx_op provides requests which explicitly put the thread to sleep. .Pp Wakes need to know which threads to make runnable, so sleeping threads are grouped into containers called .Em sleep queues . A sleep queue is identified by a key, which for .Fn _umtx_op is defined as the physical address of some variable. Note that the .Em physical address is used, which means that the same variable mapped multiple times yields the same key value. This mechanism enables the construction of .Em process-shared locks. .Pp A related attribute of the key is shareability. Some requests always interpret keys as private for the current process, creating sleep queues with the scope of the current process even if the memory is shared. Others either select the shareability automatically from the mapping attributes, or take additional input as the .Dv USYNC_PROCESS_SHARED common flag. This is done as an optimization, allowing the lock scope to be limited regardless of the kind of backing memory. .Pp Only the address of the start byte of the variable specified as the key is important for determining the corresponding sleep queue. The size of the variable does not matter, so, for example, sleeping on the same address interpreted as .Vt uint32_t and as .Vt long on a little-endian 64-bit platform would collide. .Pp The last attribute of the key is the object type. The sleep queue to which a sleeping thread is assigned is an individual one for simple wait requests, mutexes, rwlocks, condvars and other primitives, even when the physical address of the key is the same. .Pp When waking up a limited number of threads from a given sleep queue, the highest-priority threads that have been blocked the longest on the queue are selected. .Ss ROBUST UMUTEXES The .Em robust umutexes are provided as a substrate for a userspace library to implement .Tn POSIX robust mutexes. A robust umutex must have the .Dv UMUTEX_ROBUST flag set. .Pp On thread termination, the kernel walks two lists of mutexes. The head addresses of the two lists must be provided by a prior .Dv UMTX_OP_ROBUST_LISTS request. The lists are singly-linked. The link to the next element is provided by the .Dv m_rb_lnk member of the .Vt struct umutex . .Pp Robust list processing is aborted if the kernel finds a mutex with any of the following conditions: .Bl -dash -offset indent -compact .It the .Dv UMUTEX_ROBUST flag is not set .It not owned by the current thread, except when the mutex is pointed to by the .Dv robust_inact_offset member of the .Vt struct umtx_robust_lists_params , registered for the current thread .It the combination of mutex flags is invalid .It a read of the umutex memory faults .It the list length limit described in .Xr libthr 3 is reached. .El .Pp Every mutex in both lists is unlocked as if the .Dv UMTX_OP_MUTEX_UNLOCK request is performed on it, but instead of the .Dv UMUTEX_UNOWNED value, the .Dv m_owner field is written with the .Dv UMUTEX_RB_OWNERDEAD value. When a mutex in the .Dv UMUTEX_RB_OWNERDEAD state is locked by the kernel due to a .Dv UMTX_OP_MUTEX_TRYLOCK or .Dv UMTX_OP_MUTEX_LOCK request, the lock is granted and the .Er EOWNERDEAD error is returned.
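.Pp
For illustration, the following sketch shows how a userspace wrapper might
classify the result of locking a robust umutex.
It is not the
.Lb libthr
implementation, and the helper name is invented:
.Bd -literal
#include <sys/types.h>
#include <sys/umtx.h>
#include <errno.h>

/*
 * Illustrative sketch only.  Returns 0 for a clean lock, EOWNERDEAD
 * when the lock was granted but the previous owner terminated (the
 * caller must then make the protected state consistent), or another
 * errno value, e.g. ENOTRECOVERABLE, when the lock was not granted.
 */
static int
robust_lock(struct umutex *mtx)
{
	if (_umtx_op(mtx, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL) == 0)
		return (0);
	return (errno);	/* EOWNERDEAD still means the lock is held. */
}
.Ed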
.Pp Also, the kernel handles the .Dv UMUTEX_RB_NOTRECOV value of the .Dv m_owner field specially, always returning the .Er ENOTRECOVERABLE error for lock attempts, without granting the lock. .Ss OPERATIONS The following operations, requested by the .Fa op argument to the function, are implemented: .Bl -tag -width indent .It Dv UMTX_OP_WAIT Wait. The arguments for the request are: .Bl -tag -width "obj" .It Fa obj Pointer to a variable of type .Vt long . .It Fa val Current value of the .Dv *obj . .El .Pp The current value of the variable pointed to by the .Fa obj argument is compared with the .Fa val . If they are equal, the requesting thread is put to interruptible sleep until woken up or the optionally specified timeout expires. .Pp The comparison and sleep are atomic. In other words, if another thread writes a new value to .Dv *obj and then issues .Dv UMTX_OP_WAKE , the request is guaranteed to not miss the wakeup, which might otherwise happen between comparison and blocking. .Pp The physical address of the memory where the .Fa *obj variable is located is used as a key to index sleeping threads. .Pp The read of the current value of the .Dv *obj variable is not guarded by barriers. In particular, it is the user's duty to ensure the lock acquire and release memory semantics, if the .Dv UMTX_OP_WAIT and .Dv UMTX_OP_WAKE requests are used as a substrate for implementing a simple lock. .Pp The request is not restartable. An unblocked signal delivered during the wait always results in sleep interruption and the .Er EINTR error. .Pp Optionally, a timeout for the request may be specified. .It Dv UMTX_OP_WAKE Wake the threads possibly sleeping due to .Dv UMTX_OP_WAIT . The arguments for the request are: .Bl -tag -width "obj" .It Fa obj Pointer to a variable, used as a key to find sleeping threads. .It Fa val Up to .Fa val threads are woken up by this request. Specify .Dv INT_MAX to wake up all waiters. .El .It Dv UMTX_OP_MUTEX_TRYLOCK Try to lock the umutex. The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Pointer to the umutex. .El .Pp Operates the same as the .Dv UMTX_OP_MUTEX_LOCK request, but returns .Er EBUSY instead of sleeping if the lock cannot be obtained immediately. .It Dv UMTX_OP_MUTEX_LOCK Lock the umutex. The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Pointer to the umutex. .El .Pp Locking is performed by writing the current thread id into the .Dv m_owner word of the .Vt struct umutex . The write is atomic, preserves the .Dv UMUTEX_CONTESTED contention indicator, and provides the acquire barrier for the lock entrance semantics. .Pp If the lock cannot be obtained immediately because another thread owns the lock, the current thread is put to sleep, with the .Dv UMUTEX_CONTESTED bit set beforehand. Upon wakeup, the lock conditions are re-tested. .Pp The request adheres to the priority protection or inheritance protocol of the mutex, specified by the .Dv UMUTEX_PRIO_PROTECT or .Dv UMUTEX_PRIO_INHERIT flag, respectively. .Pp Optionally, a timeout for the request may be specified. .Pp A request with a timeout specified is not restartable. An unblocked signal delivered during the wait always results in sleep interruption and the .Er EINTR error. A request without a timeout specified is always restarted after return from a signal handler. .It Dv UMTX_OP_MUTEX_UNLOCK Unlock the umutex. The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Pointer to the umutex. .El .Pp Unlocks the mutex by writing the .Dv UMUTEX_UNOWNED (zero) value into the .Dv m_owner word of the .Vt struct umutex .
The write is done with a release barrier, to provide the lock leave semantics. .Pp If there are threads sleeping in the sleep queue associated with the umutex, one thread is woken up. If more than one thread sleeps in the sleep queue, the .Dv UMUTEX_CONTESTED bit is set together with the write of the .Dv UMUTEX_UNOWNED value into .Dv m_owner . .Pp The request adheres to the priority protection or inheritance protocol of the mutex, specified by the .Dv UMUTEX_PRIO_PROTECT or .Dv UMUTEX_PRIO_INHERIT flag, respectively. See the description of the .Dv m_ceilings member of the .Vt struct umutex structure for additional details of the request operation on the priority protected protocol mutex. .It Dv UMTX_OP_SET_CEILING Set the ceiling for the priority protected umutex. The arguments to the request are: .Bl -tag -width "uaddr" .It Fa obj Pointer to the umutex. .It Fa val New ceiling value. .It Fa uaddr Address of a variable of type .Vt uint32_t . If not .Dv NULL and the update was successful, the previous ceiling value is written to the location pointed to by .Fa uaddr . .El .Pp The request locks the umutex pointed to by the .Fa obj parameter, waiting for the lock if not immediately available. After the lock is obtained, the new ceiling value .Fa val is written to the .Dv m_ceilings[0] member of the .Vt struct umutex , after which the umutex is unlocked. .Pp The locking does not adhere to the priority protection protocol, to conform to the .Tn POSIX requirements for the .Xr pthread_mutex_setprioceiling 3 interface. .It Dv UMTX_OP_CV_WAIT Wait for a condition. The arguments to the request are: .Bl -tag -width "uaddr2" .It Fa obj Pointer to the .Vt struct ucond . .It Fa val Request flags, see below. .It Fa uaddr Pointer to the umutex. .It Fa uaddr2 Optional pointer to a .Vt struct timespec for timeout specification. .El .Pp The request must be issued by the thread owning the mutex pointed to by the .Fa uaddr argument. The .Dv c_has_waiters member of the .Vt struct ucond , pointed to by the .Fa obj argument, is set to an arbitrary non-zero value, after which the .Fa uaddr mutex is unlocked (following the appropriate protocol), and the current thread is put to sleep on the sleep queue keyed by the .Fa obj argument. The operations are performed atomically. It is guaranteed to not miss a wakeup from .Dv UMTX_OP_CV_SIGNAL or .Dv UMTX_OP_CV_BROADCAST sent between mutex unlock and putting the current thread on the sleep queue. .Pp Upon wakeup, if the timeout expired and no other threads are sleeping in the same sleep queue, the .Dv c_has_waiters member is cleared. After wakeup, the .Fa uaddr umutex is not relocked. .Pp The following flags are defined: .Bl -tag -width "CVWAIT_CLOCKID" .It Dv CVWAIT_ABSTIME Timeout is absolute. .It Dv CVWAIT_CLOCKID Clockid is provided. .El .Pp Optionally, a timeout for the request may be specified. Unlike other requests, the timeout value is specified directly by a .Vt struct timespec , pointed to by the .Fa uaddr2 argument. If the .Dv CVWAIT_CLOCKID flag is provided, the timeout uses the clock from the .Dv c_clockid member of the .Vt struct ucond , pointed to by the .Fa obj argument. Otherwise, .Dv CLOCK_REALTIME is used, regardless of the clock identifier possibly specified in the .Vt struct _umtx_time . If the .Dv CVWAIT_ABSTIME flag is supplied, the timeout specifies an absolute time value, otherwise it denotes a relative time interval. .Pp The request is not restartable. An unblocked signal delivered during the wait always results in sleep interruption and the .Er EINTR error.
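.Pp
The following sketch (illustrative only, not the
.Lb libthr
implementation; the helper name is invented) shows the wait-side protocol
described above, including the explicit re-lock after wakeup:
.Bd -literal
#include <sys/types.h>
#include <sys/umtx.h>

/*
 * Illustrative sketch: the caller must own `mtx' on entry and owns
 * it again on return.  UMTX_OP_CV_WAIT atomically unlocks the mutex
 * and sleeps; it does not re-lock the mutex on wakeup, so the mutex
 * is re-acquired and the predicate re-tested in a loop, which also
 * tolerates spurious wakeups.
 */
static void
cond_wait(struct ucond *cv, struct umutex *mtx, volatile int *pred)
{
	while (!*pred) {
		_umtx_op(cv, UMTX_OP_CV_WAIT, 0, mtx, NULL);
		_umtx_op(mtx, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);
	}
}
.Ed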
.It Dv UMTX_OP_CV_SIGNAL Wake up one condition waiter. The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Pointer to .Vt struct ucond . .El .Pp The request wakes up at most one thread sleeping on the sleep queue keyed by the .Fa obj argument. If the woken up thread was the last on the sleep queue, the .Dv c_has_waiters member of the .Vt struct ucond is cleared. .It Dv UMTX_OP_CV_BROADCAST Wake up all condition waiters. The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Pointer to .Vt struct ucond . .El .Pp The request wakes up all threads sleeping on the sleep queue keyed by the .Fa obj argument. The .Dv c_has_waiters member of the .Vt struct ucond is cleared. .It Dv UMTX_OP_WAIT_UINT Same as .Dv UMTX_OP_WAIT , but the type of the variable pointed to by .Fa obj is .Vt u_int .Pq a 32-bit integer . .It Dv UMTX_OP_RW_RDLOCK Read-lock a .Vt struct urwlock lock. The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Pointer to the lock (of type .Vt struct urwlock ) to be read-locked. .It Fa val Additional flags to augment locking behaviour. The valid flags in the .Fa val argument are: .Bl -tag -width indent .It Dv URWLOCK_PREFER_READER .El .El .Pp The request obtains the read lock on the specified .Vt struct urwlock by incrementing the count of readers in the .Dv rw_state word of the structure. If the .Dv URWLOCK_WRITE_OWNER bit is set in the .Dv rw_state word, the lock was granted to a writer which has not yet relinquished its ownership. In this case the current thread is put to sleep until it makes sense to retry. .Pp If the .Dv URWLOCK_PREFER_READER flag is set either in the .Dv rw_flags word of the structure, or in the .Fa val argument of the request, the presence of threads trying to obtain the write lock on the same structure does not prevent the current thread from trying to obtain the read lock. Otherwise, if the flag is not set, and the .Dv URWLOCK_WRITE_WAITERS flag is set in .Dv rw_state , the current thread does not attempt to obtain the read lock. Instead, it sets the .Dv URWLOCK_READ_WAITERS flag in the .Dv rw_state word and puts itself to sleep on the corresponding sleep queue. Upon wakeup, the locking conditions are re-evaluated. .Pp Optionally, a timeout for the request may be specified. .Pp The request is not restartable. An unblocked signal delivered during the wait always results in sleep interruption and the .Er EINTR error. .It Dv UMTX_OP_RW_WRLOCK Write-lock a .Vt struct urwlock lock. The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Pointer to the lock (of type .Vt struct urwlock ) to be write-locked. .El .Pp The request obtains a write lock on the specified .Vt struct urwlock , by setting the .Dv URWLOCK_WRITE_OWNER bit in the .Dv rw_state word of the structure. If there is already a write lock owner, as indicated by the .Dv URWLOCK_WRITE_OWNER bit being set, or there are read lock owners, as indicated by the read-lock counter, the current thread does not attempt to obtain the write lock. Instead, it sets the .Dv URWLOCK_WRITE_WAITERS flag in the .Dv rw_state word and puts itself to sleep on the corresponding sleep queue. Upon wakeup, the locking conditions are re-evaluated. .Pp Optionally, a timeout for the request may be specified. .Pp The request is not restartable. An unblocked signal delivered during the wait always results in sleep interruption and the .Er EINTR error. .It Dv UMTX_OP_RW_UNLOCK Unlock the rwlock.
The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Pointer to the lock (of type .Vt struct urwlock ) to be unlocked. .El .Pp The unlock type (read or write) is determined by the current lock state. Note that the .Vt struct urwlock does not save information about the identity of the thread which acquired the lock. .Pp If there are pending writers after the unlock, and the .Dv URWLOCK_PREFER_READER flag is not set in the .Dv rw_flags member of the .Fa *obj structure, one writer is woken up, selected as described in the .Sx SLEEP QUEUES subsection. If the .Dv URWLOCK_PREFER_READER flag is set, a pending writer is woken up only if there are no pending readers. .Pp If there are no pending writers, or if the .Dv URWLOCK_PREFER_READER flag is set, all pending readers are woken up by the unlock. .It Dv UMTX_OP_WAIT_UINT_PRIVATE Same as .Dv UMTX_OP_WAIT_UINT , but unconditionally select the process-private sleep queue. .It Dv UMTX_OP_WAKE_PRIVATE Same as .Dv UMTX_OP_WAKE , but unconditionally select the process-private sleep queue. .It Dv UMTX_OP_MUTEX_WAIT Wait for mutex availability. The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Address of the mutex. .El .Pp Similarly to the .Dv UMTX_OP_MUTEX_LOCK request, put the requesting thread to sleep if the mutex lock cannot be obtained immediately. The .Dv UMUTEX_CONTESTED bit is set in the .Dv m_owner word of the mutex to indicate that there is a waiter, before the thread is added to the sleep queue. Unlike the .Dv UMTX_OP_MUTEX_LOCK request, the lock is not obtained. .Pp The operation is not implemented for priority protection and priority inheritance protocol mutexes. .Pp Optionally, a timeout for the request may be specified. .Pp A request with a timeout specified is not restartable. An unblocked signal delivered during the wait always results in sleep interruption and the .Er EINTR error. A request without a timeout automatically restarts if the signal disposition requested restart via the .Dv SA_RESTART flag in the .Dv sa_flags member of .Vt struct sigaction . .It Dv UMTX_OP_NWAKE_PRIVATE Wake up a batch of sleeping threads. The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Pointer to the array of pointers. .It Fa val Number of elements in the array pointed to by .Fa obj . .El .Pp For each element in the array pointed to by .Fa obj , wakes up all threads waiting on the .Em private sleep queue with the key being the byte addressed by the array element. .It Dv UMTX_OP_MUTEX_WAKE Check if a normal umutex is unlocked and wake up a waiter. The arguments for the request are: .Bl -tag -width "obj" .It Fa obj Pointer to the umutex. .El .Pp If the .Dv m_owner word of the mutex pointed to by the .Fa obj argument indicates an unowned mutex which has its contention indicator bit .Dv UMUTEX_CONTESTED set, clear the bit and wake up one waiter in the sleep queue associated with the byte addressed by .Fa obj , if any. Only normal mutexes are supported by the request. The sleep queue used is always the one for the normal mutex type. .Pp This request is deprecated in favor of .Dv UMTX_OP_MUTEX_WAKE2 since mutexes using it cannot synchronize their own destruction. That is, the .Dv m_owner word has already been set to .Dv UMUTEX_UNOWNED when this request is made, so that another thread can lock, unlock and destroy the mutex (if no other thread uses the mutex afterwards). Clearing the .Dv UMUTEX_CONTESTED bit may then modify freed memory. .It Dv UMTX_OP_MUTEX_WAKE2 Check if a umutex is unlocked and wake up a waiter.
The arguments for the request are: .Bl -tag -width "obj" .It Fa obj Pointer to the umutex. .It Fa val The umutex flags. .El .Pp The request does not read the .Dv m_flags member of the .Vt struct umutex ; instead, the .Fa val argument supplies flag information, in particular, to determine the sleep queue where the waiters are found for wake up. .Pp If the mutex is unowned, one waiter is woken up. .Pp If the mutex memory cannot be accessed, all waiters are woken up. .Pp If there is more than one waiter on the sleep queue, or there is only one waiter but the mutex is owned by a thread, the .Dv UMUTEX_CONTESTED bit is set in the .Dv m_owner word of the .Vt struct umutex . .It Dv UMTX_OP_SEM2_WAIT Wait until the semaphore is available. The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Pointer to the semaphore (of type .Vt struct _usem2 ) . .It Fa uaddr Size of the memory passed in via the .Fa uaddr2 argument. .It Fa uaddr2 Optional pointer to a structure of type .Vt struct _umtx_time , which may be followed by a structure of type .Vt struct timespec . .El .Pp Put the requesting thread onto a sleep queue if the semaphore counter is zero. If the thread is put to sleep, the .Dv USEM_HAS_WAITERS bit is set in the .Dv _count word to indicate waiters. The function returns either due to .Dv _count indicating the semaphore is available (non-zero count due to post), or due to a wakeup. The return does not guarantee that the semaphore is available, nor does it consume the semaphore lock on successful return. .Pp Optionally, a timeout for the request may be specified. .Pp A request with a non-absolute timeout value is not restartable. An unblocked signal delivered during such a wait results in sleep interruption and the .Er EINTR error. .Pp If .Dv UMTX_ABSTIME was not set, and the operation was interrupted and the caller passed in a .Fa uaddr2 large enough to hold a .Vt struct timespec following the initial .Vt struct _umtx_time , then the .Vt struct timespec is updated to contain the unslept amount. .It Dv UMTX_OP_SEM2_WAKE Wake up waiters on the semaphore lock. The arguments to the request are: .Bl -tag -width "obj" .It Fa obj Pointer to the semaphore (of type .Vt struct _usem2 ) . .El .Pp The request wakes up one waiter for the semaphore lock. The function does not increment the semaphore lock count. If the .Dv USEM_HAS_WAITERS bit was set in the .Dv _count word, and the last sleeping thread was woken up, the bit is cleared. .It Dv UMTX_OP_SHM Manage anonymous .Tn POSIX shared memory objects (see .Xr shm_open 2 ) , which can be attached to a byte of physical memory, mapped into the process address space. The objects are used to implement process-shared locks in .Dv libthr . .Pp The .Fa val argument specifies the sub-request of the .Dv UMTX_OP_SHM request: .Bl -tag -width indent .It Dv UMTX_SHM_CREAT Creates the anonymous shared memory object, which can be looked up with the specified key .Fa uaddr . If the object associated with the .Fa uaddr key already exists, it is returned instead of creating a new object. The object's size is one page. On success, the file descriptor referencing the object is returned. The descriptor can be used for mapping the object using .Xr mmap 2 , or for other shared memory operations. .It Dv UMTX_SHM_LOOKUP Same as the .Dv UMTX_SHM_CREAT request, but if there is no shared memory object associated with the specified key .Fa uaddr , an error is returned, and no new object is created. .It Dv UMTX_SHM_DESTROY De-associate the shared object with the specified key .Fa uaddr .
The object is destroyed after the last open file descriptor is closed and the last mapping for it is destroyed. .It Dv UMTX_SHM_ALIVE Checks whether there is a live shared object associated with the supplied key .Fa uaddr . Returns zero if there is, and an error otherwise. This request is an optimization of the .Dv UMTX_SHM_LOOKUP request. It is cheaper when only the liveness of the associated object is asked for, since no file descriptor is installed in the process fd table on success. .El .Pp The .Fa uaddr argument specifies the virtual address whose backing physical memory byte identity is used as the key for the anonymous shared object creation or lookup. .It Dv UMTX_OP_ROBUST_LISTS Register the list heads for the current thread's robust mutex lists. The arguments to the request are: .Bl -tag -width "uaddr" .It Fa val Size of the structure passed in the .Fa uaddr argument. .It Fa uaddr Pointer to the structure of type .Vt struct umtx_robust_lists_params . .El .Pp The structure is defined as .Bd -literal struct umtx_robust_lists_params { uintptr_t robust_list_offset; uintptr_t robust_priv_list_offset; uintptr_t robust_inact_offset; }; .Ed .Pp The .Dv robust_list_offset member contains the address of the first element in the list of locked robust shared mutexes. The .Dv robust_priv_list_offset member contains the address of the first element in the list of locked robust private mutexes. The private and shared robust locked lists are split to allow fast termination of the shared list on fork, in the child. .Pp The .Dv robust_inact_offset member contains a pointer to the mutex which might be locked in the near future, or might have just been unlocked. It is typically set by the lock or unlock mutex implementation code around the whole operation, since the lists can only be changed race-free when the thread owns the mutex. The kernel inspects the .Dv robust_inact_offset in addition to walking the shared and private lists. Also, the mutex pointed to by .Dv robust_inact_offset is handled more loosely at thread termination time than other mutexes on the list. That mutex is allowed to not be owned by the current thread, in which case list processing is continued. See the .Sx ROBUST UMUTEXES subsection for details. .It Dv UMTX_OP_GET_MIN_TIMEOUT Writes out the current value of the minimal umtx operations timeout, in nanoseconds, into the long integer variable pointed to by .Fa uaddr . .It Dv UMTX_OP_SET_MIN_TIMEOUT Set the minimal amount of time, in nanoseconds, the thread is required to sleep for umtx operations specifying a timeout using absolute clocks. The value is taken from the .Fa val argument of the call. Zero means no minimum. .El .Pp The .Fa op argument may be a bitwise OR of a single command from above with one or more of the following flags: .Bl -tag -width indent .It Dv UMTX_OP__I386 Request i386 ABI compatibility from the native .Nm system call. Specifically, this implies that: .Bl -hang -offset indent .It .Fa obj arguments that point to a word, point to a 32-bit integer. .It The .Dv UMTX_OP_NWAKE_PRIVATE .Fa obj argument is a pointer to an array of 32-bit pointers. .It The .Dv m_rb_lnk member of .Vt struct umutex is a 32-bit pointer. .It .Vt struct timespec uses a 32-bit time_t. .El .Pp .Dv UMTX_OP__32BIT has no effect if this flag is set. This flag is valid for all architectures, but it is ignored on i386. .It Dv UMTX_OP__32BIT Request non-i386, 32-bit ABI compatibility from the native .Nm system call.
Specifically, this implies that: .Bl -hang -offset indent .It .Fa obj arguments that point to a word, point to a 32-bit integer. .It The .Dv UMTX_OP_NWAKE_PRIVATE .Fa obj argument is a pointer to an array of 32-bit pointers. .It The .Dv m_rb_lnk member of .Vt struct umutex is a 32-bit pointer. .It .Vt struct timespec uses a 64-bit time_t. .El .Pp This flag has no effect if .Dv UMTX_OP__I386 is set. This flag is valid for all architectures. .El .Pp Note that if any 32-bit ABI compatibility is being requested, then care must be taken with robust lists. A single thread may not mix 32-bit compatible robust lists with native robust lists. The first .Dv UMTX_OP_ROBUST_LISTS call in a given thread determines which ABI that thread will use for robust lists going forward. .Sh RETURN VALUES If successful, all requests, except .Dv UMTX_SHM_CREAT and .Dv UMTX_SHM_LOOKUP sub-requests of the .Dv UMTX_OP_SHM request, will return zero. The .Dv UMTX_SHM_CREAT and .Dv UMTX_SHM_LOOKUP return a shared memory file descriptor on success. On error \-1 is returned, and the .Va errno variable is set to indicate the error. .Sh ERRORS The .Fn _umtx_op operations can fail with the following errors: .Bl -tag -width "[ETIMEDOUT]" .It Bq Er EFAULT One of the arguments points to invalid memory. .It Bq Er EINVAL The clock identifier, specified for the .Vt struct _umtx_time timeout parameter, or in the .Dv c_clockid member of .Vt struct ucond , is invalid. .It Bq Er EINVAL The type of the mutex, encoded by the .Dv m_flags member of .Vt struct umutex , is invalid. .It Bq Er EINVAL The .Dv m_owner member of the .Vt struct umutex has changed the lock owner thread identifier during unlock. .It Bq Er EINVAL The .Dv timeout.tv_sec or .Dv timeout.tv_nsec member of .Vt struct _umtx_time is less than zero, or .Dv timeout.tv_nsec is greater than or equal to 1000000000. .It Bq Er EINVAL The .Fa op argument specifies an invalid operation. .It Bq Er EINVAL The .Fa uaddr argument for the .Dv UMTX_OP_SHM request specifies an invalid operation. .It Bq Er EINVAL The .Dv UMTX_OP_SET_CEILING request specifies a non-priority-protected mutex. .It Bq Er EINVAL The new ceiling value for the .Dv UMTX_OP_SET_CEILING request, or one or more of the values read from the .Dv m_ceilings array during lock or unlock operations, is greater than .Dv RTP_PRIO_MAX . .It Bq Er EPERM Unlock attempted on an object not owned by the current thread. .It Bq Er EOWNERDEAD The lock was requested on an umutex where the .Dv m_owner field was set to the .Dv UMUTEX_RB_OWNERDEAD value, indicating a terminated robust mutex. The lock was granted to the caller, so this error in fact indicates success with additional conditions. .It Bq Er ENOTRECOVERABLE The lock was requested on an umutex whose .Dv m_owner field is equal to the .Dv UMUTEX_RB_NOTRECOV value, indicating an abandoned robust mutex after termination. The lock was not granted to the caller. .It Bq Er ENOTTY The shared memory object, associated with the address passed to the .Dv UMTX_SHM_ALIVE sub-request of the .Dv UMTX_OP_SHM request, was destroyed. .It Bq Er ESRCH For the .Dv UMTX_SHM_LOOKUP , .Dv UMTX_SHM_DESTROY , and .Dv UMTX_SHM_ALIVE sub-requests of the .Dv UMTX_OP_SHM request, there is no shared memory object associated with the provided key. .It Bq Er ENOMEM The .Dv UMTX_SHM_CREAT sub-request of the .Dv UMTX_OP_SHM request cannot be satisfied, because allocation of the shared memory object would exceed the .Dv RLIMIT_UMTXP resource limit, see .Xr setrlimit 2 .
.It Bq Er EAGAIN The maximum number of readers .Dv ( URWLOCK_MAX_READERS ) was already granted ownership of the given .Vt struct urwlock for read. .It Bq Er EBUSY A try mutex lock operation was not able to obtain the lock. .It Bq Er ETIMEDOUT The request specified a timeout in the .Fa uaddr and .Fa uaddr2 arguments, and timed out before obtaining the lock or being woken up. .It Bq Er EINTR A signal was delivered during wait, for a non-restartable operation. Operations with timeouts are typically non-restartable, but timeouts specified in absolute time may be restartable. .It Bq Er ERESTART A signal was delivered during wait, for a restartable operation. Mutex lock requests without timeout specified are restartable. The error is not returned to userspace code since restart is handled by the usual adjustment of the instruction counter. .El .Sh SEE ALSO .Xr clock_gettime 2 , .Xr mmap 2 , .Xr setrlimit 2 , .Xr shm_open 2 , .Xr sigaction 2 , .Xr thr_exit 2 , .Xr thr_kill 2 , .Xr thr_kill2 2 , .Xr thr_new 2 , .Xr thr_self 2 , .Xr thr_set_name 2 , .Xr signal 3 .Sh STANDARDS The .Fn _umtx_op system call is non-standard and is used by the .Lb libthr to implement .St -p1003.1-2001 .Xr pthread 3 functionality. .Sh BUGS A window between unlocking a robust mutex and resetting the pointer in the .Dv robust_inact_offset member of the registered .Vt struct umtx_robust_lists_params allows another thread to destroy the mutex, thus making the kernel inspect freed or reused memory. The .Li libthr implementation is only vulnerable to this race when operating on a shared mutex. A possible fix for the current implementation is to strengthen the checks for shared mutexes before terminating them, in particular, verifying that the mutex memory is mapped from a shared memory object allocated by the .Dv UMTX_OP_SHM request. This is not done because it is believed that the race is adequately covered by other consistency checks, while adding the check would prevent alternative implementations of .Li libpthread . diff --git a/lib/libsys/clock_gettime.2 b/lib/libsys/clock_gettime.2 index fcdc5be498f2..1dcfd9d1faf7 100644 --- a/lib/libsys/clock_gettime.2 +++ b/lib/libsys/clock_gettime.2 @@ -1,227 +1,237 @@ .\" $OpenBSD: clock_gettime.2,v 1.4 1997/05/08 20:21:16 kstailey Exp $ .\" .\" Copyright (c) 1980, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd June 28, 2024 +.Dd August 10, 2024 .Dt CLOCK_GETTIME 2 .Os .Sh NAME .Nm clock_gettime , .Nm clock_settime , .Nm clock_getres .Nd get/set/calibrate date and time .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In time.h .Ft int .Fn clock_gettime "clockid_t clock_id" "struct timespec *tp" .Ft int .Fn clock_settime "clockid_t clock_id" "const struct timespec *tp" .Ft int .Fn clock_getres "clockid_t clock_id" "struct timespec *tp" .Sh DESCRIPTION The .Fn clock_gettime and .Fn clock_settime system calls allow the calling process to retrieve or set the value used by a clock which is specified by .Fa clock_id . .Pp The .Fa clock_id argument can be a value obtained from .Xr clock_getcpuclockid 3 or .Xr pthread_getcpuclockid 3 as well as the following values: .Pp .Bl -tag -width indent -compact .It Dv CLOCK_REALTIME .It Dv CLOCK_REALTIME_PRECISE .It Dv CLOCK_REALTIME_FAST .It Dv CLOCK_REALTIME_COARSE Increments in SI seconds like a wall clock. It uses a 1970 epoch and implements the UTC timescale. The count is the number of physical SI seconds since 1970, adjusted by subtracting the number of positive leap seconds and adding the number of negative leap seconds. Behavior during a leap second is not defined by any POSIX standard. .It Dv CLOCK_MONOTONIC .It Dv CLOCK_MONOTONIC_PRECISE .It Dv CLOCK_MONOTONIC_FAST .It Dv CLOCK_MONOTONIC_COARSE .It Dv CLOCK_BOOTTIME Increments in SI seconds, even while the system is suspended. Its epoch is unspecified. The count is not adjusted by leap seconds. .It Dv CLOCK_UPTIME .It Dv CLOCK_UPTIME_PRECISE .It Dv CLOCK_UPTIME_FAST Increments monotonically in SI seconds while the machine is running. The count is not adjusted by leap seconds. The epoch is unspecified. .It Dv CLOCK_VIRTUAL Increments only when the CPU is running in user mode on behalf of the calling process. .It Dv CLOCK_PROF Increments when the CPU is running in user or kernel mode. .It Dv CLOCK_SECOND Returns the current second without performing a full time counter query, using an in-kernel cached value of the current second. .It Dv CLOCK_PROCESS_CPUTIME_ID Returns the execution time of the calling process. .It Dv CLOCK_THREAD_CPUTIME_ID Returns the execution time of the calling thread. +.It Dv CLOCK_TAI +Increments in SI seconds like a wall clock. +It uses a 1970 epoch and implements the TAI timescale. +It is similar to .Dv CLOCK_REALTIME , but without leap seconds, +and it increases monotonically during a leap second. +Calls fail with .Er EINVAL if the current offset between TAI and UTC is not known, +which may be the case early in boot, before NTP or another time daemon has +synchronized. .El .Pp The clock IDs .Dv CLOCK_BOOTTIME , .Dv CLOCK_REALTIME , +.Dv CLOCK_TAI , .Dv CLOCK_MONOTONIC , and .Dv CLOCK_UPTIME perform a full time counter query. The clock IDs with the _FAST suffix, i.e., .Dv CLOCK_REALTIME_FAST , .Dv CLOCK_MONOTONIC_FAST , and .Dv CLOCK_UPTIME_FAST , do not perform a full time counter query, so their accuracy is one timer tick.
Similarly, .Dv CLOCK_REALTIME_PRECISE , .Dv CLOCK_MONOTONIC_PRECISE , and .Dv CLOCK_UPTIME_PRECISE are used to get the most exact value possible, at the expense of execution time. The clock IDs .Dv CLOCK_REALTIME_COARSE and .Dv CLOCK_MONOTONIC_COARSE are aliases of the corresponding IDs with the _FAST suffix for compatibility with other systems. Finally, .Dv CLOCK_BOOTTIME is an alias for .Dv CLOCK_MONOTONIC for compatibility with other systems and is unrelated to the .Va kern.boottime .Xr sysctl 8 . .Pp The structure pointed to by .Fa tp is defined in .In sys/timespec.h as: .Bd -literal struct timespec { time_t tv_sec; /* seconds */ long tv_nsec; /* and nanoseconds */ }; .Ed .Pp Only the super-user may set the time of day, using only .Dv CLOCK_REALTIME . If the system .Xr securelevel 7 is greater than 1 (see .Xr init 8 ) , the time may only be advanced. This limitation is imposed to prevent a malicious super-user from setting arbitrary time stamps on files. The system time can still be adjusted backwards using the .Xr adjtime 2 system call even when the system is secure. .Pp The resolution (granularity) of a clock is returned by the .Fn clock_getres system call. This value is placed in a (non-NULL) .Fa *tp . .Sh RETURN VALUES .Rv -std .Sh ERRORS The following error codes may be set in .Va errno : .Bl -tag -width Er .It Bq Er EINVAL The .Fa clock_id or .Fa tp argument was not a valid value. .It Bq Er EPERM A user other than the super-user attempted to set the time. .El .Sh SEE ALSO .Xr date 1 , .Xr adjtime 2 , .Xr clock_getcpuclockid 3 , .Xr ctime 3 , .Xr pthread_getcpuclockid 3 .Sh STANDARDS The .Fn clock_gettime , .Fn clock_settime , and .Fn clock_getres system calls conform to .St -p1003.1-2008 . The clock IDs .Dv CLOCK_BOOTTIME , .Dv CLOCK_MONOTONIC_FAST , .Dv CLOCK_MONOTONIC_PRECISE , .Dv CLOCK_REALTIME_FAST , .Dv CLOCK_REALTIME_PRECISE , -.Dv CLOCK_SECOND +.Dv CLOCK_SECOND , +.Dv CLOCK_TAI , .Dv CLOCK_UPTIME , .Dv CLOCK_UPTIME_FAST , and .Dv CLOCK_UPTIME_PRECISE are .Fx extensions to the POSIX interface. .Pp UTC is defined by ITU-R TF.460-6, Standard-frequency and time-signal emissions. However, the .Vt time_t type is a simple count that does not provide a unique encoding for leap seconds, nor a specification for what values should be used to encode a leap second. .Sh HISTORY The .Fn clock_gettime , .Fn clock_settime , and .Fn clock_getres system calls first appeared in .Fx 3.0 . diff --git a/lib/libsys/nanosleep.2 b/lib/libsys/nanosleep.2 index ba9aae1edf57..290565dbd6e1 100644 --- a/lib/libsys/nanosleep.2 +++ b/lib/libsys/nanosleep.2 @@ -1,253 +1,255 @@ .\" $NetBSD: nanosleep.2,v 1.23 2016/11/14 10:40:59 wiz Exp $ .\" .\" Copyright (c) 1986, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission.
.\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd April 29, 2025 +.Dd May 3, 2025 .Dt NANOSLEEP 2 .Os .Sh NAME .Nm nanosleep .Nd high resolution sleep .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In time.h .Ft int .Fo clock_nanosleep .Fa "clockid_t clock_id" .Fa "int flags" .Fa "const struct timespec *rqtp" .Fa "struct timespec *rmtp" .Fc .Ft int .Fo nanosleep .Fa "const struct timespec *rqtp" .Fa "struct timespec *rmtp" .Fc .Sh DESCRIPTION If the .Dv TIMER_ABSTIME flag is not set in the .Fa flags argument, then .Fn clock_nanosleep suspends execution of the calling thread until either the time interval specified by the .Fa rqtp argument has elapsed, or a signal is delivered to the calling process and its action is to invoke a signal-catching function or to terminate the process. The clock used to measure the time is specified by the .Fa clock_id argument. .Pp If the .Dv TIMER_ABSTIME flag is set in the .Fa flags argument, then .Fn clock_nanosleep suspends execution of the calling thread until either the value of the clock specified by the .Fa clock_id argument reaches the absolute time specified by the .Fa rqtp argument, or a signal is delivered to the calling process and its action is to invoke a signal-catching function or to terminate the process. If, at the time of the call, the time value specified by .Fa rqtp is less than or equal to the time value of the specified clock, then .Fn clock_nanosleep returns immediately and the calling thread is not suspended. An unmasked signal will terminate the sleep early, regardless of the .Dv SA_RESTART value on the interrupting signal. The .Fa rqtp and .Fa rmtp arguments can point to the same object. .Pp The following .Fa clock_id values are supported: .Pp .Bl -item -compact -offset indent .It CLOCK_MONOTONIC .It CLOCK_MONOTONIC_FAST .It CLOCK_MONOTONIC_PRECISE .It CLOCK_REALTIME .It CLOCK_REALTIME_FAST .It CLOCK_REALTIME_PRECISE .It CLOCK_SECOND .It +CLOCK_TAI +.It CLOCK_UPTIME .It CLOCK_UPTIME_FAST .It CLOCK_UPTIME_PRECISE .El .Pp The suspension time may be longer than requested due to the scheduling of other activity by the system. The clocks with the .Dv _FAST suffix and the .Dv CLOCK_SECOND are subject to the allowed time interval deviation specified by the .Va kern.timecounter.alloweddeviation .Xr sysctl 8 variable. The clocks with the .Dv _PRECISE suffix are always as precise as possible. The .Dv CLOCK_MONOTONIC , .Dv CLOCK_REALTIME and .Dv CLOCK_UPTIME are precise by default. Setting the .Va kern.timecounter.nanosleep_precise .Xr sysctl 8 to a false value would make those clocks to behave like the .Dv _FAST clocks. .Pp The .Fn nanosleep function behaves like .Fn clock_nanosleep with a .Fa clock_id argument of .Dv CLOCK_REALTIME and without the .Dv TIMER_ABSTIME flag in the .Fa flags argument. 
.Sh RETURN VALUES These functions return zero when the requested time has elapsed. .Pp If these functions return for any other reason, then .Fn clock_nanosleep will directly return the error number, and .Fn nanosleep will return \-1 with the global variable .Va errno set to indicate the error. If a relative sleep is interrupted by a signal and .Fa rmtp is .Pf non- Dv NULL , the timespec structure it references is updated to contain the unslept amount (the request time minus the time actually slept). .Sh ERRORS These functions can fail with the following errors. .Bl -tag -width Er .It Bq Er EFAULT Either .Fa rqtp or .Fa rmtp points to memory that is not a valid part of the process address space. .It Bq Er EINTR The function was interrupted by the delivery of a signal. .It Bq Er EINVAL The .Fa rqtp argument specified a nanosecond value less than zero or greater than or equal to 1000 million. .It Bq Er EINVAL The .Fa flags argument contained an invalid flag. .It Bq Er EINVAL The .Fa clock_id argument was .Dv CLOCK_THREAD_CPUTIME_ID or an unrecognized value. .It Bq Er ENOTSUP The .Fa clock_id argument was valid but not supported by this implementation of .Fn clock_nanosleep . .El .Sh SEE ALSO .Xr clock_gettime 2 , .Xr sigaction 2 , .Xr sleep 3 .Sh STANDARDS These functions conform to .St -p1003.1-2008 . .Sh HISTORY The predecessor of this system call, .Fn sleep , appeared in .At v3 , but was removed when .Xr alarm 3 was introduced into .At v7 . The .Fn nanosleep system call has been available since .Nx 1.3 and was ported to .Ox 2.1 and .Fx 3.0 . The .Fn clock_nanosleep system call has been available since .Fx 11.1 . .Pp In .Fx 15.0 , the default behavior of .Fn clock_nanosleep with the .Dv CLOCK_MONOTONIC , .Dv CLOCK_REALTIME , and .Dv CLOCK_UPTIME clocks, and of .Fn nanosleep , has been switched to use the precise clocks. diff --git a/lib/libsys/timer_create.2 b/lib/libsys/timer_create.2 index e8489b390845..8f6ff2e27c51 100644 --- a/lib/libsys/timer_create.2 +++ b/lib/libsys/timer_create.2 @@ -1,198 +1,199 @@ .\" Copyright (c) 2005 David Xu . .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice(s), this list of conditions and the following disclaimer as .\" the first lines of this file unmodified other than the possible .\" addition of one or more copyright notices. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice(s), this list of conditions and the following disclaimer in .\" the documentation and/or other materials provided with the .\" distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR .\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR .\" BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, .\" WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE .\" OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, .\" EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.\" .Dd July 15, 2016 .Dt TIMER_CREATE 2 .Os .Sh NAME .Nm timer_create .Nd "create a per-process timer (REALTIME)" .Sh LIBRARY .Lb librt .Sh SYNOPSIS .In time.h .In signal.h .Ft int .Fo timer_create .Fa "clockid_t clockid" "struct sigevent *restrict evp" .Fa "timer_t *restrict timerid" .Fc .Sh DESCRIPTION The .Fn timer_create system call creates a per-process timer using the specified clock, .Fa clock_id , as the timing base. The .Fn timer_create system call returns, in the location referenced by .Fa timerid , a timer ID of type .Vt timer_t used to identify the timer in timer requests. This timer ID is unique within the calling process until the timer is deleted. The particular clock, .Fa clock_id , is defined in .In time.h . The timer whose ID is returned is in a disarmed state upon return from .Fn timer_create . .Pp The .Fa evp argument, if .Pf non- Dv NULL , points to a .Vt sigevent structure. This structure, allocated by the application, defines the asynchronous notification to occur when the timer expires. .Pp If .Fa evp->sigev_notify is .Dv SIGEV_SIGNO or .Dv SIGEV_THREAD_ID , the signal specified in .Fa evp->sigev_signo will be sent to the calling process .Pq Dv SIGEV_SIGNO or to the thread whose LWP ID is .Fa evp->sigev_notify_thread_id .Pq Dv SIGEV_THREAD_ID . The information for the queued signal will include: .Bl -column ".Va si_value" .It Sy Member Ta Sy Value .It Va si_code Ta Dv SI_TIMER .It Va si_value Ta the value stored in .Fa evp->sigev_value .It Va si_timerid Ta timer ID .It Va si_overrun Ta timer overrun count .It Va si_errno Ta If timer overrun is .Brq Dv DELAYTIMER_MAX , an error code defined in .In errno.h .El .Pp If the .Fa evp argument is .Dv NULL , the effect is as if the .Fa evp argument pointed to a .Vt sigevent structure with the .Va sigev_notify member having the value .Dv SIGEV_SIGNAL , the .Va sigev_signo having a default signal number .Pq Dv SIGALRM , and the .Va sigev_value member having the value of the timer ID. .Pp This implementation supports a .Fa clock_id of -.Dv CLOCK_REALTIME +.Dv CLOCK_REALTIME , +.Dv CLOCK_TAI , or .Dv CLOCK_MONOTONIC . .Pp If .Fa evp->sigev_notify is .Dv SIGEV_THREAD and .Fa sev->sigev_notify_attributes is not .Dv NULL , if the attribute pointed to by .Fa sev->sigev_notify_attributes has a thread stack address specified by a call to .Fn pthread_attr_setstack or .Fn pthread_attr_setstackaddr , the results are unspecified if the signal is generated more than once. .Sh RETURN VALUES If the call succeeds, .Fn timer_create returns zero and updates the location referenced by .Fa timerid to a .Vt timer_t , which can be passed to the per-process timer calls. If an error occurs, the system call returns a value of \-1 and the global variable .Va errno is set to indicate the error. The value of .Fa timerid is undefined if an error occurs. .Sh ERRORS The .Fn timer_create system call will fail if: .Bl -tag -width Er .It Bq Er EAGAIN The calling process has already created all of the timers it is allowed by this implementation. .It Bq Er EINVAL The specified clock ID is not supported. .It Bq Er EINVAL The specified asynchronous notification method is not supported. .It Bq Er EFAULT Any arguments point outside the allocated address space or there is a memory protection fault. .El .Sh SEE ALSO .Xr clock_getres 2 , .Xr timer_delete 2 , .Xr timer_getoverrun 2 , .Xr sigevent 3 , .Xr siginfo 3 .Sh STANDARDS The .Fn timer_create system call conforms to .St -p1003.1-2004 . 
.Sh HISTORY Support for .Tn POSIX per-process timers first appeared in .Fx 7.0 .
diff --git a/lib/libthr/thread/thr_condattr.c b/lib/libthr/thread/thr_condattr.c index 0dc3e52bab5e..dc56363fc084 100644 --- a/lib/libthr/thread/thr_condattr.c +++ b/lib/libthr/thread/thr_condattr.c @@ -1,127 +1,128 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1997 John Birrell . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
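Before the implementation below, a minimal usage sketch (an editor's illustration): the condattr clock attribute selects the clock against which pthread_cond_timedwait() deadlines are measured, and the change that follows adds CLOCK_TAI to the accepted set; the same pattern applies to any supported clock.

#include <pthread.h>
#include <time.h>

/* Initialize a condition variable whose timeouts use CLOCK_MONOTONIC. */
static int
cond_init_monotonic(pthread_cond_t *cv)
{
        pthread_condattr_t attr;
        int error;

        error = pthread_condattr_init(&attr);
        if (error != 0)
                return (error);
        error = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
        if (error == 0)
                error = pthread_cond_init(cv, &attr);
        pthread_condattr_destroy(&attr);
        return (error);
}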
*/ #include "namespace.h" #include <stdlib.h> #include <string.h> #include <errno.h> #include <pthread.h> #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_condattr_init, pthread_condattr_init); __weak_reference(_pthread_condattr_destroy, pthread_condattr_destroy); __weak_reference(_pthread_condattr_getclock, pthread_condattr_getclock); __weak_reference(_pthread_condattr_setclock, pthread_condattr_setclock); __weak_reference(_pthread_condattr_getpshared, pthread_condattr_getpshared); __weak_reference(_pthread_condattr_setpshared, pthread_condattr_setpshared); int _pthread_condattr_init(pthread_condattr_t *attr) { pthread_condattr_t pattr; int ret; if ((pattr = (pthread_condattr_t) malloc(sizeof(struct pthread_cond_attr))) == NULL) { ret = ENOMEM; } else { memcpy(pattr, &_pthread_condattr_default, sizeof(struct pthread_cond_attr)); *attr = pattr; ret = 0; } return (ret); } int _pthread_condattr_destroy(pthread_condattr_t *attr) { int ret; if (attr == NULL || *attr == NULL) { ret = EINVAL; } else { free(*attr); *attr = NULL; ret = 0; } return (ret); } int _pthread_condattr_getclock(const pthread_condattr_t * __restrict attr, clockid_t * __restrict clock_id) { if (attr == NULL || *attr == NULL) return (EINVAL); *clock_id = (*attr)->c_clockid; return (0); } int _pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock_id) { if (attr == NULL || *attr == NULL) return (EINVAL); if (clock_id != CLOCK_REALTIME && + clock_id != CLOCK_TAI && clock_id != CLOCK_VIRTUAL && clock_id != CLOCK_PROF && clock_id != CLOCK_MONOTONIC) { return (EINVAL); } (*attr)->c_clockid = clock_id; return (0); } int _pthread_condattr_getpshared(const pthread_condattr_t * __restrict attr, int * __restrict pshared) { if (attr == NULL || *attr == NULL) return (EINVAL); *pshared = (*attr)->c_pshared; return (0); } int _pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared) { if (attr == NULL || *attr == NULL || (pshared != PTHREAD_PROCESS_PRIVATE && pshared != PTHREAD_PROCESS_SHARED)) return (EINVAL); (*attr)->c_pshared = pshared; return (0); }
diff --git a/share/man/man3/pthread_condattr.3 b/share/man/man3/pthread_condattr.3 index 96d30263d7f2..33ad904f9a3d 100644 --- a/share/man/man3/pthread_condattr.3 +++ b/share/man/man3/pthread_condattr.3 @@ -1,169 +1,170 @@ .\" Copyright (C) 2000 Jason Evans . .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice(s), this list of conditions and the following disclaimer as .\" the first lines of this file unmodified other than the possible .\" addition of one or more copyright notices. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice(s), this list of conditions and the following disclaimer in .\" the documentation and/or other materials provided with the .\" distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR .\" PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR .\" BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, .\" WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE .\" OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, .\" EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Dd October 27, 2023 .Dt PTHREAD_CONDATTR 3 .Os .Sh NAME .Nm pthread_condattr_init , .Nm pthread_condattr_destroy , .Nm pthread_condattr_getclock , .Nm pthread_condattr_setclock , .Nm pthread_condattr_getpshared , .Nm pthread_condattr_setpshared .Nd condition attribute operations .Sh LIBRARY .Lb libpthread .Sh SYNOPSIS .In pthread.h .Ft int .Fn pthread_condattr_init "pthread_condattr_t *attr" .Ft int .Fn pthread_condattr_destroy "pthread_condattr_t *attr" .Ft int .Fo pthread_condattr_getclock .Fa "pthread_condattr_t * restrict attr" "clockid_t * restrict clock_id" .Fc .Ft int .Fn pthread_condattr_setclock "pthread_condattr_t *attr" "clockid_t clock_id" .Ft int .Fo pthread_condattr_getpshared .Fa "pthread_condattr_t * restrict attr" "int * restrict pshared" .Fc .Ft int .Fn pthread_condattr_setpshared "pthread_condattr_t *attr" "int pshared" .Sh DESCRIPTION Condition attribute objects are used to specify parameters to .Fn pthread_cond_init . .Pp The .Fn pthread_condattr_init function initializes a condition attribute object with the default attributes. .Pp The .Fn pthread_condattr_destroy function destroys a condition attribute object. .Pp The .Fn pthread_condattr_getclock function will put the value of the clock attribute from .Fa attr into the memory area pointed to by .Fa clock_id . The .Fn pthread_condattr_setclock function will set the clock attribute of .Fa attr to the value specified in .Fa clock_id . The clock attribute affects the interpretation of .Fa abstime in .Xr pthread_cond_timedwait 3 and may be set to .Dv CLOCK_REALTIME -(default) +(default), +.Dv CLOCK_TAI , or .Dv CLOCK_MONOTONIC . .Pp The .Fn pthread_condattr_getpshared function will put the value of the process-shared attribute from .Fa attr into the memory area pointed to by .Fa pshared . The .Fn pthread_condattr_setpshared function will set the process-shared attribute of .Fa attr to the value specified in .Fa pshared . The argument .Fa pshared may have one of the following values: .Bl -tag -width ".Dv PTHREAD_PROCESS_PRIVATE" .It Dv PTHREAD_PROCESS_PRIVATE The condition variable it is attached to may only be accessed by threads in the same process as the one that created the object. .It Dv PTHREAD_PROCESS_SHARED The condition variable it is attached to may be accessed by threads in processes other than the one that created the object. .El See .Xr libthr 3 for details of the implementation of shared condition variables, and their limitations. .Sh RETURN VALUES If successful, these functions return 0. Otherwise, an error number is returned to indicate the error. .Sh ERRORS The .Fn pthread_condattr_init function will fail if: .Bl -tag -width Er .It Bq Er ENOMEM Out of memory. .El .Pp The .Fn pthread_condattr_destroy function will fail if: .Bl -tag -width Er .It Bq Er EINVAL Invalid value for .Fa attr . .El .Pp The .Fn pthread_condattr_setclock function will fail if: .Bl -tag -width Er .It Bq Er EINVAL The value specified in .Fa clock_id is not one of the allowed values. 
.El .Pp The .Fn pthread_condattr_setpshared function will fail if: .Bl -tag -width Er .It Bq Er EINVAL The value specified in .Fa pshared is not one of the allowed values. .El .Sh SEE ALSO .Xr libthr 3 , .Xr pthread_cond_init 3 , .Xr pthread_cond_timedwait 3 .Sh STANDARDS The .Fn pthread_condattr_init and .Fn pthread_condattr_destroy functions conform to .St -p1003.1-96 diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c index 65746021028b..892a6798ab1f 100644 --- a/sys/kern/kern_ntptime.c +++ b/sys/kern/kern_ntptime.c @@ -1,1048 +1,1049 @@ /*- *********************************************************************** * * * Copyright (c) David L. Mills 1993-2001 * * * * Permission to use, copy, modify, and distribute this software and * * its documentation for any purpose and without fee is hereby * * granted, provided that the above copyright notice appears in all * * copies and that both the copyright notice and this permission * * notice appear in supporting documentation, and that the name * * University of Delaware not be used in advertising or publicity * * pertaining to distribution of the software without specific, * * written prior permission. The University of Delaware makes no * * representations about the suitability this software for any * * purpose. It is provided "as is" without express or implied * * warranty. * * * **********************************************************************/ /* * Adapted from the original sources for FreeBSD and timecounters by: * Poul-Henning Kamp . * * The 32bit version of the "LP" macros seems a bit past its "sell by" * date so I have retained only the 64bit version and included it directly * in this file. * * Only minor changes done to interface with the timecounters over in * sys/kern/kern_clock.c. Some of the comments below may be (even more) * confusing and/or plain wrong in that context. */ #include #include "opt_ntp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PPS_SYNC FEATURE(pps_sync, "Support usage of external PPS signal by kernel PLL"); #endif /* * Single-precision macros for 64-bit machines */ typedef int64_t l_fp; #define L_ADD(v, u) ((v) += (u)) #define L_SUB(v, u) ((v) -= (u)) #define L_ADDHI(v, a) ((v) += (int64_t)(a) << 32) #define L_NEG(v) ((v) = -(v)) #define L_RSHIFT(v, n) \ do { \ if ((v) < 0) \ (v) = -(-(v) >> (n)); \ else \ (v) = (v) >> (n); \ } while (0) #define L_MPY(v, a) ((v) *= (a)) #define L_CLR(v) ((v) = 0) #define L_ISNEG(v) ((v) < 0) #define L_LINT(v, a) \ do { \ if ((a) < 0) \ ((v) = -((int64_t)(-(a)) << 32)); \ else \ ((v) = (int64_t)(a) << 32); \ } while (0) #define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32) /* * Generic NTP kernel interface * * These routines constitute the Network Time Protocol (NTP) interfaces * for user and daemon application programs. The ntp_gettime() routine * provides the time, maximum error (synch distance) and estimated error * (dispersion) to client user application programs. The ntp_adjtime() * routine is used by the NTP daemon to adjust the system clock to an * externally derived time. The time offset and related variables set by * this routine are used by other routines in this module to adjust the * phase and frequency of the clock discipline loop which controls the * system clock. * * When the kernel time is reckoned directly in nanoseconds (NTP_NANO * defined), the time at each tick interrupt is derived directly from * the kernel time variable. 
When the kernel time is reckoned in * microseconds, (NTP_NANO undefined), the time is derived from the * kernel time variable together with a variable representing the * leftover nanoseconds at the last tick interrupt. In either case, the * current nanosecond time is reckoned from these values plus an * interpolated value derived by the clock routines in another * architecture-specific module. The interpolation can use either a * dedicated counter or a processor cycle counter (PCC) implemented in * some architectures. * * Note that all routines must run at priority splclock or higher. */ /* * Phase/frequency-lock loop (PLL/FLL) definitions * * The nanosecond clock discipline uses two variable types, time * variables and frequency variables. Both types are represented as 64- * bit fixed-point quantities with the decimal point between two 32-bit * halves. On a 32-bit machine, each half is represented as a single * word and mathematical operations are done using multiple-precision * arithmetic. On a 64-bit machine, ordinary computer arithmetic is * used. * * A time variable is a signed 64-bit fixed-point number in ns and * fraction. It represents the remaining time offset to be amortized * over succeeding tick interrupts. The maximum time offset is about * 0.5 s and the resolution is about 2.3e-10 ns. * * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |s s s| ns | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | fraction | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * A frequency variable is a signed 64-bit fixed-point number in ns/s * and fraction. It represents the ns and fraction to be added to the * kernel time variable at each second. The maximum frequency offset is * about +-500000 ns/s and the resolution is about 2.3e-10 ns/s. * * 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |s s s s s s s s s s s s s| ns/s | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | fraction | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ /* * The following variables establish the state of the PLL/FLL and the * residual time and frequency offset of the local clock. 
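A worked example of the representation just described (an editor's sketch that reimplements two of the macros above as a standalone program): 500000 ns stored in a time variable occupies the upper 32 bits, and a right shift by SHIFT_PLL (defined just below as 4) leaves one sixteenth of the offset to be amortized over the next second.

#include <assert.h>
#include <stdint.h>

typedef int64_t l_fp;
#define L_LINT(v, a)    ((v) = (int64_t)(a) << 32)      /* a >= 0 only */
#define L_GINT(v)       ((v) < 0 ? -(-(v) >> 32) : (v) >> 32)

int
main(void)
{
        l_fp off;

        L_LINT(off, 500000);            /* 500 us offset, in ns */
        assert(L_GINT(off) == 500000);  /* integer part round-trips */
        off >>= 4;                      /* L_RSHIFT(off, SHIFT_PLL), off > 0 */
        assert(L_GINT(off) == 31250);   /* 1/16 of the offset remains */
        return (0);
}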
*/ #define SHIFT_PLL 4 /* PLL loop gain (shift) */ #define SHIFT_FLL 2 /* FLL loop gain (shift) */ static int time_state = TIME_OK; /* clock state */ int time_status = STA_UNSYNC; /* clock status bits */ static long time_tai; /* TAI offset (s) */ static long time_monitor; /* last time offset scaled (ns) */ static long time_constant; /* poll interval (shift) (s) */ static long time_precision = 1; /* clock precision (ns) */ static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */ long time_esterror = MAXPHASE / 1000; /* estimated error (us) */ static long time_reftime; /* uptime at last adjustment (s) */ static l_fp time_offset; /* time offset (ns) */ static l_fp time_freq; /* frequency offset (ns/s) */ static l_fp time_adj; /* tick adjust (ns/s) */ static int64_t time_adjtime; /* correction from adjtime(2) (usec) */ static struct mtx ntp_lock; MTX_SYSINIT(ntp, &ntp_lock, "ntp", MTX_SPIN); #define NTP_LOCK() mtx_lock_spin(&ntp_lock) #define NTP_UNLOCK() mtx_unlock_spin(&ntp_lock) #define NTP_ASSERT_LOCKED() mtx_assert(&ntp_lock, MA_OWNED) #ifdef PPS_SYNC /* * The following variables are used when a pulse-per-second (PPS) signal * is available and connected via a modem control lead. They establish * the engineering parameters of the clock discipline loop when * controlled by the PPS signal. */ #define PPS_FAVG 2 /* min freq avg interval (s) (shift) */ #define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */ #define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */ #define PPS_PAVG 4 /* phase avg interval (s) (shift) */ #define PPS_VALID 120 /* PPS signal watchdog max (s) */ #define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */ #define PPS_POPCORN 2 /* popcorn spike threshold (shift) */ static struct timespec pps_tf[3]; /* phase median filter */ static l_fp pps_freq; /* scaled frequency offset (ns/s) */ static long pps_fcount; /* frequency accumulator */ static long pps_jitter; /* nominal jitter (ns) */ static long pps_stabil; /* nominal stability (scaled ns/s) */ static time_t pps_lastsec; /* time at last calibration (s) */ static int pps_valid; /* signal watchdog counter */ static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */ static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */ static int pps_intcnt; /* wander counter */ /* * PPS signal quality monitors */ static long pps_calcnt; /* calibration intervals */ static long pps_jitcnt; /* jitter limit exceeded */ static long pps_stbcnt; /* stability limit exceeded */ static long pps_errcnt; /* calibration errors */ #endif /* PPS_SYNC */ /* * End of phase/frequency-lock loop (PLL/FLL) definitions */ static void hardupdate(long offset); static void ntp_gettime1(struct ntptimeval *ntvp); static bool ntp_is_time_error(int tsl); static bool ntp_is_time_error(int tsl) { /* * Status word error decode. If any of these conditions occur, * an error is returned, instead of the status word. Most * applications will care only about the fact the system clock * may not be trusted, not about the details. 
* * Hardware or software error */ if ((tsl & (STA_UNSYNC | STA_CLOCKERR)) || /* * PPS signal lost when either time or frequency synchronization * requested */ (tsl & (STA_PPSFREQ | STA_PPSTIME) && !(tsl & STA_PPSSIGNAL)) || /* * PPS jitter exceeded when time synchronization requested */ (tsl & STA_PPSTIME && tsl & STA_PPSJITTER) || /* * PPS wander exceeded or calibration error when frequency * synchronization requested */ (tsl & STA_PPSFREQ && tsl & (STA_PPSWANDER | STA_PPSERROR))) return (true); return (false); } static void ntp_gettime1(struct ntptimeval *ntvp) { struct timespec atv; /* nanosecond time */ NTP_ASSERT_LOCKED(); nanotime(&atv); ntvp->time.tv_sec = atv.tv_sec; ntvp->time.tv_nsec = atv.tv_nsec; ntvp->maxerror = time_maxerror; ntvp->esterror = time_esterror; ntvp->tai = time_tai; ntvp->time_state = time_state; if (ntp_is_time_error(time_status)) ntvp->time_state = TIME_ERROR; } /* * ntp_gettime() - NTP user application interface * * See the timex.h header file for synopsis and API description. Note that * the TAI offset is returned in the ntvtimeval.tai structure member. */ #ifndef _SYS_SYSPROTO_H_ struct ntp_gettime_args { struct ntptimeval *ntvp; }; #endif /* ARGSUSED */ int sys_ntp_gettime(struct thread *td, struct ntp_gettime_args *uap) { struct ntptimeval ntv; memset(&ntv, 0, sizeof(ntv)); NTP_LOCK(); ntp_gettime1(&ntv); NTP_UNLOCK(); td->td_retval[0] = ntv.time_state; return (copyout(&ntv, uap->ntvp, sizeof(ntv))); } static int ntp_sysctl(SYSCTL_HANDLER_ARGS) { struct ntptimeval ntv; /* temporary structure */ memset(&ntv, 0, sizeof(ntv)); NTP_LOCK(); ntp_gettime1(&ntv); NTP_UNLOCK(); return (sysctl_handle_opaque(oidp, &ntv, sizeof(ntv), req)); } SYSCTL_NODE(_kern, OID_AUTO, ntp_pll, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); SYSCTL_PROC(_kern_ntp_pll, OID_AUTO, gettime, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", ""); #ifdef PPS_SYNC SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shiftmax, CTLFLAG_RW, &pps_shiftmax, 0, "Max interval duration (sec) (shift)"); SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shift, CTLFLAG_RW, &pps_shift, 0, "Interval duration (sec) (shift)"); SYSCTL_LONG(_kern_ntp_pll, OID_AUTO, time_monitor, CTLFLAG_RD, &time_monitor, 0, "Last time offset scaled (ns)"); SYSCTL_S64(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD | CTLFLAG_MPSAFE, &pps_freq, 0, "Scaled frequency offset (ns/sec)"); SYSCTL_S64(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD | CTLFLAG_MPSAFE, &time_freq, 0, "Frequency offset (ns/sec)"); #endif /* * ntp_adjtime() - NTP daemon application interface * * See the timex.h header file for synopsis and API description. Note that * the timex.constant structure member has a dual purpose to set the time * constant and to set the TAI offset. */ int kern_ntp_adjtime(struct thread *td, struct timex *ntv, int *retvalp) { long freq; /* frequency ns/s) */ int modes; /* mode bits from structure */ int error, retval; /* * Update selected clock variables - only the superuser can * change anything. Note that there is no error checking here on * the assumption the superuser should know what it is doing. * Note that either the time constant or TAI offset are loaded * from the ntv.constant member, depending on the mode bits. If * the STA_PLL bit in the status word is cleared, the state and * status words are reset to the initial values at boot. 
*/ modes = ntv->modes; error = 0; if (modes) error = priv_check(td, PRIV_NTP_ADJTIME); if (error != 0) return (error); NTP_LOCK(); if (modes & MOD_MAXERROR) time_maxerror = ntv->maxerror; if (modes & MOD_ESTERROR) time_esterror = ntv->esterror; if (modes & MOD_STATUS) { if (time_status & STA_PLL && !(ntv->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; #ifdef PPS_SYNC pps_shift = PPS_FAVG; #endif /* PPS_SYNC */ } time_status &= STA_RONLY; time_status |= ntv->status & ~STA_RONLY; } if (modes & MOD_TIMECONST) { if (ntv->constant < 0) time_constant = 0; else if (ntv->constant > MAXTC) time_constant = MAXTC; else time_constant = ntv->constant; } if (modes & MOD_TAI) { if (ntv->constant > 0) /* XXX zero & negative numbers ? */ time_tai = ntv->constant; } #ifdef PPS_SYNC if (modes & MOD_PPSMAX) { if (ntv->shift < PPS_FAVG) pps_shiftmax = PPS_FAVG; else if (ntv->shift > PPS_FAVGMAX) pps_shiftmax = PPS_FAVGMAX; else pps_shiftmax = ntv->shift; } #endif /* PPS_SYNC */ if (modes & MOD_NANO) time_status |= STA_NANO; if (modes & MOD_MICRO) time_status &= ~STA_NANO; if (modes & MOD_CLKB) time_status |= STA_CLK; if (modes & MOD_CLKA) time_status &= ~STA_CLK; if (modes & MOD_FREQUENCY) { freq = (ntv->freq * 1000LL) >> 16; if (freq > MAXFREQ) L_LINT(time_freq, MAXFREQ); else if (freq < -MAXFREQ) L_LINT(time_freq, -MAXFREQ); else { /* * ntv->freq is [PPM * 2^16] = [us/s * 2^16] * time_freq is [ns/s * 2^32] */ time_freq = ntv->freq * 1000LL * 65536LL; } #ifdef PPS_SYNC pps_freq = time_freq; #endif /* PPS_SYNC */ } if (modes & MOD_OFFSET) { if (time_status & STA_NANO) hardupdate(ntv->offset); else hardupdate(ntv->offset * 1000); } /* * Retrieve all clock variables. Note that the TAI offset is * returned only by ntp_gettime(); */ if (time_status & STA_NANO) ntv->offset = L_GINT(time_offset); else ntv->offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */ ntv->freq = L_GINT((time_freq / 1000LL) << 16); ntv->maxerror = time_maxerror; ntv->esterror = time_esterror; ntv->status = time_status; ntv->constant = time_constant; if (time_status & STA_NANO) ntv->precision = time_precision; else ntv->precision = time_precision / 1000; ntv->tolerance = MAXFREQ * SCALE_PPM; #ifdef PPS_SYNC ntv->shift = pps_shift; ntv->ppsfreq = L_GINT((pps_freq / 1000LL) << 16); if (time_status & STA_NANO) ntv->jitter = pps_jitter; else ntv->jitter = pps_jitter / 1000; ntv->stabil = pps_stabil; ntv->calcnt = pps_calcnt; ntv->errcnt = pps_errcnt; ntv->jitcnt = pps_jitcnt; ntv->stbcnt = pps_stbcnt; #endif /* PPS_SYNC */ retval = ntp_is_time_error(time_status) ? TIME_ERROR : time_state; NTP_UNLOCK(); *retvalp = retval; return (0); } #ifndef _SYS_SYSPROTO_H_ struct ntp_adjtime_args { struct timex *tp; }; #endif int sys_ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap) { struct timex ntv; int error, retval; error = copyin(uap->tp, &ntv, sizeof(ntv)); if (error == 0) { error = kern_ntp_adjtime(td, &ntv, &retval); if (error == 0) { error = copyout(&ntv, uap->tp, sizeof(ntv)); if (error == 0) td->td_retval[0] = retval; } } return (error); } /* * second_overflow() - called after ntp_tick_adjust() * * This routine is ordinarily called immediately following the above * routine ntp_tick_adjust(). While these two routines are normally * combined, they are separated here only for the purposes of * simulation. 
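The MOD_FREQUENCY scaling above is worth a worked example (an editor's sketch): ntpd passes frequency as PPM scaled by 2^16, that is, us/s * 2^16; multiplying by 1000 converts to ns/s * 2^16, the >> 16 yields plain ns/s for the MAXFREQ clamp, and the extra 65536 factor produces the kernel's ns/s * 2^32 l_fp form.

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        int64_t ntv_freq, freq, time_freq;

        ntv_freq = 50LL << 16;                  /* +50 PPM, ntpd encoding */
        freq = (ntv_freq * 1000LL) >> 16;       /* plain ns/s, for clamping */
        assert(freq == 50000);
        time_freq = ntv_freq * 1000LL * 65536LL;
        assert(time_freq == (50000LL << 32));   /* ns/s * 2^32, l_fp form */
        return (0);
}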
*/ void -ntp_update_second(int64_t *adjustment, time_t *newsec) +ntp_update_second(int64_t *adjustment, time_t *newsec, long *tai_off) { int tickrate; l_fp ftemp; /* 32/64-bit temporary */ NTP_LOCK(); /* * On rollover of the second both the nanosecond and microsecond * clocks are updated and the state machine cranked as * necessary. The phase adjustment to be used for the next * second is calculated and the maximum error is increased by * the tolerance. */ time_maxerror += MAXFREQ / 1000; /* * Leap second processing. If in leap-insert state at * the end of the day, the system clock is set back one * second; if in leap-delete state, the system clock is * set ahead one second. The nano_time() routine or * external clock driver will insure that reported time * is always monotonic. */ switch (time_state) { /* * No warning. */ case TIME_OK: if (time_status & STA_INS) time_state = TIME_INS; else if (time_status & STA_DEL) time_state = TIME_DEL; break; /* * Insert second 23:59:60 following second * 23:59:59. */ case TIME_INS: if (!(time_status & STA_INS)) time_state = TIME_OK; else if ((*newsec) % 86400 == 0) { (*newsec)--; time_state = TIME_OOP; time_tai++; } break; /* * Delete second 23:59:59. */ case TIME_DEL: if (!(time_status & STA_DEL)) time_state = TIME_OK; else if (((*newsec) + 1) % 86400 == 0) { (*newsec)++; time_tai--; time_state = TIME_WAIT; } break; /* * Insert second in progress. */ case TIME_OOP: time_state = TIME_WAIT; break; /* * Wait for status bits to clear. */ case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; } /* * Compute the total time adjustment for the next second * in ns. The offset is reduced by a factor depending on * whether the PPS signal is operating. Note that the * value is in effect scaled by the clock frequency, * since the adjustment is added at each tick interrupt. */ ftemp = time_offset; #ifdef PPS_SYNC /* XXX even if PPS signal dies we should finish adjustment ? */ if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) L_RSHIFT(ftemp, pps_shift); else L_RSHIFT(ftemp, SHIFT_PLL + time_constant); #else L_RSHIFT(ftemp, SHIFT_PLL + time_constant); #endif /* PPS_SYNC */ time_adj = ftemp; L_SUB(time_offset, ftemp); L_ADD(time_adj, time_freq); /* * Apply any correction from adjtime(2). If more than one second * off we slew at a rate of 5ms/s (5000 PPM) else 500us/s (500 PPM) * until the last second is slewed the final < 500 usecs. */ if (time_adjtime != 0) { if (time_adjtime > 1000000) tickrate = 5000; else if (time_adjtime < -1000000) tickrate = -5000; else if (time_adjtime > 500) tickrate = 500; else if (time_adjtime < -500) tickrate = -500; else tickrate = time_adjtime; time_adjtime -= tickrate; L_LINT(ftemp, tickrate * 1000); L_ADD(time_adj, ftemp); } *adjustment = time_adj; + *tai_off = time_tai; #ifdef PPS_SYNC if (pps_valid > 0) pps_valid--; else time_status &= ~STA_PPSSIGNAL; #endif /* PPS_SYNC */ NTP_UNLOCK(); } /* * hardupdate() - local clock update * * This routine is called by ntp_adjtime() to update the local clock * phase and frequency. The implementation is of an adaptive-parameter, * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new * time and frequency offset estimates for each call. If the kernel PPS * discipline code is configured (PPS_SYNC), the PPS signal itself * determines the new time offset, instead of the calling argument. * Presumably, calls to ntp_adjtime() occur only when the caller * believes the local clock is valid within some bound (+-128 ms with * NTP). 
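The signature change above obliges every caller of ntp_update_second() to pass a third pointer that receives the current TAI offset. A condensed, hypothetical sketch of such a caller (the function name is invented for illustration; the names it assigns mirror the timehands fields this patch touches, and the real consumer is tc_windup() in kern_tc.c):

static void
example_second_rollover(struct timehands *th, time_t *newsec)
{
        int64_t adj;
        long tai;

        ntp_update_second(&adj, newsec, &tai);
        th->th_adjustment = adj;        /* per-second slew, l_fp ns/s */
        th->th_tai_offset = tai;        /* published with the next generation */
}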
If the caller's time is far different than the PPS time, an * argument will ensue, and it's not clear who will lose. * * For uncompensated quartz crystal oscillators and nominal update * intervals less than 256 s, operation should be in phase-lock mode, * where the loop is disciplined to phase. For update intervals greater * than 1024 s, operation should be in frequency-lock mode, where the * loop is disciplined to frequency. Between 256 s and 1024 s, the mode * is selected by the STA_MODE status bit. */ static void hardupdate(long offset /* clock offset (ns) */) { long mtemp; l_fp ftemp; NTP_ASSERT_LOCKED(); /* * Select how the phase is to be controlled and from which * source. If the PPS signal is present and enabled to * discipline the time, the PPS offset is used; otherwise, the * argument offset is used. */ if (!(time_status & STA_PLL)) return; if (!(time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)) { if (offset > MAXPHASE) time_monitor = MAXPHASE; else if (offset < -MAXPHASE) time_monitor = -MAXPHASE; else time_monitor = offset; L_LINT(time_offset, time_monitor); } /* * Select how the frequency is to be controlled and in which * mode (PLL or FLL). If the PPS signal is present and enabled * to discipline the frequency, the PPS frequency is used; * otherwise, the argument offset is used to compute it. */ if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) { time_reftime = time_uptime; return; } if (time_status & STA_FREQHOLD || time_reftime == 0) time_reftime = time_uptime; mtemp = time_uptime - time_reftime; L_LINT(ftemp, time_monitor); L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1); L_MPY(ftemp, mtemp); L_ADD(time_freq, ftemp); time_status &= ~STA_MODE; if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { L_LINT(ftemp, (time_monitor << 4) / mtemp); L_RSHIFT(ftemp, SHIFT_FLL + 4); L_ADD(time_freq, ftemp); time_status |= STA_MODE; } time_reftime = time_uptime; if (L_GINT(time_freq) > MAXFREQ) L_LINT(time_freq, MAXFREQ); else if (L_GINT(time_freq) < -MAXFREQ) L_LINT(time_freq, -MAXFREQ); } #ifdef PPS_SYNC /* * hardpps() - discipline CPU clock oscillator to external PPS signal * * This routine is called at each PPS interrupt in order to discipline * the CPU clock oscillator to the PPS signal. There are two independent * first-order feedback loops, one for the phase, the other for the * frequency. The phase loop measures and grooms the PPS phase offset * and leaves it in a handy spot for the seconds overflow routine. The * frequency loop averages successive PPS phase differences and * calculates the PPS frequency offset, which is also processed by the * seconds overflow routine. The code requires the caller to capture the * time and architecture-dependent hardware counter values in * nanoseconds at the on-time PPS signal transition. * * Note that, on some Unix systems this routine runs at an interrupt * priority level higher than the timer interrupt routine hardclock(). * Therefore, the variables used are distinct from the hardclock() * variables, except for the actual time and frequency variables, which * are determined by this routine and updated atomically. * * tsp - time at current PPS event * delta_nsec - time elapsed between the previous and current PPS event */ void hardpps(struct timespec *tsp, long delta_nsec) { long u_nsec, v_nsec; /* temps */ time_t u_sec; l_fp ftemp; NTP_LOCK(); /* * The signal is first processed by a range gate and frequency * discriminator. The range gate rejects noise spikes outside * the range +-500 us. 
The frequency discriminator rejects input * signals with apparent frequency outside the range 1 +-500 * PPM. If two hits occur in the same second, we ignore the * later hit; if not and a hit occurs outside the range gate, * keep the later hit for later comparison, but do not process * it. */ time_status |= STA_PPSSIGNAL | STA_PPSJITTER; time_status &= ~(STA_PPSWANDER | STA_PPSERROR); pps_valid = PPS_VALID; u_sec = tsp->tv_sec; u_nsec = tsp->tv_nsec; if (u_nsec >= (NANOSECOND >> 1)) { u_nsec -= NANOSECOND; u_sec++; } v_nsec = u_nsec - pps_tf[0].tv_nsec; if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND - MAXFREQ) goto out; pps_tf[2] = pps_tf[1]; pps_tf[1] = pps_tf[0]; pps_tf[0].tv_sec = u_sec; pps_tf[0].tv_nsec = u_nsec; /* * Update the frequency accumulator using the difference between the * current and previous PPS event measured directly by the timecounter. */ pps_fcount += delta_nsec - NANOSECOND; if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ) goto out; time_status &= ~STA_PPSJITTER; /* * A three-stage median filter is used to help denoise the PPS * time. The median sample becomes the time offset estimate; the * difference between the other two samples becomes the time * dispersion (jitter) estimate. */ if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) { if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) { v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */ u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec; } else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) { v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */ u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec; } else { v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */ u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec; } } else { if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) { v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */ u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec; } else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) { v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */ u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec; } else { v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */ u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec; } } /* * Nominal jitter is due to PPS signal noise and interrupt * latency. If it exceeds the popcorn threshold, the sample is * discarded. otherwise, if so enabled, the time offset is * updated. We can tolerate a modest loss of data here without * much degrading time accuracy. * * The measurements being checked here were made with the system * timecounter, so the popcorn threshold is not allowed to fall below * the number of nanoseconds in two ticks of the timecounter. For a * timecounter running faster than 1 GHz the lower bound is 2ns, just * to avoid a nonsensical threshold of zero. */ if (u_nsec > lmax(pps_jitter << PPS_POPCORN, 2 * (NANOSECOND / (long)qmin(NANOSECOND, tc_getfrequency())))) { time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { time_monitor = -v_nsec; L_LINT(time_offset, time_monitor); } pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG; u_sec = pps_tf[0].tv_sec - pps_lastsec; if (u_sec < (1 << pps_shift)) goto out; /* * At the end of the calibration interval the difference between * the first and last counter values becomes the scaled * frequency. It will later be divided by the length of the * interval to determine the frequency update. If the frequency * exceeds a sanity threshold, or if the actual calibration * interval is not equal to the expected length, the data are * discarded. We can tolerate a modest loss of data here without * much degrading frequency accuracy. 
*/ pps_calcnt++; v_nsec = -pps_fcount; pps_lastsec = pps_tf[0].tv_sec; pps_fcount = 0; u_nsec = MAXFREQ << pps_shift; if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 << pps_shift)) { time_status |= STA_PPSERROR; pps_errcnt++; goto out; } /* * Here the raw frequency offset and wander (stability) is * calculated. If the wander is less than the wander threshold * for four consecutive averaging intervals, the interval is * doubled; if it is greater than the threshold for four * consecutive intervals, the interval is halved. The scaled * frequency offset is converted to frequency offset. The * stability metric is calculated as the average of recent * frequency changes, but is used only for performance * monitoring. */ L_LINT(ftemp, v_nsec); L_RSHIFT(ftemp, pps_shift); L_SUB(ftemp, pps_freq); u_nsec = L_GINT(ftemp); if (u_nsec > PPS_MAXWANDER) { L_LINT(ftemp, PPS_MAXWANDER); pps_intcnt--; time_status |= STA_PPSWANDER; pps_stbcnt++; } else if (u_nsec < -PPS_MAXWANDER) { L_LINT(ftemp, -PPS_MAXWANDER); pps_intcnt--; time_status |= STA_PPSWANDER; pps_stbcnt++; } else { pps_intcnt++; } if (pps_intcnt >= 4) { pps_intcnt = 4; if (pps_shift < pps_shiftmax) { pps_shift++; pps_intcnt = 0; } } else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) { pps_intcnt = -4; if (pps_shift > PPS_FAVG) { pps_shift--; pps_intcnt = 0; } } if (u_nsec < 0) u_nsec = -u_nsec; pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG; /* * The PPS frequency is recalculated and clamped to the maximum * MAXFREQ. If enabled, the system clock frequency is updated as * well. */ L_ADD(pps_freq, ftemp); u_nsec = L_GINT(pps_freq); if (u_nsec > MAXFREQ) L_LINT(pps_freq, MAXFREQ); else if (u_nsec < -MAXFREQ) L_LINT(pps_freq, -MAXFREQ); if (time_status & STA_PPSFREQ) time_freq = pps_freq; out: NTP_UNLOCK(); } #endif /* PPS_SYNC */ #ifndef _SYS_SYSPROTO_H_ struct adjtime_args { struct timeval *delta; struct timeval *olddelta; }; #endif /* ARGSUSED */ int sys_adjtime(struct thread *td, struct adjtime_args *uap) { struct timeval delta, olddelta, *deltap; int error; if (uap->delta) { error = copyin(uap->delta, &delta, sizeof(delta)); if (error) return (error); deltap = &delta; } else deltap = NULL; error = kern_adjtime(td, deltap, &olddelta); if (uap->olddelta && error == 0) error = copyout(&olddelta, uap->olddelta, sizeof(olddelta)); return (error); } int kern_adjtime(struct thread *td, struct timeval *delta, struct timeval *olddelta) { struct timeval atv; int64_t ltr, ltw; int error; if (delta != NULL) { error = priv_check(td, PRIV_ADJTIME); if (error != 0) return (error); ltw = (int64_t)delta->tv_sec * 1000000 + delta->tv_usec; } NTP_LOCK(); ltr = time_adjtime; if (delta != NULL) time_adjtime = ltw; NTP_UNLOCK(); if (olddelta != NULL) { atv.tv_sec = ltr / 1000000; atv.tv_usec = ltr % 1000000; if (atv.tv_usec < 0) { atv.tv_usec += 1000000; atv.tv_sec--; } *olddelta = atv; } return (0); } static struct callout resettodr_callout; static int resettodr_period = 1800; static void periodic_resettodr(void *arg __unused) { /* * Read of time_status is lock-less, which is fine since * ntp_is_time_error() operates on the consistent read value.
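For the adjtime(2) path above, a userland sketch (an editor's illustration): request a 1.5 s slew and read back the still-pending correction, which kern_adjtime() returns normalized so that tv_usec is never negative.

#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        struct timeval delta = { .tv_sec = 1, .tv_usec = 500000 };
        struct timeval old;

        if (adjtime(&delta, &old) == -1) {
                perror("adjtime");      /* requires PRIV_ADJTIME */
                return (1);
        }
        printf("previously pending: %jd.%06ld s\n",
            (intmax_t)old.tv_sec, (long)old.tv_usec);
        return (0);
}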
*/ if (!ntp_is_time_error(time_status)) resettodr(); if (resettodr_period > 0) callout_schedule(&resettodr_callout, resettodr_period * hz); } static void shutdown_resettodr(void *arg __unused, int howto __unused) { callout_drain(&resettodr_callout); /* Another unlocked read of time_status */ if (resettodr_period > 0 && !ntp_is_time_error(time_status)) resettodr(); } static int sysctl_resettodr_period(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (error || !req->newptr) return (error); if (cold) goto done; if (resettodr_period == 0) callout_stop(&resettodr_callout); else callout_reset(&resettodr_callout, resettodr_period * hz, periodic_resettodr, NULL); done: return (0); } SYSCTL_PROC(_machdep, OID_AUTO, rtc_save_period, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &resettodr_period, 1800, sysctl_resettodr_period, "I", "Save system time to RTC with this period (in seconds)"); static void start_periodic_resettodr(void *arg __unused) { EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_resettodr, NULL, SHUTDOWN_PRI_FIRST); callout_init(&resettodr_callout, 1); if (resettodr_period == 0) return; callout_reset(&resettodr_callout, resettodr_period * hz, periodic_resettodr, NULL); } SYSINIT(periodic_resettodr, SI_SUB_LAST, SI_ORDER_MIDDLE, start_periodic_resettodr, NULL); diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c index a797a101bf6f..e85812e415c7 100644 --- a/sys/kern/kern_tc.c +++ b/sys/kern/kern_tc.c @@ -1,2263 +1,2271 @@ /*- * SPDX-License-Identifier: Beerware * * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * Copyright (c) 2011, 2015, 2016 The FreeBSD Foundation * * Portions of this software were developed by Julien Ridoux at the University * of Melbourne under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. */ #include #include "opt_ntp.h" #include "opt_ffclock.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * A large step happens on boot. This constant detects such steps. * It is relatively small so that ntp_update_second gets called enough * in the typical 'missed a couple of seconds' case, but doesn't loop * forever when the time step is large. */ #define LARGE_STEP 200 /* * Implement a dummy timecounter which we can use until we get a real one * in the air. This allows the console and other early stuff to use * time services. */ static u_int dummy_get_timecount(struct timecounter *tc) { static u_int now; return (++now); } static struct timecounter dummy_timecounter = { dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000 }; struct timehands { /* These fields must be initialized by the driver. */ struct timecounter *th_counter; int64_t th_adjustment; uint64_t th_scale; u_int th_large_delta; u_int th_offset_count; + long th_tai_offset; struct bintime th_offset; struct bintime th_bintime; struct timeval th_microtime; struct timespec th_nanotime; struct bintime th_boottime; /* Fields not to be copied in tc_windup start with th_generation. 
*/ u_int th_generation; struct timehands *th_next; }; static struct timehands ths[16] = { [0] = { .th_counter = &dummy_timecounter, .th_scale = (uint64_t)-1 / 1000000, .th_large_delta = 1000000, .th_offset = { .sec = 1 }, .th_generation = 1, }, }; static struct timehands *volatile timehands = &ths[0]; struct timecounter *timecounter = &dummy_timecounter; static struct timecounter *timecounters = &dummy_timecounter; /* Mutex to protect the timecounter list. */ static struct mtx tc_lock; int tc_min_ticktock_freq = 1; volatile time_t time_second = 1; volatile time_t time_uptime = 1; /* * The system time is always computed by summing the estimated boot time and the * system uptime. The timehands track boot time, but it changes when the system * time is set by the user, stepped by ntpd or adjusted when resuming. It * is set to new_time - uptime. */ static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_boottime, "S,timeval", "Estimated system boottime"); SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); static SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); static int timestepwarnings; SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RWTUN, &timestepwarnings, 0, "Log time steps"); static int timehands_count = 2; SYSCTL_INT(_kern_timecounter, OID_AUTO, timehands_count, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &timehands_count, 0, "Count of timehands in rotation"); struct bintime bt_timethreshold; struct bintime bt_tickthreshold; sbintime_t sbt_timethreshold; sbintime_t sbt_tickthreshold; struct bintime tc_tick_bt; sbintime_t tc_tick_sbt; int tc_precexp; int tc_timepercentage = TC_DEFAULTPERC; static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_timecounter_adjprecision, "I", "Allowed time interval deviation in percents"); volatile int rtc_generation = 1; static int tc_chosen; /* Non-zero if a specific tc was chosen via sysctl. */ static char tc_from_tunable[16]; static void tc_windup(struct bintime *new_boottimebin); static void cpu_tick_calibrate(int); void dtrace_getnanotime(struct timespec *tsp); void dtrace_getnanouptime(struct timespec *tsp); static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS) { struct timeval boottime; getboottime(&boottime); /* i386 is the only arch which uses a 32bits time_t */ #ifdef __amd64__ #ifdef SCTL_MASK32 int tv[2]; if (req->flags & SCTL_MASK32) { tv[0] = boottime.tv_sec; tv[1] = boottime.tv_usec; return (SYSCTL_OUT(req, tv, sizeof(tv))); } #endif #endif return (SYSCTL_OUT(req, &boottime, sizeof(boottime))); } static int sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS) { u_int ncount; struct timecounter *tc = arg1; ncount = tc->tc_get_timecount(tc); return (sysctl_handle_int(oidp, &ncount, 0, req)); } static int sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS) { uint64_t freq; struct timecounter *tc = arg1; freq = tc->tc_frequency; return (sysctl_handle_64(oidp, &freq, 0, req)); } /* * Return the difference between the timehands' counter value now and what * was when we copied it to the timehands' offset_count. */
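All of the time readers below share one lock-free protocol: the writer zeroes th_generation, updates the fields, then stores the incremented generation with release semantics, and a reader retries until it sees the same non-zero generation before and after its copy. An isolated sketch of the idiom (an editor's addition, using the same kernel primitives that appear in tc_delta() and bintime_off() below):

static u_int
example_read_offset_count(void)
{
        struct timehands *th;
        u_int count, gen;

        do {
                th = timehands;
                gen = atomic_load_acq_int(&th->th_generation);
                count = th->th_offset_count;    /* copy under this gen */
                atomic_thread_fence_acq();
        } while (gen == 0 || gen != th->th_generation);
        return (count);
}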
static __inline u_int tc_delta(struct timehands *th) { struct timecounter *tc; tc = th->th_counter; return ((tc->tc_get_timecount(tc) - th->th_offset_count) & tc->tc_counter_mask); } static __inline void bintime_add_tc_delta(struct bintime *bt, uint64_t scale, uint64_t large_delta, uint64_t delta) { uint64_t x; if (__predict_false(delta >= large_delta)) { /* Avoid overflow for scale * delta. */ x = (scale >> 32) * delta; bt->sec += x >> 32; bintime_addx(bt, x << 32); bintime_addx(bt, (scale & 0xffffffff) * delta); } else { bintime_addx(bt, scale * delta); } } /* * Functions for reading the time. We have to loop until we are sure that * the timehands that we operated on was not updated under our feet. See * the comment in <sys/time.h> for a description of these 12 functions. */ static __inline void bintime_off(struct bintime *bt, u_int off) { struct timehands *th; struct bintime *btp; uint64_t scale; u_int delta, gen, large_delta; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); btp = (struct bintime *)((vm_offset_t)th + off); *bt = *btp; scale = th->th_scale; delta = tc_delta(th); large_delta = th->th_large_delta; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); bintime_add_tc_delta(bt, scale, large_delta, delta); } #define GETTHBINTIME(dst, member) \ do { \ _Static_assert(_Generic(((struct timehands *)NULL)->member, \ struct bintime: 1, default: 0) == 1, \ "struct timehands member is not of struct bintime type"); \ bintime_off(dst, __offsetof(struct timehands, member)); \ } while (0) static __inline void getthmember(void *out, size_t out_size, u_int off) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); memcpy(out, (char *)th + off, out_size); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } #define GETTHMEMBER(dst, member) \ do { \ _Static_assert(_Generic(*dst, \ __typeof(((struct timehands *)NULL)->member): 1, \ default: 0) == 1, \ "*dst and struct timehands member have different types"); \ getthmember(dst, sizeof(*dst), __offsetof(struct timehands, \ member)); \ } while (0) #ifdef FFCLOCK void fbclock_binuptime(struct bintime *bt) { GETTHBINTIME(bt, th_offset); } void fbclock_nanouptime(struct timespec *tsp) { struct bintime bt; fbclock_binuptime(&bt); bintime2timespec(&bt, tsp); } void fbclock_microuptime(struct timeval *tvp) { struct bintime bt; fbclock_binuptime(&bt); bintime2timeval(&bt, tvp); } void fbclock_bintime(struct bintime *bt) { GETTHBINTIME(bt, th_bintime); } void fbclock_nanotime(struct timespec *tsp) { struct bintime bt; fbclock_bintime(&bt); bintime2timespec(&bt, tsp); } void fbclock_microtime(struct timeval *tvp) { struct bintime bt; fbclock_bintime(&bt); bintime2timeval(&bt, tvp); } void fbclock_getbinuptime(struct bintime *bt) { GETTHMEMBER(bt, th_offset); } void fbclock_getnanouptime(struct timespec *tsp) { struct bintime bt; GETTHMEMBER(&bt, th_offset); bintime2timespec(&bt, tsp); } void fbclock_getmicrouptime(struct timeval *tvp) { struct bintime bt; GETTHMEMBER(&bt, th_offset); bintime2timeval(&bt, tvp); } void fbclock_getbintime(struct bintime *bt) { GETTHMEMBER(bt, th_bintime); } void fbclock_getnanotime(struct timespec *tsp) { GETTHMEMBER(tsp, th_nanotime); } void fbclock_getmicrotime(struct timeval *tvp) { GETTHMEMBER(tvp, th_microtime); } #else /* !FFCLOCK */ void binuptime(struct bintime *bt) { GETTHBINTIME(bt, th_offset); } void nanouptime(struct timespec *tsp) { struct bintime bt; binuptime(&bt); bintime2timespec(&bt, tsp); } void
microuptime(struct timeval *tvp) { struct bintime bt; binuptime(&bt); bintime2timeval(&bt, tvp); } void bintime(struct bintime *bt) { GETTHBINTIME(bt, th_bintime); } void nanotime(struct timespec *tsp) { struct bintime bt; bintime(&bt); bintime2timespec(&bt, tsp); } void microtime(struct timeval *tvp) { struct bintime bt; bintime(&bt); bintime2timeval(&bt, tvp); } void getbinuptime(struct bintime *bt) { GETTHMEMBER(bt, th_offset); } void getnanouptime(struct timespec *tsp) { struct bintime bt; GETTHMEMBER(&bt, th_offset); bintime2timespec(&bt, tsp); } void getmicrouptime(struct timeval *tvp) { struct bintime bt; GETTHMEMBER(&bt, th_offset); bintime2timeval(&bt, tvp); } void getbintime(struct bintime *bt) { GETTHMEMBER(bt, th_bintime); } void getnanotime(struct timespec *tsp) { GETTHMEMBER(tsp, th_nanotime); } void getmicrotime(struct timeval *tvp) { GETTHMEMBER(tvp, th_microtime); } #endif /* FFCLOCK */ void getboottime(struct timeval *boottime) { struct bintime boottimebin; getboottimebin(&boottimebin); bintime2timeval(&boottimebin, boottime); } void getboottimebin(struct bintime *boottimebin) { GETTHMEMBER(boottimebin, th_boottime); } #ifdef FFCLOCK /* * Support for feed-forward synchronization algorithms. This is heavily inspired * by the timehands mechanism but kept independent from it. *_windup() functions * have some connection to avoid accessing the timecounter hardware more than * necessary. */ /* Feed-forward clock estimates kept updated by the synchronization daemon. */ struct ffclock_estimate ffclock_estimate; struct bintime ffclock_boottime; /* Feed-forward boot time estimate. */ uint32_t ffclock_status; /* Feed-forward clock status. */ int8_t ffclock_updated; /* New estimates are available. */ struct mtx ffclock_mtx; /* Mutex on ffclock_estimate. */ struct fftimehands { struct ffclock_estimate cest; struct bintime tick_time; struct bintime tick_time_lerp; ffcounter tick_ffcount; uint64_t period_lerp; volatile uint8_t gen; struct fftimehands *next; }; #define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x)) static struct fftimehands ffth[10]; static struct fftimehands *volatile fftimehands = ffth; static void ffclock_init(void) { struct fftimehands *cur; struct fftimehands *last; memset(ffth, 0, sizeof(ffth)); last = ffth + NUM_ELEMENTS(ffth) - 1; for (cur = ffth; cur < last; cur++) cur->next = cur + 1; last->next = ffth; ffclock_updated = 0; ffclock_status = FFCLOCK_STA_UNSYNC; mtx_init(&ffclock_mtx, "ffclock lock", NULL, MTX_DEF); } /* * Reset the feed-forward clock estimates. Called from inittodr() to get things * kick started and uses the timecounter nominal frequency as a first period * estimate. Note: this function may be called several time just after boot. * Note: this is the only function that sets the value of boot time for the * monotonic (i.e. uptime) version of the feed-forward clock. 
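A note on the period computation used by ffclock_reset_clock() below (an editor's worked example): 2^64 / tc_frequency does not fit in a 64-bit intermediate, so the code computes (2^63 / freq) << 1; the result is the bintime fraction that one counter tick is worth.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t freq = 1000000;                        /* 1 MHz counter */
        uint64_t period = ((1ULL << 63) / freq) << 1;   /* ~ 2^64 / freq */
        uint64_t half = period * (freq / 2);            /* 0.5 s of fraction */

        /* half lands within ~1e6 fraction units (~4e-14 s) of 2^63 */
        printf("period %ju, 0.5 s -> %ju, 2^63 = %ju\n",
            (uintmax_t)period, (uintmax_t)half, (uintmax_t)(1ULL << 63));
        return (0);
}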
*/ void ffclock_reset_clock(struct timespec *ts) { struct timecounter *tc; struct ffclock_estimate cest; tc = timehands->th_counter; memset(&cest, 0, sizeof(struct ffclock_estimate)); timespec2bintime(ts, &ffclock_boottime); timespec2bintime(ts, &(cest.update_time)); ffclock_read_counter(&cest.update_ffcount); cest.leapsec_next = 0; cest.period = ((1ULL << 63) / tc->tc_frequency) << 1; cest.errb_abs = 0; cest.errb_rate = 0; cest.status = FFCLOCK_STA_UNSYNC; cest.leapsec_total = 0; cest.leapsec = 0; mtx_lock(&ffclock_mtx); bcopy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate)); ffclock_updated = INT8_MAX; mtx_unlock(&ffclock_mtx); printf("ffclock reset: %s (%llu Hz), time = %ld.%09lu\n", tc->tc_name, (unsigned long long)tc->tc_frequency, (long)ts->tv_sec, (unsigned long)ts->tv_nsec); } /* * Sub-routine to convert a time interval measured in RAW counter units to time * in seconds stored in bintime format. * NOTE: bintime_mul requires u_int, but the value of the ffcounter may be * larger than the max value of u_int (on 32 bit architecture). Loop to consume * extra cycles. */ static void ffclock_convert_delta(ffcounter ffdelta, uint64_t period, struct bintime *bt) { struct bintime bt2; ffcounter delta, delta_max; delta_max = (1ULL << (8 * sizeof(unsigned int))) - 1; bintime_clear(bt); do { if (ffdelta > delta_max) delta = delta_max; else delta = ffdelta; bt2.sec = 0; bt2.frac = period; bintime_mul(&bt2, (unsigned int)delta); bintime_add(bt, &bt2); ffdelta -= delta; } while (ffdelta > 0); } /* * Update the fftimehands. * Push the tick ffcount and time(s) forward based on current clock estimate. * The conversion from ffcounter to bintime relies on the difference clock * principle, whose accuracy relies on computing small time intervals. If a new * clock estimate has been passed by the synchronisation daemon, make it * current, and compute the linear interpolation for monotonic time if needed. */ static void ffclock_windup(unsigned int delta) { struct ffclock_estimate *cest; struct fftimehands *ffth; struct bintime bt, gap_lerp; ffcounter ffdelta; uint64_t frac; unsigned int polling; uint8_t forward_jump, ogen; /* * Pick the next timehand, copy current ffclock estimates and move tick * times and counter forward. */ forward_jump = 0; ffth = fftimehands->next; ogen = ffth->gen; ffth->gen = 0; cest = &ffth->cest; bcopy(&fftimehands->cest, cest, sizeof(struct ffclock_estimate)); ffdelta = (ffcounter)delta; ffth->period_lerp = fftimehands->period_lerp; ffth->tick_time = fftimehands->tick_time; ffclock_convert_delta(ffdelta, cest->period, &bt); bintime_add(&ffth->tick_time, &bt); ffth->tick_time_lerp = fftimehands->tick_time_lerp; ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt); bintime_add(&ffth->tick_time_lerp, &bt); ffth->tick_ffcount = fftimehands->tick_ffcount + ffdelta; /* * Assess the status of the clock, if the last update is too old, it is * likely the synchronisation daemon is dead and the clock is free * running. */ if (ffclock_updated == 0) { ffdelta = ffth->tick_ffcount - cest->update_ffcount; ffclock_convert_delta(ffdelta, cest->period, &bt); if (bt.sec > 2 * FFCLOCK_SKM_SCALE) ffclock_status |= FFCLOCK_STA_UNSYNC; } /* * If available, grab updated clock estimates and make them current. * Recompute time at this tick using the updated estimates. The clock * estimates passed the feed-forward synchronisation daemon may result * in time conversion that is not monotonically increasing (just after * the update). 
time_lerp is a particular linear interpolation over the * synchronisation algo polling period that ensures monotonicity for the * clock ids requesting it. */ if (ffclock_updated > 0) { bcopy(&ffclock_estimate, cest, sizeof(struct ffclock_estimate)); ffdelta = ffth->tick_ffcount - cest->update_ffcount; ffth->tick_time = cest->update_time; ffclock_convert_delta(ffdelta, cest->period, &bt); bintime_add(&ffth->tick_time, &bt); /* ffclock_reset sets ffclock_updated to INT8_MAX */ if (ffclock_updated == INT8_MAX) ffth->tick_time_lerp = ffth->tick_time; if (bintime_cmp(&ffth->tick_time, &ffth->tick_time_lerp, >)) forward_jump = 1; else forward_jump = 0; bintime_clear(&gap_lerp); if (forward_jump) { gap_lerp = ffth->tick_time; bintime_sub(&gap_lerp, &ffth->tick_time_lerp); } else { gap_lerp = ffth->tick_time_lerp; bintime_sub(&gap_lerp, &ffth->tick_time); } /* * The reset from the RTC clock may be far from accurate, and * reducing the gap between real time and interpolated time * could take a very long time if the interpolated clock insists * on strict monotonicity. The clock is reset under very strict * conditions (kernel time is known to be wrong and * synchronization daemon has been restarted recently. * ffclock_boottime absorbs the jump to ensure boot time is * correct and uptime functions stay consistent. */ if (((ffclock_status & FFCLOCK_STA_UNSYNC) == FFCLOCK_STA_UNSYNC) && ((cest->status & FFCLOCK_STA_UNSYNC) == 0) && ((cest->status & FFCLOCK_STA_WARMUP) == FFCLOCK_STA_WARMUP)) { if (forward_jump) bintime_add(&ffclock_boottime, &gap_lerp); else bintime_sub(&ffclock_boottime, &gap_lerp); ffth->tick_time_lerp = ffth->tick_time; bintime_clear(&gap_lerp); } ffclock_status = cest->status; ffth->period_lerp = cest->period; /* * Compute corrected period used for the linear interpolation of * time. The rate of linear interpolation is capped to 5000PPM * (5ms/s). */ if (bintime_isset(&gap_lerp)) { ffdelta = cest->update_ffcount; ffdelta -= fftimehands->cest.update_ffcount; ffclock_convert_delta(ffdelta, cest->period, &bt); polling = bt.sec; bt.sec = 0; bt.frac = 5000000 * (uint64_t)18446744073LL; bintime_mul(&bt, polling); if (bintime_cmp(&gap_lerp, &bt, >)) gap_lerp = bt; /* Approximate 1 sec by 1-(1/2^64) to ease arithmetic */ frac = 0; if (gap_lerp.sec > 0) { frac -= 1; frac /= ffdelta / gap_lerp.sec; } frac += gap_lerp.frac / ffdelta; if (forward_jump) ffth->period_lerp += frac; else ffth->period_lerp -= frac; } ffclock_updated = 0; } if (++ogen == 0) ogen = 1; ffth->gen = ogen; fftimehands = ffth; } /* * Adjust the fftimehands when the timecounter is changed. Stating the obvious, * the old and new hardware counter cannot be read simultaneously. tc_windup() * does read the two counters 'back to back', but a few cycles are effectively * lost, and not accumulated in tick_ffcount. This is a fairly radical * operation for a feed-forward synchronization daemon, and it is its job to not * pushing irrelevant data to the kernel. Because there is no locking here, * simply force to ignore pending or next update to give daemon a chance to * realize the counter has changed. 
*/ static void ffclock_change_tc(struct timehands *th) { struct fftimehands *ffth; struct ffclock_estimate *cest; struct timecounter *tc; uint8_t ogen; tc = th->th_counter; ffth = fftimehands->next; ogen = ffth->gen; ffth->gen = 0; cest = &ffth->cest; bcopy(&(fftimehands->cest), cest, sizeof(struct ffclock_estimate)); cest->period = ((1ULL << 63) / tc->tc_frequency) << 1; cest->errb_abs = 0; cest->errb_rate = 0; cest->status |= FFCLOCK_STA_UNSYNC; ffth->tick_ffcount = fftimehands->tick_ffcount; ffth->tick_time_lerp = fftimehands->tick_time_lerp; ffth->tick_time = fftimehands->tick_time; ffth->period_lerp = cest->period; /* Do not lock but ignore next update from synchronization daemon. */ ffclock_updated--; if (++ogen == 0) ogen = 1; ffth->gen = ogen; fftimehands = ffth; } /* * Retrieve feed-forward counter and time of last kernel tick. */ void ffclock_last_tick(ffcounter *ffcount, struct bintime *bt, uint32_t flags) { struct fftimehands *ffth; uint8_t gen; /* No locking but check generation has not changed. */ do { ffth = fftimehands; gen = ffth->gen; if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) *bt = ffth->tick_time_lerp; else *bt = ffth->tick_time; *ffcount = ffth->tick_ffcount; } while (gen == 0 || gen != ffth->gen); } /* * Absolute clock conversion. Low level function to convert ffcounter to * bintime. The ffcounter is converted using the current ffclock period estimate * or the "interpolated period" to ensure monotonicity. * NOTE: this conversion may have been deferred, and the clock updated since the * hardware counter has been read. */ void ffclock_convert_abs(ffcounter ffcount, struct bintime *bt, uint32_t flags) { struct fftimehands *ffth; struct bintime bt2; ffcounter ffdelta; uint8_t gen; /* * No locking but check generation has not changed. Also need to make * sure ffdelta is positive, i.e. ffcount > tick_ffcount. */ do { ffth = fftimehands; gen = ffth->gen; if (ffcount > ffth->tick_ffcount) ffdelta = ffcount - ffth->tick_ffcount; else ffdelta = ffth->tick_ffcount - ffcount; if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) { *bt = ffth->tick_time_lerp; ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt2); } else { *bt = ffth->tick_time; ffclock_convert_delta(ffdelta, ffth->cest.period, &bt2); } if (ffcount > ffth->tick_ffcount) bintime_add(bt, &bt2); else bintime_sub(bt, &bt2); } while (gen == 0 || gen != ffth->gen); } /* * Difference clock conversion. * Low level function to convert a time interval measured in RAW counter units * into bintime. The difference clock allows measuring small intervals much more * reliably than the absolute clock. */ void ffclock_convert_diff(ffcounter ffdelta, struct bintime *bt) { struct fftimehands *ffth; uint8_t gen; /* No locking but check generation has not changed. */ do { ffth = fftimehands; gen = ffth->gen; ffclock_convert_delta(ffdelta, ffth->cest.period, bt); } while (gen == 0 || gen != ffth->gen); } /* * Access to current ffcounter value. */ void ffclock_read_counter(ffcounter *ffcount) { struct timehands *th; struct fftimehands *ffth; unsigned int gen, delta; /* * ffclock_windup() is called from tc_windup(), so it is safe to rely on * th->th_generation alone for a correct delta and ffcounter.
*/ do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); ffth = fftimehands; delta = tc_delta(th); *ffcount = ffth->tick_ffcount; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); *ffcount += delta; } void binuptime(struct bintime *bt) { binuptime_fromclock(bt, sysclock_active); } void nanouptime(struct timespec *tsp) { nanouptime_fromclock(tsp, sysclock_active); } void microuptime(struct timeval *tvp) { microuptime_fromclock(tvp, sysclock_active); } void bintime(struct bintime *bt) { bintime_fromclock(bt, sysclock_active); } void nanotime(struct timespec *tsp) { nanotime_fromclock(tsp, sysclock_active); } void microtime(struct timeval *tvp) { microtime_fromclock(tvp, sysclock_active); } void getbinuptime(struct bintime *bt) { getbinuptime_fromclock(bt, sysclock_active); } void getnanouptime(struct timespec *tsp) { getnanouptime_fromclock(tsp, sysclock_active); } void getmicrouptime(struct timeval *tvp) { getmicrouptime_fromclock(tvp, sysclock_active); } void getbintime(struct bintime *bt) { getbintime_fromclock(bt, sysclock_active); } void getnanotime(struct timespec *tsp) { getnanotime_fromclock(tsp, sysclock_active); } void getmicrotime(struct timeval *tvp) { getmicrotime_fromclock(tvp, sysclock_active); } #endif /* FFCLOCK */ /* * This is a clone of getnanotime and used for walltimestamps. * The dtrace_ prefix prevents fbt from creating probes for * it so walltimestamp can be safely used in all fbt probes. */ void dtrace_getnanotime(struct timespec *tsp) { GETTHMEMBER(tsp, th_nanotime); } /* * This is a clone of getnanouptime used for time since boot. * The dtrace_ prefix prevents fbt from creating probes for * it, so it provides an uptime that can be safely used in all fbt probes. */ void dtrace_getnanouptime(struct timespec *tsp) { struct bintime bt; GETTHMEMBER(&bt, th_offset); bintime2timespec(&bt, tsp); } /* * System clock currently providing time to the system. Modifiable via sysctl * when the FFCLOCK option is defined. */ int sysclock_active = SYSCLOCK_FBCK; /* Internal NTP status and error estimates. */ extern int time_status; extern long time_esterror; /* * Take a snapshot of sysclock data which can be used to compare system clocks * and generate timestamps after the fact. */ void sysclock_getsnapshot(struct sysclock_snap *clock_snap, int fast) { struct fbclock_info *fbi; struct timehands *th; struct bintime bt; unsigned int delta, gen; #ifdef FFCLOCK ffcounter ffcount; struct fftimehands *ffth; struct ffclock_info *ffi; struct ffclock_estimate cest; ffi = &clock_snap->ff_info; #endif fbi = &clock_snap->fb_info; delta = 0; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); fbi->th_scale = th->th_scale; + fbi->th_tai_offset = th->th_tai_offset; fbi->tick_time = th->th_offset; #ifdef FFCLOCK ffth = fftimehands; ffi->tick_time = ffth->tick_time; ffi->tick_time_lerp = ffth->tick_time_lerp; ffi->period = ffth->cest.period; ffi->period_lerp = ffth->period_lerp; clock_snap->ffcount = ffth->tick_ffcount; cest = ffth->cest; #endif if (!fast) delta = tc_delta(th); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); clock_snap->delta = delta; clock_snap->sysclock_active = sysclock_active; /* Record feedback clock status and error. */ clock_snap->fb_info.status = time_status; /* XXX: Very crude estimate of feedback clock error.
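 * A sketch of the conversion done below: time_esterror is maintained in
 * microseconds, so the seconds part is time_esterror / 10^6 and the
 * fraction is the microsecond remainder scaled by 18446744073709, i.e.
 * int(2^64 / 10^6), the weight of one microsecond in the 64-bit binary
 * fraction of a bintime.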
*/ bt.sec = time_esterror / 1000000; bt.frac = (time_esterror % 1000000) * (uint64_t)18446744073709ULL; clock_snap->fb_info.error = bt; #ifdef FFCLOCK if (!fast) clock_snap->ffcount += delta; /* Record feed-forward clock leap second adjustment. */ ffi->leapsec_adjustment = cest.leapsec_total; if (clock_snap->ffcount > cest.leapsec_next) ffi->leapsec_adjustment -= cest.leapsec; /* Record feed-forward clock status and error. */ clock_snap->ff_info.status = cest.status; ffcount = clock_snap->ffcount - cest.update_ffcount; ffclock_convert_delta(ffcount, cest.period, &bt); /* 18446744073709 = int(2^64 / 1e6), scaling the per-second error bound. */ bintime_mul(&bt, cest.errb_rate * (uint64_t)18446744073709LL); /* 18446744073 = int(2^64 / 1e9), since err_abs in [ns]. */ bintime_addx(&bt, cest.errb_abs * (uint64_t)18446744073ULL); clock_snap->ff_info.error = bt; #endif } /* * Convert a sysclock snapshot into a struct bintime based on the specified * clock source and flags. */ int sysclock_snap2bintime(struct sysclock_snap *cs, struct bintime *bt, int whichclock, uint32_t flags) { struct bintime boottimebin; #ifdef FFCLOCK struct bintime bt2; uint64_t period; #endif switch (whichclock) { case SYSCLOCK_FBCK: *bt = cs->fb_info.tick_time; /* If snapshot was created with !fast, delta will be >0. */ if (cs->delta > 0) bintime_addx(bt, cs->fb_info.th_scale * cs->delta); if ((flags & FBCLOCK_UPTIME) == 0) { getboottimebin(&boottimebin); bintime_add(bt, &boottimebin); } + if (!(flags & FBCLOCK_LEAPSEC)) { + if (cs->fb_info.th_tai_offset == 0) + return (EINVAL); + bt->sec += cs->fb_info.th_tai_offset; + } break; #ifdef FFCLOCK case SYSCLOCK_FFWD: if (flags & FFCLOCK_LERP) { *bt = cs->ff_info.tick_time_lerp; period = cs->ff_info.period_lerp; } else { *bt = cs->ff_info.tick_time; period = cs->ff_info.period; } /* If snapshot was created with !fast, delta will be >0. */ if (cs->delta > 0) { ffclock_convert_delta(cs->delta, period, &bt2); bintime_add(bt, &bt2); } /* Leap second adjustment. */ if (flags & FFCLOCK_LEAPSEC) bt->sec -= cs->ff_info.leapsec_adjustment; /* Boot time adjustment, for uptime/monotonic clocks. */ if (flags & FFCLOCK_UPTIME) bintime_sub(bt, &ffclock_boottime); break; #endif default: return (EINVAL); } return (0); } /* * Initialize a new timecounter and possibly use it. */ void tc_init(struct timecounter *tc) { u_int u; struct sysctl_oid *tc_root; u = tc->tc_frequency / tc->tc_counter_mask; /* XXX: We need some margin here, 10% is a guess */ u *= 11; u /= 10; if (u > hz && tc->tc_quality >= 0) { tc->tc_quality = -2000; if (bootverbose) { printf("Timecounter \"%s\" frequency %ju Hz", tc->tc_name, (uintmax_t)tc->tc_frequency); printf(" -- Insufficient hz, needs at least %u\n", u); } } else if (tc->tc_quality >= 0 || bootverbose) { printf("Timecounter \"%s\" frequency %ju Hz quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency, tc->tc_quality); } /* * Set up sysctl tree for this counter.
*/ tc_root = SYSCTL_ADD_NODE_WITH_LABEL(NULL, SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "timecounter description", "timecounter"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0, "mask for implemented bits"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "counter", CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, tc, sizeof(*tc), sysctl_kern_timecounter_get, "IU", "current timecounter value"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "frequency", CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, tc, sizeof(*tc), sysctl_kern_timecounter_freq, "QU", "timecounter frequency"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "quality", CTLFLAG_RD, &(tc->tc_quality), 0, "goodness of time counter"); mtx_lock(&tc_lock); tc->tc_next = timecounters; timecounters = tc; /* * Do not automatically switch if the current tc was specifically * chosen. Never automatically use a timecounter with negative quality. * Even though we run on the dummy counter, switching here may be * worse since this timecounter may not be monotonic. */ if (tc_chosen) goto unlock; if (tc->tc_quality < 0) goto unlock; if (tc_from_tunable[0] != '\0' && strcmp(tc->tc_name, tc_from_tunable) == 0) { tc_chosen = 1; tc_from_tunable[0] = '\0'; } else { if (tc->tc_quality < timecounter->tc_quality) goto unlock; if (tc->tc_quality == timecounter->tc_quality && tc->tc_frequency < timecounter->tc_frequency) goto unlock; } (void)tc->tc_get_timecount(tc); timecounter = tc; unlock: mtx_unlock(&tc_lock); } /* Report the frequency of the current timecounter. */ uint64_t tc_getfrequency(void) { return (timehands->th_counter->tc_frequency); } static bool sleeping_on_old_rtc(struct thread *td) { /* * td_rtcgen is modified by curthread when it is running, * and by other threads in this function. By finding the thread * on a sleepqueue and holding the lock on the sleepqueue * chain, we guarantee that the thread is not running and that * modifying td_rtcgen is safe. Setting td_rtcgen to zero informs * the thread that it was woken due to a real-time clock adjustment. * (The declaration of td_rtcgen refers to this comment.) */ if (td->td_rtcgen != 0 && td->td_rtcgen != rtc_generation) { td->td_rtcgen = 0; return (true); } return (false); } static struct mtx tc_setclock_mtx; MTX_SYSINIT(tc_setclock_init, &tc_setclock_mtx, "tcsetc", MTX_SPIN); /* * Step our concept of UTC. This is done by modifying our estimate of * when we booted. */ void tc_setclock(struct timespec *ts) { struct timespec tbef, taft; struct bintime bt, bt2; timespec2bintime(ts, &bt); nanotime(&tbef); mtx_lock_spin(&tc_setclock_mtx); cpu_tick_calibrate(1); binuptime(&bt2); bintime_sub(&bt, &bt2); /* XXX fiddle all the little crinkly bits around the fiords... */ tc_windup(&bt); mtx_unlock_spin(&tc_setclock_mtx); /* Avoid rtc_generation == 0, since td_rtcgen == 0 is special. */ atomic_add_rel_int(&rtc_generation, 2); timerfd_jumped(); sleepq_chains_remove_matching(sleeping_on_old_rtc); if (timestepwarnings) { nanotime(&taft); log(LOG_INFO, "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n", (intmax_t)tbef.tv_sec, tbef.tv_nsec, (intmax_t)taft.tv_sec, taft.tv_nsec, (intmax_t)ts->tv_sec, ts->tv_nsec); } } /* * Recalculate the scaling factor. We want the number of 1/2^64 * fractions of a second per period of the hardware counter, taking * into account the th_adjustment factor which the NTP PLL/adjtime(2) * processing provides us with. 
* * The th_adjustment is nanoseconds per second with 32 bit binary * fraction and we want 64 bit binary fraction of second: * * x = a * 2^32 / 10^9 = a * 4.294967296 * * The range of th_adjustment is +/- 5000PPM so inside a 64bit int * we can only multiply by about 850 without overflowing, which * leaves no suitably precise fractions for multiply before divide. * * Divide before multiply with a fraction of 2199/512 results in a * systematic undercompensation of 10PPM of th_adjustment (2199/512 = * 4.294921875, about 10.6PPM below the exact factor 4.294967296). On a * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. * * We happily sacrifice the lowest of the 64 bits of our result * to the goddess of code clarity. */ static void recalculate_scaling_factor_and_large_delta(struct timehands *th) { uint64_t scale; scale = (uint64_t)1 << 63; scale += (th->th_adjustment / 1024) * 2199; scale /= th->th_counter->tc_frequency; th->th_scale = scale * 2; th->th_large_delta = MIN(((uint64_t)1 << 63) / scale, UINT_MAX); } /* * Initialize the next struct timehands in the ring and make * it the active timehands. Along the way we might switch to a different * timecounter and/or do seconds processing in NTP. Slightly magic. */ static void tc_windup(struct bintime *new_boottimebin) { struct bintime bt; struct timecounter *tc; struct timehands *th, *tho; u_int delta, ncount, ogen; int i; time_t t; /* * Make the next timehands a copy of the current one, but do * not overwrite the generation or next pointer. While we * update the contents, the generation must be zero. We need * to ensure that the zero generation is visible before the * data updates become visible, which requires a release fence. * For similar reasons, re-reading of the generation after the * data is read should use an acquire fence. */ tho = timehands; th = tho->th_next; ogen = th->th_generation; th->th_generation = 0; atomic_thread_fence_rel(); memcpy(th, tho, offsetof(struct timehands, th_generation)); if (new_boottimebin != NULL) th->th_boottime = *new_boottimebin; /* * Capture a timecounter delta on the current timecounter and if * changing timecounters, a counter value from the new timecounter. * Update the offset fields accordingly. */ tc = atomic_load_ptr(&timecounter); delta = tc_delta(th); if (th->th_counter != tc) ncount = tc->tc_get_timecount(tc); else ncount = 0; #ifdef FFCLOCK ffclock_windup(delta); #endif th->th_offset_count += delta; th->th_offset_count &= th->th_counter->tc_counter_mask; bintime_add_tc_delta(&th->th_offset, th->th_scale, th->th_large_delta, delta); /* * Hardware latching timecounters may not generate interrupts on * PPS events, so instead we poll them. There is a finite risk that * the hardware might capture a count which is later than the one we * got above, and therefore possibly in the next NTP second which might * have a different rate than the current NTP second. It doesn't * matter in practice. */ if (tho->th_counter->tc_poll_pps) tho->th_counter->tc_poll_pps(tho->th_counter); /* * Deal with NTP second processing. The loop normally * iterates at most once, but in extreme situations it might * keep NTP sane if timeouts are not run for several seconds. * At boot, the time step can be large when the TOD hardware * has been read, so on really large steps, we call * ntp_update_second only twice. We need to call it twice in * case we missed a leap second.
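 * As a worked example: if the clock had to step an hour forward at
 * boot, i would start out around 3600; being larger than LARGE_STEP it
 * is clamped to 2, so ntp_update_second() runs exactly twice, which is
 * still enough to pick up a missed leap second.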
*/ bt = th->th_offset; bintime_add(&bt, &th->th_boottime); i = bt.sec - tho->th_microtime.tv_sec; if (i > 0) { if (i > LARGE_STEP) i = 2; do { t = bt.sec; - ntp_update_second(&th->th_adjustment, &bt.sec); + ntp_update_second(&th->th_adjustment, &bt.sec, + &th->th_tai_offset); if (bt.sec != t) th->th_boottime.sec += bt.sec - t; --i; } while (i > 0); recalculate_scaling_factor_and_large_delta(th); } /* Update the UTC timestamps used by the get*() functions. */ th->th_bintime = bt; bintime2timeval(&bt, &th->th_microtime); bintime2timespec(&bt, &th->th_nanotime); /* Now is a good time to change timecounters. */ if (th->th_counter != tc) { #ifndef __arm__ if ((tc->tc_flags & TC_FLAGS_C2STOP) != 0) cpu_disable_c2_sleep++; if ((th->th_counter->tc_flags & TC_FLAGS_C2STOP) != 0) cpu_disable_c2_sleep--; #endif th->th_counter = tc; th->th_offset_count = ncount; tc_min_ticktock_freq = max(1, tc->tc_frequency / (((uint64_t)tc->tc_counter_mask + 1) / 3)); recalculate_scaling_factor_and_large_delta(th); #ifdef FFCLOCK ffclock_change_tc(th); #endif } /* * Now that the struct timehands is again consistent, set the new * generation number, making sure to not make it zero. */ if (++ogen == 0) ogen = 1; atomic_store_rel_int(&th->th_generation, ogen); /* Go live with the new struct timehands. */ #ifdef FFCLOCK switch (sysclock_active) { case SYSCLOCK_FBCK: #endif time_second = th->th_microtime.tv_sec; time_uptime = th->th_offset.sec; #ifdef FFCLOCK break; case SYSCLOCK_FFWD: time_second = fftimehands->tick_time_lerp.sec; time_uptime = fftimehands->tick_time_lerp.sec - ffclock_boottime.sec; break; } #endif timehands = th; timekeep_push_vdso(); } /* Report or change the active timecounter hardware. */ static int sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS) { char newname[32]; struct timecounter *newtc, *tc; int error; mtx_lock(&tc_lock); tc = timecounter; strlcpy(newname, tc->tc_name, sizeof(newname)); mtx_unlock(&tc_lock); error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req); if (error != 0 || req->newptr == NULL) return (error); mtx_lock(&tc_lock); /* Record that the tc in use now was specifically chosen. */ tc_chosen = 1; if (strcmp(newname, tc->tc_name) == 0) { mtx_unlock(&tc_lock); return (0); } for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { if (strcmp(newname, newtc->tc_name) != 0) continue; /* Warm up new timecounter. */ (void)newtc->tc_get_timecount(newtc); timecounter = newtc; /* * The vdso timehands update is deferred until the next * 'tc_windup()'. * * This is prudent given that 'timekeep_push_vdso()' does not * use any locking and that it can be called in hard interrupt * context via 'tc_windup()'. */ break; } mtx_unlock(&tc_lock); return (newtc != NULL ? 0 : EINVAL); } SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_timecounter_hardware, "A", "Timecounter hardware selected"); /* Report the available timecounter hardware. 
*/ static int sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct timecounter *tc; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sb, NULL, 0, req); mtx_lock(&tc_lock); for (tc = timecounters; tc != NULL; tc = tc->tc_next) { if (tc != timecounters) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s(%d)", tc->tc_name, tc->tc_quality); } mtx_unlock(&tc_lock); error = sbuf_finish(&sb); sbuf_delete(&sb); return (error); } SYSCTL_PROC(_kern_timecounter, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_timecounter_choice, "A", "Timecounter hardware detected"); /* * RFC 2783 PPS-API implementation. */ /* * Return true if the driver is aware of the abi version extensions in the * pps_state structure, and it supports at least the given abi version number. */ static inline int abi_aware(struct pps_state *pps, int vers) { return ((pps->kcmode & KCMODE_ABIFLAG) && pps->driver_abi >= vers); } static int pps_fetch(struct pps_fetch_args *fapi, struct pps_state *pps) { int err, timo; pps_seq_t aseq, cseq; struct timeval tv; if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); /* * If no timeout is requested, immediately return whatever values were * most recently captured. If timeout seconds is -1, that's a request * to block without a timeout. WITNESS won't let us sleep forever * without a lock (we really don't need a lock), so just repeatedly * sleep a long time. */ if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) { if (fapi->timeout.tv_sec == -1) timo = 0x7fffffff; else { tv.tv_sec = fapi->timeout.tv_sec; tv.tv_usec = fapi->timeout.tv_nsec / 1000; timo = tvtohz(&tv); } aseq = atomic_load_int(&pps->ppsinfo.assert_sequence); cseq = atomic_load_int(&pps->ppsinfo.clear_sequence); while (aseq == atomic_load_int(&pps->ppsinfo.assert_sequence) && cseq == atomic_load_int(&pps->ppsinfo.clear_sequence)) { if (abi_aware(pps, 1) && pps->driver_mtx != NULL) { if (pps->flags & PPSFLAG_MTX_SPIN) { err = msleep_spin(pps, pps->driver_mtx, "ppsfch", timo); } else { err = msleep(pps, pps->driver_mtx, PCATCH, "ppsfch", timo); } } else { err = tsleep(pps, PCATCH, "ppsfch", timo); } if (err == EWOULDBLOCK) { if (fapi->timeout.tv_sec == -1) { continue; } else { return (ETIMEDOUT); } } else if (err != 0) { return (err); } } } pps->ppsinfo.current_mode = pps->ppsparam.mode; fapi->pps_info_buf = pps->ppsinfo; return (0); } int pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps) { pps_params_t *app; struct pps_fetch_args *fapi; #ifdef FFCLOCK struct pps_fetch_ffc_args *fapi_ffc; #endif #ifdef PPS_SYNC struct pps_kcbind_args *kapi; #endif KASSERT(pps != NULL, ("NULL pps pointer in pps_ioctl")); switch (cmd) { case PPS_IOC_CREATE: return (0); case PPS_IOC_DESTROY: return (0); case PPS_IOC_SETPARAMS: app = (pps_params_t *)data; if (app->mode & ~pps->ppscap) return (EINVAL); #ifdef FFCLOCK /* Ensure only a single clock is selected for ffc timestamp. 
*/ if ((app->mode & PPS_TSCLK_MASK) == PPS_TSCLK_MASK) return (EINVAL); #endif pps->ppsparam = *app; return (0); case PPS_IOC_GETPARAMS: app = (pps_params_t *)data; *app = pps->ppsparam; app->api_version = PPS_API_VERS_1; return (0); case PPS_IOC_GETCAP: *(int*)data = pps->ppscap; return (0); case PPS_IOC_FETCH: fapi = (struct pps_fetch_args *)data; return (pps_fetch(fapi, pps)); #ifdef FFCLOCK case PPS_IOC_FETCH_FFCOUNTER: fapi_ffc = (struct pps_fetch_ffc_args *)data; if (fapi_ffc->tsformat && fapi_ffc->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); if (fapi_ffc->timeout.tv_sec || fapi_ffc->timeout.tv_nsec) return (EOPNOTSUPP); pps->ppsinfo_ffc.current_mode = pps->ppsparam.mode; fapi_ffc->pps_info_buf_ffc = pps->ppsinfo_ffc; /* Overwrite timestamps if feedback clock selected. */ switch (pps->ppsparam.mode & PPS_TSCLK_MASK) { case PPS_TSCLK_FBCK: fapi_ffc->pps_info_buf_ffc.assert_timestamp = pps->ppsinfo.assert_timestamp; fapi_ffc->pps_info_buf_ffc.clear_timestamp = pps->ppsinfo.clear_timestamp; break; case PPS_TSCLK_FFWD: break; default: break; } return (0); #endif /* FFCLOCK */ case PPS_IOC_KCBIND: #ifdef PPS_SYNC kapi = (struct pps_kcbind_args *)data; /* XXX Only root should be able to do this */ if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); if (kapi->kernel_consumer != PPS_KC_HARDPPS) return (EINVAL); if (kapi->edge & ~pps->ppscap) return (EINVAL); pps->kcmode = (kapi->edge & KCMODE_EDGEMASK) | (pps->kcmode & KCMODE_ABIFLAG); return (0); #else return (EOPNOTSUPP); #endif default: return (ENOIOCTL); } } void pps_init(struct pps_state *pps) { pps->ppscap |= PPS_TSFMT_TSPEC | PPS_CANWAIT; if (pps->ppscap & PPS_CAPTUREASSERT) pps->ppscap |= PPS_OFFSETASSERT; if (pps->ppscap & PPS_CAPTURECLEAR) pps->ppscap |= PPS_OFFSETCLEAR; #ifdef FFCLOCK pps->ppscap |= PPS_TSCLK_MASK; #endif pps->kcmode &= ~KCMODE_ABIFLAG; } void pps_init_abi(struct pps_state *pps) { pps_init(pps); if (pps->driver_abi > 0) { pps->kcmode |= KCMODE_ABIFLAG; pps->kernel_abi = PPS_ABI_VERSION; } } void pps_capture(struct pps_state *pps) { struct timehands *th; struct timecounter *tc; KASSERT(pps != NULL, ("NULL pps pointer in pps_capture")); th = timehands; pps->capgen = atomic_load_acq_int(&th->th_generation); pps->capth = th; #ifdef FFCLOCK pps->capffth = fftimehands; #endif tc = th->th_counter; pps->capcount = tc->tc_get_timecount(tc); } void pps_event(struct pps_state *pps, int event) { struct timehands *capth; struct timecounter *captc; uint64_t capth_scale; struct bintime bt; struct timespec *tsp, *osp; u_int tcount, *pcount; int foff; pps_seq_t *pseq; #ifdef FFCLOCK struct timespec *tsp_ffc; pps_seq_t *pseq_ffc; ffcounter *ffcount; #endif #ifdef PPS_SYNC int fhard; #endif KASSERT(pps != NULL, ("NULL pps pointer in pps_event")); /* Nothing to do if not currently set to capture this event type. */ if ((event & pps->ppsparam.mode) == 0) return; /* Make a snapshot of the captured timehand */ capth = pps->capth; captc = capth->th_counter; capth_scale = capth->th_scale; tcount = capth->th_offset_count; bt = capth->th_bintime; /* If the timecounter was wound up underneath us, bail out. */ atomic_thread_fence_acq(); if (pps->capgen == 0 || pps->capgen != capth->th_generation) return; /* Things would be easier with arrays. 
*/ if (event == PPS_CAPTUREASSERT) { tsp = &pps->ppsinfo.assert_timestamp; osp = &pps->ppsparam.assert_offset; foff = pps->ppsparam.mode & PPS_OFFSETASSERT; #ifdef PPS_SYNC fhard = pps->kcmode & PPS_CAPTUREASSERT; #endif pcount = &pps->ppscount[0]; pseq = &pps->ppsinfo.assert_sequence; #ifdef FFCLOCK ffcount = &pps->ppsinfo_ffc.assert_ffcount; tsp_ffc = &pps->ppsinfo_ffc.assert_timestamp; pseq_ffc = &pps->ppsinfo_ffc.assert_sequence; #endif } else { tsp = &pps->ppsinfo.clear_timestamp; osp = &pps->ppsparam.clear_offset; foff = pps->ppsparam.mode & PPS_OFFSETCLEAR; #ifdef PPS_SYNC fhard = pps->kcmode & PPS_CAPTURECLEAR; #endif pcount = &pps->ppscount[1]; pseq = &pps->ppsinfo.clear_sequence; #ifdef FFCLOCK ffcount = &pps->ppsinfo_ffc.clear_ffcount; tsp_ffc = &pps->ppsinfo_ffc.clear_timestamp; pseq_ffc = &pps->ppsinfo_ffc.clear_sequence; #endif } *pcount = pps->capcount; /* * If the timecounter changed, we cannot compare the count values, so * we have to drop the rest of the PPS-stuff until the next event. */ if (__predict_false(pps->ppstc != captc)) { pps->ppstc = captc; pps->ppscount[2] = pps->capcount; return; } (*pseq)++; /* Convert the count to a timespec. */ tcount = pps->capcount - tcount; tcount &= captc->tc_counter_mask; bintime_addx(&bt, capth_scale * tcount); bintime2timespec(&bt, tsp); if (foff) { timespecadd(tsp, osp, tsp); if (tsp->tv_nsec < 0) { tsp->tv_nsec += 1000000000; tsp->tv_sec -= 1; } } #ifdef FFCLOCK *ffcount = pps->capffth->tick_ffcount + tcount; bt = pps->capffth->tick_time; ffclock_convert_delta(tcount, pps->capffth->cest.period, &bt); bintime_add(&bt, &pps->capffth->tick_time); (*pseq_ffc)++; bintime2timespec(&bt, tsp_ffc); #endif #ifdef PPS_SYNC if (fhard) { uint64_t delta_nsec; uint64_t freq; /* * Feed the NTP PLL/FLL. * The FLL wants to know how many (hardware) nanoseconds * elapsed since the previous event. */ tcount = pps->capcount - pps->ppscount[2]; pps->ppscount[2] = pps->capcount; tcount &= captc->tc_counter_mask; delta_nsec = 1000000000; delta_nsec *= tcount; freq = captc->tc_frequency; delta_nsec = (delta_nsec + freq / 2) / freq; hardpps(tsp, (long)delta_nsec); } #endif /* Wakeup anyone sleeping in pps_fetch(). */ wakeup(pps); } /* * Timecounters need to be updated every so often to prevent the hardware * counter from overflowing. Updating also recalculates the cached values * used by the get*() family of functions, so their precision depends on * the update frequency. 
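 * As a worked example (for a hypothetical configuration): a 32-bit
 * counter running at 4 GHz wraps after 2^32 / 4e9 ~= 1.07 seconds, so
 * tc_windup() must run considerably more often than that; tc_ticktock()
 * below arranges for roughly one update per millisecond with typical hz
 * settings.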
*/ static int tc_tick; SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0, "Approximate number of hardclock ticks in a millisecond"); void tc_ticktock(long cnt) { static long count; if (mtx_trylock_spin(&tc_setclock_mtx)) { count += cnt; if (count >= tc_tick) { count = 0; tc_windup(NULL); } mtx_unlock_spin(&tc_setclock_mtx); } } static void __inline tc_adjprecision(void) { int t; if (tc_timepercentage > 0) { t = (99 + tc_timepercentage) / tc_timepercentage; tc_precexp = fls(t + (t >> 1)) - 1; FREQ2BT(hz / tc_tick, &bt_timethreshold); FREQ2BT(hz, &bt_tickthreshold); bintime_shift(&bt_timethreshold, tc_precexp); bintime_shift(&bt_tickthreshold, tc_precexp); } else { tc_precexp = 31; bt_timethreshold.sec = INT_MAX; bt_timethreshold.frac = ~(uint64_t)0; bt_tickthreshold = bt_timethreshold; } sbt_timethreshold = bttosbt(bt_timethreshold); sbt_tickthreshold = bttosbt(bt_tickthreshold); } static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS) { int error, val; val = tc_timepercentage; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); tc_timepercentage = val; if (cold) goto done; tc_adjprecision(); done: return (0); } /* Set up the requested number of timehands. */ static void inittimehands(void *dummy) { struct timehands *thp; int i; TUNABLE_INT_FETCH("kern.timecounter.timehands_count", &timehands_count); if (timehands_count < 1) timehands_count = 1; if (timehands_count > nitems(ths)) timehands_count = nitems(ths); for (i = 1, thp = &ths[0]; i < timehands_count; thp = &ths[i++]) thp->th_next = &ths[i]; thp->th_next = &ths[0]; TUNABLE_STR_FETCH("kern.timecounter.hardware", tc_from_tunable, sizeof(tc_from_tunable)); mtx_init(&tc_lock, "tc", NULL, MTX_DEF); } SYSINIT(timehands, SI_SUB_TUNABLES, SI_ORDER_ANY, inittimehands, NULL); static void inittimecounter(void *dummy) { u_int p; int tick_rate; /* * Set the initial timeout to * max(1, <approx. number of hardclock ticks in a millisecond>). * People should probably not use the sysctl to set the timeout * to smaller than its initial value, since that value is the * smallest reasonable one. If they want better timestamps they * should use the non-"get"* functions. */ if (hz > 1000) tc_tick = (hz + 500) / 1000; else tc_tick = 1; tc_adjprecision(); FREQ2BT(hz, &tick_bt); tick_sbt = bttosbt(tick_bt); tick_rate = hz / tc_tick; FREQ2BT(tick_rate, &tc_tick_bt); tc_tick_sbt = bttosbt(tc_tick_bt); p = (tc_tick * 1000000) / hz; printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); #ifdef FFCLOCK ffclock_init(); #endif /* warm up new timecounter (again) and get rolling.
*/ (void)timecounter->tc_get_timecount(timecounter); mtx_lock_spin(&tc_setclock_mtx); tc_windup(NULL); mtx_unlock_spin(&tc_setclock_mtx); } SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL); /* Cpu tick handling -------------------------------------------------*/ static bool cpu_tick_variable; static uint64_t cpu_tick_frequency; DPCPU_DEFINE_STATIC(uint64_t, tc_cpu_ticks_base); DPCPU_DEFINE_STATIC(unsigned, tc_cpu_ticks_last); static uint64_t tc_cpu_ticks(void) { struct timecounter *tc; uint64_t res, *base; unsigned u, *last; critical_enter(); base = DPCPU_PTR(tc_cpu_ticks_base); last = DPCPU_PTR(tc_cpu_ticks_last); tc = timehands->th_counter; u = tc->tc_get_timecount(tc) & tc->tc_counter_mask; if (u < *last) *base += (uint64_t)tc->tc_counter_mask + 1; *last = u; res = u + *base; critical_exit(); return (res); } void cpu_tick_calibration(void) { static time_t last_calib; if (time_uptime != last_calib && !(time_uptime & 0xf)) { cpu_tick_calibrate(0); last_calib = time_uptime; } } /* * This function gets called every 16 seconds on only one designated * CPU in the system from hardclock() via cpu_tick_calibration(). * * Whenever the real time clock is stepped we get called with reset=1 * to make sure we handle suspend/resume and similar events correctly. */ static void cpu_tick_calibrate(int reset) { static uint64_t c_last; uint64_t c_this, c_delta; static struct bintime t_last; struct bintime t_this, t_delta; uint32_t divi; if (reset) { /* The clock was stepped, abort & reset */ t_last.sec = 0; return; } /* we don't calibrate fixed rate cputicks */ if (!cpu_tick_variable) return; getbinuptime(&t_this); c_this = cpu_ticks(); if (t_last.sec != 0) { c_delta = c_this - c_last; t_delta = t_this; bintime_sub(&t_delta, &t_last); /* * Headroom: * 2^(64-20) / 16[s] = * 2^(44) / 16[s] = * 17.592.186.044.416 / 16 = * 1.099.511.627.776 [Hz] */ divi = t_delta.sec << 20; divi |= t_delta.frac >> (64 - 20); c_delta <<= 20; c_delta /= divi; if (c_delta > cpu_tick_frequency) { if (0 && bootverbose) printf("cpu_tick increased to %ju Hz\n", c_delta); cpu_tick_frequency = c_delta; } } c_last = c_this; t_last = t_this; } void set_cputicker(cpu_tick_f *func, uint64_t freq, bool isvariable) { if (func == NULL) { cpu_ticks = tc_cpu_ticks; } else { cpu_tick_frequency = freq; cpu_tick_variable = isvariable; cpu_ticks = func; } } uint64_t cpu_tickrate(void) { if (cpu_ticks == tc_cpu_ticks) return (tc_getfrequency()); return (cpu_tick_frequency); } /* * We need to be slightly careful converting cputicks to microseconds. * There is plenty of margin in 64 bits of microseconds (half a million * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply * before divide conversion (to retain precision) we find that the * margin shrinks to 1.5 hours (one millionth of 146y).
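 * Worked numbers behind those margins (a sketch): 2^64 microseconds is
 * roughly 584,000 years and 2^64 ticks at 4 GHz span roughly 146 years,
 * but tick * 1000000 overflows 64 bits once tick exceeds 2^64 / 10^6,
 * i.e. after about 1.3 hours at 4 GHz. cputick2usec() below therefore
 * splits the conversion into (tick / tr) * 10^6 + ((tick % tr) * 10^6) / tr;
 * the remainder is smaller than the tick rate tr, so the multiply stays
 * well below 2^64 and no precision is lost.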
*/ uint64_t cputick2usec(uint64_t tick) { uint64_t tr; tr = cpu_tickrate(); return ((tick / tr) * 1000000ULL) + ((tick % tr) * 1000000ULL) / tr; } cpu_tick_f *cpu_ticks = tc_cpu_ticks; static int vdso_th_enable = 1; static int sysctl_fast_gettime(SYSCTL_HANDLER_ARGS) { int old_vdso_th_enable, error; old_vdso_th_enable = vdso_th_enable; error = sysctl_handle_int(oidp, &old_vdso_th_enable, 0, req); if (error != 0) return (error); vdso_th_enable = old_vdso_th_enable; return (0); } SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day"); uint32_t tc_fill_vdso_timehands(struct vdso_timehands *vdso_th) { struct timehands *th; uint32_t enabled; th = timehands; vdso_th->th_scale = th->th_scale; vdso_th->th_offset_count = th->th_offset_count; vdso_th->th_counter_mask = th->th_counter->tc_counter_mask; vdso_th->th_offset = th->th_offset; vdso_th->th_boottime = th->th_boottime; if (th->th_counter->tc_fill_vdso_timehands != NULL) { enabled = th->th_counter->tc_fill_vdso_timehands(vdso_th, th->th_counter); } else enabled = 0; if (!vdso_th_enable) enabled = 0; return (enabled); } #ifdef COMPAT_FREEBSD32 uint32_t tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32) { struct timehands *th; uint32_t enabled; th = timehands; *(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale; vdso_th32->th_offset_count = th->th_offset_count; vdso_th32->th_counter_mask = th->th_counter->tc_counter_mask; vdso_th32->th_offset.sec = th->th_offset.sec; *(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac; vdso_th32->th_boottime.sec = th->th_boottime.sec; *(uint64_t *)&vdso_th32->th_boottime.frac[0] = th->th_boottime.frac; if (th->th_counter->tc_fill_vdso_timehands32 != NULL) { enabled = th->th_counter->tc_fill_vdso_timehands32(vdso_th32, th->th_counter); } else enabled = 0; if (!vdso_th_enable) enabled = 0; return (enabled); } #endif #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(timecounter, db_show_timecounter) { struct timehands *th; struct timecounter *tc; u_int val1, val2; th = timehands; tc = th->th_counter; val1 = tc->tc_get_timecount(tc); __compiler_membar(); val2 = tc->tc_get_timecount(tc); db_printf("timecounter %p %s\n", tc, tc->tc_name); db_printf(" mask %#x freq %ju qual %d flags %#x priv %p\n", tc->tc_counter_mask, (uintmax_t)tc->tc_frequency, tc->tc_quality, tc->tc_flags, tc->tc_priv); db_printf(" val %#x %#x\n", val1, val2); db_printf("timehands adj %#jx scale %#jx ldelta %d off_cnt %d gen %d\n", (uintmax_t)th->th_adjustment, (uintmax_t)th->th_scale, th->th_large_delta, th->th_offset_count, th->th_generation); db_printf(" offset %jd %jd boottime %jd %jd\n", (intmax_t)th->th_offset.sec, (uintmax_t)th->th_offset.frac, (intmax_t)th->th_boottime.sec, (uintmax_t)th->th_boottime.frac); } #endif diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c index 0c31c1563d99..5960136eb515 100644 --- a/sys/kern/kern_time.c +++ b/sys/kern/kern_time.c @@ -1,1856 +1,1872 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #ifdef KTRACE #include #endif #include #include #include -#define MAX_CLOCKS (CLOCK_MONOTONIC+1) +#define MAX_CLOCKS (CLOCK_TAI+1) #define CPUCLOCK_BIT 0x80000000 #define CPUCLOCK_PROCESS_BIT 0x40000000 #define CPUCLOCK_ID_MASK (~(CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT)) #define MAKE_THREAD_CPUCLOCK(tid) (CPUCLOCK_BIT|(tid)) #define MAKE_PROCESS_CPUCLOCK(pid) \ (CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT|(pid)) #define NS_PER_SEC 1000000000 static struct kclock posix_clocks[MAX_CLOCKS]; static uma_zone_t itimer_zone = NULL; /* * Time of day and interval timer support. * * These routines provide the kernel entry points to get and set * the time-of-day and per-process interval timers. Subroutines * here provide support for adding and subtracting timeval structures * and decrementing interval timers, optionally reloading the interval * timers when they expire. 
*/ static int settime(struct thread *, struct timeval *); static void timevalfix(struct timeval *); static int user_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, const struct timespec *ua_rqtp, struct timespec *ua_rmtp); static void itimer_start(void); static int itimer_init(void *, int, int); static void itimer_fini(void *, int); static void itimer_enter(struct itimer *); static void itimer_leave(struct itimer *); static struct itimer *itimer_find(struct proc *, int); static void itimers_alloc(struct proc *); static int realtimer_create(struct itimer *); static int realtimer_gettime(struct itimer *, struct itimerspec *); static int realtimer_settime(struct itimer *, int, struct itimerspec *, struct itimerspec *); static int realtimer_delete(struct itimer *); -static void realtimer_clocktime(clockid_t, struct timespec *); static void realtimer_expire(void *); static void realtimer_expire_l(struct itimer *it, bool proc_locked); static void realitexpire(void *arg); static int register_posix_clock(int, const struct kclock *); static void itimer_fire(struct itimer *it); static int itimespecfix(struct timespec *ts); #define CLOCK_CALL(clock, call, arglist) \ ((*posix_clocks[clock].call) arglist) SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL); static int settime(struct thread *td, struct timeval *tv) { struct timeval delta, tv1, tv2; static struct timeval maxtime, laststep; struct timespec ts; microtime(&tv1); delta = *tv; timevalsub(&delta, &tv1); /* * If the system is secure, we do not allow the time to be * set to a value earlier than 1 second less than the highest * time we have yet seen. The worst a miscreant can do in * this circumstance is "freeze" time. He couldn't go * back to the past. * * We similarly do not allow the clock to be stepped more * than one second, nor more than once per second. This allows * a miscreant to make the clock march double-time, but no worse. */ if (securelevel_gt(td->td_ucred, 1) != 0) { if (delta.tv_sec < 0 || delta.tv_usec < 0) { /* * Update maxtime to latest time we've seen. */ if (tv1.tv_sec > maxtime.tv_sec) maxtime = tv1; tv2 = *tv; timevalsub(&tv2, &maxtime); if (tv2.tv_sec < -1) { tv->tv_sec = maxtime.tv_sec - 1; printf("Time adjustment clamped to -1 second\n"); } } else { if (tv1.tv_sec == laststep.tv_sec) return (EPERM); if (delta.tv_sec > 1) { tv->tv_sec = tv1.tv_sec + 1; printf("Time adjustment clamped to +1 second\n"); } laststep = *tv; } } ts.tv_sec = tv->tv_sec; ts.tv_nsec = tv->tv_usec * 1000; tc_setclock(&ts); resettodr(); return (0); } #ifndef _SYS_SYSPROTO_H_ struct clock_getcpuclockid2_args { id_t id; int which; clockid_t *clock_id; }; #endif /* ARGSUSED */ int sys_clock_getcpuclockid2(struct thread *td, struct clock_getcpuclockid2_args *uap) { clockid_t clk_id; int error; error = kern_clock_getcpuclockid2(td, uap->id, uap->which, &clk_id); if (error == 0) error = copyout(&clk_id, uap->clock_id, sizeof(clockid_t)); return (error); } int kern_clock_getcpuclockid2(struct thread *td, id_t id, int which, clockid_t *clk_id) { struct proc *p; pid_t pid; lwpid_t tid; int error; switch (which) { case CPUCLOCK_WHICH_PID: if (id != 0) { error = pget(id, PGET_CANSEE | PGET_NOTID, &p); if (error != 0) return (error); PROC_UNLOCK(p); pid = id; } else { pid = td->td_proc->p_pid; } *clk_id = MAKE_PROCESS_CPUCLOCK(pid); return (0); case CPUCLOCK_WHICH_TID: tid = id == 0 ?
td->td_tid : id; *clk_id = MAKE_THREAD_CPUCLOCK(tid); return (0); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct clock_gettime_args { clockid_t clock_id; struct timespec *tp; }; #endif /* ARGSUSED */ int sys_clock_gettime(struct thread *td, struct clock_gettime_args *uap) { struct timespec ats; int error; error = kern_clock_gettime(td, uap->clock_id, &ats); if (error == 0) error = copyout(&ats, uap->tp, sizeof(ats)); return (error); } static inline void cputick2timespec(uint64_t runtime, struct timespec *ats) { uint64_t tr; tr = cpu_tickrate(); ats->tv_sec = runtime / tr; ats->tv_nsec = ((runtime % tr) * 1000000000ULL) / tr; } void kern_thread_cputime(struct thread *targettd, struct timespec *ats) { uint64_t runtime, curtime, switchtime; if (targettd == NULL) { /* current thread */ spinlock_enter(); switchtime = PCPU_GET(switchtime); curtime = cpu_ticks(); runtime = curthread->td_runtime; spinlock_exit(); runtime += curtime - switchtime; } else { PROC_LOCK_ASSERT(targettd->td_proc, MA_OWNED); thread_lock(targettd); runtime = targettd->td_runtime; thread_unlock(targettd); } cputick2timespec(runtime, ats); } void kern_process_cputime(struct proc *targetp, struct timespec *ats) { uint64_t runtime; struct rusage ru; PROC_LOCK_ASSERT(targetp, MA_OWNED); PROC_STATLOCK(targetp); rufetch(targetp, &ru); runtime = targetp->p_rux.rux_runtime; if (curthread->td_proc == targetp) runtime += cpu_ticks() - PCPU_GET(switchtime); PROC_STATUNLOCK(targetp); cputick2timespec(runtime, ats); } static int get_cputime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct proc *p, *p2; struct thread *td2; lwpid_t tid; pid_t pid; int error; p = td->td_proc; if ((clock_id & CPUCLOCK_PROCESS_BIT) == 0) { tid = clock_id & CPUCLOCK_ID_MASK; td2 = tdfind(tid, p->p_pid); if (td2 == NULL) return (EINVAL); kern_thread_cputime(td2, ats); PROC_UNLOCK(td2->td_proc); } else { pid = clock_id & CPUCLOCK_ID_MASK; error = pget(pid, PGET_CANSEE, &p2); if (error != 0) return (EINVAL); kern_process_cputime(p2, ats); PROC_UNLOCK(p2); } return (0); } int kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct timeval sys, user; + struct sysclock_snap clk; + struct bintime bt; struct proc *p; + int err; p = td->td_proc; switch (clock_id) { case CLOCK_REALTIME: /* Default to precise. */ case CLOCK_REALTIME_PRECISE: nanotime(ats); break; case CLOCK_REALTIME_FAST: getnanotime(ats); break; + case CLOCK_TAI: + sysclock_getsnapshot(&clk, 0); + err = sysclock_snap2bintime(&clk, &bt, clk.sysclock_active, + (clk.sysclock_active == SYSCLOCK_FFWD) ? FFCLOCK_LERP : 0); + if (err) + return (err); + bintime2timespec(&bt, ats); + break; case CLOCK_VIRTUAL: PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &user, &sys); PROC_STATUNLOCK(p); PROC_UNLOCK(p); TIMEVAL_TO_TIMESPEC(&user, ats); break; case CLOCK_PROF: PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &user, &sys); PROC_STATUNLOCK(p); PROC_UNLOCK(p); timevaladd(&user, &sys); TIMEVAL_TO_TIMESPEC(&user, ats); break; case CLOCK_MONOTONIC: /* Default to precise. 
*/ case CLOCK_MONOTONIC_PRECISE: case CLOCK_UPTIME: case CLOCK_UPTIME_PRECISE: nanouptime(ats); break; case CLOCK_UPTIME_FAST: case CLOCK_MONOTONIC_FAST: getnanouptime(ats); break; case CLOCK_SECOND: ats->tv_sec = time_second; ats->tv_nsec = 0; break; case CLOCK_THREAD_CPUTIME_ID: kern_thread_cputime(NULL, ats); break; case CLOCK_PROCESS_CPUTIME_ID: PROC_LOCK(p); kern_process_cputime(p, ats); PROC_UNLOCK(p); break; default: if ((int)clock_id >= 0) return (EINVAL); return (get_cputime(td, clock_id, ats)); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct clock_settime_args { clockid_t clock_id; const struct timespec *tp; }; #endif /* ARGSUSED */ int sys_clock_settime(struct thread *td, struct clock_settime_args *uap) { struct timespec ats; int error; if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0) return (error); return (kern_clock_settime(td, uap->clock_id, &ats)); } static int allow_insane_settime = 0; SYSCTL_INT(_debug, OID_AUTO, allow_insane_settime, CTLFLAG_RWTUN, &allow_insane_settime, 0, "do not perform possibly restrictive checks on settime(2) args"); int kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct timeval atv; int error; if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0) return (error); if (clock_id != CLOCK_REALTIME) return (EINVAL); if (!timespecvalid_interval(ats)) return (EINVAL); if (!allow_insane_settime && (ats->tv_sec > 8000ULL * 365 * 24 * 60 * 60 || ats->tv_sec < utc_offset())) return (EINVAL); /* XXX Don't convert nsec->usec and back */ TIMESPEC_TO_TIMEVAL(&atv, ats); error = settime(td, &atv); return (error); } #ifndef _SYS_SYSPROTO_H_ struct clock_getres_args { clockid_t clock_id; struct timespec *tp; }; #endif int sys_clock_getres(struct thread *td, struct clock_getres_args *uap) { struct timespec ts; int error; if (uap->tp == NULL) return (0); error = kern_clock_getres(td, uap->clock_id, &ts); if (error == 0) error = copyout(&ts, uap->tp, sizeof(ts)); return (error); } int kern_clock_getres(struct thread *td, clockid_t clock_id, struct timespec *ts) { ts->tv_sec = 0; switch (clock_id) { case CLOCK_REALTIME: case CLOCK_REALTIME_FAST: case CLOCK_REALTIME_PRECISE: + case CLOCK_TAI: case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_FAST: case CLOCK_MONOTONIC_PRECISE: case CLOCK_UPTIME: case CLOCK_UPTIME_FAST: case CLOCK_UPTIME_PRECISE: /* * Round up the result of the division cheaply by adding 1. * Rounding up is especially important if rounding down * would give 0. Perfect rounding is unimportant. */ ts->tv_nsec = NS_PER_SEC / tc_getfrequency() + 1; break; case CLOCK_VIRTUAL: case CLOCK_PROF: /* Accurately round up here because we can do so cheaply. 
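 * (howmany(a, b) is the ceiling of a / b; for example, with hz = 300
 * this yields howmany(10^9, 300) = 3333334 ns rather than the truncated
 * 3333333 ns.)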
*/ ts->tv_nsec = howmany(NS_PER_SEC, hz); break; case CLOCK_SECOND: ts->tv_sec = 1; ts->tv_nsec = 0; break; case CLOCK_THREAD_CPUTIME_ID: case CLOCK_PROCESS_CPUTIME_ID: cputime: ts->tv_nsec = 1000000000 / cpu_tickrate() + 1; break; default: if ((int)clock_id < 0) goto cputime; return (EINVAL); } return (0); } int kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt) { return (kern_clock_nanosleep(td, CLOCK_REALTIME, TIMER_RELTIME, rqt, rmt)); } static __read_mostly bool nanosleep_precise = true; SYSCTL_BOOL(_kern_timecounter, OID_AUTO, nanosleep_precise, CTLFLAG_RW, &nanosleep_precise, 0, "clock_nanosleep() with CLOCK_REALTIME, " "CLOCK_MONOTONIC, CLOCK_UPTIME and nanosleep(2) use precise clock"); static uint8_t nanowait[MAXCPU]; int kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, const struct timespec *rqt, struct timespec *rmt) { struct timespec ts, now; sbintime_t sbt, sbtt, prec, tmp; time_t over; int error; bool is_abs_real, precise; if (rqt->tv_nsec < 0 || rqt->tv_nsec >= NS_PER_SEC) return (EINVAL); if ((flags & ~TIMER_ABSTIME) != 0) return (EINVAL); switch (clock_id) { case CLOCK_REALTIME: + case CLOCK_TAI: precise = nanosleep_precise; is_abs_real = (flags & TIMER_ABSTIME) != 0; break; case CLOCK_REALTIME_PRECISE: precise = true; is_abs_real = (flags & TIMER_ABSTIME) != 0; break; case CLOCK_REALTIME_FAST: case CLOCK_SECOND: precise = false; is_abs_real = (flags & TIMER_ABSTIME) != 0; break; case CLOCK_MONOTONIC: case CLOCK_UPTIME: precise = nanosleep_precise; is_abs_real = false; break; case CLOCK_MONOTONIC_PRECISE: case CLOCK_UPTIME_PRECISE: precise = true; is_abs_real = false; break; case CLOCK_MONOTONIC_FAST: case CLOCK_UPTIME_FAST: precise = false; is_abs_real = false; break; case CLOCK_VIRTUAL: case CLOCK_PROF: case CLOCK_PROCESS_CPUTIME_ID: return (ENOTSUP); case CLOCK_THREAD_CPUTIME_ID: default: return (EINVAL); } do { ts = *rqt; if ((flags & TIMER_ABSTIME) != 0) { if (is_abs_real) td->td_rtcgen = atomic_load_acq_int(&rtc_generation); error = kern_clock_gettime(td, clock_id, &now); KASSERT(error == 0, ("kern_clock_gettime: %d", error)); timespecsub(&ts, &now, &ts); } if (ts.tv_sec < 0 || (ts.tv_sec == 0 && ts.tv_nsec == 0)) { error = EWOULDBLOCK; break; } if (ts.tv_sec > INT32_MAX / 2) { over = ts.tv_sec - INT32_MAX / 2; ts.tv_sec -= over; } else over = 0; tmp = tstosbt(ts); if (precise) { prec = 0; sbt = sbinuptime(); } else { prec = tmp >> tc_precexp; if (TIMESEL(&sbt, tmp)) sbt += tc_tick_sbt; } sbt += tmp; error = tsleep_sbt(&nanowait[curcpu], PWAIT | PCATCH, "nanslp", sbt, prec, C_ABSOLUTE); } while (error == 0 && is_abs_real && td->td_rtcgen == 0); td->td_rtcgen = 0; if (error != EWOULDBLOCK) { if (TIMESEL(&sbtt, tmp)) sbtt += tc_tick_sbt; if (sbtt >= sbt) return (0); if (error == ERESTART) error = EINTR; if ((flags & TIMER_ABSTIME) == 0 && rmt != NULL) { ts = sbttots(sbt - sbtt); ts.tv_sec += over; if (ts.tv_sec < 0) timespecclear(&ts); *rmt = ts; } return (error); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct nanosleep_args { struct timespec *rqtp; struct timespec *rmtp; }; #endif /* ARGSUSED */ int sys_nanosleep(struct thread *td, struct nanosleep_args *uap) { return (user_clock_nanosleep(td, CLOCK_REALTIME, TIMER_RELTIME, uap->rqtp, uap->rmtp)); } #ifndef _SYS_SYSPROTO_H_ struct clock_nanosleep_args { clockid_t clock_id; int flags; struct timespec *rqtp; struct timespec *rmtp; }; #endif /* ARGSUSED */ int sys_clock_nanosleep(struct thread *td, struct clock_nanosleep_args *uap) { int error; error = 
user_clock_nanosleep(td, uap->clock_id, uap->flags, uap->rqtp, uap->rmtp); return (kern_posix_error(td, error)); } static int user_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, const struct timespec *ua_rqtp, struct timespec *ua_rmtp) { struct timespec rmt, rqt; int error, error2; error = copyin(ua_rqtp, &rqt, sizeof(rqt)); if (error) return (error); error = kern_clock_nanosleep(td, clock_id, flags, &rqt, &rmt); if (error == EINTR && ua_rmtp != NULL && (flags & TIMER_ABSTIME) == 0) { error2 = copyout(&rmt, ua_rmtp, sizeof(rmt)); if (error2 != 0) error = error2; } return (error); } #ifndef _SYS_SYSPROTO_H_ struct gettimeofday_args { struct timeval *tp; struct timezone *tzp; }; #endif /* ARGSUSED */ int sys_gettimeofday(struct thread *td, struct gettimeofday_args *uap) { struct timeval atv; struct timezone rtz; int error = 0; if (uap->tp) { microtime(&atv); error = copyout(&atv, uap->tp, sizeof (atv)); } if (error == 0 && uap->tzp != NULL) { rtz.tz_minuteswest = 0; rtz.tz_dsttime = 0; error = copyout(&rtz, uap->tzp, sizeof (rtz)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct settimeofday_args { struct timeval *tv; struct timezone *tzp; }; #endif /* ARGSUSED */ int sys_settimeofday(struct thread *td, struct settimeofday_args *uap) { struct timeval atv, *tvp; struct timezone atz, *tzp; int error; if (uap->tv) { error = copyin(uap->tv, &atv, sizeof(atv)); if (error) return (error); tvp = &atv; } else tvp = NULL; if (uap->tzp) { error = copyin(uap->tzp, &atz, sizeof(atz)); if (error) return (error); tzp = &atz; } else tzp = NULL; return (kern_settimeofday(td, tvp, tzp)); } int kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp) { int error; error = priv_check(td, PRIV_SETTIMEOFDAY); if (error) return (error); /* Verify all parameters before changing time. */ if (tv) { if (tv->tv_usec < 0 || tv->tv_usec >= 1000000 || tv->tv_sec < 0) return (EINVAL); error = settime(td, tv); } return (error); } /* * Get value of an interval timer. The process virtual and profiling virtual * time timers are kept in the p_stats area, since they can be swapped out. * These are kept internally in the way they are specified externally: in * time until they expire. * * The real time interval timer is kept in the process table slot for the * process, and its value (it_value) is kept as an absolute time rather than * as a delta, so that it is easy to keep periodic real-time signals from * drifting. * * Virtual time timers are processed in the hardclock() routine of * kern_clock.c. The real time timer is processed by a timeout routine, * called from the softclock() routine. Since a callout may be delayed in * real time due to interrupt processing in the system, it is possible for * the real time timeout routine (realitexpire, given below), to be delayed * in real time past when it is supposed to occur. It does not suffice, * therefore, to reload the real timer .it_value from the real time timers * .it_interval. Rather, we compute the next time in absolute time the timer * should go off. 
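 *
 * The same principle can be sketched in userspace terms (an illustration
 * only; the helper below is hypothetical and uses the POSIX
 * clock_nanosleep() interface): each deadline is computed from the
 * previous absolute deadline, never from the current time, so a late
 * wakeup does not push subsequent deadlines back.
 */
#if 0	/* illustrative sketch, not compiled into the kernel */
#include <time.h>

static void
periodic_no_drift(long interval_nsec, int iterations)
{
	struct timespec next;

	/* Assumes 0 < interval_nsec < 1000000000. */
	clock_gettime(CLOCK_MONOTONIC, &next);
	while (iterations-- > 0) {
		next.tv_nsec += interval_nsec;
		if (next.tv_nsec >= 1000000000L) {
			next.tv_nsec -= 1000000000L;
			next.tv_sec++;
		}
		/* Sleep to the absolute deadline; lateness is not carried over. */
		clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &next, NULL);
		/* ... periodic work ... */
	}
}
#endif
/*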
*/ #ifndef _SYS_SYSPROTO_H_ struct getitimer_args { u_int which; struct itimerval *itv; }; #endif int sys_getitimer(struct thread *td, struct getitimer_args *uap) { struct itimerval aitv; int error; error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); return (copyout(&aitv, uap->itv, sizeof (struct itimerval))); } int kern_getitimer(struct thread *td, u_int which, struct itimerval *aitv) { struct proc *p = td->td_proc; struct timeval ctv; if (which > ITIMER_PROF) return (EINVAL); if (which == ITIMER_REAL) { /* * Convert from absolute to relative time in .it_value * part of real time timer. If time for real time timer * has passed return 0, else return difference between * current time and time for the timer to go off. */ PROC_LOCK(p); *aitv = p->p_realtimer; PROC_UNLOCK(p); if (timevalisset(&aitv->it_value)) { microuptime(&ctv); if (timevalcmp(&aitv->it_value, &ctv, <)) timevalclear(&aitv->it_value); else timevalsub(&aitv->it_value, &ctv); } } else { PROC_ITIMLOCK(p); *aitv = p->p_stats->p_timer[which]; PROC_ITIMUNLOCK(p); } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktritimerval(aitv); #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct setitimer_args { u_int which; struct itimerval *itv, *oitv; }; #endif int sys_setitimer(struct thread *td, struct setitimer_args *uap) { struct itimerval aitv, oitv; int error; if (uap->itv == NULL) { uap->itv = uap->oitv; return (sys_getitimer(td, (struct getitimer_args *)uap)); } if ((error = copyin(uap->itv, &aitv, sizeof(struct itimerval)))) return (error); error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); return (copyout(&oitv, uap->oitv, sizeof(struct itimerval))); } int kern_setitimer(struct thread *td, u_int which, struct itimerval *aitv, struct itimerval *oitv) { struct proc *p = td->td_proc; struct timeval ctv; sbintime_t sbt, pr; if (aitv == NULL) return (kern_getitimer(td, which, oitv)); if (which > ITIMER_PROF) return (EINVAL); #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktritimerval(aitv); #endif if (itimerfix(&aitv->it_value) || aitv->it_value.tv_sec > INT32_MAX / 2) return (EINVAL); if (!timevalisset(&aitv->it_value)) timevalclear(&aitv->it_interval); else if (itimerfix(&aitv->it_interval) || aitv->it_interval.tv_sec > INT32_MAX / 2) return (EINVAL); if (which == ITIMER_REAL) { PROC_LOCK(p); if (timevalisset(&p->p_realtimer.it_value)) callout_stop(&p->p_itcallout); microuptime(&ctv); if (timevalisset(&aitv->it_value)) { pr = tvtosbt(aitv->it_value) >> tc_precexp; timevaladd(&aitv->it_value, &ctv); sbt = tvtosbt(aitv->it_value); callout_reset_sbt(&p->p_itcallout, sbt, pr, realitexpire, p, C_ABSOLUTE); } *oitv = p->p_realtimer; p->p_realtimer = *aitv; PROC_UNLOCK(p); if (timevalisset(&oitv->it_value)) { if (timevalcmp(&oitv->it_value, &ctv, <)) timevalclear(&oitv->it_value); else timevalsub(&oitv->it_value, &ctv); } } else { if (aitv->it_interval.tv_sec == 0 && aitv->it_interval.tv_usec != 0 && aitv->it_interval.tv_usec < tick) aitv->it_interval.tv_usec = tick; if (aitv->it_value.tv_sec == 0 && aitv->it_value.tv_usec != 0 && aitv->it_value.tv_usec < tick) aitv->it_value.tv_usec = tick; PROC_ITIMLOCK(p); *oitv = p->p_stats->p_timer[which]; p->p_stats->p_timer[which] = *aitv; PROC_ITIMUNLOCK(p); } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktritimerval(oitv); #endif return (0); } static void realitexpire_reset_callout(struct proc *p, sbintime_t *isbtp) { sbintime_t prec; if ((p->p_flag & P_WEXIT) != 0) return; prec = isbtp == NULL ? 
tvtosbt(p->p_realtimer.it_interval) : *isbtp; callout_reset_sbt(&p->p_itcallout, tvtosbt(p->p_realtimer.it_value), prec >> tc_precexp, realitexpire, p, C_ABSOLUTE); } void itimer_proc_continue(struct proc *p) { struct timeval ctv; struct itimer *it; int id; PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag2 & P2_ITSTOPPED) != 0) { p->p_flag2 &= ~P2_ITSTOPPED; microuptime(&ctv); if (timevalcmp(&p->p_realtimer.it_value, &ctv, >=)) realitexpire(p); else realitexpire_reset_callout(p, NULL); } if (p->p_itimers != NULL) { for (id = 3; id < TIMER_MAX; id++) { it = p->p_itimers->its_timers[id]; if (it == NULL) continue; if ((it->it_flags & ITF_PSTOPPED) != 0) { ITIMER_LOCK(it); if ((it->it_flags & ITF_PSTOPPED) != 0) { it->it_flags &= ~ITF_PSTOPPED; if ((it->it_flags & ITF_DELETING) == 0) realtimer_expire_l(it, true); } ITIMER_UNLOCK(it); } } } } /* * Real interval timer expired: * send process whose timer expired an alarm signal. * If time is not set up to reload, then just return. * Else compute next time timer should go off which is > current time. * This is where delay in processing this timeout causes multiple * SIGALRM calls to be compressed into one. * tvtohz() always adds 1 to allow for the time until the next clock * interrupt being strictly less than 1 clock tick, but we don't want * that here since we want to appear to be in sync with the clock * interrupt even when we're delayed. */ static void realitexpire(void *arg) { struct proc *p; struct timeval ctv; sbintime_t isbt; p = (struct proc *)arg; kern_psignal(p, SIGALRM); if (!timevalisset(&p->p_realtimer.it_interval)) { timevalclear(&p->p_realtimer.it_value); return; } isbt = tvtosbt(p->p_realtimer.it_interval); if (isbt >= sbt_timethreshold) getmicrouptime(&ctv); else microuptime(&ctv); do { timevaladd(&p->p_realtimer.it_value, &p->p_realtimer.it_interval); } while (timevalcmp(&p->p_realtimer.it_value, &ctv, <=)); if (P_SHOULDSTOP(p) || P_KILLED(p)) { p->p_flag2 |= P2_ITSTOPPED; return; } p->p_flag2 &= ~P2_ITSTOPPED; realitexpire_reset_callout(p, &isbt); } /* * Check that a proposed value to load into the .it_value or * .it_interval part of an interval timer is acceptable, and * fix it to have at least minimal value (i.e. if it is less * than the resolution of the clock, round it up.) */ int itimerfix(struct timeval *tv) { if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000) return (EINVAL); if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < (u_int)tick / 16) tv->tv_usec = (u_int)tick / 16; return (0); } /* * Decrement an interval timer by a specified number * of microseconds, which must be less than a second, * i.e. < 1000000. If the timer expires, then reload * it. In this case, carry over (usec - old value) to * reduce the value reloaded into the timer so that * the timer does not drift. This routine assumes * that it is called in a context where the timers * on which it is operating cannot change in value. 
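 *
 * A worked example of the carry (numbers illustrative): with
 * .it_value = 300us left, .it_interval = 10000us and usec = 1000,
 * the timer expires 700us into the decrement, so the reload below
 * becomes
 *
 *	usec -= itp->it_value.tv_usec;		/* 1000 - 300 = 700 */
 *	itp->it_value = itp->it_interval;
 *	itp->it_value.tv_usec -= usec;		/* 10000 - 700 = 9300 */
 *
 * i.e. 9300us instead of a full interval, so the overshoot is not
 * added to the new period.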
*/ int itimerdecr(struct itimerval *itp, int usec) { if (itp->it_value.tv_usec < usec) { if (itp->it_value.tv_sec == 0) { /* expired, and already in next interval */ usec -= itp->it_value.tv_usec; goto expire; } itp->it_value.tv_usec += 1000000; itp->it_value.tv_sec--; } itp->it_value.tv_usec -= usec; usec = 0; if (timevalisset(&itp->it_value)) return (1); /* expired, exactly at end of interval */ expire: if (timevalisset(&itp->it_interval)) { itp->it_value = itp->it_interval; itp->it_value.tv_usec -= usec; if (itp->it_value.tv_usec < 0) { itp->it_value.tv_usec += 1000000; itp->it_value.tv_sec--; } } else itp->it_value.tv_usec = 0; /* sec is already 0 */ return (0); } /* * Add and subtract routines for timevals. * N.B.: subtract routine doesn't deal with * results which are before the beginning, * it just gets very confused in this case. * Caveat emptor. */ void timevaladd(struct timeval *t1, const struct timeval *t2) { t1->tv_sec += t2->tv_sec; t1->tv_usec += t2->tv_usec; timevalfix(t1); } void timevalsub(struct timeval *t1, const struct timeval *t2) { t1->tv_sec -= t2->tv_sec; t1->tv_usec -= t2->tv_usec; timevalfix(t1); } static void timevalfix(struct timeval *t1) { if (t1->tv_usec < 0) { t1->tv_sec--; t1->tv_usec += 1000000; } if (t1->tv_usec >= 1000000) { t1->tv_sec++; t1->tv_usec -= 1000000; } } /* * ratecheck(): simple time-based rate-limit checking. */ int ratecheck(struct timeval *lasttime, const struct timeval *mininterval) { struct timeval tv, delta; int rv = 0; getmicrouptime(&tv); /* NB: 10ms precision */ delta = tv; timevalsub(&delta, lasttime); /* * check for 0,0 is so that the message will be seen at least once, * even if interval is huge. */ if (timevalcmp(&delta, mininterval, >=) || (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) { *lasttime = tv; rv = 1; } return (rv); } /* * eventratecheck(): events per second limitation. * * Return 0 if the limit is to be enforced (e.g. the caller * should ignore the event because of the rate limitation). * * maxeps of 0 always causes zero to be returned. maxeps of -1 * always causes 1 to be returned; this effectively defeats rate * limiting. * * Note that we maintain the struct timeval for compatibility * with other bsd systems. We reuse the storage and just monitor * clock ticks for minimal overhead. */ int eventratecheck(struct timeval *lasttime, int *cureps, int maxeps) { int now; /* * Reset the last time and counter if this is the first call * or more than a second has passed since the last update of * lasttime. 
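 *
 * A minimal caller-side sketch (variable names hypothetical): keep
 * the state in statics and let eventratecheck() gate a noisy
 * diagnostic to at most 5 events per second:
 *
 *	static struct timeval ratelast;
 *	static int ratecur;
 *
 *	if (eventratecheck(&ratelast, &ratecur, 5))
 *		printf("event accepted\n");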
*/ now = ticks; if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) { lasttime->tv_sec = now; *cureps = 1; return (maxeps != 0); } else { (*cureps)++; /* NB: ignore potential overflow */ return (maxeps < 0 || *cureps <= maxeps); } } static void itimer_start(void) { static const struct kclock rt_clock = { .timer_create = realtimer_create, .timer_delete = realtimer_delete, .timer_settime = realtimer_settime, .timer_gettime = realtimer_gettime, }; itimer_zone = uma_zcreate("itimer", sizeof(struct itimer), NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0); register_posix_clock(CLOCK_REALTIME, &rt_clock); register_posix_clock(CLOCK_MONOTONIC, &rt_clock); + register_posix_clock(CLOCK_TAI, &rt_clock); p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L); p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX); p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX); } static int register_posix_clock(int clockid, const struct kclock *clk) { if ((unsigned)clockid >= MAX_CLOCKS) { printf("%s: invalid clockid\n", __func__); return (0); } posix_clocks[clockid] = *clk; return (1); } static int itimer_init(void *mem, int size, int flags) { struct itimer *it; it = (struct itimer *)mem; mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF); return (0); } static void itimer_fini(void *mem, int size) { struct itimer *it; it = (struct itimer *)mem; mtx_destroy(&it->it_mtx); } static void itimer_enter(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); it->it_usecount++; } static void itimer_leave(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); KASSERT(it->it_usecount > 0, ("invalid it_usecount")); if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0) wakeup(it); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_create_args { clockid_t clock_id; struct sigevent * evp; int * timerid; }; #endif int sys_ktimer_create(struct thread *td, struct ktimer_create_args *uap) { struct sigevent *evp, ev; int id; int error; if (uap->evp == NULL) { evp = NULL; } else { error = copyin(uap->evp, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } error = kern_ktimer_create(td, uap->clock_id, evp, &id, -1); if (error == 0) { error = copyout(&id, uap->timerid, sizeof(int)); if (error != 0) kern_ktimer_delete(td, id); } return (error); } int kern_ktimer_create(struct thread *td, clockid_t clock_id, struct sigevent *evp, int *timerid, int preset_id) { struct proc *p = td->td_proc; struct itimer *it; int id; int error; if (clock_id < 0 || clock_id >= MAX_CLOCKS) return (EINVAL); if (posix_clocks[clock_id].timer_create == NULL) return (EINVAL); if (evp != NULL) { if (evp->sigev_notify != SIGEV_NONE && evp->sigev_notify != SIGEV_SIGNAL && evp->sigev_notify != SIGEV_THREAD_ID) return (EINVAL); if ((evp->sigev_notify == SIGEV_SIGNAL || evp->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(evp->sigev_signo)) return (EINVAL); } if (p->p_itimers == NULL) itimers_alloc(p); it = uma_zalloc(itimer_zone, M_WAITOK); it->it_flags = 0; it->it_usecount = 0; timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); it->it_overrun = 0; it->it_overrun_last = 0; it->it_clockid = clock_id; it->it_proc = p; ksiginfo_init(&it->it_ksi); it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT; error = CLOCK_CALL(clock_id, timer_create, (it)); if (error != 0) goto out; PROC_LOCK(p); if (preset_id != -1) { KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id")); id = preset_id; if (p->p_itimers->its_timers[id] != NULL) { PROC_UNLOCK(p); error = 0; goto out; } } else { /* * Find a free timer slot, skipping those reserved * 
for setitimer(). */ for (id = 3; id < TIMER_MAX; id++) if (p->p_itimers->its_timers[id] == NULL) break; if (id == TIMER_MAX) { PROC_UNLOCK(p); error = EAGAIN; goto out; } } p->p_itimers->its_timers[id] = it; if (evp != NULL) it->it_sigev = *evp; else { it->it_sigev.sigev_notify = SIGEV_SIGNAL; switch (clock_id) { default: case CLOCK_REALTIME: + case CLOCK_TAI: it->it_sigev.sigev_signo = SIGALRM; break; case CLOCK_VIRTUAL: it->it_sigev.sigev_signo = SIGVTALRM; break; case CLOCK_PROF: it->it_sigev.sigev_signo = SIGPROF; break; } it->it_sigev.sigev_value.sival_int = id; } if (it->it_sigev.sigev_notify == SIGEV_SIGNAL || it->it_sigev.sigev_notify == SIGEV_THREAD_ID) { it->it_ksi.ksi_signo = it->it_sigev.sigev_signo; it->it_ksi.ksi_code = SI_TIMER; it->it_ksi.ksi_value = it->it_sigev.sigev_value; it->it_ksi.ksi_timerid = id; } PROC_UNLOCK(p); *timerid = id; return (0); out: ITIMER_LOCK(it); CLOCK_CALL(it->it_clockid, timer_delete, (it)); ITIMER_UNLOCK(it); uma_zfree(itimer_zone, it); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_delete_args { int timerid; }; #endif int sys_ktimer_delete(struct thread *td, struct ktimer_delete_args *uap) { return (kern_ktimer_delete(td, uap->timerid)); } static struct itimer * itimer_find(struct proc *p, int timerid) { struct itimer *it; PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_itimers == NULL) || (timerid < 0) || (timerid >= TIMER_MAX) || (it = p->p_itimers->its_timers[timerid]) == NULL) { return (NULL); } ITIMER_LOCK(it); if ((it->it_flags & ITF_DELETING) != 0) { ITIMER_UNLOCK(it); it = NULL; } return (it); } int kern_ktimer_delete(struct thread *td, int timerid) { struct proc *p = td->td_proc; struct itimer *it; PROC_LOCK(p); it = itimer_find(p, timerid); if (it == NULL) { PROC_UNLOCK(p); return (EINVAL); } PROC_UNLOCK(p); it->it_flags |= ITF_DELETING; while (it->it_usecount > 0) { it->it_flags |= ITF_WANTED; msleep(it, &it->it_mtx, PPAUSE, "itimer", 0); } it->it_flags &= ~ITF_WANTED; CLOCK_CALL(it->it_clockid, timer_delete, (it)); ITIMER_UNLOCK(it); PROC_LOCK(p); if (KSI_ONQ(&it->it_ksi)) sigqueue_take(&it->it_ksi); p->p_itimers->its_timers[timerid] = NULL; PROC_UNLOCK(p); uma_zfree(itimer_zone, it); return (0); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_settime_args { int timerid; int flags; const struct itimerspec * value; struct itimerspec * ovalue; }; #endif int sys_ktimer_settime(struct thread *td, struct ktimer_settime_args *uap) { struct itimerspec val, oval, *ovalp; int error; error = copyin(uap->value, &val, sizeof(val)); if (error != 0) return (error); ovalp = uap->ovalue != NULL ? 
&oval : NULL; error = kern_ktimer_settime(td, uap->timerid, uap->flags, &val, ovalp); if (error == 0 && uap->ovalue != NULL) error = copyout(ovalp, uap->ovalue, sizeof(*ovalp)); return (error); } int kern_ktimer_settime(struct thread *td, int timer_id, int flags, struct itimerspec *val, struct itimerspec *oval) { struct proc *p; struct itimer *it; int error; p = td->td_proc; PROC_LOCK(p); if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { PROC_UNLOCK(p); itimer_enter(it); error = CLOCK_CALL(it->it_clockid, timer_settime, (it, flags, val, oval)); itimer_leave(it); ITIMER_UNLOCK(it); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_gettime_args { int timerid; struct itimerspec * value; }; #endif int sys_ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap) { struct itimerspec val; int error; error = kern_ktimer_gettime(td, uap->timerid, &val); if (error == 0) error = copyout(&val, uap->value, sizeof(val)); return (error); } int kern_ktimer_gettime(struct thread *td, int timer_id, struct itimerspec *val) { struct proc *p; struct itimer *it; int error; p = td->td_proc; PROC_LOCK(p); if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { PROC_UNLOCK(p); itimer_enter(it); error = CLOCK_CALL(it->it_clockid, timer_gettime, (it, val)); itimer_leave(it); ITIMER_UNLOCK(it); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_getoverrun_args { int timerid; }; #endif int sys_ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap) { return (kern_ktimer_getoverrun(td, uap->timerid)); } int kern_ktimer_getoverrun(struct thread *td, int timer_id) { struct proc *p = td->td_proc; struct itimer *it; int error; PROC_LOCK(p); if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { td->td_retval[0] = it->it_overrun_last; ITIMER_UNLOCK(it); PROC_UNLOCK(p); error = 0; } return (error); } static int realtimer_create(struct itimer *it) { callout_init_mtx(&it->it_callout, &it->it_mtx, 0); return (0); } static int realtimer_delete(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); /* * clear timer's value and interval to tell realtimer_expire * to not rearm the timer.
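 *
 * The interlock is dropped around callout_drain() because draining
 * may sleep until a concurrently running realtimer_expire() returns,
 * and that handler acquires it_mtx itself (the callout was set up
 * with callout_init_mtx()); holding the mutex across the drain would
 * deadlock.  Clearing the value and interval first is what prevents
 * the expire handler from re-arming the callout in that window.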
*/ timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); ITIMER_UNLOCK(it); callout_drain(&it->it_callout); ITIMER_LOCK(it); return (0); } static int realtimer_gettime(struct itimer *it, struct itimerspec *ovalue) { struct timespec cts; + int err; mtx_assert(&it->it_mtx, MA_OWNED); - realtimer_clocktime(it->it_clockid, &cts); + err = kern_clock_gettime(curthread, it->it_clockid, &cts); + if (err) + return (err); + *ovalue = it->it_time; if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) { timespecsub(&ovalue->it_value, &cts, &ovalue->it_value); if (ovalue->it_value.tv_sec < 0 || (ovalue->it_value.tv_sec == 0 && ovalue->it_value.tv_nsec == 0)) { ovalue->it_value.tv_sec = 0; ovalue->it_value.tv_nsec = 1; } } return (0); } static int realtimer_settime(struct itimer *it, int flags, struct itimerspec *value, struct itimerspec *ovalue) { struct timespec cts, ts; struct timeval tv; struct itimerspec val; + int err; mtx_assert(&it->it_mtx, MA_OWNED); val = *value; if (itimespecfix(&val.it_value)) return (EINVAL); if (timespecisset(&val.it_value)) { if (itimespecfix(&val.it_interval)) return (EINVAL); } else { timespecclear(&val.it_interval); } if (ovalue != NULL) realtimer_gettime(it, ovalue); it->it_time = val; if (timespecisset(&val.it_value)) { - realtimer_clocktime(it->it_clockid, &cts); + err = kern_clock_gettime(curthread, it->it_clockid, &cts); + if (err) + return (err); + ts = val.it_value; if ((flags & TIMER_ABSTIME) == 0) { /* Convert to absolute time. */ timespecadd(&it->it_time.it_value, &cts, &it->it_time.it_value); } else { timespecsub(&ts, &cts, &ts); /* * We don't care if ts is negative, tvtohz() will * fix it. */ } TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } else { callout_stop(&it->it_callout); } return (0); } -static void -realtimer_clocktime(clockid_t id, struct timespec *ts) -{ - if (id == CLOCK_REALTIME) - getnanotime(ts); - else /* CLOCK_MONOTONIC */ - getnanouptime(ts); - } - int itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi) { struct itimer *it; PROC_LOCK_ASSERT(p, MA_OWNED); it = itimer_find(p, timerid); if (it != NULL) { ksi->ksi_overrun = it->it_overrun; it->it_overrun_last = it->it_overrun; it->it_overrun = 0; ITIMER_UNLOCK(it); return (0); } return (EINVAL); } static int itimespecfix(struct timespec *ts) { if (!timespecvalid_interval(ts)) return (EINVAL); if ((UINT64_MAX - ts->tv_nsec) / NS_PER_SEC < ts->tv_sec) return (EINVAL); if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000) ts->tv_nsec = tick * 1000; return (0); } #define timespectons(tsp) \ ((uint64_t)(tsp)->tv_sec * NS_PER_SEC + (tsp)->tv_nsec) #define timespecfromns(ns) (struct timespec){ \ .tv_sec = (ns) / NS_PER_SEC, \ .tv_nsec = (ns) % NS_PER_SEC \ } static void realtimer_expire_l(struct itimer *it, bool proc_locked) { struct timespec cts, ts; struct timeval tv; struct proc *p; uint64_t interval, now, overruns, value; + int err; + + err = kern_clock_gettime(curthread, it->it_clockid, &cts); - realtimer_clocktime(it->it_clockid, &cts); /* Only fire if time is reached.
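 *
 * Overrun accounting sketch (numbers illustrative): take
 * .it_interval = 10ms and an original .it_value of 100ms.  After the
 * unconditional advance by one interval, value = 110ms; a wakeup at
 * now = 134ms then means the 110, 120 and 130ms expirations were
 * missed as well:
 *
 *	overruns = howmany(now - value + 1, interval);		/* 3 */
 *	value = now + interval - (now - value) % interval;	/* 140ms */
 *
 * which is what timer_getoverrun() reports, clamped to INT_MAX, and
 * the next absolute expiration is realigned to the 140ms grid point.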
*/ - if (timespeccmp(&cts, &it->it_time.it_value, >=)) { + if (err == 0 && timespeccmp(&cts, &it->it_time.it_value, >=)) { if (timespecisset(&it->it_time.it_interval)) { timespecadd(&it->it_time.it_value, &it->it_time.it_interval, &it->it_time.it_value); interval = timespectons(&it->it_time.it_interval); value = timespectons(&it->it_time.it_value); now = timespectons(&cts); if (now >= value) { /* * We missed at least one period. */ overruns = howmany(now - value + 1, interval); if (it->it_overrun + overruns >= it->it_overrun && it->it_overrun + overruns <= INT_MAX) { it->it_overrun += (int)overruns; } else { it->it_overrun = INT_MAX; it->it_ksi.ksi_errno = ERANGE; } value = now + interval - (now - value) % interval; it->it_time.it_value = timespecfromns(value); } } else { /* single shot timer ? */ timespecclear(&it->it_time.it_value); } p = it->it_proc; if (timespecisset(&it->it_time.it_value)) { if (P_SHOULDSTOP(p) || P_KILLED(p)) { it->it_flags |= ITF_PSTOPPED; } else { timespecsub(&it->it_time.it_value, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } } itimer_enter(it); ITIMER_UNLOCK(it); if (proc_locked) PROC_UNLOCK(p); itimer_fire(it); if (proc_locked) PROC_LOCK(p); ITIMER_LOCK(it); itimer_leave(it); } else if (timespecisset(&it->it_time.it_value)) { p = it->it_proc; if (P_SHOULDSTOP(p) || P_KILLED(p)) { it->it_flags |= ITF_PSTOPPED; } else { ts = it->it_time.it_value; timespecsub(&ts, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } } } /* Timeout callback for realtime timer */ static void realtimer_expire(void *arg) { realtimer_expire_l(arg, false); } static void itimer_fire(struct itimer *it) { struct proc *p = it->it_proc; struct thread *td; if (it->it_sigev.sigev_notify == SIGEV_SIGNAL || it->it_sigev.sigev_notify == SIGEV_THREAD_ID) { if (sigev_findtd(p, &it->it_sigev, &td) != 0) { ITIMER_LOCK(it); timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); callout_stop(&it->it_callout); ITIMER_UNLOCK(it); return; } if (!KSI_ONQ(&it->it_ksi)) { it->it_ksi.ksi_errno = 0; ksiginfo_set_sigev(&it->it_ksi, &it->it_sigev); tdsendsignal(p, td, it->it_ksi.ksi_signo, &it->it_ksi); } else { if (it->it_overrun < INT_MAX) it->it_overrun++; else it->it_ksi.ksi_errno = ERANGE; } PROC_UNLOCK(p); } } static void itimers_alloc(struct proc *p) { struct itimers *its; its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO); PROC_LOCK(p); if (p->p_itimers == NULL) { p->p_itimers = its; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); free(its, M_SUBPROC); } } /* Clean up timers when some process events are being triggered. */ static void itimers_event_exit_exec(int start_idx, struct proc *p) { struct itimers *its; struct itimer *it; int i; its = p->p_itimers; if (its == NULL) return; for (i = start_idx; i < TIMER_MAX; ++i) { if ((it = its->its_timers[i]) != NULL) kern_ktimer_delete(curthread, i); } if (its->its_timers[0] == NULL && its->its_timers[1] == NULL && its->its_timers[2] == NULL) { /* Synchronize with itimer_proc_continue(). */ PROC_LOCK(p); p->p_itimers = NULL; PROC_UNLOCK(p); free(its, M_SUBPROC); } } void itimers_exec(struct proc *p) { /* * According to susv3, XSI interval timers should be inherited * by new image. 
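 *
 * Slots 0..2 of its_timers[] are the ones reserved for setitimer()
 * (ITIMER_REAL, ITIMER_VIRTUAL, ITIMER_PROF), whose XSI semantics
 * preserve them across exec; starting the sweep at index 3 deletes
 * only the timer_create() timers, which POSIX requires exec to
 * remove.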
*/ itimers_event_exit_exec(3, p); } void itimers_exit(struct proc *p) { itimers_event_exit_exec(0, p); } diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c index dc6fee1f8f38..905ebd4f98ac 100644 --- a/sys/kern/kern_umtx.c +++ b/sys/kern/kern_umtx.c @@ -1,5256 +1,5259 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015, 2016 The FreeBSD Foundation * Copyright (c) 2004, David Xu * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include "opt_umtx_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #endif #define _UMUTEX_TRY 1 #define _UMUTEX_WAIT 2 #ifdef UMTX_PROFILING #define UPROF_PERC_BIGGER(w, f, sw, sf) \ (((w) > (sw)) || ((w) == (sw) && (f) > (sf))) #endif #define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED) #ifdef INVARIANTS #define UMTXQ_ASSERT_LOCKED_BUSY(key) do { \ struct umtxq_chain *uc; \ \ uc = umtxq_getchain(key); \ mtx_assert(&uc->uc_lock, MA_OWNED); \ KASSERT(uc->uc_busy != 0, ("umtx chain is not busy")); \ } while (0) #else #define UMTXQ_ASSERT_LOCKED_BUSY(key) do {} while (0) #endif /* * Don't propagate time-sharing priority, there is a security reason, * a user can simply introduce PI-mutex, let thread A lock the mutex, * and let another thread B block on the mutex, because B is * sleeping, its priority will be boosted, this causes A's priority to * be boosted via priority propagating too and will never be lowered even * if it is using 100%CPU, this is unfair to other processes. */ #define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\ PRI_MAX_TIMESHARE : (td)->td_user_pri) #define GOLDEN_RATIO_PRIME 2654404609U #ifndef UMTX_CHAINS #define UMTX_CHAINS 512 #endif #define UMTX_SHIFTS (__WORD_BIT - 9) #define GET_SHARE(flags) \ (((flags) & USYNC_PROCESS_SHARED) == 0 ? 
THREAD_SHARE : PROCESS_SHARE) #define BUSY_SPINS 200 struct umtx_copyops { int (*copyin_timeout)(const void *uaddr, struct timespec *tsp); int (*copyin_umtx_time)(const void *uaddr, size_t size, struct _umtx_time *tp); int (*copyin_robust_lists)(const void *uaddr, size_t size, struct umtx_robust_lists_params *rbp); int (*copyout_timeout)(void *uaddr, size_t size, struct timespec *tsp); const size_t timespec_sz; const size_t umtx_time_sz; const bool compat32; }; _Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32"); _Static_assert(__offsetof(struct umutex, m_spare[0]) == __offsetof(struct umutex32, m_spare[0]), "m_spare32"); int umtx_shm_vnobj_persistent = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN, &umtx_shm_vnobj_persistent, 0, "False forces destruction of umtx attached to file, on last close"); static int umtx_max_rb = 1000; SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN, &umtx_max_rb, 0, "Maximum number of robust mutexes allowed for each thread"); static uma_zone_t umtx_pi_zone; static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS]; static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory"); static int umtx_pi_allocated; static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "umtx debug"); SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD, &umtx_pi_allocated, 0, "Allocated umtx_pi"); static int umtx_verbose_rb = 1; SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN, &umtx_verbose_rb, 0, ""); #ifdef UMTX_PROFILING static long max_length; SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length"); static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "umtx chain stats"); #endif static inline void umtx_abs_timeout_init2(struct umtx_abs_timeout *timo, const struct _umtx_time *umtxtime); static void umtx_shm_init(void); static void umtxq_sysinit(void *); static void umtxq_hash(struct umtx_key *key); static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb); static void umtx_thread_cleanup(struct thread *td); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); #define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE) static struct mtx umtx_lock; #ifdef UMTX_PROFILING static void umtx_init_profiling(void) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < UMTX_CHAINS; ++i) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO, chain_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "umtx hash stats"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL); } } static int sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS) { char buf[512]; struct sbuf sb; struct umtxq_chain *uc; u_int fract, i, j, tot, whole; u_int sf0, sf1, sf2, sf3, sf4; u_int si0, si1, si2, si3, si4; u_int sw0, sw1, sw2, sw3, sw4; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < 2; i++) { tot = 0; for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); tot += uc->max_length; mtx_unlock(&uc->uc_lock); } if (tot == 0) sbuf_printf(&sb, "%u) Empty ", i); else { sf0 = sf1 = sf2 = sf3 = sf4 = 0; si0 = si1 = si2 = si3 = si4 = 0; sw0 = 
sw1 = sw2 = sw3 = sw4 = 0; for (j = 0; j < UMTX_CHAINS; j++) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); whole = uc->max_length * 100; mtx_unlock(&uc->uc_lock); fract = (whole % tot) * 100; if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) { sf0 = fract; si0 = j; sw0 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw1, sf1)) { sf1 = fract; si1 = j; sw1 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw2, sf2)) { sf2 = fract; si2 = j; sw2 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw3, sf3)) { sf3 = fract; si3 = j; sw3 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw4, sf4)) { sf4 = fract; si4 = j; sw4 = whole; } } sbuf_printf(&sb, "queue %u:\n", i); sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot, sf0 / tot, si0); sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot, sf1 / tot, si1); sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot, sf2 / tot, si2); sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot, sf3 / tot, si3); sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot, sf4 / tot, si4); } } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } static int sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS) { struct umtxq_chain *uc; u_int i, j; int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (clear != 0) { for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); uc->length = 0; uc->max_length = 0; mtx_unlock(&uc->uc_lock); } } } return (0); } SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics"); SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length"); #endif static void umtxq_sysinit(void *arg __unused) { int i, j; umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL, MTX_DEF | MTX_DUPOK); LIST_INIT(&umtxq_chains[i][j].uc_queue[0]); LIST_INIT(&umtxq_chains[i][j].uc_queue[1]); LIST_INIT(&umtxq_chains[i][j].uc_spare_queue); TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list); umtxq_chains[i][j].uc_busy = 0; umtxq_chains[i][j].uc_waiters = 0; #ifdef UMTX_PROFILING umtxq_chains[i][j].length = 0; umtxq_chains[i][j].max_length = 0; #endif } } #ifdef UMTX_PROFILING umtx_init_profiling(); #endif mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF); umtx_shm_init(); } struct umtx_q * umtxq_alloc(void) { struct umtx_q *uq; uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO); uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO); TAILQ_INIT(&uq->uq_spare_queue->head); TAILQ_INIT(&uq->uq_pi_contested); uq->uq_inherited_pri = PRI_MAX; return (uq); } void umtxq_free(struct umtx_q *uq) { MPASS(uq->uq_spare_queue != NULL); free(uq->uq_spare_queue, M_UMTX); free(uq, M_UMTX); } static inline void umtxq_hash(struct umtx_key *key) { unsigned n; n = (uintptr_t)key->info.both.a + key->info.both.b; key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS; } struct umtxq_chain * umtxq_getchain(struct umtx_key *key) { if (key->type <= TYPE_SEM) return (&umtxq_chains[1][key->hash]); return (&umtxq_chains[0][key->hash]); } /* * Set 
chain to busy state when following operation * may be blocked (kernel mutex can not be used). */ void umtxq_busy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); if (uc->uc_busy) { #ifdef SMP if (smp_cpus > 1) { int count = BUSY_SPINS; if (count > 0) { umtxq_unlock(key); while (uc->uc_busy && --count > 0) cpu_spinwait(); umtxq_lock(key); } } #endif while (uc->uc_busy) { uc->uc_waiters++; msleep(uc, &uc->uc_lock, 0, "umtxqb", 0); uc->uc_waiters--; } } uc->uc_busy = 1; } /* * Unbusy a chain. */ void umtxq_unbusy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); KASSERT(uc->uc_busy != 0, ("not busy")); uc->uc_busy = 0; if (uc->uc_waiters) wakeup_one(uc); } void umtxq_busy_unlocked(struct umtx_key *key) { umtxq_lock(key); umtxq_busy(key); umtxq_unlock(key); } void umtxq_unbusy_unlocked(struct umtx_key *key) { umtxq_lock(key); umtxq_unbusy(key); umtxq_unlock(key); } static struct umtxq_queue * umtxq_queue_lookup(struct umtx_key *key, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); LIST_FOREACH(uh, &uc->uc_queue[q], link) { if (umtx_key_match(&uh->key, key)) return (uh); } return (NULL); } void umtxq_insert_queue(struct umtx_q *uq, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue")); uh = umtxq_queue_lookup(&uq->uq_key, q); if (uh != NULL) { LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link); } else { uh = uq->uq_spare_queue; uh->key = uq->uq_key; LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link); #ifdef UMTX_PROFILING uc->length++; if (uc->length > uc->max_length) { uc->max_length = uc->length; if (uc->max_length > max_length) max_length = uc->max_length; } #endif } uq->uq_spare_queue = NULL; TAILQ_INSERT_TAIL(&uh->head, uq, uq_link); uh->length++; uq->uq_flags |= UQF_UMTXQ; uq->uq_cur_queue = uh; return; } void umtxq_remove_queue(struct umtx_q *uq, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); if (uq->uq_flags & UQF_UMTXQ) { uh = uq->uq_cur_queue; TAILQ_REMOVE(&uh->head, uq, uq_link); uh->length--; uq->uq_flags &= ~UQF_UMTXQ; if (TAILQ_EMPTY(&uh->head)) { KASSERT(uh->length == 0, ("inconsistent umtxq_queue length")); #ifdef UMTX_PROFILING uc->length--; #endif LIST_REMOVE(uh, link); } else { uh = LIST_FIRST(&uc->uc_spare_queue); KASSERT(uh != NULL, ("uc_spare_queue is empty")); LIST_REMOVE(uh, link); } uq->uq_spare_queue = uh; uq->uq_cur_queue = NULL; } } /* * Check if there are multiple waiters */ int umtxq_count(struct umtx_key *key) { struct umtxq_queue *uh; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) return (uh->length); return (0); } /* * Check if there are multiple PI waiters and returns first * waiter. */ static int umtxq_count_pi(struct umtx_key *key, struct umtx_q **first) { struct umtxq_queue *uh; *first = NULL; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) { *first = TAILQ_FIRST(&uh->head); return (uh->length); } return (0); } /* * Wake up threads waiting on an userland object by a bit mask. 
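 *
 * Bitset matching sketch: a waiter sleeps with the mask it passed in
 * uq_bitset, and a signal carrying bitset B wakes it only if the two
 * masks intersect, e.g. for a waiter with uq_bitset = 0x4:
 *
 *	(0x4 & 0x6) != 0	->	woken
 *	(0x4 & 0x3) == 0	->	skipped
 *
 * mirroring the (uq->uq_bitset & bitset) == 0 test below.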
*/ int umtxq_signal_mask(struct umtx_key *key, int n_wake, u_int bitset) { struct umtxq_queue *uh; struct umtx_q *uq, *uq_temp; int ret; ret = 0; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh == NULL) return (0); TAILQ_FOREACH_SAFE(uq, &uh->head, uq_link, uq_temp) { if ((uq->uq_bitset & bitset) == 0) continue; umtxq_remove_queue(uq, UMTX_SHARED_QUEUE); wakeup_one(uq); if (++ret >= n_wake) break; } return (ret); } /* * Wake up threads waiting on an userland object. */ static int umtxq_signal_queue(struct umtx_key *key, int n_wake, int q) { struct umtxq_queue *uh; struct umtx_q *uq; int ret; ret = 0; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); uh = umtxq_queue_lookup(key, q); if (uh != NULL) { while ((uq = TAILQ_FIRST(&uh->head)) != NULL) { umtxq_remove_queue(uq, q); wakeup(uq); if (++ret >= n_wake) return (ret); } } return (ret); } /* * Wake up specified thread. */ static inline void umtxq_signal_thread(struct umtx_q *uq) { UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key)); umtxq_remove(uq); wakeup(uq); } /* * Wake up a maximum of n_wake threads that are waiting on an userland * object identified by key. The remaining threads are removed from queue * identified by key and added to the queue identified by key2 (requeued). * The n_requeue specifies an upper limit on the number of threads that * are requeued to the second queue. */ int umtxq_requeue(struct umtx_key *key, int n_wake, struct umtx_key *key2, int n_requeue) { struct umtxq_queue *uh; struct umtx_q *uq, *uq_temp; int ret; ret = 0; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); UMTXQ_LOCKED_ASSERT(umtxq_getchain(key2)); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh == NULL) return (0); TAILQ_FOREACH_SAFE(uq, &uh->head, uq_link, uq_temp) { if (++ret <= n_wake) { umtxq_remove(uq); wakeup_one(uq); } else { umtxq_remove(uq); uq->uq_key = *key2; umtxq_insert(uq); if (ret - n_wake == n_requeue) break; } } return (ret); } static inline int tstohz(const struct timespec *tsp) { struct timeval tv; TIMESPEC_TO_TIMEVAL(&tv, tsp); return tvtohz(&tv); } void umtx_abs_timeout_init(struct umtx_abs_timeout *timo, int clockid, int absolute, const struct timespec *timeout) { timo->clockid = clockid; if (!absolute) { timo->is_abs_real = false; kern_clock_gettime(curthread, timo->clockid, &timo->cur); timespecadd(&timo->cur, timeout, &timo->end); } else { timo->end = *timeout; timo->is_abs_real = clockid == CLOCK_REALTIME || clockid == CLOCK_REALTIME_FAST || clockid == CLOCK_REALTIME_PRECISE || + clockid == CLOCK_TAI || clockid == CLOCK_SECOND; } } static void umtx_abs_timeout_init2(struct umtx_abs_timeout *timo, const struct _umtx_time *umtxtime) { umtx_abs_timeout_init(timo, umtxtime->_clockid, (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout); } static void umtx_abs_timeout_enforce_min(sbintime_t *sbt) { sbintime_t when, mint; mint = curproc->p_umtx_min_timeout; if (__predict_false(mint != 0)) { when = sbinuptime() + mint; if (*sbt < when) *sbt = when; } } static int umtx_abs_timeout_getsbt(struct umtx_abs_timeout *timo, sbintime_t *sbt, int *flags) { struct bintime bt, bbt; struct timespec tts; sbintime_t rem; switch (timo->clockid) { /* Clocks that can be converted into absolute time. 
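 *
 * Conversion sketch: callout(9) deadlines live on the uptime-based
 * sbintime_t scale, so a CLOCK_REALTIME-family deadline is rebased
 * by subtracting the boot time, roughly
 *
 *	timespec2bintime(&timo->end, &bt);
 *	getboottimebin(&bbt);
 *	bintime_sub(&bt, &bbt);
 *	*sbt = bttosbt(bt);		/* now uptime-relative */
 *
 * CLOCK_TAI cannot be rebased this way because its offset from the
 * boot time is not stable, so it is handled by the polled path
 * together with the CPU-time clocks.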
*/ case CLOCK_REALTIME: case CLOCK_REALTIME_PRECISE: case CLOCK_REALTIME_FAST: case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_PRECISE: case CLOCK_MONOTONIC_FAST: case CLOCK_UPTIME: case CLOCK_UPTIME_PRECISE: case CLOCK_UPTIME_FAST: case CLOCK_SECOND: timespec2bintime(&timo->end, &bt); switch (timo->clockid) { case CLOCK_REALTIME: case CLOCK_REALTIME_PRECISE: case CLOCK_REALTIME_FAST: case CLOCK_SECOND: getboottimebin(&bbt); bintime_sub(&bt, &bbt); break; } if (bt.sec < 0) return (ETIMEDOUT); if (bt.sec >= (SBT_MAX >> 32)) { *sbt = 0; *flags = 0; return (0); } *sbt = bttosbt(bt); umtx_abs_timeout_enforce_min(sbt); /* * Check if the absolute time should be aligned to * avoid firing multiple timer events in non-periodic * timer mode. */ switch (timo->clockid) { case CLOCK_REALTIME_FAST: case CLOCK_MONOTONIC_FAST: case CLOCK_UPTIME_FAST: rem = *sbt % tc_tick_sbt; if (__predict_true(rem != 0)) *sbt += tc_tick_sbt - rem; break; case CLOCK_SECOND: rem = *sbt % SBT_1S; if (__predict_true(rem != 0)) *sbt += SBT_1S - rem; break; } *flags = C_ABSOLUTE; return (0); /* Clocks that has to be periodically polled. */ case CLOCK_VIRTUAL: case CLOCK_PROF: case CLOCK_THREAD_CPUTIME_ID: case CLOCK_PROCESS_CPUTIME_ID: + case CLOCK_TAI: /* Boot time is not necessarily stable in TAI */ default: kern_clock_gettime(curthread, timo->clockid, &timo->cur); if (timespeccmp(&timo->end, &timo->cur, <=)) return (ETIMEDOUT); timespecsub(&timo->end, &timo->cur, &tts); *sbt = tick_sbt * tstohz(&tts); *flags = C_HARDCLOCK; return (0); } } static uint32_t umtx_unlock_val(uint32_t flags, bool rb) { if (rb) return (UMUTEX_RB_OWNERDEAD); else if ((flags & UMUTEX_NONCONSISTENT) != 0) return (UMUTEX_RB_NOTRECOV); else return (UMUTEX_UNOWNED); } /* * Put thread into sleep state, before sleeping, check if * thread was removed from umtx queue. */ int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct umtx_abs_timeout *timo) { struct umtxq_chain *uc; sbintime_t sbt = 0; int error, flags = 0; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); for (;;) { if (!(uq->uq_flags & UQF_UMTXQ)) { error = 0; break; } if (timo != NULL) { if (timo->is_abs_real) curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation); error = umtx_abs_timeout_getsbt(timo, &sbt, &flags); if (error != 0) break; } error = msleep_sbt(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, sbt, 0, flags); uc = umtxq_getchain(&uq->uq_key); mtx_lock(&uc->uc_lock); if (error == EINTR || error == ERESTART) break; if (error == EWOULDBLOCK && (flags & C_ABSOLUTE) != 0) { error = ETIMEDOUT; break; } } curthread->td_rtcgen = 0; return (error); } /* * Convert userspace address into unique logical address. 
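 *
 * Key composition sketch: a THREAD_SHARE key is the pair
 * (p_vmspace, userspace address), unique within one process; a
 * PROCESS_SHARE key (or an AUTO_SHARE key on a shared mapping) is
 * (backing vm_object, offset within the object), so every mapping of
 * the same shared page resolves to the same sleep queue:
 *
 *	private: { .vs = p->p_vmspace, .addr = (uintptr_t)addr }
 *	shared:  { .object, .offset = addr - entry->start +
 *		   entry->offset }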
*/ int umtx_key_get(const void *addr, int type, int share, struct umtx_key *key) { struct thread *td = curthread; vm_map_t map; vm_map_entry_t entry; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; key->type = type; if (share == THREAD_SHARE) { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } else { MPASS(share == PROCESS_SHARE || share == AUTO_SHARE); map = &td->td_proc->p_vmspace->vm_map; if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE, &entry, &key->info.shared.object, &pindex, &prot, &wired) != KERN_SUCCESS) { return (EFAULT); } if ((share == PROCESS_SHARE) || (share == AUTO_SHARE && VM_INHERIT_SHARE == entry->inheritance)) { key->shared = 1; key->info.shared.offset = (vm_offset_t)addr - entry->start + entry->offset; vm_object_reference(key->info.shared.object); } else { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } vm_map_lookup_done(map, entry); } umtxq_hash(key); return (0); } /* * Release key. */ void umtx_key_release(struct umtx_key *key) { if (key->shared) vm_object_deallocate(key->info.shared.object); } #ifdef COMPAT_FREEBSD10 /* * Lock a umtx object. */ static int do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, const struct timespec *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; u_long owner; u_long old; int error = 0; uq = td->td_umtxq; if (timeout != NULL) umtx_abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id); /* The acquire succeeded. */ if (owner == UMTX_UNOWNED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); /* If no one owns it but it is contested try to acquire it. */ if (owner == UMTX_CONTESTED) { owner = casuword(&umtx->u_owner, UMTX_CONTESTED, id | UMTX_CONTESTED); if (owner == UMTX_CONTESTED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); error = thread_check_susp(td, false); if (error != 0) break; /* If this failed the lock has changed, restart. */ continue; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE, &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED); /* The address was invalid. */ if (old == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = thread_check_susp(td, false); } if (timeout == NULL) { /* Mutex locking is restarted if it is interrupted. */ if (error == EINTR) error = ERESTART; } else { /* Timed-locking is not restarted. 
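 *
 * The distinction matters for restart semantics: an untimed lock may
 * be transparently re-executed after a signal (ERESTART restarts the
 * syscall with unchanged arguments), but re-executing a timed lock
 * with its original relative timeout would silently extend the total
 * wait, so the caller must see EINTR and recompute the time left
 * itself.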
*/ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a umtx object. */ static int do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id) { struct umtx_key key; u_long owner; u_long old; int error; int count; /* * Make sure we own this mtx. */ owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner)); if (owner == -1) return (EFAULT); if ((owner & ~UMTX_CONTESTED) != id) return (EPERM); /* This should be done in userland */ if ((owner & UMTX_CONTESTED) == 0) { old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED); if (old == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE, &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ old = casuword(&umtx->u_owner, owner, count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED); umtxq_lock(&key); umtxq_signal(&key,1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (old == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } #ifdef COMPAT_FREEBSD32 /* * Lock a umtx object. */ static int do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, const struct timespec *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t owner; uint32_t old; int error = 0; uq = td->td_umtxq; if (timeout != NULL) umtx_abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ owner = casuword32(m, UMUTEX_UNOWNED, id); /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED) { owner = casuword32(m, UMUTEX_CONTESTED, id | UMUTEX_CONTESTED); if (owner == UMUTEX_CONTESTED) return (0); /* The address was invalid. */ if (owner == -1) return (EFAULT); error = thread_check_susp(td, false); if (error != 0) break; /* If this failed the lock has changed, restart. */ continue; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE, &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ old = casuword32(m, owner, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (old == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtx", timeout == NULL ? 
NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = thread_check_susp(td, false); } if (timeout == NULL) { /* Mutex locking is restarted if it is interrupted. */ if (error == EINTR) error = ERESTART; } else { /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a umtx object. */ static int do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id) { struct umtx_key key; uint32_t owner; uint32_t old; int error; int count; /* * Make sure we own this mtx. */ owner = fuword32(m); if (owner == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { old = casuword32(m, owner, UMUTEX_UNOWNED); if (old == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE, &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ old = casuword32(m, owner, count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED); umtxq_lock(&key); umtxq_signal(&key,1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (old == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } #endif /* COMPAT_FREEBSD32 */ #endif /* COMPAT_FREEBSD10 */ /* * Fetch and compare value, sleep on the address if value is not changed. */ static int do_wait(struct thread *td, void *addr, u_long id, struct _umtx_time *timeout, int compat32, int is_private) { struct umtx_abs_timeout timo; struct umtx_q *uq; u_long tmp; uint32_t tmp32; int error = 0; uq = td->td_umtxq; if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); if (compat32 == 0) { error = fueword(addr, &tmp); if (error != 0) error = EFAULT; } else { error = fueword32(addr, &tmp32); if (error == 0) tmp = tmp32; else error = EFAULT; } umtxq_lock(&uq->uq_key); if (error == 0) { if (tmp == id) error = umtxq_sleep(uq, "uwait", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else umtxq_remove(uq); } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { umtxq_remove(uq); } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } /* * Wake up threads sleeping on the specified address. */ int kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private) { struct umtx_key key; int ret; if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0) return (ret); umtxq_lock(&key); umtxq_signal(&key, n_wake); umtxq_unlock(&key); umtx_key_release(&key); return (0); } /* * Lock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int mode) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t owner, old, id; int error, rv; id = td->td_tid; uq = td->td_umtxq; error = 0; if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); /* * Care must be exercised when dealing with umtx structure. 
It * can fault on any access. */ for (;;) { rv = fueword32(&m->m_owner, &owner); if (rv == -1) return (EFAULT); if (mode == _UMUTEX_WAIT) { if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV) return (0); } else { /* * Robust mutex terminated. Kernel duty is to * return EOWNERDEAD to the userspace. The * umutex.m_flags UMUTEX_NONCONSISTENT is set * by the common userspace code. */ if (owner == UMUTEX_RB_OWNERDEAD) { rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD, &owner, id | UMUTEX_CONTESTED); if (rv == -1) return (EFAULT); if (rv == 0) { MPASS(owner == UMUTEX_RB_OWNERDEAD); return (EOWNERDEAD); /* success */ } MPASS(rv == 1); rv = thread_check_susp(td, false); if (rv != 0) return (rv); continue; } if (owner == UMUTEX_RB_NOTRECOV) return (ENOTRECOVERABLE); /* * Try the uncontested case. This should be * done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) return (EFAULT); /* The acquire succeeded. */ if (rv == 0) { MPASS(owner == UMUTEX_UNOWNED); return (0); } /* * If no one owns it but it is contested try * to acquire it. */ MPASS(rv == 1); if (owner == UMUTEX_CONTESTED) { rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) return (EFAULT); if (rv == 0) { MPASS(owner == UMUTEX_CONTESTED); return (0); } if (rv == 1) { rv = thread_check_susp(td, false); if (rv != 0) return (rv); } /* * If this failed the lock has * changed, restart. */ continue; } /* rv == 1 but not contested, likely store failure */ rv = thread_check_susp(td, false); if (rv != 0) return (rv); } if (mode == _UMUTEX_TRY) return (EBUSY); /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid or casueword failed to store. */ if (rv == -1 || rv == 1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (rv == -1) return (EFAULT); if (rv == 1) { rv = thread_check_susp(td, false); if (rv != 0) return (rv); } continue; } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); MPASS(old == owner); error = umtxq_sleep(uq, "umtxn", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = thread_check_susp(td, false); } return (0); } /* * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; uint32_t owner, old, id, newlock; int error, count; id = td->td_tid; again: /* * Make sure we own this mtx. 
*/ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); newlock = umtx_unlock_val(flags, rb); if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, newlock); if (error == -1) return (EFAULT); if (error == 1) { error = thread_check_susp(td, false); if (error != 0) return (error); goto again; } MPASS(old == owner); return (0); } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ if (count > 1) newlock |= UMUTEX_CONTESTED; error = casueword32(&m->m_owner, owner, &old, newlock); umtxq_lock(&key); umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (error == 1) { if (old != owner) return (EINVAL); error = thread_check_susp(td, false); if (error != 0) return (error); goto again; } return (0); } /* * Check if the mutex is available and wake up a waiter, * only for simple mutex. */ static int do_wake_umutex(struct thread *td, struct umutex *m) { struct umtx_key key; uint32_t owner; uint32_t flags; int error; int count; again: error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD && owner != UMUTEX_RB_NOTRECOV) return (0); error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD && owner != UMUTEX_RB_NOTRECOV) { error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, UMUTEX_UNOWNED); if (error == -1) { error = EFAULT; } else if (error == 1) { umtxq_lock(&key); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); error = thread_check_susp(td, false); if (error != 0) return (error); goto again; } } umtxq_lock(&key); if (error == 0 && count != 0) { MPASS((owner & ~UMUTEX_CONTESTED) == 0 || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV); umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } /* * Check if the mutex has waiters and tries to fix contention bit. 
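 *
 * The race being repaired: a waiter can enqueue itself in the kernel
 * just after the owner sampled m_owner without UMUTEX_CONTESTED set,
 * so the owner's unlock takes the userspace fast path and never
 * enters the kernel to issue a wakeup.  Re-asserting the bit while
 * waiters exist,
 *
 *	casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED);
 *
 * forces the next unlock through the slow path, where the waiter is
 * signalled.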
*/ static int do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; uint32_t owner, old; int type; int error; int count; switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST)) { case 0: case UMUTEX_ROBUST: type = TYPE_NORMAL_UMUTEX; break; case UMUTEX_PRIO_INHERIT: type = TYPE_PI_UMUTEX; break; case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST): type = TYPE_PI_ROBUST_UMUTEX; break; case UMUTEX_PRIO_PROTECT: type = TYPE_PP_UMUTEX; break; case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST): type = TYPE_PP_ROBUST_UMUTEX; break; default: return (EINVAL); } if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0) return (error); owner = 0; umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; /* * Only repair contention bit if there is a waiter, this means * the mutex is still being referenced by userland code, * otherwise don't update any memory. */ while (error == 0 && (owner & UMUTEX_CONTESTED) == 0 && (count > 1 || (count == 1 && (owner & ~UMUTEX_CONTESTED) != 0))) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (error == 0) { MPASS(old == owner); break; } owner = old; error = thread_check_susp(td, false); } umtxq_lock(&key); if (error == EFAULT) { umtxq_signal(&key, INT_MAX); } else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV)) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } struct umtx_pi * umtx_pi_alloc(int flags) { struct umtx_pi *pi; pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags); if (pi == NULL) return (NULL); TAILQ_INIT(&pi->pi_blocked); atomic_add_int(&umtx_pi_allocated, 1); return (pi); } void umtx_pi_free(struct umtx_pi *pi) { uma_zfree(umtx_pi_zone, pi); atomic_add_int(&umtx_pi_allocated, -1); } /* * Adjust the thread's position on a pi_state after its priority has been * changed. */ static int umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td) { struct umtx_q *uq, *uq1, *uq2; struct thread *td1; mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (0); uq = td->td_umtxq; /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq); uq2 = TAILQ_NEXT(uq, uq_lockq); if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) || (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) { /* * Remove thread from blocked chain and determine where * it should be moved to. */ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { td1 = uq1->uq_thread; MPASS(td1->td_proc->p_magic == P_MAGIC); if (UPRI(td1) > UPRI(td)) break; } if (uq1 == NULL) TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); else TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); } return (1); } static struct umtx_pi * umtx_pi_next(struct umtx_pi *pi) { struct umtx_q *uq_owner; if (pi->pi_owner == NULL) return (NULL); uq_owner = pi->pi_owner->td_umtxq; if (uq_owner == NULL) return (NULL); return (uq_owner->uq_pi_blocked); } /* * Floyd's Cycle-Finding Algorithm. 
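 *
 * Tortoise-and-hare sketch of the loop below, with next() standing
 * in for umtx_pi_next() (the slow iterator advances one owner link
 * per round, the fast one two):
 *
 *	slow = fast = head;
 *	for (;;) {
 *		slow = next(slow);
 *		fast = next(fast);
 *		if (fast != NULL)
 *			fast = next(fast);
 *		if (slow == NULL || fast == NULL)
 *			return (false);	/* chain ends, no cycle */
 *		if (slow == fast)
 *			return (true);	/* hare lapped the tortoise */
 *	}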
*/ static bool umtx_pi_check_loop(struct umtx_pi *pi) { struct umtx_pi *pi1; /* fast iterator */ mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (false); pi1 = pi; for (;;) { pi = umtx_pi_next(pi); if (pi == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; if (pi == pi1) return (true); } return (false); } /* * Propagate priority when a thread is blocked on a POSIX * PI mutex. */ static void umtx_propagate_priority(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; int pri; mtx_assert(&umtx_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) return; if (umtx_pi_check_loop(pi)) return; for (;;) { td = pi->pi_owner; if (td == NULL || td == curthread) return; MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); thread_lock(td); if (td->td_lend_user_pri > pri) sched_lend_user_prio(td, pri); else { thread_unlock(td); break; } thread_unlock(td); /* * Pick up the lock that td is blocked on. */ uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) break; /* Resort td on the list if needed. */ umtx_pi_adjust_thread(pi, td); } } /* * Unpropagate priority for a PI mutex when a thread blocked on * it is interrupted by a signal or resumed by another thread. */ static void umtx_repropagate_priority(struct umtx_pi *pi) { struct umtx_q *uq, *uq_owner; struct umtx_pi *pi2; int pri; mtx_assert(&umtx_lock, MA_OWNED); if (umtx_pi_check_loop(pi)) return; while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; uq_owner = pi->pi_owner->td_umtxq; TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) { uq = TAILQ_FIRST(&pi2->pi_blocked); if (uq != NULL) { if (pri > UPRI(uq->uq_thread)) pri = UPRI(uq->uq_thread); } } if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; thread_lock(pi->pi_owner); sched_lend_user_prio(pi->pi_owner, pri); thread_unlock(pi->pi_owner); if ((pi = uq_owner->uq_pi_blocked) != NULL) umtx_pi_adjust_thread(pi, uq_owner->uq_thread); } } /* * Insert a PI mutex into the owned list. */ static void umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; mtx_assert(&umtx_lock, MA_OWNED); MPASS(pi->pi_owner == NULL); pi->pi_owner = owner; TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link); } /* * Disown a PI mutex, and remove it from the owned list. */ static void umtx_pi_disown(struct umtx_pi *pi) { mtx_assert(&umtx_lock, MA_OWNED); TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); pi->pi_owner = NULL; } /* * Claim ownership of a PI mutex. */ int umtx_pi_claim(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq; int pri; mtx_lock(&umtx_lock); if (pi->pi_owner == owner) { mtx_unlock(&umtx_lock); return (0); } if (pi->pi_owner != NULL) { /* * userland may have already messed up the mutex, sigh. */ mtx_unlock(&umtx_lock); return (EPERM); } umtx_pi_setowner(pi, owner); uq = TAILQ_FIRST(&pi->pi_blocked); if (uq != NULL) { pri = UPRI(uq->uq_thread); thread_lock(owner); if (pri < UPRI(owner)) sched_lend_user_prio(owner, pri); thread_unlock(owner); } mtx_unlock(&umtx_lock); return (0); } /* * Adjust a thread's position in the queue of the PI mutex it is * blocked on; this may trigger a new round of priority propagation. */ void umtx_pi_adjust(struct thread *td, u_char oldpri) { struct umtx_q *uq; struct umtx_pi *pi; uq = td->td_umtxq; mtx_lock(&umtx_lock); /* * Pick up the lock that td is blocked on.
*/ pi = uq->uq_pi_blocked; if (pi != NULL) { umtx_pi_adjust_thread(pi, td); umtx_repropagate_priority(pi); } mtx_unlock(&umtx_lock); } /* * Sleep on a PI mutex. */ int umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner, const char *wmesg, struct umtx_abs_timeout *timo, bool shared) { struct thread *td, *td1; struct umtx_q *uq1; int error, pri; #ifdef INVARIANTS struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); #endif error = 0; td = uq->uq_thread; KASSERT(td == curthread, ("inconsistent uq_thread")); UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key)); KASSERT(uc->uc_busy != 0, ("umtx chain is not busy")); umtxq_insert(uq); mtx_lock(&umtx_lock); if (pi->pi_owner == NULL) { mtx_unlock(&umtx_lock); td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid); mtx_lock(&umtx_lock); if (td1 != NULL) { if (pi->pi_owner == NULL) umtx_pi_setowner(pi, td1); PROC_UNLOCK(td1->td_proc); } } TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { pri = UPRI(uq1->uq_thread); if (pri > UPRI(td)) break; } if (uq1 != NULL) TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); else TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); uq->uq_pi_blocked = pi; thread_lock(td); td->td_flags |= TDF_UPIBLOCKED; thread_unlock(td); umtx_propagate_priority(td); mtx_unlock(&umtx_lock); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, wmesg, timo); umtxq_remove(uq); mtx_lock(&umtx_lock); uq->uq_pi_blocked = NULL; thread_lock(td); td->td_flags &= ~TDF_UPIBLOCKED; thread_unlock(td); TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); umtx_repropagate_priority(pi); mtx_unlock(&umtx_lock); umtxq_unlock(&uq->uq_key); return (error); } /* * Add reference count for a PI mutex. */ void umtx_pi_ref(struct umtx_pi *pi) { UMTXQ_LOCKED_ASSERT(umtxq_getchain(&pi->pi_key)); pi->pi_refcount++; } /* * Decrease reference count for a PI mutex, if the counter * is decreased to zero, its memory space is freed. */ void umtx_pi_unref(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT(pi->pi_refcount > 0, ("invalid reference count")); if (--pi->pi_refcount == 0) { mtx_lock(&umtx_lock); if (pi->pi_owner != NULL) umtx_pi_disown(pi); KASSERT(TAILQ_EMPTY(&pi->pi_blocked), ("blocked queue not empty")); mtx_unlock(&umtx_lock); TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink); umtx_pi_free(pi); } } /* * Find a PI mutex in hash table. */ struct umtx_pi * umtx_pi_lookup(struct umtx_key *key) { struct umtxq_chain *uc; struct umtx_pi *pi; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) { if (umtx_key_match(&pi->pi_key, key)) { return (pi); } } return (NULL); } /* * Insert a PI mutex into hash table. */ void umtx_pi_insert(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink); } /* * Drop a PI mutex and wakeup a top waiter. */ int umtx_pi_drop(struct thread *td, struct umtx_key *key, bool rb, int *count) { struct umtx_q *uq_first, *uq_first2, *uq_me; struct umtx_pi *pi, *pi2; int pri; UMTXQ_ASSERT_LOCKED_BUSY(key); *count = umtxq_count_pi(key, &uq_first); if (uq_first != NULL) { mtx_lock(&umtx_lock); pi = uq_first->uq_pi_blocked; KASSERT(pi != NULL, ("pi == NULL?")); if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) { mtx_unlock(&umtx_lock); /* userland messed the mutex */ return (EPERM); } uq_me = td->td_umtxq; if (pi->pi_owner == td) umtx_pi_disown(pi); /* get highest priority thread which is still sleeping. 
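 * Editor's note: a waiter may still be linked on pi_blocked after a
 * signal or timeout has already removed it from the sleep queue; such
 * a thread has UQF_UMTXQ cleared and is skipped below, since it is
 * already on its way out and waking it again would be pointless.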
*/ uq_first = TAILQ_FIRST(&pi->pi_blocked); while (uq_first != NULL && (uq_first->uq_flags & UQF_UMTXQ) == 0) { uq_first = TAILQ_NEXT(uq_first, uq_lockq); } pri = PRI_MAX; TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) { uq_first2 = TAILQ_FIRST(&pi2->pi_blocked); if (uq_first2 != NULL) { if (pri > UPRI(uq_first2->uq_thread)) pri = UPRI(uq_first2->uq_thread); } } thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); if (uq_first) umtxq_signal_thread(uq_first); } else { pi = umtx_pi_lookup(key); /* * A umtx_pi can exist if a signal or timeout removed the * last waiter from the umtxq, but there is still * a thread in do_lock_pi() holding the umtx_pi. */ if (pi != NULL) { /* * The umtx_pi can be unowned, such as when a thread * has just entered do_lock_pi(), allocated the * umtx_pi, and unlocked the umtxq. * If the current thread owns it, it must disown it. */ mtx_lock(&umtx_lock); if (pi->pi_owner == td) umtx_pi_disown(pi); mtx_unlock(&umtx_lock); } } return (0); } /* * Lock a PI mutex. */ static int do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct umtx_abs_timeout timo; struct umtx_q *uq; struct umtx_pi *pi, *new_pi; uint32_t id, old_owner, owner, old; int error, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi == NULL) { new_pi = umtx_pi_alloc(M_NOWAIT); if (new_pi == NULL) { umtxq_unlock(&uq->uq_key); new_pi = umtx_pi_alloc(M_WAITOK); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi != NULL) { umtx_pi_free(new_pi); new_pi = NULL; } } if (new_pi != NULL) { new_pi->pi_key = uq->uq_key; umtx_pi_insert(new_pi); pi = new_pi; } } umtx_pi_ref(pi); umtxq_unlock(&uq->uq_key); /* * Care must be exercised when dealing with the umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } /* The acquire succeeded. */ if (rv == 0) { MPASS(owner == UMUTEX_UNOWNED); error = 0; break; } if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } /* * Nobody owns it, but the acquire failed. This can happen * with ll/sc atomics. */ if (owner == UMUTEX_UNOWNED) { error = thread_check_susp(td, true); if (error != 0) break; continue; } /* * Avoid overwriting a possible error from sleep due * to the pending signal with the suspension check result. */ if (error == 0) { error = thread_check_susp(td, true); if (error != 0) break; } /* If no one owns it but it is contested, try to acquire it. */ if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) { old_owner = owner; rv = casueword32(&m->m_owner, owner, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (rv == 1) { if (error == 0) { error = thread_check_susp(td, true); if (error != 0) break; } /* * If this failed, the lock could have * changed; restart.
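 *
 * Editor's note: casueword32() returns -1 on a fault, 0 when
 * the new value was stored, and 1 when the comparison failed;
 * in the last case the current value of the word is returned
 * through the third argument, which is why the loop can retry
 * with the refreshed owner.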
*/ continue; } MPASS(rv == 0); MPASS(owner == old_owner); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); error = umtx_pi_claim(pi, td); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); if (error != 0) { /* * Since we're going to return an * error, restore the m_owner to its * previous, unowned state to avoid * compounding the problem. */ (void)casuword32(&m->m_owner, id | UMUTEX_CONTESTED, old_owner); } if (error == 0 && old_owner == UMUTEX_RB_OWNERDEAD) error = EOWNERDEAD; break; } if ((owner & ~UMUTEX_CONTESTED) == id) { error = EDEADLK; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_busy_unlocked(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails, * either someone else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } if (rv == 1) { umtxq_unbusy_unlocked(&uq->uq_key); error = thread_check_susp(td, true); if (error != 0) break; /* * The lock changed and we need to retry or we * lost a race to the thread unlocking the * umtx. Note that the UMUTEX_RB_OWNERDEAD * value for owner is impossible here. */ continue; } umtxq_lock(&uq->uq_key); /* We set the contested bit, sleep. */ MPASS(old == owner); error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED, "umtxpi", timeout == NULL ? NULL : &timo, (flags & USYNC_PROCESS_SHARED) != 0); if (error != 0) continue; error = thread_check_susp(td, false); if (error != 0) break; } umtxq_lock(&uq->uq_key); umtx_pi_unref(pi); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PI mutex. */ static int do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; uint32_t id, new_owner, old, owner; int count, error; id = td->td_tid; usrloop: /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); new_owner = umtx_unlock_val(flags, rb); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, new_owner); if (error == -1) return (EFAULT); if (error == 1) { error = thread_check_susp(td, true); if (error != 0) return (error); goto usrloop; } if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); error = umtx_pi_drop(td, &key, rb, &count); if (error != 0) { umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); /* userland messed up the mutex */ return (error); } umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is at most one thread waiting for it. * Otherwise, it must be marked as contested. */ if (count > 1) new_owner |= UMUTEX_CONTESTED; again: error = casueword32(&m->m_owner, owner, &old, new_owner); if (error == 1) { error = thread_check_susp(td, false); if (error == 0) goto again; } umtxq_unbusy_unlocked(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (error == 0 && old != owner) return (EINVAL); return (error); } /* * Lock a PP mutex.
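 *
 * Editor's note on the ceiling mapping used below: the rtprio-style
 * ceiling stored in m_ceilings[0] grows upward (a larger value is a
 * more important priority), while kernel user priorities grow
 * downward, so do_lock_pp() converts with new_pri = PRI_MIN_REALTIME +
 * (RTP_PRIO_MAX - ceiling); a ceiling of RTP_PRIO_MAX thus maps to
 * PRI_MIN_REALTIME, the strongest real-time user priority.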
*/ static int do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct umtx_abs_timeout timo; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t ceiling; uint32_t owner, id; int error, pri, old_inherited_pri, new_pri, rv; bool su; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); for (;;) { old_inherited_pri = uq->uq_inherited_pri; umtxq_busy_unlocked(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &ceiling); if (rv == -1) { error = EFAULT; goto out; } ceiling = RTP_PRIO_MAX - ceiling; if (ceiling > RTP_PRIO_MAX) { error = EINVAL; goto out; } new_pri = PRI_MIN_REALTIME + ceiling; if (td->td_base_user_pri < new_pri) { error = EINVAL; goto out; } if (su) { mtx_lock(&umtx_lock); if (new_pri < uq->uq_inherited_pri) { uq->uq_inherited_pri = new_pri; thread_lock(td); if (new_pri < UPRI(td)) sched_lend_user_prio(td, new_pri); thread_unlock(td); } mtx_unlock(&umtx_lock); } rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(owner == UMUTEX_CONTESTED); error = 0; break; } /* rv == 1 */ if (owner == UMUTEX_RB_OWNERDEAD) { rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(owner == UMUTEX_RB_OWNERDEAD); error = EOWNERDEAD; /* success */ break; } /* * rv == 1; only check for suspension if we * did not already catch a signal. If we * get an error from the check, the same * condition is checked by the umtxq_sleep() * call below, so we clear the * error so as not to skip the last loop iteration. */ if (error == 0) { error = thread_check_susp(td, false); if (error == 0 && try == 0) { umtxq_unbusy_unlocked(&uq->uq_key); continue; } error = 0; } } else if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; } else if (owner == UMUTEX_CONTESTED) { /* Spurious failure, retry. */ umtxq_unbusy_unlocked(&uq->uq_key); continue; } if (try != 0) error = EBUSY; /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } if (error != 0 && error != EOWNERDEAD) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } out: umtxq_unbusy_unlocked(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PP mutex.
*/ static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t id, owner, rceiling; int error, pri, new_inherited_pri; bool su; id = td->td_tid; uq = td->td_umtxq; su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t)); if (error != 0) return (error); if (rceiling == -1) new_inherited_pri = PRI_MAX; else { rceiling = RTP_PRIO_MAX - rceiling; if (rceiling > RTP_PRIO_MAX) return (EINVAL); new_inherited_pri = PRI_MIN_REALTIME + rceiling; } if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_busy_unlocked(&key); /* * For priority protected mutex, always set unlocked state * to UMUTEX_CONTESTED, so that userland always enters kernel * to lock the mutex, it is necessary because thread priority * has to be adjusted for such mutex. */ error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) | UMUTEX_CONTESTED); umtxq_lock(&key); if (error == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); if (error == -1) error = EFAULT; else { mtx_lock(&umtx_lock); if (su || new_inherited_pri == PRI_MAX) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } umtx_key_release(&key); return (error); } static int do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling, uint32_t *old_ceiling) { struct umtx_q *uq; uint32_t flags, id, owner, save_ceiling; int error, rv, rv1; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); if ((flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); if (ceiling > RTP_PRIO_MAX) return (EINVAL); id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); for (;;) { umtxq_busy_unlocked(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &save_ceiling); if (rv == -1) { error = EFAULT; break; } rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(owner == UMUTEX_CONTESTED); rv = suword32(&m->m_ceilings[0], ceiling); rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED); error = (rv == 0 && rv1 == 0) ? 0: EFAULT; break; } if ((owner & ~UMUTEX_CONTESTED) == id) { rv = suword32(&m->m_ceilings[0], ceiling); error = rv == 0 ? 0 : EFAULT; break; } if (owner == UMUTEX_RB_OWNERDEAD) { error = EOWNERDEAD; break; } else if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } else if (owner == UMUTEX_CONTESTED) { /* Spurious failure, retry. */ umtxq_unbusy_unlocked(&uq->uq_key); continue; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. 
*/ umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", NULL); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } umtxq_lock(&uq->uq_key); if (error == 0) umtxq_signal(&uq->uq_key, INT_MAX); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0 && old_ceiling != NULL) { rv = suword32(old_ceiling, save_ceiling); error = rv == 0 ? 0 : EFAULT; } return (error); } /* * Lock a userland POSIX mutex. */ static int do_lock_umutex(struct thread *td, struct umutex *m, struct _umtx_time *timeout, int mode) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: error = do_lock_normal(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_INHERIT: error = do_lock_pi(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_PROTECT: error = do_lock_pp(td, m, flags, timeout, mode); break; default: return (EINVAL); } if (timeout == NULL) { if (error == EINTR && mode != _UMUTEX_WAIT) error = ERESTART; } else { /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a userland POSIX mutex. */ static int do_unlock_umutex(struct thread *td, struct umutex *m, bool rb) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: return (do_unlock_normal(td, m, flags, rb)); case UMUTEX_PRIO_INHERIT: return (do_unlock_pi(td, m, flags, rb)); case UMUTEX_PRIO_PROTECT: return (do_unlock_pp(td, m, flags, rb)); } return (EINVAL); } static int do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m, struct timespec *timeout, u_long wflags) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t flags, clockid, hasw; int error; uq = td->td_umtxq; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if ((wflags & CVWAIT_CLOCKID) != 0) { error = fueword32(&cv->c_clockid, &clockid); if (error == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } - if (clockid < CLOCK_REALTIME || - clockid >= CLOCK_THREAD_CPUTIME_ID) { + if ((clockid < CLOCK_REALTIME || + clockid >= CLOCK_THREAD_CPUTIME_ID) && + clockid != CLOCK_TAI) { /* hmm, only HW clock id will work. */ umtx_key_release(&uq->uq_key); return (EINVAL); } } else { clockid = CLOCK_REALTIME; } umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set c_has_waiters to 1 before releasing the user mutex; also, * don't modify the cache line when unnecessary. */ error = fueword32(&cv->c_has_waiters, &hasw); if (error == 0 && hasw == 0) error = suword32(&cv->c_has_waiters, 1); if (error != 0) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unbusy(&uq->uq_key); error = EFAULT; goto out; } umtxq_unbusy_unlocked(&uq->uq_key); error = do_unlock_umutex(td, m, false); if (timeout != NULL) umtx_abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0, timeout); umtxq_lock(&uq->uq_key); if (error == 0) { error = umtxq_sleep(uq, "ucond", timeout == NULL ? NULL : &timo); } if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { /* * This must be a timeout, an interruption by a signal, or a * spurious wakeup; clear the c_has_waiters flag when * necessary.
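 *
 * Editor's note: only the removal of the last waiter (queue
 * length 1 below) clears c_has_waiters, since any earlier
 * removal still leaves sleepers behind on the queue.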
*/ umtxq_busy(&uq->uq_key); if ((uq->uq_flags & UQF_UMTXQ) != 0) { int oldlen = uq->uq_cur_queue->length; umtxq_remove(uq); if (oldlen == 1) { umtxq_unlock(&uq->uq_key); if (suword32(&cv->c_has_waiters, 0) != 0 && error == 0) error = EFAULT; umtxq_lock(&uq->uq_key); } } umtxq_unbusy(&uq->uq_key); if (error == ERESTART) error = EINTR; } out: umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland condition variable. */ static int do_cv_signal(struct thread *td, struct ucond *cv) { struct umtx_key key; int error, cnt, nwake; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); nwake = umtxq_signal(&key, 1); if (cnt <= nwake) { umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_lock(&key); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static int do_cv_broadcast(struct thread *td, struct ucond *cv) { struct umtx_key key; int error; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_signal(&key, INT_MAX); umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_unbusy_unlocked(&key); umtx_key_release(&key); return (error); } static int do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t flags, wrflags; int32_t state, oldstate; int32_t blocked_readers; int error, error1, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); wrflags = URWLOCK_WRITE_OWNER; if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER)) wrflags |= URWLOCK_WRITE_WAITERS; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } /* try to lock it */ while (!(state & wrflags)) { if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) { umtx_key_release(&uq->uq_key); return (EAGAIN); } rv = casueword32(&rwlock->rw_state, state, &oldstate, state + 1); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (rv == 0) { MPASS(oldstate == state); umtx_key_release(&uq->uq_key); return (0); } error = thread_check_susp(td, true); if (error != 0) break; state = oldstate; } if (error) break; /* grab monitor lock */ umtxq_busy_unlocked(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; /* set read contention bit */ while (error == 0 && (state & wrflags) && !(state & URWLOCK_READ_WAITERS)) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(oldstate == state); goto sleep; } state = oldstate; error = thread_check_susp(td, false); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } /* state is changed while setting flags, restart */ if 
(!(state & wrflags)) { umtxq_unbusy_unlocked(&uq->uq_key); error = thread_check_susp(td, true); if (error != 0) break; continue; } sleep: /* * Contention bit is set, before sleeping, increase * read waiter count. */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == 0) rv = suword32(&rwlock->rw_blocked_readers, blocked_readers + 1); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } while (state & wrflags) { umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "urdlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } /* decrease read waiter count, and may clear read contention bit */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == 0) rv = suword32(&rwlock->rw_blocked_readers, blocked_readers - 1); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } if (blocked_readers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(oldstate == state); break; } state = oldstate; error1 = thread_check_susp(td, false); if (error1 != 0) { if (error == 0) error = error1; break; } } } umtxq_unbusy_unlocked(&uq->uq_key); if (error != 0) break; } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int32_t blocked_writers; int32_t blocked_readers; int error, error1, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); blocked_readers = 0; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } while ((state & URWLOCK_WRITE_OWNER) == 0 && URWLOCK_READER_COUNT(state) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_OWNER); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (rv == 0) { MPASS(oldstate == state); umtx_key_release(&uq->uq_key); return (0); } state = oldstate; error = thread_check_susp(td, true); if (error != 0) break; } if (error) { if ((state & (URWLOCK_WRITE_OWNER | URWLOCK_WRITE_WAITERS)) == 0 && blocked_readers != 0) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } break; } /* grab monitor lock */ umtxq_busy_unlocked(&uq->uq_key); /* * Re-read the state, in case it changed between the * try-lock above and the check below. 
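 *
 * Editor's note: the umtxq "monitor" busy lock taken above is
 * also taken by do_rw_unlock() before it signals sleepers, so
 * once URWLOCK_WRITE_WAITERS is set under it, a wakeup cannot
 * slip in between queueing ourselves and going to sleep.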
*/ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; while (error == 0 && ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) && (state & URWLOCK_WRITE_WAITERS) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(oldstate == state); goto sleep; } state = oldstate; error = thread_check_susp(td, false); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } if ((state & URWLOCK_WRITE_OWNER) == 0 && URWLOCK_READER_COUNT(state) == 0) { umtxq_unbusy_unlocked(&uq->uq_key); error = thread_check_susp(td, false); if (error != 0) break; continue; } sleep: rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == 0) rv = suword32(&rwlock->rw_blocked_writers, blocked_writers + 1); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) { umtxq_lock(&uq->uq_key); umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "uwrlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == 0) rv = suword32(&rwlock->rw_blocked_writers, blocked_writers - 1); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } if (blocked_writers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (rv == 0) { MPASS(oldstate == state); break; } state = oldstate; error1 = thread_check_susp(td, false); /* * We are leaving the URWLOCK_WRITE_WAITERS * behind, but this should not harm the * correctness. 
*/ if (error1 != 0) { if (error == 0) error = error1; break; } } rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } } else blocked_readers = 0; umtxq_unbusy_unlocked(&uq->uq_key); } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_unlock(struct thread *td, struct urwlock *rwlock) { struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int error, rv, q, count; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); error = fueword32(&rwlock->rw_state, &state); if (error == -1) { error = EFAULT; goto out; } if (state & URWLOCK_WRITE_OWNER) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_OWNER); if (rv == -1) { error = EFAULT; goto out; } if (rv == 1) { state = oldstate; if (!(oldstate & URWLOCK_WRITE_OWNER)) { error = EPERM; goto out; } error = thread_check_susp(td, true); if (error != 0) goto out; } else break; } } else if (URWLOCK_READER_COUNT(state) != 0) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state - 1); if (rv == -1) { error = EFAULT; goto out; } if (rv == 1) { state = oldstate; if (URWLOCK_READER_COUNT(oldstate) == 0) { error = EPERM; goto out; } error = thread_check_susp(td, true); if (error != 0) goto out; } else break; } } else { error = EPERM; goto out; } count = 0; if (!(flags & URWLOCK_PREFER_READER)) { if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } else if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } } else { if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } else if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } } if (count) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, count, q); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } out: umtx_key_release(&uq->uq_key); return (error); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t flags, count, count1; int error, rv, rv1; uq = td->td_umtxq; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); again: umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = casueword32(&sem->_has_waiters, 0, &count1, 1); if (rv != -1) rv1 = fueword32(&sem->_count, &count); if (rv == -1 || rv1 == -1 || count != 0 || (rv == 1 && count1 == 0)) { if (rv == 0) rv = suword32(&sem->_has_waiters, 0); umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); if (rv == -1 || rv1 == -1) { error = EFAULT; goto out; } if (count != 0) { error = 0; goto out; } MPASS(rv == 1 && count1 == 0); rv = thread_check_susp(td, true); if (rv == 0) goto again; error = rv; goto out; } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); /* A relative timeout cannot be restarted. 
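 * Editor's note: a restarted syscall would re-arm the full
 * relative interval instead of the time remaining, so repeated
 * signals could postpone the wakeup indefinitely; converting
 * ERESTART to EINTR returns control to userland instead.  An
 * absolute deadline is unaffected by a restart, so ERESTART is
 * kept in that case.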
*/ if (error == ERESTART && timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) error = EINTR; } umtxq_unlock(&uq->uq_key); out: umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem_wake(struct thread *td, struct _usem *sem) { struct umtx_key key; int error, cnt; uint32_t flags; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { /* * Check if count is greater than 0, this means the memory is * still being referenced by user code, so we can safely * update _has_waiters flag. */ if (cnt == 1) { umtxq_unlock(&key); error = suword32(&sem->_has_waiters, 0); umtxq_lock(&key); if (error == -1) error = EFAULT; } umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } #endif static int do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t count, flags; int error, rv; uq = td->td_umtxq; flags = fuword32(&sem->_flags); if (timeout != NULL) umtx_abs_timeout_init2(&timo, timeout); again: error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = fueword32(&sem->_count, &count); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } for (;;) { if (USEM_COUNT(count) != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (0); } if (count == USEM_HAS_WAITERS) break; rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS); if (rv == 0) break; umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (rv == -1) return (EFAULT); rv = thread_check_susp(td, true); if (rv != 0) return (rv); goto again; } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) { /* A relative timeout cannot be restarted. */ if (error == ERESTART) error = EINTR; if (error == EINTR) { kern_clock_gettime(curthread, timo.clockid, &timo.cur); timespecsub(&timo.end, &timo.cur, &timeout->_timeout); } } } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem2_wake(struct thread *td, struct _usem2 *sem) { struct umtx_key key; int error, cnt, rv; uint32_t count, flags; rv = fueword32(&sem->_flags, &flags); if (rv == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { /* * If this was the last sleeping thread, clear the waiters * flag in _count. 
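 *
 * Editor's note: the loop below re-reads _count and retries the
 * CAS because userland may concurrently post to the semaphore;
 * it stops once USEM_HAS_WAITERS is observed clear or the
 * access faults.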
*/ if (cnt == 1) { umtxq_unlock(&key); rv = fueword32(&sem->_count, &count); while (rv != -1 && count & USEM_HAS_WAITERS) { rv = casueword32(&sem->_count, count, &count, count & ~USEM_HAS_WAITERS); if (rv == 1) { rv = thread_check_susp(td, false); if (rv != 0) break; } } if (rv == -1) error = EFAULT; else if (rv > 0) { error = rv; } umtxq_lock(&key); } umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } #ifdef COMPAT_FREEBSD10 int freebsd10__umtx_lock(struct thread *td, struct freebsd10__umtx_lock_args *uap) { return (do_lock_umtx(td, uap->umtx, td->td_tid, 0)); } int freebsd10__umtx_unlock(struct thread *td, struct freebsd10__umtx_unlock_args *uap) { return (do_unlock_umtx(td, uap->umtx, td->td_tid)); } #endif inline int umtx_copyin_timeout(const void *uaddr, struct timespec *tsp) { int error; error = copyin(uaddr, tsp, sizeof(*tsp)); if (error == 0) { if (!timespecvalid_interval(tsp)) error = EINVAL; } return (error); } static inline int umtx_copyin_umtx_time(const void *uaddr, size_t size, struct _umtx_time *tp) { int error; if (size <= sizeof(tp->_timeout)) { tp->_clockid = CLOCK_REALTIME; tp->_flags = 0; error = copyin(uaddr, &tp->_timeout, sizeof(tp->_timeout)); } else error = copyin(uaddr, tp, sizeof(*tp)); if (error != 0) return (error); if (!timespecvalid_interval(&tp->_timeout)) return (EINVAL); return (0); } static int umtx_copyin_robust_lists(const void *uaddr, size_t size, struct umtx_robust_lists_params *rb) { if (size > sizeof(*rb)) return (EINVAL); return (copyin(uaddr, rb, size)); } static int umtx_copyout_timeout(void *uaddr, size_t sz, struct timespec *tsp) { /* * Should be guaranteed by the caller, sz == uaddr1 - sizeof(_umtx_time) * and we're only called if sz >= sizeof(timespec) as supplied in the * copyops. */ KASSERT(sz >= sizeof(*tsp), ("umtx_copyops specifies incorrect sizes")); return (copyout(tsp, uaddr, sizeof(*tsp))); } #ifdef COMPAT_FREEBSD10 static int __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). 
*/ if (uap->uaddr2 == NULL) ts = NULL; else { error = ops->copyin_timeout(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } #ifdef COMPAT_FREEBSD32 if (ops->compat32) return (do_lock_umtx32(td, uap->obj, uap->val, ts)); #endif return (do_lock_umtx(td, uap->obj, uap->val, ts)); } static int __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { #ifdef COMPAT_FREEBSD32 if (ops->compat32) return (do_unlock_umtx32(td, uap->obj, uap->val)); #endif return (do_unlock_umtx(td, uap->obj, uap->val)); } #endif /* COMPAT_FREEBSD10 */ #if !defined(COMPAT_FREEBSD10) static int __umtx_op_unimpl(struct thread *td __unused, struct _umtx_op_args *uap __unused, const struct umtx_copyops *ops __unused) { return (EOPNOTSUPP); } #endif /* COMPAT_FREEBSD10 */ static int __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, ops->compat32, 0)); } static int __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0)); } static int __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1)); } static int __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (kern_umtx_wake(td, uap->obj, uap->val, 0)); } #define BATCH_SIZE 128 static int __umtx_op_nwake_private_native(struct thread *td, struct _umtx_op_args *uap) { char *uaddrs[BATCH_SIZE], **upp; int count, error, i, pos, tocopy; upp = (char **)uap->obj; error = 0; for (count = uap->val, pos = 0; count > 0; count -= tocopy, pos += tocopy) { tocopy = MIN(count, BATCH_SIZE); error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *)); if (error != 0) break; for (i = 0; i < tocopy; ++i) { kern_umtx_wake(td, uaddrs[i], INT_MAX, 1); } maybe_yield(); } return (error); } static int __umtx_op_nwake_private_compat32(struct thread *td, struct _umtx_op_args *uap) { uint32_t uaddrs[BATCH_SIZE], *upp; int count, error, i, pos, tocopy; upp = (uint32_t *)uap->obj; error = 0; for (count = uap->val, pos = 0; count > 0; count -= tocopy, pos += tocopy) { tocopy = MIN(count, BATCH_SIZE); error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t)); if (error != 0) break; for (i = 0; i < tocopy; ++i) { kern_umtx_wake(td, (void *)(uintptr_t)uaddrs[i], INT_MAX, 1); } maybe_yield(); } return (error); } static int __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { if (ops->compat32) return (__umtx_op_nwake_private_compat32(td, uap)); return (__umtx_op_nwake_private_native(td, uap)); } static int __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops 
__unused) { return (kern_umtx_wake(td, uap->obj, uap->val, 1)); } static int __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, 0)); } static int __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY)); } static int __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT)); } static int __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_wake_umutex(td, uap->obj)); } static int __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_unlock_umutex(td, uap->obj, false)); } static int __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1)); } static int __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = ops->copyin_timeout(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_cv_signal(td, uap->obj)); } static int __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_cv_broadcast(td, uap->obj)); } static int __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = ops->copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). 
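 * Editor's note: as with the other operations taking a _umtx_time,
 * uaddr1 carries not a pointer but the size of the object at
 * uaddr2; umtx_copyin_umtx_time() uses that size to tell a bare
 * timespec from a full _umtx_time.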
*/ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = ops->copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_rw_unlock(td, uap->obj)); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = ops->copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } static int __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_sem_wake(td, uap->obj)); } #endif static int __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_wake2_umutex(td, uap->obj, uap->val)); } static int __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops) { struct _umtx_time *tm_p, timeout; size_t uasize; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { uasize = 0; tm_p = NULL; } else { uasize = (size_t)uap->uaddr1; error = ops->copyin_umtx_time(uap->uaddr2, uasize, &timeout); if (error != 0) return (error); tm_p = &timeout; } error = do_sem2_wait(td, uap->obj, tm_p); if (error == EINTR && uap->uaddr2 != NULL && (timeout._flags & UMTX_ABSTIME) == 0 && uasize >= ops->umtx_time_sz + ops->timespec_sz) { error = ops->copyout_timeout( (void *)((uintptr_t)uap->uaddr2 + ops->umtx_time_sz), uasize - ops->umtx_time_sz, &timeout._timeout); if (error == 0) { error = EINTR; } } return (error); } static int __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *ops __unused) { return (do_sem2_wake(td, uap->obj)); } #define USHM_OBJ_UMTX(o) \ ((struct umtx_shm_obj_list *)(&(o)->umtx_data)) #define USHMF_LINKED 0x0001 struct umtx_shm_reg { TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link; LIST_ENTRY(umtx_shm_reg) ushm_obj_link; struct umtx_key ushm_key; struct ucred *ushm_cred; struct shmfd *ushm_obj; u_int ushm_refcnt; u_int ushm_flags; }; LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg); TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg); static uma_zone_t umtx_shm_reg_zone; static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS]; static struct mtx umtx_shm_lock; static struct umtx_shm_reg_head umtx_shm_reg_delfree = TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree); static void umtx_shm_free_reg(struct umtx_shm_reg *reg); static void umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused) { struct umtx_shm_reg_head d; struct umtx_shm_reg *reg, *reg1; TAILQ_INIT(&d); mtx_lock(&umtx_shm_lock); TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link); mtx_unlock(&umtx_shm_lock); TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) { TAILQ_REMOVE(&d, reg, ushm_reg_link); umtx_shm_free_reg(reg); } } static struct task umtx_shm_reg_delfree_task = TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL); /* * Returns 0 if a SHM with the passed key is found in the registry, in which * case it is returned through 'oreg'. 
Otherwise, returns an error among ESRCH * (no corresponding SHM; ESRCH was chosen for compatibility, ENOENT would have * been preferable) or EOVERFLOW (there is a corresponding SHM, but reference * count would overflow, so can't return it), in which case '*oreg' is left * unchanged. */ static int umtx_shm_find_reg_locked(const struct umtx_key *key, struct umtx_shm_reg **const oreg) { struct umtx_shm_reg *reg; struct umtx_shm_reg_head *reg_head; KASSERT(key->shared, ("umtx_p_find_rg: private key")); mtx_assert(&umtx_shm_lock, MA_OWNED); reg_head = &umtx_shm_registry[key->hash]; TAILQ_FOREACH(reg, reg_head, ushm_reg_link) { KASSERT(reg->ushm_key.shared, ("non-shared key on reg %p %d", reg, reg->ushm_key.shared)); if (reg->ushm_key.info.shared.object == key->info.shared.object && reg->ushm_key.info.shared.offset == key->info.shared.offset) { KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM")); KASSERT(reg->ushm_refcnt != 0, ("reg %p refcnt 0 onlist", reg)); KASSERT((reg->ushm_flags & USHMF_LINKED) != 0, ("reg %p not linked", reg)); /* * Don't let overflow happen, just deny a new reference * (this is additional protection against some reference * count leak, which is known not to be the case at the * time of this writing). */ if (__predict_false(reg->ushm_refcnt == UINT_MAX)) return (EOVERFLOW); reg->ushm_refcnt++; *oreg = reg; return (0); } } return (ESRCH); } /* * Calls umtx_shm_find_reg_locked() under the 'umtx_shm_lock'. */ static int umtx_shm_find_reg(const struct umtx_key *key, struct umtx_shm_reg **const oreg) { int error; mtx_lock(&umtx_shm_lock); error = umtx_shm_find_reg_locked(key, oreg); mtx_unlock(&umtx_shm_lock); return (error); } static void umtx_shm_free_reg(struct umtx_shm_reg *reg) { chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0); crfree(reg->ushm_cred); shm_drop(reg->ushm_obj); uma_zfree(umtx_shm_reg_zone, reg); } static bool umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool linked_ref) { mtx_assert(&umtx_shm_lock, MA_OWNED); KASSERT(reg->ushm_refcnt != 0, ("ushm_reg %p refcnt 0", reg)); if (linked_ref) { if ((reg->ushm_flags & USHMF_LINKED) == 0) /* * The reference tied to USHMF_LINKED has already been * released concurrently. */ return (false); TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash], reg, ushm_reg_link); LIST_REMOVE(reg, ushm_obj_link); reg->ushm_flags &= ~USHMF_LINKED; } reg->ushm_refcnt--; return (reg->ushm_refcnt == 0); } static void umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool linked_ref) { vm_object_t object; bool dofree; if (linked_ref) { /* * Note: This may be executed multiple times on the same * shared-memory VM object in the presence of concurrent callers * because 'umtx_shm_lock' is not held all along in umtx_shm() * and here.
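 *
 * Editor's note: setting OBJ_UMTXDEAD below only ORs a flag into
 * the object's flags, so repeating it is harmless.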
*/ object = reg->ushm_obj->shm_object; VM_OBJECT_WLOCK(object); vm_object_set_flag(object, OBJ_UMTXDEAD); VM_OBJECT_WUNLOCK(object); } mtx_lock(&umtx_shm_lock); dofree = umtx_shm_unref_reg_locked(reg, linked_ref); mtx_unlock(&umtx_shm_lock); if (dofree) umtx_shm_free_reg(reg); } void umtx_shm_object_init(vm_object_t object) { LIST_INIT(USHM_OBJ_UMTX(object)); } void umtx_shm_object_terminated(vm_object_t object) { struct umtx_shm_reg *reg, *reg1; bool dofree; if (LIST_EMPTY(USHM_OBJ_UMTX(object))) return; dofree = false; mtx_lock(&umtx_shm_lock); LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) { if (umtx_shm_unref_reg_locked(reg, true)) { TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg, ushm_reg_link); dofree = true; } } mtx_unlock(&umtx_shm_lock); if (dofree) taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task); } static int umtx_shm_create_reg(struct thread *td, const struct umtx_key *key, struct umtx_shm_reg **res) { struct shmfd *shm; struct umtx_shm_reg *reg, *reg1; struct ucred *cred; int error; error = umtx_shm_find_reg(key, res); if (error != ESRCH) { /* * Either no error occurred, and '*res' was filled, or EOVERFLOW * was returned, indicating a reference count limit, and we * won't create a duplicate registration. In both cases, we are * done. */ return (error); } /* No entry, we will create one. */ cred = td->td_ucred; if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP))) return (ENOMEM); shm = shm_alloc(td->td_ucred, O_RDWR, false); if (shm == NULL) { chgumtxcnt(cred->cr_ruidinfo, -1, 0); return (ENOMEM); } reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); bcopy(key, &reg->ushm_key, sizeof(*key)); reg->ushm_obj = shm; reg->ushm_cred = crhold(cred); error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); if (error != 0) { umtx_shm_free_reg(reg); return (error); } mtx_lock(&umtx_shm_lock); /* Re-lookup as 'umtx_shm_lock' has been temporarily released. */ error = umtx_shm_find_reg_locked(key, &reg1); switch (error) { case 0: mtx_unlock(&umtx_shm_lock); umtx_shm_free_reg(reg); *res = reg1; return (0); case ESRCH: break; default: mtx_unlock(&umtx_shm_lock); umtx_shm_free_reg(reg); return (error); } TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link); LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg, ushm_obj_link); reg->ushm_flags = USHMF_LINKED; /* * This is one reference for the registry and the list of shared * mutexes referenced by the VM object containing the lock pointer, and * another for the caller, which it will free after use. So, one of * these is tied to the presence of USHMF_LINKED. */ reg->ushm_refcnt = 2; mtx_unlock(&umtx_shm_lock); *res = reg; return (0); } static int umtx_shm_alive(struct thread *td, void *addr) { vm_map_t map; vm_map_entry_t entry; vm_object_t object; vm_pindex_t pindex; vm_prot_t prot; int res, ret; boolean_t wired; map = &td->td_proc->p_vmspace->vm_map; res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry, &object, &pindex, &prot, &wired); if (res != KERN_SUCCESS) return (EFAULT); if (object == NULL) ret = EINVAL; else ret = (object->flags & OBJ_UMTXDEAD) != 0 ?
		    ENOTTY : 0;
	vm_map_lookup_done(map, entry);
	return (ret);
}

static void
umtx_shm_init(void)
{
	int i;

	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
	for (i = 0; i < nitems(umtx_shm_registry); i++)
		TAILQ_INIT(&umtx_shm_registry[i]);
}

static int
umtx_shm(struct thread *td, void *addr, u_int flags)
{
	struct umtx_key key;
	struct umtx_shm_reg *reg;
	struct file *fp;
	int error, fd;

	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
	    UMTX_SHM_DESTROY | UMTX_SHM_ALIVE)) != 1)
		return (EINVAL);
	if ((flags & UMTX_SHM_ALIVE) != 0)
		return (umtx_shm_alive(td, addr));
	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
	if (error != 0)
		return (error);
	KASSERT(key.shared == 1, ("non-shared key"));
	error = (flags & UMTX_SHM_CREAT) != 0 ?
	    umtx_shm_create_reg(td, &key, &reg) :
	    umtx_shm_find_reg(&key, &reg);
	umtx_key_release(&key);
	if (error != 0)
		return (error);
	KASSERT(reg != NULL, ("no reg"));
	if ((flags & UMTX_SHM_DESTROY) != 0) {
		umtx_shm_unref_reg(reg, true);
	} else {
#if 0
#ifdef MAC
		error = mac_posixshm_check_open(td->td_ucred,
		    reg->ushm_obj, FFLAGS(O_RDWR));
		if (error == 0)
#endif
			error = shm_access(reg->ushm_obj, td->td_ucred,
			    FFLAGS(O_RDWR));
		if (error == 0)
#endif
			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
		if (error == 0) {
			shm_hold(reg->ushm_obj);
			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
			    &shm_ops);
			td->td_retval[0] = fd;
			fdrop(fp, td);
		}
	}
	umtx_shm_unref_reg(reg, false);
	return (error);
}

static int
__umtx_op_shm(struct thread *td, struct _umtx_op_args *uap,
    const struct umtx_copyops *ops __unused)
{
	return (umtx_shm(td, uap->uaddr1, uap->val));
}

static int
__umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap,
    const struct umtx_copyops *ops)
{
	struct umtx_robust_lists_params rb;
	int error;

	if (ops->compat32) {
		if ((td->td_pflags2 & TDP2_COMPAT32RB) == 0 &&
		    (td->td_rb_list != 0 || td->td_rbp_list != 0 ||
		    td->td_rb_inact != 0))
			return (EBUSY);
	} else if ((td->td_pflags2 & TDP2_COMPAT32RB) != 0) {
		return (EBUSY);
	}

	bzero(&rb, sizeof(rb));
	error = ops->copyin_robust_lists(uap->uaddr1, uap->val, &rb);
	if (error != 0)
		return (error);

	if (ops->compat32)
		td->td_pflags2 |= TDP2_COMPAT32RB;

	td->td_rb_list = rb.robust_list_offset;
	td->td_rbp_list = rb.robust_priv_list_offset;
	td->td_rb_inact = rb.robust_inact_offset;
	return (0);
}

static int
__umtx_op_get_min_timeout(struct thread *td, struct _umtx_op_args *uap,
    const struct umtx_copyops *ops)
{
	long val;
	int error, val1;

	val = sbttons(td->td_proc->p_umtx_min_timeout);
	if (ops->compat32) {
		val1 = (int)val;
		error = copyout(&val1, uap->uaddr1, sizeof(val1));
	} else {
		error = copyout(&val, uap->uaddr1, sizeof(val));
	}
	return (error);
}

static int
__umtx_op_set_min_timeout(struct thread *td, struct _umtx_op_args *uap,
    const struct umtx_copyops *ops)
{
	if (uap->val < 0)
		return (EINVAL);
	td->td_proc->p_umtx_min_timeout = nstosbt(uap->val);
	return (0);
}

#if defined(__i386__) || defined(__amd64__)
/*
 * Provide the standard 32-bit definitions for x86, since native/compat32 use a
 * 32-bit time_t there. Other architectures just need the i386 definitions
 * along with their standard compat32.
*/ struct timespecx32 { int64_t tv_sec; int32_t tv_nsec; }; struct umtx_timex32 { struct timespecx32 _timeout; uint32_t _flags; uint32_t _clockid; }; #ifndef __i386__ #define timespeci386 timespec32 #define umtx_timei386 umtx_time32 #endif #else /* !__i386__ && !__amd64__ */ /* 32-bit architectures can emulate i386, so define these almost everywhere. */ struct timespeci386 { int32_t tv_sec; int32_t tv_nsec; }; struct umtx_timei386 { struct timespeci386 _timeout; uint32_t _flags; uint32_t _clockid; }; #if defined(__LP64__) #define timespecx32 timespec32 #define umtx_timex32 umtx_time32 #endif #endif static int umtx_copyin_robust_lists32(const void *uaddr, size_t size, struct umtx_robust_lists_params *rbp) { struct umtx_robust_lists_params_compat32 rb32; int error; if (size > sizeof(rb32)) return (EINVAL); bzero(&rb32, sizeof(rb32)); error = copyin(uaddr, &rb32, size); if (error != 0) return (error); CP(rb32, *rbp, robust_list_offset); CP(rb32, *rbp, robust_priv_list_offset); CP(rb32, *rbp, robust_inact_offset); return (0); } #ifndef __i386__ static inline int umtx_copyin_timeouti386(const void *uaddr, struct timespec *tsp) { struct timespeci386 ts32; int error; error = copyin(uaddr, &ts32, sizeof(ts32)); if (error == 0) { if (!timespecvalid_interval(&ts32)) error = EINVAL; else { CP(ts32, *tsp, tv_sec); CP(ts32, *tsp, tv_nsec); } } return (error); } static inline int umtx_copyin_umtx_timei386(const void *uaddr, size_t size, struct _umtx_time *tp) { struct umtx_timei386 t32; int error; t32._clockid = CLOCK_REALTIME; t32._flags = 0; if (size <= sizeof(t32._timeout)) error = copyin(uaddr, &t32._timeout, sizeof(t32._timeout)); else error = copyin(uaddr, &t32, sizeof(t32)); if (error != 0) return (error); if (!timespecvalid_interval(&t32._timeout)) return (EINVAL); TS_CP(t32, *tp, _timeout); CP(t32, *tp, _flags); CP(t32, *tp, _clockid); return (0); } static int umtx_copyout_timeouti386(void *uaddr, size_t sz, struct timespec *tsp) { struct timespeci386 remain32 = { .tv_sec = tsp->tv_sec, .tv_nsec = tsp->tv_nsec, }; /* * Should be guaranteed by the caller, sz == uaddr1 - sizeof(_umtx_time) * and we're only called if sz >= sizeof(timespec) as supplied in the * copyops. */ KASSERT(sz >= sizeof(remain32), ("umtx_copyops specifies incorrect sizes")); return (copyout(&remain32, uaddr, sizeof(remain32))); } #endif /* !__i386__ */ #if defined(__i386__) || defined(__LP64__) static inline int umtx_copyin_timeoutx32(const void *uaddr, struct timespec *tsp) { struct timespecx32 ts32; int error; error = copyin(uaddr, &ts32, sizeof(ts32)); if (error == 0) { if (!timespecvalid_interval(&ts32)) error = EINVAL; else { CP(ts32, *tsp, tv_sec); CP(ts32, *tsp, tv_nsec); } } return (error); } static inline int umtx_copyin_umtx_timex32(const void *uaddr, size_t size, struct _umtx_time *tp) { struct umtx_timex32 t32; int error; t32._clockid = CLOCK_REALTIME; t32._flags = 0; if (size <= sizeof(t32._timeout)) error = copyin(uaddr, &t32._timeout, sizeof(t32._timeout)); else error = copyin(uaddr, &t32, sizeof(t32)); if (error != 0) return (error); if (!timespecvalid_interval(&t32._timeout)) return (EINVAL); TS_CP(t32, *tp, _timeout); CP(t32, *tp, _flags); CP(t32, *tp, _clockid); return (0); } static int umtx_copyout_timeoutx32(void *uaddr, size_t sz, struct timespec *tsp) { struct timespecx32 remain32 = { .tv_sec = tsp->tv_sec, .tv_nsec = tsp->tv_nsec, }; /* * Should be guaranteed by the caller, sz == uaddr1 - sizeof(_umtx_time) * and we're only called if sz >= sizeof(timespec) as supplied in the * copyops. 
*/ KASSERT(sz >= sizeof(remain32), ("umtx_copyops specifies incorrect sizes")); return (copyout(&remain32, uaddr, sizeof(remain32))); } #endif /* __i386__ || __LP64__ */ typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap, const struct umtx_copyops *umtx_ops); static const _umtx_op_func op_table[] = { #ifdef COMPAT_FREEBSD10 [UMTX_OP_LOCK] = __umtx_op_lock_umtx, [UMTX_OP_UNLOCK] = __umtx_op_unlock_umtx, #else [UMTX_OP_LOCK] = __umtx_op_unimpl, [UMTX_OP_UNLOCK] = __umtx_op_unimpl, #endif [UMTX_OP_WAIT] = __umtx_op_wait, [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_uint, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private, [UMTX_OP_MUTEX_WAKE2] = __umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, [UMTX_OP_SHM] = __umtx_op_shm, [UMTX_OP_ROBUST_LISTS] = __umtx_op_robust_lists, [UMTX_OP_GET_MIN_TIMEOUT] = __umtx_op_get_min_timeout, [UMTX_OP_SET_MIN_TIMEOUT] = __umtx_op_set_min_timeout, }; static const struct umtx_copyops umtx_native_ops = { .copyin_timeout = umtx_copyin_timeout, .copyin_umtx_time = umtx_copyin_umtx_time, .copyin_robust_lists = umtx_copyin_robust_lists, .copyout_timeout = umtx_copyout_timeout, .timespec_sz = sizeof(struct timespec), .umtx_time_sz = sizeof(struct _umtx_time), }; #ifndef __i386__ static const struct umtx_copyops umtx_native_opsi386 = { .copyin_timeout = umtx_copyin_timeouti386, .copyin_umtx_time = umtx_copyin_umtx_timei386, .copyin_robust_lists = umtx_copyin_robust_lists32, .copyout_timeout = umtx_copyout_timeouti386, .timespec_sz = sizeof(struct timespeci386), .umtx_time_sz = sizeof(struct umtx_timei386), .compat32 = true, }; #endif #if defined(__i386__) || defined(__LP64__) /* i386 can emulate other 32-bit archs, too! 
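 *
 * Illustrative sketch (hypothetical caller, not taken from this file): a
 * 64-bit process emulating an i386 guest could tag a wait request so that
 * its timeout is decoded with the i386 layout:
 *
 *	_umtx_op(obj, UMTX_OP_WAIT | UMTX_OP__I386, val,
 *	    (void *)sizeof(struct umtx_timei386), &guest_timeout);
 *
 * sys__umtx_op() below then selects umtx_native_opsi386; UMTX_OP__32BIT
 * selects the x32 layouts instead.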
 */
static const struct umtx_copyops umtx_native_opsx32 = {
	.copyin_timeout = umtx_copyin_timeoutx32,
	.copyin_umtx_time = umtx_copyin_umtx_timex32,
	.copyin_robust_lists = umtx_copyin_robust_lists32,
	.copyout_timeout = umtx_copyout_timeoutx32,
	.timespec_sz = sizeof(struct timespecx32),
	.umtx_time_sz = sizeof(struct umtx_timex32),
	.compat32 = true,
};

#ifdef COMPAT_FREEBSD32
#ifdef __amd64__
#define	umtx_native_ops32	umtx_native_opsi386
#else
#define	umtx_native_ops32	umtx_native_opsx32
#endif
#endif /* COMPAT_FREEBSD32 */
#endif /* __i386__ || __LP64__ */

#define	UMTX_OP__FLAGS	(UMTX_OP__32BIT | UMTX_OP__I386)

static int
kern__umtx_op(struct thread *td, void *obj, int op, unsigned long val,
    void *uaddr1, void *uaddr2, const struct umtx_copyops *ops)
{
	struct _umtx_op_args uap = {
		.obj = obj,
		.op = op & ~UMTX_OP__FLAGS,
		.val = val,
		.uaddr1 = uaddr1,
		.uaddr2 = uaddr2
	};

	if (uap.op >= nitems(op_table))
		return (EINVAL);
	return ((*op_table[uap.op])(td, &uap, ops));
}

int
sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
{
	const struct umtx_copyops *umtx_ops;

	umtx_ops = &umtx_native_ops;
#ifdef __LP64__
	if ((uap->op & (UMTX_OP__32BIT | UMTX_OP__I386)) != 0) {
		if ((uap->op & UMTX_OP__I386) != 0)
			umtx_ops = &umtx_native_opsi386;
		else
			umtx_ops = &umtx_native_opsx32;
	}
#elif !defined(__i386__)
	/* We consider UMTX_OP__32BIT a nop on !i386 ILP32. */
	if ((uap->op & UMTX_OP__I386) != 0)
		umtx_ops = &umtx_native_opsi386;
#else
	/* Likewise, UMTX_OP__I386 is a nop on i386. */
	if ((uap->op & UMTX_OP__32BIT) != 0)
		umtx_ops = &umtx_native_opsx32;
#endif
	return (kern__umtx_op(td, uap->obj, uap->op, uap->val, uap->uaddr1,
	    uap->uaddr2, umtx_ops));
}

#ifdef COMPAT_FREEBSD32
#ifdef COMPAT_FREEBSD10
int
freebsd10_freebsd32__umtx_lock(struct thread *td,
    struct freebsd10_freebsd32__umtx_lock_args *uap)
{
	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
}

int
freebsd10_freebsd32__umtx_unlock(struct thread *td,
    struct freebsd10_freebsd32__umtx_unlock_args *uap)
{
	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
}
#endif /* COMPAT_FREEBSD10 */

int
freebsd32__umtx_op(struct thread *td, struct freebsd32__umtx_op_args *uap)
{
	return (kern__umtx_op(td, uap->obj, uap->op, uap->val, uap->uaddr1,
	    uap->uaddr2, &umtx_native_ops32));
}
#endif /* COMPAT_FREEBSD32 */

void
umtx_thread_init(struct thread *td)
{
	td->td_umtxq = umtxq_alloc();
	td->td_umtxq->uq_thread = td;
}

void
umtx_thread_fini(struct thread *td)
{
	umtxq_free(td->td_umtxq);
}

/*
 * Called when a new thread is created, e.g. by fork().
 */
void
umtx_thread_alloc(struct thread *td)
{
	struct umtx_q *uq;

	uq = td->td_umtxq;
	uq->uq_inherited_pri = PRI_MAX;

	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested),
	    ("uq_pi_contested is not empty"));
}

/*
 * exec() hook.
 *
 * Clear robust lists for all of the process's threads, not delaying the
 * cleanup to thread exit, since the relevant address space is destroyed
 * right now.
 */
void
umtx_exec(struct proc *p)
{
	struct thread *td;

	KASSERT(p == curproc, ("need curproc"));
	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
	    (p->p_flag & P_STOPPED_SINGLE) != 0,
	    ("curproc must be single-threaded"));
	/*
	 * There is no need to lock the list as only this thread can be
	 * running.
*/ FOREACH_THREAD_IN_PROC(p, td) { KASSERT(td == curthread || ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)), ("running thread %p %p", p, td)); umtx_thread_cleanup(td); td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0; } p->p_umtx_min_timeout = 0; } /* * thread exit hook. */ void umtx_thread_exit(struct thread *td) { umtx_thread_cleanup(td); } static int umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res, bool compat32) { u_long res1; uint32_t res32; int error; if (compat32) { error = fueword32((void *)ptr, &res32); if (error == 0) res1 = res32; } else { error = fueword((void *)ptr, &res1); } if (error == 0) *res = res1; else error = EFAULT; return (error); } static void umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list, bool compat32) { struct umutex32 m32; if (compat32) { memcpy(&m32, m, sizeof(m32)); *rb_list = m32.m_rb_lnk; } else { *rb_list = m->m_rb_lnk; } } static int umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact, bool compat32) { struct umutex m; int error; KASSERT(td->td_proc == curproc, ("need current vmspace")); error = copyin((void *)rbp, &m, sizeof(m)); if (error != 0) return (error); if (rb_list != NULL) umtx_read_rb_list(td, &m, rb_list, compat32); if ((m.m_flags & UMUTEX_ROBUST) == 0) return (EINVAL); if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid) /* inact is cleared after unlock, allow the inconsistency */ return (inact ? 0 : EINVAL); return (do_unlock_umutex(td, (struct umutex *)rbp, true)); } static void umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact, const char *name, bool compat32) { int error, i; uintptr_t rbp; bool inact; if (rb_list == 0) return; error = umtx_read_uptr(td, rb_list, &rbp, compat32); for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) { if (rbp == *rb_inact) { inact = true; *rb_inact = 0; } else inact = false; error = umtx_handle_rb(td, rbp, &rbp, inact, compat32); } if (i == umtx_max_rb && umtx_verbose_rb) { uprintf("comm %s pid %d: reached umtx %smax rb %d\n", td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb); } if (error != 0 && umtx_verbose_rb) { uprintf("comm %s pid %d: handling %srb error %d\n", td->td_proc->p_comm, td->td_proc->p_pid, name, error); } } /* * Clean up umtx data. */ static void umtx_thread_cleanup(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; uintptr_t rb_inact; bool compat32; /* * Disown pi mutexes. */ uq = td->td_umtxq; if (uq != NULL) { if (uq->uq_inherited_pri != PRI_MAX || !TAILQ_EMPTY(&uq->uq_pi_contested)) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = PRI_MAX; while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { pi->pi_owner = NULL; TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); } mtx_unlock(&umtx_lock); } sched_lend_user_prio_cond(td, PRI_MAX); } compat32 = (td->td_pflags2 & TDP2_COMPAT32RB) != 0; td->td_pflags2 &= ~TDP2_COMPAT32RB; if (td->td_rb_inact == 0 && td->td_rb_list == 0 && td->td_rbp_list == 0) return; /* * Handle terminated robust mutexes. Must be done after * robust pi disown, otherwise unlock could see unowned * entries. 
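	 *
	 * For context, the td_rb_list/td_rbp_list/td_rb_inact values consumed
	 * here were registered from userspace; a libthr-style registration
	 * sketch (assumed typical usage, not part of this file):
	 *
	 *	struct umtx_robust_lists_params rb = {
	 *		.robust_list_offset = (uintptr_t)&thr->robust_list,
	 *		.robust_priv_list_offset =
	 *		    (uintptr_t)&thr->priv_robust_list,
	 *		.robust_inact_offset = (uintptr_t)&thr->inact_mtx,
	 *	};
	 *	_umtx_op(NULL, UMTX_OP_ROBUST_LISTS, sizeof(rb), &rb, NULL);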
	 */
	rb_inact = td->td_rb_inact;
	if (rb_inact != 0)
		(void)umtx_read_uptr(td, rb_inact, &rb_inact, compat32);
	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "", compat32);
	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ", compat32);
	if (rb_inact != 0)
		(void)umtx_handle_rb(td, rb_inact, NULL, true, compat32);
}
diff --git a/sys/sys/_clock_id.h b/sys/sys/_clock_id.h
index 728346a0f0ab..83130d1f8a16 100644
--- a/sys/sys/_clock_id.h
+++ b/sys/sys/_clock_id.h
@@ -1,93 +1,97 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2021 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _SYS_SYS__CLOCK_ID_H
#define	_SYS_SYS__CLOCK_ID_H

/*
 * These macros are shared between time.h and sys/time.h.
 */

/*
 * Note: The values shown below as a comment for the __POSIX_VISIBLE values are
 * the ones FreeBSD traditionally used based on our reading of the POSIX
 * standards. However, glibc uses 199309 for all of them, even though many of
 * them were not defined there. Remaining bug-compatible with glibc means that
 * more software relying on the glibc behavior will compile easily on FreeBSD.
 *
 * Also, CLOCK_UPTIME_FAST is improperly visible temporarily for the lang/pocl
 * port until it can be updated properly. That port incorrectly assumes that
 * this was a standard value. It will be moved back to the __BSD_VISIBLE
 * section once the issue is corrected.
 */

#if __POSIX_VISIBLE >= 199309		/* 199506 */
#define	CLOCK_REALTIME	0
#endif /* __POSIX_VISIBLE >= 199309 */

#ifdef __BSD_VISIBLE
#define	CLOCK_VIRTUAL	1
#define	CLOCK_PROF	2
#endif /* __BSD_VISIBLE */

#if __POSIX_VISIBLE >= 199309		/* 200112 */
#define	CLOCK_MONOTONIC	4
#define	CLOCK_UPTIME_FAST	8
#endif /* __POSIX_VISIBLE >= 199309 */

#ifdef __BSD_VISIBLE
/*
 * FreeBSD-specific clocks.
 */
#define	CLOCK_UPTIME	5
#define	CLOCK_UPTIME_PRECISE	7
#define	CLOCK_REALTIME_PRECISE	9
#define	CLOCK_REALTIME_FAST	10
#define	CLOCK_MONOTONIC_PRECISE	11
#define	CLOCK_MONOTONIC_FAST	12
#define	CLOCK_SECOND	13
#endif /* __BSD_VISIBLE */

#if __POSIX_VISIBLE >= 199309		/* 200112 */
#define	CLOCK_THREAD_CPUTIME_ID	14
#define	CLOCK_PROCESS_CPUTIME_ID	15
#endif /* __POSIX_VISIBLE >= 199309 */

+#ifdef __BSD_VISIBLE
+#define	CLOCK_TAI	16
+#endif
+
/*
 * Linux compatible names.
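 *
 * A minimal userland sketch of the new CLOCK_TAI (hypothetical example,
 * assuming the kernel support added by this change; error handling elided):
 *
 *	struct timespec ts;
 *
 *	clock_gettime(CLOCK_TAI, &ts);	(yields UTC plus the current
 *					 leap-second offset once known)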
 */
#if __BSD_VISIBLE
#define	CLOCK_BOOTTIME	CLOCK_MONOTONIC
#define	CLOCK_REALTIME_COARSE	CLOCK_REALTIME_FAST
#define	CLOCK_MONOTONIC_COARSE	CLOCK_MONOTONIC_FAST
#endif

#if __BSD_VISIBLE
#define	TIMER_RELTIME	0x0	/* relative timer */
#endif
#if __POSIX_VISIBLE >= 199309
#define	TIMER_ABSTIME	0x1	/* absolute timer */
#endif /* __POSIX_VISIBLE >= 199309 */

#endif /* _SYS_SYS__CLOCK_ID_H */
diff --git a/sys/sys/timeffc.h b/sys/sys/timeffc.h
index a83b62b1672c..8bec73ed48a4 100644
--- a/sys/sys/timeffc.h
+++ b/sys/sys/timeffc.h
@@ -1,389 +1,391 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 The University of Melbourne
 * All rights reserved.
 *
 * This software was developed by Julien Ridoux at the University of Melbourne
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _SYS_TIMEFF_H_
#define _SYS_TIMEFF_H_

#include <sys/_ffcounter.h>

/*
 * Feed-forward clock estimate
 * Holds time mark as a ffcounter and conversion to bintime based on current
 * timecounter period and offset estimate passed by the synchronization daemon.
 * Provides time of last daemon update, clock status and bound on error.
 */
struct ffclock_estimate {
	struct bintime	update_time;	/* Time of last estimates update. */
	ffcounter	update_ffcount;	/* Counter value at last update. */
	ffcounter	leapsec_next;	/* Counter value of next leap second. */
	uint64_t	period;		/* Estimate of counter period. */
	uint32_t	errb_abs;	/* Bound on absolute clock error [ns]. */
	uint32_t	errb_rate;	/* Bound on counter rate error [ps/s]. */
	uint32_t	status;		/* Clock status. */
	int16_t		leapsec_total;	/* All leap seconds seen so far. */
	int8_t		leapsec;	/* Next leap second (in {-1,0,1}). */
};

#if __BSD_VISIBLE
#ifdef _KERNEL

/* Define the kern.sysclock sysctl tree. */
SYSCTL_DECL(_kern_sysclock);

/* Define the kern.sysclock.ffclock sysctl tree. */
SYSCTL_DECL(_kern_sysclock_ffclock);

/*
 * Index into the sysclocks array for obtaining the ASCII name of a particular
 * sysclock.
 */
#define	SYSCLOCK_FBCK	0
#define	SYSCLOCK_FFWD	1
extern int sysclock_active;

/*
 * Parameters of counter characterisation required by feed-forward algorithms.
*/ #define FFCLOCK_SKM_SCALE 1024 /* * Feed-forward clock status */ #define FFCLOCK_STA_UNSYNC 1 #define FFCLOCK_STA_WARMUP 2 /* * Flags for use by sysclock_snap2bintime() and various ffclock_ functions to * control how the timecounter hardware is read and how the hardware snapshot is * converted into absolute time. * {FB|FF}CLOCK_FAST: Do not read the hardware counter, instead using the * value at last tick. The time returned has a resolution * of the kernel tick timer (1/hz [s]). * FFCLOCK_LERP: Linear interpolation of ffclock time to guarantee * monotonic time. - * FFCLOCK_LEAPSEC: Include leap seconds. + * {FB|FF}CLOCK_LEAPSEC: Include leap seconds. * {FB|FF}CLOCK_UPTIME: Time stamp should be relative to system boot, not epoch. */ #define FFCLOCK_FAST 0x00000001 #define FFCLOCK_LERP 0x00000002 #define FFCLOCK_LEAPSEC 0x00000004 #define FFCLOCK_UPTIME 0x00000008 #define FFCLOCK_MASK 0x0000ffff #define FBCLOCK_FAST 0x00010000 /* Currently unused. */ #define FBCLOCK_UPTIME 0x00020000 +#define FBCLOCK_LEAPSEC 0x00040000 #define FBCLOCK_MASK 0xffff0000 /* * Feedback clock specific info structure. The feedback clock's estimation of * clock error is an absolute figure determined by the NTP algorithm. The status * is determined by the userland daemon. */ struct fbclock_info { struct bintime error; struct bintime tick_time; uint64_t th_scale; + long th_tai_offset; int status; }; /* * Feed-forward clock specific info structure. The feed-forward clock's * estimation of clock error is an upper bound, which although potentially * looser than the feedback clock equivalent, is much more reliable. The status * is determined by the userland daemon. */ struct ffclock_info { struct bintime error; struct bintime tick_time; struct bintime tick_time_lerp; uint64_t period; uint64_t period_lerp; int leapsec_adjustment; int status; }; /* * Snapshot of system clocks and related information. Holds time read from each * clock based on a single read of the active hardware timecounter, as well as * respective clock information such as error estimates and the ffcounter value * at the time of the read. */ struct sysclock_snap { struct fbclock_info fb_info; struct ffclock_info ff_info; ffcounter ffcount; unsigned int delta; int sysclock_active; }; /* Take a snapshot of the system clocks and related information. */ void sysclock_getsnapshot(struct sysclock_snap *clock_snap, int fast); /* Convert a timestamp from the selected system clock into bintime. */ int sysclock_snap2bintime(struct sysclock_snap *cs, struct bintime *bt, int whichclock, uint32_t flags); /* Resets feed-forward clock from RTC */ void ffclock_reset_clock(struct timespec *ts); /* * Return the current value of the feed-forward clock counter. Essential to * measure time interval in counter units. If a fast timecounter is used by the * system, may also allow fast but accurate timestamping. */ void ffclock_read_counter(ffcounter *ffcount); /* * Retrieve feed-forward counter value and time of last kernel tick. This * accepts the FFCLOCK_LERP flag. */ void ffclock_last_tick(ffcounter *ffcount, struct bintime *bt, uint32_t flags); /* * Low level routines to convert a counter timestamp into absolute time and a * counter timestamp interval into an interval in seconds. The absolute time * conversion accepts the FFCLOCK_LERP flag. */ void ffclock_convert_abs(ffcounter ffcount, struct bintime *bt, uint32_t flags); void ffclock_convert_diff(ffcounter ffdelta, struct bintime *bt); /* * Feed-forward clock routines. 
 *
 * These functions rely on the timecounters and ffclock_estimates stored in
 * fftimehands. Note that the error_bound parameter is not the error of the
 * clock but an upper bound on the error of the absolute time or time interval
 * returned.
 *
 * ffclock_abstime(): retrieves the current time as a counter value and
 *     converts this timestamp into seconds. The value (in seconds) of the
 *     converted timestamp depends on the flags passed: for a given counter
 *     value, different conversions are possible. Different clock models can
 *     be selected by combining flags (for example (FFCLOCK_LERP|FFCLOCK_UPTIME)
 *     produces linearly interpolated uptime).
 * ffclock_difftime(): computes a time interval in seconds based on an interval
 *     measured in ffcounter units. This should be the preferred way to measure
 *     small time intervals very accurately.
 */
void ffclock_abstime(ffcounter *ffcount, struct bintime *bt,
    struct bintime *error_bound, uint32_t flags);
void ffclock_difftime(ffcounter ffdelta, struct bintime *bt,
    struct bintime *error_bound);

/*
 * Wrapper routines to return current absolute time using the feed-forward
 * clock. These functions are named after those defined in <sys/time.h>, which
 * contains a description of the original ones.
 */
void ffclock_bintime(struct bintime *bt);
void ffclock_nanotime(struct timespec *tsp);
void ffclock_microtime(struct timeval *tvp);

void ffclock_getbintime(struct bintime *bt);
void ffclock_getnanotime(struct timespec *tsp);
void ffclock_getmicrotime(struct timeval *tvp);

void ffclock_binuptime(struct bintime *bt);
void ffclock_nanouptime(struct timespec *tsp);
void ffclock_microuptime(struct timeval *tvp);

void ffclock_getbinuptime(struct bintime *bt);
void ffclock_getnanouptime(struct timespec *tsp);
void ffclock_getmicrouptime(struct timeval *tvp);

/*
 * Wrapper routines to convert a time interval specified in ffcounter units
 * into seconds using the current feed-forward clock estimates.
 */
void ffclock_bindifftime(ffcounter ffdelta, struct bintime *bt);
void ffclock_nanodifftime(ffcounter ffdelta, struct timespec *tsp);
void ffclock_microdifftime(ffcounter ffdelta, struct timeval *tvp);

/*
 * When FFCLOCK is enabled in the kernel, [get]{bin,nano,micro}[up]time()
 * become wrappers around equivalent feedback or feed-forward functions.
 * Provide access outside of kern_tc.c to the feedback clock equivalent
 * functions for specialised use, i.e. these are not for general consumption.
 */
void fbclock_bintime(struct bintime *bt);
void fbclock_nanotime(struct timespec *tsp);
void fbclock_microtime(struct timeval *tvp);

void fbclock_getbintime(struct bintime *bt);
void fbclock_getnanotime(struct timespec *tsp);
void fbclock_getmicrotime(struct timeval *tvp);

void fbclock_binuptime(struct bintime *bt);
void fbclock_nanouptime(struct timespec *tsp);
void fbclock_microuptime(struct timeval *tvp);

void fbclock_getbinuptime(struct bintime *bt);
void fbclock_getnanouptime(struct timespec *tsp);
void fbclock_getmicrouptime(struct timeval *tvp);

/*
 * Public system clock wrapper API which allows consumers to select which clock
 * to obtain time from, independent of the current default system clock. These
 * wrappers should be used instead of directly calling the underlying fbclock_
 * or ffclock_ functions.
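 *
 * For example, an in-kernel consumer that must honour the currently active
 * system clock selection could do (sketch):
 *
 *	struct bintime bt;
 *
 *	bintime_fromclock(&bt, sysclock_active);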
*/ static inline void bintime_fromclock(struct bintime *bt, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_bintime(bt); else fbclock_bintime(bt); } static inline void nanotime_fromclock(struct timespec *tsp, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_nanotime(tsp); else fbclock_nanotime(tsp); } static inline void microtime_fromclock(struct timeval *tvp, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_microtime(tvp); else fbclock_microtime(tvp); } static inline void getbintime_fromclock(struct bintime *bt, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_getbintime(bt); else fbclock_getbintime(bt); } static inline void getnanotime_fromclock(struct timespec *tsp, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_getnanotime(tsp); else fbclock_getnanotime(tsp); } static inline void getmicrotime_fromclock(struct timeval *tvp, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_getmicrotime(tvp); else fbclock_getmicrotime(tvp); } static inline void binuptime_fromclock(struct bintime *bt, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_binuptime(bt); else fbclock_binuptime(bt); } static inline void nanouptime_fromclock(struct timespec *tsp, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_nanouptime(tsp); else fbclock_nanouptime(tsp); } static inline void microuptime_fromclock(struct timeval *tvp, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_microuptime(tvp); else fbclock_microuptime(tvp); } static inline void getbinuptime_fromclock(struct bintime *bt, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_getbinuptime(bt); else fbclock_getbinuptime(bt); } static inline void getnanouptime_fromclock(struct timespec *tsp, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_getnanouptime(tsp); else fbclock_getnanouptime(tsp); } static inline void getmicrouptime_fromclock(struct timeval *tvp, int whichclock) { if (whichclock == SYSCLOCK_FFWD) ffclock_getmicrouptime(tvp); else fbclock_getmicrouptime(tvp); } #else /* !_KERNEL */ /* Feed-Forward Clock system calls. */ __BEGIN_DECLS int ffclock_getcounter(ffcounter *ffcount); int ffclock_getestimate(struct ffclock_estimate *cest); int ffclock_setestimate(struct ffclock_estimate *cest); __END_DECLS #endif /* _KERNEL */ #endif /* __BSD_VISIBLE */ #endif /* _SYS_TIMEFF_H_ */ diff --git a/sys/sys/timex.h b/sys/sys/timex.h index 072297375792..03632bdb119c 100644 --- a/sys/sys/timex.h +++ b/sys/sys/timex.h @@ -1,169 +1,169 @@ /*- *********************************************************************** * * * Copyright (c) David L. Mills 1993-2001 * * Copyright (c) Poul-Henning Kamp 2000-2001 * * * * Permission to use, copy, modify, and distribute this software and * * its documentation for any purpose and without fee is hereby * * granted, provided that the above copyright notice appears in all * * copies and that both the copyright notice and this permission * * notice appear in supporting documentation, and that the name * * University of Delaware not be used in advertising or publicity * * pertaining to distribution of the software without specific, * * written prior permission. The University of Delaware makes no * * representations about the suitability this software for any * * purpose. It is provided "as is" without express or implied * * warranty. * * * *********************************************************************** * * This header file defines the Network Time Protocol (NTP) interfaces * for user and daemon application programs. 
 *
 * This file was originally created 17 Sep 93 by David L. Mills, Professor
 * at the University of Delaware, building on work which had already been
 * ongoing for a decade and a half at that point in time.
 *
 * In 2000 the APIs got an upgrade from microseconds to nanoseconds,
 * a joint work between Poul-Henning Kamp and David L. Mills.
 *
 */

#ifndef _SYS_TIMEX_H_
#define	_SYS_TIMEX_H_ 1

#define	NTP_API		4	/* NTP API version */

#ifdef __FreeBSD__
#include <sys/_timespec.h>
#endif /* __FreeBSD__ */

/*
 * The following defines establish the performance envelope of the
 * kernel discipline loop. Phase or frequency errors greater than
 * MAXPHASE or MAXFREQ are clamped to these maxima. For update intervals
 * less than MINSEC, the loop always operates in PLL mode; while, for
 * update intervals greater than MAXSEC, the loop always operates in FLL
 * mode. Between these two limits the operating mode is selected by the
 * STA_FLL bit in the status word.
 */

#define	MAXPHASE	500000000L	/* max phase error (ns) */
#define	MAXFREQ		500000L		/* max freq error (ns/s) */
#define	MINSEC		256		/* min FLL update interval (s) */
#define	MAXSEC		2048		/* max PLL update interval (s) */
#define	NANOSECOND	1000000000L	/* nanoseconds in one second */
#define	SCALE_PPM	(65536 / 1000)	/* crude ns/s to scaled PPM */
#define	MAXTC		10		/* max time constant */

/*
 * Control mode codes (timex.modes)
 */
#define	MOD_OFFSET	0x0001	/* set time offset */
#define	MOD_FREQUENCY	0x0002	/* set frequency offset */
#define	MOD_MAXERROR	0x0004	/* set maximum time error */
#define	MOD_ESTERROR	0x0008	/* set estimated time error */
#define	MOD_STATUS	0x0010	/* set clock status bits */
#define	MOD_TIMECONST	0x0020	/* set PLL time constant */
#define	MOD_PPSMAX	0x0040	/* set PPS maximum averaging time */
#define	MOD_TAI		0x0080	/* set TAI offset */
#define	MOD_MICRO	0x1000	/* select microsecond resolution */
#define	MOD_NANO	0x2000	/* select nanosecond resolution */
#define	MOD_CLKB	0x4000	/* select clock B */
#define	MOD_CLKA	0x8000	/* select clock A */

/*
 * Status codes (timex.status)
 */
#define	STA_PLL		0x0001	/* enable PLL updates (rw) */
#define	STA_PPSFREQ	0x0002	/* enable PPS freq discipline (rw) */
#define	STA_PPSTIME	0x0004	/* enable PPS time discipline (rw) */
#define	STA_FLL		0x0008	/* enable FLL mode (rw) */
#define	STA_INS		0x0010	/* insert leap (rw) */
#define	STA_DEL		0x0020	/* delete leap (rw) */
#define	STA_UNSYNC	0x0040	/* clock unsynchronized (rw) */
#define	STA_FREQHOLD	0x0080	/* hold frequency (rw) */
#define	STA_PPSSIGNAL	0x0100	/* PPS signal present (ro) */
#define	STA_PPSJITTER	0x0200	/* PPS signal jitter exceeded (ro) */
#define	STA_PPSWANDER	0x0400	/* PPS signal wander exceeded (ro) */
#define	STA_PPSERROR	0x0800	/* PPS signal calibration error (ro) */
#define	STA_CLOCKERR	0x1000	/* clock hardware fault (ro) */
#define	STA_NANO	0x2000	/* resolution (0 = us, 1 = ns) (ro) */
#define	STA_MODE	0x4000	/* mode (0 = PLL, 1 = FLL) (ro) */
#define	STA_CLK		0x8000	/* clock source (0 = A, 1 = B) (ro) */

#define	STA_RONLY	(STA_PPSSIGNAL | STA_PPSJITTER | STA_PPSWANDER | \
    STA_PPSERROR | STA_CLOCKERR | STA_NANO | STA_MODE | STA_CLK)

/*
 * Clock states (ntptimeval.time_state)
 */
#define	TIME_OK		0	/* no leap second warning */
#define	TIME_INS	1	/* insert leap second warning */
#define	TIME_DEL	2	/* delete leap second warning */
#define	TIME_OOP	3	/* leap second in progress */
#define	TIME_WAIT	4	/* leap second has occurred */
#define	TIME_ERROR	5	/* error (see status word) */

/*
 * NTP user interface -- ntp_gettime(2) -- used to read kernel clock values
 */
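/*
 * Sketch of typical readout (hypothetical userland example, error handling
 * elided; 'tai' below is the TAI-UTC offset that this change threads through
 * the kernel):
 *
 *	struct ntptimeval ntv;
 *
 *	if (ntp_gettime(&ntv) != TIME_ERROR)
 *		printf("TAI-UTC: %ld s, state %d\n", ntv.tai, ntv.time_state);
 */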
struct ntptimeval
{
	struct timespec	time;	/* current time (ns) (ro) */
	long	maxerror;	/* maximum error (us) (ro) */
	long	esterror;	/* estimated error (us) (ro) */
	long	tai;		/* TAI offset */
	int	time_state;	/* time status */
};

/*
 * NTP daemon interface -- ntp_adjtime(2) -- used to discipline CPU clock
 * oscillator and control/determine status.
 *
 * Note: The offset, precision and jitter members are in microseconds if
 * STA_NANO is zero and nanoseconds if not.
 */
struct timex {
	unsigned int modes;	/* clock mode bits (wo) */
	long	offset;		/* time offset (ns/us) (rw) */
	long	freq;		/* frequency offset (scaled PPM) (rw) */
	long	maxerror;	/* maximum error (us) (rw) */
	long	esterror;	/* estimated error (us) (rw) */
	int	status;		/* clock status bits (rw) */
	long	constant;	/* poll interval (log2 s) (rw) */
	long	precision;	/* clock precision (ns/us) (ro) */
	long	tolerance;	/* clock frequency tolerance (scaled
				 * PPM) (ro) */
	/*
	 * The following read-only structure members are implemented
	 * only if the PPS signal discipline is configured in the
	 * kernel. They are included in all configurations to ensure
	 * portability.
	 */
	long	ppsfreq;	/* PPS frequency (scaled PPM) (ro) */
	long	jitter;		/* PPS jitter (ns/us) (ro) */
	int	shift;		/* interval duration (s) (shift) (ro) */
	long	stabil;		/* PPS stability (scaled PPM) (ro) */
	long	jitcnt;		/* jitter limit exceeded (ro) */
	long	calcnt;		/* calibration intervals (ro) */
	long	errcnt;		/* calibration errors (ro) */
	long	stbcnt;		/* stability limit exceeded (ro) */
};

#ifdef __FreeBSD__
#ifdef _KERNEL
-void	ntp_update_second(int64_t *adjustment, time_t *newsec);
+void	ntp_update_second(int64_t *adjustment, time_t *newsec, long *tai_off);
#else /* !_KERNEL */
#include <sys/cdefs.h>

__BEGIN_DECLS
int	ntp_adjtime(struct timex *);
int	ntp_gettime(struct ntptimeval *);
__END_DECLS
#endif /* _KERNEL */
#endif /* __FreeBSD__ */

#endif /* !_SYS_TIMEX_H_ */
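For completeness, the daemon side publishes the TAI offset through
ntp_adjtime(2), passing it in the 'constant' field under MOD_TAI. The
following sketch mirrors what ntpd already does and is shown here only as an
illustration; the helper name is hypothetical:

	#include <sys/timex.h>

	static int
	set_tai_offset(long tai)
	{
		/* MOD_TAI carries the new TAI-UTC offset in 'constant'. */
		struct timex tx = { .modes = MOD_TAI, .constant = tai };

		return (ntp_adjtime(&tx) < 0 ? -1 : 0);
	}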