sys/sys/_atomic_subword.h
(35 lines elided)
 * because of the bits of the word we're trying to write rather than the rest
 * of the word.
 */

#ifndef _MACHINE_ATOMIC_H_
#error do not include this header, use machine/atomic.h
#endif

#include <machine/endian.h>
#ifndef _KERNEL
#include <stdbool.h>
#endif

#ifndef NBBY
#define NBBY 8
#endif

#define _ATOMIC_WORD_ALIGNED(p) \
    (uint32_t *)((__uintptr_t)(p) - ((__uintptr_t)(p) % 4))
(56 lines elided; the following is the tail of _atomic_fcmpset_masked_word())

         * and only attempt once, because the caller should be compensating for
         * that possibility.
         */
        *old = (*addr & ~mask) | *old;
        return (atomic_fcmpset_32(addr, old, (*old & ~mask) | val));
}
#endif
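The merge in `_atomic_fcmpset_masked_word()` is the subtle part: `*old` covers only the masked bits, so the current contents of the neighboring bits are folded in before the word-sized compare. Here is a minimal standalone sketch with made-up values (my own illustration, not part of the header), updating byte 1 of a word:

```
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint32_t word = 0xaabbccdd;     /* current contents of the aligned word */
        uint32_t mask = 0x0000ff00;     /* the op covers only byte 1 (0xcc) */
        uint32_t old = 0x0000cc00;      /* caller's expected byte, pre-shifted */
        uint32_t val = 0x00002100;      /* caller's new byte, pre-shifted */

        /* Adopt the current neighboring bytes into the expected value so a
         * concurrent change to them cannot fail the word-sized compare. */
        old = (word & ~mask) | old;
        printf("expect 0x%08x, store 0x%08x\n", old, (old & ~mask) | val);
        /* Prints: expect 0xaabbccdd, store 0xaabb21dd */
        return (0);
}
```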
#ifndef atomic_cmpset_8
static __inline int
atomic_cmpset_8(__volatile uint8_t *addr, uint8_t old, uint8_t val)
{
        int shift;

        shift = _ATOMIC_BYTE_SHIFT(addr);
        return (_atomic_cmpset_masked_word(_ATOMIC_WORD_ALIGNED(addr),
            old << shift, val << shift, 0xff << shift));
}
#endif
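A quick usage sketch (my own example; `try_lock_flag` is a hypothetical name): cmpset attempts the swap exactly once, which is all a try-lock needs:

```
/* Hypothetical try-lock on a byte-sized flag: returns nonzero if we
 * transitioned the flag from 0 (unlocked) to 1 (locked). */
static __inline int
try_lock_flag(volatile uint8_t *flag)
{
        return (atomic_cmpset_8(flag, 0, 1));
}
```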
#ifndef atomic_fcmpset_8
static __inline int
atomic_fcmpset_8(__volatile uint8_t *addr, uint8_t *old, uint8_t val)
{
        int ret, shift;
        uint32_t wold;

        shift = _ATOMIC_BYTE_SHIFT(addr);
        wold = *old << shift;
        ret = _atomic_fcmpset_masked_word(_ATOMIC_WORD_ALIGNED(addr),
            &wold, val << shift, 0xff << shift);
        if (ret == 0)
                *old = (wold >> shift) & 0xff;
        return (ret);
}
#endif
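Unlike cmpset, fcmpset writes the observed value back through `old` on failure, so a retry loop needs no separate reload. A sketch of that idiom (my own example; the saturating byte counter is hypothetical):

```
#include <sys/types.h>
#include <machine/atomic.h>

/* Hypothetical saturating increment of a byte counter, built on the
 * emulated atomic_fcmpset_8 above. */
static __inline void
counter8_inc(volatile uint8_t *p)
{
        uint8_t old;

        old = *p;       /* seed the expected value; a stale read is fine */
        do {
                if (old == 0xff)
                        return;         /* saturated; nothing to do */
        } while (!atomic_fcmpset_8(p, &old, old + 1));
}
```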
#ifndef atomic_cmpset_16
static __inline int
atomic_cmpset_16(__volatile uint16_t *addr, uint16_t old, uint16_t val)
{
        int shift;

        shift = _ATOMIC_HWORD_SHIFT(addr);
        return (_atomic_cmpset_masked_word(_ATOMIC_WORD_ALIGNED(addr),
            old << shift, val << shift, 0xffff << shift));
}
#endif
#ifndef atomic_fcmpset_16
static __inline int
atomic_fcmpset_16(__volatile uint16_t *addr, uint16_t *old, uint16_t val)
{
        int ret, shift;
        uint32_t wold;

        shift = _ATOMIC_HWORD_SHIFT(addr);
        wold = *old << shift;
        ret = _atomic_fcmpset_masked_word(_ATOMIC_WORD_ALIGNED(addr),
            &wold, val << shift, 0xffff << shift);
        if (ret == 0)
                *old = (wold >> shift) & 0xffff;
        return (ret);
}
#endif
#ifndef atomic_load_acq_8
static __inline uint8_t
atomic_load_acq_8(volatile uint8_t *p)
{
        int shift;
        uint8_t ret;

        shift = _ATOMIC_BYTE_SHIFT(p);
        ret = (atomic_load_acq_32(_ATOMIC_WORD_ALIGNED(p)) >> shift) & 0xff;
        return (ret);
}
#endif
#ifndef atomic_load_acq_16
static __inline uint16_t
atomic_load_acq_16(volatile uint16_t *p)
{
        int shift;
        uint16_t ret;

        shift = _ATOMIC_HWORD_SHIFT(p);
        ret = (atomic_load_acq_32(_ATOMIC_WORD_ALIGNED(p)) >> shift) & 0xffff;
        return (ret);
}
#endif
rlibby: I don't like that this implementation silently hides misalignment, and indeed produces a wrong result. Consider what happens when the pointer is aligned to byte 3 of the 4-byte word. I think it would be better to do a misaligned load and maybe trap.

Here's a thought: [...]

I see the (f)cmpset_16 here has the same problem, and I realize you probably don't care much about misaligned pointers. I wouldn't insist on a change. I just find this behavior surprising.
rlibby: Actually I think maybe the whole thing can be addressed with this

```
/* Align to word, but keep 2-byte misalignment if present. */
#define _ATOMIC_ALIGN_HWORD_PTR(p) \
    (uint32_t *)((uintptr_t)(p) & ~(uintptr_t)2)

#if _BYTE_ORDER == _BIG_ENDIAN
#define _ATOMIC_HWORD_SHIFT(p) \
    ((1 - (((uintptr_t)(p) >> 1) & 1)) * 2 * NBBY)
#else
#define _ATOMIC_HWORD_SHIFT(p) \
    ((((uintptr_t)(p) >> 1) & 1) * 2 * NBBY)
#endif
```

kevans: Hmm, I thought I had tested all the flavors of misalignment on both BE/LE with the original implementation. The math was intended to work out such that you align to the containing word and shift in/out of the correct position within the containing word... I'll take another look again, though.

rlibby: Right, the problem is that with misalignment at 3/4, there is no containing word for a 2-byte op, since it spans two 4-byte words. What happens now is we round down to the 4-byte boundary. The proposal above is to round down to 1/4 and pass along the misalignment to the 4-byte op. I'm fine leaving this out of the current diff and addressing in follow up, if preferred.

cem: `if (p & 0x1) panic("misaligned");`

cem: Unfortunately, including panic()'s systm.h would be extremely header-pollutiony, and other hacks are unappealing. I think we just trust people not to misalign 16-bit atomics on platforms that do not natively support atomics smaller than a 32-bit word. I don't think such platforms would support an atomic 32-bit word operation misaligned by 2 bytes anyway.

rlibby: I'm okay with not addressing this.
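To make the failure mode concrete, here is a standalone sketch of the offset-3 case described above (my own illustration; it copies the macro locally and assumes GCC/Clang alignment attributes):

```
#include <stdint.h>
#include <stdio.h>

/* Local copy of the header's macro so this builds standalone. */
#define _ATOMIC_WORD_ALIGNED(p) \
    (uint32_t *)((uintptr_t)(p) - ((uintptr_t)(p) % 4))

int
main(void)
{
        /* GCC/Clang attribute; guarantees buf starts on a word boundary. */
        uint8_t buf[8] __attribute__((aligned(4))) = { 0 };
        uint16_t *p = (uint16_t *)&buf[3];      /* straddles buf[3] and buf[4] */

        /* The emulation rounds down to &buf[0] and operates on that word
         * only, so buf[4] -- half of *p -- is never part of the operation. */
        printf("word used: %p; *p occupies %p..%p\n",
            (void *)_ATOMIC_WORD_ALIGNED(p), (void *)p,
            (void *)((uint8_t *)p + 1));
        return (0);
}
```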
#undef _ATOMIC_WORD_ALIGNED
#undef _ATOMIC_BYTE_SHIFT
#undef _ATOMIC_HWORD_SHIFT
/*
 * Provide a generic testandset_long implementation based on the fcmpset_long
 * primitive.  It may not be ideal for any given arch, so machine/atomic.h
 * should define the macro atomic_testandset_long to override with an
 * MD-specific version.
 *
 * (Organizationally, this isn't really subword atomics.  But atomic_common is
 * included too early in machine/atomic.h, so it isn't a good place for derived
 * primitives like this.)
 */

kevans: "early"
cem: Thanks, will fix
#ifndef atomic_testandset_long
static __inline int
atomic_testandset_long(volatile u_long *p, u_int v)
{
        u_long bit, old;
        bool ret;

        bit = (1ul << (v % (sizeof(*p) * NBBY)));
        old = atomic_load_acq_long(p);
        ret = false;
        while (!ret && (old & bit) == 0)
                ret = atomic_fcmpset_acq_long(p, &old, old | bit);
        return (!ret);
}
#endif

rlibby: Just noticed, why do we have acquire semantics here? atomic(9) does not say these should have any memory barriers. I think plain atomics should be sufficient for the logic here, or am I misunderstanding what the barrier is providing?

cem: I don't recall exactly what I was thinking. I think plausibly I was imagining this primitive not making sense without acquire semantics? Of course, to provide those semantics, I think we only need _acq on the first atomic_load, not on the rest. There are very few consumers in tree: qlnxe(4), and hyperv's vmbus(4) and netvsc(4). (Also my out of tree fxrng set uses it, which is my interest in adding it to _atomic_subword.) In vmbus(4)'s case, it is used as an assertion, as if it has acquire semantics; certainly operations after it in program order must not be reordered before it. In netvsc(4) it is used as an allocator from a bitmap. The nearby code could tolerate reordering, but you wouldn't want to allow the CPU to reorder consumers of the allocator before allocation. I guess there might be a data dependency here that prevents reordering anyway. I'm not super familiar with when it would be acceptable to elide otherwise correct semantics on the basis of a data dependency. qlnxe(4), in proper qlnx* style, does not actually use the testand property of this primitive; it would be fine with atomic_set_foo. (In my out of tree use, I also want to prevent reordering of loads/stores before the testandset.)

rlibby: I'm not arguing against providing acq variants, if needed, but ISTM no caller can actually rely…

cem: If it is the case that this primitive only makes sense with acq semantics, I think it would make sense to fix the documentation. But I'm not sure. From discussion with kib on IRC I think the right thing to do here is provide the relaxed variant, and then for my use, also provide a generic acquire semantics version. I'll update the patch. I'm not sure if the best generic way to provide acquire semantics is to use acquire primitives, or to slap a fence in somewhere. On platforms with cheap acquire primitives, the fence is costlier, but if a platform is just implementing acquire with fenced primitives, many fences is probably worse. I guess we can slap a single fence in and let platforms provide better MD variants themselves if it is not optimal.
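The thread above converges on shipping a relaxed generic version plus a separate acquire flavor. A minimal sketch of the fence-based acquire wrapper being contemplated (my own sketch, assuming a relaxed `atomic_testandset_long` and FreeBSD's `atomic_thread_fence_acq()`; an MD atomic.h could still override it):

```
#ifndef atomic_testandset_acq_long
static __inline int
atomic_testandset_acq_long(volatile u_long *p, u_int v)
{
        int ret;

        ret = atomic_testandset_long(p, v);
        /* Keep later loads and stores from being reordered before the
         * test-and-set; platforms with cheap acquire primitives would
         * want an MD override instead of a full fence. */
        atomic_thread_fence_acq();
        return (ret);
}
#endif
```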
rlibby: s/testandset/testandclear/
cem: Will fix, thanks

#ifndef atomic_testandclear_long
static __inline int
atomic_testandclear_long(volatile u_long *p, u_int v)
{
        u_long bit, old;
        bool ret;

        bit = (1ul << (v % (sizeof(*p) * NBBY)));
        old = atomic_load_acq_long(p);
        ret = false;
        while (!ret && (old & bit) != 0)
                ret = atomic_fcmpset_acq_long(p, &old, old & ~bit);
        return (ret);
}
#endif
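As a usage sketch of the primitive (my own example, loosely modeled on the bitmap-allocator pattern mentioned for netvsc(4) above; the function name is hypothetical):

```
/* Hypothetical: claim the first free slot in a one-word bitmap.
 * Returns the slot index, or -1 if every bit is already set. */
static __inline int
bitmap_alloc_slot(volatile u_long *map)
{
        u_int i;

        for (i = 0; i < sizeof(*map) * NBBY; i++) {
                /* testandset returns the prior bit: 0 means we won it. */
                if (atomic_testandset_long(map, i) == 0)
                        return (i);
        }
        return (-1);
}
```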
#endif /* _SYS__ATOMIC_SUBWORD_H_ */