sys/sys/_atomic_subword.h
(35 lines elided)
 * because of the bits of the word we're trying to write rather than the rest
 * of the word.
 */

#ifndef _MACHINE_ATOMIC_H_
#error do not include this header, use machine/atomic.h
#endif

#include <machine/endian.h>
#ifndef _KERNEL
#include <stdbool.h>
#endif

#ifndef NBBY
#define NBBY 8
#endif

#define _ATOMIC_WORD_ALIGNED(p) \
    (uint32_t *)((__uintptr_t)(p) - ((__uintptr_t)(p) % 4))
(56 lines elided; the following is the tail of _atomic_fcmpset_masked_word())

         * and only attempt once, because the caller should be compensating for
         * that possibility.
         */
        *old = (*addr & ~mask) | *old;
        return (atomic_fcmpset_32(addr, old, (*old & ~mask) | val));
}
#endif
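The merge in `_atomic_fcmpset_masked_word()` is the subtle part: `*old` covers only the masked bits, so the current contents of the neighboring bits are folded in before the word-sized compare. Here is a minimal standalone sketch with made-up values (my own illustration, not part of the header), updating byte 1 of a word:

```
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint32_t word = 0xaabbccdd;     /* current contents of the aligned word */
        uint32_t mask = 0x0000ff00;     /* the op covers only byte 1 (0xcc) */
        uint32_t old = 0x0000cc00;      /* caller's expected byte, pre-shifted */
        uint32_t val = 0x00002100;      /* caller's new byte, pre-shifted */

        /* Adopt the current neighboring bytes into the expected value so a
         * concurrent change to them cannot fail the word-sized compare. */
        old = (word & ~mask) | old;
        printf("expect 0x%08x, store 0x%08x\n", old, (old & ~mask) | val);
        /* Prints: expect 0xaabbccdd, store 0xaabb21dd */
        return (0);
}
```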
#ifndef atomic_cmpset_8
static __inline int
atomic_cmpset_8(__volatile uint8_t *addr, uint8_t old, uint8_t val)
{
        int shift;

        shift = _ATOMIC_BYTE_SHIFT(addr);
        return (_atomic_cmpset_masked_word(_ATOMIC_WORD_ALIGNED(addr),
            old << shift, val << shift, 0xff << shift));
}
#endif
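A quick usage sketch (my own example; `try_lock_flag` is a hypothetical name): cmpset attempts the swap exactly once, which is all a try-lock needs:

```
/* Hypothetical try-lock on a byte-sized flag: returns nonzero if we
 * transitioned the flag from 0 (unlocked) to 1 (locked). */
static __inline int
try_lock_flag(volatile uint8_t *flag)
{
        return (atomic_cmpset_8(flag, 0, 1));
}
```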
#ifndef atomic_fcmpset_8
static __inline int
atomic_fcmpset_8(__volatile uint8_t *addr, uint8_t *old, uint8_t val)
{
        int ret, shift;
        uint32_t wold;

        shift = _ATOMIC_BYTE_SHIFT(addr);
        wold = *old << shift;
        ret = _atomic_fcmpset_masked_word(_ATOMIC_WORD_ALIGNED(addr),
            &wold, val << shift, 0xff << shift);
        if (ret == 0)
                *old = (wold >> shift) & 0xff;
        return (ret);
}
#endif
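Unlike cmpset, fcmpset writes the observed value back through `old` on failure, so a retry loop needs no separate reload. A sketch of that idiom (my own example; the saturating byte counter is hypothetical):

```
#include <sys/types.h>
#include <machine/atomic.h>

/* Hypothetical saturating increment of a byte counter, built on the
 * emulated atomic_fcmpset_8 above. */
static __inline void
counter8_inc(volatile uint8_t *p)
{
        uint8_t old;

        old = *p;       /* seed the expected value; a stale read is fine */
        do {
                if (old == 0xff)
                        return;         /* saturated; nothing to do */
        } while (!atomic_fcmpset_8(p, &old, old + 1));
}
```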
#ifndef atomic_cmpset_16
static __inline int
atomic_cmpset_16(__volatile uint16_t *addr, uint16_t old, uint16_t val)
{
        int shift;

        shift = _ATOMIC_HWORD_SHIFT(addr);
        return (_atomic_cmpset_masked_word(_ATOMIC_WORD_ALIGNED(addr),
            old << shift, val << shift, 0xffff << shift));
}
#endif
#ifndef atomic_fcmpset_16
static __inline int
atomic_fcmpset_16(__volatile uint16_t *addr, uint16_t *old, uint16_t val)
{
        int ret, shift;
        uint32_t wold;

        shift = _ATOMIC_HWORD_SHIFT(addr);
        wold = *old << shift;
        ret = _atomic_fcmpset_masked_word(_ATOMIC_WORD_ALIGNED(addr),
            &wold, val << shift, 0xffff << shift);
        if (ret == 0)
                *old = (wold >> shift) & 0xffff;
        return (ret);
}
#endif
#ifndef atomic_load_acq_8
static __inline uint8_t
atomic_load_acq_8(volatile uint8_t *p)
{
        int shift;
        uint8_t ret;

        shift = _ATOMIC_BYTE_SHIFT(p);
        ret = (atomic_load_acq_32(_ATOMIC_WORD_ALIGNED(p)) >> shift) & 0xff;
        return (ret);
}
#endif
#ifndef atomic_load_acq_16
static __inline uint16_t
atomic_load_acq_16(volatile uint16_t *p)
{
        int shift;
        uint16_t ret;

        shift = _ATOMIC_HWORD_SHIFT(p);
        ret = (atomic_load_acq_32(_ATOMIC_WORD_ALIGNED(p)) >> shift) & 0xffff;
        return (ret);
}
#endif
rlibby: I don't like that this implementation silently hides misalignment, and indeed produces a wrong result. Consider what happens when the pointer is aligned to byte 3 of the 4-byte word. I think it would be better to do a misaligned load and maybe trap.

Here's a thought: [...]

I see the (f)cmpset_16 here has the same problem, and I realize you probably don't care much about misaligned pointers. I wouldn't insist on a change. I just find this behavior surprising.
rlibby: Actually I think maybe the whole thing can be addressed with this

```
/* Align to word, but keep 2-byte misalignment if present. */
#define _ATOMIC_ALIGN_HWORD_PTR(p) \
    (uint32_t *)((uintptr_t)(p) & ~(uintptr_t)2)

#if _BYTE_ORDER == _BIG_ENDIAN
#define _ATOMIC_HWORD_SHIFT(p) \
    ((1 - (((uintptr_t)(p) >> 1) & 1)) * 2 * NBBY)
#else
#define _ATOMIC_HWORD_SHIFT(p) \
    ((((uintptr_t)(p) >> 1) & 1) * 2 * NBBY)
#endif
```

kevans: Hmm, I thought I had tested all the flavors of misalignment on both BE/LE with the original implementation. The math was intended to work out such that you align to the containing word and shift in/out of the correct position within the containing word... I'll take another look again, though.

rlibby: Right, the problem is that with misalignment at 3/4, there is no containing word for a 2-byte op, since it spans two 4-byte words. What happens now is we round down to the 4-byte boundary. The proposal above is to round down to 1/4 and pass along the misalignment to the 4-byte op. I'm fine leaving this out of the current diff and addressing in follow up, if preferred.

cem: `if (p & 0x1) panic("misaligned");`

cem: Unfortunately, including panic()'s systm.h would be extremely header-pollutiony, and other hacks are unappealing. I think we just trust people not to misalign 16-bit atomics on platforms that do not natively support atomics smaller than a 32-bit word. I don't think such platforms would support an atomic 32-bit word operation misaligned by 2 bytes anyway.

rlibby: I'm okay with not addressing this.
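To make the failure mode concrete, here is a standalone sketch of the offset-3 case described above (my own illustration; it copies the macro locally and assumes GCC/Clang alignment attributes):

```
#include <stdint.h>
#include <stdio.h>

/* Local copy of the header's macro so this builds standalone. */
#define _ATOMIC_WORD_ALIGNED(p) \
    (uint32_t *)((uintptr_t)(p) - ((uintptr_t)(p) % 4))

int
main(void)
{
        /* GCC/Clang attribute; guarantees buf starts on a word boundary. */
        uint8_t buf[8] __attribute__((aligned(4))) = { 0 };
        uint16_t *p = (uint16_t *)&buf[3];      /* straddles buf[3] and buf[4] */

        /* The emulation rounds down to &buf[0] and operates on that word
         * only, so buf[4] -- half of *p -- is never part of the operation. */
        printf("word used: %p; *p occupies %p..%p\n",
            (void *)_ATOMIC_WORD_ALIGNED(p), (void *)p,
            (void *)((uint8_t *)p + 1));
        return (0);
}
```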
#undef _ATOMIC_WORD_ALIGNED
#undef _ATOMIC_BYTE_SHIFT
#undef _ATOMIC_HWORD_SHIFT
/*
 * Provide a generic testandset_long implementation based on the fcmpset_long
 * primitive.  It may not be ideal for any given arch, so machine/atomic.h
 * should define the macro atomic_testandset_long to override with an
 * MD-specific version.
 *
 * (Organizationally, this isn't really subword atomics.  But atomic_common is
 * included too early in machine/atomic.h, so it isn't a good place for derived
 * primitives like this.)
 */

kevans: "early"
cem: Thanks, will fix
#ifndef atomic_testandset_long
static __inline int
atomic_testandset_long(volatile u_long *p, u_int v)
{
        u_long bit, old;
        bool ret;

        bit = (1ul << (v % (sizeof(*p) * NBBY)));
        old = atomic_load_acq_long(p);
        ret = false;
        while (!ret && (old & bit) == 0)
                ret = atomic_fcmpset_acq_long(p, &old, old | bit);
        return (!ret);
}
#endif

rlibby: Just noticed, why do we have acquire semantics here? atomic(9) does not say these should have any memory barriers. I think plain atomics should be sufficient for the logic here, or am I misunderstanding what the barrier is providing?

cem: I don't recall exactly what I was thinking. I think plausibly I was imagining this primitive not making sense without acquire semantics? Of course, to provide those semantics, I think we only need _acq on the first atomic_load, not on the rest. There are very few consumers in tree: qlnxe(4), and hyperv's vmbus(4) and netvsc(4). (Also my out of tree fxrng set uses it, which is my interest in adding it to _atomic_subword.) In vmbus(4)'s case, it is used as an assertion, as if it has acquire semantics; certainly operations after it in program order must not be reordered before it. In netvsc(4) it is used as an allocator from a bitmap. The nearby code could tolerate reordering, but you wouldn't want to allow the CPU to reorder consumers of the allocator before allocation. I guess there might be a data dependency here that prevents reordering anyway. I'm not super familiar with when it would be acceptable to elide otherwise correct semantics on the basis of a data dependency. qlnxe(4), in proper qlnx* style, does not actually use the testand property of this primitive; it would be fine with atomic_set_foo. (In my out of tree use, I also want to prevent reordering of loads/stores before the testandset.)

rlibby: I'm not arguing against providing acq variants, if needed, but ISTM no caller can actually rely…

cem: If it is the case that this primitive only makes sense with acq semantics, I think it would make sense to fix the documentation. But I'm not sure. From discussion with kib on IRC I think the right thing to do here is provide the relaxed variant, and then for my use, also provide a generic acquire semantics version. I'll update the patch. I'm not sure if the best generic way to provide acquire semantics is to use acquire primitives, or to slap a fence in somewhere. On platforms with cheap acquire primitives, the fence is costlier, but if a platform is just implementing acquire with fenced primitives, many fences is probably worse. I guess we can slap a single fence in and let platforms provide better MD variants themselves if it is not optimal.
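The thread above converges on shipping a relaxed generic version plus a separate acquire flavor. A minimal sketch of the fence-based acquire wrapper being contemplated (my own sketch, assuming a relaxed `atomic_testandset_long` and FreeBSD's `atomic_thread_fence_acq()`; an MD atomic.h could still override it):

```
#ifndef atomic_testandset_acq_long
static __inline int
atomic_testandset_acq_long(volatile u_long *p, u_int v)
{
        int ret;

        ret = atomic_testandset_long(p, v);
        /* Keep later loads and stores from being reordered before the
         * test-and-set; platforms with cheap acquire primitives would
         * want an MD override instead of a full fence. */
        atomic_thread_fence_acq();
        return (ret);
}
#endif
```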
rlibby: s/testandset/testandclear/
cem: Will fix, thanks

#ifndef atomic_testandclear_long
static __inline int
atomic_testandclear_long(volatile u_long *p, u_int v)
{
        u_long bit, old;
        bool ret;

        bit = (1ul << (v % (sizeof(*p) * NBBY)));
        old = atomic_load_acq_long(p);
        ret = false;
        while (!ret && (old & bit) != 0)
                ret = atomic_fcmpset_acq_long(p, &old, old & ~bit);
        return (ret);
}
#endif
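As a usage sketch of the primitive (my own example, loosely modeled on the bitmap-allocator pattern mentioned for netvsc(4) above; the function name is hypothetical):

```
/* Hypothetical: claim the first free slot in a one-word bitmap.
 * Returns the slot index, or -1 if every bit is already set. */
static __inline int
bitmap_alloc_slot(volatile u_long *map)
{
        u_int i;

        for (i = 0; i < sizeof(*map) * NBBY; i++) {
                /* testandset returns the prior bit: 0 means we won it. */
                if (atomic_testandset_long(map, i) == 0)
                        return (i);
        }
        return (-1);
}
```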
#endif /* _SYS__ATOMIC_SUBWORD_H_ */