Changeset View
Standalone View
sys/compat/linux/linux_mmap.c
Show First 20 Lines • Show All 44 Lines • ▼ Show 20 Lines | |||||
#include <sys/sysent.h> | #include <sys/sysent.h> | ||||
#include <sys/sysproto.h> | #include <sys/sysproto.h> | ||||
#include <vm/pmap.h> | #include <vm/pmap.h> | ||||
#include <vm/vm_extern.h> | #include <vm/vm_extern.h> | ||||
#include <vm/vm_map.h> | #include <vm/vm_map.h> | ||||
#include <compat/linux/linux_emul.h> | #include <compat/linux/linux_emul.h> | ||||
#include <compat/linux/linux_mib.h> | |||||
#include <compat/linux/linux_mmap.h> | #include <compat/linux/linux_mmap.h> | ||||
#include <compat/linux/linux_persona.h> | #include <compat/linux/linux_persona.h> | ||||
#include <compat/linux/linux_util.h> | #include <compat/linux/linux_util.h> | ||||
#define STACK_SIZE (2 * 1024 * 1024) | #define STACK_SIZE (2 * 1024 * 1024) | ||||
#define GUARD_SIZE (4 * PAGE_SIZE) | #define GUARD_SIZE (4 * PAGE_SIZE) | ||||
#if defined(__amd64__) | #if defined(__amd64__) | ||||
static void linux_fixup_prot(struct thread *td, int *prot); | static void linux_fixup_prot(struct thread *td, int *prot); | ||||
#endif | #endif | ||||
static int | static int | ||||
▲ Show 20 Lines • Show All 169 Lines • ▼ Show 20 Lines | linux_mprotect_common(struct thread *td, uintptr_t addr, size_t len, int prot) | ||||
prot &= ~(LINUX_PROT_GROWSDOWN | LINUX_PROT_GROWSUP); | prot &= ~(LINUX_PROT_GROWSDOWN | LINUX_PROT_GROWSUP); | ||||
if ((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) | if ((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) | ||||
return (EINVAL); | return (EINVAL); | ||||
#if defined(__amd64__) | #if defined(__amd64__) | ||||
linux_fixup_prot(td, &prot); | linux_fixup_prot(td, &prot); | ||||
#endif | #endif | ||||
return (kern_mprotect(td, addr, len, prot)); | return (kern_mprotect(td, addr, len, prot)); | ||||
} | |||||
int | |||||
linux_madvise_common(struct thread *td, uintptr_t addr, size_t len, int l_behav) | |||||
{ | |||||
int behav; | |||||
switch (l_behav) { | |||||
case LINUX_MADV_NORMAL: | |||||
behav = MADV_NORMAL; | |||||
break; | |||||
case LINUX_MADV_RANDOM: | |||||
behav = MADV_RANDOM; | |||||
break; | |||||
case LINUX_MADV_SEQUENTIAL: | |||||
behav = MADV_SEQUENTIAL; | |||||
break; | |||||
case LINUX_MADV_WILLNEED: | |||||
behav = MADV_WILLNEED; | |||||
break; | |||||
case LINUX_MADV_DONTNEED: | |||||
/* | |||||
* Note that the semantics of Linux MADV_DONTNEED is very | |||||
* different from the FreeBSD's madvise(2) flag of the same name. | |||||
*/ | |||||
switch (linux_madv_dontneed) { | |||||
markj: Instead of having a sysctl, let's just map it to FreeBSD MADV_DONTNEED for now. I will write… | |||||
traszAuthorUnsubmitted Done Inline ActionsPerfect, thank you! trasz: Perfect, thank you!
| |||||
case 0: | |||||
return (0); | |||||
case 1: | |||||
behav = MADV_DONTNEED; | |||||
break; | |||||
case 2: | |||||
behav = MADV_FREE; | |||||
break; | |||||
case -1: | |||||
default: | |||||
linux_msg(curthread, "unsupported madvise MADV_DONTNEED"); | |||||
return (EINVAL); | |||||
} | |||||
break; | |||||
case LINUX_MADV_FREE: | |||||
behav = MADV_FREE; | |||||
break; | |||||
case LINUX_MADV_REMOVE: | |||||
linux_msg(curthread, "unsupported madvise MADV_REMOVE"); | |||||
return (EINVAL); | |||||
case LINUX_MADV_DONTFORK: | |||||
return (kern_minherit(td, addr, len, INHERIT_NONE)); | |||||
case LINUX_MADV_DOFORK: | |||||
return (kern_minherit(td, addr, len, INHERIT_SHARE)); | |||||
markjUnsubmitted Done Inline ActionsDOFORK translates to VM_INHERIT_DEFAULT (i.e. VM_INHERIT_COPY), not _SHARE. markj: DOFORK translates to VM_INHERIT_DEFAULT (i.e. VM_INHERIT_COPY), not _SHARE. | |||||
traszAuthorUnsubmitted Done Inline ActionsThanks; I'll also change the spelling for MADV_KEEPONFORK. trasz: Thanks; I'll also change the spelling for MADV_KEEPONFORK. | |||||
case LINUX_MADV_MERGEABLE: | |||||
linux_msg(curthread, "unsupported madvise MADV_MERGEABLE"); | |||||
return (EINVAL); | |||||
markjUnsubmitted Not Done Inline ActionsMERGEABLE can simply return 0. markj: MERGEABLE can simply return 0. | |||||
traszAuthorUnsubmitted Done Inline ActionsI'm not sure about this one. Note that this is not about merging map entries. Quoting Linux madvise(2): Enable Kernel Samepage Merging (KSM) for the pages in the range specified by addr and length. The kernel regularly scans those areas of user memory that have been marked as mergeable, looking for pages with identical content. These are replaced by a single write-protected page (which is automatically copied if a process later wants to update the content of the page). KSM merges only private anonymous pages (see mmap(2)). The KSM feature is intended for applications that generate many instances of the same data (e.g., virtualization systems such as KVM). It can consume a lot of processing power; use with care. See the Linux kernel source file Documentation/vm/ksm.txt for more details. The MADV_MERGEABLE and MADV_UNMERGEABLE operations are available only if the kernel was configured with CONFIG_KSM. trasz: I'm not sure about this one. Note that this is not about merging map entries. Quoting Linux… | |||||
markjUnsubmitted Not Done Inline ActionsRight, it is an advisory flag that tells the kernel it is allowed to deduplicate anonymous pages. The kernel is allowed to do nothing. But reading the last sentence you quoted, I guess it is ok to leave it as-is, since we can also just pretend that we are a kernel without CONFIG_KSM defined. markj: Right, it is an advisory flag that tells the kernel it is allowed to deduplicate anonymous… | |||||
case LINUX_MADV_UNMERGEABLE: | |||||
/* We don't merge anyway. */ | |||||
return (0); | |||||
case LINUX_MADV_HUGEPAGE: | |||||
/* Ignored; on FreeBSD huge pages are always on. */ | |||||
return (0); | |||||
case LINUX_MADV_NOHUGEPAGE: | |||||
linux_msg(curthread, "unsupported madvise MADV_NOHUGEPAGE"); | |||||
return (EINVAL); | |||||
case LINUX_MADV_DONTDUMP: | |||||
behav = MADV_NOCORE; | |||||
break; | |||||
case LINUX_MADV_DODUMP: | |||||
behav = MADV_CORE; | |||||
break; | |||||
case LINUX_MADV_WIPEONFORK: | |||||
return (kern_minherit(td, addr, len, INHERIT_ZERO)); | |||||
case LINUX_MADV_KEEPONFORK: | |||||
return (kern_minherit(td, addr, len, INHERIT_COPY)); | |||||
case LINUX_MADV_HWPOISON: | |||||
linux_msg(curthread, "unsupported madvise MADV_HWPOISON"); | |||||
return (EINVAL); | |||||
case LINUX_MADV_SOFT_OFFLINE: | |||||
linux_msg(curthread, "unsupported madvise MADV_SOFT_OFFLINE"); | |||||
return (EINVAL); | |||||
default: | |||||
linux_msg(curthread, "unsupported madvise behav %d", l_behav); | |||||
return (EINVAL); | |||||
} | |||||
return (kern_madvise(td, addr, len, behav)); | |||||
markjUnsubmitted Done Inline ActionsI think this function would be a bit clearer if every case contained a return statement. We can call kern_madvise() directly from cases where a Linux madvise() verb maps to a FreeBSD madvise() verb. markj: I think this function would be a bit clearer if every case contained a return statement. We can… | |||||
} | } | ||||
#if defined(__amd64__) | #if defined(__amd64__) | ||||
static void | static void | ||||
linux_fixup_prot(struct thread *td, int *prot) | linux_fixup_prot(struct thread *td, int *prot) | ||||
{ | { | ||||
struct linux_pemuldata *pem; | struct linux_pemuldata *pem; | ||||
if (SV_PROC_FLAG(td->td_proc, SV_ILP32) && *prot & PROT_READ) { | if (SV_PROC_FLAG(td->td_proc, SV_ILP32) && *prot & PROT_READ) { | ||||
pem = pem_find(td->td_proc); | pem = pem_find(td->td_proc); | ||||
if (pem->persona & LINUX_READ_IMPLIES_EXEC) | if (pem->persona & LINUX_READ_IMPLIES_EXEC) | ||||
*prot |= PROT_EXEC; | *prot |= PROT_EXEC; | ||||
} | } | ||||
} | } | ||||
#endif | #endif | ||||
Not Done Inline ActionsIs the Linux behavior to always invalidate/free the pages in the DONTNEED range, or is that only optional? Can you return success there without doing anything at all? kib: Is the Linux behavior to always invalidate/free the pages in the DONTNEED range, or is that only optional… | |||||
Done Inline ActionsFrom theraven's testcase (see https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=230160), it seems the zeroing should be immediate. However, after some more testing, I've found that simply returning EINVAL breaks a whole lot of stuff. I think I'll add a sysctl to make it possible to enable the technically correct behaviour, but default to a sane one instead, i.e., aliasing it to MADV_DONTNEED, or perhaps MADV_FREE. trasz: From theraven's testcase (see https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=230160), it… | |||||
Not Done Inline ActionsThe thing that is not clear to me, assume that the range is backed by a shared file mapping that contains some dirty (not yet written back to the file) pages. What should happen after LINUX_MADV_DONTNEED ? kib: The thing that is not clear to me, assume that the range is backed by a shared file mapping… | |||||
Not Done Inline ActionsIndeed, Linux has non-traditional semantics for DONTNEED. Anonymous pages must be zeroed, low-level system software like jemalloc depends on it (at least with older kernels that don't implement MADV_FREE). MADV_FREE does not provide this semantic. It is not clear from the documentation what MADV_DONTNEED does for dirty file-backed pages (or named swap-backed pages). From reading the code and some hints on a few ML posts, I believe it always unmaps the pages but preserves dirty state, i.e., the page is marked dirty if PG_M was set. In other words, modifications to shared data are not discarded, and "repopulating the memory contents from the up-to-date contents of the underlying mapped file" may consist of simply mapping resident file pages. I think we need a new madvise verb to implement these semantics. illumos added it and called it MADV_PURGE. Maybe MADV_LINUX_DONTNEED would be better, given that it is not really intended for use by native code. markj: Indeed, Linux has non-traditional semantics for DONTNEED. Anonymous pages must be zeroed, low… | |||||
Not Done Inline ActionsSo it is unmap/fresh anon map for COW mappings (even if backed by a vnode down the shadow chain), and just unmap for shared vnode mappings ? It is very weird, I do not think there is a reason to expose that in native madvise(), better to implement that in linuxolator. kib: So it is unmap/fresh anon map for COW mappings (even if backed by a vnode down the shadow… | |||||
Not Done Inline ActionsI believe that is right, but I am not yet certain. The implementation would need to use vm_map_madvise() and vm_object_madvise(), wouldn't it? I do not see how it can be kept entirely separate without code duplication. We can avoid exposing the new madvise behaviour to native userspace, but this would make testing/fuzzing more challenging. I would propose simply leaving it undocumented instead. markj: I believe that is right, but I am not yet certain.
The implementation would need to use… | |||||
Not Done Inline ActionsNo, I do not think that vm_object_madvise() is useful there. If my interpretation is correct, we would just need to do pmap_remove() for a shared vnode mapping, and vm_map_delete()/vm_map_insert() for CoW. Of course this should be accompanied by clipping and other usual VM API dances, but I do not see why it cannot be contained in linux_mmap.c. kib: No, I do not think that vm_object_madvise() is useful there.
If my interpretation is correct… | |||||
Not Done Inline ActionsI tried it. Clipping cannot be done outside of vm_map.c (and it should probably stay that way). For CoW mappings, the situation is more complicated. See D25330. I think it is sufficient for jemalloc to work in most cases. markj: I tried it. Clipping cannot be done outside of vm_map.c (and it should probably stay that way). |
Instead of having a sysctl, let's just map it to FreeBSD MADV_DONTNEED for now. I will write an implementation providing Linux's semantics and commit it as a follow-up. I can't imagine a scenario where changing this sysctl is going to have a useful effect for a user.