Changeset View
Standalone View
sys/kern/kern_sendfile.c
Show First 20 Lines • Show All 46 Lines • ▼ Show 20 Lines | |||||
#include <sys/sf_buf.h> | #include <sys/sf_buf.h> | ||||
#include <sys/socket.h> | #include <sys/socket.h> | ||||
#include <sys/socketvar.h> | #include <sys/socketvar.h> | ||||
#include <sys/syscallsubr.h> | #include <sys/syscallsubr.h> | ||||
#include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
#include <sys/vnode.h> | #include <sys/vnode.h> | ||||
#include <net/vnet.h> | #include <net/vnet.h> | ||||
#include <netinet/in.h> | |||||
#include <netinet/in_pcb.h> | |||||
#include <security/audit/audit.h> | #include <security/audit/audit.h> | ||||
#include <security/mac/mac_framework.h> | #include <security/mac/mac_framework.h> | ||||
#include <vm/vm.h> | #include <vm/vm.h> | ||||
#include <vm/vm_object.h> | #include <vm/vm_object.h> | ||||
#include <vm/vm_pager.h> | #include <vm/vm_pager.h> | ||||
▲ Show 20 Lines • Show All 477 Lines • ▼ Show 20 Lines | vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, | ||||
struct vnode *vp; | struct vnode *vp; | ||||
struct vm_object *obj; | struct vm_object *obj; | ||||
struct socket *so; | struct socket *so; | ||||
struct mbuf *m, *mh, *mhtail; | struct mbuf *m, *mh, *mhtail; | ||||
struct sf_buf *sf; | struct sf_buf *sf; | ||||
struct shmfd *shmfd; | struct shmfd *shmfd; | ||||
struct sendfile_sync *sfs; | struct sendfile_sync *sfs; | ||||
struct vattr va; | struct vattr va; | ||||
struct inpcb *inp; | |||||
off_t off, sbytes, rem, obj_size; | off_t off, sbytes, rem, obj_size; | ||||
int error, softerr, bsize, hdrlen; | int domain, error, softerr, bsize, hdrlen; | ||||
obj = NULL; | obj = NULL; | ||||
so = NULL; | so = NULL; | ||||
m = mh = NULL; | m = mh = NULL; | ||||
sfs = NULL; | sfs = NULL; | ||||
hdrlen = sbytes = 0; | hdrlen = sbytes = 0; | ||||
softerr = 0; | softerr = 0; | ||||
▲ Show 20 Lines • Show All 179 Lines • ▼ Show 20 Lines | retry_space: | ||||
rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - | rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - | ||||
npages, rhpages); | npages, rhpages); | ||||
sfio = malloc(sizeof(struct sf_io) + | sfio = malloc(sizeof(struct sf_io) + | ||||
npages * sizeof(vm_page_t), M_TEMP, M_WAITOK); | npages * sizeof(vm_page_t), M_TEMP, M_WAITOK); | ||||
refcount_init(&sfio->nios, 1); | refcount_init(&sfio->nios, 1); | ||||
sfio->so = so; | sfio->so = so; | ||||
sfio->error = 0; | sfio->error = 0; | ||||
if ((flags & SF_NOCACHE) != 0 && | |||||
(inp = sotoinpcb(so)) != NULL && | |||||
(domain = inp->inp_numa_domain) != M_NODOM && | |||||
obj->domain.dr_policy != DOMAINSET_PREF(domain) && | |||||
obj->domain.dr_iter != domain) { | |||||
kib: Is there a way to determine that the object does not have the policy assigned? I mean, if…
markj (unsubmitted, inline): To check for an object policy, we can simply test whether obj->domain.dr_policy is non-NULL, like vm_domainset_iter_page_init() does. I agree that we should not override an existing policy. I do not believe it is necessary to initialize dr_iter for the DOMAINSET_PREF policy; it is only used for round-robin. Similarly, it is sufficient to test the dr_policy pointer. markj: To check for an object policy, we can simply test whether `obj->domain.dr_policy` is non-NULL… | |||||
VM_OBJECT_WLOCK(obj); | |||||
obj->domain.dr_policy = DOMAINSET_PREF(domain); | |||||
markjUnsubmitted Done Inline ActionsWe should re-check after acquiring the object lock. markj: We should re-check after acquiring the object lock. | |||||
obj->domain.dr_iter = domain; | |||||
VM_OBJECT_WUNLOCK(obj); | |||||
} | |||||
nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages, | nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages, | ||||
flags); | flags); | ||||
/* | /* | ||||
* Loop and construct maximum sized mbuf chain to be bulk | * Loop and construct maximum sized mbuf chain to be bulk | ||||
alc (inline, not done): It would appear that subsequent sendfile(SF_NOCACHE) calls on the same file will use the policy from the first call even if the current socket's pcb resides within a different domain. alc: It would appear that subsequent sendfile(SF_NOCACHE) calls on the same file will use the policy… | |||||
Done Inline ActionsThat's what I was worried about (and why I need to re-test). I am not familiar with the vm_object lifecycle, and was concerned objects may linger, and create this sort of behavior. gallatin: That's what I was worried about (and why I need to re-test). I am not familiar with the… | |||||
Done Inline ActionsIndeed, this is a problem. It causes at least a 50% increase in cross-domain NUMA traffic in my test setup (13% -> 20%). Is there a solution I'm not thinking of? Kib has a point about different active senders of the same file "ping-ponging" if I revert part of this, and change the check from policy == NULL back to policy != DOMAINSET_PREF(domain). For our (Netflix) workload, reverting back to forcing a change if the policy prefers a domain other than the current one is the obvious choice. I think it may also be the obvious choice in general, as I'm not sure how the "ping-ponging" situation is any worse than round-robin allocations, which is roughly what it degrades to. gallatin: Indeed, this is a problem. It causes at least a 50% increase in cross-domain NUMA traffic in… | |||||
* dumped into socket buffer. | * dumped into socket buffer. | ||||
*/ | */ | ||||
pa = sfio->pa; | pa = sfio->pa; | ||||
for (int i = 0; i < npages; i++) { | for (int i = 0; i < npages; i++) { | ||||
struct mbuf *m0; | struct mbuf *m0; | ||||
/* | /* | ||||
* If a page wasn't grabbed successfully, then | * If a page wasn't grabbed successfully, then | ||||
▲ Show 20 Lines • Show All 295 Lines • Show Last 20 Lines |
Is there a way to determine that the object does not have the policy assigned? I mean, if the object got the policy assigned in some way — perhaps we will grow the ability for users to apply a policy to files/mount points — then it should not be overwritten by this code.
BTW, suppose we have two domains, two network cards with affinity to the corresponding domains, and two incoming connections arriving via distinct input paths, and the same file is served to both. I do not think it is reasonable to ping-pong the object's domain setting.
Also, please add a comment explaining the reasoning. I initially intended to say that I want a sysctl there to enable this behavior, but later decided that there is no point, assuming the policy does not override a more prioritized setting.