Changeset View
Standalone View
sys/kern/kern_sendfile.c
Show First 20 Lines • Show All 46 Lines • ▼ Show 20 Lines | |||||
#include <sys/sf_buf.h> | #include <sys/sf_buf.h> | ||||
#include <sys/socket.h> | #include <sys/socket.h> | ||||
#include <sys/socketvar.h> | #include <sys/socketvar.h> | ||||
#include <sys/syscallsubr.h> | #include <sys/syscallsubr.h> | ||||
#include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
#include <sys/vnode.h> | #include <sys/vnode.h> | ||||
#include <net/vnet.h> | #include <net/vnet.h> | ||||
#include <netinet/in.h> | |||||
#include <netinet/in_pcb.h> | |||||
#include <security/audit/audit.h> | #include <security/audit/audit.h> | ||||
#include <security/mac/mac_framework.h> | #include <security/mac/mac_framework.h> | ||||
#include <vm/vm.h> | #include <vm/vm.h> | ||||
#include <vm/vm_object.h> | #include <vm/vm_object.h> | ||||
#include <vm/vm_pager.h> | #include <vm/vm_pager.h> | ||||
▲ Show 20 Lines • Show All 477 Lines • ▼ Show 20 Lines | vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, | ||||
struct vnode *vp; | struct vnode *vp; | ||||
struct vm_object *obj; | struct vm_object *obj; | ||||
struct socket *so; | struct socket *so; | ||||
struct mbuf *m, *mh, *mhtail; | struct mbuf *m, *mh, *mhtail; | ||||
struct sf_buf *sf; | struct sf_buf *sf; | ||||
struct shmfd *shmfd; | struct shmfd *shmfd; | ||||
struct sendfile_sync *sfs; | struct sendfile_sync *sfs; | ||||
struct vattr va; | struct vattr va; | ||||
struct inpcb *inp; | |||||
off_t off, sbytes, rem, obj_size; | off_t off, sbytes, rem, obj_size; | ||||
int error, softerr, bsize, hdrlen; | int error, softerr, bsize, hdrlen; | ||||
obj = NULL; | obj = NULL; | ||||
so = NULL; | so = NULL; | ||||
m = mh = NULL; | m = mh = NULL; | ||||
sfs = NULL; | sfs = NULL; | ||||
hdrlen = sbytes = 0; | hdrlen = sbytes = 0; | ||||
▲ Show 20 Lines • Show All 181 Lines • ▼ Show 20 Lines | retry_space: | ||||
rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - | rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - | ||||
npages, rhpages); | npages, rhpages); | ||||
sfio = malloc(sizeof(struct sf_io) + | sfio = malloc(sizeof(struct sf_io) + | ||||
npages * sizeof(vm_page_t), M_TEMP, M_WAITOK); | npages * sizeof(vm_page_t), M_TEMP, M_WAITOK); | ||||
refcount_init(&sfio->nios, 1); | refcount_init(&sfio->nios, 1); | ||||
sfio->so = so; | sfio->so = so; | ||||
sfio->error = 0; | sfio->error = 0; | ||||
/* | |||||
* If the network connection is associated with a NUMA | |||||
* domain, and if the user has requested the data be | |||||
* quickly freed, then allocate backing pages from the | |||||
* domain which is local to the network connection. | |||||
kib: Is there a way to determine that the object does not have the policy assigned ? I mean, if… | |||||
Done Inline Actions: To check for an object policy, we can simply test whether obj->domain.dr_policy is non-NULL, like vm_domainset_iter_page_init() does. I agree that we should not override an existing policy. I do not believe it is necessary to initialize dr_iter for the DOMAINSET_PREF policy; it's only used for round-robin. Similarly it's sufficient to test the dr_policy pointer. markj: To check for an object policy, we can simply test whether `obj->domain.dr_policy` is non-NULL… | |||||
*/ | |||||
if ((flags & SF_NOCACHE) != 0 && | |||||
Done Inline Actions: We should re-check after acquiring the object lock. markj: We should re-check after acquiring the object lock. | |||||
(inp = sotoinpcb(so)) != NULL && | |||||
inp->inp_numa_domain != M_NODOM && | |||||
obj->domain.dr_policy == NULL) { | |||||
VM_OBJECT_WLOCK(obj); | |||||
if (obj->domain.dr_policy == NULL) | |||||
obj->domain.dr_policy = | |||||
DOMAINSET_PREF(inp->inp_numa_domain); | |||||
VM_OBJECT_WUNLOCK(obj); | |||||
} | |||||
alc (Unsubmitted, Not Done) Inline Actions: It would appear that subsequent sendfile(SF_NOCACHE) calls on the same file will use the policy from the first call even if the current socket's pcb resides within a different domain. alc: It would appear that subsequent sendfile(SF_NOCACHE) calls on the same file will use the policy… | |||||
gallatin (Author, Unsubmitted, Done) Inline Actions: That's what I was worried about (and why I need to re-test). I am not familiar with the vm_object lifecycle, and was concerned objects may linger, and create this sort of behavior. gallatin: That's what I was worried about (and why I need to re-test). I am not familiar with the… | |||||
gallatin (Author, Unsubmitted, Done) Inline Actions: Indeed, this is a problem. It causes at least a 50% increase in cross-domain NUMA traffic in my test setup (13% -> 20%). Is there a solution I'm not thinking of? Kib has a point about different active senders of the same file "ping-ponging" if I revert part of this, and change the check from policy == NULL back to policy != DOMAINSET_PREF(domain). For our (Netflix) workload, reverting back to forcing a change if the policy prefers a domain other than the current one is the obvious choice. I think it may also be the obvious choice in general, as I'm not sure how the "ping-ponging" situation is any worse than round-robin allocations, which is roughly what it degrades to. gallatin: Indeed, this is a problem. It causes at least a 50% increase in cross-domain NUMA traffic in… | |||||
nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages, | nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages, | ||||
flags); | flags); | ||||
/* | /* | ||||
* Loop and construct maximum sized mbuf chain to be bulk | * Loop and construct maximum sized mbuf chain to be bulk | ||||
* dumped into socket buffer. | * dumped into socket buffer. | ||||
*/ | */ | ||||
▲ Show 20 Lines • Show All 301 Lines • Show Last 20 Lines |
Is there a way to determine that the object does not already have a policy assigned? I mean, if the object got a policy assigned in some other way (perhaps we will grow the ability for the user to apply a policy to files or mount points), then it should not be overwritten here.
BTW, suppose we have two domains, two network cards with affinity to the corresponding domains, and two incoming connections arriving via distinct input paths, and the same file is served to both. I do not think it is reasonable to ping-pong the object's domain setting.
Also, please add a comment explaining the reasoning. I initially intended to say that I want a sysctl there to enable this behavior, but later decided that there is no point, assuming the policy does not override a higher-priority setting.