Changeset View
Changeset View
Standalone View
Standalone View
head/sys/kern/kern_sendfile.c
Show All 28 Lines | |||||
#include <sys/cdefs.h> | #include <sys/cdefs.h> | ||||
__FBSDID("$FreeBSD$"); | __FBSDID("$FreeBSD$"); | ||||
#include <sys/param.h> | #include <sys/param.h> | ||||
#include <sys/systm.h> | #include <sys/systm.h> | ||||
#include <sys/capsicum.h> | #include <sys/capsicum.h> | ||||
#include <sys/kernel.h> | #include <sys/kernel.h> | ||||
#include <netinet/in.h> | |||||
#include <sys/lock.h> | #include <sys/lock.h> | ||||
#include <sys/mutex.h> | #include <sys/mutex.h> | ||||
#include <sys/sysproto.h> | #include <sys/sysproto.h> | ||||
#include <sys/malloc.h> | #include <sys/malloc.h> | ||||
#include <sys/proc.h> | #include <sys/proc.h> | ||||
#include <sys/mman.h> | #include <sys/mman.h> | ||||
#include <sys/mount.h> | #include <sys/mount.h> | ||||
#include <sys/mbuf.h> | #include <sys/mbuf.h> | ||||
Show All 12 Lines | |||||
#include <security/mac/mac_framework.h> | #include <security/mac/mac_framework.h> | ||||
#include <vm/vm.h> | #include <vm/vm.h> | ||||
#include <vm/vm_object.h> | #include <vm/vm_object.h> | ||||
#include <vm/vm_pager.h> | #include <vm/vm_pager.h> | ||||
/*
 * sendfile(2) names for the EXT_FLAG_VENDOR1..3 bits of m_ext.ext_flags,
 * as tested by sendfile_free_mext_pg() in this file:
 *
 * EXT_FLAG_SYNC	m_ext.ext_arg2 holds a struct sendfile_sync; when the
 *			mbuf is freed its count is decremented and its cv is
 *			signalled once the count reaches zero.
 * EXT_FLAG_NOCACHE	pages are passed to sendfile_free_page() with the
 *			nocache argument set.
 * EXT_FLAG_CACHE_LAST	the last page of an EXT_PGS mbuf is exempted from
 *			EXT_FLAG_NOCACHE.
 */
#define EXT_FLAG_SYNC EXT_FLAG_VENDOR1 | #define EXT_FLAG_SYNC EXT_FLAG_VENDOR1 | ||||
#define EXT_FLAG_NOCACHE EXT_FLAG_VENDOR2 | #define EXT_FLAG_NOCACHE EXT_FLAG_VENDOR2 | ||||
#define EXT_FLAG_CACHE_LAST EXT_FLAG_VENDOR3 | |||||
/* | /* | ||||
* Structure describing a single sendfile(2) I/O, which may consist of | * Structure describing a single sendfile(2) I/O, which may consist of | ||||
* several underlying pager I/Os. | * several underlying pager I/Os. | ||||
* | * | ||||
* The syscall context allocates the structure and initializes 'nios' | * The syscall context allocates the structure and initializes 'nios' | ||||
* to 1. As sendfile_swapin() runs through pages and starts asynchronous | * to 1. As sendfile_swapin() runs through pages and starts asynchronous | ||||
* paging operations, it increments 'nios'. | * paging operations, it increments 'nios'. | ||||
▲ Show 20 Lines • Show All 123 Lines • ▼ Show 20 Lines | if (m->m_ext.ext_flags & EXT_FLAG_SYNC) { | ||||
mtx_lock(&sfs->mtx); | mtx_lock(&sfs->mtx); | ||||
KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); | KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); | ||||
if (--sfs->count == 0) | if (--sfs->count == 0) | ||||
cv_signal(&sfs->cv); | cv_signal(&sfs->cv); | ||||
mtx_unlock(&sfs->mtx); | mtx_unlock(&sfs->mtx); | ||||
} | } | ||||
} | } | ||||
static void | |||||
sendfile_free_mext_pg(struct mbuf *m) | |||||
{ | |||||
struct mbuf_ext_pgs *ext_pgs; | |||||
vm_page_t pg; | |||||
int i; | |||||
bool nocache, cache_last; | |||||
KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_PGS, | |||||
("%s: m %p !M_EXT or !EXT_PGS", __func__, m)); | |||||
nocache = m->m_ext.ext_flags & EXT_FLAG_NOCACHE; | |||||
cache_last = m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST; | |||||
ext_pgs = m->m_ext.ext_pgs; | |||||
for (i = 0; i < ext_pgs->npgs; i++) { | |||||
if (cache_last && i == ext_pgs->npgs - 1) | |||||
nocache = false; | |||||
pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); | |||||
sendfile_free_page(pg, nocache); | |||||
} | |||||
if (m->m_ext.ext_flags & EXT_FLAG_SYNC) { | |||||
struct sendfile_sync *sfs = m->m_ext.ext_arg2; | |||||
mtx_lock(&sfs->mtx); | |||||
KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); | |||||
if (--sfs->count == 0) | |||||
cv_signal(&sfs->cv); | |||||
mtx_unlock(&sfs->mtx); | |||||
} | |||||
} | |||||
/* | /* | ||||
* Helper function to calculate how much data to put into page i of n. | * Helper function to calculate how much data to put into page i of n. | ||||
* Only first and last pages are special. | * Only first and last pages are special. | ||||
*/ | */ | ||||
static inline off_t | static inline off_t | ||||
xfsize(int i, int n, off_t off, off_t len) | xfsize(int i, int n, off_t off, off_t len) | ||||
{ | { | ||||
▲ Show 20 Lines • Show All 66 Lines • ▼ Show 20 Lines | sendfile_iodone(void *arg, vm_page_t *pg, int count, int error) | ||||
if (error) | if (error) | ||||
sfio->error = error; | sfio->error = error; | ||||
if (!refcount_release(&sfio->nios)) | if (!refcount_release(&sfio->nios)) | ||||
return; | return; | ||||
CURVNET_SET(so->so_vnet); | CURVNET_SET(so->so_vnet); | ||||
if (sfio->error) { | if (sfio->error) { | ||||
struct mbuf *m; | |||||
/* | /* | ||||
* I/O operation failed. The state of data in the socket | * I/O operation failed. The state of data in the socket | ||||
* is now inconsistent, and all that we can do is to tear | * is now inconsistent, and all that we can do is to tear | ||||
* it down. Protocol abort method would tear down protocol | * it down. Protocol abort method would tear down protocol | ||||
* state, free all ready mbufs and detach not ready ones. | * state, free all ready mbufs and detach not ready ones. | ||||
* We will free the mbufs corresponding to this I/O manually. | * We will free the mbufs corresponding to this I/O manually. | ||||
* | * | ||||
* The socket would be marked with EIO and made available | * The socket would be marked with EIO and made available | ||||
* for read, so that application receives EIO on next | * for read, so that application receives EIO on next | ||||
* syscall and eventually closes the socket. | * syscall and eventually closes the socket. | ||||
*/ | */ | ||||
so->so_proto->pr_usrreqs->pru_abort(so); | so->so_proto->pr_usrreqs->pru_abort(so); | ||||
so->so_error = EIO; | so->so_error = EIO; | ||||
m = sfio->m; | mb_free_notready(sfio->m, sfio->npages); | ||||
for (int i = 0; i < sfio->npages; i++) | |||||
m = m_free(m); | |||||
} else | } else | ||||
(void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m, | (void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m, | ||||
sfio->npages); | sfio->npages); | ||||
SOCK_LOCK(so); | SOCK_LOCK(so); | ||||
sorele(so); | sorele(so); | ||||
CURVNET_RESTORE(); | CURVNET_RESTORE(); | ||||
free(sfio, M_TEMP); | free(sfio, M_TEMP); | ||||
▲ Show 20 Lines • Show All 222 Lines • ▼ Show 20 Lines | |||||
vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, | vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, | ||||
struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, | struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, | ||||
struct thread *td) | struct thread *td) | ||||
{ | { | ||||
struct file *sock_fp; | struct file *sock_fp; | ||||
struct vnode *vp; | struct vnode *vp; | ||||
struct vm_object *obj; | struct vm_object *obj; | ||||
struct socket *so; | struct socket *so; | ||||
struct mbuf_ext_pgs *ext_pgs; | |||||
struct mbuf *m, *mh, *mhtail; | struct mbuf *m, *mh, *mhtail; | ||||
struct sf_buf *sf; | struct sf_buf *sf; | ||||
struct shmfd *shmfd; | struct shmfd *shmfd; | ||||
struct sendfile_sync *sfs; | struct sendfile_sync *sfs; | ||||
struct vattr va; | struct vattr va; | ||||
off_t off, sbytes, rem, obj_size; | off_t off, sbytes, rem, obj_size; | ||||
int error, softerr, bsize, hdrlen; | int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr; | ||||
bool use_ext_pgs; | |||||
obj = NULL; | obj = NULL; | ||||
so = NULL; | so = NULL; | ||||
m = mh = NULL; | m = mh = NULL; | ||||
sfs = NULL; | sfs = NULL; | ||||
hdrlen = sbytes = 0; | hdrlen = sbytes = 0; | ||||
softerr = 0; | softerr = 0; | ||||
use_ext_pgs = false; | |||||
error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); | error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); | ||||
if (error != 0) | if (error != 0) | ||||
return (error); | return (error); | ||||
error = sendfile_getsock(td, sockfd, &sock_fp, &so); | error = sendfile_getsock(td, sockfd, &sock_fp, &so); | ||||
if (error != 0) | if (error != 0) | ||||
goto out; | goto out; | ||||
▲ Show 20 Lines • Show All 144 Lines • ▼ Show 20 Lines | if (vp != NULL) { | ||||
rem = nbytes ? | rem = nbytes ? | ||||
omin(nbytes + offset, obj_size) : obj_size; | omin(nbytes + offset, obj_size) : obj_size; | ||||
rem -= off; | rem -= off; | ||||
} | } | ||||
} | } | ||||
if (space > rem) | if (space > rem) | ||||
space = rem; | space = rem; | ||||
else if (space > PAGE_SIZE) { | |||||
/* | |||||
* Use page boundaries when possible for large | |||||
* requests. | |||||
*/ | |||||
if (off & PAGE_MASK) | |||||
space -= (PAGE_SIZE - (off & PAGE_MASK)); | |||||
space = trunc_page(space); | |||||
if (off & PAGE_MASK) | |||||
space += (PAGE_SIZE - (off & PAGE_MASK)); | |||||
} | |||||
npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE); | npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE); | ||||
/* | /* | ||||
* Calculate maximum allowed number of pages for readahead | * Calculate maximum allowed number of pages for readahead | ||||
* at this iteration. If SF_USER_READAHEAD was set, we don't | * at this iteration. If SF_USER_READAHEAD was set, we don't | ||||
* do any heuristics and use exactly the value supplied by | * do any heuristics and use exactly the value supplied by | ||||
* application. Otherwise, we allow readahead up to "rem". | * application. Otherwise, we allow readahead up to "rem". | ||||
Show All 21 Lines | retry_space: | ||||
nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages, | nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages, | ||||
flags); | flags); | ||||
/* | /* | ||||
* Loop and construct maximum sized mbuf chain to be bulk | * Loop and construct maximum sized mbuf chain to be bulk | ||||
* dumped into socket buffer. | * dumped into socket buffer. | ||||
*/ | */ | ||||
pa = sfio->pa; | pa = sfio->pa; | ||||
/* | |||||
* Use unmapped mbufs if enabled for TCP. Unmapped | |||||
* bufs are restricted to TCP as that is what has been | |||||
* tested. In particular, unmapped mbufs have not | |||||
* been tested with UNIX-domain sockets. | |||||
*/ | |||||
if (mb_use_ext_pgs && | |||||
so->so_proto->pr_protocol == IPPROTO_TCP) { | |||||
use_ext_pgs = true; | |||||
max_pgs = MBUF_PEXT_MAX_PGS; | |||||
/* Start at last index, to wrap on first use. */ | |||||
ext_pgs_idx = max_pgs - 1; | |||||
} | |||||
for (int i = 0; i < npages; i++) { | for (int i = 0; i < npages; i++) { | ||||
struct mbuf *m0; | struct mbuf *m0; | ||||
/* | /* | ||||
* If a page wasn't grabbed successfully, then | * If a page wasn't grabbed successfully, then | ||||
* trim the array. Can happen only with SF_NODISKIO. | * trim the array. Can happen only with SF_NODISKIO. | ||||
*/ | */ | ||||
if (pa[i] == NULL) { | if (pa[i] == NULL) { | ||||
SFSTAT_INC(sf_busy); | SFSTAT_INC(sf_busy); | ||||
fixspace(npages, i, off, &space); | fixspace(npages, i, off, &space); | ||||
npages = i; | npages = i; | ||||
softerr = EBUSY; | softerr = EBUSY; | ||||
break; | break; | ||||
} | |||||
if (use_ext_pgs) { | |||||
off_t xfs; | |||||
ext_pgs_idx++; | |||||
if (ext_pgs_idx == max_pgs) { | |||||
m0 = mb_alloc_ext_pgs(M_WAITOK, false, | |||||
sendfile_free_mext_pg); | |||||
if (flags & SF_NOCACHE) { | |||||
m0->m_ext.ext_flags |= | |||||
EXT_FLAG_NOCACHE; | |||||
/* | |||||
* See comment below regarding | |||||
* ignoring SF_NOCACHE for the | |||||
* last page. | |||||
*/ | |||||
if ((npages - i <= max_pgs) && | |||||
((off + space) & PAGE_MASK) && | |||||
(rem > space || rhpages > 0)) | |||||
m0->m_ext.ext_flags |= | |||||
EXT_FLAG_CACHE_LAST; | |||||
} | |||||
if (sfs != NULL) { | |||||
m0->m_ext.ext_flags |= | |||||
EXT_FLAG_SYNC; | |||||
m0->m_ext.ext_arg2 = sfs; | |||||
mtx_lock(&sfs->mtx); | |||||
sfs->count++; | |||||
mtx_unlock(&sfs->mtx); | |||||
} | |||||
ext_pgs = m0->m_ext.ext_pgs; | |||||
if (i == 0) | |||||
sfio->m = m0; | |||||
ext_pgs_idx = 0; | |||||
/* Append to mbuf chain. */ | |||||
if (mtail != NULL) | |||||
mtail->m_next = m0; | |||||
else | |||||
m = m0; | |||||
mtail = m0; | |||||
ext_pgs->first_pg_off = | |||||
vmoff(i, off) & PAGE_MASK; | |||||
} | |||||
if (nios) { | |||||
mtail->m_flags |= M_NOTREADY; | |||||
ext_pgs->nrdy++; | |||||
} | |||||
ext_pgs->pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pa[i]); | |||||
ext_pgs->npgs++; | |||||
xfs = xfsize(i, npages, off, space); | |||||
ext_pgs->last_pg_len = xfs; | |||||
MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs); | |||||
mtail->m_len += xfs; | |||||
mtail->m_ext.ext_size += PAGE_SIZE; | |||||
continue; | |||||
} | } | ||||
/* | /* | ||||
* Get a sendfile buf. When allocating the | * Get a sendfile buf. When allocating the | ||||
* first buffer for mbuf chain, we usually | * first buffer for mbuf chain, we usually | ||||
* wait as long as necessary, but this wait | * wait as long as necessary, but this wait | ||||
can be interrupted. For subsequent | can be interrupted. For subsequent | ||||
* buffers, do not sleep, since several | * buffers, do not sleep, since several | ||||
▲ Show 20 Lines • Show All 279 Lines • Show Last 20 Lines |