diff --git a/share/man/man9/VOP_ATTRIB.9 b/share/man/man9/VOP_ATTRIB.9 index 45f1e2f1652a..ddfee1f5d1f9 100644 --- a/share/man/man9/VOP_ATTRIB.9 +++ b/share/man/man9/VOP_ATTRIB.9 @@ -1,139 +1,137 @@ .\" -*- nroff -*- .\" .\" Copyright (c) 1996 Doug Rabson .\" .\" All rights reserved. .\" .\" This program is free software. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR .\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES .\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. .\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT, .\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT .\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, .\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY .\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd August 8, 2020 +.Dd October 2, 2021 .Dt VOP_ATTRIB 9 .Os .Sh NAME .Nm VOP_GETATTR , .Nm VOP_SETATTR .Nd get and set attributes on a file or directory .Sh SYNOPSIS .In sys/param.h .In sys/vnode.h .Ft int .Fn VOP_GETATTR "struct vnode *vp" "struct vattr *vap" "struct ucred *cred" .Ft int .Fn VOP_SETATTR "struct vnode *vp" "struct vattr *vap" "struct ucred *cred" .Ft int .Fn VOP_STAT "struct vnode *vp" "struct stat *sb" "struct ucred *active_cred" \ -"struct ucred *file_cred" "struct thread *td" +"struct ucred *file_cred" .Sh DESCRIPTION These entry points manipulate various attributes of a file or directory, including file permissions, owner, group, size, access time and modification time. .Pp .Fn VOP_STAT returns data in a format suitable for the .Xr stat 2 system call and by default is implemented as a wrapper around .Fn VOP_GETATTR . Filesystems may want to implement their own variant for performance reasons. .Pp For .Fn VOP_GETATTR and .Fn VOP_SETATTR the arguments are: .Bl -tag -width cred .It Fa vp The vnode of the file. .It Fa vap The attributes of the file. .It Fa cred The user credentials of the calling thread. .El .Pp For .Fn VOP_STAT the arguments are: .Bl -tag -width active_cred .It Fa vp The vnode of the file. .It Fa sb The attributes of the file. .It Fa active_cred The user credentials of the calling thread. .It Fa file_cred The credentials installed on the file description pointing to the vnode or NOCRED. -.It Fa td -The calling thread. .El .Pp Attributes which are not being modified by .Fn VOP_SETATTR should be set to the value .Dv VNOVAL ; .Fn VATTR_NULL may be used to clear all the values, and should generally be used to reset the contents of .Fa *vap prior to setting specific values. .Sh LOCKS Both .Fn VOP_GETATTR and .Fn VOP_STAT expect the vnode to be locked on entry and will leave the vnode locked on return. The lock type can be either shared or exclusive. .Pp .Fn VOP_SETATTR expects the vnode to be locked on entry and will leave the vnode locked on return. The lock type must be exclusive. .Sh RETURN VALUES .Fn VOP_GETATTR returns 0 if it was able to retrieve the attribute data via .Fa *vap , otherwise an appropriate error is returned. .Fn VOP_SETATTR returns zero if the attributes were changed successfully, otherwise an appropriate error is returned. .Fn VOP_STAT returns 0 if it was able to retrieve the attribute data .Fa *sb , otherwise an appropriate error is returned. .Sh ERRORS .Bl -tag -width Er .It Bq Er EPERM The file is immutable. .It Bq Er EACCES The caller does not have permission to modify the file or directory attributes. .It Bq Er EROFS The file system is read-only. .El .Sh SEE ALSO .Xr VFS 9 , .Xr vnode 9 , .Xr VOP_ACCESS 9 .Sh AUTHORS This manual page was written by .An Doug Rabson . diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c index 331732eb4234..2539b406618f 100644 --- a/sys/compat/linux/linux_event.c +++ b/sys/compat/linux/linux_event.c @@ -1,980 +1,979 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Roman Divacky * Copyright (c) 2014 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include typedef uint64_t epoll_udata_t; struct epoll_event { uint32_t events; epoll_udata_t data; } #if defined(__amd64__) __attribute__((packed)) #endif ; #define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) static int epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event, struct kevent *kevent, int *nkevents); static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); static int epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter, unsigned int flags); static int epoll_fd_registered(struct thread *td, struct file *epfp, int fd); static int epoll_delete_all_events(struct thread *td, struct file *epfp, int fd); struct epoll_copyin_args { struct kevent *changelist; }; struct epoll_copyout_args { struct epoll_event *leventlist; struct proc *p; uint32_t count; int error; }; /* timerfd */ typedef uint64_t timerfd_t; static fo_rdwr_t timerfd_read; static fo_ioctl_t timerfd_ioctl; static fo_poll_t timerfd_poll; static fo_kqfilter_t timerfd_kqfilter; static fo_stat_t timerfd_stat; static fo_close_t timerfd_close; static fo_fill_kinfo_t timerfd_fill_kinfo; static struct fileops timerfdops = { .fo_read = timerfd_read, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = timerfd_ioctl, .fo_poll = timerfd_poll, .fo_kqfilter = timerfd_kqfilter, .fo_stat = timerfd_stat, .fo_close = timerfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = timerfd_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_timerfddetach(struct knote *kn); static int filt_timerfdread(struct knote *kn, long hint); static struct filterops timerfd_rfiltops = { .f_isfd = 1, .f_detach = filt_timerfddetach, .f_event = filt_timerfdread }; struct timerfd { clockid_t tfd_clockid; struct itimerspec tfd_time; struct callout tfd_callout; timerfd_t tfd_count; bool tfd_canceled; struct selinfo tfd_sel; struct mtx tfd_lock; }; static void linux_timerfd_expire(void *); static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); static int epoll_create_common(struct thread *td, int flags) { return (kern_kqueue(td, flags, NULL)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) { /* * args->size is unused. Linux just tests it * and then forgets it as well. */ if (args->size <= 0) return (EINVAL); return (epoll_create_common(td, 0)); } #endif int linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) { int flags; if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) return (EINVAL); flags = 0; if ((args->flags & LINUX_O_CLOEXEC) != 0) flags |= O_CLOEXEC; return (epoll_create_common(td, flags)); } /* Structure converting function from epoll to kevent. */ static int epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event, struct kevent *kevent, int *nkevents) { uint32_t levents = l_event->events; struct linux_pemuldata *pem; struct proc *p; unsigned short kev_flags = EV_ADD | EV_ENABLE; /* flags related to how event is registered */ if ((levents & LINUX_EPOLLONESHOT) != 0) kev_flags |= EV_DISPATCH; if ((levents & LINUX_EPOLLET) != 0) kev_flags |= EV_CLEAR; if ((levents & LINUX_EPOLLERR) != 0) kev_flags |= EV_ERROR; if ((levents & LINUX_EPOLLRDHUP) != 0) kev_flags |= EV_EOF; /* flags related to what event is registered */ if ((levents & LINUX_EPOLL_EVRD) != 0) { EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0); kevent->ext[0] = l_event->data; ++kevent; ++(*nkevents); } if ((levents & LINUX_EPOLL_EVWR) != 0) { EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0); kevent->ext[0] = l_event->data; ++kevent; ++(*nkevents); } /* zero event mask is legal */ if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) { EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0); ++(*nkevents); } if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { p = td->td_proc; pem = pem_find(p); KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); LINUX_PEM_XLOCK(pem); if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) { pem->flags |= LINUX_XUNSUP_EPOLL; LINUX_PEM_XUNLOCK(pem); linux_msg(td, "epoll_ctl unsupported flags: 0x%x", levents); } else LINUX_PEM_XUNLOCK(pem); return (EINVAL); } return (0); } /* * Structure converting function from kevent to epoll. In a case * this is called on error in registration we store the error in * event->data and pick it up later in linux_epoll_ctl(). */ static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) { l_event->data = kevent->ext[0]; if ((kevent->flags & EV_ERROR) != 0) { l_event->events = LINUX_EPOLLERR; return; } /* XXX EPOLLPRI, EPOLLHUP */ switch (kevent->filter) { case EVFILT_READ: l_event->events = LINUX_EPOLLIN; if ((kevent->flags & EV_EOF) != 0) l_event->events |= LINUX_EPOLLRDHUP; break; case EVFILT_WRITE: l_event->events = LINUX_EPOLLOUT; break; } } /* * Copyout callback used by kevent. This converts kevent * events to epoll events and copies them back to the * userspace. This is also called on error on registering * of the filter. */ static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count) { struct epoll_copyout_args *args; struct epoll_event *eep; int error, i; args = (struct epoll_copyout_args*) arg; eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); for (i = 0; i < count; i++) kevent_to_epoll(&kevp[i], &eep[i]); error = copyout(eep, args->leventlist, count * sizeof(*eep)); if (error == 0) { args->leventlist += count; args->count += count; } else if (args->error == 0) args->error = error; free(eep, M_EPOLL); return (error); } /* * Copyin callback used by kevent. This copies already * converted filters from kernel memory to the kevent * internal kernel memory. Hence the memcpy instead of * copyin. */ static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count) { struct epoll_copyin_args *args; args = (struct epoll_copyin_args*) arg; memcpy(kevp, args->changelist, count * sizeof(*kevp)); args->changelist += count; return (0); } /* * Load epoll filter, convert it to kevent filter * and load it into kevent subsystem. */ int linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) { struct file *epfp, *fp; struct epoll_copyin_args ciargs; struct kevent kev[2]; struct kevent_copyops k_ops = { &ciargs, NULL, epoll_kev_copyin}; struct epoll_event le; cap_rights_t rights; int nchanges = 0; int error; if (args->op != LINUX_EPOLL_CTL_DEL) { error = copyin(args->event, &le, sizeof(le)); if (error != 0) return (error); } error = fget(td, args->epfd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp); if (error != 0) return (error); if (epfp->f_type != DTYPE_KQUEUE) { error = EINVAL; goto leave1; } /* Protect user data vector from incorrectly supplied fd. */ error = fget(td, args->fd, cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp); if (error != 0) goto leave1; /* Linux disallows spying on himself */ if (epfp == fp) { error = EINVAL; goto leave0; } ciargs.changelist = kev; if (args->op != LINUX_EPOLL_CTL_DEL) { error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges); if (error != 0) goto leave0; } switch (args->op) { case LINUX_EPOLL_CTL_MOD: error = epoll_delete_all_events(td, epfp, args->fd); if (error != 0) goto leave0; break; case LINUX_EPOLL_CTL_ADD: if (epoll_fd_registered(td, epfp, args->fd)) { error = EEXIST; goto leave0; } break; case LINUX_EPOLL_CTL_DEL: /* CTL_DEL means unregister this fd with this epoll */ error = epoll_delete_all_events(td, epfp, args->fd); goto leave0; default: error = EINVAL; goto leave0; } error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); leave0: fdrop(fp, td); leave1: fdrop(epfp, td); return (error); } /* * Wait for a filter to be triggered on the epoll file descriptor. */ static int linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, int maxevents, int timeout, sigset_t *uset) { struct epoll_copyout_args coargs; struct kevent_copyops k_ops = { &coargs, epoll_kev_copyout, NULL}; struct timespec ts, *tsp; cap_rights_t rights; struct file *epfp; sigset_t omask; int error; if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) return (EINVAL); error = fget(td, epfd, cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp); if (error != 0) return (error); if (epfp->f_type != DTYPE_KQUEUE) { error = EINVAL; goto leave; } if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &omask, 0); if (error != 0) goto leave; td->td_pflags |= TDP_OLDMASK; /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. */ thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } coargs.leventlist = events; coargs.p = td->td_proc; coargs.count = 0; coargs.error = 0; /* * Linux epoll_wait(2) man page states that timeout of -1 causes caller * to block indefinitely. Real implementation does it if any negative * timeout value is passed. */ if (timeout >= 0) { /* Convert from milliseconds to timespec. */ ts.tv_sec = timeout / 1000; ts.tv_nsec = (timeout % 1000) * 1000000; tsp = &ts; } else { tsp = NULL; } error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); if (error == 0 && coargs.error != 0) error = coargs.error; /* * kern_kevent might return ENOMEM which is not expected from epoll_wait. * Maybe we should translate that but I don't think it matters at all. */ if (error == 0) td->td_retval[0] = coargs.count; if (uset != NULL) error = kern_sigprocmask(td, SIG_SETMASK, &omask, NULL, 0); leave: fdrop(epfp, td); return (error); } #ifdef LINUX_LEGACY_SYSCALLS int linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) { return (linux_epoll_wait_common(td, args->epfd, args->events, args->maxevents, args->timeout, NULL)); } #endif int linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) { sigset_t mask, *pmask; l_sigset_t lmask; int error; if (args->mask != NULL) { if (args->sigsetsize != sizeof(l_sigset_t)) return (EINVAL); error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); if (error != 0) return (error); linux_to_bsd_sigset(&lmask, &mask); pmask = &mask; } else pmask = NULL; return (linux_epoll_wait_common(td, args->epfd, args->events, args->maxevents, args->timeout, pmask)); } static int epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter, unsigned int flags) { struct epoll_copyin_args ciargs; struct kevent kev; struct kevent_copyops k_ops = { &ciargs, NULL, epoll_kev_copyin}; ciargs.changelist = &kev; EV_SET(&kev, fd, filter, flags, 0, 0, 0); return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL)); } static int epoll_fd_registered(struct thread *td, struct file *epfp, int fd) { /* * Set empty filter flags to avoid accidental modification of already * registered events. In the case of event re-registration: * 1. If event does not exists kevent() does nothing and returns ENOENT * 2. If event does exists, it's enabled/disabled state is preserved * but fflags, data and udata fields are overwritten. So we can not * set socket lowats and store user's context pointer in udata. */ if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT || epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT) return (1); return (0); } static int epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) { int error1, error2; error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE); error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE); /* return 0 if at least one result positive */ return (error1 == 0 ? 0 : error2); } #ifdef LINUX_LEGACY_SYSCALLS int linux_eventfd(struct thread *td, struct linux_eventfd_args *args) { struct specialfd_eventfd ae; bzero(&ae, sizeof(ae)); ae.initval = args->initval; return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae)); } #endif int linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) { struct specialfd_eventfd ae; int flags; if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK | LINUX_EFD_SEMAPHORE)) != 0) return (EINVAL); flags = 0; if ((args->flags & LINUX_O_CLOEXEC) != 0) flags |= EFD_CLOEXEC; if ((args->flags & LINUX_O_NONBLOCK) != 0) flags |= EFD_NONBLOCK; if ((args->flags & LINUX_EFD_SEMAPHORE) != 0) flags |= EFD_SEMAPHORE; bzero(&ae, sizeof(ae)); ae.flags = flags; ae.initval = args->initval; return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae)); } int linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) { struct filedesc *fdp; struct timerfd *tfd; struct file *fp; clockid_t clockid; int fflags, fd, error; if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) return (EINVAL); error = linux_to_native_clockid(&clockid, args->clockid); if (error != 0) return (error); if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) return (EINVAL); fflags = 0; if ((args->flags & LINUX_TFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd, fflags); if (error != 0) return (error); tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); tfd->tfd_clockid = clockid; mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); fflags = FREAD; if ((args->flags & LINUX_O_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops); fdrop(fp, td); td->td_retval[0] = fd; return (error); } static int timerfd_close(struct file *fp, struct thread *td) { struct timerfd *tfd; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); timespecclear(&tfd->tfd_time.it_value); timespecclear(&tfd->tfd_time.it_interval); callout_drain(&tfd->tfd_callout); seldrain(&tfd->tfd_sel); knlist_destroy(&tfd->tfd_sel.si_note); fp->f_ops = &badfileops; mtx_destroy(&tfd->tfd_lock); free(tfd, M_EPOLL); return (0); } static int timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct timerfd *tfd; timerfd_t count; int error; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); if (uio->uio_resid < sizeof(timerfd_t)) return (EINVAL); error = 0; mtx_lock(&tfd->tfd_lock); retry: if (tfd->tfd_canceled) { tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); return (ECANCELED); } if (tfd->tfd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&tfd->tfd_lock); return (EAGAIN); } error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0); if (error == 0) goto retry; } if (error == 0) { count = tfd->tfd_count; tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); error = uiomove(&count, sizeof(timerfd_t), uio); } else mtx_unlock(&tfd->tfd_lock); return (error); } static int timerfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct timerfd *tfd; int revents = 0; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (POLLERR); mtx_lock(&tfd->tfd_lock); if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0) revents |= events & (POLLIN|POLLRDNORM); if (revents == 0) selrecord(td, &tfd->tfd_sel); mtx_unlock(&tfd->tfd_lock); return (revents); } static int timerfd_kqfilter(struct file *fp, struct knote *kn) { struct timerfd *tfd; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); if (kn->kn_filter == EVFILT_READ) kn->kn_fop = &timerfd_rfiltops; else return (EINVAL); kn->kn_hook = tfd; knlist_add(&tfd->tfd_sel.si_note, kn, 0); return (0); } static void filt_timerfddetach(struct knote *kn) { struct timerfd *tfd = kn->kn_hook; mtx_lock(&tfd->tfd_lock); knlist_remove(&tfd->tfd_sel.si_note, kn, 1); mtx_unlock(&tfd->tfd_lock); } static int filt_timerfdread(struct knote *kn, long hint) { struct timerfd *tfd = kn->kn_hook; return (tfd->tfd_count > 0); } static int timerfd_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD) return (EINVAL); switch (cmd) { case FIONBIO: case FIOASYNC: return (0); } return (ENOTTY); } static int -timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, - struct thread *td) +timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { return (ENXIO); } static int timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_UNKNOWN; return (0); } static void linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts) { if (tfd->tfd_clockid == CLOCK_REALTIME) getnanotime(ts); else /* CLOCK_MONOTONIC */ getnanouptime(ts); } static void linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots) { struct timespec cts; linux_timerfd_clocktime(tfd, &cts); *ots = tfd->tfd_time; if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) { timespecsub(&ots->it_value, &cts, &ots->it_value); if (ots->it_value.tv_sec < 0 || (ots->it_value.tv_sec == 0 && ots->it_value.tv_nsec == 0)) { ots->it_value.tv_sec = 0; ots->it_value.tv_nsec = 1; } } } int linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args) { struct l_itimerspec lots; struct itimerspec ots; struct timerfd *tfd; struct file *fp; int error; error = fget(td, args->fd, &cap_read_rights, &fp); if (error != 0) return (error); tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { error = EINVAL; goto out; } mtx_lock(&tfd->tfd_lock); linux_timerfd_curval(tfd, &ots); mtx_unlock(&tfd->tfd_lock); error = native_to_linux_itimerspec(&lots, &ots); if (error == 0) error = copyout(&lots, args->old_value, sizeof(lots)); out: fdrop(fp, td); return (error); } int linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args) { struct l_itimerspec lots; struct itimerspec nts, ots; struct timespec cts, ts; struct timerfd *tfd; struct timeval tv; struct file *fp; int error; if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0) return (EINVAL); error = copyin(args->new_value, &lots, sizeof(lots)); if (error != 0) return (error); error = linux_to_native_itimerspec(&nts, &lots); if (error != 0) return (error); error = fget(td, args->fd, &cap_write_rights, &fp); if (error != 0) return (error); tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { error = EINVAL; goto out; } mtx_lock(&tfd->tfd_lock); if (!timespecisset(&nts.it_value)) timespecclear(&nts.it_interval); if (args->old_value != NULL) linux_timerfd_curval(tfd, &ots); tfd->tfd_time = nts; tfd->tfd_count = 0; if (timespecisset(&nts.it_value)) { linux_timerfd_clocktime(tfd, &cts); ts = nts.it_value; if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) { timespecadd(&tfd->tfd_time.it_value, &cts, &tfd->tfd_time.it_value); } else { timespecsub(&ts, &cts, &ts); } TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); tfd->tfd_canceled = false; } else { tfd->tfd_canceled = true; callout_stop(&tfd->tfd_callout); } mtx_unlock(&tfd->tfd_lock); if (args->old_value != NULL) { error = native_to_linux_itimerspec(&lots, &ots); if (error == 0) error = copyout(&lots, args->old_value, sizeof(lots)); } out: fdrop(fp, td); return (error); } static void linux_timerfd_expire(void *arg) { struct timespec cts, ts; struct timeval tv; struct timerfd *tfd; tfd = (struct timerfd *)arg; linux_timerfd_clocktime(tfd, &cts); if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) { if (timespecisset(&tfd->tfd_time.it_interval)) timespecadd(&tfd->tfd_time.it_value, &tfd->tfd_time.it_interval, &tfd->tfd_time.it_value); else /* single shot timer */ timespecclear(&tfd->tfd_time.it_value); if (timespecisset(&tfd->tfd_time.it_value)) { timespecsub(&tfd->tfd_time.it_value, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); } tfd->tfd_count++; KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); selwakeup(&tfd->tfd_sel); wakeup(&tfd->tfd_count); } else if (timespecisset(&tfd->tfd_time.it_value)) { timespecsub(&tfd->tfd_time.it_value, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); } } diff --git a/sys/compat/linuxkpi/common/src/linux_compat.c b/sys/compat/linuxkpi/common/src/linux_compat.c index 279f7131fc57..7a062ca36eba 100644 --- a/sys/compat/linuxkpi/common/src/linux_compat.c +++ b/sys/compat/linuxkpi/common/src/linux_compat.c @@ -1,2654 +1,2653 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013-2021 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #endif SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "LinuxKPI parameters"); int linuxkpi_debug; SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN, &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable."); static struct timeval lkpi_net_lastlog; static int lkpi_net_curpps; static int lkpi_net_maxpps = 99; SYSCTL_INT(_compat_linuxkpi, OID_AUTO, net_ratelimit, CTLFLAG_RWTUN, &lkpi_net_maxpps, 0, "Limit number of LinuxKPI net messages per second."); MALLOC_DEFINE(M_KMALLOC, "lkpikmalloc", "Linux kmalloc compat"); #include /* Undo Linux compat changes. */ #undef RB_ROOT #undef file #undef cdev #define RB_ROOT(head) (head)->rbh_root static void linux_destroy_dev(struct linux_cdev *); static void linux_cdev_deref(struct linux_cdev *ldev); static struct vm_area_struct *linux_cdev_handle_find(void *handle); cpumask_t cpu_online_mask; struct kobject linux_class_root; struct device linux_root_device; struct class linux_class_misc; struct list_head pci_drivers; struct list_head pci_devices; spinlock_t pci_lock; unsigned long linux_timer_hz_mask; wait_queue_head_t linux_bit_waitq; wait_queue_head_t linux_var_waitq; int panic_cmp(struct rb_node *one, struct rb_node *two) { panic("no cmp"); } RB_GENERATE(linux_root, rb_node, __entry, panic_cmp); int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args) { va_list tmp_va; int len; char *old; char *name; char dummy; old = kobj->name; if (old && fmt == NULL) return (0); /* compute length of string */ va_copy(tmp_va, args); len = vsnprintf(&dummy, 0, fmt, tmp_va); va_end(tmp_va); /* account for zero termination */ len++; /* check for error */ if (len < 1) return (-EINVAL); /* allocate memory for string */ name = kzalloc(len, GFP_KERNEL); if (name == NULL) return (-ENOMEM); vsnprintf(name, len, fmt, args); kobj->name = name; /* free old string */ kfree(old); /* filter new string */ for (; *name != '\0'; name++) if (*name == '/') *name = '!'; return (0); } int kobject_set_name(struct kobject *kobj, const char *fmt, ...) { va_list args; int error; va_start(args, fmt); error = kobject_set_name_vargs(kobj, fmt, args); va_end(args); return (error); } static int kobject_add_complete(struct kobject *kobj, struct kobject *parent) { const struct kobj_type *t; int error; kobj->parent = parent; error = sysfs_create_dir(kobj); if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) { struct attribute **attr; t = kobj->ktype; for (attr = t->default_attrs; *attr != NULL; attr++) { error = sysfs_create_file(kobj, *attr); if (error) break; } if (error) sysfs_remove_dir(kobj); } return (error); } int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...) { va_list args; int error; va_start(args, fmt); error = kobject_set_name_vargs(kobj, fmt, args); va_end(args); if (error) return (error); return kobject_add_complete(kobj, parent); } void linux_kobject_release(struct kref *kref) { struct kobject *kobj; char *name; kobj = container_of(kref, struct kobject, kref); sysfs_remove_dir(kobj); name = kobj->name; if (kobj->ktype && kobj->ktype->release) kobj->ktype->release(kobj); kfree(name); } static void linux_kobject_kfree(struct kobject *kobj) { kfree(kobj); } static void linux_kobject_kfree_name(struct kobject *kobj) { if (kobj) { kfree(kobj->name); } } const struct kobj_type linux_kfree_type = { .release = linux_kobject_kfree }; static void linux_device_release(struct device *dev) { pr_debug("linux_device_release: %s\n", dev_name(dev)); kfree(dev); } static ssize_t linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct class_attribute *dattr; ssize_t error; dattr = container_of(attr, struct class_attribute, attr); error = -EIO; if (dattr->show) error = dattr->show(container_of(kobj, struct class, kobj), dattr, buf); return (error); } static ssize_t linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct class_attribute *dattr; ssize_t error; dattr = container_of(attr, struct class_attribute, attr); error = -EIO; if (dattr->store) error = dattr->store(container_of(kobj, struct class, kobj), dattr, buf, count); return (error); } static void linux_class_release(struct kobject *kobj) { struct class *class; class = container_of(kobj, struct class, kobj); if (class->class_release) class->class_release(class); } static const struct sysfs_ops linux_class_sysfs = { .show = linux_class_show, .store = linux_class_store, }; const struct kobj_type linux_class_ktype = { .release = linux_class_release, .sysfs_ops = &linux_class_sysfs }; static void linux_dev_release(struct kobject *kobj) { struct device *dev; dev = container_of(kobj, struct device, kobj); /* This is the precedence defined by linux. */ if (dev->release) dev->release(dev); else if (dev->class && dev->class->dev_release) dev->class->dev_release(dev); } static ssize_t linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct device_attribute *dattr; ssize_t error; dattr = container_of(attr, struct device_attribute, attr); error = -EIO; if (dattr->show) error = dattr->show(container_of(kobj, struct device, kobj), dattr, buf); return (error); } static ssize_t linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct device_attribute *dattr; ssize_t error; dattr = container_of(attr, struct device_attribute, attr); error = -EIO; if (dattr->store) error = dattr->store(container_of(kobj, struct device, kobj), dattr, buf, count); return (error); } static const struct sysfs_ops linux_dev_sysfs = { .show = linux_dev_show, .store = linux_dev_store, }; const struct kobj_type linux_dev_ktype = { .release = linux_dev_release, .sysfs_ops = &linux_dev_sysfs }; struct device * device_create(struct class *class, struct device *parent, dev_t devt, void *drvdata, const char *fmt, ...) { struct device *dev; va_list args; dev = kzalloc(sizeof(*dev), M_WAITOK); dev->parent = parent; dev->class = class; dev->devt = devt; dev->driver_data = drvdata; dev->release = linux_device_release; va_start(args, fmt); kobject_set_name_vargs(&dev->kobj, fmt, args); va_end(args); device_register(dev); return (dev); } int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...) { va_list args; int error; kobject_init(kobj, ktype); kobj->ktype = ktype; kobj->parent = parent; kobj->name = NULL; va_start(args, fmt); error = kobject_set_name_vargs(kobj, fmt, args); va_end(args); if (error) return (error); return kobject_add_complete(kobj, parent); } static void linux_kq_lock(void *arg) { spinlock_t *s = arg; spin_lock(s); } static void linux_kq_unlock(void *arg) { spinlock_t *s = arg; spin_unlock(s); } static void linux_kq_assert_lock(void *arg, int what) { #ifdef INVARIANTS spinlock_t *s = arg; if (what == LA_LOCKED) mtx_assert(&s->m, MA_OWNED); else mtx_assert(&s->m, MA_NOTOWNED); #endif } static void linux_file_kqfilter_poll(struct linux_file *, int); struct linux_file * linux_file_alloc(void) { struct linux_file *filp; filp = kzalloc(sizeof(*filp), GFP_KERNEL); /* set initial refcount */ filp->f_count = 1; /* setup fields needed by kqueue support */ spin_lock_init(&filp->f_kqlock); knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock, linux_kq_lock, linux_kq_unlock, linux_kq_assert_lock); return (filp); } void linux_file_free(struct linux_file *filp) { if (filp->_file == NULL) { if (filp->f_op != NULL && filp->f_op->release != NULL) filp->f_op->release(filp->f_vnode, filp); if (filp->f_shmem != NULL) vm_object_deallocate(filp->f_shmem); kfree_rcu(filp, rcu); } else { /* * The close method of the character device or file * will free the linux_file structure: */ _fdrop(filp->_file, curthread); } } static int linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct vm_area_struct *vmap; vmap = linux_cdev_handle_find(vm_obj->handle); MPASS(vmap != NULL); MPASS(vmap->vm_private_data == vm_obj->handle); if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) { vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset; vm_page_t page; if (((*mres)->flags & PG_FICTITIOUS) != 0) { /* * If the passed in result page is a fake * page, update it with the new physical * address. */ page = *mres; vm_page_updatefake(page, paddr, vm_obj->memattr); } else { /* * Replace the passed in "mres" page with our * own fake page and free up the all of the * original pages. */ VM_OBJECT_WUNLOCK(vm_obj); page = vm_page_getfake(paddr, vm_obj->memattr); VM_OBJECT_WLOCK(vm_obj); vm_page_replace(page, vm_obj, (*mres)->pindex, *mres); *mres = page; } vm_page_valid(page); return (VM_PAGER_OK); } return (VM_PAGER_FAIL); } static int linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { struct vm_area_struct *vmap; int err; /* get VM area structure */ vmap = linux_cdev_handle_find(vm_obj->handle); MPASS(vmap != NULL); MPASS(vmap->vm_private_data == vm_obj->handle); VM_OBJECT_WUNLOCK(vm_obj); linux_set_current(curthread); down_write(&vmap->vm_mm->mmap_sem); if (unlikely(vmap->vm_ops == NULL)) { err = VM_FAULT_SIGBUS; } else { struct vm_fault vmf; /* fill out VM fault structure */ vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx); vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0; vmf.pgoff = 0; vmf.page = NULL; vmf.vma = vmap; vmap->vm_pfn_count = 0; vmap->vm_pfn_pcount = &vmap->vm_pfn_count; vmap->vm_obj = vm_obj; err = vmap->vm_ops->fault(&vmf); while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) { kern_yield(PRI_USER); err = vmap->vm_ops->fault(&vmf); } } /* translate return code */ switch (err) { case VM_FAULT_OOM: err = VM_PAGER_AGAIN; break; case VM_FAULT_SIGBUS: err = VM_PAGER_BAD; break; case VM_FAULT_NOPAGE: /* * By contract the fault handler will return having * busied all the pages itself. If pidx is already * found in the object, it will simply xbusy the first * page and return with vm_pfn_count set to 1. */ *first = vmap->vm_pfn_first; *last = *first + vmap->vm_pfn_count - 1; err = VM_PAGER_OK; break; default: err = VM_PAGER_ERROR; break; } up_write(&vmap->vm_mm->mmap_sem); VM_OBJECT_WLOCK(vm_obj); return (err); } static struct rwlock linux_vma_lock; static TAILQ_HEAD(, vm_area_struct) linux_vma_head = TAILQ_HEAD_INITIALIZER(linux_vma_head); static void linux_cdev_handle_free(struct vm_area_struct *vmap) { /* Drop reference on vm_file */ if (vmap->vm_file != NULL) fput(vmap->vm_file); /* Drop reference on mm_struct */ mmput(vmap->vm_mm); kfree(vmap); } static void linux_cdev_handle_remove(struct vm_area_struct *vmap) { rw_wlock(&linux_vma_lock); TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry); rw_wunlock(&linux_vma_lock); } static struct vm_area_struct * linux_cdev_handle_find(void *handle) { struct vm_area_struct *vmap; rw_rlock(&linux_vma_lock); TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) { if (vmap->vm_private_data == handle) break; } rw_runlock(&linux_vma_lock); return (vmap); } static int linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { MPASS(linux_cdev_handle_find(handle) != NULL); *color = 0; return (0); } static void linux_cdev_pager_dtor(void *handle) { const struct vm_operations_struct *vm_ops; struct vm_area_struct *vmap; vmap = linux_cdev_handle_find(handle); MPASS(vmap != NULL); /* * Remove handle before calling close operation to prevent * other threads from reusing the handle pointer. */ linux_cdev_handle_remove(vmap); down_write(&vmap->vm_mm->mmap_sem); vm_ops = vmap->vm_ops; if (likely(vm_ops != NULL)) vm_ops->close(vmap); up_write(&vmap->vm_mm->mmap_sem); linux_cdev_handle_free(vmap); } static struct cdev_pager_ops linux_cdev_pager_ops[2] = { { /* OBJT_MGTDEVICE */ .cdev_pg_populate = linux_cdev_pager_populate, .cdev_pg_ctor = linux_cdev_pager_ctor, .cdev_pg_dtor = linux_cdev_pager_dtor }, { /* OBJT_DEVICE */ .cdev_pg_fault = linux_cdev_pager_fault, .cdev_pg_ctor = linux_cdev_pager_ctor, .cdev_pg_dtor = linux_cdev_pager_dtor }, }; int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { vm_object_t obj; vm_page_t m; obj = vma->vm_obj; if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0) return (-ENOTSUP); VM_OBJECT_RLOCK(obj); for (m = vm_page_find_least(obj, OFF_TO_IDX(address)); m != NULL && m->pindex < OFF_TO_IDX(address + size); m = TAILQ_NEXT(m, listq)) pmap_remove_all(m); VM_OBJECT_RUNLOCK(obj); return (0); } static struct file_operations dummy_ldev_ops = { /* XXXKIB */ }; static struct linux_cdev dummy_ldev = { .ops = &dummy_ldev_ops, }; #define LDEV_SI_DTR 0x0001 #define LDEV_SI_REF 0x0002 static void linux_get_fop(struct linux_file *filp, const struct file_operations **fop, struct linux_cdev **dev) { struct linux_cdev *ldev; u_int siref; ldev = filp->f_cdev; *fop = filp->f_op; if (ldev != NULL) { if (ldev->kobj.ktype == &linux_cdev_static_ktype) { refcount_acquire(&ldev->refs); } else { for (siref = ldev->siref;;) { if ((siref & LDEV_SI_DTR) != 0) { ldev = &dummy_ldev; *fop = ldev->ops; siref = ldev->siref; MPASS((ldev->siref & LDEV_SI_DTR) == 0); } else if (atomic_fcmpset_int(&ldev->siref, &siref, siref + LDEV_SI_REF)) { break; } } } } *dev = ldev; } static void linux_drop_fop(struct linux_cdev *ldev) { if (ldev == NULL) return; if (ldev->kobj.ktype == &linux_cdev_static_ktype) { linux_cdev_deref(ldev); } else { MPASS(ldev->kobj.ktype == &linux_cdev_ktype); MPASS((ldev->siref & ~LDEV_SI_DTR) != 0); atomic_subtract_int(&ldev->siref, LDEV_SI_REF); } } #define OPW(fp,td,code) ({ \ struct file *__fpop; \ __typeof(code) __retval; \ \ __fpop = (td)->td_fpop; \ (td)->td_fpop = (fp); \ __retval = (code); \ (td)->td_fpop = __fpop; \ __retval; \ }) static int linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *file) { struct linux_cdev *ldev; struct linux_file *filp; const struct file_operations *fop; int error; ldev = dev->si_drv1; filp = linux_file_alloc(); filp->f_dentry = &filp->f_dentry_store; filp->f_op = ldev->ops; filp->f_mode = file->f_flag; filp->f_flags = file->f_flag; filp->f_vnode = file->f_vnode; filp->_file = file; refcount_acquire(&ldev->refs); filp->f_cdev = ldev; linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->open != NULL) { error = -fop->open(file->f_vnode, filp); if (error != 0) { linux_drop_fop(ldev); linux_cdev_deref(filp->f_cdev); kfree(filp); return (error); } } /* hold on to the vnode - used for fstat() */ vhold(filp->f_vnode); /* release the file from devfs */ finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); linux_drop_fop(ldev); return (ENXIO); } #define LINUX_IOCTL_MIN_PTR 0x10000UL #define LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX) static inline int linux_remap_address(void **uaddr, size_t len) { uintptr_t uaddr_val = (uintptr_t)(*uaddr); if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR && uaddr_val < LINUX_IOCTL_MAX_PTR)) { struct task_struct *pts = current; if (pts == NULL) { *uaddr = NULL; return (1); } /* compute data offset */ uaddr_val -= LINUX_IOCTL_MIN_PTR; /* check that length is within bounds */ if ((len > IOCPARM_MAX) || (uaddr_val + len) > pts->bsd_ioctl_len) { *uaddr = NULL; return (1); } /* re-add kernel buffer address */ uaddr_val += (uintptr_t)pts->bsd_ioctl_data; /* update address location */ *uaddr = (void *)uaddr_val; return (1); } return (0); } int linux_copyin(const void *uaddr, void *kaddr, size_t len) { if (linux_remap_address(__DECONST(void **, &uaddr), len)) { if (uaddr == NULL) return (-EFAULT); memcpy(kaddr, uaddr, len); return (0); } return (-copyin(uaddr, kaddr, len)); } int linux_copyout(const void *kaddr, void *uaddr, size_t len) { if (linux_remap_address(&uaddr, len)) { if (uaddr == NULL) return (-EFAULT); memcpy(uaddr, kaddr, len); return (0); } return (-copyout(kaddr, uaddr, len)); } size_t linux_clear_user(void *_uaddr, size_t _len) { uint8_t *uaddr = _uaddr; size_t len = _len; /* make sure uaddr is aligned before going into the fast loop */ while (((uintptr_t)uaddr & 7) != 0 && len > 7) { if (subyte(uaddr, 0)) return (_len); uaddr++; len--; } /* zero 8 bytes at a time */ while (len > 7) { #ifdef __LP64__ if (suword64(uaddr, 0)) return (_len); #else if (suword32(uaddr, 0)) return (_len); if (suword32(uaddr + 4, 0)) return (_len); #endif uaddr += 8; len -= 8; } /* zero fill end, if any */ while (len > 0) { if (subyte(uaddr, 0)) return (_len); uaddr++; len--; } return (0); } int linux_access_ok(const void *uaddr, size_t len) { uintptr_t saddr; uintptr_t eaddr; /* get start and end address */ saddr = (uintptr_t)uaddr; eaddr = (uintptr_t)uaddr + len; /* verify addresses are valid for userspace */ return ((saddr == eaddr) || (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS)); } /* * This function should return either EINTR or ERESTART depending on * the signal type sent to this thread: */ static int linux_get_error(struct task_struct *task, int error) { /* check for signal type interrupt code */ if (error == EINTR || error == ERESTARTSYS || error == ERESTART) { error = -linux_schedule_get_interrupt_value(task); if (error == 0) error = EINTR; } return (error); } static int linux_file_ioctl_sub(struct file *fp, struct linux_file *filp, const struct file_operations *fop, u_long cmd, caddr_t data, struct thread *td) { struct task_struct *task = current; unsigned size; int error; size = IOCPARM_LEN(cmd); /* refer to logic in sys_ioctl() */ if (size > 0) { /* * Setup hint for linux_copyin() and linux_copyout(). * * Background: Linux code expects a user-space address * while FreeBSD supplies a kernel-space address. */ task->bsd_ioctl_data = data; task->bsd_ioctl_len = size; data = (void *)LINUX_IOCTL_MIN_PTR; } else { /* fetch user-space pointer */ data = *(void **)data; } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { /* try the compat IOCTL handler first */ if (fop->compat_ioctl != NULL) { error = -OPW(fp, td, fop->compat_ioctl(filp, cmd, (u_long)data)); } else { error = ENOTTY; } /* fallback to the regular IOCTL handler, if any */ if (error == ENOTTY && fop->unlocked_ioctl != NULL) { error = -OPW(fp, td, fop->unlocked_ioctl(filp, cmd, (u_long)data)); } } else #endif { if (fop->unlocked_ioctl != NULL) { error = -OPW(fp, td, fop->unlocked_ioctl(filp, cmd, (u_long)data)); } else { error = ENOTTY; } } if (size > 0) { task->bsd_ioctl_data = NULL; task->bsd_ioctl_len = 0; } if (error == EWOULDBLOCK) { /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE); } else { error = linux_get_error(task, error); } return (error); } #define LINUX_POLL_TABLE_NORMAL ((poll_table *)1) /* * This function atomically updates the poll wakeup state and returns * the previous state at the time of update. */ static uint8_t linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate) { int c, old; c = v->counter; while ((old = atomic_cmpxchg(v, c, pstate[c])) != c) c = old; return (c); } static int linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */ [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY, [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */ }; struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_QUEUED: linux_poll_wakeup(filp); return (1); default: return (0); } } void linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY, [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */ [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED, }; /* check if we are called inside the select system call */ if (p == LINUX_POLL_TABLE_NORMAL) selrecord(curthread, &filp->f_selinfo); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_INIT: /* NOTE: file handles can only belong to one wait-queue */ filp->f_wait_queue.wqh = wqh; filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback; add_wait_queue(wqh, &filp->f_wait_queue.wq); atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED); break; default: break; } } static void linux_poll_wait_dequeue(struct linux_file *filp) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */ [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT, [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT, [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT, }; seldrain(&filp->f_selinfo); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_NOT_READY: case LINUX_FWQ_STATE_QUEUED: case LINUX_FWQ_STATE_READY: remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq); break; default: break; } } void linux_poll_wakeup(struct linux_file *filp) { /* this function should be NULL-safe */ if (filp == NULL) return; selwakeup(&filp->f_selinfo); spin_lock(&filp->f_kqlock); filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ | LINUX_KQ_FLAG_NEED_WRITE; /* make sure the "knote" gets woken up */ KNOTE_LOCKED(&filp->f_selinfo.si_note, 1); spin_unlock(&filp->f_kqlock); } static void linux_file_kqfilter_detach(struct knote *kn) { struct linux_file *filp = kn->kn_hook; spin_lock(&filp->f_kqlock); knlist_remove(&filp->f_selinfo.si_note, kn, 1); spin_unlock(&filp->f_kqlock); } static int linux_file_kqfilter_read_event(struct knote *kn, long hint) { struct linux_file *filp = kn->kn_hook; mtx_assert(&filp->f_kqlock.m, MA_OWNED); return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0); } static int linux_file_kqfilter_write_event(struct knote *kn, long hint) { struct linux_file *filp = kn->kn_hook; mtx_assert(&filp->f_kqlock.m, MA_OWNED); return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0); } static struct filterops linux_dev_kqfiltops_read = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_read_event, }; static struct filterops linux_dev_kqfiltops_write = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_write_event, }; static void linux_file_kqfilter_poll(struct linux_file *filp, int kqflags) { struct thread *td; const struct file_operations *fop; struct linux_cdev *ldev; int temp; if ((filp->f_kqflags & kqflags) == 0) return; td = curthread; linux_get_fop(filp, &fop, &ldev); /* get the latest polling state */ temp = OPW(filp->_file, td, fop->poll(filp, NULL)); linux_drop_fop(ldev); spin_lock(&filp->f_kqlock); /* clear kqflags */ filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ | LINUX_KQ_FLAG_NEED_WRITE); /* update kqflags */ if ((temp & (POLLIN | POLLOUT)) != 0) { if ((temp & POLLIN) != 0) filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ; if ((temp & POLLOUT) != 0) filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE; /* make sure the "knote" gets woken up */ KNOTE_LOCKED(&filp->f_selinfo.si_note, 0); } spin_unlock(&filp->f_kqlock); } static int linux_file_kqfilter(struct file *file, struct knote *kn) { struct linux_file *filp; struct thread *td; int error; td = curthread; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; if (filp->f_op->poll == NULL) return (EINVAL); spin_lock(&filp->f_kqlock); switch (kn->kn_filter) { case EVFILT_READ: filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ; kn->kn_fop = &linux_dev_kqfiltops_read; kn->kn_hook = filp; knlist_add(&filp->f_selinfo.si_note, kn, 1); error = 0; break; case EVFILT_WRITE: filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE; kn->kn_fop = &linux_dev_kqfiltops_write; kn->kn_hook = filp; knlist_add(&filp->f_selinfo.si_note, kn, 1); error = 0; break; default: error = EINVAL; break; } spin_unlock(&filp->f_kqlock); if (error == 0) { linux_set_current(td); /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE); } return (error); } static int linux_file_mmap_single(struct file *fp, const struct file_operations *fop, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot, bool is_shared, struct thread *td) { struct task_struct *task; struct vm_area_struct *vmap; struct mm_struct *mm; struct linux_file *filp; vm_memattr_t attr; int error; filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; if (fop->mmap == NULL) return (EOPNOTSUPP); linux_set_current(td); /* * The same VM object might be shared by multiple processes * and the mm_struct is usually freed when a process exits. * * The atomic reference below makes sure the mm_struct is * available as long as the vmap is in the linux_vma_head. */ task = current; mm = task->mm; if (atomic_inc_not_zero(&mm->mm_users) == 0) return (EINVAL); vmap = kzalloc(sizeof(*vmap), GFP_KERNEL); vmap->vm_start = 0; vmap->vm_end = size; vmap->vm_pgoff = *offset / PAGE_SIZE; vmap->vm_pfn = 0; vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL); if (is_shared) vmap->vm_flags |= VM_SHARED; vmap->vm_ops = NULL; vmap->vm_file = get_file(filp); vmap->vm_mm = mm; if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) { error = linux_get_error(task, EINTR); } else { error = -OPW(fp, td, fop->mmap(filp, vmap)); error = linux_get_error(task, error); up_write(&vmap->vm_mm->mmap_sem); } if (error != 0) { linux_cdev_handle_free(vmap); return (error); } attr = pgprot2cachemode(vmap->vm_page_prot); if (vmap->vm_ops != NULL) { struct vm_area_struct *ptr; void *vm_private_data; bool vm_no_fault; if (vmap->vm_ops->open == NULL || vmap->vm_ops->close == NULL || vmap->vm_private_data == NULL) { /* free allocated VM area struct */ linux_cdev_handle_free(vmap); return (EINVAL); } vm_private_data = vmap->vm_private_data; rw_wlock(&linux_vma_lock); TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) { if (ptr->vm_private_data == vm_private_data) break; } /* check if there is an existing VM area struct */ if (ptr != NULL) { /* check if the VM area structure is invalid */ if (ptr->vm_ops == NULL || ptr->vm_ops->open == NULL || ptr->vm_ops->close == NULL) { error = ESTALE; vm_no_fault = 1; } else { error = EEXIST; vm_no_fault = (ptr->vm_ops->fault == NULL); } } else { /* insert VM area structure into list */ TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry); error = 0; vm_no_fault = (vmap->vm_ops->fault == NULL); } rw_wunlock(&linux_vma_lock); if (error != 0) { /* free allocated VM area struct */ linux_cdev_handle_free(vmap); /* check for stale VM area struct */ if (error != EEXIST) return (error); } /* check if there is no fault handler */ if (vm_no_fault) { *object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE, &linux_cdev_pager_ops[1], size, nprot, *offset, td->td_ucred); } else { *object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE, &linux_cdev_pager_ops[0], size, nprot, *offset, td->td_ucred); } /* check if allocating the VM object failed */ if (*object == NULL) { if (error == 0) { /* remove VM area struct from list */ linux_cdev_handle_remove(vmap); /* free allocated VM area struct */ linux_cdev_handle_free(vmap); } return (EINVAL); } } else { struct sglist *sg; sg = sglist_alloc(1, M_WAITOK); sglist_append_phys(sg, (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len); *object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len, nprot, 0, td->td_ucred); linux_cdev_handle_free(vmap); if (*object == NULL) { sglist_free(sg); return (EINVAL); } } if (attr != VM_MEMATTR_DEFAULT) { VM_OBJECT_WLOCK(*object); vm_object_set_memattr(*object, attr); VM_OBJECT_WUNLOCK(*object); } *offset = 0; return (0); } struct cdevsw linuxcdevsw = { .d_version = D_VERSION, .d_fdopen = linux_dev_fdopen, .d_name = "lkpidev", }; static int linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; ssize_t bytes; int error; error = 0; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; /* XXX no support for I/O vectors currently */ if (uio->uio_iovcnt != 1) return (EOPNOTSUPP); if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->read != NULL) { bytes = OPW(file, td, fop->read(filp, uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; } else { error = linux_get_error(current, -bytes); } } else error = ENXIO; /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ); linux_drop_fop(ldev); return (error); } static int linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; ssize_t bytes; int error; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; /* XXX no support for I/O vectors currently */ if (uio->uio_iovcnt != 1) return (EOPNOTSUPP); if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->write != NULL) { bytes = OPW(file, td, fop->write(filp, uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; error = 0; } else { error = linux_get_error(current, -bytes); } } else error = ENXIO; /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE); linux_drop_fop(ldev); return (error); } static int linux_file_poll(struct file *file, int events, struct ucred *active_cred, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; int revents; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->poll != NULL) { revents = OPW(file, td, fop->poll(filp, LINUX_POLL_TABLE_NORMAL)) & events; } else { revents = 0; } linux_drop_fop(ldev); return (revents); } static int linux_file_close(struct file *file, struct thread *td) { struct linux_file *filp; int (*release)(struct inode *, struct linux_file *); const struct file_operations *fop; struct linux_cdev *ldev; int error; filp = (struct linux_file *)file->f_data; KASSERT(file_count(filp) == 0, ("File refcount(%d) is not zero", file_count(filp))); if (td == NULL) td = curthread; error = 0; filp->f_flags = file->f_flag; linux_set_current(td); linux_poll_wait_dequeue(filp); linux_get_fop(filp, &fop, &ldev); /* * Always use the real release function, if any, to avoid * leaking device resources: */ release = filp->f_op->release; if (release != NULL) error = -OPW(file, td, release(filp->f_vnode, filp)); funsetown(&filp->f_sigio); if (filp->f_vnode != NULL) vdrop(filp->f_vnode); linux_drop_fop(ldev); ldev = filp->f_cdev; if (ldev != NULL) linux_cdev_deref(ldev); linux_synchronize_rcu(RCU_TYPE_REGULAR); kfree(filp); return (error); } static int linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; struct fiodgname_arg *fgn; const char *p; int error, i; error = 0; filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; linux_get_fop(filp, &fop, &ldev); linux_set_current(td); switch (cmd) { case FIONBIO: break; case FIOASYNC: if (fop->fasync == NULL) break; error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); break; case FIOSETOWN: error = fsetown(*(int *)data, &filp->f_sigio); if (error == 0) { if (fop->fasync == NULL) break; error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); } break; case FIOGETOWN: *(int *)data = fgetown(&filp->f_sigio); break; case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) { error = ENXIO; break; } fgn = data; p = devtoname(filp->f_cdev->cdev); i = strlen(p) + 1; if (i > fgn->len) { error = EINVAL; break; } error = copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i); break; default: error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td); break; } linux_drop_fop(ldev); return (error); } static int linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t maxprot, int flags, struct file *fp, vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp) { /* * Character devices do not provide private mappings * of any kind: */ if ((maxprot & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0) return (EINVAL); return (linux_file_mmap_single(fp, fop, foff, objsize, objp, (int)prot, (flags & MAP_SHARED) ? true : false, td)); } static int linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; struct mount *mp; struct vnode *vp; vm_object_t object; vm_prot_t maxprot; int error; filp = (struct linux_file *)fp->f_data; vp = filp->f_vnode; if (vp == NULL) return (EOPNOTSUPP); /* * Ensure that file and memory protections are * compatible. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE * requests rather than doing it here. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; linux_get_fop(filp, &fop, &ldev); error = linux_file_mmap_sub(td, size, prot, maxprot, flags, fp, &foff, fop, &object); if (error != 0) goto out; error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); out: linux_drop_fop(ldev); return (error); } static int -linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, - struct thread *td) +linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct linux_file *filp; struct vnode *vp; int error; filp = (struct linux_file *)fp->f_data; if (filp->f_vnode == NULL) return (EOPNOTSUPP); vp = filp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); - error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td); + error = VOP_STAT(vp, sb, curthread->td_ucred, NOCRED); VOP_UNLOCK(vp); return (error); } static int linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct linux_file *filp; struct vnode *vp; int error; filp = fp->f_data; vp = filp->f_vnode; if (vp == NULL) { error = 0; kif->kf_type = KF_TYPE_DEV; } else { vref(vp); FILEDESC_SUNLOCK(fdp); error = vn_fill_kinfo_vnode(vp, kif); vrele(vp); kif->kf_type = KF_TYPE_VNODE; FILEDESC_SLOCK(fdp); } return (error); } unsigned int linux_iminor(struct inode *inode) { struct linux_cdev *ldev; if (inode == NULL || inode->v_rdev == NULL || inode->v_rdev->si_devsw != &linuxcdevsw) return (-1U); ldev = inode->v_rdev->si_drv1; if (ldev == NULL) return (-1U); return (minor(ldev->dev)); } struct fileops linuxfileops = { .fo_read = linux_file_read, .fo_write = linux_file_write, .fo_truncate = invfo_truncate, .fo_kqfilter = linux_file_kqfilter, .fo_stat = linux_file_stat, .fo_fill_kinfo = linux_file_fill_kinfo, .fo_poll = linux_file_poll, .fo_close = linux_file_close, .fo_ioctl = linux_file_ioctl, .fo_mmap = linux_file_mmap, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_flags = DFLAG_PASSABLE, }; /* * Hash of vmmap addresses. This is infrequently accessed and does not * need to be particularly large. This is done because we must store the * caller's idea of the map size to properly unmap. */ struct vmmap { LIST_ENTRY(vmmap) vm_next; void *vm_addr; unsigned long vm_size; }; struct vmmaphd { struct vmmap *lh_first; }; #define VMMAP_HASH_SIZE 64 #define VMMAP_HASH_MASK (VMMAP_HASH_SIZE - 1) #define VM_HASH(addr) ((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE]; static struct mtx vmmaplock; static void vmmap_add(void *addr, unsigned long size) { struct vmmap *vmmap; vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL); mtx_lock(&vmmaplock); vmmap->vm_size = size; vmmap->vm_addr = addr; LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next); mtx_unlock(&vmmaplock); } static struct vmmap * vmmap_remove(void *addr) { struct vmmap *vmmap; mtx_lock(&vmmaplock); LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next) if (vmmap->vm_addr == addr) break; if (vmmap) LIST_REMOVE(vmmap, vm_next); mtx_unlock(&vmmaplock); return (vmmap); } #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) void * _ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr) { void *addr; addr = pmap_mapdev_attr(phys_addr, size, attr); if (addr == NULL) return (NULL); vmmap_add(addr, size); return (addr); } #endif void iounmap(void *addr) { struct vmmap *vmmap; vmmap = vmmap_remove(addr); if (vmmap == NULL) return; #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size); #endif kfree(vmmap); } void * vmap(struct page **pages, unsigned int count, unsigned long flags, int prot) { vm_offset_t off; size_t size; size = count * PAGE_SIZE; off = kva_alloc(size); if (off == 0) return (NULL); vmmap_add((void *)off, size); pmap_qenter(off, pages, count); return ((void *)off); } void vunmap(void *addr) { struct vmmap *vmmap; vmmap = vmmap_remove(addr); if (vmmap == NULL) return; pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE); kva_free((vm_offset_t)addr, vmmap->vm_size); kfree(vmmap); } static char * devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) { unsigned int len; char *p; va_list aq; va_copy(aq, ap); len = vsnprintf(NULL, 0, fmt, aq); va_end(aq); if (dev != NULL) p = devm_kmalloc(dev, len + 1, gfp); else p = kmalloc(len + 1, gfp); if (p != NULL) vsnprintf(p, len + 1, fmt, ap); return (p); } char * kvasprintf(gfp_t gfp, const char *fmt, va_list ap) { return (devm_kvasprintf(NULL, gfp, fmt, ap)); } char * lkpi_devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = devm_kvasprintf(dev, gfp, fmt, ap); va_end(ap); return (p); } char * kasprintf(gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = kvasprintf(gfp, fmt, ap); va_end(ap); return (p); } static void linux_timer_callback_wrapper(void *context) { struct timer_list *timer; timer = context; if (linux_set_current_flags(curthread, M_NOWAIT)) { /* try again later */ callout_reset(&timer->callout, 1, &linux_timer_callback_wrapper, timer); return; } timer->function(timer->data); } int mod_timer(struct timer_list *timer, int expires) { int ret; timer->expires = expires; ret = callout_reset(&timer->callout, linux_timer_jiffies_until(expires), &linux_timer_callback_wrapper, timer); MPASS(ret == 0 || ret == 1); return (ret == 1); } void add_timer(struct timer_list *timer) { callout_reset(&timer->callout, linux_timer_jiffies_until(timer->expires), &linux_timer_callback_wrapper, timer); } void add_timer_on(struct timer_list *timer, int cpu) { callout_reset_on(&timer->callout, linux_timer_jiffies_until(timer->expires), &linux_timer_callback_wrapper, timer, cpu); } int del_timer(struct timer_list *timer) { if (callout_stop(&(timer)->callout) == -1) return (0); return (1); } int del_timer_sync(struct timer_list *timer) { if (callout_drain(&(timer)->callout) == -1) return (0); return (1); } /* greatest common divisor, Euclid equation */ static uint64_t lkpi_gcd_64(uint64_t a, uint64_t b) { uint64_t an; uint64_t bn; while (b != 0) { an = b; bn = a % b; a = an; b = bn; } return (a); } uint64_t lkpi_nsec2hz_rem; uint64_t lkpi_nsec2hz_div = 1000000000ULL; uint64_t lkpi_nsec2hz_max; uint64_t lkpi_usec2hz_rem; uint64_t lkpi_usec2hz_div = 1000000ULL; uint64_t lkpi_usec2hz_max; uint64_t lkpi_msec2hz_rem; uint64_t lkpi_msec2hz_div = 1000ULL; uint64_t lkpi_msec2hz_max; static void linux_timer_init(void *arg) { uint64_t gcd; /* * Compute an internal HZ value which can divide 2**32 to * avoid timer rounding problems when the tick value wraps * around 2**32: */ linux_timer_hz_mask = 1; while (linux_timer_hz_mask < (unsigned long)hz) linux_timer_hz_mask *= 2; linux_timer_hz_mask--; /* compute some internal constants */ lkpi_nsec2hz_rem = hz; lkpi_usec2hz_rem = hz; lkpi_msec2hz_rem = hz; gcd = lkpi_gcd_64(lkpi_nsec2hz_rem, lkpi_nsec2hz_div); lkpi_nsec2hz_rem /= gcd; lkpi_nsec2hz_div /= gcd; lkpi_nsec2hz_max = -1ULL / lkpi_nsec2hz_rem; gcd = lkpi_gcd_64(lkpi_usec2hz_rem, lkpi_usec2hz_div); lkpi_usec2hz_rem /= gcd; lkpi_usec2hz_div /= gcd; lkpi_usec2hz_max = -1ULL / lkpi_usec2hz_rem; gcd = lkpi_gcd_64(lkpi_msec2hz_rem, lkpi_msec2hz_div); lkpi_msec2hz_rem /= gcd; lkpi_msec2hz_div /= gcd; lkpi_msec2hz_max = -1ULL / lkpi_msec2hz_rem; } SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL); void linux_complete_common(struct completion *c, int all) { int wakeup_swapper; sleepq_lock(c); if (all) { c->done = UINT_MAX; wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0); } else { if (c->done != UINT_MAX) c->done++; wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0); } sleepq_release(c); if (wakeup_swapper) kick_proc0(); } /* * Indefinite wait for done != 0 with or without signals. */ int linux_wait_for_common(struct completion *c, int flags) { struct task_struct *task; int error; if (SCHEDULER_STOPPED()) return (0); task = current; if (flags != 0) flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; else flags = SLEEPQ_SLEEP; error = 0; for (;;) { sleepq_lock(c); if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); if (flags & SLEEPQ_INTERRUPTIBLE) { DROP_GIANT(); error = -sleepq_wait_sig(c, 0); PICKUP_GIANT(); if (error != 0) { linux_schedule_save_interrupt_value(task, error); error = -ERESTARTSYS; goto intr; } } else { DROP_GIANT(); sleepq_wait(c, 0); PICKUP_GIANT(); } } if (c->done != UINT_MAX) c->done--; sleepq_release(c); intr: return (error); } /* * Time limited wait for done != 0 with or without signals. */ int linux_wait_for_timeout_common(struct completion *c, int timeout, int flags) { struct task_struct *task; int end = jiffies + timeout; int error; if (SCHEDULER_STOPPED()) return (0); task = current; if (flags != 0) flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; else flags = SLEEPQ_SLEEP; for (;;) { sleepq_lock(c); if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); sleepq_set_timeout(c, linux_timer_jiffies_until(end)); DROP_GIANT(); if (flags & SLEEPQ_INTERRUPTIBLE) error = -sleepq_timedwait_sig(c, 0); else error = -sleepq_timedwait(c, 0); PICKUP_GIANT(); if (error != 0) { /* check for timeout */ if (error == -EWOULDBLOCK) { error = 0; /* timeout */ } else { /* signal happened */ linux_schedule_save_interrupt_value(task, error); error = -ERESTARTSYS; } goto done; } } if (c->done != UINT_MAX) c->done--; sleepq_release(c); /* return how many jiffies are left */ error = linux_timer_jiffies_until(end); done: return (error); } int linux_try_wait_for_completion(struct completion *c) { int isdone; sleepq_lock(c); isdone = (c->done != 0); if (c->done != 0 && c->done != UINT_MAX) c->done--; sleepq_release(c); return (isdone); } int linux_completion_done(struct completion *c) { int isdone; sleepq_lock(c); isdone = (c->done != 0); sleepq_release(c); return (isdone); } static void linux_cdev_deref(struct linux_cdev *ldev) { if (refcount_release(&ldev->refs) && ldev->kobj.ktype == &linux_cdev_ktype) kfree(ldev); } static void linux_cdev_release(struct kobject *kobj) { struct linux_cdev *cdev; struct kobject *parent; cdev = container_of(kobj, struct linux_cdev, kobj); parent = kobj->parent; linux_destroy_dev(cdev); linux_cdev_deref(cdev); kobject_put(parent); } static void linux_cdev_static_release(struct kobject *kobj) { struct cdev *cdev; struct linux_cdev *ldev; ldev = container_of(kobj, struct linux_cdev, kobj); cdev = ldev->cdev; if (cdev != NULL) { destroy_dev(cdev); ldev->cdev = NULL; } kobject_put(kobj->parent); } int linux_cdev_device_add(struct linux_cdev *ldev, struct device *dev) { int ret; if (dev->devt != 0) { /* Set parent kernel object. */ ldev->kobj.parent = &dev->kobj; /* * Unlike Linux we require the kobject of the * character device structure to have a valid name * before calling this function: */ if (ldev->kobj.name == NULL) return (-EINVAL); ret = cdev_add(ldev, dev->devt, 1); if (ret) return (ret); } ret = device_add(dev); if (ret != 0 && dev->devt != 0) cdev_del(ldev); return (ret); } void linux_cdev_device_del(struct linux_cdev *ldev, struct device *dev) { device_del(dev); if (dev->devt != 0) cdev_del(ldev); } static void linux_destroy_dev(struct linux_cdev *ldev) { if (ldev->cdev == NULL) return; MPASS((ldev->siref & LDEV_SI_DTR) == 0); MPASS(ldev->kobj.ktype == &linux_cdev_ktype); atomic_set_int(&ldev->siref, LDEV_SI_DTR); while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0) pause("ldevdtr", hz / 4); destroy_dev(ldev->cdev); ldev->cdev = NULL; } const struct kobj_type linux_cdev_ktype = { .release = linux_cdev_release, }; const struct kobj_type linux_cdev_static_ktype = { .release = linux_cdev_static_release, }; static void linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; if (linkstate == LINK_STATE_UP) nb->notifier_call(nb, NETDEV_UP, &ni); else nb->notifier_call(nb, NETDEV_DOWN, &ni); } static void linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_REGISTER, &ni); } static void linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_UNREGISTER, &ni); } static void linux_handle_iflladdr_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_CHANGEADDR, &ni); } static void linux_handle_ifaddr_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_CHANGEIFADDR, &ni); } int register_netdevice_notifier(struct notifier_block *nb) { nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER( ifnet_link_event, linux_handle_ifnet_link_event, nb, 0); nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER( ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0); nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER( ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0); nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER( iflladdr_event, linux_handle_iflladdr_event, nb, 0); return (0); } int register_inetaddr_notifier(struct notifier_block *nb) { nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER( ifaddr_event, linux_handle_ifaddr_event, nb, 0); return (0); } int unregister_netdevice_notifier(struct notifier_block *nb) { EVENTHANDLER_DEREGISTER(ifnet_link_event, nb->tags[NETDEV_UP]); EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nb->tags[NETDEV_REGISTER]); EVENTHANDLER_DEREGISTER(ifnet_departure_event, nb->tags[NETDEV_UNREGISTER]); EVENTHANDLER_DEREGISTER(iflladdr_event, nb->tags[NETDEV_CHANGEADDR]); return (0); } int unregister_inetaddr_notifier(struct notifier_block *nb) { EVENTHANDLER_DEREGISTER(ifaddr_event, nb->tags[NETDEV_CHANGEIFADDR]); return (0); } struct list_sort_thunk { int (*cmp)(void *, struct list_head *, struct list_head *); void *priv; }; static inline int linux_le_cmp(void *priv, const void *d1, const void *d2) { struct list_head *le1, *le2; struct list_sort_thunk *thunk; thunk = priv; le1 = *(__DECONST(struct list_head **, d1)); le2 = *(__DECONST(struct list_head **, d2)); return ((thunk->cmp)(thunk->priv, le1, le2)); } void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)) { struct list_sort_thunk thunk; struct list_head **ar, *le; size_t count, i; count = 0; list_for_each(le, head) count++; ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK); i = 0; list_for_each(le, head) ar[i++] = le; thunk.cmp = cmp; thunk.priv = priv; qsort_r(ar, count, sizeof(struct list_head *), &thunk, linux_le_cmp); INIT_LIST_HEAD(head); for (i = 0; i < count; i++) list_add_tail(ar[i], head); free(ar, M_KMALLOC); } #if defined(__i386__) || defined(__amd64__) int linux_wbinvd_on_all_cpus(void) { pmap_invalidate_cache(); return (0); } #endif int linux_on_each_cpu(void callback(void *), void *data) { smp_rendezvous(smp_no_rendezvous_barrier, callback, smp_no_rendezvous_barrier, data); return (0); } int linux_in_atomic(void) { return ((curthread->td_pflags & TDP_NOFAULTING) != 0); } struct linux_cdev * linux_find_cdev(const char *name, unsigned major, unsigned minor) { dev_t dev = MKDEV(major, minor); struct cdev *cdev; dev_lock(); LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) { struct linux_cdev *ldev = cdev->si_drv1; if (ldev->dev == dev && strcmp(kobject_name(&ldev->kobj), name) == 0) { break; } } dev_unlock(); return (cdev != NULL ? cdev->si_drv1 : NULL); } int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct linux_cdev *cdev; int ret = 0; int i; for (i = baseminor; i < baseminor + count; i++) { cdev = cdev_alloc(); cdev->ops = fops; kobject_set_name(&cdev->kobj, name); ret = cdev_add(cdev, makedev(major, i), 1); if (ret != 0) break; } return (ret); } int __register_chrdev_p(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops, uid_t uid, gid_t gid, int mode) { struct linux_cdev *cdev; int ret = 0; int i; for (i = baseminor; i < baseminor + count; i++) { cdev = cdev_alloc(); cdev->ops = fops; kobject_set_name(&cdev->kobj, name); ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode); if (ret != 0) break; } return (ret); } void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct linux_cdev *cdevp; int i; for (i = baseminor; i < baseminor + count; i++) { cdevp = linux_find_cdev(name, major, i); if (cdevp != NULL) cdev_del(cdevp); } } void linux_dump_stack(void) { #ifdef STACK struct stack st; stack_zero(&st); stack_save(&st); stack_print(&st); #endif } int linuxkpi_net_ratelimit(void) { return (ppsratecheck(&lkpi_net_lastlog, &lkpi_net_curpps, lkpi_net_maxpps)); } #if defined(__i386__) || defined(__amd64__) bool linux_cpu_has_clflush; #endif static void linux_compat_init(void *arg) { struct sysctl_oid *rootoid; int i; #if defined(__i386__) || defined(__amd64__) linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH); #endif rw_init(&linux_vma_lock, "lkpi-vma-lock"); rootoid = SYSCTL_ADD_ROOT_NODE(NULL, OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys"); kobject_init(&linux_class_root, &linux_class_ktype); kobject_set_name(&linux_class_root, "class"); linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class"); kobject_init(&linux_root_device.kobj, &linux_dev_ktype); kobject_set_name(&linux_root_device.kobj, "device"); linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "device"); linux_root_device.bsddev = root_bus; linux_class_misc.name = "misc"; class_register(&linux_class_misc); INIT_LIST_HEAD(&pci_drivers); INIT_LIST_HEAD(&pci_devices); spin_lock_init(&pci_lock); mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF); for (i = 0; i < VMMAP_HASH_SIZE; i++) LIST_INIT(&vmmaphead[i]); init_waitqueue_head(&linux_bit_waitq); init_waitqueue_head(&linux_var_waitq); CPU_COPY(&all_cpus, &cpu_online_mask); } SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL); static void linux_compat_uninit(void *arg) { linux_kobject_kfree_name(&linux_class_root); linux_kobject_kfree_name(&linux_root_device.kobj); linux_kobject_kfree_name(&linux_class_misc.kobj); mtx_destroy(&vmmaplock); spin_lock_destroy(&pci_lock); rw_destroy(&linux_vma_lock); } SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL); /* * NOTE: Linux frequently uses "unsigned long" for pointer to integer * conversion and vice versa, where in FreeBSD "uintptr_t" would be * used. Assert these types have the same size, else some parts of the * LinuxKPI may not work like expected: */ CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t)); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c index a3d67aaa11ba..6ff0ffaf1c43 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c @@ -1,299 +1,299 @@ /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) { struct thread *td; int rc, fd; td = curthread; pwd_ensure_dirs(); /* 12.x doesn't take a const char * */ rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path), UIO_SYSSPACE, flags, mode); if (rc) return (SET_ERROR(rc)); fd = td->td_retval[0]; td->td_retval[0] = 0; if (fget(curthread, fd, &cap_no_rights, fpp)) kern_close(td, fd); return (0); } void zfs_file_close(zfs_file_t *fp) { fo_close(fp, curthread); } static int zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp, ssize_t *resid) { ssize_t rc; struct uio auio; struct thread *td; struct iovec aiov; td = curthread; aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_resid = count; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = *offp; if ((fp->f_flag & FWRITE) == 0) return (SET_ERROR(EBADF)); if (fp->f_type == DTYPE_VNODE) bwillwrite(); rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td); if (rc) return (SET_ERROR(rc)); if (resid) *resid = auio.uio_resid; else if (auio.uio_resid) return (SET_ERROR(EIO)); *offp += count - auio.uio_resid; return (rc); } int zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) { loff_t off = fp->f_offset; ssize_t rc; rc = zfs_file_write_impl(fp, buf, count, &off, resid); if (rc == 0) fp->f_offset = off; return (SET_ERROR(rc)); } int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off, ssize_t *resid) { return (zfs_file_write_impl(fp, buf, count, &off, resid)); } static int zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp, ssize_t *resid) { ssize_t rc; struct uio auio; struct thread *td; struct iovec aiov; td = curthread; aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_resid = count; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = *offp; if ((fp->f_flag & FREAD) == 0) return (SET_ERROR(EBADF)); rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td); if (rc) return (SET_ERROR(rc)); if (resid) *resid = auio.uio_resid; *offp += count - auio.uio_resid; return (SET_ERROR(0)); } int zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) { loff_t off = fp->f_offset; ssize_t rc; rc = zfs_file_read_impl(fp, buf, count, &off, resid); if (rc == 0) fp->f_offset = off; return (rc); } int zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, ssize_t *resid) { return (zfs_file_read_impl(fp, buf, count, &off, resid)); } int zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) { int rc; struct thread *td; td = curthread; if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) return (SET_ERROR(ESPIPE)); rc = fo_seek(fp, *offp, whence, td); if (rc == 0) *offp = td->td_uretoff.tdu_off; return (SET_ERROR(rc)); } int zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) { struct thread *td; struct stat sb; int rc; td = curthread; - rc = fo_stat(fp, &sb, td->td_ucred, td); + rc = fo_stat(fp, &sb, td->td_ucred); if (rc) return (SET_ERROR(rc)); zfattr->zfa_size = sb.st_size; zfattr->zfa_mode = sb.st_mode; return (0); } static __inline int zfs_vop_fsync(vnode_t *vp) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK1(vp); vn_finished_write(mp); drop: return (SET_ERROR(error)); } int zfs_file_fsync(zfs_file_t *fp, int flags) { if (fp->f_type != DTYPE_VNODE) return (EINVAL); return (zfs_vop_fsync(fp->f_vnode)); } zfs_file_t * zfs_file_get(int fd) { struct file *fp; if (fget(curthread, fd, &cap_no_rights, &fp)) return (NULL); return (fp); } void zfs_file_put(zfs_file_t *fp) { fdrop(fp, curthread); } loff_t zfs_file_off(zfs_file_t *fp) { return (fp->f_offset); } void * zfs_file_private(zfs_file_t *fp) { file_t *tmpfp; void *data; int error; tmpfp = curthread->td_fpop; curthread->td_fpop = fp; error = devfs_get_cdevpriv(&data); curthread->td_fpop = tmpfp; if (error != 0) return (NULL); return (data); } int zfs_file_unlink(const char *fnamep) { zfs_uio_seg_t seg = UIO_SYSSPACE; int rc; #if __FreeBSD_version >= 1300018 rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0); #elif __FreeBSD_version >= 1202504 || defined(AT_BENEATH) rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), seg, 0, 0); #else rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), seg, 0); #endif return (SET_ERROR(rc)); } diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index 9a327c02ee96..964d288324b4 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -1,2122 +1,2122 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 * * $FreeBSD$ */ /* * TODO: * mkdir: want it ? */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct vop_vector devfs_vnodeops; static struct vop_vector devfs_specops; static struct fileops devfs_ops_f; #include #include #include #include #include #include static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data"); struct mtx devfs_de_interlock; MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF); struct sx clone_drain_lock; SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock"); struct mtx cdevpriv_mtx; MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF); SYSCTL_DECL(_vfs_devfs); static int devfs_dotimes; SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW, &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision"); /* * Update devfs node timestamp. Note that updates are unlocked and * stat(2) could see partially updated times. */ static void devfs_timestamp(struct timespec *tsp) { time_t ts; if (devfs_dotimes) { vfs_timestamp(tsp); } else { ts = time_second; if (tsp->tv_sec != ts) { tsp->tv_sec = ts; tsp->tv_nsec = 0; } } } static int devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp, int *ref) { *dswp = devvn_refthread(fp->f_vnode, devp, ref); if (*dswp == NULL || *devp != fp->f_data) { if (*dswp != NULL) dev_relthread(*devp, *ref); return (ENXIO); } KASSERT((*devp)->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp))); if (*dswp == NULL) return (ENXIO); curthread->td_fpop = fp; return (0); } int devfs_get_cdevpriv(void **datap) { struct file *fp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (EBADF); p = fp->f_cdevpriv; if (p != NULL) { error = 0; *datap = p->cdpd_data; } else error = ENOENT; return (error); } int devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr) { struct file *fp; struct cdev_priv *cdp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (ENOENT); cdp = cdev2priv((struct cdev *)fp->f_data); p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK); p->cdpd_data = priv; p->cdpd_dtr = priv_dtr; p->cdpd_fp = fp; mtx_lock(&cdevpriv_mtx); if (fp->f_cdevpriv == NULL) { LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list); fp->f_cdevpriv = p; mtx_unlock(&cdevpriv_mtx); error = 0; } else { mtx_unlock(&cdevpriv_mtx); free(p, M_CDEVPDATA); error = EBUSY; } return (error); } void devfs_destroy_cdevpriv(struct cdev_privdata *p) { mtx_assert(&cdevpriv_mtx, MA_OWNED); KASSERT(p->cdpd_fp->f_cdevpriv == p, ("devfs_destoy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p)); p->cdpd_fp->f_cdevpriv = NULL; LIST_REMOVE(p, cdpd_list); mtx_unlock(&cdevpriv_mtx); (p->cdpd_dtr)(p->cdpd_data); free(p, M_CDEVPDATA); } static void devfs_fpdrop(struct file *fp) { struct cdev_privdata *p; mtx_lock(&cdevpriv_mtx); if ((p = fp->f_cdevpriv) == NULL) { mtx_unlock(&cdevpriv_mtx); return; } devfs_destroy_cdevpriv(p); } void devfs_clear_cdevpriv(void) { struct file *fp; fp = curthread->td_fpop; if (fp == NULL) return; devfs_fpdrop(fp); } static void devfs_usecount_add(struct vnode *vp) { struct devfs_dirent *de; struct cdev *dev; mtx_lock(&devfs_de_interlock); VI_LOCK(vp); VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp); if (VN_IS_DOOMED(vp)) { goto out_unlock; } de = vp->v_data; dev = vp->v_rdev; MPASS(de != NULL); MPASS(dev != NULL); dev->si_usecount++; de->de_usecount++; out_unlock: VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); } static void devfs_usecount_subl(struct vnode *vp) { struct devfs_dirent *de; struct cdev *dev; mtx_assert(&devfs_de_interlock, MA_OWNED); ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp); de = vp->v_data; dev = vp->v_rdev; if (de == NULL) return; if (dev == NULL) { MPASS(de->de_usecount == 0); return; } if (dev->si_usecount < de->de_usecount) panic("%s: si_usecount underflow for dev %p " "(has %ld, dirent has %d)\n", __func__, dev, dev->si_usecount, de->de_usecount); if (VN_IS_DOOMED(vp)) { dev->si_usecount -= de->de_usecount; de->de_usecount = 0; } else { if (de->de_usecount == 0) panic("%s: de_usecount underflow for dev %p\n", __func__, dev); dev->si_usecount--; de->de_usecount--; } } static void devfs_usecount_sub(struct vnode *vp) { mtx_lock(&devfs_de_interlock); VI_LOCK(vp); devfs_usecount_subl(vp); VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); } static int devfs_usecountl(struct vnode *vp) { VNPASS(vp->v_type == VCHR, vp); mtx_assert(&devfs_de_interlock, MA_OWNED); ASSERT_VI_LOCKED(vp, __func__); return (vp->v_rdev->si_usecount); } int devfs_usecount(struct vnode *vp) { int count; VNPASS(vp->v_type == VCHR, vp); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); count = devfs_usecountl(vp); VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); return (count); } void devfs_ctty_ref(struct vnode *vp) { vrefact(vp); devfs_usecount_add(vp); } void devfs_ctty_unref(struct vnode *vp) { devfs_usecount_sub(vp); vrele(vp); } /* * On success devfs_populate_vp() returns with dmp->dm_lock held. */ static int devfs_populate_vp(struct vnode *vp) { struct devfs_dirent *de; struct devfs_mount *dmp; int locked; ASSERT_VOP_LOCKED(vp, "devfs_populate_vp"); dmp = VFSTODEVFS(vp->v_mount); if (!devfs_populate_needed(dmp)) { sx_xlock(&dmp->dm_lock); goto out_nopopulate; } locked = VOP_ISLOCKED(vp); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); /* Can't call devfs_populate() with the vnode lock held. */ VOP_UNLOCK(vp); devfs_populate(dmp); sx_xunlock(&dmp->dm_lock); vn_lock(vp, locked | LK_RETRY); sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ERESTART); } out_nopopulate: if (VN_IS_DOOMED(vp)) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } de = vp->v_data; KASSERT(de != NULL, ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed")); if ((de->de_flags & DE_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } return (0); } static int devfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct devfs_mount *dmp; char *buf = ap->a_buf; size_t *buflen = ap->a_buflen; struct devfs_dirent *dd, *de; int i, error; dmp = VFSTODEVFS(vp->v_mount); error = devfs_populate_vp(vp); if (error != 0) return (error); if (vp->v_type != VCHR && vp->v_type != VDIR) { error = ENOENT; goto finished; } dd = vp->v_data; if (vp->v_type == VDIR && dd == dmp->dm_rootdir) { *dvp = vp; vref(*dvp); goto finished; } i = *buflen; i -= dd->de_dirent->d_namlen; if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen); *buflen = i; de = devfs_parent_dirent(dd); if (de == NULL) { error = ENOENT; goto finished; } mtx_lock(&devfs_de_interlock); *dvp = de->de_vnode; if (*dvp != NULL) { VI_LOCK(*dvp); mtx_unlock(&devfs_de_interlock); vholdl(*dvp); VI_UNLOCK(*dvp); vref(*dvp); vdrop(*dvp); } else { mtx_unlock(&devfs_de_interlock); error = ENOENT; } finished: sx_xunlock(&dmp->dm_lock); return (error); } /* * Construct the fully qualified path name relative to the mountpoint. * If a NULL cnp is provided, no '/' is appended to the resulting path. */ char * devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd, struct componentname *cnp) { int i; struct devfs_dirent *de; sx_assert(&dmp->dm_lock, SA_LOCKED); i = SPECNAMELEN; buf[i] = '\0'; if (cnp != NULL) i -= cnp->cn_namelen; if (i < 0) return (NULL); if (cnp != NULL) bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_rootdir) { if (cnp != NULL || i < SPECNAMELEN) { i--; if (i < 0) return (NULL); buf[i] = '/'; } i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = devfs_parent_dirent(de); if (de == NULL) return (NULL); } return (buf + i); } static int devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp, struct devfs_dirent *de) { int not_found; not_found = 0; if (de->de_flags & DE_DOOMED) not_found = 1; if (DEVFS_DE_DROP(de)) { KASSERT(not_found == 1, ("DEVFS de dropped but not doomed")); devfs_dirent_free(de); } if (DEVFS_DMP_DROP(dmp)) { KASSERT(not_found == 1, ("DEVFS mount struct freed before dirent")); not_found = 2; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } if (not_found == 1 || (drop_dm_lock && not_found != 2)) sx_unlock(&dmp->dm_lock); return (not_found); } static void devfs_insmntque_dtr(struct vnode *vp, void *arg) { struct devfs_dirent *de; de = (struct devfs_dirent *)arg; mtx_lock(&devfs_de_interlock); vp->v_data = NULL; de->de_vnode = NULL; mtx_unlock(&devfs_de_interlock); vgone(vp); vput(vp); } /* * devfs_allocv shall be entered with dmp->dm_lock held, and it drops * it on return. */ int devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode, struct vnode **vpp) { int error; struct vnode *vp; struct cdev *dev; struct devfs_mount *dmp; struct cdevsw *dsw; enum vgetstate vs; dmp = VFSTODEVFS(mp); if (de->de_flags & DE_DOOMED) { sx_xunlock(&dmp->dm_lock); return (ENOENT); } loop: DEVFS_DE_HOLD(de); DEVFS_DMP_HOLD(dmp); mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { vs = vget_prep(vp); mtx_unlock(&devfs_de_interlock); sx_xunlock(&dmp->dm_lock); vget_finish(vp, lockmode | LK_RETRY, vs); sx_xlock(&dmp->dm_lock); if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } else if (VN_IS_DOOMED(vp)) { mtx_lock(&devfs_de_interlock); if (de->de_vnode == vp) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vput(vp); goto loop; } sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } mtx_unlock(&devfs_de_interlock); if (de->de_dirent->d_type == DT_CHR) { if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) { devfs_allocv_drop_refs(1, dmp, de); return (ENOENT); } dev = &de->de_cdp->cdp_c; } else { dev = NULL; } error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp); if (error != 0) { devfs_allocv_drop_refs(1, dmp, de); printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; VI_LOCK(vp); dev_lock(); dev_refl(dev); /* XXX: v_rdev should be protect by vnode lock */ vp->v_rdev = dev; VNPASS(vp->v_usecount == 1, vp); /* Special casing of ttys for deadfs. Probably redundant. */ dsw = dev->si_devsw; if (dsw != NULL && (dsw->d_flags & D_TTY) != 0) vp->v_vflag |= VV_ISTTY; dev_unlock(); VI_UNLOCK(vp); if ((dev->si_flags & SI_ETERNAL) != 0) vp->v_vflag |= VV_ETERNALDEV; vp->v_op = &devfs_specops; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS); VN_LOCK_ASHARE(vp); mtx_lock(&devfs_de_interlock); vp->v_data = de; de->de_vnode = vp; mtx_unlock(&devfs_de_interlock); error = insmntque1(vp, mp, devfs_insmntque_dtr, de); if (error != 0) { (void) devfs_allocv_drop_refs(1, dmp, de); return (error); } if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } #ifdef MAC mac_devfs_vnode_associate(mp, de, vp); #endif sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } static int devfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct proc *p; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_accmode, ap->a_cred); if (error == 0) return (0); if (error != EACCES) return (error); p = ap->a_td->td_proc; /* We do, however, allow access to the controlling terminal */ PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == de->de_cdp) error = 0; PROC_UNLOCK(p); return (error); } _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0, "devfs-only flag reuse failed"); static int devfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct proc *p; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; struct devfs_dirent *de = vp->v_data; int dflags, error, ref, vp_locked; /* * XXX: Don't call d_close() if we were called because of * XXX: insmntque1() failure. */ if (vp->v_data == NULL) return (0); /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (de->de_usecount == 2 && td != NULL) { p = td->td_proc; PROC_LOCK(p); if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; sx_xlock(&proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); if (devfs_usecountl(vp) == 2 && !VN_IS_DOOMED(vp)) { p->p_session->s_ttyvp = NULL; p->p_session->s_ttydp = NULL; oldvp = vp; } VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); SESS_UNLOCK(p->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) devfs_ctty_unref(oldvp); } else PROC_UNLOCK(p); } /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); dflags = 0; mtx_lock(&devfs_de_interlock); VI_LOCK(vp); if (devfs_usecountl(vp) == 1) dflags |= FLASTCLOSE; devfs_usecount_subl(vp); mtx_unlock(&devfs_de_interlock); if (VN_IS_DOOMED(vp)) { /* Forced close. */ dflags |= FREVOKE | FNONBLOCK; } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. */ } else if ((dflags & FLASTCLOSE) == 0) { VI_UNLOCK(vp); dev_relthread(dev, ref); return (0); } vholdnz(vp); VI_UNLOCK(vp); vp_locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td); dev_relthread(dev, ref); vn_lock(vp, vp_locked | LK_RETRY); vdrop(vp); return (error); } static int devfs_close_f(struct file *fp, struct thread *td) { int error; struct file *fpop; /* * NB: td may be NULL if this descriptor is closed due to * garbage collection from a closed UNIX domain socket. */ fpop = curthread->td_fpop; curthread->td_fpop = fp; error = vnops.fo_close(fp, td); curthread->td_fpop = fpop; /* * The f_cdevpriv cannot be assigned non-NULL value while we * are destroying the file. */ if (fp->f_cdevpriv != NULL) devfs_fpdrop(fp); return (error); } static int devfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct devfs_dirent *de; struct devfs_mount *dmp; struct cdev *dev; struct timeval boottime; int error; error = devfs_populate_vp(vp); if (error != 0) return (error); dmp = VFSTODEVFS(vp->v_mount); sx_xunlock(&dmp->dm_lock); de = vp->v_data; KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp)); if (vp->v_type == VDIR) { de = de->de_dir; KASSERT(de != NULL, ("Null dir dirent in devfs_getattr vp=%p", vp)); } vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; getboottime(&boottime); #define fix(aa) \ do { \ if ((aa).tv_sec <= 3600) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = cdev2priv(dev)->cdp_inode; } vap->va_gen = 0; vap->va_flags = 0; vap->va_filerev = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* ARGSUSED */ static int devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct file *fpop; int error; fpop = td->td_fpop; td->td_fpop = fp; error = vnops.fo_ioctl(fp, com, data, cred, td); td->td_fpop = fpop; return (error); } void * fiodgname_buf_get_ptr(void *fgnp, u_long com) { union { struct fiodgname_arg fgn; #ifdef COMPAT_FREEBSD32 struct fiodgname_arg32 fgn32; #endif } *fgnup; fgnup = fgnp; switch (com) { case FIODGNAME: return (fgnup->fgn.buf); #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: return ((void *)(uintptr_t)fgnup->fgn32.buf); #endif default: panic("Unhandled ioctl command %ld", com); } } static int devfs_ioctl(struct vop_ioctl_args *ap) { struct fiodgname_arg *fgn; struct vnode *vpold, *vp; struct cdevsw *dsw; struct thread *td; struct session *sess; struct cdev *dev; int error, ref, i; const char *p; u_long com; vp = ap->a_vp; com = ap->a_command; td = ap->a_td; dsw = devvn_refthread(vp, &dev, &ref); if (dsw == NULL) return (ENXIO); KASSERT(dev->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(dev))); switch (com) { case FIODTYPE: *(int *)ap->a_data = dsw->d_flags & D_TYPEMASK; error = 0; break; case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif fgn = ap->a_data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i); break; default: error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); } dev_relthread(dev, ref); if (error == ENOIOCTL) error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { /* * Do nothing if reassigning same control tty, or if the * control tty has already disappeared. If it disappeared, * it's because we were racing with TIOCNOTTY. TIOCNOTTY * already took care of releasing the old vnode and we have * nothing left to do. */ sx_slock(&proctree_lock); sess = td->td_proc->p_session; if (sess->s_ttyvp == vp || sess->s_ttyp == NULL) { sx_sunlock(&proctree_lock); return (0); } devfs_ctty_ref(vp); SESS_LOCK(sess); vpold = sess->s_ttyvp; sess->s_ttyvp = vp; sess->s_ttydp = cdev2priv(dev); SESS_UNLOCK(sess); sx_sunlock(&proctree_lock); /* Get rid of reference to old control tty */ if (vpold) devfs_ctty_unref(vpold); } return (error); } /* ARGSUSED */ static int devfs_kqfilter_f(struct file *fp, struct knote *kn) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; struct thread *td; td = curthread; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error) return (error); error = dsw->d_kqfilter(dev, kn); td->td_fpop = fpop; dev_relthread(dev, ref); return (error); } static inline int devfs_prison_check(struct devfs_dirent *de, struct thread *td) { struct cdev_priv *cdp; struct ucred *dcr; struct proc *p; int error; cdp = de->de_cdp; if (cdp == NULL) return (0); dcr = cdp->cdp_c.si_cred; if (dcr == NULL) return (0); error = prison_check(td->td_ucred, dcr); if (error == 0) return (0); /* We do, however, allow access to the controlling terminal */ p = td->td_proc; PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == cdp) error = 0; PROC_UNLOCK(p); return (error); } static int devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct mount *mp; struct cdev *cdev; int error, flags, nameiop, dvplocked; char specname[SPECNAMELEN + 1], *pname; td = curthread; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; mp = dvp->v_mount; dmp = VFSTODEVFS(mp); dd = dvp->v_data; *vpp = NULLVP; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); de = devfs_parent_dirent(dd); if (de == NULL) return (ENOENT); dvplocked = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; vn_lock(dvp, dvplocked | LK_RETRY); return (error); } dd = dvp->v_data; de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0); while (de == NULL) { /* While(...) so we can use break */ if (nameiop == DELETE) return (ENOENT); /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. */ pname = devfs_fqpn(specname, dmp, dd, cnp); if (pname == NULL) break; cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); sx_slock(&clone_drain_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); sx_sunlock(&clone_drain_lock); if (cdev == NULL) sx_xlock(&dmp->dm_lock); else if (devfs_populate_vp(dvp) != 0) { *dm_unlock = 0; sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } else sx_xunlock(&dmp->dm_lock); dev_rel(cdev); return (ENOENT); } if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); if (cdev != NULL) dev_rel(cdev); return (ENOENT); } if (cdev == NULL) break; dev_lock(); dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx]; if (dde != NULL && *dde != NULL) de = *dde; dev_unlock(); dev_rel(cdev); break; } if (de == NULL || de->de_flags & DE_WHITEOUT) { if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } if (devfs_prison_check(de, td)) return (ENOENT); if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; return (error); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; int dm_unlock; if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOTDIR); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dm_unlock = 1; j = devfs_lookupx(ap, &dm_unlock); if (dm_unlock == 1) sx_xunlock(&dmp->dm_lock); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; /* * The only type of node we should be creating here is a * character device, for anything else return EOPNOTSUPP. */ if (ap->a_vap->va_type != VCHR) return (EOPNOTSUPP); dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); cnp = ap->a_cnp; vpp = ap->a_vpp; dd = dvp->v_data; error = ENOENT; sx_xlock(&dmp->dm_lock); TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (de->de_dirent->d_type == DT_CHR && (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp); return (error); notfound: sx_xunlock(&dmp->dm_lock); return (error); } /* ARGSUSED */ static int devfs_open(struct vop_open_args *ap) { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; struct file *fp = ap->a_fp; int error, ref, vlocked; struct cdevsw *dsw; struct file *fpop; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. */ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); if (fp == NULL && dsw->d_fdopen != NULL) { dev_relthread(dev, ref); return (ENXIO); } if (vp->v_type == VCHR) devfs_usecount_add(vp); vlocked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); fpop = td->td_fpop; td->td_fpop = fp; if (fp != NULL) { fp->f_data = dev; fp->f_vnode = vp; } if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); /* Clean up any cdevpriv upon error. */ if (error != 0) devfs_clear_cdevpriv(); td->td_fpop = fpop; vn_lock(vp, vlocked | LK_RETRY); if (error != 0 && vp->v_type == VCHR) devfs_usecount_sub(vp); dev_relthread(dev, ref); if (error != 0) { if (error == ERESTART) error = EINTR; return (error); } #if 0 /* /dev/console */ KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp")); #else if (fp == NULL) return (error); #endif if (fp->f_ops == &badfileops) finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f); return (error); } static int devfs_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = INT_MAX; return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_MAX_CANON: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_CANON; return (0); } return (EINVAL); case _PC_MAX_INPUT: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_INPUT; return (0); } return (EINVAL); case _PC_VDISABLE: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = _POSIX_VDISABLE; return (0); } return (EINVAL); case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistant label storage. */ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_poll(fp, events, cred, td); return (error); } error = dsw->d_poll(dev, events, td); td->td_fpop = fpop; dev_relthread(dev, ref); return(error); } /* * Print out the contents of a special device vnode. */ static int devfs_print(struct vop_print_args *ap) { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } static int devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int ioflag, error, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_read(fp, uio, cred, flags, td); return (error); } resid = uio->uio_resid; ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); error = dsw->d_read(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) devfs_timestamp(&dev->si_atime); td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_R); return (error); } static int devfs_readdir(struct vop_readdir_args *ap) { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off; int *tmp_ncookies = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * XXX: This is a temporary hack to get around this filesystem not * supporting cookies. We store the location of the ncookies pointer * in a temporary variable before calling vfs_subr.c:vfs_read_dirent() * and set the number of cookies to 0. We then set the pointer to * NULL so that vfs_read_dirent doesn't try to call realloc() on * ap->a_cookies. Later in this function, we restore the ap->a_ncookies * pointer to its original location before returning to the caller. */ if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } dmp = VFSTODEVFS(ap->a_vp->v_mount); if (devfs_populate_vp(ap->a_vp) != 0) { if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (EIO); } error = 0; de = ap->a_vp->v_data; off = 0; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); if (dd->de_flags & (DE_COVERED | DE_WHITEOUT)) continue; if (devfs_prison_check(dd, uio->uio_td)) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp)); if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; /* NOTE: d_off is the offset for the *next* entry. */ dp->d_off = off + dp->d_reclen; if (off >= uio->uio_offset) { error = vfs_read_dirent(ap, dp, off); if (error) break; } off += dp->d_reclen; } sx_xunlock(&dmp->dm_lock); uio->uio_offset = off; /* * Restore ap->a_ncookies if it wasn't originally NULL in the first * place. */ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } static int devfs_readlink(struct vop_readlink_args *ap) { struct devfs_dirent *de; de = ap->a_vp->v_data; return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio)); } static void devfs_reclaiml(struct vnode *vp) { struct devfs_dirent *de; mtx_assert(&devfs_de_interlock, MA_OWNED); de = vp->v_data; if (de != NULL) { MPASS(de->de_usecount == 0); de->de_vnode = NULL; vp->v_data = NULL; } } static int devfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; vp = ap->a_vp; mtx_lock(&devfs_de_interlock); devfs_reclaiml(vp); mtx_unlock(&devfs_de_interlock); return (0); } static int devfs_reclaim_vchr(struct vop_reclaim_args *ap) { struct vnode *vp; struct cdev *dev; vp = ap->a_vp; MPASS(vp->v_type == VCHR); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); devfs_usecount_subl(vp); devfs_reclaiml(vp); mtx_unlock(&devfs_de_interlock); dev_lock(); dev = vp->v_rdev; vp->v_rdev = NULL; dev_unlock(); VI_UNLOCK(vp); if (dev != NULL) dev_rel(dev); return (0); } static int devfs_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); ASSERT_VOP_ELOCKED(dvp, "devfs_remove"); ASSERT_VOP_ELOCKED(vp, "devfs_remove"); sx_xlock(&dmp->dm_lock); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_cdp == NULL) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_dirent->d_type == DT_LNK) { de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) de_covered->de_flags &= ~DE_COVERED; } /* We need to unlock dvp because devfs_delete() may lock it. */ VOP_UNLOCK(vp); if (dvp != vp) VOP_UNLOCK(dvp); devfs_delete(dmp, de, 0); sx_xunlock(&dmp->dm_lock); if (dvp != vp) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else { de->de_flags |= DE_WHITEOUT; sx_xunlock(&dmp->dm_lock); } return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. * */ static int devfs_revoke(struct vop_revoke_args *ap) { struct vnode *vp = ap->a_vp, *vp2; struct cdev *dev; struct cdev_priv *cdp; struct devfs_dirent *de; enum vgetstate vs; u_int i; KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL")); dev = vp->v_rdev; cdp = cdev2priv(dev); dev_lock(); cdp->cdp_inuse++; dev_unlock(); vhold(vp); vgone(vp); vdrop(vp); VOP_UNLOCK(vp); loop: for (;;) { mtx_lock(&devfs_de_interlock); dev_lock(); vp2 = NULL; for (i = 0; i <= cdp->cdp_maxdirent; i++) { de = cdp->cdp_dirents[i]; if (de == NULL) continue; vp2 = de->de_vnode; if (vp2 != NULL) { dev_unlock(); vs = vget_prep(vp2); mtx_unlock(&devfs_de_interlock); if (vget_finish(vp2, LK_EXCLUSIVE, vs) != 0) goto loop; vhold(vp2); vgone(vp2); vdrop(vp2); vput(vp2); break; } } if (vp2 != NULL) { continue; } dev_unlock(); mtx_unlock(&devfs_de_interlock); break; } dev_lock(); cdp->cdp_inuse--; if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) { TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); } else dev_unlock(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (0); } static int devfs_rioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct devfs_mount *dmp; int error; vp = ap->a_vp; vn_lock(vp, LK_SHARED | LK_RETRY); if (VN_IS_DOOMED(vp)) { VOP_UNLOCK(vp); return (EBADF); } dmp = VFSTODEVFS(vp->v_mount); sx_xlock(&dmp->dm_lock); VOP_UNLOCK(vp); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td); sx_xunlock(&dmp->dm_lock); return (error); } static int devfs_rread(struct vop_read_args *ap) { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(struct vop_setattr_args *ap) { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; td = curthread; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } error = devfs_populate_vp(vp); if (error != 0) return (error); de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) { error = priv_check(td, PRIV_VFS_CHOWN); if (error != 0) goto ret; } de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != de->de_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto ret; } de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { error = vn_utimes_perm(vp, vap, ap->a_cred, td); if (error != 0) goto ret; if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } ret: sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock); return (error); } #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_vnode_relabel(ap->a_cred, vp, ap->a_label); mac_devfs_update(vp->v_mount, de, vp); return (0); } #endif static int -devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td) +devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred) { - return (vnops.fo_stat(fp, sb, cred, td)); + return (vnops.fo_stat(fp, sb, cred)); } static int devfs_symlink(struct vop_symlink_args *ap) { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered, *de_dotdot; struct devfs_mount *dmp; error = priv_check(curthread, PRIV_DEVFS_SYMLINK); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOENT); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_flags = DE_USER; de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = alloc_unr(devfs_inos); de->de_dir = dd; de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; de->de_symlink = malloc(i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); #ifdef MAC mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) { if ((de_covered->de_flags & DE_USER) != 0) { devfs_delete(dmp, de, DEVFS_DEL_NORECURSE); sx_xunlock(&dmp->dm_lock); return (EEXIST); } KASSERT((de_covered->de_flags & DE_COVERED) == 0, ("devfs_symlink: entry %p already covered", de_covered)); de_covered->de_flags |= DE_COVERED; } de_dotdot = TAILQ_FIRST(&dd->de_dlist); /* "." */ de_dotdot = TAILQ_NEXT(de_dotdot, de_list); /* ".." */ TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list); devfs_dir_ref_de(dmp, dd); devfs_rules_apply(dmp, de); return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp)); } static int devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { return (vnops.fo_truncate(fp, length, cred, td)); } static int devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int error, ioflag, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_write(fp, uio, cred, flags, td); return (error); } KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); resid = uio->uio_resid; error = dsw->d_write(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { devfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_W); return (error); } static int devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; struct mount *mp; struct vnode *vp; struct file *fpop; vm_object_t object; vm_prot_t maxprot; int error, ref; vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * The one exception is that D_MMAP_ANON devices * (i.e. /dev/zero) permit private writable mappings. * * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests * as well as updating maxprot to permit writing for * D_MMAP_ANON devices rather than doing that here. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) return (error); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff, &object); td->td_fpop = fpop; dev_relthread(dev, ref); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); return (error); } dev_t dev2udev(struct cdev *x) { if (x == NULL) return (NODEV); return (cdev2priv(x)->cdp_inode); } static struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_truncate = devfs_truncate_f, .fo_ioctl = devfs_ioctl_f, .fo_poll = devfs_poll_f, .fo_kqfilter = devfs_kqfilter_f, .fo_stat = devfs_stat_f, .fo_close = devfs_close_f, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = devfs_mmap_f, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; /* Vops for non-CHR vnodes in /dev. */ static struct vop_vector devfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_rioctl, .vop_lookup = devfs_lookup, .vop_mknod = devfs_mknod, .vop_pathconf = devfs_pathconf, .vop_read = devfs_rread, .vop_readdir = devfs_readdir, .vop_readlink = devfs_readlink, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_revoke = devfs_revoke, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_symlink = devfs_symlink, .vop_vptocnp = devfs_vptocnp, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, }; VFS_VOP_VECTOR_REGISTER(devfs_vnodeops); /* Vops for VCHR vnodes in /dev. */ static struct vop_vector devfs_specops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_bmap = VOP_PANIC, .vop_close = devfs_close, .vop_create = VOP_PANIC, .vop_fsync = vop_stdfsync, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_ioctl, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = devfs_open, .vop_pathconf = devfs_pathconf, .vop_poll = dead_poll, .vop_print = devfs_print, .vop_read = dead_read, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = devfs_reclaim_vchr, .vop_remove = devfs_remove, .vop_rename = VOP_PANIC, .vop_revoke = devfs_revoke, .vop_rmdir = VOP_PANIC, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_strategy = VOP_PANIC, .vop_symlink = VOP_PANIC, .vop_vptocnp = devfs_vptocnp, .vop_write = dead_write, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, }; VFS_VOP_VECTOR_REGISTER(devfs_specops); /* * Our calling convention to the device drivers used to be that we passed * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ * flags instead since that's what open(), close() and ioctl() takes and * we don't really want vnode.h in device drivers. * We solved the source compatibility by redefining some vnode flags to * be the same as the fcntl ones and by sending down the bitwise OR of * the respective fcntl/vnode flags. These CTASSERTS make sure nobody * pulls the rug out under this. */ CTASSERT(O_NONBLOCK == IO_NDELAY); CTASSERT(O_FSYNC == IO_SYNC); diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index b10dcc2f9469..55c2a36955a5 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -1,5083 +1,5082 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes"); static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); MALLOC_DECLARE(M_FADVISE); static __read_mostly uma_zone_t file_zone; static __read_mostly uma_zone_t filedesc0_zone; __read_mostly uma_zone_t pwd_zone; VFS_SMR_DECLARE; static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit); static int fd_first_free(struct filedesc *fdp, int low, int size); static void fdgrowtable(struct filedesc *fdp, int nfd); static void fdgrowtable_exp(struct filedesc *fdp, int nfd); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); static int getmaxfd(struct thread *td); static u_long *filecaps_copy_prep(const struct filecaps *src); static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls); static u_long *filecaps_free_prep(struct filecaps *fcaps); static void filecaps_free_finish(u_long *ioctls); static struct pwd *pwd_alloc(void); /* * Each process has: * * - An array of open file descriptors (fd_ofiles) * - An array of file flags (fd_ofileflags) * - A bitmap recording which descriptors are in use (fd_map) * * A process starts out with NDFILE descriptors. The value of NDFILE has * been selected based the historical limit of 20 open files, and an * assumption that the majority of processes, especially short-lived * processes like shells, will never need more. * * If this initial allocation is exhausted, a larger descriptor table and * map are allocated dynamically, and the pointers in the process's struct * filedesc are updated to point to those. This is repeated every time * the process runs out of file descriptors (provided it hasn't hit its * resource limit). * * Since threads may hold references to individual descriptor table * entries, the tables are never freed. Instead, they are placed on a * linked list and freed only when the struct filedesc is released. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * SLIST entry used to keep track of ofiles which must be reclaimed when * the process exits. */ struct freetable { struct fdescenttbl *ft_table; SLIST_ENTRY(freetable) ft_next; }; /* * Initial allocation: a filedesc structure + the head of SLIST used to * keep track of old ofiles + enough space for NDFILE descriptors. */ struct fdescenttbl0 { int fdt_nfiles; struct filedescent fdt_ofiles[NDFILE]; }; struct filedesc0 { struct filedesc fd_fd; SLIST_HEAD(, freetable) fd_free; struct fdescenttbl0 fd_dfiles; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ static int __exclusive_cache_line openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* * If low >= size, just return low. Otherwise find the first zero bit in the * given bitmap, starting at low and not exceeding size - 1. Return size if * not found. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the last used fd. * * Call this variant if fdp can't be modified by anyone else (e.g, during exec). * Otherwise use fdlastfile. */ int fdlastfile_single(struct filedesc *fdp) { NDSLOTTYPE *map = fdp->fd_map; int off, minoff; off = NDSLOT(fdp->fd_nfiles - 1); for (minoff = NDSLOT(0); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (-1); } int fdlastfile(struct filedesc *fdp) { FILEDESC_LOCK_ASSERT(fdp); return (fdlastfile_single(fdp)); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused_init(struct filedesc *fdp, int fd) { KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); } static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); fdused_init(fdp, fd); if (fd == fdp->fd_freefile) fdp->fd_freefile++; } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("fd=%d is still in use", fd)); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; } /* * Free a file descriptor. * * Avoid some work if fdp is about to be destroyed. */ static inline void fdefree_last(struct filedescent *fde) { filecaps_free(&fde->fde_caps); } static inline void fdfree(struct filedesc *fdp, int fd) { struct filedescent *fde; FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = NULL; #ifdef CAPABILITIES seqc_write_end(&fde->fde_seqc); #endif fdefree_last(fde); fdunused(fdp, fd); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) { #ifdef RACCT uint64_t lim; #endif td->td_retval[0] = getmaxfd(td); #ifdef RACCT PROC_LOCK(td->td_proc); lim = racct_get_limit(td->td_proc, RACCT_NOFILE); PROC_UNLOCK(td->td_proc); if (lim < td->td_retval[0]) td->td_retval[0] = lim; #endif return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int sys_dup2(struct thread *td, struct dup2_args *uap) { return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int sys_dup(struct thread *td, struct dup_args *uap) { return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int sys_fcntl(struct thread *td, struct fcntl_args *uap) { return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); } int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) { struct flock fl; struct __oflock ofl; intptr_t arg1; int error, newcmd; error = 0; newcmd = cmd; switch (cmd) { case F_OGETLK: case F_OSETLK: case F_OSETLKW: /* * Convert old flock structure to new. */ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); fl.l_start = ofl.l_start; fl.l_len = ofl.l_len; fl.l_pid = ofl.l_pid; fl.l_type = ofl.l_type; fl.l_whence = ofl.l_whence; fl.l_sysid = 0; switch (cmd) { case F_OGETLK: newcmd = F_GETLK; break; case F_OSETLK: newcmd = F_SETLK; break; case F_OSETLKW: newcmd = F_SETLKW; break; } arg1 = (intptr_t)&fl; break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_SETLK_REMOTE: error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); arg1 = (intptr_t)&fl; break; default: arg1 = arg; break; } if (error) return (error); error = kern_fcntl(td, fd, newcmd, arg1); if (error) return (error); if (cmd == F_OGETLK) { ofl.l_start = fl.l_start; ofl.l_len = fl.l_len; ofl.l_pid = fl.l_pid; ofl.l_type = fl.l_type; ofl.l_whence = fl.l_whence; error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); } else if (cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); } return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp, *fp2; struct filedescent *fde; struct proc *p; struct vnode *vp; struct mount *mp; int error, flg, seals, tmp; uint64_t bsize; off_t foffset; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; AUDIT_ARG_FD(cmd); AUDIT_ARG_CMD(cmd); switch (cmd) { case F_DUPFD: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp); break; case F_DUPFD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_DUP2FD: tmp = arg; error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); break; case F_DUP2FD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_GETFD: error = EBADF; FILEDESC_SLOCK(fdp); fde = fdeget_locked(fdp, fd); if (fde != NULL) { td->td_retval[0] = (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; error = 0; } FILEDESC_SUNLOCK(fdp); break; case F_SETFD: error = EBADF; FILEDESC_XLOCK(fdp); fde = fdeget_locked(fdp, fd); if (fde != NULL) { fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); error = 0; } FILEDESC_XUNLOCK(fdp); break; case F_GETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp); if (error != 0) break; td->td_retval[0] = OFLAGS(fp->f_flag); fdrop(fp, td); break; case F_SETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp); if (error != 0) break; if (fp->f_ops == &path_fileops) { fdrop(fp, td); error = EBADF; break; } do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; } while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error != 0) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp); if (error != 0) break; error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp); if (error != 0) break; tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLK_REMOTE: error = priv_check(td, PRIV_NFS_LOCKD); if (error != 0) return (error); flg = F_REMOTE; goto do_setlk; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: do_setlk: flp = (struct flock *)arg; if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) { error = EINVAL; break; } error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if (foffset < 0 || (flp->l_start > 0 && foffset > OFF_MAX - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, flg); break; case F_UNLCKSYS: if (flg != F_REMOTE) { error = EINVAL; break; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } if (error != 0 || flp->l_type == F_UNLCK || flp->l_type == F_UNLCKSYS) { fdrop(fp, td); break; } /* * Check for a race with close. * * The vnode is now advisory locked (or unlocked, but this case * is not really important) as the caller requested. * We had to drop the filedesc lock, so we need to recheck if * the descriptor is still valid, because if it was closed * in the meantime we need to remove advisory lock from the * vnode - close on any descriptor leading to an advisory * locked vnode, removes that lock. * We will return 0 on purpose in that case, as the result of * successful advisory lock might have been externally visible * already. This is fine - effectively we pretend to the caller * that the closing thread was a bit slower and that the * advisory lock succeeded before the close. */ error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2); if (error != 0) { fdrop(fp, td); break; } if (fp != fp2) { flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } fdrop(fp, td); fdrop(fp2, td); break; case F_GETLK: error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { error = EINVAL; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if ((flp->l_start > 0 && foffset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && foffset < OFF_MIN - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; case F_ADD_SEALS: error = fget_unlocked(fdp, fd, &cap_no_rights, &fp); if (error != 0) break; error = fo_add_seals(fp, arg); fdrop(fp, td); break; case F_GET_SEALS: error = fget_unlocked(fdp, fd, &cap_no_rights, &fp); if (error != 0) break; if (fo_get_seals(fp, &seals) == 0) td->td_retval[0] = seals; else error = EINVAL; fdrop(fp, td); break; case F_RDAHEAD: arg = arg ? 128 * 1024: 0; /* FALLTHROUGH */ case F_READAHEAD: error = fget_unlocked(fdp, fd, &cap_no_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; if (vp->v_type != VREG) { fdrop(fp, td); error = ENOTTY; break; } /* * Exclusive lock synchronizes against f_seqcount reads and * writes in sequential_heuristic(). */ error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { fdrop(fp, td); break; } if (arg >= 0) { bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; arg = MIN(arg, INT_MAX - bsize + 1); fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX, (arg + bsize - 1) / bsize); atomic_set_int(&fp->f_flag, FRDAHEAD); } else { atomic_clear_int(&fp->f_flag, FRDAHEAD); } VOP_UNLOCK(vp); fdrop(fp, td); break; case F_ISUNIONSTACK: /* * Check if the vnode is part of a union stack (either the * "union" flag from mount(2) or unionfs). * * Prior to introduction of this op libc's readdir would call * fstatfs(2), in effect unnecessarily copying kilobytes of * data just to check fs name and a mount flag. * * Fixing the code to handle everything in the kernel instead * is a non-trivial endeavor and has low priority, thus this * horrible kludge facilitates the current behavior in a much * cheaper manner until someone(tm) sorts this out. */ error = fget_unlocked(fdp, fd, &cap_no_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; /* * Since we don't prevent dooming the vnode even non-null mp * found can become immediately stale. This is tolerable since * mount points are type-stable (providing safe memory access) * and any vfs op on this vnode going forward will return an * error (meaning return value in this case is meaningless). */ mp = atomic_load_ptr(&vp->v_mount); if (__predict_false(mp == NULL)) { fdrop(fp, td); error = EBADF; break; } td->td_retval[0] = 0; if (mp->mnt_kern_flag & MNTK_UNIONFS || mp->mnt_flag & MNT_UNION) td->td_retval[0] = 1; fdrop(fp, td); break; default: error = EINVAL; break; } return (error); } static int getmaxfd(struct thread *td) { return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); } /* * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). */ int kern_dup(struct thread *td, u_int mode, int flags, int old, int new) { struct filedesc *fdp; struct filedescent *oldfde, *newfde; struct proc *p; struct file *delfp, *oldfp; u_long *oioctls, *nioctls; int error, maxfd; p = td->td_proc; fdp = p->p_fd; oioctls = NULL; MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); MPASS(mode < FDDUP_LASTMODE); AUDIT_ARG_FD(old); /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */ /* * Verify we have a valid descriptor to dup from and possibly to * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should * return EINVAL when the new descriptor is out of bounds. */ if (old < 0) return (EBADF); if (new < 0) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); maxfd = getmaxfd(td); if (new >= maxfd) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); error = EBADF; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, old) == NULL) goto unlock; if (mode == FDDUP_FIXED && old == new) { td->td_retval[0] = new; if (flags & FDDUP_FLAG_CLOEXEC) fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; error = 0; goto unlock; } oldfde = &fdp->fd_ofiles[old]; oldfp = oldfde->fde_file; if (!fhold(oldfp)) goto unlock; /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. */ switch (mode) { case FDDUP_NORMAL: case FDDUP_FCNTL: if ((error = fdalloc(td, new, &new)) != 0) { fdrop(oldfp, td); goto unlock; } break; case FDDUP_FIXED: if (new >= fdp->fd_nfiles) { /* * The resource limits are here instead of e.g. * fdalloc(), because the file descriptor table may be * shared between processes, so we can't really use * racct_add()/racct_sub(). Instead of counting the * number of actually allocated descriptors, just put * the limit on the size of the file descriptor table. */ #ifdef RACCT if (RACCT_ENABLED()) { error = racct_set_unlocked(p, RACCT_NOFILE, new + 1); if (error != 0) { error = EMFILE; fdrop(oldfp, td); goto unlock; } } #endif fdgrowtable_exp(fdp, new + 1); } if (!fdisused(fdp, new)) fdused(fdp, new); break; default: KASSERT(0, ("%s unsupported mode %d", __func__, mode)); } KASSERT(old != new, ("new fd is same as old")); /* Refetch oldfde because the table may have grown and old one freed. */ oldfde = &fdp->fd_ofiles[old]; KASSERT(oldfp == oldfde->fde_file, ("fdt_ofiles shift from growth observed at fd %d", old)); newfde = &fdp->fd_ofiles[new]; delfp = newfde->fde_file; nioctls = filecaps_copy_prep(&oldfde->fde_caps); /* * Duplicate the source descriptor. */ #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif oioctls = filecaps_free_prep(&newfde->fde_caps); memcpy(newfde, oldfde, fde_change_size); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, nioctls); if ((flags & FDDUP_FLAG_CLOEXEC) != 0) newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; else newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif td->td_retval[0] = new; error = 0; if (delfp != NULL) { (void) closefp(fdp, new, delfp, td, true, false); FILEDESC_UNLOCK_ASSERT(fdp); } else { unlock: FILEDESC_XUNLOCK(fdp); } filecaps_free_finish(oioctls); return (error); } static void sigiofree(struct sigio *sigio) { crfree(sigio->sio_ucred); free(sigio, M_SIGIO); } static struct sigio * funsetown_locked(struct sigio *sigio) { struct proc *p; struct pgrp *pg; SIGIO_ASSERT_LOCKED(); if (sigio == NULL) return (NULL); *sigio->sio_myref = NULL; if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { p = sigio->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } return (sigio); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; /* Racy check, consumers must provide synchronization. */ if (*sigiop == NULL) return; SIGIO_LOCK(); sigio = funsetown_locked(*sigiop); SIGIO_UNLOCK(); if (sigio != NULL) sigiofree(sigio); } /* * Free a list of sigio structures. The caller must ensure that new sigio * structures cannot be added after this point. For process groups this is * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves * as an interlock. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio, *tmp; /* Racy check. */ sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; SIGIO_LOCK(); sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) { SIGIO_UNLOCK(); return; } /* * Every entry of the list should belong to a single proc or pgrp. */ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK(pg); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK(p); KASSERT((p->p_flag & P_WEXIT) != 0, ("%s: process %p is not exiting", __func__, p)); } SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) { *sigio->sio_myref = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); } } if (pg != NULL) PGRP_UNLOCK(pg); else PROC_UNLOCK(p); SIGIO_UNLOCK(); SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp) sigiofree(sigio); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *osigio, *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; ret = 0; if (pgid > 0) { ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc); SIGIO_LOCK(); osigio = funsetown_locked(*sigiop); if (ret == 0) { PROC_LOCK(proc); _PRELE(proc); if ((proc->p_flag & P_WEXIT) != 0) { ret = ESRCH; } else if (proc->p_session != curthread->td_proc->p_session) { /* * Policy - Don't allow a process to FSETOWN a * process in another session. * * Remove this test to allow maximum flexibility * or restrict FSETOWN to the current process or * process group for maximum safety. */ ret = EPERM; } else { sigio->sio_proc = proc; SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); } PROC_UNLOCK(proc); } } else /* if (pgid < 0) */ { sx_slock(&proctree_lock); SIGIO_LOCK(); osigio = funsetown_locked(*sigiop); pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; } else { if (pgrp->pg_session != curthread->td_proc->p_session) { /* * Policy - Don't allow a process to FSETOWN a * process in another session. * * Remove this test to allow maximum flexibility * or restrict FSETOWN to the current process or * process group for maximum safety. */ ret = EPERM; } else { sigio->sio_pgrp = pgrp; SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); } PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); } if (ret == 0) *sigiop = sigio; SIGIO_UNLOCK(); if (osigio != NULL) sigiofree(osigio); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(struct sigio **sigiop) { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } static int closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool audit) { int error; FILEDESC_XLOCK_ASSERT(fdp); /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleteing a knote for the new fd. */ if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist))) knote_fdclose(td, fd); /* * We need to notify mqueue if the object is of type mqueue. */ if (__predict_false(fp->f_type == DTYPE_MQUEUE)) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); #ifdef AUDIT if (AUDITING_TD(td) && audit) audit_sysclose(td, fd, fp); #endif error = closef(fp, td); /* * All paths leading up to closefp() will have already removed or * replaced the fd in the filedesc table, so a restart would not * operate on the same file. */ if (error == ERESTART) error = EINTR; return (error); } static int closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit) { int error; FILEDESC_XLOCK_ASSERT(fdp); if (holdleaders) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; } else { holdleaders = false; } } error = closefp_impl(fdp, fd, fp, td, audit); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit) { FILEDESC_XLOCK_ASSERT(fdp); if (__predict_false(td->td_proc->p_fdtol != NULL)) { return (closefp_hl(fdp, fd, fp, td, holdleaders, audit)); } else { return (closefp_impl(fdp, fd, fp, td, audit)); } } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int sys_close(struct thread *td, struct close_args *uap) { return (kern_close(td, uap->fd)); } int kern_close(struct thread *td, int fd) { struct filedesc *fdp; struct file *fp; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdfree(fdp, fd); /* closefp() drops the FILEDESC lock for us. */ return (closefp(fdp, fd, fp, td, true, true)); } int kern_close_range(struct thread *td, u_int lowfd, u_int highfd) { struct filedesc *fdp; const struct fdescenttbl *fdt; struct file *fp; int fd; /* * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2 * open should not be a usage error. From a close_range() perspective, * close_range(3, ~0U, 0) in the same scenario should also likely not * be a usage error as all fd above 3 are in-fact already closed. */ if (highfd < lowfd) { return (EINVAL); } fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); highfd = MIN(highfd, fdt->fdt_nfiles - 1); fd = lowfd; if (__predict_false(fd > highfd)) { goto out_locked; } for (;;) { fp = fdt->fdt_ofiles[fd].fde_file; if (fp == NULL) { if (fd == highfd) goto out_locked; } else { fdfree(fdp, fd); (void) closefp(fdp, fd, fp, td, true, true); if (fd == highfd) goto out_unlocked; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); } fd++; } out_locked: FILEDESC_XUNLOCK(fdp); out_unlocked: return (0); } #ifndef _SYS_SYSPROTO_H_ struct close_range_args { u_int lowfd; u_int highfd; int flags; }; #endif int sys_close_range(struct thread *td, struct close_range_args *uap) { AUDIT_ARG_FD(uap->lowfd); AUDIT_ARG_CMD(uap->highfd); AUDIT_ARG_FFLAGS(uap->flags); /* No flags currently defined */ if (uap->flags != 0) return (EINVAL); return (kern_close_range(td, uap->lowfd, uap->highfd)); } #ifdef COMPAT_FREEBSD12 /* * Close open file descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd12_closefrom_args { int lowfd; }; #endif /* ARGSUSED */ int freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap) { u_int lowfd; AUDIT_ARG_FD(uap->lowfd); /* * Treat negative starting file descriptor values identical to * closefrom(0) which closes all files. */ lowfd = MAX(0, uap->lowfd); return (kern_close_range(td, lowfd, ~0U)); } #endif /* COMPAT_FREEBSD12 */ #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) int freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_fstat(td, uap->fd, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int sys_fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_fstat_rights, &fp); if (__predict_false(error != 0)) return (error); AUDIT_ARG_FILE(td->td_proc, fp); - error = fo_stat(fp, sbp, td->td_ucred, td); + error = fo_stat(fp, sbp, td->td_ucred); fdrop(fp, td); #ifdef __STAT_TIME_T_EXT sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; #endif #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { freebsd11_cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int sys_fpathconf(struct thread *td, struct fpathconf_args *uap) { long value; int error; error = kern_fpathconf(td, uap->fd, uap->name, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_fpathconf(struct thread *td, int fd, int name, long *valuep) { struct file *fp; struct vnode *vp; int error; error = fget(td, fd, &cap_fpathconf_rights, &fp); if (error != 0) return (error); if (name == _PC_ASYNC_IO) { *valuep = _POSIX_ASYNCHRONOUS_IO; goto out; } vp = fp->f_vnode; if (vp != NULL) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_PATHCONF(vp, name, valuep); VOP_UNLOCK(vp); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (name != _PC_PIPE_BUF) { error = EINVAL; } else { *valuep = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Copy filecaps structure allocating memory for ioctls array if needed. * * The last parameter indicates whether the fdtable is locked. If it is not and * ioctls are encountered, copying fails and the caller must lock the table. * * Note that if the table was not locked, the caller has to check the relevant * sequence counter to determine whether the operation was successful. */ bool filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked) { size_t size; if (src->fc_ioctls != NULL && !locked) return (false); memcpy(dst, src, sizeof(*src)); if (src->fc_ioctls == NULL) return (true); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); memcpy(dst->fc_ioctls, src->fc_ioctls, size); return (true); } static u_long * filecaps_copy_prep(const struct filecaps *src) { u_long *ioctls; size_t size; if (__predict_true(src->fc_ioctls == NULL)) return (NULL); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; ioctls = malloc(size, M_FILECAPS, M_WAITOK); return (ioctls); } static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls) { size_t size; *dst = *src; if (__predict_true(src->fc_ioctls == NULL)) { MPASS(ioctls == NULL); return; } size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = ioctls; bcopy(src->fc_ioctls, dst->fc_ioctls, size); } /* * Move filecaps structure to the new place and clear the old place. */ void filecaps_move(struct filecaps *src, struct filecaps *dst) { *dst = *src; bzero(src, sizeof(*src)); } /* * Fill the given filecaps structure with full rights. */ static void filecaps_fill(struct filecaps *fcaps) { CAP_ALL(&fcaps->fc_rights); fcaps->fc_ioctls = NULL; fcaps->fc_nioctls = -1; fcaps->fc_fcntls = CAP_FCNTL_ALL; } /* * Free memory allocated within filecaps structure. */ void filecaps_free(struct filecaps *fcaps) { free(fcaps->fc_ioctls, M_FILECAPS); bzero(fcaps, sizeof(*fcaps)); } static u_long * filecaps_free_prep(struct filecaps *fcaps) { u_long *ioctls; ioctls = fcaps->fc_ioctls; bzero(fcaps, sizeof(*fcaps)); return (ioctls); } static void filecaps_free_finish(u_long *ioctls) { free(ioctls, M_FILECAPS); } /* * Validate the given filecaps structure. */ static void filecaps_validate(const struct filecaps *fcaps, const char *func) { KASSERT(cap_rights_is_valid(&fcaps->fc_rights), ("%s: invalid rights", func)); KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, ("%s: invalid fcntls", func)); KASSERT(fcaps->fc_fcntls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), ("%s: fcntls without CAP_FCNTL", func)); KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 : (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), ("%s: invalid ioctls", func)); KASSERT(fcaps->fc_nioctls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), ("%s: ioctls without CAP_IOCTL", func)); } static void fdgrowtable_exp(struct filedesc *fdp, int nfd) { int nfd1; FILEDESC_XLOCK_ASSERT(fdp); nfd1 = fdp->fd_nfiles * 2; if (nfd1 < nfd) nfd1 = nfd; fdgrowtable(fdp, nfd1); } /* * Grow the file table to accommodate (at least) nfd descriptors. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct filedesc0 *fdp0; struct freetable *ft; struct fdescenttbl *ntable; struct fdescenttbl *otable; int nnfiles, onfiles; NDSLOTTYPE *nmap, *omap; KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* save old values */ onfiles = fdp->fd_nfiles; otable = fdp->fd_files; omap = fdp->fd_map; /* compute the size of the new table */ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* * Allocate a new table. We need enough space for the number of * entries, file entries themselves and the struct freetable we will use * when we decommission the table and place it on the freelist. * We place the struct freetable in the middle so we don't have * to worry about padding. */ ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + nnfiles * sizeof(ntable->fdt_ofiles[0]) + sizeof(struct freetable), M_FILEDESC, M_ZERO | M_WAITOK); /* copy the old data */ ntable->fdt_nfiles = nnfiles; memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, onfiles * sizeof(ntable->fdt_ofiles[0])); /* * Allocate a new map only if the old is not large enough. It will * grow at a slower rate than the table as it can map more * entries than the table can hold. */ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); /* copy over the old data and update the pointer */ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); fdp->fd_map = nmap; } /* * Make sure that ntable is correctly initialized before we replace * fd_files poiner. Otherwise fget_unlocked() may see inconsistent * data. */ atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); /* * Free the old file table when not shared by other threads or processes. * The old file table is considered to be shared when either are true: * - The process has more than one thread. * - The file descriptor table has been shared via fdshare(). * * When shared, the old file table will be placed on a freelist * which will be processed when the struct filedesc is released. * * Note that if onfiles == NDFILE, we're dealing with the original * static allocation contained within (struct filedesc0 *)fdp, * which must not be freed. */ if (onfiles > NDFILE) { /* * Note we may be called here from fdinit while allocating a * table for a new process in which case ->p_fd points * elsewhere. */ if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) { free(otable, M_FILEDESC); } else { ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; fdp0 = (struct filedesc0 *)fdp; ft->ft_table = otable; SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); } } /* * The map does not have the same possibility of threads still * holding references to it. So always free it as long as it * does not reference the original static allocation. */ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(omap, M_FILEDESC); } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd, maxfd, allocfd; #ifdef RACCT int error; #endif FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; maxfd = getmaxfd(td); /* * Search the bitmap for a free descriptor starting at minfd. * If none is found, grow the file table. */ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (__predict_false(fd >= maxfd)) return (EMFILE); if (__predict_false(fd >= fdp->fd_nfiles)) { allocfd = min(fd * 2, maxfd); #ifdef RACCT if (RACCT_ENABLED()) { error = racct_set_unlocked(p, RACCT_NOFILE, allocfd); if (error != 0) return (EMFILE); } #endif /* * fd is already equal to first free descriptor >= minfd, so * we only need to grow the table and we are done. */ fdgrowtable_exp(fdp, allocfd); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), ("invalid descriptor %d", fd)); KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("file descriptor isn't free")); fdused(fdp, fd); *result = fd; return (0); } /* * Allocate n file descriptors for the process. */ int fdallocn(struct thread *td, int minfd, int *fds, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int i; FILEDESC_XLOCK_ASSERT(fdp); for (i = 0; i < n; i++) if (fdalloc(td, 0, &fds[i]) != 0) break; if (i < n) { for (i--; i >= 0; i--) fdunused(fdp, fds[i]); return (EMFILE); } return (0); } /* * Create a new open file structure and allocate a file descriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { struct file *fp; int error, fd; MPASS(resultfp != NULL); MPASS(resultfd != NULL); error = _falloc_noinstall(td, &fp, 2); if (__predict_false(error != 0)) { return (error); } error = finstall_refed(td, fp, &fd, flags, fcaps); if (__predict_false(error != 0)) { falloc_abort(td, fp); return (error); } *resultfp = fp; *resultfd = fd; return (0); } /* * Create a new open file structure without allocating a file descriptor. */ int _falloc_noinstall(struct thread *td, struct file **resultfp, u_int n) { struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); int openfiles_new; static struct timeval lastfail; static int curfail; KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); MPASS(n > 0); openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; if ((openfiles_new >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles_new >= maxfiles) { atomic_subtract_int(&openfiles, 1); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, (%s) " "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); } return (ENFILE); } fp = uma_zalloc(file_zone, M_WAITOK); bzero(fp, sizeof(*fp)); refcount_init(&fp->f_count, n); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; *resultfp = fp; return (0); } void falloc_abort(struct thread *td, struct file *fp) { /* * For assertion purposes. */ refcount_init(&fp->f_count, 0); _fdrop(fp, td); } /* * Install a file in a file descriptor table. */ void _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, struct filecaps *fcaps) { struct filedescent *fde; MPASS(fp != NULL); if (fcaps != NULL) filecaps_validate(fcaps, __func__); FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = fp; fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else filecaps_fill(&fde->fde_caps); #ifdef CAPABILITIES seqc_write_end(&fde->fde_seqc); #endif } int finstall_refed(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { struct filedesc *fdp = td->td_proc->p_fd; int error; MPASS(fd != NULL); FILEDESC_XLOCK(fdp); error = fdalloc(td, 0, fd); if (__predict_true(error == 0)) { _finstall(fdp, fp, *fd, flags, fcaps); } FILEDESC_XUNLOCK(fdp); return (error); } int finstall(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { int error; MPASS(fd != NULL); if (!fhold(fp)) return (EBADF); error = finstall_refed(td, fp, fd, flags, fcaps); if (__predict_false(error != 0)) { fdrop(fp, td); } return (error); } /* * Build a new filedesc structure from another. * * If fdp is not NULL, return with it shared locked. */ struct filedesc * fdinit(struct filedesc *fdp, bool prepfiles, int *lastfile) { struct filedesc0 *newfdp0; struct filedesc *newfdp; if (prepfiles) MPASS(lastfile != NULL); else MPASS(lastfile == NULL); newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); newfdp = &newfdp0->fd_fd; /* Create the file descriptor table. */ FILEDESC_LOCK_INIT(newfdp); refcount_init(&newfdp->fd_refcnt, 1); refcount_init(&newfdp->fd_holdcnt, 1); newfdp->fd_map = newfdp0->fd_dmap; newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; newfdp->fd_files->fdt_nfiles = NDFILE; if (fdp == NULL) return (newfdp); FILEDESC_SLOCK(fdp); if (!prepfiles) { FILEDESC_SUNLOCK(fdp); return (newfdp); } for (;;) { *lastfile = fdlastfile(fdp); if (*lastfile < newfdp->fd_nfiles) break; FILEDESC_SUNLOCK(fdp); fdgrowtable(newfdp, *lastfile + 1); FILEDESC_SLOCK(fdp); } return (newfdp); } /* * Build a pwddesc structure from another. * Copy the current, root, and jail root vnode references. * * If pdp is not NULL, return with it shared locked. */ struct pwddesc * pdinit(struct pwddesc *pdp, bool keeplock) { struct pwddesc *newpdp; struct pwd *newpwd; newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO); PWDDESC_LOCK_INIT(newpdp); refcount_init(&newpdp->pd_refcount, 1); newpdp->pd_cmask = CMASK; if (pdp == NULL) { newpwd = pwd_alloc(); smr_serialized_store(&newpdp->pd_pwd, newpwd, true); return (newpdp); } PWDDESC_XLOCK(pdp); newpwd = pwd_hold_pwddesc(pdp); smr_serialized_store(&newpdp->pd_pwd, newpwd, true); if (!keeplock) PWDDESC_XUNLOCK(pdp); return (newpdp); } /* * Hold either filedesc or pwddesc of the passed process. * * The process lock is used to synchronize against the target exiting and * freeing the data. * * Clearing can be ilustrated in 3 steps: * 1. set the pointer to NULL. Either routine can race against it, hence * atomic_load_ptr. * 2. observe the process lock as not taken. Until then fdhold/pdhold can * race to either still see the pointer or find NULL. It is still safe to * grab a reference as clearing is stalled. * 3. after the lock is observed as not taken, any fdhold/pdhold calls are * guaranteed to see NULL, making it safe to finish clearing */ static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = atomic_load_ptr(&p->p_fd); if (fdp != NULL) refcount_acquire(&fdp->fd_holdcnt); return (fdp); } static struct pwddesc * pdhold(struct proc *p) { struct pwddesc *pdp; PROC_LOCK_ASSERT(p, MA_OWNED); pdp = atomic_load_ptr(&p->p_pd); if (pdp != NULL) refcount_acquire(&pdp->pd_refcount); return (pdp); } static void fddrop(struct filedesc *fdp) { if (refcount_load(&fdp->fd_holdcnt) > 1) { if (refcount_release(&fdp->fd_holdcnt) == 0) return; } FILEDESC_LOCK_DESTROY(fdp); uma_zfree(filedesc0_zone, fdp); } static void pddrop(struct pwddesc *pdp) { struct pwd *pwd; if (refcount_release_if_not_last(&pdp->pd_refcount)) return; PWDDESC_XLOCK(pdp); if (refcount_release(&pdp->pd_refcount) == 0) { PWDDESC_XUNLOCK(pdp); return; } pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); pwd_set(pdp, NULL); PWDDESC_XUNLOCK(pdp); pwd_drop(pwd); PWDDESC_LOCK_DESTROY(pdp); free(pdp, M_PWDDESC); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { refcount_acquire(&fdp->fd_refcnt); return (fdp); } /* * Share a pwddesc structure. */ struct pwddesc * pdshare(struct pwddesc *pdp) { refcount_acquire(&pdp->pd_refcount); return (pdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct thread *td) { struct filedesc *tmp; struct proc *p = td->td_proc; if (refcount_load(&p->p_fd->fd_refcnt) == 1) return; tmp = fdcopy(p->p_fd); fdescfree(td); p->p_fd = tmp; } /* * Unshare a pwddesc structure. */ void pdunshare(struct thread *td) { struct pwddesc *pdp; struct proc *p; p = td->td_proc; /* Not shared. */ if (p->p_pd->pd_refcount == 1) return; pdp = pdcopy(p->p_pd); pdescfree(td); p->p_pd = pdp; } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int i, lastfile; MPASS(fdp != NULL); newfdp = fdinit(fdp, true, &lastfile); /* copy all passable descriptors (i.e. not kqueue) */ newfdp->fd_freefile = -1; for (i = 0; i <= lastfile; ++i) { ofde = &fdp->fd_ofiles[i]; if (ofde->fde_file == NULL || (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || !fhold(ofde->fde_file)) { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; continue; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fdused_init(newfdp, i); } if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; FILEDESC_SUNLOCK(fdp); return (newfdp); } /* * Copy a pwddesc structure. */ struct pwddesc * pdcopy(struct pwddesc *pdp) { struct pwddesc *newpdp; MPASS(pdp != NULL); newpdp = pdinit(pdp, true); newpdp->pd_cmask = pdp->pd_cmask; PWDDESC_XUNLOCK(pdp); return (newpdp); } /* * Clear POSIX style locks. This is only used when fdp looses a reference (i.e. * one of processes using it exits) and the table used to be shared. */ static void fdclearlocks(struct thread *td) { struct filedesc *fdp; struct filedesc_to_leader *fdtol; struct flock lf; struct file *fp; struct proc *p; struct vnode *vp; int i, lastfile; p = td->td_proc; fdp = p->p_fd; fdtol = p->p_fdtol; MPASS(fdtol != NULL); FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { lastfile = fdlastfile(fdp); for (i = 0; i <= lastfile; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp == NULL || fp->f_type != DTYPE_VNODE || !fhold(fp)) continue; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdrop(fp, td); } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or kern_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). */ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; p->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) free(fdtol, M_FILEDESC_TO_LEADER); } /* * Release a filedesc structure. */ static void fdescfree_fds(struct thread *td, struct filedesc *fdp) { struct filedesc0 *fdp0; struct freetable *ft, *tft; struct filedescent *fde; struct file *fp; int i, lastfile; KASSERT(refcount_load(&fdp->fd_refcnt) == 0, ("%s: fd table %p carries references", __func__, fdp)); /* * Serialize with threads iterating over the table, if any. */ if (refcount_load(&fdp->fd_holdcnt) > 1) { FILEDESC_XLOCK(fdp); FILEDESC_XUNLOCK(fdp); } lastfile = fdlastfile_single(fdp); for (i = 0; i <= lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL) { fdefree_last(fde); (void) closef(fp, td); } } if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); if (fdp->fd_nfiles > NDFILE) free(fdp->fd_files, M_FILEDESC); fdp0 = (struct filedesc0 *)fdp; SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) free(ft->ft_table, M_FILEDESC); fddrop(fdp); } void fdescfree(struct thread *td) { struct proc *p; struct filedesc *fdp; p = td->td_proc; fdp = p->p_fd; MPASS(fdp != NULL); #ifdef RACCT if (RACCT_ENABLED()) racct_set_unlocked(p, RACCT_NOFILE, 0); #endif if (p->p_fdtol != NULL) fdclearlocks(td); /* * Check fdhold for an explanation. */ atomic_store_ptr(&p->p_fd, NULL); atomic_thread_fence_seq_cst(); PROC_WAIT_UNLOCKED(p); if (refcount_release(&fdp->fd_refcnt) == 0) return; fdescfree_fds(td, fdp); } void pdescfree(struct thread *td) { struct proc *p; struct pwddesc *pdp; p = td->td_proc; pdp = p->p_pd; MPASS(pdp != NULL); /* * Check pdhold for an explanation. */ atomic_store_ptr(&p->p_pd, NULL); atomic_thread_fence_seq_cst(); PROC_WAIT_UNLOCKED(p); pddrop(pdp); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static bool is_unsafe(struct file *fp) { struct vnode *vp; if (fp->f_type != DTYPE_VNODE) return (false); vp = fp->f_vnode; return ((vp->v_vflag & VV_PROCDEP) != 0); } /* * Make this setguid thing safe, if at all possible. */ void fdsetugidsafety(struct thread *td) { struct filedesc *fdp; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); for (i = 0; i <= 2; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { FILEDESC_XLOCK(fdp); knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); } } } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct thread *td, struct file *fp, int idx) { struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx].fde_file == fp) { fdfree(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; struct filedescent *fde; struct file *fp; int i, lastfile; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); lastfile = fdlastfile_single(fdp); for (i = 0; i <= lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE))) { FILEDESC_XLOCK(fdp); fdfree(fdp, i); (void) closefp(fdp, i, fp, td, false, false); FILEDESC_UNLOCK_ASSERT(fdp); } } } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t save; int i, error, devnull; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); devnull = -1; for (i = 0; i <= 2; i++) { if (fdp->fd_ofiles[i].fde_file != NULL) continue; save = td->td_retval[0]; if (devnull != -1) { error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); } else { error = kern_openat(td, AT_FDCWD, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); if (error == 0) { devnull = td->td_retval[0]; KASSERT(devnull == i, ("we didn't get our fd")); } } td->td_retval[0] = save; if (error != 0) return (error); } return (0); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; MPASS(td != NULL); /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } } return (fdrop_close(fp, td)); } /* * Hack for file descriptor passing code. */ void closef_nothread(struct file *fp) { fdrop(fp, NULL); } /* * Initialize the file pointer with the specified properties. * * The ops are set with release semantics to be certain that the flags, type, * and data are visible when ops is. This is to prevent ops methods from being * called with bad data. */ void finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) { fp->f_data = data; fp->f_flag = flag; fp->f_type = type; atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } void finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops) { fp->f_seqcount[UIO_READ] = 1; fp->f_seqcount[UIO_WRITE] = 1; finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, data, ops); } int fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedescent *fde; int error; FILEDESC_LOCK_ASSERT(fdp); fde = fdeget_locked(fdp, fd); if (fde == NULL) { error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights_fde_inline(fde), needrightsp); if (error != 0) goto out; #endif if (havecapsp != NULL) filecaps_copy(&fde->fde_caps, havecapsp, true); *fpp = fde->fde_file; error = 0; out: return (error); } int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedesc *fdp = td->td_proc->p_fd; int error; #ifndef CAPABILITIES error = fget_unlocked(fdp, fd, needrightsp, fpp); if (havecapsp != NULL && error == 0) filecaps_fill(havecapsp); #else struct file *fp; seqc_t seq; *fpp = NULL; for (;;) { error = fget_unlocked_seq(fdp, fd, needrightsp, &fp, &seq); if (error != 0) return (error); if (havecapsp != NULL) { if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecapsp, false)) { fdrop(fp, td); goto get_locked; } } if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } *fpp = fp; return (0); get_locked: FILEDESC_SLOCK(fdp); error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp); if (error == 0 && !fhold(*fpp)) error = EBADF; FILEDESC_SUNLOCK(fdp); #endif return (error); } #ifdef CAPABILITIES int fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) { const struct filedescent *fde; const struct fdescenttbl *fdt; struct filedesc *fdp; struct file *fp; struct vnode *vp; const cap_rights_t *haverights; cap_rights_t rights; seqc_t seq; VFS_SMR_ASSERT_ENTERED(); rights = *ndp->ni_rightsneeded; cap_rights_set_one(&rights, CAP_LOOKUP); fdp = curproc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = cap_rights_fde_inline(fde); fp = fde->fde_file; if (__predict_false(fp == NULL)) return (EAGAIN); if (__predict_false(cap_check_inline_transient(haverights, &rights))) return (EAGAIN); *fsearch = ((fp->f_flag & FSEARCH) != 0); vp = fp->f_vnode; if (__predict_false(vp == NULL || vp->v_type != VDIR)) { return (EAGAIN); } if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) { return (EAGAIN); } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq))) return (EAGAIN); /* * If file descriptor doesn't have all rights, * all lookups relative to it must also be * strictly relative. * * Not yet supported by fast path. */ CAP_ALL(&rights); if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || ndp->ni_filecaps.fc_nioctls != -1) { #ifdef notyet ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; #else return (EAGAIN); #endif } *vpp = vp; return (0); } #else int fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) { const struct fdescenttbl *fdt; struct filedesc *fdp; struct file *fp; struct vnode *vp; VFS_SMR_ASSERT_ENTERED(); fdp = curproc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); fp = fdt->fdt_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EAGAIN); *fsearch = ((fp->f_flag & FSEARCH) != 0); vp = fp->f_vnode; if (__predict_false(vp == NULL || vp->v_type != VDIR)) { return (EAGAIN); } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) return (EAGAIN); filecaps_fill(&ndp->ni_filecaps); *vpp = vp; return (0); } #endif int fget_unlocked_seq(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp) { #ifdef CAPABILITIES const struct filedescent *fde; #endif const struct fdescenttbl *fdt; struct file *fp; #ifdef CAPABILITIES seqc_t seq; cap_rights_t haverights; int error; #endif fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); /* * Fetch the descriptor locklessly. We avoid fdrop() races by * never raising a refcount above 0. To accomplish this we have * to use a cmpset loop rather than an atomic_add. The descriptor * must be re-verified once we acquire a reference to be certain * that the identity is still correct and we did not lose a race * due to preemption. */ for (;;) { #ifdef CAPABILITIES seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = *cap_rights_fde_inline(fde); fp = fde->fde_file; if (!seqc_consistent(fd_seqc(fdt, fd), seq)) continue; #else fp = fdt->fdt_ofiles[fd].fde_file; #endif if (fp == NULL) return (EBADF); #ifdef CAPABILITIES error = cap_check_inline(&haverights, needrightsp); if (error != 0) return (error); #endif if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { /* * Force a reload. Other thread could reallocate the * table before this fd was closed, so it is possible * that there is a stale fp pointer in cached version. */ fdt = atomic_load_ptr(&fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; #ifdef CAPABILITIES if (seqc_consistent_nomb(fd_seqc(fdt, fd), seq)) #else if (fp == fdt->fdt_ofiles[fd].fde_file) #endif break; fdrop(fp, curthread); } *fpp = fp; if (seqp != NULL) { #ifdef CAPABILITIES *seqp = seq; #endif } return (0); } /* * See the comments in fget_unlocked_seq for an explanation of how this works. * * This is a simplified variant which bails out to the aforementioned routine * if anything goes wrong. In practice this only happens when userspace is * racing with itself. */ int fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { #ifdef CAPABILITIES const struct filedescent *fde; #endif const struct fdescenttbl *fdt; struct file *fp; #ifdef CAPABILITIES seqc_t seq; const cap_rights_t *haverights; #endif fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); #ifdef CAPABILITIES seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = cap_rights_fde_inline(fde); fp = fde->fde_file; #else fp = fdt->fdt_ofiles[fd].fde_file; #endif if (__predict_false(fp == NULL)) goto out_fallback; #ifdef CAPABILITIES if (__predict_false(cap_check_inline_transient(haverights, needrightsp))) goto out_fallback; #endif if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) goto out_fallback; /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; #ifdef CAPABILITIES if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq))) #else if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) #endif goto out_fdrop; *fpp = fp; return (0); out_fdrop: fdrop(fp, curthread); out_fallback: return (fget_unlocked_seq(fdp, fd, needrightsp, fpp, NULL)); } /* * Translate fd -> file when the caller guarantees the file descriptor table * can't be changed by others. * * Note this does not mean the file object itself is only visible to the caller, * merely that it wont disappear without having to be referenced. * * Must be paired with fput_only_user. */ #ifdef CAPABILITIES int fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { const struct filedescent *fde; const struct fdescenttbl *fdt; const cap_rights_t *haverights; struct file *fp; int error; MPASS(FILEDESC_IS_ONLY_USER(fdp)); if (__predict_false(fd >= fdp->fd_nfiles)) return (EBADF); fdt = fdp->fd_files; fde = &fdt->fdt_ofiles[fd]; fp = fde->fde_file; if (__predict_false(fp == NULL)) return (EBADF); MPASS(refcount_load(&fp->f_count) > 0); haverights = cap_rights_fde_inline(fde); error = cap_check_inline(haverights, needrightsp); if (__predict_false(error != 0)) return (error); *fpp = fp; return (0); } #else int fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { struct file *fp; MPASS(FILEDESC_IS_ONLY_USER(fdp)); if (__predict_false(fd >= fdp->fd_nfiles)) return (EBADF); fp = fdp->fd_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EBADF); MPASS(refcount_load(&fp->f_count) > 0); *fpp = fp; return (0); } #endif /* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * * File's rights will be checked against the capability rights mask. * * If an error occurred the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, cap_rights_t *needrightsp) { struct filedesc *fdp; struct file *fp; int error; *fpp = NULL; fdp = td->td_proc->p_fd; error = fget_unlocked(fdp, fd, needrightsp, &fp); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { fdrop(fp, td); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. */ error = 0; switch (flags) { case FREAD: case FWRITE: if ((fp->f_flag & flags) == 0) error = EBADF; break; case FEXEC: if ((fp->f_flag & (FREAD | FEXEC)) == 0 || ((fp->f_flag & FWRITE) != 0)) error = EBADF; break; case 0: break; default: KASSERT(0, ("wrong flags")); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, 0, rightsp)); } int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, struct file **fpp) { int error; #ifndef CAPABILITIES error = _fget(td, fd, fpp, 0, rightsp); if (maxprotp != NULL) *maxprotp = VM_PROT_ALL; return (error); #else cap_rights_t fdrights; struct filedesc *fdp; struct file *fp; seqc_t seq; *fpp = NULL; fdp = td->td_proc->p_fd; MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); for (;;) { error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { fdrop(fp, td); return (EBADF); } if (maxprotp != NULL) fdrights = *cap_rights(fdp, fd); if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } /* * If requested, convert capability rights to access flags. */ if (maxprotp != NULL) *maxprotp = cap_rights_to_vmprot(&fdrights); *fpp = fp; return (0); #endif } int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FREAD, rightsp)); } int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FWRITE, rightsp)); } int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp) { struct filedesc *fdp = td->td_proc->p_fd; #ifndef CAPABILITIES return (fget_unlocked(fdp, fd, rightsp, fpp)); #else struct file *fp; int error; seqc_t seq; *fpp = NULL; MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); for (;;) { error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq); if (error != 0) return (error); error = cap_fcntl_check(fdp, fd, needfcntl); if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); #endif } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, struct vnode **vpp) { struct file *fp; int error; *vpp = NULL; error = _fget(td, fd, &fp, flags, needrightsp); if (error != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vref(*vpp); } fdrop(fp, td); return (error); } int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, 0, rightsp, vpp)); } int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp) { struct filecaps caps; struct file *fp; int error; error = fget_cap(td, fd, needrightsp, &fp, &caps); if (error != 0) return (error); if (fp->f_ops == &badfileops) { error = EBADF; goto out; } if (fp->f_vnode == NULL) { error = EINVAL; goto out; } *havecaps = caps; *vpp = fp->f_vnode; vref(*vpp); fdrop(fp, td); return (0); out: filecaps_free(&caps); fdrop(fp, td); return (error); } int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FREAD, rightsp, vpp)); } int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); } #endif /* * Handle the last reference to a file being closed. * * Without the noinline attribute clang keeps inlining the func thorough this * file when fdrop is used. */ int __noinline _fdrop(struct file *fp, struct thread *td) { int error; #ifdef INVARIANTS int count; count = refcount_load(&fp->f_count); if (count != 0) panic("fdrop: fp %p count %d", fp, count); #endif error = fo_close(fp, td); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); free(fp->f_advice, M_FADVISE); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int sys_flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; int error; error = fget(td, uap->fd, &cap_flock_rights, &fp); if (error != 0) return (error); error = EOPNOTSUPP; if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { goto done; } if (fp->f_ops == &path_fileops) { goto done; } error = 0; vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done; } atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done: fdrop(fp, td); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp) { struct filedescent *newfde, *oldfde; struct file *fp; u_long *ioctls; int error, indx; KASSERT(openerror == ENODEV || openerror == ENXIO, ("unexpected error %d in %s", openerror, __func__)); /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, dfd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = fdalloc(td, 0, &indx); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. */ switch (openerror) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EACCES); } if (!fhold(fp)) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EBADF); } newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; ioctls = filecaps_copy_prep(&oldfde->fde_caps); #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif memcpy(newfde, oldfde, fde_change_size); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, ioctls); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif break; case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif memcpy(newfde, oldfde, fde_change_size); oldfde->fde_file = NULL; fdunused(fdp, dfd); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif break; } FILEDESC_XUNLOCK(fdp); *indxp = indx; return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, "Allow a process to chroot(2) if it has a directory open"); /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(struct filedesc *fdp) { struct vnode *vp; struct file *fp; int fd, lastfile; FILEDESC_LOCK_ASSERT(fdp); lastfile = fdlastfile(fdp); for (fd = 0; fd <= lastfile; fd++) { fp = fget_locked(fdp, fd); if (fp == NULL) continue; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } static void pwd_fill(struct pwd *oldpwd, struct pwd *newpwd) { if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) { vrefact(oldpwd->pwd_cdir); newpwd->pwd_cdir = oldpwd->pwd_cdir; } if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) { vrefact(oldpwd->pwd_rdir); newpwd->pwd_rdir = oldpwd->pwd_rdir; } if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) { vrefact(oldpwd->pwd_jdir); newpwd->pwd_jdir = oldpwd->pwd_jdir; } } struct pwd * pwd_hold_pwddesc(struct pwddesc *pdp) { struct pwd *pwd; PWDDESC_ASSERT_XLOCKED(pdp); pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (pwd != NULL) refcount_acquire(&pwd->pwd_refcount); return (pwd); } bool pwd_hold_smr(struct pwd *pwd) { MPASS(pwd != NULL); if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) { return (true); } return (false); } struct pwd * pwd_hold(struct thread *td) { struct pwddesc *pdp; struct pwd *pwd; pdp = td->td_proc->p_pd; vfs_smr_enter(); pwd = vfs_smr_entered_load(&pdp->pd_pwd); if (pwd_hold_smr(pwd)) { vfs_smr_exit(); return (pwd); } vfs_smr_exit(); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); MPASS(pwd != NULL); PWDDESC_XUNLOCK(pdp); return (pwd); } struct pwd * pwd_hold_proc(struct proc *p) { struct pwddesc *pdp; struct pwd *pwd; PROC_ASSERT_HELD(p); PROC_LOCK(p); pdp = pdhold(p); MPASS(pdp != NULL); PROC_UNLOCK(p); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); MPASS(pwd != NULL); PWDDESC_XUNLOCK(pdp); pddrop(pdp); return (pwd); } static struct pwd * pwd_alloc(void) { struct pwd *pwd; pwd = uma_zalloc_smr(pwd_zone, M_WAITOK); bzero(pwd, sizeof(*pwd)); refcount_init(&pwd->pwd_refcount, 1); return (pwd); } void pwd_drop(struct pwd *pwd) { if (!refcount_release(&pwd->pwd_refcount)) return; if (pwd->pwd_cdir != NULL) vrele(pwd->pwd_cdir); if (pwd->pwd_rdir != NULL) vrele(pwd->pwd_rdir); if (pwd->pwd_jdir != NULL) vrele(pwd->pwd_jdir); uma_zfree_smr(pwd_zone, pwd); } /* * The caller is responsible for invoking priv_check() and * mac_vnode_check_chroot() to authorize this operation. */ int pwd_chroot(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct filedesc *fdp; struct pwd *newpwd, *oldpwd; int error; fdp = td->td_proc->p_fd; pdp = td->td_proc->p_pd; newpwd = pwd_alloc(); FILEDESC_SLOCK(fdp); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && oldpwd->pwd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); FILEDESC_SUNLOCK(fdp); if (error != 0) { PWDDESC_XUNLOCK(pdp); pwd_drop(newpwd); return (error); } } else { FILEDESC_SUNLOCK(fdp); } vrefact(vp); newpwd->pwd_rdir = vp; if (oldpwd->pwd_jdir == NULL) { vrefact(vp); newpwd->pwd_jdir = vp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); return (0); } void pwd_chdir(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; VNPASS(vp->v_usecount > 0, vp); newpwd = pwd_alloc(); pdp = td->td_proc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); newpwd->pwd_cdir = vp; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * jail_attach(2) changes both root and working directories. */ int pwd_chroot_chdir(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct filedesc *fdp; struct pwd *newpwd, *oldpwd; int error; fdp = td->td_proc->p_fd; pdp = td->td_proc->p_pd; newpwd = pwd_alloc(); FILEDESC_SLOCK(fdp); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); error = chroot_refuse_vdir_fds(fdp); FILEDESC_SUNLOCK(fdp); if (error != 0) { PWDDESC_XUNLOCK(pdp); pwd_drop(newpwd); return (error); } vrefact(vp); newpwd->pwd_rdir = vp; vrefact(vp); newpwd->pwd_cdir = vp; if (oldpwd->pwd_jdir == NULL) { vrefact(vp); newpwd->pwd_jdir = vp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); return (0); } void pwd_ensure_dirs(void) { struct pwddesc *pdp; struct pwd *oldpwd, *newpwd; pdp = curproc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) { PWDDESC_XUNLOCK(pdp); return; } PWDDESC_XUNLOCK(pdp); newpwd = pwd_alloc(); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); pwd_fill(oldpwd, newpwd); if (newpwd->pwd_cdir == NULL) { vrefact(rootvnode); newpwd->pwd_cdir = rootvnode; } if (newpwd->pwd_rdir == NULL) { vrefact(rootvnode); newpwd->pwd_rdir = rootvnode; } pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } void pwd_set_rootvnode(void) { struct pwddesc *pdp; struct pwd *oldpwd, *newpwd; pdp = curproc->p_pd; newpwd = pwd_alloc(); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); vrefact(rootvnode); newpwd->pwd_cdir = rootvnode; vrefact(rootvnode); newpwd->pwd_rdir = rootvnode; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * Scan all active processes and prisons to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; struct prison *pr; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; nrele = 0; newpwd = pwd_alloc(); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); pdp = pdhold(p); PROC_UNLOCK(p); if (pdp == NULL) continue; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (oldpwd == NULL || (oldpwd->pwd_cdir != olddp && oldpwd->pwd_rdir != olddp && oldpwd->pwd_jdir != olddp)) { PWDDESC_XUNLOCK(pdp); pddrop(pdp); continue; } if (oldpwd->pwd_cdir == olddp) { vrefact(newdp); newpwd->pwd_cdir = newdp; } if (oldpwd->pwd_rdir == olddp) { vrefact(newdp); newpwd->pwd_rdir = newdp; } if (oldpwd->pwd_jdir == olddp) { vrefact(newdp); newpwd->pwd_jdir = newdp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); pddrop(pdp); newpwd = pwd_alloc(); } sx_sunlock(&allproc_lock); pwd_drop(newpwd); if (rootvnode == olddp) { vrefact(newdp); rootvnode = newdp; nrele++; } mtx_lock(&prison0.pr_mtx); if (prison0.pr_root == olddp) { vrefact(newdp); prison0.pr_root = newdp; nrele++; } mtx_unlock(&prison0.pr_mtx); sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { mtx_lock(&pr->pr_mtx); if (pr->pr_root == olddp) { vrefact(newdp); pr->pr_root = newdp; nrele++; } mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); while (nrele--) vrele(olddp); } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; fdtol = malloc(sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } static int sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) { NDSLOTTYPE *map; struct filedesc *fdp; u_int namelen; int count, off, minoff; namelen = arg2; if (namelen != 1) return (EINVAL); if (*(int *)arg1 != 0) return (EINVAL); fdp = curproc->p_fd; count = 0; FILEDESC_SLOCK(fdp); map = fdp->fd_map; off = NDSLOT(fdp->fd_nfiles - 1); for (minoff = NDSLOT(0); off >= minoff; --off) count += bitcountl(map[off]); FILEDESC_SUNLOCK(fdp); return (SYSCTL_OUT(req, &count, sizeof(count))); } static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, "Number of open file descriptors"); /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n, lastfile; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; /* overestimates sparse tables. */ n += fdp->fd_nfiles; fddrop(fdp); } sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); lastfile = fdlastfile(fdp); for (n = 0; refcount_load(&fdp->fd_refcnt) > 0 && n <= lastfile; n++) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; xf.xf_fd = n; xf.xf_file = (uintptr_t)fp; xf.xf_data = (uintptr_t)fp->f_data; xf.xf_vnode = (uintptr_t)fp->f_vnode; xf.xf_type = (uintptr_t)fp->f_type; xf.xf_count = refcount_load(&fp->f_count); xf.xf_msgcount = 0; xf.xf_offset = foffset_get(fp); xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static int xlate_fflags(int fflags) { static const struct { int fflag; int kf_fflag; } fflags_table[] = { { FAPPEND, KF_FLAG_APPEND }, { FASYNC, KF_FLAG_ASYNC }, { FFSYNC, KF_FLAG_FSYNC }, { FHASLOCK, KF_FLAG_HASLOCK }, { FNONBLOCK, KF_FLAG_NONBLOCK }, { FREAD, KF_FLAG_READ }, { FWRITE, KF_FLAG_WRITE }, { O_CREAT, KF_FLAG_CREAT }, { O_DIRECT, KF_FLAG_DIRECT }, { O_EXCL, KF_FLAG_EXCL }, { O_EXEC, KF_FLAG_EXEC }, { O_EXLOCK, KF_FLAG_EXLOCK }, { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, { O_SHLOCK, KF_FLAG_SHLOCK }, { O_TRUNC, KF_FLAG_TRUNC } }; unsigned int i; int kflags; kflags = 0; for (i = 0; i < nitems(fflags_table); i++) if (fflags & fflags_table[i].fflag) kflags |= fflags_table[i].kf_fflag; return (kflags); } /* Trim unused data from kf_path by truncating the structure size. */ void pack_kinfo(struct kinfo_file *kif) { kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + strlen(kif->kf_path) + 1; kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); } static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp, int flags) { int error; bzero(kif, sizeof(*kif)); /* Set a default type to allow for empty fill_kinfo() methods. */ kif->kf_type = KF_TYPE_UNKNOWN; kif->kf_flags = xlate_fflags(fp->f_flag); if (rightsp != NULL) kif->kf_cap_rights = *rightsp; else cap_rights_init_zero(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = refcount_load(&fp->f_count); kif->kf_offset = foffset_get(fp); /* * This may drop the filedesc lock, so the 'fp' cannot be * accessed after this call. */ error = fo_fill_kinfo(fp, kif, fdp); if (error == 0) kif->kf_status |= KF_ATTR_VALID; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); } static void export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, struct kinfo_file *kif, int flags) { int error; bzero(kif, sizeof(*kif)); kif->kf_type = KF_TYPE_VNODE; error = vn_fill_kinfo_vnode(vp, kif); if (error == 0) kif->kf_status |= KF_ATTR_VALID; kif->kf_flags = xlate_fflags(fflags); cap_rights_init_zero(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = -1; kif->kf_offset = -1; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); vrele(vp); } struct export_fd_buf { struct filedesc *fdp; struct pwddesc *pdp; struct sbuf *sb; ssize_t remainder; struct kinfo_file kif; int flags; }; static int export_kinfo_to_sb(struct export_fd_buf *efbuf) { struct kinfo_file *kif; kif = &efbuf->kif; if (efbuf->remainder != -1) { if (efbuf->remainder < kif->kf_structsize) { /* Terminate export. */ efbuf->remainder = 0; return (0); } efbuf->remainder -= kif->kf_structsize; } return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM); } static int export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, efbuf->flags); FILEDESC_SUNLOCK(efbuf->fdp); error = export_kinfo_to_sb(efbuf); FILEDESC_SLOCK(efbuf->fdp); return (error); } static int export_vnode_to_sb(struct vnode *vp, int fd, int fflags, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); if (efbuf->pdp != NULL) PWDDESC_XUNLOCK(efbuf->pdp); export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); error = export_kinfo_to_sb(efbuf); if (efbuf->pdp != NULL) PWDDESC_XLOCK(efbuf->pdp); return (error); } /* * Store a process file descriptor information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags) { struct file *fp; struct filedesc *fdp; struct pwddesc *pdp; struct export_fd_buf *efbuf; struct vnode *cttyvp, *textvp, *tracevp; struct pwd *pwd; int error, i, lastfile; cap_rights_t rights; PROC_LOCK_ASSERT(p, MA_OWNED); /* ktrace vnode */ tracevp = ktr_get_tracevp(p, true); /* text vnode */ textvp = p->p_textvp; if (textvp != NULL) vrefact(textvp); /* Controlling tty. */ cttyvp = NULL; if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { cttyvp = p->p_pgrp->pg_session->s_ttyvp; if (cttyvp != NULL) vrefact(cttyvp); } fdp = fdhold(p); pdp = pdhold(p); PROC_UNLOCK(p); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->pdp = NULL; efbuf->sb = sb; efbuf->remainder = maxlen; efbuf->flags = flags; if (tracevp != NULL) export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, efbuf); if (textvp != NULL) export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); if (cttyvp != NULL) export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, efbuf); error = 0; if (pdp == NULL || fdp == NULL) goto fail; efbuf->fdp = fdp; efbuf->pdp = pdp; PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); if (pwd != NULL) { /* working directory */ if (pwd->pwd_cdir != NULL) { vrefact(pwd->pwd_cdir); export_vnode_to_sb(pwd->pwd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } /* root directory */ if (pwd->pwd_rdir != NULL) { vrefact(pwd->pwd_rdir); export_vnode_to_sb(pwd->pwd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); } /* jail directory */ if (pwd->pwd_jdir != NULL) { vrefact(pwd->pwd_jdir); export_vnode_to_sb(pwd->pwd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); } } PWDDESC_XUNLOCK(pdp); if (pwd != NULL) pwd_drop(pwd); FILEDESC_SLOCK(fdp); lastfile = fdlastfile(fdp); for (i = 0; refcount_load(&fdp->fd_refcnt) > 0 && i <= lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; #ifdef CAPABILITIES rights = *cap_rights(fdp, i); #else /* !CAPABILITIES */ rights = cap_no_rights; #endif /* * Create sysctl entry. It is OK to drop the filedesc * lock inside of export_file_to_sb() as we will * re-validate and re-evaluate its properties when the * loop continues. */ error = export_file_to_sb(fp, i, &rights, efbuf); if (error != 0 || efbuf->remainder == 0) break; } FILEDESC_SUNLOCK(fdp); fail: if (fdp != NULL) fddrop(fdp); if (pdp != NULL) pddrop(pdp); free(efbuf, M_TEMP); return (error); } #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; u_int namelen; int error, error2, *name; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_filedesc_out(p, &sb, maxlen, KERN_FILEDESC_PACK_KINFO); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } #ifdef COMPAT_FREEBSD7 #ifdef KINFO_OFILE_SIZE CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); #endif static void kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) { okif->kf_structsize = sizeof(*okif); okif->kf_type = kif->kf_type; okif->kf_fd = kif->kf_fd; okif->kf_ref_count = kif->kf_ref_count; okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | KF_FLAG_DIRECT | KF_FLAG_HASLOCK); okif->kf_offset = kif->kf_offset; if (kif->kf_type == KF_TYPE_VNODE) okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type; else okif->kf_vnode_type = KF_VTYPE_VNON; strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); if (kif->kf_type == KF_TYPE_SOCKET) { okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0; okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0; okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0; okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local; okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer; } else { okif->kf_sa_local.ss_family = AF_UNSPEC; okif->kf_sa_peer.ss_family = AF_UNSPEC; } } static int export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req) { int error; vrefact(vp); PWDDESC_XUNLOCK(pdp); export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); PWDDESC_XLOCK(pdp); return (error); } /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) { struct kinfo_ofile *okif; struct kinfo_file *kif; struct filedesc *fdp; struct pwddesc *pdp; struct pwd *pwd; u_int namelen; int error, i, lastfile, *name; struct file *fp; struct proc *p; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) return (error); fdp = fdhold(p); if (fdp != NULL) pdp = pdhold(p); PROC_UNLOCK(p); if (fdp == NULL || pdp == NULL) { if (fdp != NULL) fddrop(fdp); return (ENOENT); } kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); if (pwd != NULL) { if (pwd->pwd_cdir != NULL) export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif, okif, pdp, req); if (pwd->pwd_rdir != NULL) export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif, okif, pdp, req); if (pwd->pwd_jdir != NULL) export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif, okif, pdp, req); } PWDDESC_XUNLOCK(pdp); if (pwd != NULL) pwd_drop(pwd); FILEDESC_SLOCK(fdp); lastfile = fdlastfile(fdp); for (i = 0; refcount_load(&fdp->fd_refcnt) > 0 && i <= lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; export_file_to_kinfo(fp, i, NULL, kif, fdp, KERN_FILEDESC_PACK_KINFO); FILEDESC_SUNLOCK(fdp); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); pddrop(pdp); free(kif, M_TEMP); free(okif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); #endif /* COMPAT_FREEBSD7 */ int vntype_to_kinfo(int vtype) { struct { int vtype; int kf_vtype; } vtypes_table[] = { { VBAD, KF_VTYPE_VBAD }, { VBLK, KF_VTYPE_VBLK }, { VCHR, KF_VTYPE_VCHR }, { VDIR, KF_VTYPE_VDIR }, { VFIFO, KF_VTYPE_VFIFO }, { VLNK, KF_VTYPE_VLNK }, { VNON, KF_VTYPE_VNON }, { VREG, KF_VTYPE_VREG }, { VSOCK, KF_VTYPE_VSOCK } }; unsigned int i; /* * Perform vtype translation. */ for (i = 0; i < nitems(vtypes_table); i++) if (vtypes_table[i].vtype == vtype) return (vtypes_table[i].kf_vtype); return (KF_VTYPE_UNKNOWN); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, "Process filedesc entries"); /* * Store a process current working directory information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) { struct pwddesc *pdp; struct pwd *pwd; struct export_fd_buf *efbuf; struct vnode *cdir; int error; PROC_LOCK_ASSERT(p, MA_OWNED); pdp = pdhold(p); PROC_UNLOCK(p); if (pdp == NULL) return (EINVAL); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->pdp = pdp; efbuf->sb = sb; efbuf->remainder = maxlen; PWDDESC_XLOCK(pdp); pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); cdir = pwd->pwd_cdir; if (cdir == NULL) { error = EINVAL; } else { vrefact(cdir); error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } PWDDESC_XUNLOCK(pdp); pddrop(pdp); free(efbuf, M_TEMP); return (error); } /* * Get per-process current working directory. */ static int sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; u_int namelen; int error, error2, *name; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_cwd_out(p, &sb, maxlen); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_cwd, "Process current working directory"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnode"); case DTYPE_SOCKET: return ("socket"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kqueue"); case DTYPE_CRYPTO: return ("crypto"); case DTYPE_MQUEUE: return ("mqueue"); case DTYPE_SHM: return ("shm"); case DTYPE_SEM: return ("ksem"); case DTYPE_PTS: return ("pts"); case DTYPE_DEV: return ("dev"); case DTYPE_PROCDESC: return ("proc"); case DTYPE_EVENTFD: return ("eventfd"); case DTYPE_LINUXTFD: return ("ltimer"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n < fdp->fd_nfiles; n++) { if (fp == fdp->fd_ofiles[n].fde_file) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4)) struct proc *p; if (header) db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n", XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag", "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH, fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data, fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); #undef XPTRWIDTH } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND(files, db_show_files) { struct filedesc *fdp; struct file *fp; struct proc *p; int header; int n; header = 1; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; if ((fdp = p->p_fd) == NULL) continue; for (n = 0; n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; db_print_file(fp, header); header = 0; } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, &openfiles, 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR); /* * XXXMJG this is a temporary hack due to boot ordering issues against * the vnode zone. */ vfs_smr = uma_zone_get_smr(pwd_zone); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int -badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, - struct thread *td) +badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (0); } static int badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EBADF); } static int badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return (0); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = badfo_fill_kinfo, }; static int path_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (POLLNVAL); } static int path_close(struct file *fp, struct thread *td) { MPASS(fp->f_type == DTYPE_VNODE); fp->f_ops = &badfileops; vdrop(fp->f_vnode); return (0); } struct fileops path_fileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = path_poll, .fo_kqfilter = vn_kqfilter_opath, .fo_stat = vn_statfile, .fo_close = path_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = vn_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; int invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } int invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } int invfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (poll_no_poll(events)); } int invfo_kqfilter(struct file *fp, struct knote *kn) { return (EINVAL); } int invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EINVAL); } /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index c9b50912a22d..ade71a848868 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -1,2855 +1,2854 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * Copyright (c) 2009 Apple, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_kqueue.h" #ifdef COMPAT_FREEBSD11 #define _WANT_FREEBSD11_KEVENT #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. This possibly * should be made into a per proc lock. */ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue_ctx); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag); static int kqueue_acquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static void kqueue_destroy(struct kqueue *kq); static void kqueue_drain(struct kqueue *kq, struct thread *td); static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int mflag); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); struct g_kevent_args; static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name); static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static fo_fill_kinfo_t kqueue_fill_kinfo; static struct fileops kqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = kqueue_fill_kinfo, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_drop_detached(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(int mflag); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static void filt_timerexpire_l(struct knote *kn, bool proc_locked); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static void filt_timerstart(struct knote *kn, sbintime_t to); static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type); static int filt_timervalidate(struct knote *kn, sbintime_t *to); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); static void filt_userdetach(struct knote *kn); static int filt_user(struct knote *kn, long hint); static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type); static struct filterops file_filtops = { .f_isfd = 1, .f_attach = filt_fileattach, }; static struct filterops kqread_filtops = { .f_isfd = 1, .f_detach = filt_kqdetach, .f_event = filt_kqueue, }; /* XXX - move to kern_proc.c? */ static struct filterops proc_filtops = { .f_isfd = 0, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; static struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = filt_timer, .f_touch = filt_timertouch, }; static struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, }; static uma_zone_t knote_zone; static unsigned int __exclusive_cache_line kq_ncallouts; static unsigned int kq_calloutmax = 4 * 1024; SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not influx ? */ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while (0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) static struct knlist * kn_list_lock(struct knote *kn) { struct knlist *knl; knl = kn->kn_knlist; if (knl != NULL) knl->kl_lock(knl->kl_lockarg); return (knl); } static void kn_list_unlock(struct knlist *knl) { bool do_free; if (knl == NULL) return; do_free = knl->kl_autodestroy && knlist_empty(knl); knl->kl_unlock(knl->kl_lockarg); if (do_free) { knlist_destroy(knl); free(knl, M_KQUEUE); } } static bool kn_in_flux(struct knote *kn) { return (kn->kn_influx > 0); } static void kn_enter_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx < INT_MAX); kn->kn_influx++; } static bool kn_leave_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx > 0); kn->kn_influx--; return (kn->kn_influx == 0); } #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ knl->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ knl->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while (0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #ifndef KN_HASHSIZE #define KN_HASHSIZE 64 /* XXX should be tunable */ #endif #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { .f_isfd = 0, .f_attach = filt_nullattach, }; /* XXX - make SYSINIT to add these, and move into respective modules. */ extern struct filterops sig_filtops; extern struct filterops fs_filtops; /* * Table for for all system-defined filters. */ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { struct filterops *for_fop; int for_nolock; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { { &file_filtops, 1 }, /* EVFILT_READ */ { &file_filtops, 1 }, /* EVFILT_WRITE */ { &null_filtops }, /* EVFILT_AIO */ { &file_filtops, 1 }, /* EVFILT_VNODE */ { &proc_filtops, 1 }, /* EVFILT_PROC */ { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ { &timer_filtops, 1 }, /* EVFILT_TIMER */ { &file_filtops, 1 }, /* EVFILT_PROCDESC */ { &fs_filtops, 1 }, /* EVFILT_FS */ { &null_filtops }, /* EVFILT_LIO */ { &user_filtops, 1 }, /* EVFILT_USER */ { &null_filtops }, /* EVFILT_SENDFILE */ { &file_filtops, 1 }, /* EVFILT_EMPTY */ }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. */ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? */ static int filt_procattach(struct knote *kn) { struct proc *p; int error; bool exiting, immediate; exiting = immediate = false; if (kn->kn_sfflags & NOTE_EXIT) p = pfind_any(kn->kn_id); else p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if (p->p_flag & P_WEXIT) exiting = true; if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * Internal flag indicating registration done by kernel for the * purposes of getting a NOTE_CHILD notification. */ if (kn->kn_flags & EV_FLAG2) { kn->kn_flags &= ~EV_FLAG2; kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); immediate = true; /* Force immediate activation of child note. */ } /* * Internal flag indicating registration done by kernel (for other than * NOTE_CHILD). */ if (kn->kn_flags & EV_FLAG1) { kn->kn_flags &= ~EV_FLAG1; } knlist_add(p->p_klist, kn, 1); /* * Immediately activate any child notes or, in the case of a zombie * target process, exit notes. The latter is necessary to handle the * case where the target process, e.g. a child, dies before the kevent * is registered. */ if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { knlist_remove(kn->kn_knlist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? */ static int filt_proc(struct knote *kn, long hint) { struct proc *p; u_int event; p = kn->kn_ptr.p_proc; if (p == NULL) /* already activated, from attach filter */ return (0); /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; kn->kn_ptr.p_proc = NULL; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } /* * Called when the process forked. It mostly does the same as the * knote(), activating all knotes registered to be activated when the * process forked. Additionally, for each knote attached to the * parent, check whether user wants to track the new process. If so * attach a new knote to it, and immediately report an event with the * child's pid. */ void knote_fork(struct knlist *list, int pid) { struct kqueue *kq; struct knote *kn; struct kevent kev; int error; MPASS(list != NULL); KNL_ASSERT_LOCKED(list); if (SLIST_EMPTY(&list->kl_list)) return; memset(&kev, 0, sizeof(kev)); SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { KQ_UNLOCK(kq); continue; } /* * The same as knote(), activate the event. */ if ((kn->kn_sfflags & NOTE_TRACK) == 0) { if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK(kq); continue; } /* * The NOTE_TRACK case. In addition to the activation * of the event, we need to register new events to * track the child. Drop the locks in preparation for * the call to kqueue_register(). */ kn_enter_flux(kn); KQ_UNLOCK(kq); list->kl_unlock(list->kl_lockarg); /* * Activate existing knote and register tracking knotes with * new process. * * First register a knote to get just the child notice. This * must be a separate note from a potential NOTE_EXIT * notification since both NOTE_CHILD and NOTE_EXIT are defined * to use the data field (in conflicting ways). */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; /* * Then register another knote to track other potential events * from the new process. */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 0); list->kl_lock(list->kl_lockarg); KQ_LOCK(kq); kn_leave_flux(kn); KQ_UNLOCK_FLUX(kq); } } /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. */ #define NOTE_TIMER_PRECMASK \ (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) static sbintime_t timer2sbintime(int64_t data, int flags) { int64_t secs; /* * Macros for converting to the fractional second portion of an * sbintime_t using 64bit multiplication to improve precision. */ #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) switch (flags & NOTE_TIMER_PRECMASK) { case NOTE_SECONDS: #ifdef __LP64__ if (data > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return ((sbintime_t)data << 32); case NOTE_MSECONDS: /* FALLTHROUGH */ case 0: if (data >= 1000) { secs = data / 1000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | MS_TO_SBT(data % 1000)); } return (MS_TO_SBT(data)); case NOTE_USECONDS: if (data >= 1000000) { secs = data / 1000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000)); } return (US_TO_SBT(data)); case NOTE_NSECONDS: if (data >= 1000000000) { secs = data / 1000000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | NS_TO_SBT(data % 1000000000)); } return (NS_TO_SBT(data)); default: break; } return (-1); } struct kq_timer_cb_data { struct callout c; struct proc *p; struct knote *kn; int cpuid; int flags; TAILQ_ENTRY(kq_timer_cb_data) link; sbintime_t next; /* next timer event fires at */ sbintime_t to; /* precalculated timer period, 0 for abs */ }; #define KQ_TIMER_CB_ENQUEUED 0x01 static void kqtimer_sched_callout(struct kq_timer_cb_data *kc) { callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kc->kn, kc->cpuid, C_ABSOLUTE); } void kqtimer_proc_continue(struct proc *p) { struct kq_timer_cb_data *kc, *kc1; struct bintime bt; sbintime_t now; PROC_LOCK_ASSERT(p, MA_OWNED); getboottimebin(&bt); now = bttosbt(bt); TAILQ_FOREACH_SAFE(kc, &p->p_kqtim_stop, link, kc1) { TAILQ_REMOVE(&p->p_kqtim_stop, kc, link); kc->flags &= ~KQ_TIMER_CB_ENQUEUED; if (kc->next <= now) filt_timerexpire_l(kc->kn, true); else kqtimer_sched_callout(kc); } } static void filt_timerexpire_l(struct knote *kn, bool proc_locked) { struct kq_timer_cb_data *kc; struct proc *p; uint64_t delta; sbintime_t now; kc = kn->kn_ptr.p_v; if ((kn->kn_flags & EV_ONESHOT) != 0 || kc->to == 0) { kn->kn_data++; KNOTE_ACTIVATE(kn, 0); return; } now = sbinuptime(); if (now >= kc->next) { delta = (now - kc->next) / kc->to; if (delta == 0) delta = 1; kn->kn_data += delta; kc->next += (delta + 1) * kc->to; if (now >= kc->next) /* overflow */ kc->next = now + kc->to; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ } /* * Initial check for stopped kc->p is racy. It is fine to * miss the set of the stop flags, at worst we would schedule * one more callout. On the other hand, it is not fine to not * schedule when we we missed clearing of the flags, we * recheck them under the lock and observe consistent state. */ p = kc->p; if (P_SHOULDSTOP(p) || P_KILLED(p)) { if (!proc_locked) PROC_LOCK(p); if (P_SHOULDSTOP(p) || P_KILLED(p)) { if ((kc->flags & KQ_TIMER_CB_ENQUEUED) == 0) { kc->flags |= KQ_TIMER_CB_ENQUEUED; TAILQ_INSERT_TAIL(&p->p_kqtim_stop, kc, link); } if (!proc_locked) PROC_UNLOCK(p); return; } if (!proc_locked) PROC_UNLOCK(p); } kqtimer_sched_callout(kc); } static void filt_timerexpire(void *knx) { filt_timerexpire_l(knx, false); } /* * data contains amount of time to sleep */ static int filt_timervalidate(struct knote *kn, sbintime_t *to) { struct bintime bt; sbintime_t sbt; if (kn->kn_sdata < 0) return (EINVAL); if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) kn->kn_sdata = 1; /* * The only fflags values supported are the timer unit * (precision) and the absolute time indicator. */ if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) return (EINVAL); *to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); if (*to < 0) return (EINVAL); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { getboottimebin(&bt); sbt = bttosbt(bt); *to = MAX(0, *to - sbt); } return (0); } static int filt_timerattach(struct knote *kn) { struct kq_timer_cb_data *kc; sbintime_t to; int error; to = -1; error = filt_timervalidate(kn, &to); if (error != 0) return (error); KASSERT(to > 0 || (kn->kn_flags & EV_ONESHOT) != 0 || (kn->kn_sfflags & NOTE_ABSTIME) != 0, ("%s: periodic timer has a calculated zero timeout", __func__)); KASSERT(to >= 0, ("%s: timer has a calculated negative timeout", __func__)); if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) { atomic_subtract_int(&kq_ncallouts, 1); return (ENOMEM); } if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); kc->kn = kn; kc->p = curproc; kc->cpuid = PCPU_GET(cpuid); kc->flags = 0; callout_init(&kc->c, 1); filt_timerstart(kn, to); return (0); } static void filt_timerstart(struct knote *kn, sbintime_t to) { struct kq_timer_cb_data *kc; kc = kn->kn_ptr.p_v; if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { kc->next = to; kc->to = 0; } else { kc->next = to + sbinuptime(); kc->to = to; } kqtimer_sched_callout(kc); } static void filt_timerdetach(struct knote *kn) { struct kq_timer_cb_data *kc; unsigned int old __unused; bool pending; kc = kn->kn_ptr.p_v; do { callout_drain(&kc->c); /* * kqtimer_proc_continue() might have rescheduled this callout. * Double-check, using the process mutex as an interlock. */ PROC_LOCK(kc->p); if ((kc->flags & KQ_TIMER_CB_ENQUEUED) != 0) { kc->flags &= ~KQ_TIMER_CB_ENQUEUED; TAILQ_REMOVE(&kc->p->p_kqtim_stop, kc, link); } pending = callout_pending(&kc->c); PROC_UNLOCK(kc->p); } while (pending); free(kc, M_KQUEUE); old = atomic_fetchadd_int(&kq_ncallouts, -1); KASSERT(old > 0, ("Number of callouts cannot become negative")); kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ } static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type) { struct kq_timer_cb_data *kc; struct kqueue *kq; sbintime_t to; int error; switch (type) { case EVENT_REGISTER: /* Handle re-added timers that update data/fflags */ if (kev->flags & EV_ADD) { kc = kn->kn_ptr.p_v; /* Drain any existing callout. */ callout_drain(&kc->c); /* Throw away any existing undelivered record * of the timer expiration. This is done under * the presumption that if a process is * re-adding this timer with new parameters, * it is no longer interested in what may have * happened under the old parameters. If it is * interested, it can wait for the expiration, * delete the old timer definition, and then * add the new one. * * This has to be done while the kq is locked: * - if enqueued, dequeue * - make it no longer active * - clear the count of expiration events */ kq = kn->kn_kq; KQ_LOCK(kq); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); kn->kn_status &= ~KN_ACTIVE; kn->kn_data = 0; KQ_UNLOCK(kq); /* Reschedule timer based on new data/fflags */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; error = filt_timervalidate(kn, &to); if (error != 0) { kn->kn_flags |= EV_ERROR; kn->kn_data = error; } else filt_timerstart(kn, to); } break; case EVENT_PROCESS: *kev = kn->kn_kevent; if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_timertouch() - invalid type (%ld)", type); break; } } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } static int filt_userattach(struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) kn->kn_hookid = 1; else kn->kn_hookid = 0; return (0); } static void filt_userdetach(__unused struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ } static int filt_user(struct knote *kn, __unused long hint) { return (kn->kn_hookid); } static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) { u_int ffctrl; switch (type) { case EVENT_REGISTER: if (kev->fflags & NOTE_TRIGGER) kn->kn_hookid = 1; ffctrl = kev->fflags & NOTE_FFCTRLMASK; kev->fflags &= NOTE_FFLAGSMASK; switch (ffctrl) { case NOTE_FFNOP: break; case NOTE_FFAND: kn->kn_sfflags &= kev->fflags; break; case NOTE_FFOR: kn->kn_sfflags |= kev->fflags; break; case NOTE_FFCOPY: kn->kn_sfflags = kev->fflags; break; default: /* XXX Return error? */ break; } kn->kn_sdata = kev->data; if (kev->flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; case EVENT_PROCESS: *kev = kn->kn_kevent; kev->fflags = kn->kn_sfflags; kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_usertouch() - invalid type (%ld)", type); break; } } int sys_kqueue(struct thread *td, struct kqueue_args *uap) { return (kern_kqueue(td, 0, NULL)); } static void kqueue_init(struct kqueue *kq) { mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&kq->kq_head); knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); } int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; struct ucred *cred; int fd, error; fdp = td->td_proc->p_fd; cred = td->td_ucred; if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) return (ENOMEM); error = falloc_caps(td, &fp, &fd, flags, fcaps); if (error != 0) { chgkqcnt(cred->cr_ruidinfo, -1, 0); return (error); } /* An extra reference on `fp' has been held for us by falloc(). */ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); kqueue_init(kq); kq->kq_fdp = fdp; kq->kq_cred = crhold(cred); FILEDESC_XLOCK(fdp); TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; return (0); } struct g_kevent_args { int fd; void *changelist; int nchanges; void *eventlist; int nevents; const struct timespec *timeout; }; int sys_kevent(struct thread *td, struct kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent_copyout, .k_copyin = kevent_copyin, .kevent_size = sizeof(struct kevent), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent")); } static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name) { struct timespec ts, *tsp; #ifdef KTRACE struct kevent *eventlist = uap->eventlist; #endif int error; if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist, uap->nchanges, k_ops->kevent_size); #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, k_ops, tsp); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, eventlist, td->td_retval[0], k_ops->kevent_size); #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. */ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } #ifdef COMPAT_FREEBSD11 static int kevent11_copyout(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { kev11.ident = kevp->ident; kev11.filter = kevp->filter; kev11.flags = kevp->flags; kev11.fflags = kevp->fflags; kev11.data = kevp->data; kev11.udata = kevp->udata; error = copyout(&kev11, uap->eventlist, sizeof(kev11)); if (error != 0) break; uap->eventlist++; kevp++; } return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent11_copyin(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { error = copyin(uap->changelist, &kev11, sizeof(kev11)); if (error != 0) break; kevp->ident = kev11.ident; kevp->filter = kev11.filter; kevp->flags = kev11.flags; kevp->fflags = kev11.fflags; kevp->data = (uintptr_t)kev11.data; kevp->udata = kev11.udata; bzero(&kevp->ext, sizeof(kevp->ext)); uap->changelist++; kevp++; } return (error); } int freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent11_copyout, .k_copyin = kevent11_copyin, .kevent_size = sizeof(struct kevent_freebsd11), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11")); } #endif int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { cap_rights_t rights; struct file *fp; int error; cap_rights_init_zero(&rights); if (nchanges > 0) cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE); if (nevents > 0) cap_rights_set_one(&rights, CAP_KQUEUE_EVENT); error = fget(td, fd, &rights, &fp); if (error != 0) return (error); error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); fdrop(fp, td); return (error); } static int kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; int i, n, nerrors, error; if (nchanges < 0) return (EINVAL); nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) return (error); changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, M_WAITOK); if (error || (kevp->flags & EV_RECEIPT)) { if (nevents == 0) return (error); kevp->flags = EV_ERROR; kevp->data = error; (void)k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; return (0); } return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); } int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kqueue *kq; int error; error = kqueue_acquire(fp, &kq); if (error != 0) return (error); error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); kqueue_release(kq, 0); return (error); } /* * Performs a kevent() call on a temporarily created kqueue. This can be * used to perform one-shot polling, similar to poll() and select(). */ int kern_kevent_anonymous(struct thread *td, int nevents, struct kevent_copyops *k_ops) { struct kqueue kq = {}; int error; kqueue_init(&kq); kq.kq_refcnt = 1; error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); kqueue_drain(&kq, td); kqueue_destroy(&kq); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (error); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; if (sysfilt_ops[~filt].for_nolock) return sysfilt_ops[~filt].for_fop; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; if (sysfilt_ops[~filt].for_nolock) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_acquire) must be held. */ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag) { struct filterops *fops; struct file *fp; struct knote *kn, *tkn; struct knlist *knl; int error, filt, event; int haskqglobal, filedesc_unlock; if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) return (EINVAL); fp = NULL; kn = NULL; knl = NULL; error = 0; haskqglobal = 0; filedesc_unlock = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; if (kev->flags & EV_ADD) { /* Reject an invalid flag pair early */ if (kev->flags & EV_KEEPUDATA) { tkn = NULL; error = EINVAL; goto done; } /* * Prevent waiting with locks. Non-sleepable * allocation failures are handled in the loop, only * if the spare knote appears to be actually required. */ tkn = knote_alloc(mflag); } else { tkn = NULL; } findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); if (kev->ident > INT_MAX) error = EBADF; else error = fget(td, kev->ident, &cap_event_rights, &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, M_NOWAIT) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, mflag); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * If we add some intelligence about what we are doing, * we should be able to support events on ourselves. * We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } /* * Pre-lock the filedesc before the global * lock mutex, see the comment in * kqueue_close(). */ FILEDESC_XLOCK(td->td_proc->p_fd); filedesc_unlock = 1; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) { error = kqueue_expand(kq, fops, kev->ident, mflag); if (error != 0) goto done; } KQ_LOCK(kq); /* * If possible, find an existing knote to use for this kevent. */ if (kev->filter == EVFILT_PROC && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { /* This is an internal creation of a process tracking * note. Don't attempt to coalesce this with an * existing note. */ ; } else if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stabilize. */ if (kn != NULL && kn_in_flux(kn)) { KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) { FILEDESC_XUNLOCK(td->td_proc->p_fd); filedesc_unlock = 0; } kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); if (fp != NULL) { fdrop(fp, td); fp = NULL; } goto findkn; } /* * kn now contains the matching knote, or NULL if no match */ if (kn == NULL) { if (kev->flags & EV_ADD) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); kn->kn_status = KN_DETACHED; if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; kn_enter_flux(kn); error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop_detached(kn, td); goto done; } knl = kn_list_lock(kn); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ KQ_UNLOCK(kq); error = ENOENT; goto done; } } if (kev->flags & EV_DELETE) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); goto done; } if (kev->flags & EV_FORCEONESHOT) { kn->kn_flags |= EV_ONESHOT; KNOTE_ACTIVATE(kn, 1); } if ((kev->flags & EV_ENABLE) != 0) kn->kn_status &= ~KN_DISABLED; else if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; /* * The user may change some filter values after the initial EV_ADD, * but doing so will not reset any filter which has already been * triggered. */ kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); knl = kn_list_lock(kn); if ((kev->flags & EV_KEEPUDATA) == 0) kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) { fops->f_touch(kn, kev, EVENT_REGISTER); } else { kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; } done_ev_add: /* * We can get here with kn->kn_knlist == NULL. This can happen when * the initial attach event decides that the event is "completed" * already, e.g., filt_procattach() is called on a zombie process. It * will call filt_proc() which will remove it from the list, and NULL * kn_knlist. * * KN_DISABLED will be stable while the knote is in flux, so the * unlocked read will not race with an update. */ if ((kn->kn_status & KN_DISABLED) == 0) event = kn->kn_fop->f_event(kn, 0); else event = 0; KQ_LOCK(kq); if (event) kn->kn_status |= KN_ACTIVE; if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == KN_ACTIVE) knote_enqueue(kn); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) FILEDESC_XUNLOCK(td->td_proc->p_fd); if (fp != NULL) fdrop(fp, td); knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_acquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) return (EBADF); *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); return (EBADF); } kq->kq_refcnt++; KQ_UNLOCK(kq); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } void kqueue_drain_schedtask(void) { taskqueue_quiesce(taskqueue_kqueue_ctx); } static void kqueue_schedtask(struct kqueue *kq) { struct thread *td; KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; td = curthread; thread_lock(td); td->td_flags |= TDF_ASTPENDING | TDF_KQTICKLED; thread_unlock(td); } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. */ static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int mflag) { struct klist *list, *tmp_knhash, *to_free; u_long tmp_knhashmask; int error, fd, size; KQ_NOTOWNED(kq); error = 0; to_free = NULL; if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; list = malloc(size * sizeof(*list), M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = list; error = EBADF; } else if (kq->kq_knlistsize > fd) { to_free = list; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof(*list)); to_free = kq->kq_knlist; kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof(*list), (size - kq->kq_knlistsize) * sizeof(*list)); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask, (mflag & M_WAITOK) != 0 ? HASH_WAITOK : HASH_NOWAIT); if (tmp_knhash == NULL) return (ENOMEM); KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = tmp_knhash; error = EBADF; } else if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { to_free = tmp_knhash; } KQ_UNLOCK(kq); } } free(to_free, M_KQUEUE); KQ_NOTOWNED(kq); return (error); } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. * We treat KN_MARKER knotes as if they are in flux. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct knote *kn, *marker; struct knlist *knl; sbintime_t asbt, rsbt; int count, error, haskqglobal, influx, nkev, touch; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; if (maxevents < 0) { error = EINVAL; goto done_nl; } rsbt = 0; if (tsp != NULL) { if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000) { error = EINVAL; goto done_nl; } if (timespecisset(tsp)) { if (tsp->tv_sec <= INT32_MAX) { rsbt = tstosbt(*tsp); if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = 0; rsbt >>= tc_precexp; } else asbt = 0; } else asbt = -1; } else asbt = 0; marker = knote_alloc(M_WAITOK); marker->kn_status = KN_MARKER; KQ_LOCK(kq); retry: kevp = keva; if (kq->kq_count == 0) { if (asbt == -1) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", asbt, rsbt, C_ABSOLUTE); } if (error == 0) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); influx = 0; while (count) { KQ_OWNED(kq); kn = TAILQ_FIRST(&kq->kq_head); if ((kn->kn_status == KN_MARKER && kn != marker) || kn_in_flux(kn)) { if (influx) { influx = 0; KQ_FLUX_WAKEUP(kq); } kq->kq_state |= KQ_FLUXWAIT; error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); continue; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if (kn == marker) { KQ_FLUX_WAKEUP(kq); if (count == maxevents) goto retry; goto done; } KASSERT(!kn_in_flux(kn), ("knote %p is unexpectedly in flux", kn)); if ((kn->kn_flags & EV_DROP) == EV_DROP) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked it as in flux. */ knote_drop(kn, td); KQ_LOCK(kq); continue; } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked the knote as being in flux. */ *kevp = kn->kn_kevent; knote_drop(kn, td); KQ_LOCK(kq); kn = NULL; } else { kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); knl = kn_list_lock(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_SCAN); kn_leave_flux(kn); kq->kq_count--; kn_list_unlock(knl); influx = 1; continue; } touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL); if (touch) kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); else *kevp = kn->kn_kevent; KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { /* * Manually clear knotes who weren't * 'touch'ed. */ if (touch == 0 && kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } if (kn->kn_flags & EV_DISPATCH) kn->kn_status |= KN_DISABLED; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); influx = 1; } /* we are returning a copy to the user */ kevp++; nkev++; count--; if (nkev == KQ_NEVENTS) { influx = 0; KQ_UNLOCK_FLUX(kq); error = k_ops->k_copyout(k_ops->arg, keva, nkev); nkev = 0; kevp = keva; KQ_LOCK(kq); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: KQ_OWNED(kq); KQ_UNLOCK_FLUX(kq); knote_free(marker); done_nl: KQ_NOTOWNED(kq); if (nkev != 0) error = k_ops->k_copyout(k_ops->arg, keva, nkev); td->td_retval[0] = maxevents - count; return (error); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { /* * Enabling sigio causes two major problems: * 1) infinite recursion: * Synopsys: kevent is being used to track signals and have FIOASYNC * set. On receipt of a signal this will cause a kqueue to recurse * into itself over and over. Sending the sigio causes the kqueue * to become ready, which in turn posts sigio again, forever. * Solution: this can be solved by setting a flag in the kqueue that * we have a SIGIO in progress. * 2) locking problems: * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts * us above the proc and pgrp locks. * Solution: Post a signal using an async mechanism, being sure to * record a generation count in the delivery so that we do not deliver * a signal to the wrong process. * * Note, these two mechanisms are somewhat mutually exclusive! */ #if 0 struct kqueue *kq; kq = fp->f_data; switch (cmd) { case FIOASYNC: if (*(int *)data) { kq->kq_state |= KQ_ASYNC; } else { kq->kq_state &= ~KQ_ASYNC; } return (0); case FIOSETOWN: return (fsetown(*(int *)data, &kq->kq_sigio)); case FIOGETOWN: *(int *)data = fgetown(&kq->kq_sigio); return (0); } #endif return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct kqueue *kq; int revents = 0; int error; if ((error = kqueue_acquire(fp, &kq))) return POLLERR; KQ_LOCK(kq); if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); if (SEL_WAITING(&kq->kq_sel)) kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); KQ_UNLOCK(kq); return (revents); } /*ARGSUSED*/ static int -kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, - struct thread *td) +kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { bzero((void *)st, sizeof *st); /* * We no longer return kq_count because the unlocked value is useless. * If you spent all this time getting the count, why not spend your * syscall better by calling kevent? * * XXX - This is needed for libc_r. */ st->st_mode = S_IFIFO; return (0); } static void kqueue_drain(struct kqueue *kq, struct thread *td) { struct knote *kn; int i; KQ_LOCK(kq); KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, ("kqueue already closing")); kq->kq_state |= KQ_CLOSING; if (kq->kq_refcnt > 1) msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); KASSERT(knlist_empty(&kq->kq_sel.si_note), ("kqueue's knlist not empty")); for (i = 0; i < kq->kq_knlistsize; i++) { while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } if (kq->kq_knhashmask != 0) { for (i = 0; i <= kq->kq_knhashmask; i++) { while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo2", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } } if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { kq->kq_state |= KQ_TASKDRAIN; msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); } static void kqueue_destroy(struct kqueue *kq) { KASSERT(kq->kq_fdp == NULL, ("kqueue still attached to a file descriptor")); seldrain(&kq->kq_sel); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); if (kq->kq_knhash != NULL) free(kq->kq_knhash, M_KQUEUE); if (kq->kq_knlist != NULL) free(kq->kq_knlist, M_KQUEUE); funsetown(&kq->kq_sigio); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; struct filedesc *fdp; int error; int filedesc_unlock; if ((error = kqueue_acquire(fp, &kq))) return error; kqueue_drain(kq, td); /* * We could be called due to the knote_drop() doing fdrop(), * called from kqueue_register(). In this case the global * lock is owned, and filedesc sx is locked before, to not * take the sleepable lock after non-sleepable. */ fdp = kq->kq_fdp; kq->kq_fdp = NULL; if (!sx_xlocked(FILEDESC_LOCK(fdp))) { FILEDESC_XLOCK(fdp); filedesc_unlock = 1; } else filedesc_unlock = 0; TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); if (filedesc_unlock) FILEDESC_XUNLOCK(fdp); kqueue_destroy(kq); chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); crfree(kq->kq_cred); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static int kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_KQUEUE; return (0); } static void kqueue_wakeup(struct kqueue *kq) { KQ_OWNED(kq); if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { pgsigio(&kq->kq_sigio, SIGIO, 0); } } /* * Walk down a list of knotes, activating them if their event has triggered. * * There is a possibility to optimize in the case of one kq watching another. * Instead of scheduling a task to wake it up, you could pass enough state * down the chain to make up the parent kqueue. Make this code functional * first. */ void knote(struct knlist *list, long hint, int lockflags) { struct kqueue *kq; struct knote *kn, *tkn; int error; if (list == NULL) return; KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_lock(list->kl_lockarg); /* * If we unlock the list lock (and enter influx), we can * eliminate the kqueue scheduling, but this will introduce * four lock/unlock's for each knote to test. Also, marker * would be needed to keep iteration position, since filters * or other threads could remove events. */ SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { /* * Do not process the influx notes, except for * the influx coming from the kq unlock in the * kqueue_scan(). In the later case, we do * not interfere with the scan, since the code * fragment in kqueue_scan() locks the knlist, * and cannot proceed until we finished. */ KQ_UNLOCK(kq); } else if ((lockflags & KNF_NOKQLOCK) != 0) { kn_enter_flux(kn); KQ_UNLOCK(kq); error = kn->kn_fop->f_event(kn, hint); KQ_LOCK(kq); kn_leave_flux(kn); if (error) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK_FLUX(kq); } else { if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK(kq); } } if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_unlock(list->kl_lockarg); } /* * add a knote to a knlist */ void knlist_add(struct knlist *knl, struct knote *kn, int islocked) { KNL_ASSERT_LOCK(knl, islocked); KQ_NOTOWNED(kn->kn_kq); KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p was not detached", kn)); if (!islocked) knl->kl_lock(knl->kl_lockarg); SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); if (!islocked) knl->kl_unlock(knl->kl_lockarg); KQ_LOCK(kn->kn_kq); kn->kn_knlist = knl; kn->kn_status &= ~KN_DETACHED; KQ_UNLOCK(kn->kn_kq); } static void knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) { KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked")); KNL_ASSERT_LOCK(knl, knlislocked); mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED); KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) == 0, ("knote %p was already detached", kn)); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) kn_list_unlock(knl); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove knote from the specified knlist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return (SLIST_EMPTY(&knl->kl_list)); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static void knlist_mtx_assert_lock(void *arg, int what) { if (what == LA_LOCKED) mtx_assert((struct mtx *)arg, MA_OWNED); else mtx_assert((struct mtx *)arg, MA_NOTOWNED); } static void knlist_rw_rlock(void *arg) { rw_rlock((struct rwlock *)arg); } static void knlist_rw_runlock(void *arg) { rw_runlock((struct rwlock *)arg); } static void knlist_rw_assert_lock(void *arg, int what) { if (what == LA_LOCKED) rw_assert((struct rwlock *)arg, RA_LOCKED); else rw_assert((struct rwlock *)arg, RA_UNLOCKED); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), void (*kl_assert_lock)(void *, int)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_assert_lock == NULL) knl->kl_assert_lock = knlist_mtx_assert_lock; else knl->kl_assert_lock = kl_assert_lock; knl->kl_autodestroy = 0; SLIST_INIT(&knl->kl_list); } void knlist_init_mtx(struct knlist *knl, struct mtx *lock) { knlist_init(knl, lock, NULL, NULL, NULL); } struct knlist * knlist_alloc(struct mtx *lock) { struct knlist *knl; knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK); knlist_init_mtx(knl, lock); return (knl); } void knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock) { knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock, knlist_rw_assert_lock); } void knlist_destroy(struct knlist *knl) { KASSERT(KNLIST_EMPTY(knl), ("destroying knlist %p with knotes on it", knl)); } void knlist_detach(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); knl->kl_autodestroy = 1; if (knlist_empty(knl)) { knlist_destroy(knl); free(knl, M_KQUEUE); } } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". */ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl)); if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reacquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop_detached(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= EV_EOF | EV_ONESHOT; KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still in flux knotes remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn_in_flux(kn), ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * Remove all knotes referencing a specified fd must be called with FILEDESC * lock. This prevents a race where a new fd comes along and occupies the * entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_XLOCK_ASSERT(fdp); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. */ TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn_in_flux(kn)) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn_enter_flux(kn); KQ_UNLOCK(kq); influx = 1; knote_drop(kn, td); KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn)); KQ_OWNED(kq); if ((kq->kq_state & KQ_CLOSING) != 0) return (EBADF); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return (ENOMEM); list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return (ENOMEM); list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return (0); } static void knote_drop(struct knote *kn, struct thread *td) { if ((kn->kn_status & KN_DETACHED) == 0) kn->kn_fop->f_detach(kn); knote_drop_detached(kn, td); } static void knote_drop_detached(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p still attached", kn)); KQ_NOTOWNED(kq); KQ_LOCK(kq); KASSERT(kn->kn_influx == 1, ("knote_drop called on %p with influx %d", kn, kn->kn_influx)); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); static struct knote * knote_alloc(int mflag) { return (uma_zalloc(knote_zone, mflag | M_ZERO)); } static void knote_free(struct knote *kn) { uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag) { struct kqueue *kq; struct file *fp; cap_rights_t rights; int error; error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &fp); if (error != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; error = kqueue_register(kq, kev, td, mflag); kqueue_release(kq, 0); noacquire: fdrop(fp, td); return (error); } diff --git a/sys/kern/sys_eventfd.c b/sys/kern/sys_eventfd.c index 91c045e85faf..8ee8421296e5 100644 --- a/sys/kern/sys_eventfd.c +++ b/sys/kern/sys_eventfd.c @@ -1,347 +1,346 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include _Static_assert(EFD_CLOEXEC == O_CLOEXEC, "Mismatched EFD_CLOEXEC"); _Static_assert(EFD_NONBLOCK == O_NONBLOCK, "Mismatched EFD_NONBLOCK"); MALLOC_DEFINE(M_EVENTFD, "eventfd", "eventfd structures"); static fo_rdwr_t eventfd_read; static fo_rdwr_t eventfd_write; static fo_ioctl_t eventfd_ioctl; static fo_poll_t eventfd_poll; static fo_kqfilter_t eventfd_kqfilter; static fo_stat_t eventfd_stat; static fo_close_t eventfd_close; static fo_fill_kinfo_t eventfd_fill_kinfo; static struct fileops eventfdops = { .fo_read = eventfd_read, .fo_write = eventfd_write, .fo_truncate = invfo_truncate, .fo_ioctl = eventfd_ioctl, .fo_poll = eventfd_poll, .fo_kqfilter = eventfd_kqfilter, .fo_stat = eventfd_stat, .fo_close = eventfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = eventfd_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_eventfddetach(struct knote *kn); static int filt_eventfdread(struct knote *kn, long hint); static int filt_eventfdwrite(struct knote *kn, long hint); static struct filterops eventfd_rfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdread }; static struct filterops eventfd_wfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdwrite }; struct eventfd { eventfd_t efd_count; uint32_t efd_flags; struct selinfo efd_sel; struct mtx efd_lock; }; int eventfd_create_file(struct thread *td, struct file *fp, uint32_t initval, int flags) { struct eventfd *efd; int fflags; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_VALUE(initval); efd = malloc(sizeof(*efd), M_EVENTFD, M_WAITOK | M_ZERO); efd->efd_flags = flags; efd->efd_count = initval; mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); fflags = FREAD | FWRITE; if ((flags & EFD_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_EVENTFD, efd, &eventfdops); return (0); } static int eventfd_close(struct file *fp, struct thread *td) { struct eventfd *efd; efd = fp->f_data; seldrain(&efd->efd_sel); knlist_destroy(&efd->efd_sel.si_note); mtx_destroy(&efd->efd_lock); free(efd, M_EVENTFD); return (0); } static int eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = 0; efd = fp->f_data; mtx_lock(&efd->efd_lock); while (error == 0 && efd->efd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "efdrd", 0); } if (error == 0) { MPASS(efd->efd_count > 0); if ((efd->efd_flags & EFD_SEMAPHORE) != 0) { count = 1; --efd->efd_count; } else { count = efd->efd_count; efd->efd_count = 0; } KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); mtx_unlock(&efd->efd_lock); error = uiomove(&count, sizeof(eventfd_t), uio); } else mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = uiomove(&count, sizeof(eventfd_t), uio); if (error != 0) return (error); if (count == UINT64_MAX) return (EINVAL); efd = fp->f_data; mtx_lock(&efd->efd_lock); retry: if (UINT64_MAX - efd->efd_count <= count) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); /* Do not not return the number of bytes written */ uio->uio_resid += sizeof(eventfd_t); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "efdwr", 0); if (error == 0) goto retry; } if (error == 0) { MPASS(UINT64_MAX - efd->efd_count > count); efd->efd_count += count; KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); } mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct eventfd *efd; int revents; efd = fp->f_data; revents = 0; mtx_lock(&efd->efd_lock); if ((events & (POLLIN | POLLRDNORM)) != 0 && efd->efd_count > 0) revents |= events & (POLLIN | POLLRDNORM); if ((events & (POLLOUT | POLLWRNORM)) != 0 && UINT64_MAX - 1 > efd->efd_count) revents |= events & (POLLOUT | POLLWRNORM); if (revents == 0) selrecord(td, &efd->efd_sel); mtx_unlock(&efd->efd_lock); return (revents); } static int eventfd_kqfilter(struct file *fp, struct knote *kn) { struct eventfd *efd = fp->f_data; mtx_lock(&efd->efd_lock); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &eventfd_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &eventfd_wfiltops; break; default: mtx_unlock(&efd->efd_lock); return (EINVAL); } kn->kn_hook = efd; knlist_add(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); return (0); } static void filt_eventfddetach(struct knote *kn) { struct eventfd *efd = kn->kn_hook; mtx_lock(&efd->efd_lock); knlist_remove(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); } static int filt_eventfdread(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); kn->kn_data = (int64_t)efd->efd_count; ret = efd->efd_count > 0; return (ret); } static int filt_eventfdwrite(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); kn->kn_data = (int64_t)(UINT64_MAX - 1 - efd->efd_count); ret = UINT64_MAX - 1 > efd->efd_count; return (ret); } static int eventfd_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { switch (cmd) { case FIONBIO: case FIOASYNC: return (0); } return (ENOTTY); } static int -eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, - struct thread *td) +eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { bzero((void *)st, sizeof *st); st->st_mode = S_IFIFO; return (0); } static int eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct eventfd *efd = fp->f_data; kif->kf_type = KF_TYPE_EVENTFD; mtx_lock(&efd->efd_lock); kif->kf_un.kf_eventfd.kf_eventfd_value = efd->efd_count; kif->kf_un.kf_eventfd.kf_eventfd_flags = efd->efd_flags; mtx_unlock(&efd->efd_lock); return (0); } diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index ee6ccbbad322..7bd7fea28e76 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -1,1841 +1,1840 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1996 John S. Dyson * Copyright (c) 2012 Giovanni Trematerra * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, the sending process pins the underlying pages in * memory, and the receiving process copies directly from these pinned pages * in the sending process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. * * In order to limit the resource use of pipes, two sysctls exist: * * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable * address space available to us in pipe_map. This value is normally * autotuned, but may also be loader tuned. * * kern.ipc.pipekva - This read-only sysctl tracks the current amount of * memory in use by pipes. * * Based on how large pipekva is relative to maxpipekva, the following * will happen: * * 0% - 50%: * New pipes are given 16K of memory backing, pipes may dynamically * grow to as large as 64K where needed. * 50% - 75%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes may NOT grow. * 75% - 100%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes will be shrunk down to 4K whenever possible. * * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE * resize which MUST occur for reverse-direction pipes when they are * first used. * * Additional information about the current state of pipes may be obtained * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, * and kern.ipc.piperesizefail. * * Locking rules: There are two locks present here: A mutex, used via * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via * the flag, as mutexes can not persist over uiomove. The mutex * exists only to guard access to the flag, and is not in itself a * locking mechanism. Also note that there is only a single mutex for * both directions of a pipe. * * As pipelock() may have to sleep before it can acquire the flag, it * is important to reread all data after a call to pipelock(); everything * in the structure may have changed. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ #define PIPE_PEER(pipe) \ (((pipe)->pipe_type & PIPE_TYPE_NAMED) ? (pipe) : ((pipe)->pipe_peer)) /* * interfaces to the outside world */ static fo_rdwr_t pipe_read; static fo_rdwr_t pipe_write; static fo_truncate_t pipe_truncate; static fo_ioctl_t pipe_ioctl; static fo_poll_t pipe_poll; static fo_kqfilter_t pipe_kqfilter; static fo_stat_t pipe_stat; static fo_close_t pipe_close; static fo_chmod_t pipe_chmod; static fo_chown_t pipe_chown; static fo_fill_kinfo_t pipe_fill_kinfo; struct fileops pipeops = { .fo_read = pipe_read, .fo_write = pipe_write, .fo_truncate = pipe_truncate, .fo_ioctl = pipe_ioctl, .fo_poll = pipe_poll, .fo_kqfilter = pipe_kqfilter, .fo_stat = pipe_stat, .fo_close = pipe_close, .fo_chmod = pipe_chmod, .fo_chown = pipe_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = pipe_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_pipedetach(struct knote *kn); static void filt_pipedetach_notsup(struct knote *kn); static int filt_pipenotsup(struct knote *kn, long hint); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); static struct filterops pipe_nfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach_notsup, .f_event = filt_pipenotsup }; static struct filterops pipe_rfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_piperead }; static struct filterops pipe_wfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) static long amountpipekva; static int pipefragretry; static int pipeallocfail; static int piperesizefail; static int piperesizeallowed = 1; static long pipe_mindirect = PIPE_MINDIRECT; SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxpipekva, 0, "Pipe KVA limit"); SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, &amountpipekva, 0, "Pipe KVA usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, &pipeallocfail, 0, "Pipe allocation failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, &piperesizefail, 0, "Pipe resize failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, &piperesizeallowed, 0, "Pipe resizing allowed"); static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static int pipe_create(struct pipe *pipe, bool backing); static int pipe_paircreate(struct thread *td, struct pipepair **p_pp); static __inline int pipelock(struct pipe *cpipe, int catch); static __inline void pipeunlock(struct pipe *cpipe); static void pipe_timestamp(struct timespec *tsp); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); static void pipe_destroy_write_buffer(struct pipe *wpipe); static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); static void pipe_clone_write_buffer(struct pipe *wpipe); #endif static int pipespace(struct pipe *cpipe, int size); static int pipespace_new(struct pipe *cpipe, int size); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); static int pipe_zone_init(void *mem, int size, int flags); static void pipe_zone_fini(void *mem, int size); static uma_zone_t pipe_zone; static struct unrhdr64 pipeino_unr; static dev_t pipedev_ino; SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); static void pipeinit(void *dummy __unused) { pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair), pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini, UMA_ALIGN_PTR, 0); KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); new_unrhdr64(&pipeino_unr, 1); pipedev_ino = devfs_alloc_cdp_inode(); KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized")); } static int sysctl_handle_pipe_mindirect(SYSCTL_HANDLER_ARGS) { int error = 0; long tmp_pipe_mindirect = pipe_mindirect; error = sysctl_handle_long(oidp, &tmp_pipe_mindirect, arg2, req); if (error != 0 || req->newptr == NULL) return (error); /* * Don't allow pipe_mindirect to be set so low that we violate * atomicity requirements. */ if (tmp_pipe_mindirect <= PIPE_BUF) return (EINVAL); pipe_mindirect = tmp_pipe_mindirect; return (0); } SYSCTL_OID(_kern_ipc, OID_AUTO, pipe_mindirect, CTLTYPE_LONG | CTLFLAG_RW, &pipe_mindirect, 0, sysctl_handle_pipe_mindirect, "L", "Minimum write size triggering VM optimization"); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags) { struct pipepair *pp; struct pipe *rpipe, *wpipe; KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); pp = (struct pipepair *)mem; /* * We zero both pipe endpoints to make sure all the kmem pointers * are NULL, flag fields are zero'd, etc. We timestamp both * endpoints with the same time. */ rpipe = &pp->pp_rpipe; bzero(rpipe, sizeof(*rpipe)); pipe_timestamp(&rpipe->pipe_ctime); rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; wpipe = &pp->pp_wpipe; bzero(wpipe, sizeof(*wpipe)); wpipe->pipe_ctime = rpipe->pipe_ctime; wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; rpipe->pipe_peer = wpipe; rpipe->pipe_pair = pp; wpipe->pipe_peer = rpipe; wpipe->pipe_pair = pp; /* * Mark both endpoints as present; they will later get free'd * one at a time. When both are free'd, then the whole pair * is released. */ rpipe->pipe_present = PIPE_ACTIVE; wpipe->pipe_present = PIPE_ACTIVE; /* * Eventually, the MAC Framework may initialize the label * in ctor or init, but for now we do it elswhere to avoid * blocking in ctor or init. */ pp->pp_label = NULL; return (0); } static int pipe_zone_init(void *mem, int size, int flags) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); pp = (struct pipepair *)mem; mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW); return (0); } static void pipe_zone_fini(void *mem, int size) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); pp = (struct pipepair *)mem; mtx_destroy(&pp->pp_mtx); } static int pipe_paircreate(struct thread *td, struct pipepair **p_pp) { struct pipepair *pp; struct pipe *rpipe, *wpipe; int error; *p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK); #ifdef MAC /* * The MAC label is shared between the connected endpoints. As a * result mac_pipe_init() and mac_pipe_create() are called once * for the pair, and not on the endpoints. */ mac_pipe_init(pp); mac_pipe_create(td->td_ucred, pp); #endif rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe)); knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe)); /* * Only the forward direction pipe is backed by big buffer by * default. */ error = pipe_create(rpipe, true); if (error != 0) goto fail; error = pipe_create(wpipe, false); if (error != 0) { /* * This cleanup leaves the pipe inode number for rpipe * still allocated, but never used. We do not free * inode numbers for opened pipes, which is required * for correctness because numbers must be unique. * But also it avoids any memory use by the unr * allocator, so stashing away the transient inode * number is reasonable. */ pipe_free_kmem(rpipe); goto fail; } rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; return (0); fail: knlist_destroy(&rpipe->pipe_sel.si_note); knlist_destroy(&wpipe->pipe_sel.si_note); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, pp); return (error); } int pipe_named_ctor(struct pipe **ppipe, struct thread *td) { struct pipepair *pp; int error; error = pipe_paircreate(td, &pp); if (error != 0) return (error); pp->pp_rpipe.pipe_type |= PIPE_TYPE_NAMED; *ppipe = &pp->pp_rpipe; return (0); } void pipe_dtor(struct pipe *dpipe) { struct pipe *peer; peer = (dpipe->pipe_type & PIPE_TYPE_NAMED) != 0 ? dpipe->pipe_peer : NULL; funsetown(&dpipe->pipe_sigio); pipeclose(dpipe); if (peer != NULL) { funsetown(&peer->pipe_sigio); pipeclose(peer); } } /* * Get a timestamp. * * This used to be vfs_timestamp but the higher precision is unnecessary and * can very negatively affect performance in virtualized environments (e.g., on * vms running on amd64 when using the rdtscp instruction). */ static void pipe_timestamp(struct timespec *tsp) { getnanotime(tsp); } /* * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let * the zone pick up the pieces via pipeclose(). */ int kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1, struct filecaps *fcaps2) { struct file *rf, *wf; struct pipe *rpipe, *wpipe; struct pipepair *pp; int fd, fflags, error; error = pipe_paircreate(td, &pp); if (error != 0) return (error); rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; error = falloc_caps(td, &rf, &fd, flags, fcaps1); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } /* An extra reference on `rf' has been held for us by falloc_caps(). */ fildes[0] = fd; fflags = FREAD | FWRITE; if ((flags & O_NONBLOCK) != 0) fflags |= FNONBLOCK; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops); error = falloc_caps(td, &wf, &fd, flags, fcaps2); if (error) { fdclose(td, rf, fildes[0]); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } /* An extra reference on `wf' has been held for us by falloc_caps(). */ finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops); fdrop(wf, td); fildes[1] = fd; fdrop(rf, td); return (0); } #ifdef COMPAT_FREEBSD10 /* ARGSUSED */ int freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused) { int error; int fildes[2]; error = kern_pipe(td, fildes, 0, NULL, NULL); if (error) return (error); td->td_retval[0] = fildes[0]; td->td_retval[1] = fildes[1]; return (0); } #endif int sys_pipe2(struct thread *td, struct pipe2_args *uap) { int error, fildes[2]; if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK)) return (EINVAL); error = kern_pipe(td, fildes, uap->flags, NULL, NULL); if (error) return (error); error = copyout(fildes, uap->fildes, 2 * sizeof(int)); if (error) { (void)kern_close(td, fildes[0]); (void)kern_close(td, fildes[1]); } return (error); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. */ static int pipespace_new(struct pipe *cpipe, int size) { caddr_t buffer; int error, cnt, firstseg; static int curfail = 0; static struct timeval lastfail; KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), ("pipespace: resize of direct writes not allowed")); retry: cnt = cpipe->pipe_buffer.cnt; if (cnt > size) size = cnt; size = round_page(size); buffer = (caddr_t) vm_map_min(pipe_map); error = vm_map_find(pipe_map, NULL, 0, (vm_offset_t *)&buffer, size, 0, VMFS_ANY_SPACE, VM_PROT_RW, VM_PROT_RW, 0); if (error != KERN_SUCCESS) { if (cpipe->pipe_buffer.buffer == NULL && size > SMALL_PIPE_SIZE) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } if (cpipe->pipe_buffer.buffer == NULL) { pipeallocfail++; if (ppsratecheck(&lastfail, &curfail, 1)) printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); } else { piperesizefail++; } return (ENOMEM); } /* copy data, then free old resources if we're resizing */ if (cnt > 0) { if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, firstseg); if ((cnt - firstseg) > 0) bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], cpipe->pipe_buffer.in); } else { bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, cnt); } } pipe_free_kmem(cpipe); cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = cnt; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = cnt; atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size); return (0); } /* * Wrapper for pipespace_new() that performs locking assertions. */ static int pipespace(struct pipe *cpipe, int size) { KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipespace")); return (pipespace_new(cpipe, size)); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(struct pipe *cpipe, int catch) { int error, prio; PIPE_LOCK_ASSERT(cpipe, MA_OWNED); prio = PRIBIO; if (catch) prio |= PCATCH; while (cpipe->pipe_state & PIPE_LOCKFL) { KASSERT(cpipe->pipe_waiters >= 0, ("%s: bad waiter count %d", __func__, cpipe->pipe_waiters)); cpipe->pipe_waiters++; error = msleep(cpipe, PIPE_MTX(cpipe), prio, "pipelk", 0); cpipe->pipe_waiters--; if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(struct pipe *cpipe) { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipeunlock")); KASSERT(cpipe->pipe_waiters >= 0, ("%s: bad waiter count %d", __func__, cpipe->pipe_waiters)); cpipe->pipe_state &= ~PIPE_LOCKFL; if (cpipe->pipe_waiters > 0) { wakeup_one(cpipe); } } void pipeselwakeup(struct pipe *cpipe) { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { selwakeuppri(&cpipe->pipe_sel, PSOCK); if (!SEL_WAITING(&cpipe->pipe_sel)) cpipe->pipe_state &= ~PIPE_SEL; } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); } /* * Initialize and allocate VM and memory for pipe. The structure * will start out zero'd from the ctor, so we just manage the kmem. */ static int pipe_create(struct pipe *pipe, bool large_backing) { int error; error = pipespace_new(pipe, !large_backing || amountpipekva > maxpipekva / 2 ? SMALL_PIPE_SIZE : PIPE_SIZE); if (error == 0) pipe->pipe_ino = alloc_unr64(&pipeino_unr); return (error); } /* ARGSUSED */ static int pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct pipe *rpipe; int error; int nread = 0; int size; rpipe = fp->f_data; PIPE_LOCK(rpipe); ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; #ifdef MAC error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (amountpipekva > (3 * maxpipekva) / 4) { if ((rpipe->pipe_state & PIPE_DIRECTW) == 0 && rpipe->pipe_buffer.size > SMALL_PIPE_SIZE && rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE && piperesizeallowed == 1) { PIPE_UNLOCK(rpipe); pipespace(rpipe, SMALL_PIPE_SIZE); PIPE_LOCK(rpipe); } } while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > uio->uio_resid) size = uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); PIPE_LOCK(rpipe); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_pages.cnt) != 0) { if (size > uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_pages.ms, rpipe->pipe_pages.pos, size, uio); PIPE_LOCK(rpipe); if (error) break; nread += size; rpipe->pipe_pages.pos += size; rpipe->pipe_pages.cnt -= size; if (rpipe->pipe_pages.cnt == 0) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. * We will either break out with an error or we will * sleep and relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; if ((error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } #ifdef MAC locked_error: #endif pipeunlock(rpipe); /* XXX: should probably do this before getting any locks. */ if (error == 0) pipe_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } /* * Only wake up writers if there was actually something read. * Otherwise, when calling read(2) at EOF, a spurious wakeup occurs. */ if (nread > 0 && rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF) pipeselwakeup(rpipe); PIPE_UNLOCK(rpipe); if (nread > 0) td->td_ru.ru_msgrcv++; return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. * This is similar to a physical write operation. */ static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio) { u_int size; int i; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, ("%s: PIPE_DIRECTW set on %p", __func__, wpipe)); KASSERT(wpipe->pipe_pages.cnt == 0, ("%s: pipe map for %p contains residual data", __func__, wpipe)); if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; else size = uio->uio_iov->iov_len; wpipe->pipe_state |= PIPE_DIRECTW; PIPE_UNLOCK(wpipe); i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, wpipe->pipe_pages.ms, PIPENPAGES); PIPE_LOCK(wpipe); if (i < 0) { wpipe->pipe_state &= ~PIPE_DIRECTW; return (EFAULT); } wpipe->pipe_pages.npages = i; wpipe->pipe_pages.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_pages.cnt = size; uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) uio->uio_iov++; uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * Unwire the process buffer. */ static void pipe_destroy_write_buffer(struct pipe *wpipe) { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); KASSERT(wpipe->pipe_pages.cnt == 0, ("%s: pipe map for %p contains residual data", __func__, wpipe)); wpipe->pipe_state &= ~PIPE_DIRECTW; vm_page_unhold_pages(wpipe->pipe_pages.ms, wpipe->pipe_pages.npages); wpipe->pipe_pages.npages = 0; } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(struct pipe *wpipe) { struct uio uio; struct iovec iov; int size; int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); size = wpipe->pipe_pages.cnt; pos = wpipe->pipe_pages.pos; wpipe->pipe_pages.cnt = 0; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = size; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; uiomove_fromphys(wpipe->pipe_pages.ms, pos, size, &uio); PIPE_LOCK(wpipe); pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. */ static int pipe_direct_write(struct pipe *wpipe, struct uio *uio) { int error; retry: PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if ((wpipe->pipe_state & PIPE_EOF) != 0) { error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdww", 0); pipelock(wpipe, 0); if (error != 0) goto error1; goto retry; } if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwc", 0); pipelock(wpipe, 0); if (error != 0) goto error1; goto retry; } error = pipe_build_write_buffer(wpipe, uio); if (error) { goto error1; } while (wpipe->pipe_pages.cnt != 0 && (wpipe->pipe_state & PIPE_EOF) == 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); if (error != 0) break; } if ((wpipe->pipe_state & PIPE_EOF) != 0) { wpipe->pipe_pages.cnt = 0; pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); error = EPIPE; } else if (error == EINTR || error == ERESTART) { pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, ("pipe %p leaked PIPE_DIRECTW", wpipe)); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct pipe *wpipe, *rpipe; ssize_t orig_resid; int desiredsize, error; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); error = pipelock(wpipe, 1); if (error) { PIPE_UNLOCK(rpipe); return (error); } /* * detect loss of pipe read side, issue SIGPIPE if lost. */ if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (EPIPE); } #ifdef MAC error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); if (error) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } #endif ++wpipe->pipe_busy; /* Choose a larger size if it's advantageous */ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { if (piperesizeallowed != 1) break; if (amountpipekva > maxpipekva / 2) break; if (desiredsize == BIG_PIPE_SIZE) break; desiredsize = desiredsize * 2; } /* Choose a smaller size if we're in a OOM situation */ if (amountpipekva > (3 * maxpipekva) / 4 && wpipe->pipe_buffer.size > SMALL_PIPE_SIZE && wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE && piperesizeallowed == 1) desiredsize = SMALL_PIPE_SIZE; /* Resize if the above determined that a new size was necessary */ if (desiredsize != wpipe->pipe_buffer.size && (wpipe->pipe_state & PIPE_DIRECTW) == 0) { PIPE_UNLOCK(wpipe); pipespace(wpipe, desiredsize); PIPE_LOCK(wpipe); } MPASS(wpipe->pipe_buffer.size != 0); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ if (uio->uio_segflg == UIO_USERSPACE && uio->uio_iov->iov_len >= pipe_mindirect && wpipe->pipe_buffer.size >= pipe_mindirect && (fp->f_flag & FNONBLOCK) == 0) { error = pipe_direct_write(wpipe, uio); if (error != 0) break; continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. */ if (wpipe->pipe_pages.cnt != 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipbww", 0); pipelock(wpipe, 0); if (error != 0) break; continue; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. */ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); PIPE_LOCK(rpipe); if (error == 0 && segsize < size) { KASSERT(wpipe->pipe_buffer.in + segsize == wpipe->pipe_buffer.size, ("Pipe buffer wraparound disappeared")); /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. */ PIPE_UNLOCK(rpipe); error = uiomove( &wpipe->pipe_buffer.buffer[0], size - segsize, uio); PIPE_LOCK(rpipe); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { KASSERT(wpipe->pipe_buffer.in == size - segsize + wpipe->pipe_buffer.size, ("Expected wraparound bad")); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; KASSERT(wpipe->pipe_buffer.cnt <= wpipe->pipe_buffer.size, ("Pipe buffer overflow")); } if (error != 0) break; continue; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipewr", 0); pipelock(wpipe, 0); if (error != 0) break; continue; } } --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if any byte was written. * EINTR and other interrupts are handled by generic I/O layer. * Do not pretend that I/O succeeded for obvious user error * like EFAULT. */ if (uio->uio_resid != orig_resid && error == EPIPE) error = 0; if (error == 0) pipe_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); pipeunlock(wpipe); PIPE_UNLOCK(rpipe); if (uio->uio_resid != orig_resid) td->td_ru.ru_msgsnd++; return (error); } /* ARGSUSED */ static int pipe_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vnops.fo_truncate(fp, length, active_cred, td); else error = invfo_truncate(fp, length, active_cred, td); return (error); } /* * we implement a very minimal set of ioctls for compatibility with sockets. */ static int pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct pipe *mpipe = fp->f_data; int error; PIPE_LOCK(mpipe); #ifdef MAC error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); if (error) { PIPE_UNLOCK(mpipe); return (error); } #endif error = 0; switch (cmd) { case FIONBIO: break; case FIOASYNC: if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } break; case FIONREAD: if (!(fp->f_flag & FREAD)) { *(int *)data = 0; PIPE_UNLOCK(mpipe); return (0); } if (mpipe->pipe_pages.cnt != 0) *(int *)data = mpipe->pipe_pages.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; break; case FIOSETOWN: PIPE_UNLOCK(mpipe); error = fsetown(*(int *)data, &mpipe->pipe_sigio); goto out_unlocked; case FIOGETOWN: *(int *)data = fgetown(&mpipe->pipe_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: PIPE_UNLOCK(mpipe); error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); goto out_unlocked; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&mpipe->pipe_sigio); break; default: error = ENOTTY; break; } PIPE_UNLOCK(mpipe); out_unlocked: return (error); } static int pipe_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct pipe *rpipe; struct pipe *wpipe; int levents, revents; #ifdef MAC int error; #endif revents = 0; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); #ifdef MAC error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) if (rpipe->pipe_pages.cnt > 0 || rpipe->pipe_buffer.cnt > 0) revents |= events & (POLLIN | POLLRDNORM); if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF) || ((wpipe->pipe_state & PIPE_DIRECTW) == 0 && ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF || wpipe->pipe_buffer.size == 0))) revents |= events & (POLLOUT | POLLWRNORM); levents = events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); if (rpipe->pipe_type & PIPE_TYPE_NAMED && fp->f_flag & FREAD && levents && fp->f_pipegen == rpipe->pipe_wgen) events |= POLLINIGNEOF; if ((events & POLLINIGNEOF) == 0) { if (rpipe->pipe_state & PIPE_EOF) { if (fp->f_flag & FREAD) revents |= (events & (POLLIN | POLLRDNORM)); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; } } if (revents == 0) { /* * Add ourselves regardless of eventmask as we have to return * POLLHUP even if it was not asked for. */ if ((fp->f_flag & FREAD) != 0) { selrecord(td, &rpipe->pipe_sel); if (SEL_WAITING(&rpipe->pipe_sel)) rpipe->pipe_state |= PIPE_SEL; } if ((fp->f_flag & FWRITE) != 0 && wpipe->pipe_present == PIPE_ACTIVE) { selrecord(td, &wpipe->pipe_sel); if (SEL_WAITING(&wpipe->pipe_sel)) wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC locked_error: #endif PIPE_UNLOCK(rpipe); return (revents); } /* * We shouldn't need locks here as we're doing a read and this should * be a natural race. */ static int -pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred, - struct thread *td) +pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred) { struct pipe *pipe; #ifdef MAC int error; #endif pipe = fp->f_data; #ifdef MAC if (mac_pipe_check_stat_enabled()) { PIPE_LOCK(pipe); error = mac_pipe_check_stat(active_cred, pipe->pipe_pair); PIPE_UNLOCK(pipe); if (error) { return (error); } } #endif /* For named pipes ask the underlying filesystem. */ if (pipe->pipe_type & PIPE_TYPE_NAMED) { - return (vnops.fo_stat(fp, ub, active_cred, td)); + return (vnops.fo_stat(fp, ub, active_cred)); } bzero(ub, sizeof(*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; if (pipe->pipe_pages.cnt != 0) ub->st_size = pipe->pipe_pages.cnt; else ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = howmany(ub->st_size, ub->st_blksize); ub->st_atim = pipe->pipe_atime; ub->st_mtim = pipe->pipe_mtime; ub->st_ctim = pipe->pipe_ctime; ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; ub->st_dev = pipedev_ino; ub->st_ino = pipe->pipe_ino; /* * Left as 0: st_nlink, st_rdev, st_flags, st_gen. */ return (0); } /* ARGSUSED */ static int pipe_close(struct file *fp, struct thread *td) { if (fp->f_vnode != NULL) return vnops.fo_close(fp, td); fp->f_ops = &badfileops; pipe_dtor(fp->f_data); fp->f_data = NULL; return (0); } static int pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vn_chmod(fp, mode, active_cred, td); else error = invfo_chmod(fp, mode, active_cred, td); return (error); } static int pipe_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vn_chown(fp, uid, gid, active_cred, td); else error = invfo_chown(fp, uid, gid, active_cred, td); return (error); } static int pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct pipe *pi; if (fp->f_type == DTYPE_FIFO) return (vn_fill_kinfo(fp, kif, fdp)); kif->kf_type = KF_TYPE_PIPE; pi = fp->f_data; kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi; kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer; kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt; return (0); } static void pipe_free_kmem(struct pipe *cpipe) { KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipe_free_kmem: pipe mutex locked")); if (cpipe->pipe_buffer.buffer != NULL) { atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); vm_map_remove(pipe_map, (vm_offset_t)cpipe->pipe_buffer.buffer, (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; } #ifndef PIPE_NODIRECT { cpipe->pipe_pages.cnt = 0; cpipe->pipe_pages.pos = 0; cpipe->pipe_pages.npages = 0; } #endif } /* * shutdown the pipe */ static void pipeclose(struct pipe *cpipe) { struct pipepair *pp; struct pipe *ppipe; KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); PIPE_LOCK(cpipe); pipelock(cpipe, 0); pp = cpipe->pipe_pair; /* * If the other side is blocked, wake it up saying that * we want to close it down. */ cpipe->pipe_state |= PIPE_EOF; while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT; pipeunlock(cpipe); msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); pipelock(cpipe, 0); } pipeselwakeup(cpipe); /* * Disconnect from peer, if any. */ ppipe = cpipe->pipe_peer; if (ppipe->pipe_present == PIPE_ACTIVE) { ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); pipeselwakeup(ppipe); } /* * Mark this endpoint as free. Release kmem resources. We * don't mark this endpoint as unused until we've finished * doing that, or the pipe might disappear out from under * us. */ PIPE_UNLOCK(cpipe); pipe_free_kmem(cpipe); PIPE_LOCK(cpipe); cpipe->pipe_present = PIPE_CLOSING; pipeunlock(cpipe); /* * knlist_clear() may sleep dropping the PIPE_MTX. Set the * PIPE_FINALIZED, that allows other end to free the * pipe_pair, only after the knotes are completely dismantled. */ knlist_clear(&cpipe->pipe_sel.si_note, 1); cpipe->pipe_present = PIPE_FINALIZED; seldrain(&cpipe->pipe_sel); knlist_destroy(&cpipe->pipe_sel.si_note); /* * If both endpoints are now closed, release the memory for the * pipe pair. If not, unlock. */ if (ppipe->pipe_present == PIPE_FINALIZED) { PIPE_UNLOCK(cpipe); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, cpipe->pipe_pair); } else PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int pipe_kqfilter(struct file *fp, struct knote *kn) { struct pipe *cpipe; /* * If a filter is requested that is not supported by this file * descriptor, don't return an error, but also don't ever generate an * event. */ if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) { kn->kn_fop = &pipe_nfiltops; return (0); } if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) { kn->kn_fop = &pipe_nfiltops; return (0); } cpipe = fp->f_data; PIPE_LOCK(cpipe); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { /* other end of pipe has been closed */ PIPE_UNLOCK(cpipe); return (EPIPE); } cpipe = PIPE_PEER(cpipe); break; default: PIPE_UNLOCK(cpipe); return (EINVAL); } kn->kn_hook = cpipe; knlist_add(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); return (0); } static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = kn->kn_hook; PIPE_LOCK(cpipe); knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int filt_piperead(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct pipe *rpipe = kn->kn_hook; PIPE_LOCK_ASSERT(rpipe, MA_OWNED); kn->kn_data = rpipe->pipe_buffer.cnt; if (kn->kn_data == 0) kn->kn_data = rpipe->pipe_pages.cnt; if ((rpipe->pipe_state & PIPE_EOF) != 0 && ((rpipe->pipe_type & PIPE_TYPE_NAMED) == 0 || fp->f_pipegen != rpipe->pipe_wgen)) { kn->kn_flags |= EV_EOF; return (1); } kn->kn_flags &= ~EV_EOF; return (kn->kn_data > 0); } /*ARGSUSED*/ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *wpipe = kn->kn_hook; /* * If this end of the pipe is closed, the knote was removed from the * knlist and the list lock (i.e., the pipe lock) is therefore not held. */ if (wpipe->pipe_present == PIPE_ACTIVE || (wpipe->pipe_type & PIPE_TYPE_NAMED) != 0) { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if (wpipe->pipe_state & PIPE_DIRECTW) { kn->kn_data = 0; } else if (wpipe->pipe_buffer.size > 0) { kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; } else { kn->kn_data = PIPE_BUF; } } if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_flags |= EV_EOF; return (1); } kn->kn_flags &= ~EV_EOF; return (kn->kn_data >= PIPE_BUF); } static void filt_pipedetach_notsup(struct knote *kn) { } static int filt_pipenotsup(struct knote *kn, long hint) { return (0); } diff --git a/sys/kern/sys_procdesc.c b/sys/kern/sys_procdesc.c index 95a389c582b0..ff1a1502aa71 100644 --- a/sys/kern/sys_procdesc.c +++ b/sys/kern/sys_procdesc.c @@ -1,560 +1,559 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009, 2016 Robert N. M. Watson * All rights reserved. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * FreeBSD process descriptor facility. * * Some processes are represented by a file descriptor, which will be used in * preference to signaling and pids for the purposes of process management, * and is, in effect, a form of capability. When a process descriptor is * used with a process, it ceases to be visible to certain traditional UNIX * process facilities, such as waitpid(2). * * Some semantics: * * - At most one process descriptor will exist for any process, although * references to that descriptor may be held from many processes (or even * be in flight between processes over a local domain socket). * - Last close on the process descriptor will terminate the process using * SIGKILL and reparent it to init so that there's a process to reap it * when it's done exiting. * - If the process exits before the descriptor is closed, it will not * generate SIGCHLD on termination, or be picked up by waitpid(). * - The pdkill(2) system call may be used to deliver a signal to the process * using its process descriptor. * - The pdwait4(2) system call may be used to block (or not) on a process * descriptor to collect termination information. * * Open questions: * * - Will we want to add a pidtoprocdesc(2) system call to allow process * descriptors to be created for processes without pdfork(2)? */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(process_descriptors, "Process Descriptors"); MALLOC_DEFINE(M_PROCDESC, "procdesc", "process descriptors"); static fo_poll_t procdesc_poll; static fo_kqfilter_t procdesc_kqfilter; static fo_stat_t procdesc_stat; static fo_close_t procdesc_close; static fo_fill_kinfo_t procdesc_fill_kinfo; static struct fileops procdesc_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = procdesc_poll, .fo_kqfilter = procdesc_kqfilter, .fo_stat = procdesc_stat, .fo_close = procdesc_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = procdesc_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; /* * Return a locked process given a process descriptor, or ESRCH if it has * died. */ int procdesc_find(struct thread *td, int fd, cap_rights_t *rightsp, struct proc **p) { struct procdesc *pd; struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { *p = pd->pd_proc; PROC_LOCK(*p); } else error = ESRCH; sx_sunlock(&proctree_lock); out: fdrop(fp, td); return (error); } /* * Function to be used by procstat(1) sysctls when returning procdesc * information. */ pid_t procdesc_pid(struct file *fp_procdesc) { struct procdesc *pd; KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC, ("procdesc_pid: !procdesc")); pd = fp_procdesc->f_data; return (pd->pd_pid); } /* * Retrieve the PID associated with a process descriptor. */ int kern_pdgetpid(struct thread *td, int fd, cap_rights_t *rightsp, pid_t *pidp) { struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } *pidp = procdesc_pid(fp); out: fdrop(fp, td); return (error); } /* * System call to return the pid of a process given its process descriptor. */ int sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap) { pid_t pid; int error; AUDIT_ARG_FD(uap->fd); error = kern_pdgetpid(td, uap->fd, &cap_pdgetpid_rights, &pid); if (error == 0) error = copyout(&pid, uap->pidp, sizeof(pid)); return (error); } /* * When a new process is forked by pdfork(), a file descriptor is allocated * by the fork code first, then the process is forked, and then we get a * chance to set up the process descriptor. Failure is not permitted at this * point, so procdesc_new() must succeed. */ void procdesc_new(struct proc *p, int flags) { struct procdesc *pd; pd = malloc(sizeof(*pd), M_PROCDESC, M_WAITOK | M_ZERO); pd->pd_proc = p; pd->pd_pid = p->p_pid; p->p_procdesc = pd; pd->pd_flags = 0; if (flags & PD_DAEMON) pd->pd_flags |= PDF_DAEMON; PROCDESC_LOCK_INIT(pd); knlist_init_mtx(&pd->pd_selinfo.si_note, &pd->pd_lock); /* * Process descriptors start out with two references: one from their * struct file, and the other from their struct proc. */ refcount_init(&pd->pd_refcount, 2); } /* * Create a new process decriptor for the process that refers to it. */ int procdesc_falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { int fflags; fflags = 0; if (flags & PD_CLOEXEC) fflags = O_CLOEXEC; return (falloc_caps(td, resultfp, resultfd, fflags, fcaps)); } /* * Initialize a file with a process descriptor. */ void procdesc_finit(struct procdesc *pdp, struct file *fp) { finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops); } static void procdesc_free(struct procdesc *pd) { /* * When the last reference is released, we assert that the descriptor * has been closed, but not that the process has exited, as we will * detach the descriptor before the process dies if the descript is * closed, as we can't wait synchronously. */ if (refcount_release(&pd->pd_refcount)) { KASSERT(pd->pd_proc == NULL, ("procdesc_free: pd_proc != NULL")); KASSERT((pd->pd_flags & PDF_CLOSED), ("procdesc_free: !PDF_CLOSED")); knlist_destroy(&pd->pd_selinfo.si_note); PROCDESC_LOCK_DESTROY(pd); free(pd, M_PROCDESC); } } /* * procdesc_exit() - notify a process descriptor that its process is exiting. * We use the proctree_lock to ensure that process exit either happens * strictly before or strictly after a concurrent call to procdesc_close(). */ int procdesc_exit(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL")); pd = p->p_procdesc; PROCDESC_LOCK(pd); KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == p->p_reaper, ("procdesc_exit: closed && parent not reaper")); pd->pd_flags |= PDF_EXITED; pd->pd_xstat = KW_EXITCODE(p->p_xexit, p->p_xsig); /* * If the process descriptor has been closed, then we have nothing * to do; return 1 so that init will get SIGCHLD and do the reaping. * Clean up the procdesc now rather than letting it happen during * that reap. */ if (pd->pd_flags & PDF_CLOSED) { PROCDESC_UNLOCK(pd); pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); return (1); } if (pd->pd_flags & PDF_SELECTED) { pd->pd_flags &= ~PDF_SELECTED; selwakeup(&pd->pd_selinfo); } KNOTE_LOCKED(&pd->pd_selinfo.si_note, NOTE_EXIT); PROCDESC_UNLOCK(pd); return (0); } /* * When a process descriptor is reaped, perhaps as a result of close() or * pdwait4(), release the process's reference on the process descriptor. */ void procdesc_reap(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL")); pd = p->p_procdesc; pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); } /* * procdesc_close() - last close on a process descriptor. If the process is * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let * its reaper clean up the mess; if not, we have to clean up the zombie * ourselves. */ static int procdesc_close(struct file *fp, struct thread *td) { struct procdesc *pd; struct proc *p; KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc")); pd = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; sx_xlock(&proctree_lock); PROCDESC_LOCK(pd); pd->pd_flags |= PDF_CLOSED; PROCDESC_UNLOCK(pd); p = pd->pd_proc; if (p == NULL) { /* * This is the case where process' exit status was already * collected and procdesc_reap() was already called. */ sx_xunlock(&proctree_lock); } else { PROC_LOCK(p); AUDIT_ARG_PROCESS(p); if (p->p_state == PRS_ZOMBIE) { /* * If the process is already dead and just awaiting * reaping, do that now. This will release the * process's reference to the process descriptor when it * calls back into procdesc_reap(). */ proc_reap(curthread, p, NULL, 0); } else { /* * If the process is not yet dead, we need to kill it, * but we can't wait around synchronously for it to go * away, as that path leads to madness (and deadlocks). * First, detach the process from its descriptor so that * its exit status will be reported normally. */ pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); /* * Next, reparent it to its reaper (usually init(8)) so * that there's someone to pick up the pieces; finally, * terminate with prejudice. */ p->p_sigparent = SIGCHLD; if ((p->p_flag & P_TRACED) == 0) { proc_reparent(p, p->p_reaper, true); } else { proc_clear_orphan(p); p->p_oppid = p->p_reaper->p_pid; proc_add_orphan(p, p->p_reaper); } if ((pd->pd_flags & PDF_DAEMON) == 0) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); sx_xunlock(&proctree_lock); } } /* * Release the file descriptor's reference on the process descriptor. */ procdesc_free(pd); return (0); } static int procdesc_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct procdesc *pd; int revents; revents = 0; pd = fp->f_data; PROCDESC_LOCK(pd); if (pd->pd_flags & PDF_EXITED) revents |= POLLHUP; if (revents == 0) { selrecord(td, &pd->pd_selinfo); pd->pd_flags |= PDF_SELECTED; } PROCDESC_UNLOCK(pd); return (revents); } static void procdesc_kqops_detach(struct knote *kn) { struct procdesc *pd; pd = kn->kn_fp->f_data; knlist_remove(&pd->pd_selinfo.si_note, kn, 0); } static int procdesc_kqops_event(struct knote *kn, long hint) { struct procdesc *pd; u_int event; pd = kn->kn_fp->f_data; if (hint == 0) { /* * Initial test after registration. Generate a NOTE_EXIT in * case the process already terminated before registration. */ event = pd->pd_flags & PDF_EXITED ? NOTE_EXIT : 0; } else { /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; } /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = pd->pd_xstat; if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } static struct filterops procdesc_kqops = { .f_isfd = 1, .f_detach = procdesc_kqops_detach, .f_event = procdesc_kqops_event, }; static int procdesc_kqfilter(struct file *fp, struct knote *kn) { struct procdesc *pd; pd = fp->f_data; switch (kn->kn_filter) { case EVFILT_PROCDESC: kn->kn_fop = &procdesc_kqops; kn->kn_flags |= EV_CLEAR; knlist_add(&pd->pd_selinfo.si_note, kn, 0); return (0); default: return (EINVAL); } } static int -procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, - struct thread *td) +procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct procdesc *pd; struct timeval pstart, boottime; /* * XXXRW: Perhaps we should cache some more information from the * process so that we can return it reliably here even after it has * died. For example, caching its credential data. */ bzero(sb, sizeof(*sb)); pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { PROC_LOCK(pd->pd_proc); AUDIT_ARG_PROCESS(pd->pd_proc); /* Set birth and [acm] times to process start time. */ pstart = pd->pd_proc->p_stats->p_start; getboottime(&boottime); timevaladd(&pstart, &boottime); TIMEVAL_TO_TIMESPEC(&pstart, &sb->st_birthtim); sb->st_atim = sb->st_birthtim; sb->st_ctim = sb->st_birthtim; sb->st_mtim = sb->st_birthtim; if (pd->pd_proc->p_state != PRS_ZOMBIE) sb->st_mode = S_IFREG | S_IRWXU; else sb->st_mode = S_IFREG; sb->st_uid = pd->pd_proc->p_ucred->cr_ruid; sb->st_gid = pd->pd_proc->p_ucred->cr_rgid; PROC_UNLOCK(pd->pd_proc); } else sb->st_mode = S_IFREG; sx_sunlock(&proctree_lock); return (0); } static int procdesc_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct procdesc *pdp; kif->kf_type = KF_TYPE_PROCDESC; pdp = fp->f_data; kif->kf_un.kf_proc.kf_pid = pdp->pd_pid; return (0); } diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c index 8cf703ab8ebd..d4200e5618d2 100644 --- a/sys/kern/sys_socket.c +++ b/sys/kern/sys_socket.c @@ -1,845 +1,844 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern_ipc, OID_AUTO, aio, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "socket AIO stats"); static int empty_results; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_results, CTLFLAG_RD, &empty_results, 0, "socket operation returned EAGAIN"); static int empty_retries; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_retries, CTLFLAG_RD, &empty_retries, 0, "socket operation retries"); static fo_rdwr_t soo_read; static fo_rdwr_t soo_write; static fo_ioctl_t soo_ioctl; static fo_poll_t soo_poll; extern fo_kqfilter_t soo_kqfilter; static fo_stat_t soo_stat; static fo_close_t soo_close; static fo_fill_kinfo_t soo_fill_kinfo; static fo_aio_queue_t soo_aio_queue; static void soo_aio_cancel(struct kaiocb *job); struct fileops socketops = { .fo_read = soo_read, .fo_write = soo_write, .fo_truncate = invfo_truncate, .fo_ioctl = soo_ioctl, .fo_poll = soo_poll, .fo_kqfilter = soo_kqfilter, .fo_stat = soo_stat, .fo_close = soo_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = soo_fill_kinfo, .fo_aio_queue = soo_aio_queue, .fo_flags = DFLAG_PASSABLE }; static int soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_receive(active_cred, so); if (error) return (error); #endif error = soreceive(so, 0, uio, 0, 0, 0); return (error); } static int soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_send(active_cred, so); if (error) return (error); #endif error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td); if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(uio->uio_td->td_proc); tdsignal(uio->uio_td, SIGPIPE); PROC_UNLOCK(uio->uio_td->td_proc); } return (error); } static int soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; int error = 0; switch (cmd) { case FIONBIO: SOCK_LOCK(so); if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; SOCK_UNLOCK(so); break; case FIOASYNC: if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; if (SOLISTENING(so)) { so->sol_sbrcv_flags |= SB_ASYNC; so->sol_sbsnd_flags |= SB_ASYNC; } else { SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_ASYNC; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_ASYNC; SOCKBUF_UNLOCK(&so->so_snd); } SOCK_UNLOCK(so); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; if (SOLISTENING(so)) { so->sol_sbrcv_flags &= ~SB_ASYNC; so->sol_sbsnd_flags &= ~SB_ASYNC; } else { SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_ASYNC; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags &= ~SB_ASYNC; SOCKBUF_UNLOCK(&so->so_snd); } SOCK_UNLOCK(so); } break; case FIONREAD: SOCK_RECVBUF_LOCK(so); if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; } SOCK_RECVBUF_UNLOCK(so); break; case FIONWRITE: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = sbavail(&so->so_snd); } break; case FIONSPACE: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) || (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) { *(int *)data = 0; } else { *(int *)data = sbspace(&so->so_snd); } } break; case FIOSETOWN: error = fsetown(*(int *)data, &so->so_sigio); break; case FIOGETOWN: *(int *)data = fgetown(&so->so_sigio); break; case SIOCSPGRP: error = fsetown(-(*(int *)data), &so->so_sigio); break; case SIOCGPGRP: *(int *)data = -fgetown(&so->so_sigio); break; case SIOCATMARK: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0; } break; default: /* * Interface/routing/protocol specific ioctls: interface and * routing ioctls should have a different entry since a * socket is unnecessary. */ if (IOCGROUP(cmd) == 'i') error = ifioctl(so, cmd, data, td); else if (IOCGROUP(cmd) == 'r') { CURVNET_SET(so->so_vnet); error = rtioctl_fib(cmd, data, so->so_fibnum); CURVNET_RESTORE(); } else { CURVNET_SET(so->so_vnet); error = ((*so->so_proto->pr_usrreqs->pru_control) (so, cmd, data, 0, td)); CURVNET_RESTORE(); } break; } return (error); } static int soo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; #ifdef MAC int error; error = mac_socket_check_poll(active_cred, so); if (error) return (error); #endif return (sopoll(so, events, fp->f_cred, td)); } static int -soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred, - struct thread *td) +soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred) { struct socket *so = fp->f_data; int error; bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFSOCK; #ifdef MAC error = mac_socket_check_stat(active_cred, so); if (error) return (error); #endif SOCK_LOCK(so); if (!SOLISTENING(so)) { struct sockbuf *sb; /* * If SBS_CANTRCVMORE is set, but there's still data left * in the receive buffer, the socket is still readable. */ sb = &so->so_rcv; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb)) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; ub->st_size = sbavail(sb) - sb->sb_ctl; SOCKBUF_UNLOCK(sb); sb = &so->so_snd; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTSENDMORE) == 0) ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; SOCKBUF_UNLOCK(sb); } ub->st_uid = so->so_cred->cr_uid; ub->st_gid = so->so_cred->cr_gid; error = so->so_proto->pr_usrreqs->pru_sense(so, ub); SOCK_UNLOCK(so); return (error); } /* * API socket close on file pointer. We call soclose() to close the socket * (including initiating closing protocols). soclose() will sorele() the * file reference but the actual socket will not go away until the socket's * ref count hits 0. */ static int soo_close(struct file *fp, struct thread *td) { int error = 0; struct socket *so; so = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; if (so) error = soclose(so); return (error); } static int soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct sockaddr *sa; struct inpcb *inpcb; struct unpcb *unpcb; struct socket *so; int error; kif->kf_type = KF_TYPE_SOCKET; so = fp->f_data; CURVNET_SET(so->so_vnet); kif->kf_un.kf_sock.kf_sock_domain0 = so->so_proto->pr_domain->dom_family; kif->kf_un.kf_sock.kf_sock_type0 = so->so_type; kif->kf_un.kf_sock.kf_sock_protocol0 = so->so_proto->pr_protocol; kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb; switch (kif->kf_un.kf_sock.kf_sock_domain0) { case AF_INET: case AF_INET6: if (kif->kf_un.kf_sock.kf_sock_protocol0 == IPPROTO_TCP) { if (so->so_pcb != NULL) { inpcb = (struct inpcb *)(so->so_pcb); kif->kf_un.kf_sock.kf_sock_inpcb = (uintptr_t)inpcb->inp_ppcb; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); } } break; case AF_UNIX: if (so->so_pcb != NULL) { unpcb = (struct unpcb *)(so->so_pcb); if (unpcb->unp_conn) { kif->kf_un.kf_sock.kf_sock_unpconn = (uintptr_t)unpcb->unp_conn; kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); } } break; } error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_local)) { bcopy(sa, &kif->kf_un.kf_sock.kf_sa_local, sa->sa_len); free(sa, M_SONAME); } error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_peer)) { bcopy(sa, &kif->kf_un.kf_sock.kf_sa_peer, sa->sa_len); free(sa, M_SONAME); } strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name, sizeof(kif->kf_path)); CURVNET_RESTORE(); return (0); } /* * Use the 'backend3' field in AIO jobs to store the amount of data * completed by the AIO job so far. */ #define aio_done backend3 static STAILQ_HEAD(, task) soaio_jobs; static struct mtx soaio_jobs_lock; static struct task soaio_kproc_task; static int soaio_starting, soaio_idle, soaio_queued; static struct unrhdr *soaio_kproc_unr; static int soaio_max_procs = MAX_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, max_procs, CTLFLAG_RW, &soaio_max_procs, 0, "Maximum number of kernel processes to use for async socket IO"); static int soaio_num_procs; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, num_procs, CTLFLAG_RD, &soaio_num_procs, 0, "Number of active kernel processes for async socket IO"); static int soaio_target_procs = TARGET_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, target_procs, CTLFLAG_RD, &soaio_target_procs, 0, "Preferred number of ready kernel processes for async socket IO"); static int soaio_lifetime; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, lifetime, CTLFLAG_RW, &soaio_lifetime, 0, "Maximum lifetime for idle aiod"); static void soaio_kproc_loop(void *arg) { struct proc *p; struct vmspace *myvm; struct task *task; int error, id, pending; id = (intptr_t)arg; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = curproc; myvm = vmspace_acquire_ref(p); mtx_lock(&soaio_jobs_lock); MPASS(soaio_starting > 0); soaio_starting--; for (;;) { while (!STAILQ_EMPTY(&soaio_jobs)) { task = STAILQ_FIRST(&soaio_jobs); STAILQ_REMOVE_HEAD(&soaio_jobs, ta_link); soaio_queued--; pending = task->ta_pending; task->ta_pending = 0; mtx_unlock(&soaio_jobs_lock); task->ta_func(task->ta_context, pending); mtx_lock(&soaio_jobs_lock); } MPASS(soaio_queued == 0); if (p->p_vmspace != myvm) { mtx_unlock(&soaio_jobs_lock); vmspace_switch_aio(myvm); mtx_lock(&soaio_jobs_lock); continue; } soaio_idle++; error = mtx_sleep(&soaio_idle, &soaio_jobs_lock, 0, "-", soaio_lifetime); soaio_idle--; if (error == EWOULDBLOCK && STAILQ_EMPTY(&soaio_jobs) && soaio_num_procs > soaio_target_procs) break; } soaio_num_procs--; mtx_unlock(&soaio_jobs_lock); free_unr(soaio_kproc_unr, id); kproc_exit(0); } static void soaio_kproc_create(void *context, int pending) { struct proc *p; int error, id; mtx_lock(&soaio_jobs_lock); for (;;) { if (soaio_num_procs < soaio_target_procs) { /* Must create */ } else if (soaio_num_procs >= soaio_max_procs) { /* * Hit the limit on kernel processes, don't * create another one. */ break; } else if (soaio_queued <= soaio_idle + soaio_starting) { /* * No more AIO jobs waiting for a process to be * created, so stop. */ break; } soaio_starting++; mtx_unlock(&soaio_jobs_lock); id = alloc_unr(soaio_kproc_unr); error = kproc_create(soaio_kproc_loop, (void *)(intptr_t)id, &p, 0, 0, "soaiod%d", id); if (error != 0) { free_unr(soaio_kproc_unr, id); mtx_lock(&soaio_jobs_lock); soaio_starting--; break; } mtx_lock(&soaio_jobs_lock); soaio_num_procs++; } mtx_unlock(&soaio_jobs_lock); } void soaio_enqueue(struct task *task) { mtx_lock(&soaio_jobs_lock); MPASS(task->ta_pending == 0); task->ta_pending++; STAILQ_INSERT_TAIL(&soaio_jobs, task, ta_link); soaio_queued++; if (soaio_queued <= soaio_idle) wakeup_one(&soaio_idle); else if (soaio_num_procs < soaio_max_procs) taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task); mtx_unlock(&soaio_jobs_lock); } static void soaio_init(void) { soaio_lifetime = AIOD_LIFETIME_DEFAULT; STAILQ_INIT(&soaio_jobs); mtx_init(&soaio_jobs_lock, "soaio jobs", NULL, MTX_DEF); soaio_kproc_unr = new_unrhdr(1, INT_MAX, NULL); TASK_INIT(&soaio_kproc_task, 0, soaio_kproc_create, NULL); if (soaio_target_procs > 0) taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task); } SYSINIT(soaio, SI_SUB_VFS, SI_ORDER_ANY, soaio_init, NULL); static __inline int soaio_ready(struct socket *so, struct sockbuf *sb) { return (sb == &so->so_rcv ? soreadable(so) : sowriteable(so)); } static void soaio_process_job(struct socket *so, struct sockbuf *sb, struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct file *fp; size_t cnt, done, job_total_nbytes; long ru_before; int error, flags; SOCKBUF_UNLOCK(sb); aio_switch_vmspace(job); td = curthread; fp = job->fd_file; retry: td_savedcred = td->td_ucred; td->td_ucred = job->cred; job_total_nbytes = job->uiop->uio_resid + job->aio_done; done = job->aio_done; cnt = job->uiop->uio_resid; job->uiop->uio_offset = 0; job->uiop->uio_td = td; flags = MSG_NBIO; /* * For resource usage accounting, only count a completed request * as a single message to avoid counting multiple calls to * sosend/soreceive on a blocking socket. */ if (sb == &so->so_rcv) { ru_before = td->td_ru.ru_msgrcv; #ifdef MAC error = mac_socket_check_receive(fp->f_cred, so); if (error == 0) #endif error = soreceive(so, NULL, job->uiop, NULL, NULL, &flags); if (td->td_ru.ru_msgrcv != ru_before) job->msgrcv = 1; } else { if (!TAILQ_EMPTY(&sb->sb_aiojobq)) flags |= MSG_MORETOCOME; ru_before = td->td_ru.ru_msgsnd; #ifdef MAC error = mac_socket_check_send(fp->f_cred, so); if (error == 0) #endif error = sosend(so, NULL, job->uiop, NULL, NULL, flags, td); if (td->td_ru.ru_msgsnd != ru_before) job->msgsnd = 1; if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } done += cnt - job->uiop->uio_resid; job->aio_done = done; td->td_ucred = td_savedcred; if (error == EWOULDBLOCK) { /* * The request was either partially completed or not * completed at all due to racing with a read() or * write() on the socket. If the socket is * non-blocking, return with any partial completion. * If the socket is blocking or if no progress has * been made, requeue this request at the head of the * queue to try again when the socket is ready. */ MPASS(done != job_total_nbytes); SOCKBUF_LOCK(sb); if (done == 0 || !(so->so_state & SS_NBIO)) { empty_results++; if (soaio_ready(so, sb)) { empty_retries++; SOCKBUF_UNLOCK(sb); goto retry; } if (!aio_set_cancel_function(job, soo_aio_cancel)) { SOCKBUF_UNLOCK(sb); if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); SOCKBUF_LOCK(sb); } else { TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list); } return; } SOCKBUF_UNLOCK(sb); } if (done != 0 && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error) aio_complete(job, -1, error); else aio_complete(job, done, 0); SOCKBUF_LOCK(sb); } static void soaio_process_sb(struct socket *so, struct sockbuf *sb) { struct kaiocb *job; CURVNET_SET(so->so_vnet); SOCKBUF_LOCK(sb); while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) { job = TAILQ_FIRST(&sb->sb_aiojobq); TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (!aio_clear_cancel_function(job)) continue; soaio_process_job(so, sb, job); } /* * If there are still pending requests, the socket must not be * ready so set SB_AIO to request a wakeup when the socket * becomes ready. */ if (!TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags |= SB_AIO; sb->sb_flags &= ~SB_AIO_RUNNING; SOCKBUF_UNLOCK(sb); SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); } void soaio_rcv(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, &so->so_rcv); } void soaio_snd(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, &so->so_snd); } void sowakeup_aio(struct socket *so, struct sockbuf *sb) { SOCKBUF_LOCK_ASSERT(sb); sb->sb_flags &= ~SB_AIO; if (sb->sb_flags & SB_AIO_RUNNING) return; sb->sb_flags |= SB_AIO_RUNNING; soref(so); soaio_enqueue(&sb->sb_aiotask); } static void soo_aio_cancel(struct kaiocb *job) { struct socket *so; struct sockbuf *sb; long done; int opcode; so = job->fd_file->f_data; opcode = job->uaiocb.aio_lio_opcode; if (opcode & LIO_READ) sb = &so->so_rcv; else { MPASS(opcode & LIO_WRITE); sb = &so->so_snd; } SOCKBUF_LOCK(sb); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags &= ~SB_AIO; SOCKBUF_UNLOCK(sb); done = job->aio_done; if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); } static int soo_aio_queue(struct file *fp, struct kaiocb *job) { struct socket *so; struct sockbuf *sb; int error; so = fp->f_data; error = (*so->so_proto->pr_usrreqs->pru_aio_queue)(so, job); if (error == 0) return (0); /* Lock through the socket, since this may be a listening socket. */ switch (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) { case LIO_READ: sb = &so->so_rcv; SOCK_RECVBUF_LOCK(so); break; case LIO_WRITE: sb = &so->so_snd; SOCK_SENDBUF_LOCK(so); break; default: return (EINVAL); } if (SOLISTENING(so)) { if (sb == &so->so_rcv) SOCK_RECVBUF_UNLOCK(so); else SOCK_SENDBUF_UNLOCK(so); return (EINVAL); } if (!aio_set_cancel_function(job, soo_aio_cancel)) panic("new job was cancelled"); TAILQ_INSERT_TAIL(&sb->sb_aiojobq, job, list); if (!(sb->sb_flags & SB_AIO_RUNNING)) { if (soaio_ready(so, sb)) sowakeup_aio(so, sb); else sb->sb_flags |= SB_AIO; } SOCKBUF_UNLOCK(sb); return (0); } diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c index dbbf35cf1a12..a912634393f6 100644 --- a/sys/kern/tty_pts.c +++ b/sys/kern/tty_pts.c @@ -1,877 +1,876 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* Add compatibility bits for FreeBSD. */ #define PTS_COMPAT /* Add pty(4) compat bits. */ #define PTS_EXTERNAL /* Add bits to make Linux binaries work. */ #define PTS_LINUX #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Our utmp(5) format is limited to 8-byte TTY line names. This means * we can at most allocate 1000 pseudo-terminals ("pts/999"). Allow * users to increase this number, assuming they have manually increased * UT_LINESIZE. */ static struct unrhdr *pts_pool; static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device"); /* * Per-PTS structure. * * List of locks * (t) locked by tty_lock() * (c) const until freeing */ struct pts_softc { int pts_unit; /* (c) Device unit number. */ unsigned int pts_flags; /* (t) Device flags. */ #define PTS_PKT 0x1 /* Packet mode. */ #define PTS_FINISHED 0x2 /* Return errors on read()/write(). */ char pts_pkt; /* (t) Unread packet mode data. */ struct cv pts_inwait; /* (t) Blocking write() on master. */ struct selinfo pts_inpoll; /* (t) Select queue for write(). */ struct cv pts_outwait; /* (t) Blocking read() on master. */ struct selinfo pts_outpoll; /* (t) Select queue for read(). */ #ifdef PTS_EXTERNAL struct cdev *pts_cdev; /* (c) Master device node. */ #endif /* PTS_EXTERNAL */ struct ucred *pts_cred; /* (c) Resource limit. */ }; /* * Controller-side file operations. */ static int ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; char pkt; if (uio->uio_resid == 0) return (0); tty_lock(tp); for (;;) { /* * Implement packet mode. When packet mode is turned on, * the first byte contains a bitmask of events that * occurred (start, stop, flush, window size, etc). */ if (psc->pts_flags & PTS_PKT && psc->pts_pkt) { pkt = psc->pts_pkt; psc->pts_pkt = 0; tty_unlock(tp); error = ureadc(pkt, uio); return (error); } /* * Transmit regular data. * * XXX: We shouldn't use ttydisc_getc_poll()! Even * though in this implementation, there is likely going * to be data, we should just call ttydisc_getc_uio() * and use its return value to sleep. */ if (ttydisc_getc_poll(tp)) { if (psc->pts_flags & PTS_PKT) { /* * XXX: Small race. Fortunately PTY * consumers aren't multithreaded. */ tty_unlock(tp); error = ureadc(TIOCPKT_DATA, uio); if (error) return (error); tty_lock(tp); } error = ttydisc_getc_uio(tp, uio); break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) break; /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; break; } error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx); if (error != 0) break; } tty_unlock(tp); return (error); } static int ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); char ib[256], *ibstart; size_t iblen, rintlen; int error = 0; if (uio->uio_resid == 0) return (0); for (;;) { ibstart = ib; iblen = MIN(uio->uio_resid, sizeof ib); error = uiomove(ib, iblen, uio); tty_lock(tp); if (error != 0) { iblen = 0; goto done; } /* * When possible, avoid the slow path. rint_bypass() * copies all input to the input queue at once. */ MPASS(iblen > 0); do { rintlen = ttydisc_rint_simple(tp, ibstart, iblen); ibstart += rintlen; iblen -= rintlen; if (iblen == 0) { /* All data written. */ break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) { error = EIO; goto done; } /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; goto done; } /* Wake up users on the slave side. */ ttydisc_rint_done(tp); error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx); if (error != 0) goto done; } while (iblen > 0); if (uio->uio_resid == 0) break; tty_unlock(tp); } done: ttydisc_rint_done(tp); tty_unlock(tp); /* * Don't account for the part of the buffer that we couldn't * pass to the TTY. */ uio->uio_resid += iblen; return (error); } static int ptsdev_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0, sig; switch (cmd) { case FIODTYPE: *(int *)data = D_TTY; return (0); case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Force read() to be called. */ *(int *)data = 1; } else { *(int *)data = ttydisc_getc_poll(tp); } tty_unlock(tp); return (0); case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif { struct fiodgname_arg *fgn; const char *p; int i; /* Reverse device name lookups, for ptsname() and ttyname(). */ fgn = data; p = tty_devname(tp); i = strlen(p) + 1; if (i > fgn->len) return (EINVAL); return (copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i)); } /* * We need to implement TIOCGPGRP and TIOCGSID here again. When * called on the pseudo-terminal master, it should not check if * the terminal is the foreground terminal of the calling * process. * * TIOCGETA is also implemented here. Various Linux PTY routines * often call isatty(), which is implemented by tcgetattr(). */ #ifdef PTS_LINUX case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ tty_lock(tp); *(struct termios*)data = tp->t_termios; tty_unlock(tp); return (0); #endif /* PTS_LINUX */ case TIOCSETAF: case TIOCSETAW: /* * We must make sure we turn tcsetattr() calls of TCSAFLUSH and * TCSADRAIN into something different. If an application would * call TCSAFLUSH or TCSADRAIN on the master descriptor, it may * deadlock waiting for all data to be read. */ cmd = TIOCSETA; break; #if defined(PTS_COMPAT) || defined(PTS_LINUX) case TIOCGPTN: /* * Get the device unit number. */ if (psc->pts_unit < 0) return (ENOTTY); *(unsigned int *)data = psc->pts_unit; return (0); #endif /* PTS_COMPAT || PTS_LINUX */ case TIOCGPGRP: /* Get the foreground process group ID. */ tty_lock(tp); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; tty_unlock(tp); return (0); case TIOCGSID: /* Get the session leader process ID. */ tty_lock(tp); if (tp->t_session == NULL) error = ENOTTY; else *(int *)data = tp->t_session->s_sid; tty_unlock(tp); return (error); case TIOCPTMASTER: /* Yes, we are a pseudo-terminal master. */ return (0); case TIOCSIG: /* Signal the foreground process group. */ sig = *(int *)data; if (sig < 1 || sig >= NSIG) return (EINVAL); tty_lock(tp); tty_signal_pgrp(tp, sig); tty_unlock(tp); return (0); case TIOCPKT: /* Enable/disable packet mode. */ tty_lock(tp); if (*(int *)data) psc->pts_flags |= PTS_PKT; else psc->pts_flags &= ~PTS_PKT; tty_unlock(tp); return (0); } /* Just redirect this ioctl to the slave device. */ tty_lock(tp); error = tty_ioctl(tp, cmd, data, fp->f_flag, td); tty_unlock(tp); if (error == ENOIOCTL) error = ENOTTY; return (error); } static int ptsdev_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int revents = 0; tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Slave device is not opened. */ tty_unlock(tp); return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); } if (events & (POLLIN|POLLRDNORM)) { /* See if we can getc something. */ if (ttydisc_getc_poll(tp) || (psc->pts_flags & PTS_PKT && psc->pts_pkt)) revents |= events & (POLLIN|POLLRDNORM); } if (events & (POLLOUT|POLLWRNORM)) { /* See if we can rint something. */ if (ttydisc_rint_poll(tp)) revents |= events & (POLLOUT|POLLWRNORM); } /* * No need to check for POLLHUP here. This device cannot be used * as a callout device, which means we always have a carrier, * because the master is. */ if (revents == 0) { /* * This code might look misleading, but the naming of * poll events on this side is the opposite of the slave * device. */ if (events & (POLLIN|POLLRDNORM)) selrecord(td, &psc->pts_outpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &psc->pts_inpoll); } tty_unlock(tp); return (revents); } /* * kqueue support. */ static void pts_kqops_read_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_outpoll.si_note, kn, 0); } static int pts_kqops_read_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_getc_poll(tp); return (kn->kn_data > 0); } } static void pts_kqops_write_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_inpoll.si_note, kn, 0); } static int pts_kqops_write_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_rint_poll(tp); return (kn->kn_data > 0); } } static struct filterops pts_kqops_read = { .f_isfd = 1, .f_detach = pts_kqops_read_detach, .f_event = pts_kqops_read_event, }; static struct filterops pts_kqops_write = { .f_isfd = 1, .f_detach = pts_kqops_write_detach, .f_event = pts_kqops_write_event, }; static int ptsdev_kqfilter(struct file *fp, struct knote *kn) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; tty_lock(tp); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pts_kqops_read; knlist_add(&psc->pts_outpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_fop = &pts_kqops_write; knlist_add(&psc->pts_inpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static int -ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, - struct thread *td) +ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct tty *tp = fp->f_data; #ifdef PTS_EXTERNAL struct pts_softc *psc = tty_softc(tp); #endif /* PTS_EXTERNAL */ struct cdev *dev = tp->t_dev; /* * According to POSIX, we must implement an fstat(). This also * makes this implementation compatible with Linux binaries, * because Linux calls fstat() on the pseudo-terminal master to * obtain st_rdev. * * XXX: POSIX also mentions we must fill in st_dev, but how? */ bzero(sb, sizeof *sb); #ifdef PTS_EXTERNAL if (psc->pts_cdev != NULL) sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev); else #endif /* PTS_EXTERNAL */ sb->st_ino = sb->st_rdev = tty_udev(tp); sb->st_atim = dev->si_atime; sb->st_ctim = dev->si_ctime; sb->st_mtim = dev->si_mtime; sb->st_uid = dev->si_uid; sb->st_gid = dev->si_gid; sb->st_mode = dev->si_mode | S_IFCHR; return (0); } static int ptsdev_close(struct file *fp, struct thread *td) { struct tty *tp = fp->f_data; /* Deallocate TTY device. */ tty_lock(tp); tty_rel_gone(tp); /* * Open of /dev/ptmx or /dev/ptyXX changes the type of file * from DTYPE_VNODE to DTYPE_PTS. vn_open() increases vnode * use count, we need to decrement it, and possibly do other * required cleanup. */ if (fp->f_vnode != NULL) return (vnops.fo_close(fp, td)); return (0); } static int ptsdev_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct tty *tp; kif->kf_type = KF_TYPE_PTS; tp = fp->f_data; kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp); kif->kf_un.kf_pts.kf_pts_dev_freebsd11 = kif->kf_un.kf_pts.kf_pts_dev; /* truncate */ strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path)); return (0); } static struct fileops ptsdev_ops = { .fo_read = ptsdev_read, .fo_write = ptsdev_write, .fo_truncate = invfo_truncate, .fo_ioctl = ptsdev_ioctl, .fo_poll = ptsdev_poll, .fo_kqfilter = ptsdev_kqfilter, .fo_stat = ptsdev_stat, .fo_close = ptsdev_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ptsdev_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; /* * Driver-side hooks. */ static void ptsdrv_outwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_outwait); selwakeup(&psc->pts_outpoll); KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0); } static void ptsdrv_inwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_inwait); selwakeup(&psc->pts_inpoll); KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0); } static int ptsdrv_open(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); psc->pts_flags &= ~PTS_FINISHED; return (0); } static void ptsdrv_close(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); /* Wake up any blocked readers/writers. */ psc->pts_flags |= PTS_FINISHED; ptsdrv_outwakeup(tp); ptsdrv_inwakeup(tp); } static void ptsdrv_pktnotify(struct tty *tp, char event) { struct pts_softc *psc = tty_softc(tp); /* * Clear conflicting flags. */ switch (event) { case TIOCPKT_STOP: psc->pts_pkt &= ~TIOCPKT_START; break; case TIOCPKT_START: psc->pts_pkt &= ~TIOCPKT_STOP; break; case TIOCPKT_NOSTOP: psc->pts_pkt &= ~TIOCPKT_DOSTOP; break; case TIOCPKT_DOSTOP: psc->pts_pkt &= ~TIOCPKT_NOSTOP; break; } psc->pts_pkt |= event; ptsdrv_outwakeup(tp); } static void ptsdrv_free(void *softc) { struct pts_softc *psc = softc; /* Make device number available again. */ if (psc->pts_unit >= 0) free_unr(pts_pool, psc->pts_unit); chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0); racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1); crfree(psc->pts_cred); seldrain(&psc->pts_inpoll); seldrain(&psc->pts_outpoll); knlist_destroy(&psc->pts_inpoll.si_note); knlist_destroy(&psc->pts_outpoll.si_note); #ifdef PTS_EXTERNAL /* Destroy master device as well. */ if (psc->pts_cdev != NULL) destroy_dev_sched(psc->pts_cdev); #endif /* PTS_EXTERNAL */ free(psc, M_PTS); } static struct ttydevsw pts_class = { .tsw_flags = TF_NOPREFIX, .tsw_outwakeup = ptsdrv_outwakeup, .tsw_inwakeup = ptsdrv_inwakeup, .tsw_open = ptsdrv_open, .tsw_close = ptsdrv_close, .tsw_pktnotify = ptsdrv_pktnotify, .tsw_free = ptsdrv_free, }; #ifndef PTS_EXTERNAL static #endif /* !PTS_EXTERNAL */ int pts_alloc(int fflags, struct thread *td, struct file *fp) { int unit, ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Try to allocate a new pts unit number. */ unit = alloc_unr(pts_pool); if (unit < 0) { racct_sub(p, RACCT_NPTS, 1); chgptscnt(cred->cr_ruidinfo, -1, 0); return (EAGAIN); } /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = unit; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #ifdef PTS_EXTERNAL int pts_alloc_external(int fflags, struct thread *td, struct file *fp, struct cdev *dev, const char *name) { int ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = -1; psc->pts_cdev = dev; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "%s", name); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #endif /* PTS_EXTERNAL */ int sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap) { int error, fd; struct file *fp; /* * POSIX states it's unspecified when other flags are passed. We * don't allow this. */ if (uap->flags & ~(O_RDWR|O_NOCTTY|O_CLOEXEC)) return (EINVAL); error = falloc(td, &fp, &fd, uap->flags); if (error) return (error); /* Allocate the actual pseudo-TTY. */ error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp); if (error != 0) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } /* Pass it back to userspace. */ td->td_retval[0] = fd; fdrop(fp, td); return (0); } static void pts_init(void *unused) { pts_pool = new_unrhdr(0, INT_MAX, NULL); } SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL); diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c index a9fb40848fbd..24ed1be6bf65 100644 --- a/sys/kern/uipc_mqueue.c +++ b/sys/kern/uipc_mqueue.c @@ -1,2945 +1,2944 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 David Xu * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support"); /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; const void *mn_pr_root; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; uint32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; static unsigned mqfs_osd_jail_slot; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); static int mqfs_prison_remove(void *obj, void *data); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, }; struct filterops mq_wfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_add_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_pr_root = cred->cr_prison->pr_root; node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = mqfs_prison_remove, }; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); mqfs_osd_jail_slot = osd_jail_register(NULL, methods); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); osd_jail_deregister(mqfs_osd_jail_slot); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp); VOP_UNLOCK(vp); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked. */ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred) { struct mqfs_node *pn; const void *pr_root; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); pr_root = cred->cr_prison->pr_root; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { /* Only match names within the same prison root directory */ if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) && strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; int nameiop, flags, error, namelen; char *pname; struct thread *td; td = curthread; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen, cnp->cn_cred); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? */ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; struct file *a_fp; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; const void *pr_root; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; pr_root = ap->a_cred->cr_prison->pr_root; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); /* * Only show names within the same prison root directory * (or not associated with a prison, e.g. "." and ".."). */ if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root) continue; if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; entry.d_off = offset + entry.d_reclen; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } dirent_terminate(&entry); if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. */ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * See if this prison root is obsolete, and clean up associated queues if it is. */ static int mqfs_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; struct prison *tpr; struct mqfs_node *pn, *tpn; struct vnode *pr_root; pr_root = pr->pr_root; if (pr->pr_parent->pr_root == pr_root) return (0); TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr != pr && tpr->pr_root == pr_root) return (0); } /* * No jails are rooted in this directory anymore, * so no queues should be either. */ sx_xlock(&mqfs_data.mi_lock); LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, mn_sibling, tpn) { if (pn->mn_pr_root == pr_root) (void)do_unlink(pn, curthread->td_ucred); } sx_xunlock(&mqfs_data.mi_lock); return (0); } /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); seldrain(&mq->mq_rsel); seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send realtime a signal to process which registered itself * successfully by mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct thread *td; struct proc *p; int error; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; error = sigev_findtd(p, &nt->nt_sigev, &td); if (error) { mq->mq_notifier = NULL; return; } if (!KSI_ONQ(&nt->nt_ksi)) { ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev); tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi); } PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } static int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char path[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct pwddesc *pdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); pdp = td->td_proc->p_pd; cmode = (((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); /* * "." and ".." are magic directories, populated on the fly, and cannot * be opened as queues. */ if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(td, fp, fd); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int sys_kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. */ int sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, rightsp, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_event_rights, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_read_rights, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_write_rights, fget_write, fpp, ppn, pmq); } static int kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr, struct mq_attr *oattr) { struct mqueue *mq; struct file *fp; u_int oflag, flag; int error; AUDIT_ARG_FD(mqd); if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0) return (EINVAL); error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); oattr->mq_maxmsg = mq->mq_maxmsg; oattr->mq_msgsize = mq->mq_msgsize; oattr->mq_curmsgs = mq->mq_curmsgs; if (attr != NULL) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr->mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr->mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); return (error); } int sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mq_attr attr, oattr; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error != 0) return (error); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { bzero(oattr.__reserved, sizeof(oattr.__reserved)); error = copyout(&oattr, uap->oattr, sizeof(oattr)); } return (error); } int sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) goto out; abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) goto out; abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } static int kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev) { struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp, *fp2; struct mqueue_notifier *nt, *newnt = NULL; int error; AUDIT_ARG_FD(mqd); if (sigev != NULL) { if (sigev->sigev_notify != SIGEV_SIGNAL && sigev->sigev_notify != SIGEV_THREAD_ID && sigev->sigev_notify != SIGEV_NONE) return (EINVAL); if ((sigev->sigev_notify == SIGEV_SIGNAL || sigev->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(sigev->sigev_signo)) return (EINVAL); } p = td->td_proc; fdp = td->td_proc->p_fd; error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); fp2 = fget_locked(fdp, mqd); if (fp2 == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights(fdp, mqd), &cap_event_rights); if (error) { FILEDESC_SUNLOCK(fdp); goto out; } #endif if (fp2 != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = mqd; notifier_insert(p, nt); } nt->nt_sigev = *sigev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. */ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } int sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev, *evp; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct mqueue *mq; #ifdef INVARIANTS struct filedesc *fdp; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); #endif if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_locked(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int -mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred, - struct thread *td) +mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); sx_xlock(&mqfs_data.mi_lock); st->st_atim = pn->mn_atime; st->st_mtim = pn->mn_mtime; st->st_ctim = pn->mn_ctime; st->st_birthtim = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN, active_cred); if (error != 0) goto out; pn->mn_mode = mode & ACCESSPERMS; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); if (uid == (uid_t)-1) uid = pn->mn_uid; if (gid == (gid_t)-1) gid = pn->mn_gid; if (((uid != pn->mn_uid && uid != active_cred->cr_uid) || (gid != pn->mn_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; pn->mn_uid = uid; pn->mn_gid = gid; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static int mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_MQUEUE; return (0); } static struct fileops mqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close, .fo_chmod = mqf_chmod, .fo_chown = mqf_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = mqf_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; VFS_VOP_VECTOR_REGISTER(mqfs_vnodeops); static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; static struct vfsconf mqueuefs_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "mqueuefs", .vfc_vfsops = &mqfs_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_SYNTHETIC }; static struct syscall_helper_data mq_syscalls[] = { SYSCALL_INIT_HELPER(kmq_open), SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED), SYSCALL_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include static void mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } static void mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } int freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap) { struct mq_attr attr; struct mq_attr32 attr32; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error) return (error); mq_attr_from32(&attr32, &attr); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } int freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap) { struct mq_attr attr, oattr; struct mq_attr32 attr32, oattr32; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error != 0) return (error); mq_attr_from32(&attr32, &attr); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { mq_attr_to32(&oattr, &oattr32); bzero(oattr32.__reserved, sizeof(oattr32.__reserved)); error = copyout(&oattr32, uap->oattr, sizeof(oattr32)); } return (error); } int freebsd32_kmq_timedsend(struct thread *td, struct freebsd32_kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) goto out; CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int freebsd32_kmq_timedreceive(struct thread *td, struct freebsd32_kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) goto out; CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap) { struct sigevent ev, *evp; struct sigevent32 ev32; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev32, sizeof(ev32)); if (error != 0) return (error); error = convert_sigevent32(&ev32, &ev); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static struct syscall_helper_data mq32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_kmq_open), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink), SYSCALL_INIT_LAST }; #endif static int mqinit(void) { int error; error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int mqunload(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(mq32_syscalls); #endif syscall_helper_unregister(mq_syscalls); return (0); } static int mq_modload(struct module *module, int cmd, void *arg) { int error = 0; error = vfs_modevent(module, cmd, arg); if (error != 0) return (error); switch (cmd) { case MOD_LOAD: error = mqinit(); if (error != 0) mqunload(); break; case MOD_UNLOAD: error = mqunload(); break; default: break; } return (error); } static moduledata_t mqueuefs_mod = { "mqueuefs", mq_modload, &mqueuefs_vfsconf }; DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(mqueuefs, 1); diff --git a/sys/kern/uipc_sem.c b/sys/kern/uipc_sem.c index db1d84696df0..e7c7a04f5c54 100644 --- a/sys/kern/uipc_sem.c +++ b/sys/kern/uipc_sem.c @@ -1,1111 +1,1110 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002 Alfred Perlstein * Copyright (c) 2003-2005 SPARTA, Inc. * Copyright (c) 2005, 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support"); /* * TODO * * - Resource limits? * - Replace global sem_lock with mtx_pool locks? * - Add a MAC check_create() hook for creating new named semaphores. */ #ifndef SEM_MAX #define SEM_MAX 30 #endif #ifdef SEM_DEBUG #define DP(x) printf x #else #define DP(x) #endif struct ksem_mapping { char *km_path; Fnv32_t km_fnv; struct ksem *km_ksem; LIST_ENTRY(ksem_mapping) km_link; }; static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor"); static LIST_HEAD(, ksem_mapping) *ksem_dictionary; static struct sx ksem_dict_lock; static struct mtx ksem_count_lock; static struct mtx sem_lock; static u_long ksem_hash; static int ksem_dead; #define KSEM_HASH(fnv) (&ksem_dictionary[(fnv) & ksem_hash]) static int nsems = 0; SYSCTL_DECL(_p1003_1b); SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0, "Number of active kernel POSIX semaphores"); static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime); static int ksem_access(struct ksem *ks, struct ucred *ucred); static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value); static int ksem_create(struct thread *td, const char *path, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32); static void ksem_drop(struct ksem *ks); static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp); static struct ksem *ksem_hold(struct ksem *ks); static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks); static struct ksem *ksem_lookup(char *path, Fnv32_t fnv); static void ksem_module_destroy(void); static int ksem_module_init(void); static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static int sem_modload(struct module *module, int cmd, void *arg); static fo_stat_t ksem_stat; static fo_close_t ksem_closef; static fo_chmod_t ksem_chmod; static fo_chown_t ksem_chown; static fo_fill_kinfo_t ksem_fill_kinfo; /* File descriptor operations. */ static struct fileops ksem_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = ksem_stat, .fo_close = ksem_closef, .fo_chmod = ksem_chmod, .fo_chown = ksem_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ksem_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; FEATURE(posix_sem, "POSIX semaphores"); static int -ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, - struct thread *td) +ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct ksem *ks; #ifdef MAC int error; #endif ks = fp->f_data; #ifdef MAC error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a semaphore * file descriptor. */ bzero(sb, sizeof(*sb)); mtx_lock(&sem_lock); sb->st_atim = ks->ks_atime; sb->st_ctim = ks->ks_ctime; sb->st_mtim = ks->ks_mtime; sb->st_birthtim = ks->ks_birthtime; sb->st_uid = ks->ks_uid; sb->st_gid = ks->ks_gid; sb->st_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); return (0); } static int ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setmode(active_cred, ks, mode); if (error != 0) goto out; #endif error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN, active_cred); if (error != 0) goto out; ks->ks_mode = mode & ACCESSPERMS; out: mtx_unlock(&sem_lock); return (error); } static int ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setowner(active_cred, ks, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = ks->ks_uid; if (gid == (gid_t)-1) gid = ks->ks_gid; if (((uid != ks->ks_uid && uid != active_cred->cr_uid) || (gid != ks->ks_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; ks->ks_uid = uid; ks->ks_gid = gid; out: mtx_unlock(&sem_lock); return (error); } static int ksem_closef(struct file *fp, struct thread *td) { struct ksem *ks; ks = fp->f_data; fp->f_data = NULL; ksem_drop(ks); return (0); } static int ksem_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { const char *path, *pr_path; struct ksem *ks; size_t pr_pathlen; kif->kf_type = KF_TYPE_SEM; ks = fp->f_data; mtx_lock(&sem_lock); kif->kf_un.kf_sem.kf_sem_value = ks->ks_value; kif->kf_un.kf_sem.kf_sem_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); if (ks->ks_path != NULL) { sx_slock(&ksem_dict_lock); if (ks->ks_path != NULL) { path = ks->ks_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); if (strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/') path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } sx_sunlock(&ksem_dict_lock); } return (0); } /* * ksem object management including creation and reference counting * routines. */ static struct ksem * ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value) { struct ksem *ks; mtx_lock(&ksem_count_lock); if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) { mtx_unlock(&ksem_count_lock); return (NULL); } nsems++; mtx_unlock(&ksem_count_lock); ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO); ks->ks_uid = ucred->cr_uid; ks->ks_gid = ucred->cr_gid; ks->ks_mode = mode; ks->ks_value = value; cv_init(&ks->ks_cv, "ksem"); vfs_timestamp(&ks->ks_birthtime); ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime; refcount_init(&ks->ks_ref, 1); #ifdef MAC mac_posixsem_init(ks); mac_posixsem_create(ucred, ks); #endif return (ks); } static struct ksem * ksem_hold(struct ksem *ks) { refcount_acquire(&ks->ks_ref); return (ks); } static void ksem_drop(struct ksem *ks) { if (refcount_release(&ks->ks_ref)) { #ifdef MAC mac_posixsem_destroy(ks); #endif cv_destroy(&ks->ks_cv); free(ks, M_KSEM); mtx_lock(&ksem_count_lock); nsems--; mtx_unlock(&ksem_count_lock); } } /* * Determine if the credentials have sufficient permissions for read * and write access. */ static int ksem_access(struct ksem *ks, struct ucred *ucred) { int error; error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VREAD | VWRITE, ucred); if (error) error = priv_check_cred(ucred, PRIV_SEM_WRITE); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to semaphore objects. We use the FNV hash on the path to * store the mappings in a hash table. */ static struct ksem * ksem_lookup(char *path, Fnv32_t fnv) { struct ksem_mapping *map; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) return (map->km_ksem); } return (NULL); } static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks) { struct ksem_mapping *map; map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK); map->km_path = path; map->km_fnv = fnv; map->km_ksem = ksem_hold(ks); ks->ks_path = path; LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link); } static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct ksem_mapping *map; int error; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) { #ifdef MAC error = mac_posixsem_check_unlink(ucred, map->km_ksem); if (error) return (error); #endif error = ksem_access(map->km_ksem, ucred); if (error) return (error); map->km_ksem->ks_path = NULL; LIST_REMOVE(map, km_link); ksem_drop(map->km_ksem); free(map->km_path, M_KSEM); free(map, M_KSEM); return (0); } } return (ENOENT); } static int ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd, int compat32) { semid_t semid; #ifdef COMPAT_FREEBSD32 int32_t semid32; #endif void *ptr; size_t ptrs; #ifdef COMPAT_FREEBSD32 if (compat32) { semid32 = fd; ptr = &semid32; ptrs = sizeof(semid32); } else { #endif semid = fd; ptr = &semid; ptrs = sizeof(semid); compat32 = 0; /* silence gcc */ #ifdef COMPAT_FREEBSD32 } #endif return (copyout(ptr, semidp, ptrs)); } /* Other helper routines. */ static int ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32) { struct pwddesc *pdp; struct ksem *ks; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error, fd; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); AUDIT_ARG_VALUE(value); if (value > SEM_VALUE_MAX) return (EINVAL); pdp = td->td_proc->p_pd; mode = (mode & ~pdp->pd_cmask) & ACCESSPERMS; error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) { if (name == NULL) error = ENOSPC; return (error); } /* * Go ahead and copyout the file descriptor now. This is a bit * premature, but it is a lot easier to handle errors as opposed * to later when we've possibly created a new semaphore, etc. */ error = ksem_create_copyout_semid(td, semidp, fd, compat32); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } if (name == NULL) { /* Create an anonymous semaphore. */ ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENOSPC; else ks->ks_flags |= KS_ANONYMOUS; } else { path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); /* Require paths to start with a '/' character. */ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_KSEM); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); ks = ksem_lookup(path, fnv); if (ks == NULL) { /* Object does not exist, create it if requested. */ if (flags & O_CREAT) { ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENFILE; else { ksem_insert(path, fnv, ks); path = NULL; } } else error = ENOENT; } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixsem_check_open(td->td_ucred, ks); if (error == 0) #endif error = ksem_access(ks, td->td_ucred); } if (error == 0) ksem_hold(ks); #ifdef INVARIANTS else ks = NULL; #endif } sx_xunlock(&ksem_dict_lock); if (path) free(path, M_KSEM); } if (error) { KASSERT(ks == NULL, ("ksem_create error with a ksem")); fdclose(td, fp, fd); fdrop(fp, td); return (error); } KASSERT(ks != NULL, ("ksem_create w/o a ksem")); finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops); fdrop(fp, td); return (0); } static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp) { struct ksem *ks; struct file *fp; int error; error = fget(td, id, rightsp, &fp); if (error) return (EINVAL); if (fp->f_type != DTYPE_SEM) { fdrop(fp, td); return (EINVAL); } ks = fp->f_data; if (ks->ks_flags & KS_DEAD) { fdrop(fp, td); return (EINVAL); } *fpp = fp; return (0); } /* System calls. */ #ifndef _SYS_SYSPROTO_H_ struct ksem_init_args { unsigned int value; semid_t *idp; }; #endif int sys_ksem_init(struct thread *td, struct ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_open_args { char *name; int oflag; mode_t mode; unsigned int value; semid_t *idp; }; #endif int sys_ksem_open(struct thread *td, struct ksem_open_args *uap) { DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid)); if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_unlink_args { char *name; }; #endif int sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); error = ksem_remove(path, fnv, td->td_ucred); sx_xunlock(&ksem_dict_lock); free(path, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_close_args { semid_t id; }; #endif int sys_ksem_close(struct thread *td, struct ksem_close_args *uap) { struct ksem *ks; struct file *fp; int error; /* No capability rights required to close a semaphore. */ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, &cap_no_rights, &fp); if (error) return (error); ks = fp->f_data; if (ks->ks_flags & KS_ANONYMOUS) { fdrop(fp, td); return (EINVAL); } error = kern_close(td, uap->id); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_post_args { semid_t id; }; #endif int sys_ksem_post(struct thread *td, struct ksem_post_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init_one(&rights, CAP_SEM_POST), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks); if (error) goto err; #endif if (ks->ks_value == SEM_VALUE_MAX) { error = EOVERFLOW; goto err; } ++ks->ks_value; if (ks->ks_waiters > 0) cv_signal(&ks->ks_cv); error = 0; vfs_timestamp(&ks->ks_ctime); err: mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_wait_args { semid_t id; }; #endif int sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap) { return (kern_sem_wait(td, uap->id, 0, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_timedwait_args { semid_t id; const struct timespec *abstime; }; #endif int sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap) { struct timespec abstime; struct timespec *ts; int error; /* * We allow a null timespec (wait forever). */ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime, sizeof(abstime)); if (error != 0) return (error); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_trywait_args { semid_t id; }; #endif int sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap) { return (kern_sem_wait(td, uap->id, 1, NULL)); } static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime) { struct timespec ts1, ts2; struct timeval tv; cap_rights_t rights; struct file *fp; struct ksem *ks; int error; DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid)); AUDIT_ARG_FD(id); error = ksem_get(td, id, cap_rights_init_one(&rights, CAP_SEM_WAIT), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); DP((">>> kern_sem_wait critical section entered! pid=%d\n", (int)td->td_proc->p_pid)); #ifdef MAC error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks); if (error) { DP(("kern_sem_wait mac failed\n")); goto err; } #endif DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag)); vfs_timestamp(&ks->ks_atime); while (ks->ks_value == 0) { ks->ks_waiters++; if (tryflag != 0) error = EAGAIN; else if (abstime == NULL) error = cv_wait_sig(&ks->ks_cv, &sem_lock); else { for (;;) { ts1 = *abstime; getnanotime(&ts2); timespecsub(&ts1, &ts2, &ts1); TIMESPEC_TO_TIMEVAL(&tv, &ts1); if (tv.tv_sec < 0) { error = ETIMEDOUT; break; } error = cv_timedwait_sig(&ks->ks_cv, &sem_lock, tvtohz(&tv)); if (error != EWOULDBLOCK) break; } } ks->ks_waiters--; if (error) goto err; } ks->ks_value--; DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value)); error = 0; err: mtx_unlock(&sem_lock); fdrop(fp, td); DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n", (int)td->td_proc->p_pid, error)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_getvalue_args { semid_t id; int *val; }; #endif int sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error, val; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init_one(&rights, CAP_SEM_GETVALUE), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks); if (error) { mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #endif val = ks->ks_value; vfs_timestamp(&ks->ks_atime); mtx_unlock(&sem_lock); fdrop(fp, td); error = copyout(&val, uap->val, sizeof(val)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_destroy_args { semid_t id; }; #endif int sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap) { struct file *fp; struct ksem *ks; int error; /* No capability rights required to close a semaphore. */ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, &cap_no_rights, &fp); if (error) return (error); ks = fp->f_data; if (!(ks->ks_flags & KS_ANONYMOUS)) { fdrop(fp, td); return (EINVAL); } mtx_lock(&sem_lock); if (ks->ks_waiters != 0) { mtx_unlock(&sem_lock); error = EBUSY; goto err; } ks->ks_flags |= KS_DEAD; mtx_unlock(&sem_lock); error = kern_close(td, uap->id); err: fdrop(fp, td); return (error); } static struct syscall_helper_data ksem_syscalls[] = { SYSCALL_INIT_HELPER(ksem_init), SYSCALL_INIT_HELPER(ksem_open), SYSCALL_INIT_HELPER(ksem_unlink), SYSCALL_INIT_HELPER(ksem_close), SYSCALL_INIT_HELPER(ksem_post), SYSCALL_INIT_HELPER(ksem_wait), SYSCALL_INIT_HELPER(ksem_timedwait), SYSCALL_INIT_HELPER(ksem_trywait), SYSCALL_INIT_HELPER(ksem_getvalue), SYSCALL_INIT_HELPER(ksem_destroy), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include int freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 1)); } int freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap) { if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 1)); } int freebsd32_ksem_timedwait(struct thread *td, struct freebsd32_ksem_timedwait_args *uap) { struct timespec32 abstime32; struct timespec *ts, abstime; int error; /* * We allow a null timespec (wait forever). */ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime32, sizeof(abstime32)); if (error != 0) return (error); CP(abstime32, abstime, tv_sec); CP(abstime32, abstime, tv_nsec); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } static struct syscall_helper_data ksem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_ksem_init), SYSCALL32_INIT_HELPER(freebsd32_ksem_open), SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink), SYSCALL32_INIT_HELPER_COMPAT(ksem_close), SYSCALL32_INIT_HELPER_COMPAT(ksem_post), SYSCALL32_INIT_HELPER_COMPAT(ksem_wait), SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait), SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait), SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue), SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy), SYSCALL_INIT_LAST }; #endif static int ksem_module_init(void) { int error; mtx_init(&sem_lock, "sem", NULL, MTX_DEF); mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF); sx_init(&ksem_dict_lock, "ksem dictionary"); ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L); p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX); p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX); error = syscall_helper_register(ksem_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(ksem32_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #endif return (0); } static void ksem_module_destroy(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(ksem32_syscalls); #endif syscall_helper_unregister(ksem_syscalls); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0); hashdestroy(ksem_dictionary, M_KSEM, ksem_hash); sx_destroy(&ksem_dict_lock); mtx_destroy(&ksem_count_lock); mtx_destroy(&sem_lock); p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX); p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX); } static int sem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = ksem_module_init(); if (error) ksem_module_destroy(); break; case MOD_UNLOAD: mtx_lock(&ksem_count_lock); if (nsems != 0) { error = EOPNOTSUPP; mtx_unlock(&ksem_count_lock); break; } ksem_dead = 1; mtx_unlock(&ksem_count_lock); ksem_module_destroy(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sem_mod = { "sem", &sem_modload, NULL }; DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sem, 1); diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index 63c4250f640f..14d808bfc166 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -1,2100 +1,2099 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson * Copyright 2020 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2), shm_rename(2), and shm_unlink(2). * While most of the implementation is here, vm_mmap.c contains * mapping logic changes. * * posixshmcontrol(1) allows users to inspect the state of the memory * objects. Per-uid swap resource limit controls total amount of * memory that user can consume for anonymous objects, including * shared. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr64 shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie); static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie); static int shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out); static int shm_deallocate(struct shmfd *shmfd, off_t *offset, off_t *length, int flags); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_ioctl_t shm_ioctl; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; static fo_get_seals_t shm_get_seals; static fo_add_seals_t shm_add_seals; static fo_fallocate_t shm_fallocate; static fo_fspacectl_t shm_fspacectl; /* File descriptor operations. */ struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = shm_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, .fo_get_seals = shm_get_seals, .fo_add_seals = shm_add_seals, .fo_fallocate = shm_fallocate, .fo_fspacectl = shm_fspacectl, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE, }; FEATURE(posix_shm, "POSIX shared memory"); static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); static int largepage_reclaim_tries = 1; SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries, CTLFLAG_RWTUN, &largepage_reclaim_tries, 0, "Number of contig reclaims before giving up for default alloc policy"); static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); rv = vm_page_grab_valid_unlocked(&m, obj, idx, VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT); if (rv == VM_PAGER_OK) goto found; /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ VM_OBJECT_WLOCK(obj); m = vm_page_lookup(obj, idx); if (uio->uio_rw == UIO_READ && m == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. */ rv = vm_page_grab_valid(&m, obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY); if (rv != VM_PAGER_OK) { VM_OBJECT_WUNLOCK(obj); printf("uiomove_object: vm_obj %p idx %jd pager error %d\n", obj, idx, rv); return (EIO); } VM_OBJECT_WUNLOCK(obj); found: error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) vm_page_set_dirty(m); vm_page_activate(m); vm_page_sunbusy(m); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static u_long count_largepages[MAXPAGESIZES]; static int shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { vm_page_t m; int psind; psind = object->un_pager.phys.data_val; if (psind == 0 || pidx >= object->size) return (VM_PAGER_FAIL); *first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE); /* * We only busy the first page in the superpage run. It is * useless to busy whole run since we only remove full * superpage, and it takes too long to busy e.g. 512 * 512 == * 262144 pages constituing 1G amd64 superage. */ m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT); MPASS(m != NULL); *last = *first + atop(pagesizes[psind]) - 1; return (VM_PAGER_OK); } static boolean_t shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { int psind; psind = object->un_pager.phys.data_val; if (psind == 0 || pindex >= object->size) return (FALSE); if (before != NULL) { *before = pindex - rounddown2(pindex, pagesizes[psind] / PAGE_SIZE); } if (after != NULL) { *after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) - pindex; } return (TRUE); } static void shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { } static void shm_largepage_phys_dtor(vm_object_t object) { int psind; psind = object->un_pager.phys.data_val; if (psind != 0) { atomic_subtract_long(&count_largepages[psind], object->size / (pagesizes[psind] / PAGE_SIZE)); vm_wire_sub(object->size); } else { KASSERT(object->size == 0, ("largepage phys obj %p not initialized bit size %#jx > 0", object, (uintmax_t)object->size)); } } static const struct phys_pager_ops shm_largepage_phys_ops = { .phys_pg_populate = shm_largepage_phys_populate, .phys_pg_haspage = shm_largepage_phys_haspage, .phys_pg_ctor = shm_largepage_phys_ctor, .phys_pg_dtor = shm_largepage_phys_dtor, }; bool shm_largepage(struct shmfd *shmfd) { return (shmfd->shm_object->type == OBJT_PHYS); } static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; off_t size; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0) return (EINVAL); foffset_lock_uio(fp, uio, flags); if (uio->uio_resid > OFF_MAX - uio->uio_offset) { /* * Overflow is only an error if we're supposed to expand on * write. Otherwise, we'll just truncate the write to the * size of the file, which can only grow up to OFF_MAX. */ if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0) { foffset_unlock_uio(fp, uio, flags); return (EFBIG); } size = shmfd->shm_size; } else { size = uio->uio_offset + uio->uio_resid; } if ((flags & FOF_OFFSET) == 0) { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, size, &shmfd->shm_mtx); } if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) { error = EPERM; } else { error = 0; if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0 && size > shmfd->shm_size) { error = shm_dotruncate_cookie(shmfd, size, rl_cookie); } if (error == 0) error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); } rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } int shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; struct shm_largepage_conf *conf; void *rl_cookie; shmfd = fp->f_data; switch (com) { case FIONBIO: case FIOASYNC: /* * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work, * just like it would on an unlinked regular file */ return (0); case FIOSSHMLPGCNF: if (!shm_largepage(shmfd)) return (ENOTTY); conf = data; if (shmfd->shm_lp_psind != 0 && conf->psind != shmfd->shm_lp_psind) return (EINVAL); if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES || pagesizes[conf->psind] == 0) return (EINVAL); if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT && conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT && conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD) return (EINVAL); rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); shmfd->shm_lp_psind = conf->psind; shmfd->shm_lp_alloc_policy = conf->alloc_policy; shmfd->shm_object->un_pager.phys.data_val = conf->psind; rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (0); case FIOGSHMLPGCNF: if (!shm_largepage(shmfd)) return (ENOTTY); conf = data; rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); conf->psind = shmfd->shm_lp_psind; conf->alloc_policy = shmfd->shm_lp_alloc_policy; rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (0); default: return (ENOTTY); } } static int -shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, - struct thread *td) +shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. */ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; sb->st_blocks = howmany(sb->st_size, sb->st_blksize); mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; sb->st_nlink = shmfd->shm_object->ref_count; sb->st_blocks = shmfd->shm_object->size / (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT); return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } static int shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out) { int error; char *path; const char *pr_path; size_t pr_pathlen; path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(userpath_in, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error != 0) goto out; #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. */ if (path[pr_pathlen] != '/') { error = EINVAL; goto out; } *path_out = path; out: if (error != 0) free(path, M_SHMFD); return (error); } static int shm_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, int end) { vm_page_t m; int rv; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(base >= 0, ("%s: base %d", __func__, base)); KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, end)); retry: m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); if (m != NULL) { MPASS(vm_page_all_valid(m)); } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) goto retry; vm_object_pip_add(object, 1); VM_OBJECT_WUNLOCK(object); rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); VM_OBJECT_WLOCK(object); vm_object_pip_wakeup(object); if (rv == VM_PAGER_OK) { /* * Since the page was not resident, and therefore not * recently accessed, immediately enqueue it for * asynchronous laundering. The current operation is * not regarded as an access. */ vm_page_launder(m); } else { vm_page_free(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, end - base); KASSERT(vm_page_all_valid(m), ("%s: page %p is invalid", __func__, m)); vm_page_set_dirty(m); vm_page_xunbusy(m); } return (0); } static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie) { vm_object_t object; vm_pindex_t nobjsize; vm_ooffset_t delta; int base, error; KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; VM_OBJECT_ASSERT_WLOCKED(object); rangelock_cookie_assert(rl_cookie, RA_WLOCKED); if (length == shmfd->shm_size) return (0); nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) return (EPERM); /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) return (EBUSY); /* * Zero the truncated part of the last page. */ base = length & PAGE_MASK; if (base != 0) { error = shm_partial_page_invalidate(object, OFF_TO_IDX(length), base, PAGE_SIZE); if (error) return (error); } delta = IDX_TO_OFF(object->size - nobjsize); if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { if ((shmfd->shm_seals & F_SEAL_GROW) != 0) return (EPERM); /* Try to reserve additional swap space. */ delta = IDX_TO_OFF(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) return (ENOMEM); object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; return (0); } static int shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie) { vm_object_t object; vm_page_t m; vm_pindex_t newobjsz, oldobjsz; int aflags, error, i, psind, try; KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; VM_OBJECT_ASSERT_WLOCKED(object); rangelock_cookie_assert(rl_cookie, RA_WLOCKED); oldobjsz = object->size; newobjsz = OFF_TO_IDX(length); if (length == shmfd->shm_size) return (0); psind = shmfd->shm_lp_psind; if (psind == 0 && length != 0) return (EINVAL); if ((length & (pagesizes[psind] - 1)) != 0) return (EINVAL); if (length < shmfd->shm_size) { if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) return (EPERM); if (shmfd->shm_kmappings > 0) return (EBUSY); return (ENOTSUP); /* Pages are unmanaged. */ #if 0 vm_object_page_remove(object, newobjsz, oldobjsz, 0); object->size = newobjsz; shmfd->shm_size = length; return (0); #endif } if ((shmfd->shm_seals & F_SEAL_GROW) != 0) return (EPERM); aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO; if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT) aflags |= VM_ALLOC_WAITFAIL; try = 0; /* * Extend shmfd and object, keeping all already fully * allocated large pages intact even on error, because dropped * object lock might allowed mapping of them. */ while (object->size < newobjsz) { m = vm_page_alloc_contig(object, object->size, aflags, pagesizes[psind] / PAGE_SIZE, 0, ~0, pagesizes[psind], 0, VM_MEMATTR_DEFAULT); if (m == NULL) { VM_OBJECT_WUNLOCK(object); if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT || (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_DEFAULT && try >= largepage_reclaim_tries)) { VM_OBJECT_WLOCK(object); return (ENOMEM); } error = vm_page_reclaim_contig(aflags, pagesizes[psind] / PAGE_SIZE, 0, ~0, pagesizes[psind], 0) ? 0 : vm_wait_intr(object); if (error != 0) { VM_OBJECT_WLOCK(object); return (error); } try++; VM_OBJECT_WLOCK(object); continue; } try = 0; for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) { if ((m[i].flags & PG_ZERO) == 0) pmap_zero_page(&m[i]); vm_page_valid(&m[i]); vm_page_xunbusy(&m[i]); } object->size += OFF_TO_IDX(pagesizes[psind]); shmfd->shm_size += pagesizes[psind]; atomic_add_long(&count_largepages[psind], 1); vm_wire_add(atop(pagesizes[psind])); } return (0); } static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie) { int error; VM_OBJECT_WLOCK(shmfd->shm_object); error = shm_largepage(shmfd) ? shm_dotruncate_largepage(shmfd, length, rl_cookie) : shm_dotruncate_locked(shmfd, length, rl_cookie); VM_OBJECT_WUNLOCK(shmfd->shm_object); return (error); } int shm_dotruncate(struct shmfd *shmfd, off_t length) { void *rl_cookie; int error; rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); error = shm_dotruncate_cookie(shmfd, length, rl_cookie); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } /* * shmfd object management including creation and reference counting * routines. */ struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode, bool largepage) { struct shmfd *shmfd; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; if (largepage) { shmfd->shm_object = phys_pager_allocate(NULL, &shm_largepage_phys_ops, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; } else { shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); } KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; shmfd->shm_ino = alloc_unr64(&shm_ino_unr); refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } void shm_drop(struct shmfd *shmfd) { if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); vm_object_deallocate(shmfd->shm_object); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. */ int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred); mtx_unlock(&shm_timestamp_lock); return (error); } static void shm_init(void *arg) { char name[32]; int i; mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); new_unrhdr64(&shm_ino_unr, 1); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); for (i = 1; i < MAXPAGESIZES; i++) { if (pagesizes[i] == 0) break; #define M (1024 * 1024) #define G (1024 * M) if (pagesizes[i] >= G) snprintf(name, sizeof(name), "%luG", pagesizes[i] / G); else if (pagesizes[i] >= M) snprintf(name, sizeof(name), "%luM", pagesizes[i] / M); else snprintf(name, sizeof(name), "%lu", pagesizes[i]); #undef G #undef M SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages), OID_AUTO, name, CTLFLAG_RD, &count_largepages[i], "number of non-transient largepages allocated"); } } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. */ static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); return (0); } } return (ENOENT); } int kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, int shmflags, struct filecaps *fcaps, const char *name __unused) { struct pwddesc *pdp; struct shmfd *shmfd; struct file *fp; char *path; void *rl_cookie; Fnv32_t fnv; mode_t cmode; int error, fd, initial_seals; bool largepage; if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE | SHM_LARGEPAGE)) != 0) return (EINVAL); initial_seals = F_SEAL_SEAL; if ((shmflags & SHM_ALLOW_SEALING) != 0) initial_seals &= ~F_SEAL_SEAL; #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) return (ECAPMODE); #endif AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); largepage = (shmflags & SHM_LARGEPAGE) != 0; if (largepage && !PMAP_HAS_LARGEPAGES) return (ENOTTY); /* * Currently only F_SEAL_SEAL may be set when creating or opening shmfd. * If the decision is made later to allow additional seals, care must be * taken below to ensure that the seals are properly set if the shmfd * already existed -- this currently assumes that only F_SEAL_SEAL can * be set and doesn't take further precautions to ensure the validity of * the seals being added with respect to current mappings. */ if ((initial_seals & ~F_SEAL_SEAL) != 0) return (EINVAL); pdp = td->td_proc->p_pd; cmode = (mode & ~pdp->pd_cmask) & ACCESSPERMS; /* * shm_open(2) created shm should always have O_CLOEXEC set, as mandated * by POSIX. We allow it to be unset here so that an in-kernel * interface may be written as a thin layer around shm, optionally not * setting CLOEXEC. For shm_open(2), O_CLOEXEC is set unconditionally * in sys_shm_open() to keep this implementation compliant. */ error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps); if (error) return (error); /* A SHM_ANON path pointer creates an anonymous object. */ if (userpath == SHM_ANON) { /* A read-only anonymous object is pointless. */ if ((flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode, largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; } else { error = shm_copyin_path(td, userpath, &path); if (error != 0) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode, largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); /* * kern_shm_open() likely shouldn't ever error out on * trying to set a seal that already exists, unlike * F_ADD_SEALS. This would break terribly as * shm_open(2) actually sets F_SEAL_SEAL to maintain * historical behavior where the underlying file could * not be sealed. */ initial_seals &= ~shmfd->shm_seals; /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); /* * initial_seals can't set additional seals if we've * already been set F_SEAL_SEAL. If F_SEAL_SEAL is set, * then we've already removed that one from * initial_seals. This is currently redundant as we * only allow setting F_SEAL_SEAL at creation time, but * it's cheap to check and decreases the effort required * to allow additional seals. */ if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 && initial_seals != 0) error = EPERM; else if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else if (shmflags != 0 && shmflags != shmfd->shm_flags) error = EINVAL; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { VM_OBJECT_WLOCK(shmfd->shm_object); #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif error = shm_dotruncate_locked(shmfd, 0, rl_cookie); VM_OBJECT_WUNLOCK(shmfd->shm_object); } if (error == 0) { /* * Currently we only allow F_SEAL_SEAL to be * set initially. As noted above, this would * need to be reworked should that change. */ shmfd->shm_seals |= initial_seals; shm_hold(shmfd); } rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* System calls. */ #ifdef COMPAT_FREEBSD12 int freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC, uap->mode, NULL)); } #endif int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; Fnv32_t fnv; int error; error = shm_copyin_path(td, uap->path, &path); if (error != 0) return (error); AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_SHMFD); return (error); } int sys_shm_rename(struct thread *td, struct shm_rename_args *uap) { char *path_from = NULL, *path_to = NULL; Fnv32_t fnv_from, fnv_to; struct shmfd *fd_from; struct shmfd *fd_to; int error; int flags; flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Make sure the user passed only valid flags. * If you add a new flag, please add a new term here. */ if ((flags & ~( SHM_RENAME_NOREPLACE | SHM_RENAME_EXCHANGE )) != 0) { error = EINVAL; goto out; } /* * EXCHANGE and NOREPLACE don't quite make sense together. Let's * force the user to choose one or the other. */ if ((flags & SHM_RENAME_NOREPLACE) != 0 && (flags & SHM_RENAME_EXCHANGE) != 0) { error = EINVAL; goto out; } /* Renaming to or from anonymous makes no sense */ if (uap->path_from == SHM_ANON || uap->path_to == SHM_ANON) { error = EINVAL; goto out; } error = shm_copyin_path(td, uap->path_from, &path_from); if (error != 0) goto out; error = shm_copyin_path(td, uap->path_to, &path_to); if (error != 0) goto out; AUDIT_ARG_UPATH1_CANON(path_from); AUDIT_ARG_UPATH2_CANON(path_to); /* Rename with from/to equal is a no-op */ if (strcmp(path_from, path_to) == 0) goto out; fnv_from = fnv_32_str(path_from, FNV1_32_INIT); fnv_to = fnv_32_str(path_to, FNV1_32_INIT); sx_xlock(&shm_dict_lock); fd_from = shm_lookup(path_from, fnv_from); if (fd_from == NULL) { error = ENOENT; goto out_locked; } fd_to = shm_lookup(path_to, fnv_to); if ((flags & SHM_RENAME_NOREPLACE) != 0 && fd_to != NULL) { error = EEXIST; goto out_locked; } /* * Unconditionally prevents shm_remove from invalidating the 'from' * shm's state. */ shm_hold(fd_from); error = shm_remove(path_from, fnv_from, td->td_ucred); /* * One of my assumptions failed if ENOENT (e.g. locking didn't * protect us) */ KASSERT(error != ENOENT, ("Our shm disappeared during shm_rename: %s", path_from)); if (error != 0) { shm_drop(fd_from); goto out_locked; } /* * If we are exchanging, we need to ensure the shm_remove below * doesn't invalidate the dest shm's state. */ if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) shm_hold(fd_to); /* * NOTE: if path_to is not already in the hash, c'est la vie; * it simply means we have nothing already at path_to to unlink. * That is the ENOENT case. * * If we somehow don't have access to unlink this guy, but * did for the shm at path_from, then relink the shm to path_from * and abort with EACCES. * * All other errors: that is weird; let's relink and abort the * operation. */ error = shm_remove(path_to, fnv_to, td->td_ucred); if (error != 0 && error != ENOENT) { shm_insert(path_from, fnv_from, fd_from); shm_drop(fd_from); /* Don't free path_from now, since the hash references it */ path_from = NULL; goto out_locked; } error = 0; shm_insert(path_to, fnv_to, fd_from); /* Don't free path_to now, since the hash references it */ path_to = NULL; /* We kept a ref when we removed, and incremented again in insert */ shm_drop(fd_from); KASSERT(fd_from->shm_refs > 0, ("Expected >0 refs; got: %d\n", fd_from->shm_refs)); if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) { shm_insert(path_from, fnv_from, fd_to); path_from = NULL; shm_drop(fd_to); KASSERT(fd_to->shm_refs > 0, ("Expected >0 refs; got: %d\n", fd_to->shm_refs)); } out_locked: sx_xunlock(&shm_dict_lock); out: free(path_from, M_SHMFD); free(path_to, M_SHMFD); return (error); } static int shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags, vm_ooffset_t foff, struct thread *td) { struct vmspace *vms; vm_map_entry_t next_entry, prev_entry; vm_offset_t align, mask, maxaddr; int docow, error, rv, try; bool curmap; if (shmfd->shm_lp_psind == 0) return (EINVAL); /* MAP_PRIVATE is disabled */ if ((flags & ~(MAP_SHARED | MAP_FIXED | MAP_EXCL | MAP_NOCORE | #ifdef MAP_32BIT MAP_32BIT | #endif MAP_ALIGNMENT_MASK)) != 0) return (EINVAL); vms = td->td_proc->p_vmspace; curmap = map == &vms->vm_map; if (curmap) { error = kern_mmap_racct_check(td, map, size); if (error != 0) return (error); } docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT; docow |= MAP_INHERIT_SHARE; if ((flags & MAP_NOCORE) != 0) docow |= MAP_DISABLE_COREDUMP; mask = pagesizes[shmfd->shm_lp_psind] - 1; if ((foff & mask) != 0) return (EINVAL); maxaddr = vm_map_max(map); #ifdef MAP_32BIT if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR) maxaddr = MAP_32BIT_MAX_ADDR; #endif if (size == 0 || (size & mask) != 0 || (*addr != 0 && ((*addr & mask) != 0 || *addr + size < *addr || *addr + size > maxaddr))) return (EINVAL); align = flags & MAP_ALIGNMENT_MASK; if (align == 0) { align = pagesizes[shmfd->shm_lp_psind]; } else if (align == MAP_ALIGNED_SUPER) { if (shmfd->shm_lp_psind != 1) return (EINVAL); align = pagesizes[1]; } else { align >>= MAP_ALIGNMENT_SHIFT; align = 1ULL << align; /* Also handles overflow. */ if (align < pagesizes[shmfd->shm_lp_psind]) return (EINVAL); } vm_map_lock(map); if ((flags & MAP_FIXED) == 0) { try = 1; if (curmap && (*addr == 0 || (*addr >= round_page((vm_offset_t)vms->vm_taddr) && *addr < round_page((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA))))) { *addr = roundup2((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA), pagesizes[shmfd->shm_lp_psind]); } again: rv = vm_map_find_aligned(map, addr, size, maxaddr, align); if (rv != KERN_SUCCESS) { if (try == 1) { try = 2; *addr = vm_map_min(map); if ((*addr & mask) != 0) *addr = (*addr + mask) & mask; goto again; } goto fail1; } } else if ((flags & MAP_EXCL) == 0) { rv = vm_map_delete(map, *addr, *addr + size); if (rv != KERN_SUCCESS) goto fail1; } else { error = ENOSPC; if (vm_map_lookup_entry(map, *addr, &prev_entry)) goto fail; next_entry = vm_map_entry_succ(prev_entry); if (next_entry->start < *addr + size) goto fail; } rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size, prot, max_prot, docow); fail1: error = vm_mmap_to_errno(rv); fail: vm_map_unlock(map); return (error); } static int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; vm_prot_t maxprot; int error; bool writecnt; void *rl_cookie; shmfd = fp->f_data; maxprot = VM_PROT_NONE; rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, objsize, &shmfd->shm_mtx); /* FREAD should always be set. */ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; /* * If FWRITE's set, we can allow VM_PROT_WRITE unless it's a shared * mapping with a write seal applied. Private mappings are always * writeable. */ if ((flags & MAP_SHARED) == 0) { cap_maxprot |= VM_PROT_WRITE; maxprot |= VM_PROT_WRITE; writecnt = false; } else { if ((fp->f_flag & FWRITE) != 0 && (shmfd->shm_seals & F_SEAL_WRITE) == 0) maxprot |= VM_PROT_WRITE; /* * Any mappings from a writable descriptor may be upgraded to * VM_PROT_WRITE with mprotect(2), unless a write-seal was * applied between the open and subsequent mmap(2). We want to * reject application of a write seal as long as any such * mapping exists so that the seal cannot be trivially bypassed. */ writecnt = (maxprot & VM_PROT_WRITE) != 0; if (!writecnt && (prot & VM_PROT_WRITE) != 0) { error = EACCES; goto out; } } maxprot &= cap_maxprot; /* See comment in vn_mmap(). */ if ( #ifdef _LP64 objsize > OFF_MAX || #endif foff > OFF_MAX - objsize) { error = EINVAL; goto out; } #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) goto out; #endif mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); if (shm_largepage(shmfd)) { writecnt = false; error = shm_mmap_large(shmfd, map, addr, objsize, prot, maxprot, flags, foff, td); } else { if (writecnt) { vm_pager_update_writecount(shmfd->shm_object, 0, objsize); } error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, shmfd->shm_object, foff, writecnt, td); } if (error != 0) { if (writecnt) vm_pager_release_writecount(shmfd->shm_object, 0, objsize); vm_object_deallocate(shmfd->shm_object); } out: rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. * Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed. */ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list) { const char *path, *pr_path; size_t pr_pathlen; bool visible; sx_assert(&shm_dict_lock, SA_LOCKED); kif->kf_type = KF_TYPE_SHM; kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { if (shmfd->shm_path != NULL) { path = shmfd->shm_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); visible = strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/'; if (list && !visible) return (EPERM); if (visible) path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } } return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp __unused) { int res; sx_slock(&shm_dict_lock); res = shm_fill_kinfo_locked(fp->f_data, kif, false); sx_sunlock(&shm_dict_lock); return (res); } static int shm_add_seals(struct file *fp, int seals) { struct shmfd *shmfd; void *rl_cookie; vm_ooffset_t writemappings; int error, nseals; error = 0; shmfd = fp->f_data; rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); /* Even already-set seals should result in EPERM. */ if ((shmfd->shm_seals & F_SEAL_SEAL) != 0) { error = EPERM; goto out; } nseals = seals & ~shmfd->shm_seals; if ((nseals & F_SEAL_WRITE) != 0) { if (shm_largepage(shmfd)) { error = ENOTSUP; goto out; } /* * The rangelock above prevents writable mappings from being * added after we've started applying seals. The RLOCK here * is to avoid torn reads on ILP32 arches as unmapping/reducing * writemappings will be done without a rangelock. */ VM_OBJECT_RLOCK(shmfd->shm_object); writemappings = shmfd->shm_object->un_pager.swp.writemappings; VM_OBJECT_RUNLOCK(shmfd->shm_object); /* kmappings are also writable */ if (writemappings > 0) { error = EBUSY; goto out; } } shmfd->shm_seals |= nseals; out: rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } static int shm_get_seals(struct file *fp, int *seals) { struct shmfd *shmfd; shmfd = fp->f_data; *seals = shmfd->shm_seals; return (0); } static int shm_deallocate(struct shmfd *shmfd, off_t *offset, off_t *length, int flags) { vm_object_t object; vm_pindex_t pistart, pi, piend; vm_ooffset_t off, len; int startofs, endofs, end; int error; off = *offset; len = *length; KASSERT(off + len <= (vm_ooffset_t)OFF_MAX, ("off + len overflows")); if (off + len > shmfd->shm_size) len = shmfd->shm_size - off; object = shmfd->shm_object; startofs = off & PAGE_MASK; endofs = (off + len) & PAGE_MASK; pistart = OFF_TO_IDX(off); piend = OFF_TO_IDX(off + len); pi = OFF_TO_IDX(off + PAGE_MASK); error = 0; /* Handle the case when offset is on or beyond shm size. */ if ((off_t)len <= 0) { *length = 0; return (0); } VM_OBJECT_WLOCK(object); if (startofs != 0) { end = pistart != piend ? PAGE_SIZE : endofs; error = shm_partial_page_invalidate(object, pistart, startofs, end); if (error) goto out; off += end - startofs; len -= end - startofs; } if (pi < piend) { vm_object_page_remove(object, pi, piend, 0); off += IDX_TO_OFF(piend - pi); len -= IDX_TO_OFF(piend - pi); } if (endofs != 0 && pistart != piend) { error = shm_partial_page_invalidate(object, piend, 0, endofs); if (error) goto out; off += endofs; len -= endofs; } out: VM_OBJECT_WUNLOCK(shmfd->shm_object); *offset = off; *length = len; return (error); } static int shm_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td) { void *rl_cookie; struct shmfd *shmfd; off_t off, len; int error; /* This assumes that the caller already checked for overflow. */ error = EINVAL; shmfd = fp->f_data; off = *offset; len = *length; if (cmd != SPACECTL_DEALLOC || off < 0 || len <= 0 || len > OFF_MAX - off || flags != 0) return (EINVAL); rl_cookie = rangelock_wlock(&shmfd->shm_rl, off, off + len, &shmfd->shm_mtx); switch (cmd) { case SPACECTL_DEALLOC: if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) { error = EPERM; break; } error = shm_deallocate(shmfd, &off, &len, flags); *offset = off; *length = len; break; default: __assert_unreachable(); } rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); return (error); } static int shm_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) { void *rl_cookie; struct shmfd *shmfd; size_t size; int error; /* This assumes that the caller already checked for overflow. */ error = 0; shmfd = fp->f_data; size = offset + len; /* * Just grab the rangelock for the range that we may be attempting to * grow, rather than blocking read/write for regions we won't be * touching while this (potential) resize is in progress. Other * attempts to resize the shmfd will have to take a write lock from 0 to * OFF_MAX, so this being potentially beyond the current usable range of * the shmfd is not necessarily a concern. If other mechanisms are * added to grow a shmfd, this may need to be re-evaluated. */ rl_cookie = rangelock_wlock(&shmfd->shm_rl, offset, size, &shmfd->shm_mtx); if (size > shmfd->shm_size) error = shm_dotruncate_cookie(shmfd, size, rl_cookie); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); /* Translate to posix_fallocate(2) return value as needed. */ if (error == ENOMEM) error = ENOSPC; return (error); } static int sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS) { struct shm_mapping *shmm; struct sbuf sb; struct kinfo_file kif; u_long i; ssize_t curlen; int error, error2; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); curlen = 0; error = 0; sx_slock(&shm_dict_lock); for (i = 0; i < shm_hash + 1; i++) { LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) { error = shm_fill_kinfo_locked(shmm->sm_shmfd, &kif, true); if (error == EPERM) { error = 0; continue; } if (error != 0) break; pack_kinfo(&kif); if (req->oldptr != NULL && kif.kf_structsize + curlen > req->oldlen) break; error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ? 0 : ENOMEM; if (error != 0) break; curlen += kif.kf_structsize; } } sx_sunlock(&shm_dict_lock); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE, NULL, 0, sysctl_posix_shm_list, "", "POSIX SHM list"); int kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode, struct filecaps *caps) { return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL)); } /* * This version of the shm_open() interface leaves CLOEXEC behavior up to the * caller, and libc will enforce it for the traditional shm_open() call. This * allows other consumers, like memfd_create(), to opt-in for CLOEXEC. This * interface also includes a 'name' argument that is currently unused, but could * potentially be exported later via some interface for debugging purposes. * From the kernel's perspective, it is optional. Individual consumers like * memfd_create() may require it in order to be compatible with other systems * implementing the same function. */ int sys_shm_open2(struct thread *td, struct shm_open2_args *uap) { return (kern_shm_open2(td, uap->path, uap->flags, uap->mode, uap->shmflags, NULL, uap->name)); } diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 82a8125ece95..29a6b34d083a 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1,5012 +1,5012 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information"); static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag); static int setfflags(struct thread *td, struct vnode *, u_long); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int getutimens(const struct timespec *, enum uio_seg, struct timespec *, int *); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); static int kern_fhlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp); static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, struct thread *td); static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, enum uio_seg segflag); static uint64_t at2cnpflags(u_int at_flags, u_int mask) { u_int64_t res; MPASS((at_flags & (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)) != (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)); res = 0; at_flags &= mask; if ((at_flags & AT_RESOLVE_BENEATH) != 0) res |= RBENEATH; if ((at_flags & AT_SYMLINK_FOLLOW) != 0) res |= FOLLOW; /* NOFOLLOW is pseudo flag */ if ((mask & AT_SYMLINK_NOFOLLOW) != 0) { res |= (at_flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW : FOLLOW; } if ((mask & AT_EMPTY_PATH) != 0 && (at_flags & AT_EMPTY_PATH) != 0) res |= EMPTYPATH; return (res); } int kern_sync(struct thread *td) { struct mount *mp, *nmp; int save; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { save = curthread_pflags_set(TDP_SYNCIO); vfs_periodic(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT); curthread_pflags_restore(save); vn_finished_write(mp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); return (0); } /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif /* ARGSUSED */ int sys_sync(struct thread *td, struct sync_args *uap) { return (kern_sync(td)); } /* * Change filesystem quotas. */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif int sys_quotactl(struct thread *td, struct quotactl_args *uap) { struct mount *mp; struct nameidata nd; int error; bool mp_busy; AUDIT_ARG_CMD(uap->cmd); AUDIT_ARG_UID(uap->uid); if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS)) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); mp = nd.ni_vp->v_mount; vfs_ref(mp); vput(nd.ni_vp); error = vfs_busy(mp, 0); if (error != 0) { vfs_rel(mp); return (error); } mp_busy = true; error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, &mp_busy); /* * Since quota on/off operations typically need to open quota * files, the implementation may need to unbusy the mount point * before calling into namei. Otherwise, unmount might be * started between two vfs_busy() invocations (first is ours, * second is from mount point cross-walk code in lookup()), * causing deadlock. * * Avoid unbusying mp if the implementation indicates it has * already done so. */ if (mp_busy) vfs_unbusy(mp); vfs_rel(mp); return (error); } /* * Used by statfs conversion routines to scale the block size up if * necessary so that all of the block counts are <= 'max_size'. Note * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero * value of 'n'. */ void statfs_scale_blocks(struct statfs *sf, long max_size) { uint64_t count; int shift; KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__)); /* * Attempt to scale the block counts to give a more accurate * overview to userland of the ratio of free space to used * space. To do this, find the largest block count and compute * a divisor that lets it fit into a signed integer <= max_size. */ if (sf->f_bavail < 0) count = -sf->f_bavail; else count = sf->f_bavail; count = MAX(sf->f_blocks, MAX(sf->f_bfree, count)); if (count <= max_size) return; count >>= flsl(max_size); shift = 0; while (count > 0) { shift++; count >>=1; } sf->f_bsize <<= shift; sf->f_blocks >>= shift; sf->f_bfree >>= shift; sf->f_bavail >>= shift; } static int kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf) { int error; if (mp == NULL) return (EBADF); error = vfs_busy(mp, 0); vfs_rel(mp); if (error != 0) return (error); #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif error = VFS_STATFS(mp, buf); if (error != 0) goto out; if (priv_check_cred_vfs_generation(td->td_ucred)) { buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, buf); } out: vfs_unbusy(mp); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif int sys_statfs(struct thread *td, struct statfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_statfs(struct thread *td, const char *path, enum uio_seg pathseg, struct statfs *buf) { struct mount *mp; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); error = namei(&nd); if (error != 0) return (error); mp = vfs_ref_from_vp(nd.ni_vp); NDFREE_NOTHING(&nd); vrele(nd.ni_vp); return (kern_do_statfs(td, mp, buf)); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif int sys_fstatfs(struct thread *td, struct fstatfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_fstatfs(struct thread *td, int fd, struct statfs *buf) { struct file *fp; struct mount *mp; struct vnode *vp; int error; AUDIT_ARG_FD(fd); error = getvnode_path(td, fd, &cap_fstatfs_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); } #endif mp = vfs_ref_from_vp(vp); fdrop(fp, td); return (kern_do_statfs(td, mp, buf)); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int mode; }; #endif int sys_getfsstat(struct thread *td, struct getfsstat_args *uap) { size_t count; int error; if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX) return (EINVAL); error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count, UIO_USERSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; return (error); } /* * If (bufsize > 0 && bufseg == UIO_SYSSPACE) * The caller is responsible for freeing memory which will be allocated * in '*buf'. */ int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, size_t *countp, enum uio_seg bufseg, int mode) { struct mount *mp, *nmp; struct statfs *sfsp, *sp, *sptmp, *tofree; size_t count, maxcount; int error; switch (mode) { case MNT_WAIT: case MNT_NOWAIT: break; default: if (bufseg == UIO_SYSSPACE) *buf = NULL; return (EINVAL); } restart: maxcount = bufsize / sizeof(struct statfs); if (bufsize == 0) { sfsp = NULL; tofree = NULL; } else if (bufseg == UIO_USERSPACE) { sfsp = *buf; tofree = NULL; } else /* if (bufseg == UIO_SYSSPACE) */ { count = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { count++; } mtx_unlock(&mountlist_mtx); if (maxcount > count) maxcount = count; tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_STATFS, M_WAITOK); } count = 0; /* * If there is no target buffer they only want the count. * * This could be TAILQ_FOREACH but it is open-coded to match the original * code below. */ if (sfsp == NULL) { mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif count++; nmp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); *countp = count; return (0); } /* * They want the entire thing. * * Short-circuit the corner case of no room for anything, avoids * relocking below. */ if (maxcount < 1) { goto out; } mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (mode == MNT_WAIT) { if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) { /* * If vfs_busy() failed, and MBF_NOWAIT * wasn't passed, then the mp is gone. * Furthermore, because of MBF_MNTLSTLOCK, * the mountlist_mtx was dropped. We have * no other choice than to start over. */ mtx_unlock(&mountlist_mtx); free(tofree, M_STATFS); goto restart; } } else { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } } sp = &mp->mnt_stat; /* * If MNT_NOWAIT is specified, do not refresh * the fsstat cache. */ if (mode != MNT_NOWAIT) { error = VFS_STATFS(mp, sp); if (error != 0) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); continue; } } if (priv_check_cred_vfs_generation(td->td_ucred)) { sptmp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); *sptmp = *sp; sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, sptmp); sp = sptmp; } else sptmp = NULL; if (bufseg == UIO_SYSSPACE) { bcopy(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); } else /* if (bufseg == UIO_USERSPACE) */ { error = copyout(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); if (error != 0) { vfs_unbusy(mp); return (error); } } sfsp++; count++; if (count == maxcount) { vfs_unbusy(mp); goto out; } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); out: *countp = count; return (0); } #ifdef COMPAT_FREEBSD4 /* * Get old format filesystem statistics. */ static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *); #ifndef _SYS_SYSPROTO_H_ struct freebsd4_statfs_args { char *path; struct ostatfs *buf; }; #endif int freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fstatfs_args { int fd; struct ostatfs *buf; }; #endif int freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_getfsstat_args { struct ostatfs *buf; long bufsize; int mode; }; #endif int freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap) { struct statfs *buf, *sp; struct ostatfs osb; size_t count, size; int error; if (uap->bufsize < 0) return (EINVAL); count = uap->bufsize / sizeof(struct ostatfs); if (count > SIZE_MAX / sizeof(struct statfs)) return (EINVAL); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; if (size != 0) { sp = buf; while (count != 0 && error == 0) { freebsd4_cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fhstatfs_args { struct fhandle *u_fhp; struct ostatfs *buf; }; #endif int freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp) { statfs_scale_blocks(nsp, LONG_MAX); bzero(osp, sizeof(*osp)); osp->f_bsize = nsp->f_bsize; osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = MIN(nsp->f_files, LONG_MAX); osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); osp->f_owner = nsp->f_owner; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, OMFSNAMELEN)); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, OMNAMELEN)); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, OMNAMELEN)); osp->f_fsid = nsp->f_fsid; } #endif /* COMPAT_FREEBSD4 */ #if defined(COMPAT_FREEBSD11) /* * Get old format filesystem statistics. */ static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *); int freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ int freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ int freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap) { struct freebsd11_statfs osb; struct statfs *buf, *sp; size_t count, size; int error; count = uap->bufsize / sizeof(struct ostatfs); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; if (size > 0) { sp = buf; while (count > 0 && error == 0) { freebsd11_cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ int freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp) { bzero(osp, sizeof(*osp)); osp->f_version = FREEBSD11_STATFS_VERSION; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_bsize = nsp->f_bsize; osp->f_iosize = nsp->f_iosize; osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = nsp->f_files; osp->f_ffree = nsp->f_ffree; osp->f_syncwrites = nsp->f_syncwrites; osp->f_asyncwrites = nsp->f_asyncwrites; osp->f_syncreads = nsp->f_syncreads; osp->f_asyncreads = nsp->f_asyncreads; osp->f_namemax = nsp->f_namemax; osp->f_owner = nsp->f_owner; osp->f_fsid = nsp->f_fsid; strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, sizeof(osp->f_fstypename))); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, sizeof(osp->f_mntonname))); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, sizeof(osp->f_mntfromname))); } #endif /* COMPAT_FREEBSD11 */ /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif int sys_fchdir(struct thread *td, struct fchdir_args *uap) { struct vnode *vp, *tdp; struct mount *mp; struct file *fp; int error; AUDIT_ARG_FD(uap->fd); error = getvnode_path(td, uap->fd, &cap_fchdir_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; vref(vp); fdrop(fp, td); vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = change_dir(vp, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0)) continue; error = VFS_ROOT(mp, LK_SHARED, &tdp); vfs_unbusy(mp); if (error != 0) break; vput(vp); vp = tdp; } if (error != 0) { vput(vp); return (error); } VOP_UNLOCK(vp); pwd_chdir(td, vp); return (0); } /* * Change current working directory (``.''). */ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif int sys_chdir(struct thread *td, struct chdir_args *uap) { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, const char *path, enum uio_seg pathseg) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); NDFREE_NOTHING(&nd); return (error); } VOP_UNLOCK(nd.ni_vp); NDFREE_NOTHING(&nd); pwd_chdir(td, nd.ni_vp); return (0); } static int unprivileged_chroot = 0; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_chroot, CTLFLAG_RW, &unprivileged_chroot, 0, "Unprivileged processes can use chroot(2)"); /* * Change notion of root (``/'') directory. */ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif int sys_chroot(struct thread *td, struct chroot_args *uap) { struct nameidata nd; struct proc *p; int error; error = priv_check(td, PRIV_VFS_CHROOT); if (error != 0) { p = td->td_proc; PROC_LOCK(p); if (unprivileged_chroot == 0 || (p->p_flag2 & P2_NO_NEW_PRIVS) == 0) { PROC_UNLOCK(p); return (error); } PROC_UNLOCK(p); } NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error != 0) goto error; error = change_dir(nd.ni_vp, td); if (error != 0) goto e_vunlock; #ifdef MAC error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp); if (error != 0) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp); error = pwd_chroot(td, nd.ni_vp); vrele(nd.ni_vp); NDFREE_NOTHING(&nd); return (error); e_vunlock: vput(nd.ni_vp); error: NDFREE_NOTHING(&nd); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. */ int change_dir(struct vnode *vp, struct thread *td) { #ifdef MAC int error; #endif ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_vnode_check_chdir(td->td_ucred, vp); if (error != 0) return (error); #endif return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td)); } static __inline void flags_to_rights(int flags, cap_rights_t *rightsp) { if (flags & O_EXEC) { cap_rights_set_one(rightsp, CAP_FEXECVE); if (flags & O_PATH) return; } else { switch ((flags & O_ACCMODE)) { case O_RDONLY: cap_rights_set_one(rightsp, CAP_READ); break; case O_RDWR: cap_rights_set_one(rightsp, CAP_READ); /* FALLTHROUGH */ case O_WRONLY: cap_rights_set_one(rightsp, CAP_WRITE); if (!(flags & (O_APPEND | O_TRUNC))) cap_rights_set_one(rightsp, CAP_SEEK); break; } } if (flags & O_CREAT) cap_rights_set_one(rightsp, CAP_CREATE); if (flags & O_TRUNC) cap_rights_set_one(rightsp, CAP_FTRUNCATE); if (flags & (O_SYNC | O_FSYNC)) cap_rights_set_one(rightsp, CAP_FSYNC); if (flags & (O_EXLOCK | O_SHLOCK)) cap_rights_set_one(rightsp, CAP_FLOCK); } /* * Check permissions, allocate an open file structure, and call the device * open routine if any. */ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int sys_open(struct thread *td, struct open_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct openat_args { int fd; char *path; int flag; int mode; }; #endif int sys_openat(struct thread *td, struct openat_args *uap) { AUDIT_ARG_FD(uap->fd); return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->mode)); } int kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp; struct pwddesc *pdp; struct file *fp; struct vnode *vp; struct nameidata nd; cap_rights_t rights; int cmode, error, indx; indx = -1; fdp = p->p_fd; pdp = p->p_pd; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); cap_rights_init_one(&rights, CAP_LOOKUP); flags_to_rights(flags, &rights); /* * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags * may be specified. On the other hand, for O_PATH any mode * except O_EXEC is ignored. */ if ((flags & O_PATH) != 0) { flags &= ~(O_CREAT | O_ACCMODE); } else if ((flags & O_EXEC) != 0) { if (flags & O_ACCMODE) return (EINVAL); } else if ((flags & O_ACCMODE) == O_ACCMODE) { return (EINVAL); } else { flags = FFLAGS(flags); } /* * Allocate a file structure. The descriptor to reference it * is allocated and used by finstall_refed() below. */ error = falloc_noinstall(td, &fp); if (error != 0) return (error); /* Set the flags early so the finit in devfs can pick them up. */ fp->f_flag = flags & FMASK; cmode = ((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT; NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, &rights, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open(&nd, &flags, cmode, fp); if (error != 0) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) { MPASS((flags & O_PATH) == 0); goto success; } /* * Handle special fdopen() case. bleh. * * Don't do this for relative (capability) lookups; we don't * understand exactly what would happen, and we don't think * that it ever should. */ if ((nd.ni_resflags & NIRES_STRICTREL) == 0 && (error == ENODEV || error == ENXIO) && td->td_dupfd >= 0) { error = dupfdopen(td, fdp, td->td_dupfd, flags, error, &indx); if (error == 0) goto success; } goto bad; } td->td_dupfd = 0; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * Store the vnode, for any f_type. Typically, the vnode use * count is decremented by direct call to vn_closefile() for * files that switched type in the cdevsw fdopen() method. */ fp->f_vnode = vp; /* * If the file wasn't claimed by devfs bind it to the normal * vnode operations here. */ if (fp->f_ops == &badfileops) { KASSERT(vp->v_type != VFIFO || (flags & O_PATH) != 0, ("Unexpected fifo fp %p vp %p", fp, vp)); if ((flags & O_PATH) != 0) { finit(fp, (flags & FMASK) | (fp->f_flag & FKQALLOWED), DTYPE_VNODE, NULL, &path_fileops); vhold(vp); vunref(vp); } else { finit_vnode(fp, flags, NULL, &vnops); } } VOP_UNLOCK(vp); if (flags & O_TRUNC) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } success: /* * If we haven't already installed the FD (for dupfdopen), do so now. */ if (indx == -1) { struct filecaps *fcaps; #ifdef CAPABILITIES if ((nd.ni_resflags & NIRES_STRICTREL) != 0) fcaps = &nd.ni_filecaps; else #endif fcaps = NULL; error = finstall_refed(td, fp, &indx, flags, fcaps); /* On success finstall_refed() consumes fcaps. */ if (error != 0) { filecaps_free(&nd.ni_filecaps); goto bad; } } else { filecaps_free(&nd.ni_filecaps); falloc_abort(td, fp); } td->td_retval[0] = indx; return (0); bad: KASSERT(indx == -1, ("indx=%d, should be -1", indx)); falloc_abort(td, fp); return (error); } #ifdef COMPAT_43 /* * Create a file. */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(struct thread *td, struct ocreat_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknodat_args { int fd; char *path; mode_t mode; dev_t dev; }; #endif int sys_mknodat(struct thread *td, struct mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #if defined(COMPAT_FREEBSD11) int freebsd11_mknod(struct thread *td, struct freebsd11_mknod_args *uap) { return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int freebsd11_mknodat(struct thread *td, struct freebsd11_mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #endif /* COMPAT_FREEBSD11 */ int kern_mknodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int mode, dev_t dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; struct nameidata nd; int error, whiteout = 0; AUDIT_ARG_MODE(mode); AUDIT_ARG_DEV(dev); switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = priv_check(td, PRIV_VFS_MKNOD_DEV); if (error == 0 && dev == VNOVAL) error = EINVAL; break; case S_IFWHT: error = priv_check(td, PRIV_VFS_MKNOD_WHT); break; case S_IFIFO: if (dev == 0) return (kern_mkfifoat(td, fd, path, pathseg, mode)); /* FALLTHROUGH */ default: error = EINVAL; break; } if (error != 0) return (error); NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, pathseg, path, fd, &cap_mknodat_rights, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); return (EEXIST); } else { VATTR_NULL(&vattr); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask; vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: panic("kern_mknod: invalid mode"); } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) { if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); } } VOP_VPUT_PAIR(nd.ni_dvp, error == 0 && !whiteout ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ERELOOKUP) goto restart; return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif int sys_mkfifo(struct thread *td, struct mkfifo_args *uap) { return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkfifoat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap) { return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifoat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; struct nameidata nd; int error; AUDIT_ARG_MODE(mode); NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ERELOOKUP) goto restart; return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif int sys_link(struct thread *td, struct link_args *uap) { return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link, UIO_USERSPACE, AT_SYMLINK_FOLLOW)); } #ifndef _SYS_SYSPROTO_H_ struct linkat_args { int fd1; char *path1; int fd2; char *path2; int flag; }; #endif int sys_linkat(struct thread *td, struct linkat_args *uap) { int flag; flag = uap->flag; if ((flag & ~(AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2, UIO_USERSPACE, flag)); } int hardlink_check_uid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, &hardlink_check_uid, 0, "Unprivileged processes cannot create hard links to files owned by other " "users"); static int hardlink_check_gid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, &hardlink_check_gid, 0, "Unprivileged processes cannot create hard links to files owned by other " "groups"); static int can_hardlink(struct vnode *vp, struct ucred *cred) { struct vattr va; int error; if (!hardlink_check_uid && !hardlink_check_gid) return (0); error = VOP_GETATTR(vp, &va, cred); if (error != 0) return (error); if (hardlink_check_uid && cred->cr_uid != va.va_uid) { error = priv_check_cred(cred, PRIV_VFS_LINK); if (error != 0) return (error); } if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { error = priv_check_cred(cred, PRIV_VFS_LINK); if (error != 0) return (error); } return (0); } int kern_linkat(struct thread *td, int fd1, int fd2, const char *path1, const char *path2, enum uio_seg segflag, int flag) { struct nameidata nd; int error; NDPREINIT(&nd); do { bwillwrite(); NDINIT_ATRIGHTS(&nd, LOOKUP, AUDITVNODE1 | at2cnpflags(flag, AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH), segflag, path1, fd1, &cap_linkat_source_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); if ((nd.ni_resflags & NIRES_EMPTYPATH) != 0) { error = priv_check(td, PRIV_VFS_FHOPEN); if (error != 0) { vrele(nd.ni_vp); return (error); } } error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag); } while (error == EAGAIN || error == ERELOOKUP); return (error); } static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, enum uio_seg segflag) { struct nameidata nd; struct mount *mp; int error; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd, &cap_linkat_target_rights, td); if ((error = namei(&nd)) == 0) { if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); vrele(vp); return (EEXIST); } else if (nd.ni_dvp->v_mount != vp->v_mount) { /* * Cross-device link. No need to recheck * vp->v_type, since it cannot change, except * to VBAD. */ NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vrele(vp); return (EXDEV); } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) { error = can_hardlink(vp, td->td_ucred); #ifdef MAC if (error == 0) error = mac_vnode_check_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); #endif if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } error = vn_start_write(vp, &mp, V_NOWAIT); if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); return (EAGAIN); } error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_VPUT_PAIR(nd.ni_dvp, &vp, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); vp = NULL; } else { vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); return (EAGAIN); } } if (vp != NULL) vrele(vp); return (error); } /* * Make a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif int sys_symlink(struct thread *td, struct symlink_args *uap) { return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct symlinkat_args { char *path; int fd; char *path2; }; #endif int sys_symlinkat(struct thread *td, struct symlinkat_args *uap) { return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2, UIO_USERSPACE)); } int kern_symlinkat(struct thread *td, const char *path1, int fd, const char *path2, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; const char *syspath; char *tmppath; struct nameidata nd; int error; if (segflg == UIO_SYSSPACE) { syspath = path1; } else { tmppath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path1, tmppath, MAXPATHLEN, NULL)) != 0) goto out; syspath = tmppath; } AUDIT_ARG_TEXT(syspath); NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, segflg, path2, fd, &cap_symlinkat_rights, td); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); nd.ni_vp = NULL; error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; goto restart; } VATTR_NULL(&vattr); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_pd->pd_cmask; #ifdef MAC vattr.va_type = VLNK; error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out2; #endif error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); #ifdef MAC out2: #endif VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ERELOOKUP) goto restart; out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, tmppath); return (error); } /* * Delete a whiteout from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct undelete_args { char *path; }; #endif int sys_undelete(struct thread *td, struct undelete_args *uap) { struct mount *mp; struct nameidata nd; int error; NDPREINIT(&nd); restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error != 0) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); if (error == ERELOOKUP) goto restart; return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif int sys_unlink(struct thread *td, struct unlink_args *uap) { return (kern_funlinkat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE, 0, 0)); } static int kern_funlinkat_ex(struct thread *td, int dfd, const char *path, int fd, int flag, enum uio_seg pathseg, ino_t oldinum) { if ((flag & ~(AT_REMOVEDIR | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); if ((flag & AT_REMOVEDIR) != 0) return (kern_frmdirat(td, dfd, path, fd, UIO_USERSPACE, 0)); return (kern_funlinkat(td, dfd, path, fd, UIO_USERSPACE, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct unlinkat_args { int fd; char *path; int flag; }; #endif int sys_unlinkat(struct thread *td, struct unlinkat_args *uap) { return (kern_funlinkat_ex(td, uap->fd, uap->path, FD_NONE, uap->flag, UIO_USERSPACE, 0)); } #ifndef _SYS_SYSPROTO_H_ struct funlinkat_args { int dfd; const char *path; int fd; int flag; }; #endif int sys_funlinkat(struct thread *td, struct funlinkat_args *uap) { return (kern_funlinkat_ex(td, uap->dfd, uap->path, uap->fd, uap->flag, UIO_USERSPACE, 0)); } int kern_funlinkat(struct thread *td, int dfd, const char *path, int fd, enum uio_seg pathseg, int flag, ino_t oldinum) { struct mount *mp; struct file *fp; struct vnode *vp; struct nameidata nd; struct stat sb; int error; fp = NULL; if (fd != FD_NONE) { error = getvnode_path(td, fd, &cap_no_rights, &fp); if (error != 0) return (error); } NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH), pathseg, path, dfd, &cap_unlinkat_rights, td); if ((error = namei(&nd)) != 0) { if (error == EINVAL) error = EPERM; goto fdout; } vp = nd.ni_vp; if (vp->v_type == VDIR && oldinum == 0) { error = EPERM; /* POSIX */ } else if (oldinum != 0 && - ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) && + ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED)) == 0) && sb.st_ino != oldinum) { error = EIDRM; /* Identifier removed */ } else if (fp != NULL && fp->f_vnode != vp) { if (VN_IS_DOOMED(fp->f_vnode)) error = EBADF; else error = EDEADLK; } else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? */ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) { goto fdout; } goto restart; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); if (error == ERELOOKUP) goto restart; fdout: if (fp != NULL) fdrop(fp, td); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int sys_lseek(struct thread *td, struct lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } int kern_lseek(struct thread *td, int fd, off_t offset, int whence) { struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_seek_rights, &fp); if (error != 0) return (error); error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ? fo_seek(fp, offset, whence, td) : ESPIPE; fdrop(fp, td); return (error); } #if defined(COMPAT_43) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(struct thread *td, struct olseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Version with the 'pad' argument */ int freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* * Check access permissions using passed credentials. */ static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td) { accmode_t accmode; int error; /* Flags == 0 means only check for existence. */ if (user_flags == 0) return (0); accmode = 0; if (user_flags & R_OK) accmode |= VREAD; if (user_flags & W_OK) accmode |= VWRITE; if (user_flags & X_OK) accmode |= VEXEC; #ifdef MAC error = mac_vnode_check_access(cred, vp, accmode); if (error != 0) return (error); #endif if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, accmode, cred, td); return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int amode; }; #endif int sys_access(struct thread *td, struct access_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0, uap->amode)); } #ifndef _SYS_SYSPROTO_H_ struct faccessat_args { int dirfd; char *path; int amode; int flag; } #endif int sys_faccessat(struct thread *td, struct faccessat_args *uap) { return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->amode)); } int kern_accessat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int flag, int amode) { struct ucred *cred, *usecred; struct vnode *vp; struct nameidata nd; int error; if ((flag & ~(AT_EACCESS | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0) return (EINVAL); /* * Create and modify a temporary credential instead of one that * is potentially shared (if we need one). */ cred = td->td_ucred; if ((flag & AT_EACCESS) == 0 && ((cred->cr_uid != cred->cr_ruid || cred->cr_rgid != cred->cr_groups[0]))) { usecred = crdup(cred); usecred->cr_uid = cred->cr_ruid; usecred->cr_groups[0] = cred->cr_rgid; td->td_ucred = usecred; } else usecred = cred; AUDIT_ARG_VALUE(amode); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH | AT_EMPTY_PATH), pathseg, path, fd, &cap_fstat_rights, td); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; error = vn_access(vp, amode, usecred, td); NDFREE_NOTHING(&nd); vput(vp); out: if (usecred != cred) { td->td_ucred = cred; crfree(usecred); } return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int amode; }; #endif int sys_eaccess(struct thread *td, struct eaccess_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, AT_EACCESS, uap->amode)); } #if defined(COMPAT_43) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif int ostat(struct thread *td, struct ostat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif int olstat(struct thread *td, struct olstat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Convert from an old to a new stat structure. * XXX: many values are blindly truncated. */ void cvtstat(struct stat *st, struct ostat *ost) { bzero(ost, sizeof(*ost)); ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; ost->st_size = MIN(st->st_size, INT32_MAX); ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 */ #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int ino64_trunc_error; SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW, &ino64_trunc_error, 0, "Error on truncation of device, file or inode number, or link count"); int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost) { ost->st_dev = st->st_dev; if (ost->st_dev != st->st_dev) { switch (ino64_trunc_error) { default: /* * Since dev_t is almost raw, don't clamp to the * maximum for case 2, but ignore the error. */ break; case 1: return (EOVERFLOW); } } ost->st_ino = st->st_ino; if (ost->st_ino != st->st_ino) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_ino = UINT32_MAX; break; } } ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; if (ost->st_nlink != st->st_nlink) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_nlink = UINT16_MAX; break; } } ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (ost->st_rdev != st->st_rdev) { switch (ino64_trunc_error) { default: break; case 1: return (EOVERFLOW); } } ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_size = st->st_size; ost->st_blocks = st->st_blocks; ost->st_blksize = st->st_blksize; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; ost->st_lspare = 0; ost->st_birthtim = st->st_birthtim; bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim), sizeof(*ost) - offsetof(struct freebsd11_stat, st_birthtim) - sizeof(ost->st_birthtim)); return (0); } int freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap) { struct fhandle fh; struct stat sb; struct freebsd11_stat osb; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } int freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->buf, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Get file status */ #ifndef _SYS_SYSPROTO_H_ struct fstatat_args { int fd; char *path; struct stat *buf; int flag; } #endif int sys_fstatat(struct thread *td, struct fstatat_args *uap) { struct stat sb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error == 0) error = copyout(&sb, uap->buf, sizeof (sb)); return (error); } int kern_statat(struct thread *td, int flag, int fd, const char *path, enum uio_seg pathseg, struct stat *sbp, void (*hook)(struct vnode *vp, struct stat *sbp)) { struct nameidata nd; int error; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_RESOLVE_BENEATH | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights, td); if ((error = namei(&nd)) != 0) { if (error == ENOTDIR && (nd.ni_resflags & NIRES_EMPTYPATH) != 0) error = kern_fstat(td, fd, sbp); return (error); } - error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED, td); + error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED); if (error == 0) { if (__predict_false(hook != NULL)) hook(nd.ni_vp, sbp); } NDFREE_NOTHING(&nd); vput(nd.ni_vp); #ifdef __STAT_TIME_T_EXT sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; #endif #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Implementation of the NetBSD [l]stat() functions. */ void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb) { bzero(nsb, sizeof(*nsb)); nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atim = sb->st_atim; nsb->st_mtim = sb->st_mtim; nsb->st_ctim = sb->st_ctim; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_birthtim = sb->st_birthtim; } #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); freebsd11_cvtnstat(&sb, &nsb); return (copyout(&nsb, uap->ub, sizeof (nsb))); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nlstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); freebsd11_cvtnstat(&sb, &nsb); return (copyout(&nsb, uap->ub, sizeof (nsb))); } #endif /* COMPAT_FREEBSD11 */ /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif int sys_pathconf(struct thread *td, struct pathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } #ifndef _SYS_SYSPROTO_H_ struct lpathconf_args { char *path; int name; }; #endif int sys_lpathconf(struct thread *td, struct lpathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_pathconf(struct thread *td, const char *path, enum uio_seg pathseg, int name, u_long flags, long *valuep) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = VOP_PATHCONF(nd.ni_vp, name, valuep); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; size_t count; }; #endif int sys_readlink(struct thread *td, struct readlink_args *uap) { return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } #ifndef _SYS_SYSPROTO_H_ struct readlinkat_args { int fd; char *path; char *buf; size_t bufsize; }; #endif int sys_readlinkat(struct thread *td, struct readlinkat_args *uap) { return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->bufsize)); } int kern_readlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count) { struct vnode *vp; struct nameidata nd; int error; if (count > IOSIZE_MAX) return (EINVAL); NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | EMPTYPATH, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); vp = nd.ni_vp; error = kern_readlink_vp(vp, buf, bufseg, count, td); vput(vp); return (error); } /* * Helper function to readlink from a vnode */ static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, struct thread *td) { struct iovec aiov; struct uio auio; int error; ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked"); #ifdef MAC error = mac_vnode_check_readlink(td->td_ucred, vp); if (error != 0) return (error); #endif if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0) return (EINVAL); aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); td->td_retval[0] = count - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(struct thread *td, struct vnode *vp, u_long flags) { struct mount *mp; struct vattr vattr; int error; /* We can't support the value matching VNOVAL. */ if (flags == VNOVAL) return (EOPNOTSUPP); /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); if (error != 0) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VATTR_NULL(&vattr); vattr.va_flags = flags; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { const char *path; u_long flags; }; #endif int sys_chflags(struct thread *td, struct chflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, 0)); } #ifndef _SYS_SYSPROTO_H_ struct chflagsat_args { int fd; const char *path; u_long flags; int atflag; } #endif int sys_chflagsat(struct thread *td, struct chflagsat_args *uap) { if ((uap->atflag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_chflagsat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flags, uap->atflag)); } /* * Same as chflags() but doesn't follow symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchflags_args { const char *path; u_long flags; }; #endif int sys_lchflags(struct thread *td, struct lchflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, AT_SYMLINK_NOFOLLOW)); } static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag) { struct nameidata nd; int error; AUDIT_ARG_FFLAGS(flags); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(atflag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchflags_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setfflags(td, nd.ni_vp, flags); vrele(nd.ni_vp); return (error); } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; u_long flags; }; #endif int sys_fchflags(struct thread *td, struct fchflags_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_FFLAGS(uap->flags); error = getvnode(td, uap->fd, &cap_fchflags_rights, &fp); if (error != 0) return (error); #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); } #endif error = setfflags(td, fp->f_vnode, uap->flags); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_vnode_check_setmode(cred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif int sys_chmod(struct thread *td, struct chmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchmodat_args { int dirfd; char *path; mode_t mode; int flag; } #endif int sys_fchmodat(struct thread *td, struct fchmodat_args *uap) { if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_fchmodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->flag)); } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif int sys_lchmod(struct thread *td, struct lchmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, AT_SYMLINK_NOFOLLOW)); } int kern_fchmodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, mode_t mode, int flag) { struct nameidata nd; int error; AUDIT_ARG_MODE(mode); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchmod_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setfmode(td, td->td_ucred, nd.ni_vp, mode); vrele(nd.ni_vp); return (error); } /* * Change mode of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif int sys_fchmod(struct thread *td, struct fchmod_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_MODE(uap->mode); error = fget(td, uap->fd, &cap_fchmod_rights, &fp); if (error != 0) return (error); error = fo_chmod(fp, uap->mode, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_vnode_check_setowner(cred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif int sys_chown(struct thread *td, struct chown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchownat_args { int fd; const char * path; uid_t uid; gid_t gid; int flag; }; #endif int sys_fchownat(struct thread *td, struct fchownat_args *uap) { if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid, uap->gid, uap->flag)); } int kern_fchownat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int uid, int gid, int flag) { struct nameidata nd; int error; AUDIT_ARG_OWNER(uid, gid); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchown_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif int sys_lchown(struct thread *td, struct lchown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW)); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif int sys_fchown(struct thread *td, struct fchown_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_OWNER(uap->uid, uap->gid); error = fget(td, uap->fd, &cap_fchown_rights, &fp); if (error != 0) return (error); error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg, struct timespec *tsp) { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { vfs_timestamp(&tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) return (EINVAL); TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return (0); } /* * Common implementation code for futimens(), utimensat(). */ #define UTIMENS_NULL 0x1 #define UTIMENS_EXIT 0x2 static int getutimens(const struct timespec *usrtsp, enum uio_seg tspseg, struct timespec *tsp, int *retflags) { struct timespec tsnow; int error; vfs_timestamp(&tsnow); *retflags = 0; if (usrtsp == NULL) { tsp[0] = tsnow; tsp[1] = tsnow; *retflags |= UTIMENS_NULL; return (0); } if (tspseg == UIO_SYSSPACE) { tsp[0] = usrtsp[0]; tsp[1] = usrtsp[1]; } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0) return (error); if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT) *retflags |= UTIMENS_EXIT; if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW) *retflags |= UTIMENS_NULL; if (tsp[0].tv_nsec == UTIME_OMIT) tsp[0].tv_sec = VNOVAL; else if (tsp[0].tv_nsec == UTIME_NOW) tsp[0] = tsnow; else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L) return (EINVAL); if (tsp[1].tv_nsec == UTIME_OMIT) tsp[1].tv_sec = VNOVAL; else if (tsp[1].tv_nsec == UTIME_NOW) tsp[1] = tsnow; else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L) return (EINVAL); return (0); } /* * Common implementation code for utimes(), lutimes(), futimes(), futimens(), * and utimensat(). */ static int setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts, int numtimes, int nullflag) { struct mount *mp; struct vattr vattr; int error; bool setbirthtime; setbirthtime = false; vattr.va_birthtime.tv_sec = VNOVAL; vattr.va_birthtime.tv_nsec = 0; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred) == 0 && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = true; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif int sys_utimes(struct thread *td, struct utimes_args *uap) { return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct futimesat_args { int fd; const char * path; const struct timeval * times; }; #endif int sys_futimesat(struct thread *td, struct futimesat_args *uap) { return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE)); } int kern_utimesat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct nameidata nd; struct timespec ts[2]; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, &cap_futimes_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif int sys_lutimes(struct thread *td, struct lutimes_args *uap) { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, const char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct nameidata nd; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif int sys_futimes(struct thread *td, struct futimes_args *uap) { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error; AUDIT_ARG_FD(fd); error = getutimes(tptr, tptrseg, ts); if (error != 0) return (error); error = getvnode(td, fd, &cap_futimes_rights, &fp); if (error != 0) return (error); #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); } #endif error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); fdrop(fp, td); return (error); } int sys_futimens(struct thread *td, struct futimens_args *uap) { return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE)); } int kern_futimens(struct thread *td, int fd, struct timespec *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error, flags; AUDIT_ARG_FD(fd); error = getutimens(tptr, tptrseg, ts, &flags); if (error != 0) return (error); if (flags & UTIMENS_EXIT) return (0); error = getvnode(td, fd, &cap_futimes_rights, &fp); if (error != 0) return (error); #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); } #endif error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL); fdrop(fp, td); return (error); } int sys_utimensat(struct thread *td, struct utimensat_args *uap) { return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE, uap->flag)); } int kern_utimensat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg, int flag) { struct nameidata nd; struct timespec ts[2]; int error, flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_futimes_rights, td); if ((error = namei(&nd)) != 0) return (error); /* * We are allowed to call namei() regardless of 2xUTIME_OMIT. * POSIX states: * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected." * "Search permission is denied by a component of the path prefix." */ NDFREE_NOTHING(&nd); if ((flags & UTIMENS_EXIT) == 0) error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL); vrele(nd.ni_vp); return (error); } /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif int sys_truncate(struct thread *td, struct truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; void *rl_cookie; struct vattr vattr; struct nameidata nd; int error; if (length < 0) return (EINVAL); NDPREINIT(&nd); retry: NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vn_rangelock_unlock(vp, rl_cookie); vrele(vp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, td->td_ucred); } VOP_UNLOCK(vp); vn_finished_write(mp); vn_rangelock_unlock(vp, rl_cookie); vrele(vp); if (error == ERELOOKUP) goto retry; return (error); } #if defined(COMPAT_43) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif int otruncate(struct thread *td, struct otruncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Versions with the pad argument */ int freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap) { return (kern_ftruncate(td, uap->fd, uap->length)); } #endif int kern_fsync(struct thread *td, int fd, bool fullsync) { struct vnode *vp; struct mount *mp; struct file *fp; int error; AUDIT_ARG_FD(fd); error = getvnode(td, fd, &cap_fsync_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; #if 0 if (!fullsync) /* XXXKIB: compete outstanding aio writes */; #endif retry: error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto drop; vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td); VOP_UNLOCK(vp); vn_finished_write(mp); if (error == ERELOOKUP) goto retry; drop: fdrop(fp, td); return (error); } /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif int sys_fsync(struct thread *td, struct fsync_args *uap) { return (kern_fsync(td, uap->fd, true)); } int sys_fdatasync(struct thread *td, struct fdatasync_args *uap) { return (kern_fsync(td, uap->fd, false)); } /* * Rename files. Source and destination must either both be directories, or * both not be directories. If target is a directory, it must be empty. */ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif int sys_rename(struct thread *td, struct rename_args *uap) { return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD, uap->to, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct renameat_args { int oldfd; char *old; int newfd; char *new; }; #endif int sys_renameat(struct thread *td, struct renameat_args *uap) { return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new, UIO_USERSPACE)); } #ifdef MAC static int kern_renameat_mac(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg, struct nameidata *fromnd) { int error; NDINIT_ATRIGHTS(fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td); if ((error = namei(fromnd)) != 0) return (error); error = mac_vnode_check_rename_from(td->td_ucred, fromnd->ni_dvp, fromnd->ni_vp, &fromnd->ni_cnd); VOP_UNLOCK(fromnd->ni_dvp); if (fromnd->ni_dvp != fromnd->ni_vp) VOP_UNLOCK(fromnd->ni_vp); if (error != 0) { NDFREE(fromnd, NDF_ONLY_PNBUF); vrele(fromnd->ni_dvp); vrele(fromnd->ni_vp); if (fromnd->ni_startdir) vrele(fromnd->ni_startdir); } return (error); } #endif int kern_renameat(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; u_int64_t tondflags; int error; again: bwillwrite(); #ifdef MAC if (mac_vnode_check_rename_from_enabled()) { error = kern_renameat_mac(td, oldfd, old, newfd, new, pathseg, &fromnd); if (error != 0) return (error); } else { #endif NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td); if ((error = namei(&fromnd)) != 0) return (error); #ifdef MAC } #endif fvp = fromnd.ni_vp; tondflags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNODE2; if (fromnd.ni_vp->v_type == VDIR) tondflags |= WILLBEDIR; NDINIT_ATRIGHTS(&tond, RENAME, tondflags, pathseg, new, newfd, &cap_renameat_target_rights, td); if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; error = vn_start_write(fvp, &mp, V_NOWAIT); if (error != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); vrele(tond.ni_startdir); if (fromnd.ni_startdir != NULL) vrele(fromnd.ni_startdir); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); goto again; } if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef CAPABILITIES if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) { /* * If the target already exists we require CAP_UNLINKAT * from 'newfd', when newfd was used for the lookup. */ error = cap_check(&tond.ni_filecaps.fc_rights, &cap_unlinkat_rights); if (error != 0) goto out; } #endif } if (fvp == tdvp) { error = EINVAL; goto out; } /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. */ if (fvp == tvp) error = ERESTART; #ifdef MAC else error = mac_vnode_check_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (error == 0) { error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == ERESTART) return (0); if (error == ERELOOKUP) goto again; return (error); } /* * Make a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif int sys_mkdir(struct thread *td, struct mkdir_args *uap) { return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkdirat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkdirat(struct thread *td, struct mkdirat_args *uap) { return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdirat(struct thread *td, int fd, const char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vattr vattr; struct nameidata nd; int error; AUDIT_ARG_MODE(mode); NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NC_NOMAKEENTRY | NC_KEEPPOSENTRY | FAILIFEXISTS | WILLBEDIR, segflg, path, fd, &cap_mkdirat_rights, td); if ((error = namei(&nd)) != 0) return (error); if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_pd->pd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); if (error == ERELOOKUP) goto restart; return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif int sys_rmdir(struct thread *td, struct rmdir_args *uap) { return (kern_frmdirat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE, 0)); } int kern_frmdirat(struct thread *td, int dfd, const char *path, int fd, enum uio_seg pathseg, int flag) { struct mount *mp; struct vnode *vp; struct file *fp; struct nameidata nd; cap_rights_t rights; int error; fp = NULL; if (fd != FD_NONE) { error = getvnode(td, fd, cap_rights_init_one(&rights, CAP_LOOKUP), &fp); if (error != 0) return (error); } NDPREINIT(&nd); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH), pathseg, path, dfd, &cap_unlinkat_rights, td); if ((error = namei(&nd)) != 0) goto fdout; vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } if (fp != NULL && fp->f_vnode != vp) { if (VN_IS_DOOMED(fp->f_vnode)) error = EBADF; else error = EDEADLK; goto out; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto fdout; goto restart; } vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (error == ERELOOKUP) goto restart; fdout: if (fp != NULL) fdrop(fp, td); return (error); } #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count, long *basep, void (*func)(struct freebsd11_dirent *)) { struct freebsd11_dirent dstdp; struct dirent *dp, *edp; char *dirbuf; off_t base; ssize_t resid, ucount; int error; /* XXX arbitrary sanity limit on `count'. */ count = min(count, 64 * 1024); dirbuf = malloc(count, M_TEMP, M_WAITOK); error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid, UIO_SYSSPACE); if (error != 0) goto done; if (basep != NULL) *basep = base; ucount = 0; for (dp = (struct dirent *)dirbuf, edp = (struct dirent *)&dirbuf[count - resid]; ucount < count && dp < edp; ) { if (dp->d_reclen == 0) break; MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0)); if (dp->d_namlen >= sizeof(dstdp.d_name)) continue; dstdp.d_type = dp->d_type; dstdp.d_namlen = dp->d_namlen; dstdp.d_fileno = dp->d_fileno; /* truncate */ if (dstdp.d_fileno != dp->d_fileno) { switch (ino64_trunc_error) { default: case 0: break; case 1: error = EOVERFLOW; goto done; case 2: dstdp.d_fileno = UINT32_MAX; break; } } dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) + ((dp->d_namlen + 1 + 3) &~ 3); bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); bzero(dstdp.d_name + dstdp.d_namlen, dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) - dstdp.d_namlen); MPASS(dstdp.d_reclen <= dp->d_reclen); MPASS(ucount + dstdp.d_reclen <= count); if (func != NULL) func(&dstdp); error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen); if (error != 0) break; dp = (struct dirent *)((char *)dp + dp->d_reclen); ucount += dstdp.d_reclen; } done: free(dirbuf, M_TEMP); if (error == 0) td->td_retval[0] = ucount; return (error); } #endif /* COMPAT */ #ifdef COMPAT_43 static void ogetdirentries_cvt(struct freebsd11_dirent *dp) { #if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; #else /* * The dp->d_type is the high byte of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; #endif } /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(struct thread *td, struct ogetdirentries_args *uap) { long loff; int error; error = kern_ogetdirentries(td, uap, &loff); if (error == 0) error = copyout(&loff, uap->basep, sizeof(long)); return (error); } int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, long *ploff) { long base; int error; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, ogetdirentries_cvt); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) #ifndef _SYS_SYSPROTO_H_ struct freebsd11_getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int freebsd11_getdirentries(struct thread *td, struct freebsd11_getdirentries_args *uap) { long base; int error; error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } int freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap) { struct freebsd11_getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return (freebsd11_getdirentries(td, &ap)); } #endif /* COMPAT_FREEBSD11 */ /* * Read a block of directory entries in a filesystem independent format. */ int sys_getdirentries(struct thread *td, struct getdirentries_args *uap) { off_t base; int error; error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL, UIO_USERSPACE); if (error != 0) return (error); if (uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(off_t)); return (error); } int kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, off_t *basep, ssize_t *residp, enum uio_seg bufseg) { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; off_t loff; int error, eofflag; off_t foffset; AUDIT_ARG_FD(fd); if (count > IOSIZE_MAX) return (EINVAL); auio.uio_resid = count; error = getvnode(td, fd, &cap_read_rights, &fp); if (error != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; foffset = foffset_lock(fp, 0); unionread: if (vp->v_type != VDIR) { error = EINVAL; goto fail; } aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); loff = auio.uio_offset = foffset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); foffset = auio.uio_offset; if (error != 0) { VOP_UNLOCK(vp); goto fail; } if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; foffset = 0; vput(tvp); goto unionread; } VOP_UNLOCK(vp); *basep = loff; if (residp != NULL) *residp = auio.uio_resid; td->td_retval[0] = count - auio.uio_resid; fail: foffset_unlock(fp, foffset, 0); fdrop(fp, td); return (error); } /* * Set the mode mask for creation of filesystem nodes. */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int sys_umask(struct thread *td, struct umask_args *uap) { struct pwddesc *pdp; pdp = td->td_proc->p_pd; PWDDESC_XLOCK(pdp); td->td_retval[0] = pdp->pd_cmask; pdp->pd_cmask = uap->newmask & ALLPERMS; PWDDESC_XUNLOCK(pdp); return (0); } /* * Void all references to file by ripping underlying filesystem away from * vnode. */ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif int sys_revoke(struct thread *td, struct revoke_args *uap) { struct vnode *vp; struct vattr vattr; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE_NOTHING(&nd); if (vp->v_type != VCHR || vp->v_rdev == NULL) { error = EINVAL; goto out; } #ifdef MAC error = mac_vnode_check_revoke(td->td_ucred, vp); if (error != 0) goto out; #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) goto out; if (td->td_ucred->cr_uid != vattr.va_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto out; } if (devfs_usecount(vp) > 0) VOP_REVOKE(vp, REVOKEALL); out: vput(vp); return (error); } /* * This variant of getvnode() allows O_PATH files. Caller should * ensure that returned file and vnode are only used for compatible * semantics. */ int getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { struct file *fp; int error; error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp); if (error != 0) return (error); /* * The file could be not of the vnode type, or it may be not * yet fully initialized, in which case the f_vnode pointer * may be set, but f_ops is still badfileops. E.g., * devfs_open() transiently create such situation to * facilitate csw d_fdopen(). * * Dupfdopen() handling in kern_openat() installs the * half-baked file into the process descriptor table, allowing * other thread to dereference it. Guard against the race by * checking f_ops. */ if (__predict_false(fp->f_vnode == NULL || fp->f_ops == &badfileops)) { fdrop(fp, td); return (EINVAL); } *fpp = fp; return (0); } /* * Convert a user file descriptor to a kernel file entry and check * that, if it is a capability, the correct rights are present. * A reference on the file entry is held upon returning. */ int getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { int error; error = getvnode_path(td, fd, rightsp, fpp); if (__predict_false(error != 0)) return (error); /* * Filter out O_PATH file descriptors, most getvnode() callers * do not call fo_ methods. */ if (__predict_false((*fpp)->f_ops == &path_fileops)) { fdrop(*fpp, td); error = EBADF; } return (error); } /* * Get an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct lgetfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_lgetfh(struct thread *td, struct lgetfh_args *uap) { return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_getfh(struct thread *td, struct getfh_args *uap) { return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } /* * syscall for the rpc.lockd to use to translate an open descriptor into * a NFS file handle. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct getfhat_args { int fd; char *path; fhandle_t *fhp; int flags; }; #endif int sys_getfhat(struct thread *td, struct getfhat_args *uap) { if ((uap->flags & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } int kern_getfhat(struct thread *td, int flags, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp, enum uio_seg fhseg) { struct nameidata nd; fhandle_t fh; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); NDINIT_AT(&nd, LOOKUP, at2cnpflags(flags, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH) | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, td); error = namei(&nd); if (error != 0) return (error); NDFREE_NOTHING(&nd); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error == 0) { if (fhseg == UIO_USERSPACE) error = copyout(&fh, fhp, sizeof (fh)); else memcpy(fhp, &fh, sizeof(fh)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct fhlink_args { fhandle_t *fhp; const char *to; }; #endif int sys_fhlink(struct thread *td, struct fhlink_args *uap) { return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp)); } #ifndef _SYS_SYSPROTO_H_ struct fhlinkat_args { fhandle_t *fhp; int tofd; const char *to; }; #endif int sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap) { return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp)); } static int kern_fhlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp) { fhandle_t fh; struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); error = copyin(fhp, &fh, sizeof(fh)); if (error != 0) return (error); do { bwillwrite(); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); vfs_unbusy(mp); if (error != 0) return (error); VOP_UNLOCK(vp); error = kern_linkat_vp(td, vp, fd, path, pathseg); } while (error == EAGAIN || error == ERELOOKUP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fhreadlink_args { fhandle_t *fhp; char *buf; size_t bufsize; }; #endif int sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap) { fhandle_t fh; struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); if (uap->bufsize > IOSIZE_MAX) return (EINVAL); error = copyin(uap->fhp, &fh, sizeof(fh)); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td); vput(vp); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into an * open descriptor. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int sys_fhopen(struct thread *td, struct fhopen_args *uap) { return (kern_fhopen(td, uap->u_fhp, uap->flags)); } int kern_fhopen(struct thread *td, const struct fhandle *u_fhp, int flags) { struct mount *mp; struct vnode *vp; struct fhandle fhp; struct file *fp; int fmode, error; int indx; error = priv_check(td, PRIV_VFS_FHOPEN); if (error != 0) return (error); indx = -1; fmode = FFLAGS(flags); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(u_fhp, &fhp, sizeof(fhp)); if (error != 0) return(error); /* find the mount point */ mp = vfs_busyfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = falloc_noinstall(td, &fp); if (error != 0) { vput(vp); return (error); } /* * An extra reference on `fp' has been held for us by * falloc_noinstall(). */ #ifdef INVARIANTS td->td_dupfd = -1; #endif error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp); if (error != 0) { KASSERT(fp->f_ops == &badfileops, ("VOP_OPEN in fhopen() set f_ops")); KASSERT(td->td_dupfd < 0, ("fhopen() encountered fdopen()")); vput(vp); goto bad; } #ifdef INVARIANTS td->td_dupfd = 0; #endif fp->f_vnode = vp; finit_vnode(fp, fmode, NULL, &vnops); VOP_UNLOCK(vp); if ((fmode & O_TRUNC) != 0) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } error = finstall(td, fp, &indx, fmode, NULL); bad: fdrop(fp, td); td->td_retval[0] = indx; return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int sys_fhstat(struct thread *td, struct fhstat_args *uap) { struct stat sb; struct fhandle fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fh)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error == 0) error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } int kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb) { struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTAT); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); - error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td); + error = VOP_STAT(vp, sb, td->td_ucred, NOCRED); vput(vp); return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap) { struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(*sfp)); free(sfp, M_STATFS); return (error); } int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) { struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTATFS); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); if (error != 0) { vfs_unbusy(mp); return (error); } vput(vp); error = prison_canseemount(td->td_ucred, mp); if (error != 0) goto out; #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif error = VFS_STATFS(mp, buf); out: vfs_unbusy(mp); return (error); } /* * Unlike madvise(2), we do not make a best effort to remember every * possible caching hint. Instead, we remember the last setting with * the exception that we will allow POSIX_FADV_NORMAL to adjust the * region of any current setting. */ int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, int advice) { struct fadvise_info *fa, *new; struct file *fp; struct vnode *vp; off_t end; int error; if (offset < 0 || len < 0 || offset > OFF_MAX - len) return (EINVAL); AUDIT_ARG_VALUE(advice); switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK); break; case POSIX_FADV_NORMAL: case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: new = NULL; break; default: return (EINVAL); } /* XXX: CAP_POSIX_FADVISE? */ AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_no_rights, &fp); if (error != 0) goto out; AUDIT_ARG_FILE(td->td_proc, fp); if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { error = ESPIPE; goto out; } if (fp->f_type != DTYPE_VNODE) { error = ENODEV; goto out; } vp = fp->f_vnode; if (vp->v_type != VREG) { error = ENODEV; goto out; } if (len == 0) end = OFF_MAX; else end = offset + len - 1; switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: /* * Try to merge any existing non-standard region with * this new region if possible, otherwise create a new * non-standard region for this request. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL && fa->fa_advice == advice && ((fa->fa_start <= end && fa->fa_end >= offset) || (end != OFF_MAX && fa->fa_start == end + 1) || (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) { if (offset < fa->fa_start) fa->fa_start = offset; if (end > fa->fa_end) fa->fa_end = end; } else { new->fa_advice = advice; new->fa_start = offset; new->fa_end = end; fp->f_advice = new; new = fa; } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_NORMAL: /* * If a the "normal" region overlaps with an existing * non-standard region, trim or remove the * non-standard region. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL) { if (offset <= fa->fa_start && end >= fa->fa_end) { new = fa; fp->f_advice = NULL; } else if (offset <= fa->fa_start && end >= fa->fa_start) fa->fa_start = end + 1; else if (offset <= fa->fa_end && end >= fa->fa_end) fa->fa_end = offset - 1; else if (offset >= fa->fa_start && end <= fa->fa_end) { /* * If the "normal" region is a middle * portion of the existing * non-standard region, just remove * the whole thing rather than picking * one side or the other to * preserve. */ new = fa; fp->f_advice = NULL; } } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: error = VOP_ADVISE(vp, offset, end, advice); break; } out: if (fp != NULL) fdrop(fp, td); free(new, M_FADVISE); return (error); } int sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) { int error; error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, uap->advice); return (kern_posix_error(td, error)); } int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd, off_t *outoffp, size_t len, unsigned int flags) { struct file *infp, *outfp; struct vnode *invp, *outvp; int error; size_t retlen; void *rl_rcookie, *rl_wcookie; off_t savinoff, savoutoff; infp = outfp = NULL; rl_rcookie = rl_wcookie = NULL; savinoff = -1; error = 0; retlen = 0; if (flags != 0) { error = EINVAL; goto out; } if (len > SSIZE_MAX) /* * Although the len argument is size_t, the return argument * is ssize_t (which is signed). Therefore a size that won't * fit in ssize_t can't be returned. */ len = SSIZE_MAX; /* Get the file structures for the file descriptors. */ error = fget_read(td, infd, &cap_read_rights, &infp); if (error != 0) goto out; if (infp->f_ops == &badfileops) { error = EBADF; goto out; } if (infp->f_vnode == NULL) { error = EINVAL; goto out; } error = fget_write(td, outfd, &cap_write_rights, &outfp); if (error != 0) goto out; if (outfp->f_ops == &badfileops) { error = EBADF; goto out; } if (outfp->f_vnode == NULL) { error = EINVAL; goto out; } /* Set the offset pointers to the correct place. */ if (inoffp == NULL) inoffp = &infp->f_offset; if (outoffp == NULL) outoffp = &outfp->f_offset; savinoff = *inoffp; savoutoff = *outoffp; invp = infp->f_vnode; outvp = outfp->f_vnode; /* Sanity check the f_flag bits. */ if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE || (infp->f_flag & FREAD) == 0) { error = EBADF; goto out; } /* If len == 0, just return 0. */ if (len == 0) goto out; /* * If infp and outfp refer to the same file, the byte ranges cannot * overlap. */ if (invp == outvp && ((savinoff <= savoutoff && savinoff + len > savoutoff) || (savinoff > savoutoff && savoutoff + len > savinoff))) { error = EINVAL; goto out; } /* Range lock the byte ranges for both invp and outvp. */ for (;;) { rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp + len); rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp + len); if (rl_rcookie != NULL) break; vn_rangelock_unlock(outvp, rl_wcookie); rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len); vn_rangelock_unlock(invp, rl_rcookie); } retlen = len; error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen, flags, infp->f_cred, outfp->f_cred, td); out: if (rl_rcookie != NULL) vn_rangelock_unlock(invp, rl_rcookie); if (rl_wcookie != NULL) vn_rangelock_unlock(outvp, rl_wcookie); if (savinoff != -1 && (error == EINTR || error == ERESTART)) { *inoffp = savinoff; *outoffp = savoutoff; } if (outfp != NULL) fdrop(outfp, td); if (infp != NULL) fdrop(infp, td); td->td_retval[0] = retlen; return (error); } int sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap) { off_t inoff, outoff, *inoffp, *outoffp; int error; inoffp = outoffp = NULL; if (uap->inoffp != NULL) { error = copyin(uap->inoffp, &inoff, sizeof(off_t)); if (error != 0) return (error); inoffp = &inoff; } if (uap->outoffp != NULL) { error = copyin(uap->outoffp, &outoff, sizeof(off_t)); if (error != 0) return (error); outoffp = &outoff; } error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd, outoffp, uap->len, uap->flags); if (error == 0 && uap->inoffp != NULL) error = copyout(inoffp, uap->inoffp, sizeof(off_t)); if (error == 0 && uap->outoffp != NULL) error = copyout(outoffp, uap->outoffp, sizeof(off_t)); return (error); } diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 2c25a25da6c6..d6c472995489 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1,3708 +1,3707 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2012 Konstantin Belousov * Copyright (c) 2013, 2014 The FreeBSD Foundation * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_rdwr_t vn_io_fault; static fo_truncate_t vn_truncate; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; static fo_kqfilter_t vn_kqfilter; static fo_close_t vn_closefile; static fo_mmap_t vn_mmap; static fo_fallocate_t vn_fallocate; static fo_fspacectl_t vn_fspacectl; struct fileops vnops = { .fo_read = vn_io_fault, .fo_write = vn_io_fault, .fo_truncate = vn_truncate, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, .fo_kqfilter = vn_kqfilter, .fo_stat = vn_statfile, .fo_close = vn_closefile, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = vn_mmap, .fo_fallocate = vn_fallocate, .fo_fspacectl = vn_fspacectl, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; const u_int io_hold_cnt = 16; static int vn_io_fault_enable = 1; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN, &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance"); static int vn_io_fault_prefault = 0; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN, &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting"); static int vn_io_pgcache_read_enable = 1; SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN, &vn_io_pgcache_read_enable, 0, "Enable copying from page cache for reads, avoiding fs"); static u_long vn_io_faults_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD, &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers"); static int vfs_allow_read_dir = 0; SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW, &vfs_allow_read_dir, 0, "Enable read(2) of directory by root for filesystems that support it"); /* * Returns true if vn_io_fault mode of handling the i/o request should * be used. */ static bool do_vn_io_fault(struct vnode *vp, struct uio *uio) { struct mount *mp; return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG && (mp = vp->v_mount) != NULL && (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable); } /* * Structure used to pass arguments to vn_io_fault1(), to do either * file- or vnode-based I/O calls. */ struct vn_io_fault_args { enum { VN_IO_FAULT_FOP, VN_IO_FAULT_VOP } kind; struct ucred *cred; int flags; union { struct fop_args_tag { struct file *fp; fo_rdwr_t *doio; } fop_args; struct vop_args_tag { struct vnode *vp; } vop_args; } args; }; static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td); int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp) { struct thread *td = curthread; return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp)); } static uint64_t open2nameif(int fmode, u_int vn_open_flags) { uint64_t res; res = ISOPEN | LOCKLEAF; if ((fmode & O_RESOLVE_BENEATH) != 0) res |= RBENEATH; if ((fmode & O_EMPTY_PATH) != 0) res |= EMPTYPATH; if ((fmode & FREAD) != 0) res |= OPENREAD; if ((fmode & FWRITE) != 0) res |= OPENWRITE; if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0) res |= AUDITVNODE1; if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0) res |= NOCAPCHECK; return (res); } /* * Common code for vnode open operations via a name lookup. * Lookup the vnode and invoke VOP_CREATE if needed. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. */ int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp) { struct vnode *vp; struct mount *mp; struct vattr vat; struct vattr *vap = &vat; int fmode, error; bool first_open; restart: first_open = false; fmode = *flagp; if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT | O_EXCL | O_DIRECTORY) || (fmode & (O_CREAT | O_EMPTY_PATH)) == (O_CREAT | O_EMPTY_PATH)) return (EINVAL); else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags); /* * Set NOCACHE to avoid flushing the cache when * rolling in many files at once. * * Set NC_KEEPPOSENTRY to keep positive entries if they already * exist despite NOCACHE. */ ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; if ((vn_open_flags & VN_OPEN_INVFS) == 0) bwillwrite(); if ((error = namei(ndp)) != 0) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(ndp->ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); NDREINIT(ndp); goto restart; } if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) ndp->ni_cnd.cn_flags |= MAKEENTRY; #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, &ndp->ni_cnd, vap); if (error == 0) #endif error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); vp = ndp->ni_vp; if (error == 0 && (fmode & O_EXCL) != 0 && (fmode & (O_EXLOCK | O_SHLOCK)) != 0) { VI_LOCK(vp); vp->v_iflag |= VI_FOPENING; VI_UNLOCK(vp); first_open = true; } VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL, false); vn_finished_write(mp); if (error) { NDFREE(ndp, NDF_ONLY_PNBUF); if (error == ERELOOKUP) { NDREINIT(ndp); goto restart; } return (error); } fmode &= ~O_TRUNC; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } if (vp->v_type == VDIR) { error = EISDIR; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags); ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW : FOLLOW; if ((fmode & FWRITE) == 0) ndp->ni_cnd.cn_flags |= LOCKSHARED; if ((error = namei(ndp)) != 0) return (error); vp = ndp->ni_vp; } error = vn_open_vnode(vp, fmode, cred, curthread, fp); if (first_open) { VI_LOCK(vp); vp->v_iflag &= ~VI_FOPENING; wakeup(vp); VI_UNLOCK(vp); } if (error) goto bad; *flagp = fmode; return (0); bad: NDFREE(ndp, NDF_ONLY_PNBUF); vput(vp); *flagp = fmode; ndp->ni_vp = NULL; return (error); } static int vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp) { struct flock lf; int error, lock_flags, type; ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock"); if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0) return (0); KASSERT(fp != NULL, ("open with flock requires fp")); if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) return (EOPNOTSUPP); lock_flags = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) type |= F_FIRSTOPEN; error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type); if (error == 0) fp->f_flag |= FHASLOCK; vn_lock(vp, lock_flags | LK_RETRY); return (error); } /* * Common code for vnode open operations once a vnode is located. * Check permissions, and call the VOP_OPEN routine. */ int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp) { accmode_t accmode; int error; if (vp->v_type == VLNK) { if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0) return (EMLINK); } if (vp->v_type != VDIR && fmode & O_DIRECTORY) return (ENOTDIR); accmode = 0; if ((fmode & O_PATH) == 0) { if (vp->v_type == VSOCK) return (EOPNOTSUPP); if ((fmode & (FWRITE | O_TRUNC)) != 0) { if (vp->v_type == VDIR) return (EISDIR); accmode |= VWRITE; } if ((fmode & FREAD) != 0) accmode |= VREAD; if ((fmode & O_APPEND) && (fmode & FWRITE)) accmode |= VAPPEND; #ifdef MAC if ((fmode & O_CREAT) != 0) accmode |= VCREAT; #endif } if ((fmode & FEXEC) != 0) accmode |= VEXEC; #ifdef MAC if ((fmode & O_VERIFY) != 0) accmode |= VVERIFY; error = mac_vnode_check_open(cred, vp, accmode); if (error != 0) return (error); accmode &= ~(VCREAT | VVERIFY); #endif if ((fmode & O_CREAT) == 0 && accmode != 0) { error = VOP_ACCESS(vp, accmode, cred, td); if (error != 0) return (error); } if ((fmode & O_PATH) != 0) { if (vp->v_type != VFIFO && vp->v_type != VSOCK && VOP_ACCESS(vp, VREAD, cred, td) == 0) fp->f_flag |= FKQALLOWED; return (0); } if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); error = VOP_OPEN(vp, fmode, cred, td, fp); if (error != 0) return (error); error = vn_open_vnode_advlock(vp, fmode, fp); if (error == 0 && (fmode & FWRITE) != 0) { error = VOP_ADD_WRITECOUNT(vp, 1); if (error == 0) { CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } } /* * Error from advlock or VOP_ADD_WRITECOUNT() still requires * calling VOP_CLOSE() to pair with earlier VOP_OPEN(). */ if (error != 0) { if (fp != NULL) { /* * Arrange the call by having fdrop() to use * vn_closefile(). This is to satisfy * filesystems like devfs or tmpfs, which * override fo_close(). */ fp->f_flag |= FOPENFAILED; fp->f_vnode = vp; if (fp->f_ops == &badfileops) { fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; } vref(vp); } else { /* * If there is no fp, due to kernel-mode open, * we can call VOP_CLOSE() now. */ if (vp->v_type != VFIFO && (fmode & FWRITE) != 0 && !MNT_EXTENDED_SHARED(vp->v_mount) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); (void)VOP_CLOSE(vp, fmode & (FREAD | FWRITE | FEXEC), cred, td); } } ASSERT_VOP_LOCKED(vp, "vn_open_vnode"); return (error); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. * It is racy. */ int vn_writechk(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, "vn_writechk"); /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (VOP_IS_TEXT(vp)) return (ETXTBSY); return (0); } /* * Vnode close call */ static int vn_close1(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td, bool keep_ref) { struct mount *mp; int error, lock_flags; if (vp->v_type != VFIFO && (flags & FWRITE) == 0 && MNT_EXTENDED_SHARED(vp->v_mount)) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, lock_flags | LK_RETRY); AUDIT_ARG_VNODE1(vp); if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) { VOP_ADD_WRITECOUNT_CHECKED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } error = VOP_CLOSE(vp, flags, file_cred, td); if (keep_ref) VOP_UNLOCK(vp); else vput(vp); vn_finished_write(mp); return (error); } int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td) { return (vn_close1(vp, flags, file_cred, td, false)); } /* * Heuristic to detect sequential operation. */ static int sequential_heuristic(struct uio *uio, struct file *fp) { enum uio_rw rw; ASSERT_VOP_LOCKED(fp->f_vnode, __func__); rw = uio->uio_rw; if (fp->f_flag & FRDAHEAD) return (fp->f_seqcount[rw] << IO_SEQSHIFT); /* * Offset 0 is handled specially. open() sets f_seqcount to 1 so * that the first I/O is normally considered to be slightly * sequential. Seeking to offset 0 doesn't change sequentiality * unless previous seeks have reduced f_seqcount to 0, in which * case offset 0 is not special. */ if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) || uio->uio_offset == fp->f_nextoff[rw]) { /* * f_seqcount is in units of fixed-size blocks so that it * depends mainly on the amount of sequential I/O and not * much on the number of sequential I/O's. The fixed size * of 16384 is hard-coded here since it is (not quite) just * a magic size that works well here. This size is more * closely related to the best I/O size for real disks than * to any block size used by software. */ if (uio->uio_resid >= IO_SEQMAX * 16384) fp->f_seqcount[rw] = IO_SEQMAX; else { fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384); if (fp->f_seqcount[rw] > IO_SEQMAX) fp->f_seqcount[rw] = IO_SEQMAX; } return (fp->f_seqcount[rw] << IO_SEQSHIFT); } /* Not sequential. Quickly draw-down sequentiality. */ if (fp->f_seqcount[rw] > 1) fp->f_seqcount[rw] = 1; else fp->f_seqcount[rw] = 0; return (0); } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; void *rl_cookie; struct vn_io_fault_args args; int error, lock_flags; if (offset < 0 && vp->v_type != VCHR) return (EINVAL); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; error = 0; if ((ioflg & IO_NODELOCKED) == 0) { if ((ioflg & IO_RANGELOCKED) == 0) { if (rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, offset, offset + len); } else if ((ioflg & IO_APPEND) != 0) { rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); } else { rl_cookie = vn_rangelock_wlock(vp, offset, offset + len); } } else rl_cookie = NULL; mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; lock_flags = vn_lktype_write(mp, vp); } else lock_flags = LK_SHARED; vn_lock(vp, lock_flags | LK_RETRY); } else rl_cookie = NULL; ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) error = mac_vnode_check_read(active_cred, file_cred, vp); else error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { if (file_cred != NULL) cred = file_cred; else cred = active_cred; if (do_vn_io_fault(vp, &auio)) { args.kind = VN_IO_FAULT_VOP; args.cred = cred; args.flags = ioflg; args.args.vop_args.vp = vp; error = vn_io_fault1(vp, &auio, &args, td); } else if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else /* if (rw == UIO_WRITE) */ { error = VOP_WRITE(vp, &auio, ioflg, cred); } } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp); if (mp != NULL) vn_finished_write(mp); } out: if (rl_cookie != NULL) vn_rangelock_unlock(vp, rl_cookie); return (error); } /* * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call kern_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). */ int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td) { int error = 0; ssize_t iaresid; do { int chunk; /* * Force `offset' to a multiple of MAXBSIZE except possibly * for the first chunk, so that filesystems only need to * write full blocks except possibly for the first and last * chunks. */ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; if (chunk > len) chunk = len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); iaresid = 0; error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, active_cred, file_cred, &iaresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base = (char *)base + chunk; kern_yield(PRI_USER); } while (len); if (aresid) *aresid = len + iaresid; return (error); } #if OFF_MAX <= LONG_MAX off_t foffset_lock(struct file *fp, int flags) { volatile short *flagsp; off_t res; short state; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); if ((flags & FOF_NOLOCK) != 0) return (atomic_load_long(&fp->f_offset)); /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ flagsp = &fp->f_vnread_flags; if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED)) return (atomic_load_long(&fp->f_offset)); sleepq_lock(&fp->f_vnread_flags); state = atomic_load_16(flagsp); for (;;) { if ((state & FOFFSET_LOCKED) == 0) { if (!atomic_fcmpset_acq_16(flagsp, &state, FOFFSET_LOCKED)) continue; break; } if ((state & FOFFSET_LOCK_WAITING) == 0) { if (!atomic_fcmpset_acq_16(flagsp, &state, state | FOFFSET_LOCK_WAITING)) continue; } DROP_GIANT(); sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0); sleepq_wait(&fp->f_vnread_flags, PUSER -1); PICKUP_GIANT(); sleepq_lock(&fp->f_vnread_flags); state = atomic_load_16(flagsp); } res = atomic_load_long(&fp->f_offset); sleepq_release(&fp->f_vnread_flags); return (res); } void foffset_unlock(struct file *fp, off_t val, int flags) { volatile short *flagsp; short state; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); if ((flags & FOF_NOUPDATE) == 0) atomic_store_long(&fp->f_offset, val); if ((flags & FOF_NEXTOFF_R) != 0) fp->f_nextoff[UIO_READ] = val; if ((flags & FOF_NEXTOFF_W) != 0) fp->f_nextoff[UIO_WRITE] = val; if ((flags & FOF_NOLOCK) != 0) return; flagsp = &fp->f_vnread_flags; state = atomic_load_16(flagsp); if ((state & FOFFSET_LOCK_WAITING) == 0 && atomic_cmpset_rel_16(flagsp, state, 0)) return; sleepq_lock(&fp->f_vnread_flags); MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0); MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0); fp->f_vnread_flags = 0; sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0); sleepq_release(&fp->f_vnread_flags); } #else off_t foffset_lock(struct file *fp, int flags) { struct mtx *mtxp; off_t res; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOLOCK) == 0) { while (fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; msleep(&fp->f_vnread_flags, mtxp, PUSER -1, "vofflock", 0); } fp->f_vnread_flags |= FOFFSET_LOCKED; } res = fp->f_offset; mtx_unlock(mtxp); return (res); } void foffset_unlock(struct file *fp, off_t val, int flags) { struct mtx *mtxp; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOUPDATE) == 0) fp->f_offset = val; if ((flags & FOF_NEXTOFF_R) != 0) fp->f_nextoff[UIO_READ] = val; if ((flags & FOF_NEXTOFF_W) != 0) fp->f_nextoff[UIO_WRITE] = val; if ((flags & FOF_NOLOCK) == 0) { KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0, ("Lost FOFFSET_LOCKED")); if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) wakeup(&fp->f_vnread_flags); fp->f_vnread_flags = 0; } mtx_unlock(mtxp); } #endif void foffset_lock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) uio->uio_offset = foffset_lock(fp, flags); } void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) foffset_unlock(fp, uio->uio_offset, flags); } static int get_advice(struct file *fp, struct uio *uio) { struct mtx *mtxp; int ret; ret = POSIX_FADV_NORMAL; if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG) return (ret); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if (fp->f_advice != NULL && uio->uio_offset >= fp->f_advice->fa_start && uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) ret = fp->f_advice->fa_advice; mtx_unlock(mtxp); return (ret); } static int get_write_ioflag(struct file *fp) { int ioflag; struct mount *mp; struct vnode *vp; ioflag = 0; vp = fp->f_vnode; mp = atomic_load_ptr(&vp->v_mount); if ((fp->f_flag & O_DIRECT) != 0) ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) != 0 || (mp != NULL && (mp->mnt_flag & MNT_SYNCHRONOUS) != 0)) ioflag |= IO_SYNC; /* * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE() * or VOP_DEALLOCATE() implementations that don't understand IO_DATASYNC * fall back to full O_SYNC behavior. */ if ((fp->f_flag & O_DSYNC) != 0) ioflag |= IO_SYNC | IO_DATASYNC; return (ioflag); } int vn_read_from_obj(struct vnode *vp, struct uio *uio) { vm_object_t obj; vm_page_t ma[io_hold_cnt + 2]; off_t off, vsz; ssize_t resid; int error, i, j; MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2)); obj = atomic_load_ptr(&vp->v_object); if (obj == NULL) return (EJUSTRETURN); /* * Depends on type stability of vm_objects. */ vm_object_pip_add(obj, 1); if ((obj->flags & OBJ_DEAD) != 0) { /* * Note that object might be already reused from the * vnode, and the OBJ_DEAD flag cleared. This is fine, * we recheck for DOOMED vnode state after all pages * are busied, and retract then. * * But we check for OBJ_DEAD to ensure that we do not * busy pages while vm_object_terminate_pages() * processes the queue. */ error = EJUSTRETURN; goto out_pip; } resid = uio->uio_resid; off = uio->uio_offset; for (i = 0; resid > 0; i++) { MPASS(i < io_hold_cnt + 2); ma[i] = vm_page_grab_unlocked(obj, atop(off), VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOWAIT); if (ma[i] == NULL) break; /* * Skip invalid pages. Valid mask can be partial only * at EOF, and we clip later. */ if (vm_page_none_valid(ma[i])) { vm_page_sunbusy(ma[i]); break; } resid -= PAGE_SIZE; off += PAGE_SIZE; } if (i == 0) { error = EJUSTRETURN; goto out_pip; } /* * Check VIRF_DOOMED after we busied our pages. Since * vgonel() terminates the vnode' vm_object, it cannot * process past pages busied by us. */ if (VN_IS_DOOMED(vp)) { error = EJUSTRETURN; goto out; } resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1); if (resid > uio->uio_resid) resid = uio->uio_resid; /* * Unlocked read of vnp_size is safe because truncation cannot * pass busied page. But we load vnp_size into a local * variable so that possible concurrent extension does not * break calculation. */ #if defined(__powerpc__) && !defined(__powerpc64__) vsz = obj->un_pager.vnp.vnp_size; #else vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size); #endif if (uio->uio_offset >= vsz) { error = EJUSTRETURN; goto out; } if (uio->uio_offset + resid > vsz) resid = vsz - uio->uio_offset; error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio); out: for (j = 0; j < i; j++) { if (error == 0) vm_page_reference(ma[j]); vm_page_sunbusy(ma[j]); } out_pip: vm_object_pip_wakeup(obj); if (error != 0) return (error); return (uio->uio_resid == 0 ? 0 : EJUSTRETURN); } /* * File table vnode read routine. */ static int vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct vnode *vp; off_t orig_offset; int error, ioflag; int advice; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; /* * Try to read from page cache. VIRF_DOOMED check is racy but * allows us to avoid unneeded work outright. */ if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() && (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) { error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred); if (error == 0) { fp->f_nextoff[UIO_READ] = uio->uio_offset; return (0); } if (error != EJUSTRETURN) return (error); } advice = get_advice(fp, uio); vn_lock(vp, LK_SHARED | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* Disable read-ahead for random I/O. */ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_READ(vp, uio, ioflag, fp->f_cred); fp->f_nextoff[UIO_READ] = uio->uio_offset; VOP_UNLOCK(vp); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * read(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); return (error); } /* * File table vnode write routine. */ static int vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct vnode *vp; struct mount *mp; off_t orig_offset; int error, ioflag; int advice; bool need_finished_write; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; if (vp->v_type == VREG) bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND) != 0) ioflag |= IO_APPEND; if ((fp->f_flag & FNONBLOCK) != 0) ioflag |= IO_NDELAY; ioflag |= get_write_ioflag(fp); mp = NULL; need_finished_write = false; if (vp->v_type != VCHR) { error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto unlock; need_finished_write = true; } advice = get_advice(fp, uio); vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* XXX: Is this correct? */ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); fp->f_nextoff[UIO_WRITE] = uio->uio_offset; VOP_UNLOCK(vp); if (need_finished_write) vn_finished_write(mp); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * write(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); unlock: return (error); } /* * The vn_io_fault() is a wrapper around vn_read() and vn_write() to * prevent the following deadlock: * * Assume that the thread A reads from the vnode vp1 into userspace * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is * currently not resident, then system ends up with the call chain * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) * which establishes lock order vp1->vn_lock, then vp2->vn_lock. * If, at the same time, thread B reads from vnode vp2 into buffer buf2 * backed by the pages of vnode vp1, and some page in buf2 is not * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. * * To prevent the lock order reversal and deadlock, vn_io_fault() does * not allow page faults to happen during VOP_READ() or VOP_WRITE(). * Instead, it first tries to do the whole range i/o with pagefaults * disabled. If all pages in the i/o buffer are resident and mapped, * VOP will succeed (ignoring the genuine filesystem errors). * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do * i/o in chunks, with all pages in the chunk prefaulted and held * using vm_fault_quick_hold_pages(). * * Filesystems using this deadlock avoidance scheme should use the * array of the held pages from uio, saved in the curthread->td_ma, * instead of doing uiomove(). A helper function * vn_io_fault_uiomove() converts uiomove request into * uiomove_fromphys() over td_ma array. * * Since vnode locks do not cover the whole i/o anymore, rangelocks * make the current i/o request atomic with respect to other i/os and * truncations. */ /* * Decode vn_io_fault_args and perform the corresponding i/o. */ static int vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio, struct thread *td) { int error, save; error = 0; save = vm_fault_disable_pagefaults(); switch (args->kind) { case VN_IO_FAULT_FOP: error = (args->args.fop_args.doio)(args->args.fop_args.fp, uio, args->cred, args->flags, td); break; case VN_IO_FAULT_VOP: if (uio->uio_rw == UIO_READ) { error = VOP_READ(args->args.vop_args.vp, uio, args->flags, args->cred); } else if (uio->uio_rw == UIO_WRITE) { error = VOP_WRITE(args->args.vop_args.vp, uio, args->flags, args->cred); } break; default: panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind, uio->uio_rw); } vm_fault_enable_pagefaults(save); return (error); } static int vn_io_fault_touch(char *base, const struct uio *uio) { int r; r = fubyte(base); if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1)) return (EFAULT); return (0); } static int vn_io_fault_prefault_user(const struct uio *uio) { char *base; const struct iovec *iov; size_t len; ssize_t resid; int error, i; KASSERT(uio->uio_segflg == UIO_USERSPACE, ("vn_io_fault_prefault userspace")); error = i = 0; iov = uio->uio_iov; resid = uio->uio_resid; base = iov->iov_base; len = iov->iov_len; while (resid > 0) { error = vn_io_fault_touch(base, uio); if (error != 0) break; if (len < PAGE_SIZE) { if (len != 0) { error = vn_io_fault_touch(base + len - 1, uio); if (error != 0) break; resid -= len; } if (++i >= uio->uio_iovcnt) break; iov = uio->uio_iov + i; base = iov->iov_base; len = iov->iov_len; } else { len -= PAGE_SIZE; base += PAGE_SIZE; resid -= PAGE_SIZE; } } return (error); } /* * Common code for vn_io_fault(), agnostic to the kind of i/o request. * Uses vn_io_fault_doio() to make the call to an actual i/o function. * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request * into args and call vn_io_fault1() to handle faults during the user * mode buffer accesses. */ static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td) { vm_page_t ma[io_hold_cnt + 2]; struct uio *uio_clone, short_uio; struct iovec short_iovec[1]; vm_page_t *prev_td_ma; vm_prot_t prot; vm_offset_t addr, end; size_t len, resid; ssize_t adv; int error, cnt, saveheld, prev_td_ma_cnt; if (vn_io_fault_prefault) { error = vn_io_fault_prefault_user(uio); if (error != 0) return (error); /* Or ignore ? */ } prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; /* * The UFS follows IO_UNIT directive and replays back both * uio_offset and uio_resid if an error is encountered during the * operation. But, since the iovec may be already advanced, * uio is still in an inconsistent state. * * Cache a copy of the original uio, which is advanced to the redo * point using UIO_NOCOPY below. */ uio_clone = cloneuio(uio); resid = uio->uio_resid; short_uio.uio_segflg = UIO_USERSPACE; short_uio.uio_rw = uio->uio_rw; short_uio.uio_td = uio->uio_td; error = vn_io_fault_doio(args, uio, td); if (error != EFAULT) goto out; atomic_add_long(&vn_io_faults_cnt, 1); uio_clone->uio_segflg = UIO_NOCOPY; uiomove(NULL, resid - uio->uio_resid, uio_clone); uio_clone->uio_segflg = uio->uio_segflg; saveheld = curthread_pflags_set(TDP_UIOHELD); prev_td_ma = td->td_ma; prev_td_ma_cnt = td->td_ma_cnt; while (uio_clone->uio_resid != 0) { len = uio_clone->uio_iov->iov_len; if (len == 0) { KASSERT(uio_clone->uio_iovcnt >= 1, ("iovcnt underflow")); uio_clone->uio_iov++; uio_clone->uio_iovcnt--; continue; } if (len > ptoa(io_hold_cnt)) len = ptoa(io_hold_cnt); addr = (uintptr_t)uio_clone->uio_iov->iov_base; end = round_page(addr + len); if (end < addr) { error = EFAULT; break; } cnt = atop(end - trunc_page(addr)); /* * A perfectly misaligned address and length could cause * both the start and the end of the chunk to use partial * page. +2 accounts for such a situation. */ cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, addr, len, prot, ma, io_hold_cnt + 2); if (cnt == -1) { error = EFAULT; break; } short_uio.uio_iov = &short_iovec[0]; short_iovec[0].iov_base = (void *)addr; short_uio.uio_iovcnt = 1; short_uio.uio_resid = short_iovec[0].iov_len = len; short_uio.uio_offset = uio_clone->uio_offset; td->td_ma = ma; td->td_ma_cnt = cnt; error = vn_io_fault_doio(args, &short_uio, td); vm_page_unhold_pages(ma, cnt); adv = len - short_uio.uio_resid; uio_clone->uio_iov->iov_base = (char *)uio_clone->uio_iov->iov_base + adv; uio_clone->uio_iov->iov_len -= adv; uio_clone->uio_resid -= adv; uio_clone->uio_offset += adv; uio->uio_resid -= adv; uio->uio_offset += adv; if (error != 0 || adv == 0) break; } td->td_ma = prev_td_ma; td->td_ma_cnt = prev_td_ma_cnt; curthread_pflags_restore(saveheld); out: free(uio_clone, M_IOV); return (error); } static int vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { fo_rdwr_t *doio; struct vnode *vp; void *rl_cookie; struct vn_io_fault_args args; int error; doio = uio->uio_rw == UIO_READ ? vn_read : vn_write; vp = fp->f_vnode; /* * The ability to read(2) on a directory has historically been * allowed for all users, but this can and has been the source of * at least one security issue in the past. As such, it is now hidden * away behind a sysctl for those that actually need it to use it, and * restricted to root when it's turned on to make it relatively safe to * leave on for longer sessions of need. */ if (vp->v_type == VDIR) { KASSERT(uio->uio_rw == UIO_READ, ("illegal write attempted on a directory")); if (!vfs_allow_read_dir) return (EISDIR); if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0) return (EISDIR); } foffset_lock_uio(fp, uio, flags); if (do_vn_io_fault(vp, uio)) { args.kind = VN_IO_FAULT_FOP; args.args.fop_args.fp = fp; args.args.fop_args.doio = doio; args.cred = active_cred; args.flags = flags | FOF_OFFSET; if (uio->uio_rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } else if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0) { /* For appenders, punt and lock the whole range. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); } else { rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } error = vn_io_fault1(vp, uio, &args, td); vn_rangelock_unlock(vp, rl_cookie); } else { error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td); } foffset_unlock_uio(fp, uio, flags); return (error); } /* * Helper function to perform the requested uiomove operation using * the held pages for io->uio_iov[0].iov_base buffer instead of * copyin/copyout. Access to the pages with uiomove_fromphys() * instead of iov_base prevents page faults that could occur due to * pmap_collect() invalidating the mapping created by * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or * object cleanup revoking the write access from page mappings. * * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() * instead of plain uiomove(). */ int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) { struct uio transp_uio; struct iovec transp_iov[1]; struct thread *td; size_t adv; int error, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove(data, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); transp_iov[0].iov_base = data; transp_uio.uio_iov = &transp_iov[0]; transp_uio.uio_iovcnt = 1; if (xfersize > uio->uio_resid) xfersize = uio->uio_resid; transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; transp_uio.uio_offset = 0; transp_uio.uio_segflg = UIO_SYSSPACE; /* * Since transp_iov points to data, and td_ma page array * corresponds to original uio->uio_iov, we need to invert the * direction of the i/o operation as passed to * uiomove_fromphys(). */ switch (uio->uio_rw) { case UIO_WRITE: transp_uio.uio_rw = UIO_READ; break; case UIO_READ: transp_uio.uio_rw = UIO_WRITE; break; } transp_uio.uio_td = uio->uio_td; error = uiomove_fromphys(td->td_ma, ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, xfersize, &transp_uio); adv = xfersize - transp_uio.uio_resid; pgadv = (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; uio->uio_iov->iov_len -= adv; uio->uio_resid -= adv; uio->uio_offset += adv; return (error); } int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio) { struct thread *td; vm_offset_t iov_base; int cnt, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove_fromphys(ma, offset, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize; iov_base = (vm_offset_t)uio->uio_iov->iov_base; switch (uio->uio_rw) { case UIO_WRITE: pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, offset, cnt); break; case UIO_READ: pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, cnt); break; } pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)(iov_base + cnt); uio->uio_iov->iov_len -= cnt; uio->uio_resid -= cnt; uio->uio_offset += cnt; return (0); } /* * File table truncate routine. */ static int vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct mount *mp; struct vnode *vp; void *rl_cookie; int error; vp = fp->f_vnode; retry: /* * Lock the whole range for truncation. Otherwise split i/o * might happen partly before and partly after the truncation. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) goto out1; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error) goto out; #endif error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0, fp->f_cred); out: VOP_UNLOCK(vp); vn_finished_write(mp); out1: vn_rangelock_unlock(vp, rl_cookie); if (error == ERELOOKUP) goto retry; return (error); } /* * Truncate a file that is already locked. */ int vn_truncate_locked(struct vnode *vp, off_t length, bool sync, struct ucred *cred) { struct vattr vattr; int error; error = VOP_ADD_WRITECOUNT(vp, 1); if (error == 0) { VATTR_NULL(&vattr); vattr.va_size = length; if (sync) vattr.va_vaflags |= VA_SYNC; error = VOP_SETATTR(vp, &vattr, cred); VOP_ADD_WRITECOUNT_CHECKED(vp, -1); } return (error); } /* * File table vnode stat routine. */ int -vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred, - struct thread *td) +vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct vnode *vp = fp->f_vnode; int error; vn_lock(vp, LK_SHARED | LK_RETRY); - error = VOP_STAT(vp, sb, active_cred, fp->f_cred, td); + error = VOP_STAT(vp, sb, active_cred, fp->f_cred); VOP_UNLOCK(vp); return (error); } /* * File table vnode ioctl routine. */ static int vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { struct vattr vattr; struct vnode *vp; struct fiobmap2_arg *bmarg; int error; vp = fp->f_vnode; switch (vp->v_type) { case VDIR: case VREG: switch (com) { case FIONREAD: vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, active_cred); VOP_UNLOCK(vp); if (error == 0) *(int *)data = vattr.va_size - fp->f_offset; return (error); case FIOBMAP2: bmarg = (struct fiobmap2_arg *)data; vn_lock(vp, LK_SHARED | LK_RETRY); #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_BMAP(vp, bmarg->bn, NULL, &bmarg->bn, &bmarg->runp, &bmarg->runb); VOP_UNLOCK(vp); return (error); case FIONBIO: case FIOASYNC: return (0); default: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); } break; case VCHR: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); default: return (ENOTTY); } } /* * File table vnode poll routine. */ static int vn_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct vnode *vp; int error; vp = fp->f_vnode; #if defined(MAC) || defined(AUDIT) if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); VOP_UNLOCK(vp); if (error != 0) return (error); } #endif error = VOP_POLL(vp, events, fp->f_cred, td); return (error); } /* * Acquire the requested lock and then check for validity. LK_RETRY * permits vn_lock to return doomed vnodes. */ static int __noinline _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line, int error) { KASSERT((flags & LK_RETRY) == 0 || error == 0, ("vn_lock: error %d incompatible with flags %#x", error, flags)); if (error == 0) VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed")); if ((flags & LK_RETRY) == 0) { if (error == 0) { VOP_UNLOCK(vp); error = ENOENT; } return (error); } /* * LK_RETRY case. * * Nothing to do if we got the lock. */ if (error == 0) return (0); /* * Interlock was dropped by the call in _vn_lock. */ flags &= ~LK_INTERLOCK; do { error = VOP_LOCK1(vp, flags, file, line); } while (error != 0); return (0); } int _vn_lock(struct vnode *vp, int flags, const char *file, int line) { int error; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vn_lock: no locktype (%d passed)", flags)); VNPASS(vp->v_holdcnt > 0, vp); error = VOP_LOCK1(vp, flags, file, line); if (__predict_false(error != 0 || VN_IS_DOOMED(vp))) return (_vn_lock_fallback(vp, flags, file, line, error)); return (0); } /* * File table vnode close routine. */ static int vn_closefile(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; int error; bool ref; vp = fp->f_vnode; fp->f_ops = &badfileops; ref = (fp->f_flag & FHASLOCK) != 0; error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref); if (__predict_false(ref)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); vrele(vp); } return (error); } /* * Preparing to start a filesystem write operation. If the operation is * permitted, then we bump the count of operations in progress and * proceed. If a suspend request is in progress, we wait until the * suspension is over, and then proceed. */ static int vn_start_write_refed(struct mount *mp, int flags, bool mplocked) { struct mount_pcpu *mpcpu; int error, mflags; if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 && vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1); vfs_op_thread_exit(mp, mpcpu); return (0); } if (mplocked) mtx_assert(MNT_MTX(mp), MA_OWNED); else MNT_ILOCK(mp); error = 0; /* * Check on status of suspension. */ if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || mp->mnt_susp_owner != curthread) { mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0) | (PUSER - 1); while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { if (flags & V_NOWAIT) { error = EWOULDBLOCK; goto unlock; } error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0); if (error) goto unlock; } } if (flags & V_XSLEEP) goto unlock; mp->mnt_writeopcount++; unlock: if (error != 0 || (flags & V_XSLEEP) != 0) MNT_REL(mp); MNT_IUNLOCK(mp); return (error); } int vn_start_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), ("V_MNTREF requires mp")); error = 0; /* * If a vnode is provided, get and return the mount point that * to which it will write. */ if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } if ((mp = *mpp) == NULL) return (0); /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ if (vp == NULL && (flags & V_MNTREF) == 0) vfs_ref(mp); return (vn_start_write_refed(mp, flags, false)); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), ("V_MNTREF requires mp")); retry: if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if ((mp = *mpp) == NULL) return (0); /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ MNT_ILOCK(mp); if (vp == NULL && (flags & V_MNTREF) == 0) MNT_REF(mp); if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { mp->mnt_secondary_writes++; mp->mnt_secondary_accwrites++; MNT_IUNLOCK(mp); return (0); } if (flags & V_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP | ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0), "suspfs", 0); vfs_rel(mp); if (error == 0) goto retry; return (error); } /* * Filesystem write operation has completed. If we are suspending and this * operation is the last one, notify the suspender that the suspension is * now in effect. */ void vn_finished_write(struct mount *mp) { struct mount_pcpu *mpcpu; int c; if (mp == NULL) return; if (vfs_op_thread_enter(mp, mpcpu)) { vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1); vfs_mp_count_sub_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_writeopcount; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); } /* * Filesystem secondary write operation has completed. If we are * suspending and this operation is the last one, notify the suspender * that the suspension is now in effect. */ void vn_finished_secondary_write(struct mount *mp) { if (mp == NULL) return; MNT_ILOCK(mp); MNT_REL(mp); mp->mnt_secondary_writes--; if (mp->mnt_secondary_writes < 0) panic("vn_finished_secondary_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_secondary_writes <= 0) wakeup(&mp->mnt_secondary_writes); MNT_IUNLOCK(mp); } /* * Request a filesystem to suspend write operations. */ int vfs_write_suspend(struct mount *mp, int flags) { int error; vfs_op_enter(mp); MNT_ILOCK(mp); vfs_assert_mount_counters(mp); if (mp->mnt_susp_owner == curthread) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); return (EALREADY); } while (mp->mnt_kern_flag & MNTK_SUSPEND) msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); /* * Unmount holds a write reference on the mount point. If we * own busy reference and drain for writers, we deadlock with * the reference draining in the unmount path. Callers of * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if * vfs_busy() reference is owned and caller is not in the * unmount context. */ if ((flags & VS_SKIP_UNMOUNT) != 0 && (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); return (EBUSY); } mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = curthread; if (mp->mnt_writeopcount > 0) (void) msleep(&mp->mnt_writeopcount, MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); else MNT_IUNLOCK(mp); if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) { vfs_write_resume(mp, 0); /* vfs_write_resume does vfs_op_exit() for us */ } return (error); } /* * Request a filesystem to resume write operations. */ void vfs_write_resume(struct mount *mp, int flags) { MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED); mp->mnt_susp_owner = NULL; wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); curthread->td_pflags &= ~TDP_IGNSUSP; if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); mp->mnt_writeopcount++; } MNT_IUNLOCK(mp); if ((flags & VR_NO_SUSPCLR) == 0) VFS_SUSP_CLEAN(mp); vfs_op_exit(mp); } else if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); vn_start_write_refed(mp, 0, true); } else { MNT_IUNLOCK(mp); } } /* * Helper loop around vfs_write_suspend() for filesystem unmount VFS * methods. */ int vfs_write_suspend_umnt(struct mount *mp) { int error; KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, ("vfs_write_suspend_umnt: recursed")); /* dounmount() already called vn_start_write(). */ for (;;) { vn_finished_write(mp); error = vfs_write_suspend(mp, 0); if (error != 0) { vn_start_write(NULL, &mp, V_WAIT); return (error); } MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) break; MNT_IUNLOCK(mp); vn_start_write(NULL, &mp, V_WAIT); } mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); curthread->td_pflags |= TDP_IGNSUSP; return (0); } /* * Implement kqueues for files by translating it to vnode operation. */ static int vn_kqfilter(struct file *fp, struct knote *kn) { return (VOP_KQFILTER(fp->f_vnode, kn)); } int vn_kqfilter_opath(struct file *fp, struct knote *kn) { if ((fp->f_flag & FKQALLOWED) == 0) return (EBADF); return (vn_kqfilter(fp, kn)); } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_SHARED | LK_RETRY); ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? */ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute removal as kernel */ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp); } return (error); } static int vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); } int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) { return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, lkflags, rvp)); } int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp) { struct mount *mp; int ltype, error; ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get"); mp = vp->v_mount; ltype = VOP_ISLOCKED(vp); KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, ("vn_vget_ino: vp not locked")); error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp); error = vfs_busy(mp, 0); vn_lock(vp, ltype | LK_RETRY); vfs_rel(mp); if (error != 0) return (ENOENT); if (VN_IS_DOOMED(vp)) { vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp); error = alloc(mp, alloc_arg, lkflags, rvp); vfs_unbusy(mp); if (error != 0 || *rvp != vp) vn_lock(vp, ltype | LK_RETRY); if (VN_IS_DOOMED(vp)) { if (error == 0) { if (*rvp == vp) vunref(vp); else vput(*rvp); } error = ENOENT; } return (error); } int vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, struct thread *td) { off_t lim; bool ktr_write; if (td == NULL) return (0); /* * There are conditions where the limit is to be ignored. * However, since it is almost never reached, check it first. */ ktr_write = (td->td_pflags & TDP_INKTRACE) != 0; lim = lim_cur(td, RLIMIT_FSIZE); if (__predict_false(ktr_write)) lim = td->td_ktr_io_lim; if (__predict_true((uoff_t)uio->uio_offset + uio->uio_resid <= lim)) return (0); /* * The limit is reached. */ if (vp->v_type != VREG || (td->td_pflags2 & TDP2_ACCT) != 0) return (0); if (!ktr_write || ktr_filesize_limit_signal) { PROC_LOCK(td->td_proc); kern_psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); } return (EFBIG); } int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); #endif return (setfmode(td, active_cred, vp, mode)); } int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); #endif return (setfown(td, active_cred, vp, uid, gid)); } void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) { vm_object_t object; if ((object = vp->v_object) == NULL) return; VM_OBJECT_WLOCK(object); vm_object_page_remove(object, start, end, 0); VM_OBJECT_WUNLOCK(object); } int vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) { struct vattr va; daddr_t bn, bnp; uint64_t bsize; off_t noff; int error; KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, ("%s: Wrong command %lu", __func__, cmd)); ASSERT_VOP_LOCKED(vp, "vn_bmap_seekhole_locked"); if (vp->v_type != VREG) { error = ENOTTY; goto out; } error = VOP_GETATTR(vp, &va, cred); if (error != 0) goto out; noff = *off; if (noff >= va.va_size) { error = ENXIO; goto out; } bsize = vp->v_mount->mnt_stat.f_iosize; for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize - noff % bsize) { error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); if (error == EOPNOTSUPP) { error = ENOTTY; goto out; } if ((bnp == -1 && cmd == FIOSEEKHOLE) || (bnp != -1 && cmd == FIOSEEKDATA)) { noff = bn * bsize; if (noff < *off) noff = *off; goto out; } } if (noff > va.va_size) noff = va.va_size; /* noff == va.va_size. There is an implicit hole at the end of file. */ if (cmd == FIOSEEKDATA) error = ENXIO; out: if (error == 0) *off = noff; return (error); } int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) { int error; KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, ("%s: Wrong command %lu", __func__, cmd)); if (vn_lock(vp, LK_SHARED) != 0) return (EBADF); error = vn_bmap_seekhole_locked(vp, cmd, off, cred); VOP_UNLOCK(vp); return (error); } int vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct ucred *cred; struct vnode *vp; struct vattr vattr; off_t foffset, size; int error, noneg; cred = td->td_ucred; vp = fp->f_vnode; foffset = foffset_lock(fp, 0); noneg = (vp->v_type != VCHR); error = 0; switch (whence) { case L_INCR: if (noneg && (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); if (error) break; /* * If the file references a disk device, then fetch * the media size and use that to determine the ending * offset. */ if (vattr.va_size == 0 && vp->v_type == VCHR && fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) vattr.va_size = size; if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += vattr.va_size; break; case L_SET: break; case SEEK_DATA: error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); if (error == ENOTTY) error = EINVAL; break; case SEEK_HOLE: error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); if (error == ENOTTY) error = EINVAL; break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) goto drop; VFS_KNOTE_UNLOCKED(vp, 0); td->td_uretoff.tdu_off = offset; drop: foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { int error; /* * Grant permission if the caller is the owner of the file, or * the super-user, or has ACL_WRITE_ATTRIBUTES permission on * on the file. If the time pointer is null, then write * permission on the file is also sufficient. * * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES * will be allowed to set the times [..] to the current * server time. */ error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) error = VOP_ACCESS(vp, VWRITE, cred, td); return (error); } int vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct vnode *vp; int error; if (fp->f_type == DTYPE_FIFO) kif->kf_type = KF_TYPE_FIFO; else kif->kf_type = KF_TYPE_VNODE; vp = fp->f_vnode; vref(vp); FILEDESC_SUNLOCK(fdp); error = vn_fill_kinfo_vnode(vp, kif); vrele(vp); FILEDESC_SLOCK(fdp); return (error); } static inline void vn_fill_junk(struct kinfo_file *kif) { size_t len, olen; /* * Simulate vn_fullpath returning changing values for a given * vp during e.g. coredump. */ len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1; olen = strlen(kif->kf_path); if (len < olen) strcpy(&kif->kf_path[len - 1], "$"); else for (; olen < len; olen++) strcpy(&kif->kf_path[olen], "A"); } int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) { struct vattr va; char *fullpath, *freepath; int error; kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type); freepath = NULL; fullpath = "-"; error = vn_fullpath(vp, &fullpath, &freepath); if (error == 0) { strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); } if (freepath != NULL) free(freepath, M_TEMP); KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path, vn_fill_junk(kif); ); /* * Retrieve vnode attributes. */ va.va_fsid = VNOVAL; va.va_rdev = NODEV; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, curthread->td_ucred); VOP_UNLOCK(vp); if (error != 0) return (error); if (va.va_fsid != VNOVAL) kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; else kif->kf_un.kf_file.kf_file_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; kif->kf_un.kf_file.kf_file_fsid_freebsd11 = kif->kf_un.kf_file.kf_file_fsid; /* truncate */ kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); kif->kf_un.kf_file.kf_file_size = va.va_size; kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; kif->kf_un.kf_file.kf_file_rdev_freebsd11 = kif->kf_un.kf_file.kf_file_rdev; /* truncate */ return (0); } int vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { #ifdef HWPMC_HOOKS struct pmckern_map_in pkm; #endif struct mount *mp; struct vnode *vp; vm_object_t object; vm_prot_t maxprot; boolean_t writecounted; int error; #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support * read(2)/write(2) -- or even open(2). Thus, we can * use MAP_ASYNC to trade on-disk coherence for speed. * The shm_open(3) library routine turns on the FPOSIXSHM * flag to request this behavior. */ if ((fp->f_flag & FPOSIXSHM) != 0) flags |= MAP_NOSYNC; #endif vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } else { maxprot |= VM_PROT_WRITE; cap_maxprot |= VM_PROT_WRITE; } maxprot &= cap_maxprot; /* * For regular files and shared memory, POSIX requires that * the value of foff be a legitimate offset within the data * object. In particular, negative offsets are invalid. * Blocking negative offsets and overflows here avoids * possible wraparound or user-level access into reserved * ranges of the data object later. In contrast, POSIX does * not dictate how offsets are used by device drivers, so in * the case of a device mapping a negative offset is passed * on. */ if ( #ifdef _LP64 size > OFF_MAX || #endif foff > OFF_MAX - size) return (EINVAL); writecounted = FALSE; error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, &foff, &object, &writecounted); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, writecounted, td); if (error != 0) { /* * If this mapping was accounted for in the vnode's * writecount, then undo that now. */ if (writecounted) vm_pager_release_writecount(object, 0, size); vm_object_deallocate(object); } #ifdef HWPMC_HOOKS /* Inform hwpmc(4) if an executable is being mapped. */ if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) { if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) { pkm.pm_file = vp; pkm.pm_address = (uintptr_t) *addr; PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm); } } #endif return (error); } void vn_fsid(struct vnode *vp, struct vattr *va) { fsid_t *f; f = &vp->v_mount->mnt_stat.f_fsid; va->va_fsid = (uint32_t)f->val[1]; va->va_fsid <<= sizeof(f->val[1]) * NBBY; va->va_fsid += (uint32_t)f->val[0]; } int vn_fsync_buf(struct vnode *vp, int waitfor) { struct buf *bp, *nbp; struct bufobj *bo; struct mount *mp; int error, maxretry; error = 0; maxretry = 10000; /* large, arbitrarily chosen */ mp = NULL; if (vp->v_type == VCHR) { VI_LOCK(vp); mp = vp->v_rdev->si_mountpt; VI_UNLOCK(vp); } bo = &vp->v_bufobj; BO_LOCK(bo); loop1: /* * MARK/SCAN initialization to avoid infinite loops. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { bp->b_vflags &= ~BV_SCANNED; bp->b_error = 0; } /* * Flush all dirty buffers associated with a vnode. */ loop2: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { if (waitfor != MNT_WAIT) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL, BO_LOCKPTR(bo)) != 0) { BO_LOCK(bo); goto loop1; } BO_LOCK(bo); } BO_UNLOCK(bo); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if ((bp->b_flags & B_DELWRI) == 0) panic("fsync: not dirty"); if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { vfs_bio_awrite(bp); } else { bremfree(bp); bawrite(bp); } if (maxretry < 1000) pause("dirty", hz < 1000 ? 1 : hz / 1000); BO_LOCK(bo); goto loop2; } /* * If synchronous the caller expects us to completely resolve all * dirty buffers in the system. Wait for in-progress I/O to * complete (which could include background bitmap writes), then * retry if dirty blocks still exist. */ if (waitfor == MNT_WAIT) { bufobj_wwait(bo, 0, 0); if (bo->bo_dirty.bv_cnt > 0) { /* * If we are unable to write any of these buffers * then we fail now rather than trying endlessly * to write them out. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) if ((error = bp->b_error) != 0) break; if ((mp != NULL && mp->mnt_secondary_writes > 0) || (error == 0 && --maxretry >= 0)) goto loop1; if (error == 0) error = EAGAIN; } } BO_UNLOCK(bo); if (error != 0) vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error); return (error); } /* * Copies a byte range from invp to outvp. Calls VOP_COPY_FILE_RANGE() * or vn_generic_copy_file_range() after rangelocking the byte ranges, * to do the actual copy. * vn_generic_copy_file_range() is factored out, so it can be called * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from * different file systems. */ int vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) { int error; size_t len; uint64_t uval; len = *lenp; *lenp = 0; /* For error returns. */ error = 0; /* Do some sanity checks on the arguments. */ if (invp->v_type == VDIR || outvp->v_type == VDIR) error = EISDIR; else if (*inoffp < 0 || *outoffp < 0 || invp->v_type != VREG || outvp->v_type != VREG) error = EINVAL; if (error != 0) goto out; /* Ensure offset + len does not wrap around. */ uval = *inoffp; uval += len; if (uval > INT64_MAX) len = INT64_MAX - *inoffp; uval = *outoffp; uval += len; if (uval > INT64_MAX) len = INT64_MAX - *outoffp; if (len == 0) goto out; /* * If the two vnode are for the same file system, call * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range() * which can handle copies across multiple file systems. */ *lenp = len; if (invp->v_mount == outvp->v_mount) error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp, lenp, flags, incred, outcred, fsize_td); else error = vn_generic_copy_file_range(invp, inoffp, outvp, outoffp, lenp, flags, incred, outcred, fsize_td); out: return (error); } /* * Test len bytes of data starting at dat for all bytes == 0. * Return true if all bytes are zero, false otherwise. * Expects dat to be well aligned. */ static bool mem_iszero(void *dat, int len) { int i; const u_int *p; const char *cp; for (p = dat; len > 0; len -= sizeof(*p), p++) { if (len >= sizeof(*p)) { if (*p != 0) return (false); } else { cp = (const char *)p; for (i = 0; i < len; i++, cp++) if (*cp != '\0') return (false); } } return (true); } /* * Look for a hole in the output file and, if found, adjust *outoffp * and *xferp to skip past the hole. * *xferp is the entire hole length to be written and xfer2 is how many bytes * to be written as 0's upon return. */ static off_t vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp, off_t *dataoffp, off_t *holeoffp, struct ucred *cred) { int error; off_t delta; if (*holeoffp == 0 || *holeoffp <= *outoffp) { *dataoffp = *outoffp; error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred, curthread); if (error == 0) { *holeoffp = *dataoffp; error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred, curthread); } if (error != 0 || *holeoffp == *dataoffp) { /* * Since outvp is unlocked, it may be possible for * another thread to do a truncate(), lseek(), write() * creating a hole at startoff between the above * VOP_IOCTL() calls, if the other thread does not do * rangelocking. * If that happens, *holeoffp == *dataoffp and finding * the hole has failed, so disable vn_skip_hole(). */ *holeoffp = -1; /* Disable use of vn_skip_hole(). */ return (xfer2); } KASSERT(*dataoffp >= *outoffp, ("vn_skip_hole: dataoff=%jd < outoff=%jd", (intmax_t)*dataoffp, (intmax_t)*outoffp)); KASSERT(*holeoffp > *dataoffp, ("vn_skip_hole: holeoff=%jd <= dataoff=%jd", (intmax_t)*holeoffp, (intmax_t)*dataoffp)); } /* * If there is a hole before the data starts, advance *outoffp and * *xferp past the hole. */ if (*dataoffp > *outoffp) { delta = *dataoffp - *outoffp; if (delta >= *xferp) { /* Entire *xferp is a hole. */ *outoffp += *xferp; *xferp = 0; return (0); } *xferp -= delta; *outoffp += delta; xfer2 = MIN(xfer2, *xferp); } /* * If a hole starts before the end of this xfer2, reduce this xfer2 so * that the write ends at the start of the hole. * *holeoffp should always be greater than *outoffp, but for the * non-INVARIANTS case, check this to make sure xfer2 remains a sane * value. */ if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2) xfer2 = *holeoffp - *outoffp; return (xfer2); } /* * Write an xfer sized chunk to outvp in blksize blocks from dat. * dat is a maximum of blksize in length and can be written repeatedly in * the chunk. * If growfile == true, just grow the file via vn_truncate_locked() instead * of doing actual writes. * If checkhole == true, a hole is being punched, so skip over any hole * already in the output file. */ static int vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer, u_long blksize, bool growfile, bool checkhole, struct ucred *cred) { struct mount *mp; off_t dataoff, holeoff, xfer2; int error; /* * Loop around doing writes of blksize until write has been completed. * Lock/unlock on each loop iteration so that a bwillwrite() can be * done for each iteration, since the xfer argument can be very * large if there is a large hole to punch in the output file. */ error = 0; holeoff = 0; do { xfer2 = MIN(xfer, blksize); if (checkhole) { /* * Punching a hole. Skip writing if there is * already a hole in the output file. */ xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer, &dataoff, &holeoff, cred); if (xfer == 0) break; if (holeoff < 0) checkhole = false; KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd", (intmax_t)xfer2)); } bwillwrite(); mp = NULL; error = vn_start_write(outvp, &mp, V_WAIT); if (error != 0) break; if (growfile) { error = vn_lock(outvp, LK_EXCLUSIVE); if (error == 0) { error = vn_truncate_locked(outvp, outoff + xfer, false, cred); VOP_UNLOCK(outvp); } } else { error = vn_lock(outvp, vn_lktype_write(mp, outvp)); if (error == 0) { error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2, outoff, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, cred, NULL, curthread); outoff += xfer2; xfer -= xfer2; VOP_UNLOCK(outvp); } } if (mp != NULL) vn_finished_write(mp); } while (!growfile && xfer > 0 && error == 0); return (error); } /* * Copy a byte range of one file to another. This function can handle the * case where invp and outvp are on different file systems. * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there * is no better file system specific way to do it. */ int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) { struct vattr va, inva; struct mount *mp; struct uio io; off_t startoff, endoff, xfer, xfer2; u_long blksize; int error, interrupted; bool cantseek, readzeros, eof, lastblock, holetoeof; ssize_t aresid; size_t copylen, len, rem, savlen; char *dat; long holein, holeout; struct timespec curts, endts; holein = holeout = 0; savlen = len = *lenp; error = 0; interrupted = 0; dat = NULL; error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0) holein = 0; if (holein > 0) error = VOP_GETATTR(invp, &inva, incred); VOP_UNLOCK(invp); if (error != 0) goto out; mp = NULL; error = vn_start_write(outvp, &mp, V_WAIT); if (error == 0) error = vn_lock(outvp, LK_EXCLUSIVE); if (error == 0) { /* * If fsize_td != NULL, do a vn_rlimit_fsize() call, * now that outvp is locked. */ if (fsize_td != NULL) { io.uio_offset = *outoffp; io.uio_resid = len; error = vn_rlimit_fsize(outvp, &io, fsize_td); if (error != 0) error = EFBIG; } if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0) holeout = 0; /* * Holes that are past EOF do not need to be written as a block * of zero bytes. So, truncate the output file as far as * possible and then use va.va_size to decide if writing 0 * bytes is necessary in the loop below. */ if (error == 0) error = VOP_GETATTR(outvp, &va, outcred); if (error == 0 && va.va_size > *outoffp && va.va_size <= *outoffp + len) { #ifdef MAC error = mac_vnode_check_write(curthread->td_ucred, outcred, outvp); if (error == 0) #endif error = vn_truncate_locked(outvp, *outoffp, false, outcred); if (error == 0) va.va_size = *outoffp; } VOP_UNLOCK(outvp); } if (mp != NULL) vn_finished_write(mp); if (error != 0) goto out; /* * Set the blksize to the larger of the hole sizes for invp and outvp. * If hole sizes aren't available, set the blksize to the larger * f_iosize of invp and outvp. * This code expects the hole sizes and f_iosizes to be powers of 2. * This value is clipped at 4Kbytes and 1Mbyte. */ blksize = MAX(holein, holeout); /* Clip len to end at an exact multiple of hole size. */ if (blksize > 1) { rem = *inoffp % blksize; if (rem > 0) rem = blksize - rem; if (len > rem && len - rem > blksize) len = savlen = rounddown(len - rem, blksize) + rem; } if (blksize <= 1) blksize = MAX(invp->v_mount->mnt_stat.f_iosize, outvp->v_mount->mnt_stat.f_iosize); if (blksize < 4096) blksize = 4096; else if (blksize > 1024 * 1024) blksize = 1024 * 1024; dat = malloc(blksize, M_TEMP, M_WAITOK); /* * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA * to find holes. Otherwise, just scan the read block for all 0s * in the inner loop where the data copying is done. * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may * support holes on the server, but do not support FIOSEEKHOLE. * The kernel flag COPY_FILE_RANGE_TIMEO1SEC is used to indicate * that this function should return after 1second with a partial * completion. */ if ((flags & COPY_FILE_RANGE_TIMEO1SEC) != 0) { getnanouptime(&endts); endts.tv_sec++; } else timespecclear(&endts); holetoeof = eof = false; while (len > 0 && error == 0 && !eof && interrupted == 0) { endoff = 0; /* To shut up compilers. */ cantseek = true; startoff = *inoffp; copylen = len; /* * Find the next data area. If there is just a hole to EOF, * FIOSEEKDATA should fail with ENXIO. * (I do not know if any file system will report a hole to * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA * will fail for those file systems.) * * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE, * the code just falls through to the inner copy loop. */ error = EINVAL; if (holein > 0) { error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0, incred, curthread); if (error == ENXIO) { startoff = endoff = inva.va_size; eof = holetoeof = true; error = 0; } } if (error == 0 && !holetoeof) { endoff = startoff; error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0, incred, curthread); /* * Since invp is unlocked, it may be possible for * another thread to do a truncate(), lseek(), write() * creating a hole at startoff between the above * VOP_IOCTL() calls, if the other thread does not do * rangelocking. * If that happens, startoff == endoff and finding * the hole has failed, so set an error. */ if (error == 0 && startoff == endoff) error = EINVAL; /* Any error. Reset to 0. */ } if (error == 0) { if (startoff > *inoffp) { /* Found hole before data block. */ xfer = MIN(startoff - *inoffp, len); if (*outoffp < va.va_size) { /* Must write 0s to punch hole. */ xfer2 = MIN(va.va_size - *outoffp, xfer); memset(dat, 0, MIN(xfer2, blksize)); error = vn_write_outvp(outvp, dat, *outoffp, xfer2, blksize, false, holeout > 0, outcred); } if (error == 0 && *outoffp + xfer > va.va_size && (xfer == len || holetoeof)) { /* Grow output file (hole at end). */ error = vn_write_outvp(outvp, dat, *outoffp, xfer, blksize, true, false, outcred); } if (error == 0) { *inoffp += xfer; *outoffp += xfer; len -= xfer; if (len < savlen) { interrupted = sig_intr(); if (timespecisset(&endts) && interrupted == 0) { getnanouptime(&curts); if (timespeccmp(&curts, &endts, >=)) interrupted = EINTR; } } } } copylen = MIN(len, endoff - startoff); cantseek = false; } else { cantseek = true; startoff = *inoffp; copylen = len; error = 0; } xfer = blksize; if (cantseek) { /* * Set first xfer to end at a block boundary, so that * holes are more likely detected in the loop below via * the for all bytes 0 method. */ xfer -= (*inoffp % blksize); } /* Loop copying the data block. */ while (copylen > 0 && error == 0 && !eof && interrupted == 0) { if (copylen < xfer) xfer = copylen; error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; error = vn_rdwr(UIO_READ, invp, dat, xfer, startoff, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, incred, &aresid, curthread); VOP_UNLOCK(invp); lastblock = false; if (error == 0 && aresid > 0) { /* Stop the copy at EOF on the input file. */ xfer -= aresid; eof = true; lastblock = true; } if (error == 0) { /* * Skip the write for holes past the initial EOF * of the output file, unless this is the last * write of the output file at EOF. */ readzeros = cantseek ? mem_iszero(dat, xfer) : false; if (xfer == len) lastblock = true; if (!cantseek || *outoffp < va.va_size || lastblock || !readzeros) error = vn_write_outvp(outvp, dat, *outoffp, xfer, blksize, readzeros && lastblock && *outoffp >= va.va_size, false, outcred); if (error == 0) { *inoffp += xfer; startoff += xfer; *outoffp += xfer; copylen -= xfer; len -= xfer; if (len < savlen) { interrupted = sig_intr(); if (timespecisset(&endts) && interrupted == 0) { getnanouptime(&curts); if (timespeccmp(&curts, &endts, >=)) interrupted = EINTR; } } } } xfer = blksize; } } out: *lenp = savlen - len; free(dat, M_TEMP); return (error); } static int vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) { struct mount *mp; struct vnode *vp; off_t olen, ooffset; int error; #ifdef AUDIT int audited_vnode1 = 0; #endif vp = fp->f_vnode; if (vp->v_type != VREG) return (ENODEV); /* Allocating blocks may take a long time, so iterate. */ for (;;) { olen = len; ooffset = offset; bwillwrite(); mp = NULL; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) break; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { vn_finished_write(mp); break; } #ifdef AUDIT if (!audited_vnode1) { AUDIT_ARG_VNODE1(vp); audited_vnode1 = 1; } #endif #ifdef MAC error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif error = VOP_ALLOCATE(vp, &offset, &len); VOP_UNLOCK(vp); vn_finished_write(mp); if (olen + ooffset != offset + len) { panic("offset + len changed from %jx/%jx to %jx/%jx", ooffset, olen, offset, len); } if (error != 0 || len == 0) break; KASSERT(olen > len, ("Iteration did not make progress?")); maybe_yield(); } return (error); } static int vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags, int ioflag, struct ucred *cred, struct ucred *active_cred, struct ucred *file_cred) { struct mount *mp; void *rl_cookie; off_t off, len; int error; #ifdef AUDIT bool audited_vnode1 = false; #endif rl_cookie = NULL; error = 0; mp = NULL; off = *offset; len = *length; if ((ioflag & (IO_NODELOCKED | IO_RANGELOCKED)) == 0) rl_cookie = vn_rangelock_wlock(vp, off, off + len); while (len > 0 && error == 0) { /* * Try to deallocate the longest range in one pass. * In case a pass takes too long to be executed, it returns * partial result. The residue will be proceeded in the next * pass. */ if ((ioflag & IO_NODELOCKED) == 0) { bwillwrite(); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); } #ifdef AUDIT if (!audited_vnode1) { AUDIT_ARG_VNODE1(vp); audited_vnode1 = true; } #endif #ifdef MAC if ((ioflag & IO_NOMACCHECK) == 0) error = mac_vnode_check_write(active_cred, file_cred, vp); #endif if (error == 0) error = VOP_DEALLOCATE(vp, &off, &len, flags, ioflag, cred); if ((ioflag & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp); if (mp != NULL) { vn_finished_write(mp); mp = NULL; } } if (error == 0 && len != 0) maybe_yield(); } out: if (rl_cookie != NULL) vn_rangelock_unlock(vp, rl_cookie); *offset = off; *length = len; return (error); } /* * This function is supposed to be used in the situations where the deallocation * is not triggered by a user request. */ int vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags, int ioflag, struct ucred *active_cred, struct ucred *file_cred) { struct ucred *cred; if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset || flags != 0) return (EINVAL); if (vp->v_type != VREG) return (ENODEV); cred = file_cred != NOCRED ? file_cred : active_cred; return (vn_deallocate_impl(vp, offset, length, flags, ioflag, cred, active_cred, file_cred)); } static int vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td) { int error; struct vnode *vp; int ioflag; vp = fp->f_vnode; if (cmd != SPACECTL_DEALLOC || *offset < 0 || *length <= 0 || *length > OFF_MAX - *offset || flags != 0) return (EINVAL); if (vp->v_type != VREG) return (ENODEV); ioflag = get_write_ioflag(fp); switch (cmd) { case SPACECTL_DEALLOC: error = vn_deallocate_impl(vp, offset, length, flags, ioflag, active_cred, active_cred, fp->f_cred); break; default: panic("vn_fspacectl: unknown cmd %d", cmd); } return (error); } static u_long vn_lock_pair_pause_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD, &vn_lock_pair_pause_cnt, 0, "Count of vn_lock_pair deadlocks"); u_int vn_lock_pair_pause_max; SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW, &vn_lock_pair_pause_max, 0, "Max ticks for vn_lock_pair deadlock avoidance sleep"); static void vn_lock_pair_pause(const char *wmesg) { atomic_add_long(&vn_lock_pair_pause_cnt, 1); pause(wmesg, prng32_bounded(vn_lock_pair_pause_max)); } /* * Lock pair of vnodes vp1, vp2, avoiding lock order reversal. * vp1_locked indicates whether vp1 is exclusively locked; if not, vp1 * must be unlocked. Same for vp2 and vp2_locked. One of the vnodes * can be NULL. * * The function returns with both vnodes exclusively locked, and * guarantees that it does not create lock order reversal with other * threads during its execution. Both vnodes could be unlocked * temporary (and reclaimed). */ void vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2, bool vp2_locked) { int error; if (vp1 == NULL && vp2 == NULL) return; if (vp1 != NULL) { if (vp1_locked) ASSERT_VOP_ELOCKED(vp1, "vp1"); else ASSERT_VOP_UNLOCKED(vp1, "vp1"); } else { vp1_locked = true; } if (vp2 != NULL) { if (vp2_locked) ASSERT_VOP_ELOCKED(vp2, "vp2"); else ASSERT_VOP_UNLOCKED(vp2, "vp2"); } else { vp2_locked = true; } if (!vp1_locked && !vp2_locked) { vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY); vp1_locked = true; } for (;;) { if (vp1_locked && vp2_locked) break; if (vp1_locked && vp2 != NULL) { if (vp1 != NULL) { error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT, __FILE__, __LINE__); if (error == 0) break; VOP_UNLOCK(vp1); vp1_locked = false; vn_lock_pair_pause("vlp1"); } vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY); vp2_locked = true; } if (vp2_locked && vp1 != NULL) { if (vp2 != NULL) { error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT, __FILE__, __LINE__); if (error == 0) break; VOP_UNLOCK(vp2); vp2_locked = false; vn_lock_pair_pause("vlp2"); } vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY); vp1_locked = true; } } if (vp1 != NULL) ASSERT_VOP_ELOCKED(vp1, "vp1 ret"); if (vp2 != NULL) ASSERT_VOP_ELOCKED(vp2, "vp2 ret"); } int vn_lktype_write(struct mount *mp, struct vnode *vp) { if (MNT_SHARED_WRITES(mp) || (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) return (LK_SHARED); return (LK_EXCLUSIVE); } diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index ff57d1c9a28e..d10758019b87 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -1,839 +1,838 @@ #- # Copyright (c) 1992, 1993 # The Regents of the University of California. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. Neither the name of the University nor the names of its contributors # may be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # @(#)vnode_if.src 8.12 (Berkeley) 5/14/95 # $FreeBSD$ # # # Above each of the vop descriptors in lines starting with %% # is a specification of the locking protocol used by each vop call. # The first column is the name of the variable, the remaining three # columns are in, out and error respectively. The "in" column defines # the lock state on input, the "out" column defines the state on successful # return, and the "error" column defines the locking state on error exit. # # The locking value can take the following values: # L: locked; not converted to type of lock. # E: locked with exclusive lock for this process. # U: unlocked. # -: not applicable. vnode does not yet (or no longer) exists. # =: the same on input and output, may be either L or U. # # The paramater named "vpp" is assumed to be always used with double # indirection (**vpp) and that name is hard-coded in vnode_if.awk ! # # Lines starting with %! specify a pre or post-condition function # to call before/after the vop call. # # If other such parameters are introduced, they have to be added to # the AWK script at the head of the definition of "add_debug_code()". # vop_islocked { IN struct vnode *vp; }; %% lookup dvp L L L %% lookup vpp - L - # XXX - the lookup locking protocol defies simple description and depends # on the flags and operation fields in the (cnp) structure. Note # especially that *vpp may equal dvp and both may be locked. vop_lookup { IN struct vnode *dvp; INOUT struct vnode **vpp; IN struct componentname *cnp; }; %% cachedlookup dvp L L L %% cachedlookup vpp - L - # This must be an exact copy of lookup. See kern/vfs_cache.c for details. vop_cachedlookup { IN struct vnode *dvp; INOUT struct vnode **vpp; IN struct componentname *cnp; }; %% create dvp E E E %% create vpp - L - %! create pre vop_create_pre %! create post vop_create_post vop_create { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% whiteout dvp E E E %! whiteout pre vop_whiteout_pre %! whiteout post vop_whiteout_post vop_whiteout { IN struct vnode *dvp; IN struct componentname *cnp; IN int flags; }; %% mknod dvp E E E %% mknod vpp - L - %! mknod pre vop_mknod_pre %! mknod post vop_mknod_post vop_mknod { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% open vp L L L %! open post vop_open_post vop_open { IN struct vnode *vp; IN int mode; IN struct ucred *cred; IN struct thread *td; IN struct file *fp; }; %% close vp L L L %! close post vop_close_post vop_close { IN struct vnode *vp; IN int fflag; IN struct ucred *cred; IN struct thread *td; }; %% fplookup_vexec vp - - - %! fplookup_vexec debugpre vop_fplookup_vexec_debugpre %! fplookup_vexec debugpost vop_fplookup_vexec_debugpost vop_fplookup_vexec { IN struct vnode *vp; IN struct ucred *cred; }; %% fplookup_symlink vp - - - %! fplookup_symlink debugpre vop_fplookup_symlink_debugpre %! fplookup_symlink debugpost vop_fplookup_symlink_debugpost vop_fplookup_symlink { IN struct vnode *vp; IN struct cache_fpl *fpl; }; %% access vp L L L vop_access { IN struct vnode *vp; IN accmode_t accmode; IN struct ucred *cred; IN struct thread *td; }; %% accessx vp L L L vop_accessx { IN struct vnode *vp; IN accmode_t accmode; IN struct ucred *cred; IN struct thread *td; }; %% stat vp L L L vop_stat { IN struct vnode *vp; OUT struct stat *sb; IN struct ucred *active_cred; IN struct ucred *file_cred; - IN struct thread *td; }; %% getattr vp L L L vop_getattr { IN struct vnode *vp; OUT struct vattr *vap; IN struct ucred *cred; }; %% setattr vp E E E %! setattr pre vop_setattr_pre %! setattr post vop_setattr_post vop_setattr { IN struct vnode *vp; IN struct vattr *vap; IN struct ucred *cred; }; %% mmapped vp L L L vop_mmapped { IN struct vnode *vp; }; %% read vp L L L %! read post vop_read_post vop_read { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; %% read_pgcache vp - - - %! read_pgcache post vop_read_pgcache_post vop_read_pgcache { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; %% write vp L L L %! write pre VOP_WRITE_PRE %! write post VOP_WRITE_POST vop_write { IN struct vnode *vp; INOUT struct uio *uio; IN int ioflag; IN struct ucred *cred; }; %% ioctl vp U U U vop_ioctl { IN struct vnode *vp; IN u_long command; IN void *data; IN int fflag; IN struct ucred *cred; IN struct thread *td; }; %% poll vp U U U vop_poll { IN struct vnode *vp; IN int events; IN struct ucred *cred; IN struct thread *td; }; %% kqfilter vp U U U vop_kqfilter { IN struct vnode *vp; IN struct knote *kn; }; %% revoke vp L L L vop_revoke { IN struct vnode *vp; IN int flags; }; %% fsync vp L L L vop_fsync { IN struct vnode *vp; IN int waitfor; IN struct thread *td; }; %% remove dvp E E E %% remove vp E E E %! remove pre vop_remove_pre %! remove post vop_remove_post vop_remove { IN struct vnode *dvp; IN struct vnode *vp; IN struct componentname *cnp; }; %% link tdvp E E E %% link vp E E E %! link pre vop_link_pre %! link post vop_link_post vop_link { IN struct vnode *tdvp; IN struct vnode *vp; IN struct componentname *cnp; }; %! rename pre vop_rename_pre %! rename post vop_rename_post vop_rename { IN WILLRELE struct vnode *fdvp; IN WILLRELE struct vnode *fvp; IN struct componentname *fcnp; IN WILLRELE struct vnode *tdvp; IN WILLRELE struct vnode *tvp; IN struct componentname *tcnp; }; %% mkdir dvp E E E %% mkdir vpp - E - %! mkdir pre vop_mkdir_pre %! mkdir post vop_mkdir_post %! mkdir debugpost vop_mkdir_debugpost vop_mkdir { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; }; %% rmdir dvp E E E %% rmdir vp E E E %! rmdir pre vop_rmdir_pre %! rmdir post vop_rmdir_post vop_rmdir { IN struct vnode *dvp; IN struct vnode *vp; IN struct componentname *cnp; }; %% symlink dvp E E E %% symlink vpp - E - %! symlink pre vop_symlink_pre %! symlink post vop_symlink_post vop_symlink { IN struct vnode *dvp; OUT struct vnode **vpp; IN struct componentname *cnp; IN struct vattr *vap; IN const char *target; }; %% readdir vp L L L %! readdir post vop_readdir_post vop_readdir { IN struct vnode *vp; INOUT struct uio *uio; IN struct ucred *cred; INOUT int *eofflag; OUT int *ncookies; INOUT u_long **cookies; }; %% readlink vp L L L vop_readlink { IN struct vnode *vp; INOUT struct uio *uio; IN struct ucred *cred; }; %% inactive vp E E E vop_inactive { IN struct vnode *vp; }; %! need_inactive debugpre vop_need_inactive_debugpre %! need_inactive debugpost vop_need_inactive_debugpost vop_need_inactive { IN struct vnode *vp; }; %% reclaim vp E E E %! reclaim post vop_reclaim_post vop_reclaim { IN struct vnode *vp; }; %! lock1 debugpre vop_lock_debugpre %! lock1 debugpost vop_lock_debugpost vop_lock1 { IN struct vnode *vp; IN int flags; IN const char *file; IN int line; }; %! unlock debugpre vop_unlock_debugpre vop_unlock { IN struct vnode *vp; }; %% bmap vp L L L vop_bmap { IN struct vnode *vp; IN daddr_t bn; OUT struct bufobj **bop; IN daddr_t *bnp; OUT int *runp; OUT int *runb; }; %% strategy vp L L L %! strategy debugpre vop_strategy_debugpre vop_strategy { IN struct vnode *vp; IN struct buf *bp; }; %% getwritemount vp = = = vop_getwritemount { IN struct vnode *vp; OUT struct mount **mpp; }; %% print vp - - - vop_print { IN struct vnode *vp; }; %% pathconf vp L L L vop_pathconf { IN struct vnode *vp; IN int name; OUT long *retval; }; %% advlock vp U U U vop_advlock { IN struct vnode *vp; IN void *id; IN int op; IN struct flock *fl; IN int flags; }; %% advlockasync vp U U U vop_advlockasync { IN struct vnode *vp; IN void *id; IN int op; IN struct flock *fl; IN int flags; IN struct task *task; INOUT void **cookiep; }; %% advlockpurge vp E E E vop_advlockpurge { IN struct vnode *vp; }; %% reallocblks vp E E E vop_reallocblks { IN struct vnode *vp; IN struct cluster_save *buflist; }; %% getpages vp L L L vop_getpages { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int *rbehind; IN int *rahead; }; %% getpages_async vp L L L vop_getpages_async { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int *rbehind; IN int *rahead; IN vop_getpages_iodone_t *iodone; IN void *arg; }; %% putpages vp L L L vop_putpages { IN struct vnode *vp; IN vm_page_t *m; IN int count; IN int sync; IN int *rtvals; }; %% getacl vp L L L vop_getacl { IN struct vnode *vp; IN acl_type_t type; OUT struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% setacl vp E E E %! setacl pre vop_setacl_pre %! setacl post vop_setacl_post vop_setacl { IN struct vnode *vp; IN acl_type_t type; IN struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% aclcheck vp = = = vop_aclcheck { IN struct vnode *vp; IN acl_type_t type; IN struct acl *aclp; IN struct ucred *cred; IN struct thread *td; }; %% closeextattr vp L L L vop_closeextattr { IN struct vnode *vp; IN int commit; IN struct ucred *cred; IN struct thread *td; }; %% getextattr vp L L L vop_getextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; INOUT struct uio *uio; OUT size_t *size; IN struct ucred *cred; IN struct thread *td; }; %% listextattr vp L L L vop_listextattr { IN struct vnode *vp; IN int attrnamespace; INOUT struct uio *uio; OUT size_t *size; IN struct ucred *cred; IN struct thread *td; }; %% openextattr vp L L L vop_openextattr { IN struct vnode *vp; IN struct ucred *cred; IN struct thread *td; }; %% deleteextattr vp E E E %! deleteextattr pre vop_deleteextattr_pre %! deleteextattr post vop_deleteextattr_post vop_deleteextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; IN struct ucred *cred; IN struct thread *td; }; %% setextattr vp E E E %! setextattr pre vop_setextattr_pre %! setextattr post vop_setextattr_post vop_setextattr { IN struct vnode *vp; IN int attrnamespace; IN const char *name; INOUT struct uio *uio; IN struct ucred *cred; IN struct thread *td; }; %% setlabel vp E E E vop_setlabel { IN struct vnode *vp; IN struct label *label; IN struct ucred *cred; IN struct thread *td; }; %% vptofh vp = = = vop_vptofh { IN struct vnode *vp; IN struct fid *fhp; }; %% vptocnp vp L L L %% vptocnp vpp - U - vop_vptocnp { IN struct vnode *vp; OUT struct vnode **vpp; INOUT char *buf; INOUT size_t *buflen; }; %% allocate vp E E E vop_allocate { IN struct vnode *vp; INOUT off_t *offset; INOUT off_t *len; }; %% advise vp U U U vop_advise { IN struct vnode *vp; IN off_t start; IN off_t end; IN int advice; }; %% unp_bind vp E E E vop_unp_bind { IN struct vnode *vp; IN struct unpcb *unpcb; }; %% unp_connect vp L L L vop_unp_connect { IN struct vnode *vp; OUT struct unpcb **unpcb; }; %% unp_detach vp = = = vop_unp_detach { IN struct vnode *vp; }; %% is_text vp L L L vop_is_text { IN struct vnode *vp; }; %% set_text vp = = = vop_set_text { IN struct vnode *vp; }; %% vop_unset_text vp L L L vop_unset_text { IN struct vnode *vp; }; %% add_writecount vp L L L vop_add_writecount { IN struct vnode *vp; IN int inc; }; %% fdatasync vp L L L vop_fdatasync { IN struct vnode *vp; IN struct thread *td; }; %% copy_file_range invp U U U %% copy_file_range outvp U U U vop_copy_file_range { IN struct vnode *invp; INOUT off_t *inoffp; IN struct vnode *outvp; INOUT off_t *outoffp; INOUT size_t *lenp; IN unsigned int flags; IN struct ucred *incred; IN struct ucred *outcred; IN struct thread *fsizetd; }; %% vput_pair dvp E - - vop_vput_pair { IN struct vnode *dvp; INOUT struct vnode **vpp; IN bool unlock_vp; }; %% deallocate vp L L L vop_deallocate { IN struct vnode *vp; INOUT off_t *offset; INOUT off_t *len; IN int flags; IN int ioflag; IN struct ucred *cred; }; # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares. When merging a new VOP to a stable branch, # the new VOP should replace one of the spares. vop_spare1 { IN struct vnode *vp; }; vop_spare2 { IN struct vnode *vp; }; vop_spare3 { IN struct vnode *vp; }; vop_spare4 { IN struct vnode *vp; }; vop_spare5 { IN struct vnode *vp; }; diff --git a/sys/sys/file.h b/sys/sys/file.h index 8a790a25fc6b..5a6f0b1d2071 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -1,492 +1,491 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)file.h 8.3 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_FILE_H_ #define _SYS_FILE_H_ #ifndef _KERNEL #include /* XXX */ #include #include #else #include #include #include #include #include struct filedesc; struct stat; struct thread; struct uio; struct knote; struct vnode; struct nameidata; #endif /* _KERNEL */ #define DTYPE_NONE 0 /* not yet initialized */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ #define DTYPE_CRYPTO 6 /* crypto */ #define DTYPE_MQUEUE 7 /* posix message queue */ #define DTYPE_SHM 8 /* swap-backed shared memory */ #define DTYPE_SEM 9 /* posix semaphore */ #define DTYPE_PTS 10 /* pseudo teletype master device */ #define DTYPE_DEV 11 /* Device specific fd type */ #define DTYPE_PROCDESC 12 /* process descriptor */ #define DTYPE_EVENTFD 13 /* eventfd */ #define DTYPE_LINUXTFD 14 /* emulation timerfd type */ #ifdef _KERNEL struct file; struct filecaps; struct kaiocb; struct kinfo_file; struct ucred; #define FOF_OFFSET 0x01 /* Use the offset in uio argument */ #define FOF_NOLOCK 0x02 /* Do not take FOFFSET_LOCK */ #define FOF_NEXTOFF_R 0x04 /* Also update f_nextoff[UIO_READ] */ #define FOF_NEXTOFF_W 0x08 /* Also update f_nextoff[UIO_WRITE] */ #define FOF_NOUPDATE 0x10 /* Do not update f_offset */ off_t foffset_lock(struct file *fp, int flags); void foffset_lock_uio(struct file *fp, struct uio *uio, int flags); void foffset_unlock(struct file *fp, off_t val, int flags); void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags); static inline off_t foffset_get(struct file *fp) { return (foffset_lock(fp, FOF_NOLOCK)); } typedef int fo_rdwr_t(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); typedef int fo_truncate_t(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td); typedef int fo_ioctl_t(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td); typedef int fo_poll_t(struct file *fp, int events, struct ucred *active_cred, struct thread *td); typedef int fo_kqfilter_t(struct file *fp, struct knote *kn); typedef int fo_stat_t(struct file *fp, struct stat *sb, - struct ucred *active_cred, struct thread *td); + struct ucred *active_cred); typedef int fo_close_t(struct file *fp, struct thread *td); typedef int fo_chmod_t(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); typedef int fo_chown_t(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); typedef int fo_sendfile_t(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td); typedef int fo_seek_t(struct file *fp, off_t offset, int whence, struct thread *td); typedef int fo_fill_kinfo_t(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp); typedef int fo_mmap_t(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td); typedef int fo_aio_queue_t(struct file *fp, struct kaiocb *job); typedef int fo_add_seals_t(struct file *fp, int flags); typedef int fo_get_seals_t(struct file *fp, int *flags); typedef int fo_fallocate_t(struct file *fp, off_t offset, off_t len, struct thread *td); typedef int fo_fspacectl_t(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td); typedef int fo_flags_t; struct fileops { fo_rdwr_t *fo_read; fo_rdwr_t *fo_write; fo_truncate_t *fo_truncate; fo_ioctl_t *fo_ioctl; fo_poll_t *fo_poll; fo_kqfilter_t *fo_kqfilter; fo_stat_t *fo_stat; fo_close_t *fo_close; fo_chmod_t *fo_chmod; fo_chown_t *fo_chown; fo_sendfile_t *fo_sendfile; fo_seek_t *fo_seek; fo_fill_kinfo_t *fo_fill_kinfo; fo_mmap_t *fo_mmap; fo_aio_queue_t *fo_aio_queue; fo_add_seals_t *fo_add_seals; fo_get_seals_t *fo_get_seals; fo_fallocate_t *fo_fallocate; fo_fspacectl_t *fo_fspacectl; fo_flags_t fo_flags; /* DFLAG_* below */ }; #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. */ #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_FILE) /* * Kernel descriptor table. * One entry for each open kernel vnode and socket. * * Below is the list of locks that protects members in struct file. * * (a) f_vnode lock required (shared allows both reads and writes) * (f) updated with atomics and blocking on sleepq * (d) cdevpriv_mtx * none not locked */ struct fadvise_info { int fa_advice; /* (f) FADV_* type. */ off_t fa_start; /* (f) Region start. */ off_t fa_end; /* (f) Region end. */ }; struct file { volatile u_int f_flag; /* see fcntl.h */ volatile u_int f_count; /* reference count */ void *f_data; /* file descriptor specific data */ struct fileops *f_ops; /* File operations */ struct vnode *f_vnode; /* NULL or applicable vnode */ struct ucred *f_cred; /* associated credentials. */ short f_type; /* descriptor type */ short f_vnread_flags; /* (f) Sleep lock for f_offset */ /* * DTYPE_VNODE specific fields. */ union { int16_t f_seqcount[2]; /* (a) Count of seq. reads and writes. */ int f_pipegen; }; off_t f_nextoff[2]; /* next expected read/write offset. */ union { struct cdev_privdata *fvn_cdevpriv; /* (d) Private data for the cdev. */ struct fadvise_info *fvn_advice; } f_vnun; /* * DFLAG_SEEKABLE specific fields */ off_t f_offset; }; #define f_cdevpriv f_vnun.fvn_cdevpriv #define f_advice f_vnun.fvn_advice #define FOFFSET_LOCKED 0x1 #define FOFFSET_LOCK_WAITING 0x2 #endif /* _KERNEL || _WANT_FILE */ /* * Userland version of struct file, for sysctl */ struct xfile { ksize_t xf_size; /* size of struct xfile */ pid_t xf_pid; /* owning process */ uid_t xf_uid; /* effective uid of owning process */ int xf_fd; /* descriptor number */ int _xf_int_pad1; kvaddr_t xf_file; /* address of struct file */ short xf_type; /* descriptor type */ short _xf_short_pad1; int xf_count; /* reference count */ int xf_msgcount; /* references from message queue */ int _xf_int_pad2; off_t xf_offset; /* file offset */ kvaddr_t xf_data; /* file descriptor specific data */ kvaddr_t xf_vnode; /* vnode pointer */ u_int xf_flag; /* flags (see fcntl.h) */ int _xf_int_pad3; int64_t _xf_int64_pad[6]; }; #ifdef _KERNEL extern struct fileops vnops; extern struct fileops badfileops; extern struct fileops path_fileops; extern struct fileops socketops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, struct file **fpp); int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp); int _fdrop(struct file *fp, struct thread *td); fo_rdwr_t invfo_rdwr; fo_truncate_t invfo_truncate; fo_ioctl_t invfo_ioctl; fo_poll_t invfo_poll; fo_kqfilter_t invfo_kqfilter; fo_chmod_t invfo_chmod; fo_chown_t invfo_chown; fo_sendfile_t invfo_sendfile; fo_stat_t vn_statfile; fo_sendfile_t vn_sendfile; fo_seek_t vn_seek; fo_fill_kinfo_t vn_fill_kinfo; fo_kqfilter_t vn_kqfilter_opath; int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif); void finit(struct file *, u_int, short, void *, struct fileops *); void finit_vnode(struct file *, u_int, void *, struct fileops *); int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp); int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch); static __inline __result_use_check bool fhold(struct file *fp) { return (refcount_acquire_checked(&fp->f_count)); } #define fdrop(fp, td) ({ \ struct file *_fp; \ int _error; \ \ _error = 0; \ _fp = (fp); \ if (__predict_false(refcount_release(&_fp->f_count))) \ _error = _fdrop(_fp, td); \ _error; \ }) #define fdrop_close(fp, td) ({ \ struct file *_fp; \ int _error; \ \ _error = 0; \ _fp = (fp); \ if (__predict_true(refcount_release(&_fp->f_count))) \ _error = _fdrop(_fp, td); \ _error; \ }) static __inline fo_rdwr_t fo_read; static __inline fo_rdwr_t fo_write; static __inline fo_truncate_t fo_truncate; static __inline fo_ioctl_t fo_ioctl; static __inline fo_poll_t fo_poll; static __inline fo_kqfilter_t fo_kqfilter; static __inline fo_stat_t fo_stat; static __inline fo_close_t fo_close; static __inline fo_chmod_t fo_chmod; static __inline fo_chown_t fo_chown; static __inline fo_sendfile_t fo_sendfile; static __inline int fo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_read)(fp, uio, active_cred, flags, td)); } static __inline int fo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_write)(fp, uio, active_cred, flags, td)); } static __inline int fo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_truncate)(fp, length, active_cred, td)); } static __inline int fo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_ioctl)(fp, com, data, active_cred, td)); } static __inline int fo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_poll)(fp, events, active_cred, td)); } static __inline int -fo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, - struct thread *td) +fo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { - return ((*fp->f_ops->fo_stat)(fp, sb, active_cred, td)); + return ((*fp->f_ops->fo_stat)(fp, sb, active_cred)); } static __inline int fo_close(struct file *fp, struct thread *td) { return ((*fp->f_ops->fo_close)(fp, td)); } static __inline int fo_kqfilter(struct file *fp, struct knote *kn) { return ((*fp->f_ops->fo_kqfilter)(fp, kn)); } static __inline int fo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chmod)(fp, mode, active_cred, td)); } static __inline int fo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chown)(fp, uid, gid, active_cred, td)); } static __inline int fo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return ((*fp->f_ops->fo_sendfile)(fp, sockfd, hdr_uio, trl_uio, offset, nbytes, sent, flags, td)); } static __inline int fo_seek(struct file *fp, off_t offset, int whence, struct thread *td) { return ((*fp->f_ops->fo_seek)(fp, offset, whence, td)); } static __inline int fo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return ((*fp->f_ops->fo_fill_kinfo)(fp, kif, fdp)); } static __inline int fo_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { if (fp->f_ops->fo_mmap == NULL) return (ENODEV); return ((*fp->f_ops->fo_mmap)(fp, map, addr, size, prot, cap_maxprot, flags, foff, td)); } static __inline int fo_aio_queue(struct file *fp, struct kaiocb *job) { return ((*fp->f_ops->fo_aio_queue)(fp, job)); } static __inline int fo_add_seals(struct file *fp, int seals) { if (fp->f_ops->fo_add_seals == NULL) return (EINVAL); return ((*fp->f_ops->fo_add_seals)(fp, seals)); } static __inline int fo_get_seals(struct file *fp, int *seals) { if (fp->f_ops->fo_get_seals == NULL) return (EINVAL); return ((*fp->f_ops->fo_get_seals)(fp, seals)); } static __inline int fo_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) { if (fp->f_ops->fo_fallocate == NULL) return (ENODEV); return ((*fp->f_ops->fo_fallocate)(fp, offset, len, td)); } static __inline int fo_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td) { if (fp->f_ops->fo_fspacectl == NULL) return (ENODEV); return ((*fp->f_ops->fo_fspacectl)(fp, cmd, offset, length, flags, active_cred, td)); } #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ diff --git a/sys/sys/param.h b/sys/sys/param.h index 9a016e35323e..50d861a61985 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -1,392 +1,392 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * documentation/content/en/books/porters-handbook/versions/_index.adoc * * Encoding: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * X.0-CURRENT before releng/X.0 is created, otherwise 'R' is * in the range 5 to 9. * Short hand: MMmmXXX * * __FreeBSD_version is bumped every time there's a change in the base system * that's noteworthy. A noteworthy change is any change which changes the * kernel's KBI in -CURRENT, one that changes some detail about the system that * external software (or the ports system) would want to know about, one that * adds a system call, one that adds or deletes a shipped library, a security * fix, or similar change not specifically noted here. Bumps should be limited * to one per day / a couple per week except for security fixes. * * The approved way to obtain this from a shell script is: * awk '/^\#define[[:space:]]*__FreeBSD_version/ {print $3}' * Other methods to parse this file may work, but are not guaranteed against * future changes. The above script works back to FreeBSD 3.x when this macro * was introduced. This number is propagated to other places needing it that * cannot include sys/param.h and should only be updated here. */ #undef __FreeBSD_version -#define __FreeBSD_version 1400036 +#define __FreeBSD_version 1400037 /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #if defined(_KERNEL) || defined(IN_RTLD) #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 #define P_OSREL_WRFSBASE 1200041 #define P_OSREL_CK_CYLGRP 1200046 #define P_OSREL_VMTOTAL64 1200054 #define P_OSREL_CK_SUPERBLOCK 1300000 #define P_OSREL_CK_INODE 1300005 #define P_OSREL_POWERPC_NEW_AUX_ARGS 1300070 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 255 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL #ifndef LOCORE /* Signals. */ #include #endif #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define PNOLOCK 0x400 /* OR'd with pri to allow sleeping w/o a lock */ #define PRILASTFLAG 0x400 /* Last flag defined above */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) __align_down(x, y) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) __align_up(x, y) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 81f3f3d5489c..a9b57dd4c25d 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -1,1143 +1,1143 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; enum vgetstate { VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ struct namecache; struct cache_fpl; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * i - interlock * l - mp mnt_listmtx or freelist mutex * I - updated with atomics, 0->1 and 1->0 transitions with interlock held * m - mount point interlock * p - pollinfo lock * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) */ enum vtype v_type:8; /* u vnode type */ short v_irflag; /* i frequently read flags */ seqc_t v_seqc; /* i modification count */ uint32_t v_nchash; /* u namecache hash */ u_int v_hash; struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. */ union { struct mount *v_mountedhere; /* v ptr to mountpoint (VDIR) */ struct unpcb *v_unpcb; /* v unix domain net (VSOCK) */ struct cdev *v_rdev; /* v device (VCHR, VBLK) */ struct fifoinfo *v_fifoinfo; /* v fifo (VFIFO) */ }; /* * vfs_hash: (mount + inode) -> vnode hash. The hash value * itself is grouped with other int fields, to avoid padding. */ LIST_ENTRY(vnode) v_hashlist; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct namecache *v_cache_dd; /* c Cache entry for .. vnode */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_vnodelist; /* l vnode lists */ TAILQ_ENTRY(vnode) v_lazylist; /* l vnode lazy list */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* i Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ struct lockf *v_lockf; /* Byte-level advisory lock list */ struct rangelock v_rl; /* Byte-range lock */ u_int v_holdcnt; /* I prevents recycling. */ u_int v_usecount; /* I ref count of users */ u_short v_iflag; /* i vnode flags (see below) */ u_short v_vflag; /* v vnode flags */ u_short v_mflag; /* l mnt-specific vnode flags */ short v_dbatchcpu; /* i LRU requeue deferral batch */ int v_writecount; /* I ref count of writers or (negative) text users */ int v_seqc_users; /* i modifications pending */ }; #ifndef DEBUG_LOCKS #ifdef _LP64 /* * Not crossing 448 bytes fits 9 vnodes per page. If you have to add fields * to the structure and there is nothing which can be done to prevent growth * then so be it. But don't grow it without a good reason. */ _Static_assert(sizeof(struct vnode) <= 448, "vnode size crosses 448 bytes"); #endif #endif #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define bo2vnode(bo) __containerof((bo), struct vnode, v_bufobj) /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. */ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* unpcb, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \ (a) | KNF_NOKQLOCK); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, KNF_LISTLOCKED) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag * * VIRF_DOOMED is doubly protected by the interlock and vnode lock. Both * are required for writing but the status may be checked with either. */ #define VHOLD_NO_SMR (1<<29) /* Disable vhold_smr */ #define VHOLD_ALL_FLAGS (VHOLD_NO_SMR) #define VIRF_DOOMED 0x0001 /* This vnode is being recycled */ #define VIRF_PGREAD 0x0002 /* Direct reads from the page cache are permitted, never cleared once set */ #define VIRF_MOUNTPOINT 0x0004 /* This vnode is mounted on */ #define VI_TEXT_REF 0x0001 /* Text ref grabbed use ref */ #define VI_MOUNT 0x0002 /* Mount in progress */ #define VI_DOINGINACT 0x0004 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x0008 /* Need to call inactive */ #define VI_DEFINACT 0x0010 /* deferred inactive */ #define VI_FOPENING 0x0020 /* In open, with opening process having the first right to advlock file */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_VMSIZEVNLOCK 0x0020 /* object size check requires vnode lock */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ #define VV_READLINK 0x2000 /* fdescfs linux vnode */ #define VV_UNREF 0x4000 /* vunref, do not drop lock in inactive() */ #define VMP_LAZYLIST 0x0001 /* Vnode is on mnt's lazy list */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ u_short va_padding0; uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ nlink_t va_nlink; /* number of references to file */ dev_t va_fsid; /* filesystem id */ ino_t va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_SYNC 0x04 /* O_SYNC truncation */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather then bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_NOREUSE 0x0200 /* VMIO data won't be reused */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_BUFLOCKED 0x2000 /* ffs flag; indir buf is locked */ #define IO_RANGELOCKED 0x4000 /* range locked */ #define IO_DATASYNC 0x8000 /* do only data I/O synchronously */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Flags for accmode_t. */ #define VEXEC 000000000100 /* execute/search permission */ #define VWRITE 000000000200 /* write permission */ #define VREAD 000000000400 /* read permission */ #define VADMIN 000000010000 /* being the file owner */ #define VAPPEND 000000040000 /* permission to write/append */ /* * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL, * and 0 otherwise. This never happens with ordinary unix access rights * or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with * some other V* constant. */ #define VEXPLICIT_DENY 000000100000 #define VREAD_NAMED_ATTRS 000000200000 /* not used */ #define VWRITE_NAMED_ATTRS 000000400000 /* not used */ #define VDELETE_CHILD 000001000000 #define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */ #define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */ #define VDELETE 000010000000 #define VREAD_ACL 000020000000 /* read ACL and file mode */ #define VWRITE_ACL 000040000000 /* change ACL and/or file mode */ #define VWRITE_OWNER 000100000000 /* change file owner */ #define VSYNCHRONIZE 000200000000 /* not used */ #define VCREAT 000400000000 /* creating new file */ #define VVERIFY 001000000000 /* verification required */ /* * Permissions that were traditionally granted only to the file owner. */ #define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \ VWRITE_OWNER) /* * Permissions that were traditionally granted to everyone. */ #define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL) /* * Permissions that allow to change the state of the file in any way. */ #define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \ VDELETE) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif extern u_int ncsizefactor; extern const u_int io_hold_cnt; /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define EARLYFLUSH 0x0008 /* vflush: early call for ffs_flushfiles */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */ #define V_VMIO 0x0010 /* vinvalbuf: called during pageout */ #define V_ALLOWCLEAN 0x0020 /* vinvalbuf: allow clean buffers after flush */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define V_MNTREF 0x0010 /* vn_start_write: mp is already ref-ed */ #define VR_START_WRITE 0x0001 /* vfs_write_resume: start write atomically */ #define VR_NO_SUSPCLR 0x0002 /* vfs_write_resume: do not clear suspension */ #define VS_SKIP_UNMOUNT 0x0001 /* vfs_write_suspend: fail if the filesystem is being unmounted */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern struct mount *rootdevmp; /* "/dev" mount */ extern u_long desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern struct vattr va_null; /* predefined null vattr structure */ extern u_int vn_lock_pair_pause_max; #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #define VN_LOCK_AREC(vp) lockallowrecurse((vp)->v_vnlock) #define VN_LOCK_ASHARE(vp) lockallowshare((vp)->v_vnlock) #define VN_LOCK_DSHARE(vp) lockdisableshare((vp)->v_vnlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ int vdesc_vop_offset; vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. Note that the unreliability is * limited to false negatives; efforts were made to ensure that false * positives cannot occur. */ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); void assert_vop_locked(struct vnode *vp, const char *str); void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #define ASSERT_VOP_IN_SEQC(vp) do { \ struct vnode *_vp = (vp); \ \ VNPASS(seqc_in_modify(_vp->v_seqc), _vp); \ } while (0) #define ASSERT_VOP_NOT_IN_SEQC(vp) do { \ struct vnode *_vp = (vp); \ \ VNPASS(!seqc_in_modify(_vp->v_seqc), _vp); \ } while (0) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) ((void)0) #define ASSERT_VI_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_ELOCKED(vp, str) ((void)0) #define ASSERT_VOP_LOCKED(vp, str) ((void)0) #define ASSERT_VOP_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_IN_SEQC(vp) ((void)0) #define ASSERT_VOP_NOT_IN_SEQC(vp) ((void)0) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) #define DOINGASYNC(vp) \ (((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 && \ ((curthread->td_pflags & TDP_SYNCIO) == 0)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int); #include "vnode_if.h" /* vn_open_flags */ #define VN_OPEN_NOAUDIT 0x00000001 #define VN_OPEN_NOCAPCHECK 0x00000002 #define VN_OPEN_NAMECACHE 0x00000004 #define VN_OPEN_INVFS 0x00000008 /* copy_file_range kernel flags */ #define COPY_FILE_RANGE_KFLAGS 0xff000000 #define COPY_FILE_RANGE_TIMEO1SEC 0x01000000 /* Return after 1sec. */ /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct freebsd11_stat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vfsops; struct vnode; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn); /* cache_* may belong in namei.h. */ void cache_changesize(u_long newhashsize); #define VFS_CACHE_DROPOLD 0x1 void cache_enter_time_flags(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp, int flags); #define cache_enter(dvp, vp, cnp) \ cache_enter_time(dvp, vp, cnp, NULL, NULL) void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp); void cache_vnode_init(struct vnode *vp); void cache_purge(struct vnode *vp); void cache_purge_vgone(struct vnode *vp); void cache_purge_negative(struct vnode *vp); void cache_purgevfs(struct mount *mp); char *cache_symlink_alloc(size_t size, int flags); void cache_symlink_free(char *string, size_t size); int cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len); void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp); void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp); #ifdef INVARIANTS void cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp); #else static inline void cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { } #endif void cache_fast_lookup_enabled_recalc(void); int change_dir(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb); int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); void getnewvnode_reserve(void); void getnewvnode_drop_reserve(void); int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg); int insmntque(struct vnode *vp, struct mount *mp); u_quad_t init_va_filerev(void); int speedup_syncer(void); int vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen); int vn_getcwd(char *buf, char **retbuf, size_t *buflen); int vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf); int vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf); struct vnode * vn_dir_dd_ino(struct vnode *vp); int vn_commname(struct vnode *vn, char *buf, u_int buflen); int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen); int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred); int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred); int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *aclp, accmode_t accmode, struct ucred *cred); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, accmode_t accmode, struct ucred *cred); void vattr_null(struct vattr *vap); void vlazy(struct vnode *); void vdrop(struct vnode *); void vdropl(struct vnode *); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int flags); enum vgetstate vget_prep_smr(struct vnode *vp); enum vgetstate vget_prep(struct vnode *vp); int vget_finish(struct vnode *vp, int flags, enum vgetstate vs); void vget_finish_ref(struct vnode *vp, enum vgetstate vs); void vget_abort(struct vnode *vp, enum vgetstate vs); void vgone(struct vnode *vp); void vhold(struct vnode *); void vholdnz(struct vnode *); bool vhold_smr(struct vnode *); int vinactive(struct vnode *vp); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, off_t length, int blksize); void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize); void vunref(struct vnode *); void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); int vrecycle(struct vnode *vp); int vrecyclel(struct vnode *vp); int vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); int vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td); int vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags, int ioflg, struct ucred *active_cred, struct ucred *file_cred); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_fsync_buf(struct vnode *vp, int waitfor); int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td); int vn_need_pageq_flush(struct vnode *vp); bool vn_isdisk_error(struct vnode *vp, int *errp); bool vn_isdisk(struct vnode *vp); int _vn_lock(struct vnode *vp, int flags, const char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) void vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2, bool vp2_locked); int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp); int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp); void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_read_from_obj(struct vnode *vp, struct uio *uio); int vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); int vn_truncate_locked(struct vnode *vp, off_t length, bool sync, struct ucred *cred); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp); int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td); int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio); void vn_seqc_write_begin_locked(struct vnode *vp); void vn_seqc_write_begin(struct vnode *vp); void vn_seqc_write_end_locked(struct vnode *vp); void vn_seqc_write_end(struct vnode *vp); #define vn_seqc_read_any(vp) seqc_read_any(&(vp)->v_seqc) #define vn_seqc_read_notmodify(vp) seqc_read_notmodify(&(vp)->v_seqc) #define vn_seqc_consistent(vp, seq) seqc_consistent(&(vp)->v_seqc, seq) #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) #define vn_rangelock_unlock_range(vp, cookie, start, end) \ rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ VI_MTX(vp)) #define vn_rangelock_rlock(vp, start, end) \ rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_tryrlock(vp, start, end) \ rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_wlock(vp, start, end) \ rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_trywlock(vp, start, end) \ rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_irflag_read(vp) atomic_load_short(&(vp)->v_irflag) void vn_irflag_set_locked(struct vnode *vp, short toset); void vn_irflag_set(struct vnode *vp, short toset); void vn_irflag_set_cond_locked(struct vnode *vp, short toset); void vn_irflag_set_cond(struct vnode *vp, short toset); void vn_irflag_unset_locked(struct vnode *vp, short tounset); void vn_irflag_unset(struct vnode *vp, short tounset); int vfs_cache_lookup(struct vop_lookup_args *ap); int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp, int flags); int vfs_write_suspend(struct mount *mp, int flags); int vfs_write_suspend_umnt(struct mount *mp); struct vnode *vnlru_alloc_marker(void); void vnlru_free_marker(struct vnode *); void vnlru_free_vfsops(int, struct vfsops *, struct vnode *); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdioctl(struct vop_ioctl_args *); int vop_stdneed_inactive(struct vop_need_inactive_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_lock(struct vop_lock1_args *); int vop_unlock(struct vop_unlock_args *); int vop_islocked(struct vop_islocked_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdaccess(struct vop_access_args *ap); int vop_stdaccessx(struct vop_accessx_args *ap); int vop_stdadvise(struct vop_advise_args *ap); int vop_stdadvlock(struct vop_advlock_args *ap); int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); int vop_stdallocate(struct vop_allocate_args *ap); int vop_stddeallocate(struct vop_deallocate_args *ap); int vop_stdset_text(struct vop_set_text_args *ap); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptocnp(struct vop_vptocnp_args *ap); int vop_stdvptofh(struct vop_vptofh_args *ap); int vop_stdunp_bind(struct vop_unp_bind_args *ap); int vop_stdunp_connect(struct vop_unp_connect_args *ap); int vop_stdunp_detach(struct vop_unp_detach_args *ap); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enoent(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_eagain(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); int dead_poll(struct vop_poll_args *ap); int dead_read(struct vop_read_args *ap); int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. */ void vop_close_post(void *a, int rc); void vop_create_pre(void *a); void vop_create_post(void *a, int rc); void vop_whiteout_pre(void *a); void vop_whiteout_post(void *a, int rc); void vop_deleteextattr_pre(void *a); void vop_deleteextattr_post(void *a, int rc); void vop_link_pre(void *a); void vop_link_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_pre(void *a); void vop_mknod_post(void *a, int rc); void vop_open_post(void *a, int rc); void vop_read_post(void *a, int rc); void vop_read_pgcache_post(void *ap, int rc); void vop_readdir_post(void *a, int rc); void vop_reclaim_post(void *a, int rc); void vop_remove_pre(void *a); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_pre(void *a); void vop_setattr_post(void *a, int rc); void vop_setacl_pre(void *a); void vop_setacl_post(void *a, int rc); void vop_setextattr_pre(void *a); void vop_setextattr_post(void *a, int rc); void vop_symlink_pre(void *a); void vop_symlink_post(void *a, int rc); int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a); #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *a); void vop_fplookup_vexec_debugpost(void *a, int rc); void vop_fplookup_symlink_debugpre(void *a); void vop_fplookup_symlink_debugpost(void *a, int rc); void vop_strategy_debugpre(void *a); void vop_lock_debugpre(void *a); void vop_lock_debugpost(void *a, int rc); void vop_unlock_debugpre(void *a); void vop_need_inactive_debugpre(void *a); void vop_need_inactive_debugpost(void *a, int rc); void vop_mkdir_debugpost(void *a, int rc); #else #define vop_fplookup_vexec_debugpre(x) do { } while (0) #define vop_fplookup_vexec_debugpost(x, y) do { } while (0) #define vop_fplookup_symlink_debugpre(x) do { } while (0) #define vop_fplookup_symlink_debugpost(x, y) do { } while (0) #define vop_strategy_debugpre(x) do { } while (0) #define vop_lock_debugpre(x) do { } while (0) #define vop_lock_debugpost(x, y) do { } while (0) #define vop_unlock_debugpre(x) do { } while (0) #define vop_need_inactive_debugpre(x) do { } while (0) #define vop_need_inactive_debugpost(x, y) do { } while (0) #define vop_mkdir_debugpost(x, y) do { } while (0) #endif void vop_rename_fail(struct vop_rename_args *ap); #define vop_stat_helper_pre(ap) ({ \ int _error; \ AUDIT_ARG_VNODE1(ap->a_vp); \ _error = mac_vnode_check_stat(ap->a_active_cred, ap->a_file_cred, ap->a_vp);\ if (__predict_true(_error == 0)) \ bzero(ap->a_sb, sizeof(*ap->a_sb)); \ _error; \ }) #define vop_stat_helper_post(ap, error) ({ \ int _error = (error); \ - if (priv_check_cred_vfs_generation(ap->a_td->td_ucred)) \ + if (priv_check_cred_vfs_generation(ap->a_active_cred)) \ ap->a_sb->st_gen = 0; \ _error; \ }) #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error; \ off_t osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = (off_t)va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? NOTE_EXTEND : 0)); \ } #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) #ifdef INVARIANTS #define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) \ do { \ int error_; \ \ error_ = VOP_ADD_WRITECOUNT((vp), (cnt)); \ VNASSERT(error_ == 0, (vp), ("VOP_ADD_WRITECOUNT returned %d", \ error_)); \ } while (0) #define VOP_SET_TEXT_CHECKED(vp) \ do { \ int error_; \ \ error_ = VOP_SET_TEXT((vp)); \ VNASSERT(error_ == 0, (vp), ("VOP_SET_TEXT returned %d", \ error_)); \ } while (0) #define VOP_UNSET_TEXT_CHECKED(vp) \ do { \ int error_; \ \ error_ = VOP_UNSET_TEXT((vp)); \ VNASSERT(error_ == 0, (vp), ("VOP_UNSET_TEXT returned %d", \ error_)); \ } while (0) #else #define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) VOP_ADD_WRITECOUNT((vp), (cnt)) #define VOP_SET_TEXT_CHECKED(vp) VOP_SET_TEXT((vp)) #define VOP_UNSET_TEXT_CHECKED(vp) VOP_UNSET_TEXT((vp)) #endif #define VN_IS_DOOMED(vp) __predict_false((vn_irflag_read(vp) & VIRF_DOOMED) != 0) void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); void vrefact(struct vnode *vp); void v_addpollinfo(struct vnode *vp); static __inline int vrefcnt(struct vnode *vp) { return (vp->v_usecount); } #define vholdl(vp) do { \ ASSERT_VI_LOCKED(vp, __func__); \ vhold(vp); \ } while (0) #define vrefl(vp) do { \ ASSERT_VI_LOCKED(vp, __func__); \ vref(vp); \ } while (0) int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_ENOENT ((void*)(uintptr_t)vop_enoent) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) #define VOP_EAGAIN ((void*)(uintptr_t)vop_eagain) /* fifo_vnops.c */ int fifo_printinfo(struct vnode *); /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); void vfs_hash_changesize(u_long newhashsize); int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); u_int vfs_hash_index(struct vnode *vp); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); int vfs_emptydir(struct vnode *vp); int vfs_unixify_accmode(accmode_t *accmode); void vfs_unp_reclaim(struct vnode *vp); int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode); int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid); int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); void vn_fsid(struct vnode *vp, struct vattr *va); int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp); int vn_lktype_write(struct mount *mp, struct vnode *vp); #define VOP_UNLOCK_FLAGS(vp, flags) ({ \ struct vnode *_vp = (vp); \ int _flags = (flags); \ int _error; \ \ if ((_flags & ~(LK_INTERLOCK | LK_RELEASE)) != 0) \ panic("%s: unsupported flags %x\n", __func__, flags); \ _error = VOP_UNLOCK(_vp); \ if (_flags & LK_INTERLOCK) \ VI_UNLOCK(_vp); \ _error; \ }) #include #define VFS_VOP_VECTOR_REGISTER(vnodeops) \ SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \ vfs_vector_op_register, &vnodeops) #define VFS_SMR_DECLARE \ extern smr_t vfs_smr #define VFS_SMR() vfs_smr #define vfs_smr_enter() smr_enter(VFS_SMR()) #define vfs_smr_exit() smr_exit(VFS_SMR()) #define vfs_smr_synchronize() smr_synchronize(VFS_SMR()) #define vfs_smr_entered_load(ptr) smr_entered_load((ptr), VFS_SMR()) #define VFS_SMR_ASSERT_ENTERED() SMR_ASSERT_ENTERED(VFS_SMR()) #define VFS_SMR_ASSERT_NOT_ENTERED() SMR_ASSERT_NOT_ENTERED(VFS_SMR()) #define VFS_SMR_ZONE_SET(zone) uma_zone_set_smr((zone), VFS_SMR()) #define vn_load_v_data_smr(vp) ({ \ struct vnode *_vp = (vp); \ \ VFS_SMR_ASSERT_ENTERED(); \ atomic_load_consume_ptr(&(_vp)->v_data);\ }) #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */