Index: lib/libc/gen/Makefile.inc =================================================================== --- lib/libc/gen/Makefile.inc +++ lib/libc/gen/Makefile.inc @@ -47,6 +47,7 @@ err.c \ errlst.c \ errno.c \ + eventfd_rw.c \ exec.c \ exect.c \ fdevname.c \ Index: lib/libc/gen/Symbol.map =================================================================== --- lib/libc/gen/Symbol.map +++ lib/libc/gen/Symbol.map @@ -421,6 +421,8 @@ }; FBSD_1.6 { + eventfd_read; + eventfd_write; getlogin_r; memalign; scandir_b; Index: lib/libc/gen/eventfd_rw.c =================================================================== --- /dev/null +++ lib/libc/gen/eventfd_rw.c @@ -0,0 +1,41 @@ +/*- + * SPDX-License-Identifier: MIT + * + * Copyright (c) 2005-2020 Rich Felker, et al. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include +__FBSDID("$FreeBSD$"); + +#include "namespace.h" +#include +#include +#include "un-namespace.h" + +int eventfd_read(int fd, eventfd_t *value) +{ + return (sizeof(*value) == _read(fd, value, sizeof(*value))) ? 0 : -1; +} + +int eventfd_write(int fd, eventfd_t value) +{ + return (sizeof(value) == _write(fd, &value, sizeof(value))) ? 0 : -1; +} Index: lib/libc/sys/Makefile.inc =================================================================== --- lib/libc/sys/Makefile.inc +++ lib/libc/sys/Makefile.inc @@ -178,6 +178,7 @@ cpuset_getaffinity.2 \ cpuset_getdomain.2 \ dup.2 \ + eventfd.2 \ execve.2 \ _exit.2 \ extattr_get_file.2 \ @@ -379,6 +380,8 @@ MLINKS+=cpuset_getaffinity.2 cpuset_setaffinity.2 MLINKS+=cpuset_getdomain.2 cpuset_setdomain.2 MLINKS+=dup.2 dup2.2 +MLINKS+=eventfd.2 eventfd_read.3 \ + eventfd.2 eventfd_write.3 MLINKS+=execve.2 fexecve.2 MLINKS+=extattr_get_file.2 extattr.2 \ extattr_get_file.2 extattr_delete_fd.2 \ Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map +++ lib/libc/sys/Symbol.map @@ -405,6 +405,7 @@ __sysctlbyname; close_range; copy_file_range; + eventfd; fhlink; fhlinkat; fhreadlink; Index: lib/libc/sys/eventfd.2 =================================================================== --- /dev/null +++ lib/libc/sys/eventfd.2 @@ -0,0 +1,203 @@ +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2020 Greg V +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd October 8, 2020 +.Dt EVENTFD 2 +.Os +.Sh NAME +.Nm eventfd +.Nd create a file descriptor for event notification +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/eventfd.h +.Ft int +.Fn eventfd "unsigned int initval" "int flags" +.Ft int +.Fn eventfd_read "int fd" "eventfd_t *value" +.Ft int +.Fn eventfd_write "int fd" "eventfd_t value" +.Sh DESCRIPTION +.Fn eventfd +creates a special file descriptor with event counter or semaphore semantics, +designed for interprocess communication. +The returned file descriptor refers to a kernel object containing an +unsigned 64-bit integer counter, which is initialized with the value of the +.Fa initval +argument. 
+.Pp +The +.Fa flags +argument may contain the result of +.Em or Ns 'ing +the following values: +.Pp +.Bd -literal -offset indent -compact +EFD_CLOEXEC set FD_CLOEXEC on the file descriptor +EFD_NONBLOCK do not block on read/write operations +EFD_SEMAPHORE use semaphore semantics +.Ed +.Pp +File operations have the following semantics: +.Bl -tag -width EFD_SEMAPHORE +.It Xr read 2 +If the counter is zero, the call blocks until the counter becomes non-zero, unless +.Dv EFD_NONBLOCK +was set, in which case it would fail with +.Dv EAGAIN +instead. +.Pp +If the counter is non-zero: +.Bl -tag -width * +.It * +If +.Dv EFD_SEMAPHORE +is not set, the current value of the counter is returned, +and the value is reset to zero. +.It * +If +.Dv EFD_SEMAPHORE +is set, the constant 1 is returned, and the value is decremented by 1. +.El +.Pp +The numeric value is encoded as 64-bit (8 bytes) in host byte order. +The +.Xr read 2 +call fails with +.Dv EINVAL +if there are fewer than 8 bytes available in the supplied buffer. +.It Xr write 2 +Adds the given value to the counter. +The maximum value that can be stored in the counter is the +maximum unsigned 64-bit integer value minus one (0xfffffffffffffffe). +.Pp +If the resulting value exceeds the maximum, the call would block +until the value is reduced by +.Xr read 2 , +unless +.Dv EFD_NONBLOCK +was set, in which case it would fail with +.Dv EAGAIN +instead. +.Pp +The numeric value is encoded as 64-bit (8 bytes) in host byte order. +The +.Xr write 2 +call fails with +.Dv EINVAL +if there are fewer than 8 bytes available in the supplied buffer, +or if the value 0xffffffffffffffff is given. +.It Xr poll 2 +When receiving notifications via +.Xr poll 2 / +.Xr ppoll 2 / +.Xr select 2 / +.Xr pselect 2 / +.Xr kqueue 2 , +the following semantics apply: +.Bl -tag -width * +.It * +The file descriptor is readable when the counter is greater than zero. +.It * +The file descriptor is writable when the counter is less than the maximum value. 
+.El +.El +.Pp +File descriptors created by +.Fn eventfd +are passable to other processes via +.Xr sendmsg 2 +and are preserved across +.Xr fork 2 ; +in both cases the descriptors refer to the same counter from both processes. +Unless the +.Dv EFD_CLOEXEC +flag was specified, +the created file descriptor will remain open across +.Xr execve 2 +system calls; see +.Xr close 2 , +.Xr fcntl 2 +and +.Dv O_CLOEXEC +description. +.Pp +.Fn eventfd_read +and +.Fn eventfd_write +are thin wrappers around +.Xr read 2 +and +.Xr write 2 +system calls, +provided for compatibility with glibc. +.Sh RETURN VALUES +If successful, +.Fn eventfd +returns a non-negative integer, termed a file descriptor. +It returns \-1 on failure, and sets +.Va errno +to indicate the error. +.Pp +The +.Fn eventfd_read +and +.Fn eventfd_write +functions return 0 if the operation succeeded, -1 otherwise. +.Sh ERRORS +.Fn eventfd +may fail with: +.Bl -tag -width Er +.It Bq Er EINVAL +The flags argument given to +.Fn eventfd +has unknown bits set. +.It Bq Er EMFILE +The process has already reached its limit for open +file descriptors. +.It Bq Er ENFILE +The system file table is full. +.It Bq Er ENOMEM +No memory was available to create the kernel object. +.El +.Sh SEE ALSO +.Xr close 2 , +.Xr kqueue 2 , +.Xr poll 2 , +.Xr read 2 , +.Xr select 2 , +.Xr write 2 +.Sh STANDARDS +The +.Fn eventfd +system call is non-standard. +It is present in Linux. +.Sh HISTORY +The +.Fn eventfd +system call first appeared in +.Fx 13.0 . Index: lib/libc/sys/kqueue.2 =================================================================== --- lib/libc/sys/kqueue.2 +++ lib/libc/sys/kqueue.2 @@ -334,6 +334,11 @@ enabled and there is any data to read; .Va data contains the number of bytes available. +.It Eventfds +Returns when the counter is greater than 0; +.Va data +contains the counter value, which must be cast to +.Vt uint64_t . 
.El .It Dv EVFILT_WRITE Takes a descriptor as the identifier, and returns whenever @@ -352,6 +357,11 @@ identical to the .Dv EVFILT_READ case. +.Pp +For eventfds, +.Va data +will contain the maximum value that can be added to the counter +without blocking. .It Dv EVFILT_EMPTY Takes a descriptor as the identifier, and returns whenever there is no remaining data in the write buffer. Index: lib/libprocstat/libprocstat.h =================================================================== --- lib/libprocstat/libprocstat.h +++ lib/libprocstat/libprocstat.h @@ -72,6 +72,7 @@ #define PS_FST_TYPE_NONE 12 #define PS_FST_TYPE_PROCDESC 13 #define PS_FST_TYPE_DEV 14 +#define PS_FST_TYPE_EVENTFD 15 /* * Special descriptor numbers. Index: lib/libprocstat/libprocstat.c =================================================================== --- lib/libprocstat/libprocstat.c +++ lib/libprocstat/libprocstat.c @@ -622,6 +622,10 @@ type = PS_FST_TYPE_DEV; data = file.f_data; break; + case DTYPE_EVENTFD: + type = PS_FST_TYPE_EVENTFD; + data = file.f_data; + break; default: continue; } @@ -714,6 +718,7 @@ { KF_TYPE_SHM, PS_FST_TYPE_SHM }, { KF_TYPE_SOCKET, PS_FST_TYPE_SOCKET }, { KF_TYPE_VNODE, PS_FST_TYPE_VNODE }, + { KF_TYPE_EVENTFD, PS_FST_TYPE_EVENTFD }, { KF_TYPE_UNKNOWN, PS_FST_TYPE_UNKNOWN } }; #define NKFTYPES (sizeof(kftypes2fst) / sizeof(*kftypes2fst)) Index: sys/bsm/audit_kevents.h =================================================================== --- sys/bsm/audit_kevents.h +++ sys/bsm/audit_kevents.h @@ -659,6 +659,7 @@ #define AUE_SHMRENAME 43263 /* FreeBSD-specific. */ #define AUE_REALPATHAT 43264 /* FreeBSD-specific. */ #define AUE_CLOSERANGE 43265 /* FreeBSD-specific. */ +#define AUE_EVENTFD 43266 /* FreeBSD/Linux. 
*/ /* * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the Index: sys/compat/freebsd32/syscalls.master =================================================================== --- sys/compat/freebsd32/syscalls.master +++ sys/compat/freebsd32/syscalls.master @@ -1167,5 +1167,6 @@ ; 576 is initialised by the krpc code, if present. 576 AUE_NULL NOSTD|NOPROTO { int rpctls_syscall(int op, \ const char *path); } +577 AUE_EVENTFD NOPROTO { int eventfd(unsigned int initval, int flags); } ; vim: syntax=off Index: sys/compat/linux/linux_event.h =================================================================== --- sys/compat/linux/linux_event.h +++ sys/compat/linux/linux_event.h @@ -56,8 +56,6 @@ #define LINUX_EPOLL_CTL_DEL 2 #define LINUX_EPOLL_CTL_MOD 3 -#define LINUX_EFD_SEMAPHORE (1 << 0) - #define LINUX_TFD_TIMER_ABSTIME (1 << 0) #define LINUX_TFD_TIMER_CANCEL_ON_SET (1 << 1) #define LINUX_TFD_CLOEXEC LINUX_O_CLOEXEC Index: sys/compat/linux/linux_event.c =================================================================== --- sys/compat/linux/linux_event.c +++ sys/compat/linux/linux_event.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2007 Roman Divacky * Copyright (c) 2014 Dmitry Chagin * All rights reserved. 
@@ -52,6 +54,7 @@ #include #include #include +#include #ifdef COMPAT_LINUX32 #include @@ -122,53 +125,11 @@ int error; }; -/* eventfd */ -typedef uint64_t eventfd_t; - -static fo_rdwr_t eventfd_read; -static fo_rdwr_t eventfd_write; -static fo_ioctl_t eventfd_ioctl; -static fo_poll_t eventfd_poll; -static fo_kqfilter_t eventfd_kqfilter; -static fo_stat_t eventfd_stat; -static fo_close_t eventfd_close; -static fo_fill_kinfo_t eventfd_fill_kinfo; - -static struct fileops eventfdops = { - .fo_read = eventfd_read, - .fo_write = eventfd_write, - .fo_truncate = invfo_truncate, - .fo_ioctl = eventfd_ioctl, - .fo_poll = eventfd_poll, - .fo_kqfilter = eventfd_kqfilter, - .fo_stat = eventfd_stat, - .fo_close = eventfd_close, - .fo_chmod = invfo_chmod, - .fo_chown = invfo_chown, - .fo_sendfile = invfo_sendfile, - .fo_fill_kinfo = eventfd_fill_kinfo, - .fo_flags = DFLAG_PASSABLE -}; - -static void filt_eventfddetach(struct knote *kn); -static int filt_eventfdread(struct knote *kn, long hint); -static int filt_eventfdwrite(struct knote *kn, long hint); - -static struct filterops eventfd_rfiltops = { - .f_isfd = 1, - .f_detach = filt_eventfddetach, - .f_event = filt_eventfdread -}; -static struct filterops eventfd_wfiltops = { - .f_isfd = 1, - .f_detach = filt_eventfddetach, - .f_event = filt_eventfdwrite -}; - /* timerfd */ typedef uint64_t timerfd_t; static fo_rdwr_t timerfd_read; +static fo_ioctl_t timerfd_ioctl; static fo_poll_t timerfd_poll; static fo_kqfilter_t timerfd_kqfilter; static fo_stat_t timerfd_stat; @@ -179,7 +140,7 @@ .fo_read = timerfd_read, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, - .fo_ioctl = eventfd_ioctl, + .fo_ioctl = timerfd_ioctl, .fo_poll = timerfd_poll, .fo_kqfilter = timerfd_kqfilter, .fo_stat = timerfd_stat, @@ -200,13 +161,6 @@ .f_event = filt_timerfdread }; -struct eventfd { - eventfd_t efd_count; - uint32_t efd_flags; - struct selinfo efd_sel; - struct mtx efd_lock; -}; - struct timerfd { clockid_t tfd_clockid; struct itimerspec 
tfd_time; @@ -217,7 +171,6 @@ struct mtx tfd_lock; }; -static int eventfd_create(struct thread *td, uint32_t initval, int flags); static void linux_timerfd_expire(void *); static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); @@ -695,41 +648,6 @@ return (error1 == 0 ? 0 : error2); } -static int -eventfd_create(struct thread *td, uint32_t initval, int flags) -{ - struct filedesc *fdp; - struct eventfd *efd; - struct file *fp; - int fflags, fd, error; - - fflags = 0; - if ((flags & LINUX_O_CLOEXEC) != 0) - fflags |= O_CLOEXEC; - - fdp = td->td_proc->p_fd; - error = falloc(td, &fp, &fd, fflags); - if (error != 0) - return (error); - - efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO); - efd->efd_flags = flags; - efd->efd_count = initval; - mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); - - knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); - - fflags = FREAD | FWRITE; - if ((flags & LINUX_O_NONBLOCK) != 0) - fflags |= FNONBLOCK; - - finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops); - fdrop(fp, td); - - td->td_retval[0] = fd; - return (error); -} - #ifdef LINUX_LEGACY_SYSCALLS int linux_eventfd(struct thread *td, struct linux_eventfd_args *args) @@ -742,253 +660,19 @@ int linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) { + int flags = 0; - if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0) + if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|EFD_SEMAPHORE)) != 0) return (EINVAL); - return (eventfd_create(td, args->initval, args->flags)); -} + if ((args->flags & LINUX_O_CLOEXEC) != 0) + flags |= EFD_CLOEXEC; + if ((args->flags & LINUX_O_NONBLOCK) != 0) + flags |= EFD_NONBLOCK; + if ((args->flags & EFD_SEMAPHORE) != 0) + flags |= EFD_SEMAPHORE; -static int -eventfd_close(struct file *fp, struct thread *td) -{ - struct eventfd *efd; - - efd = fp->f_data; - if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) - return (EINVAL); - - seldrain(&efd->efd_sel); - 
knlist_destroy(&efd->efd_sel.si_note); - - fp->f_ops = &badfileops; - mtx_destroy(&efd->efd_lock); - free(efd, M_EPOLL); - - return (0); -} - -static int -eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, - int flags, struct thread *td) -{ - struct eventfd *efd; - eventfd_t count; - int error; - - efd = fp->f_data; - if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) - return (EINVAL); - - if (uio->uio_resid < sizeof(eventfd_t)) - return (EINVAL); - - error = 0; - mtx_lock(&efd->efd_lock); -retry: - if (efd->efd_count == 0) { - if ((fp->f_flag & FNONBLOCK) != 0) { - mtx_unlock(&efd->efd_lock); - return (EAGAIN); - } - error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0); - if (error == 0) - goto retry; - } - if (error == 0) { - if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) { - count = 1; - --efd->efd_count; - } else { - count = efd->efd_count; - efd->efd_count = 0; - } - KNOTE_LOCKED(&efd->efd_sel.si_note, 0); - selwakeup(&efd->efd_sel); - wakeup(&efd->efd_count); - mtx_unlock(&efd->efd_lock); - error = uiomove(&count, sizeof(eventfd_t), uio); - } else - mtx_unlock(&efd->efd_lock); - - return (error); -} - -static int -eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, - int flags, struct thread *td) -{ - struct eventfd *efd; - eventfd_t count; - int error; - - efd = fp->f_data; - if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) - return (EINVAL); - - if (uio->uio_resid < sizeof(eventfd_t)) - return (EINVAL); - - error = uiomove(&count, sizeof(eventfd_t), uio); - if (error != 0) - return (error); - if (count == UINT64_MAX) - return (EINVAL); - - mtx_lock(&efd->efd_lock); -retry: - if (UINT64_MAX - efd->efd_count <= count) { - if ((fp->f_flag & FNONBLOCK) != 0) { - mtx_unlock(&efd->efd_lock); - /* Do not not return the number of bytes written */ - uio->uio_resid += sizeof(eventfd_t); - return (EAGAIN); - } - error = mtx_sleep(&efd->efd_count, &efd->efd_lock, - PCATCH, "lefdwr", 0); - if (error == 0) 
- goto retry; - } - if (error == 0) { - efd->efd_count += count; - KNOTE_LOCKED(&efd->efd_sel.si_note, 0); - selwakeup(&efd->efd_sel); - wakeup(&efd->efd_count); - } - mtx_unlock(&efd->efd_lock); - - return (error); -} - -static int -eventfd_poll(struct file *fp, int events, struct ucred *active_cred, - struct thread *td) -{ - struct eventfd *efd; - int revents = 0; - - efd = fp->f_data; - if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) - return (POLLERR); - - mtx_lock(&efd->efd_lock); - if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0) - revents |= events & (POLLIN|POLLRDNORM); - if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count) - revents |= events & (POLLOUT|POLLWRNORM); - if (revents == 0) - selrecord(td, &efd->efd_sel); - mtx_unlock(&efd->efd_lock); - - return (revents); -} - -/*ARGSUSED*/ -static int -eventfd_kqfilter(struct file *fp, struct knote *kn) -{ - struct eventfd *efd; - - efd = fp->f_data; - if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) - return (EINVAL); - - mtx_lock(&efd->efd_lock); - switch (kn->kn_filter) { - case EVFILT_READ: - kn->kn_fop = &eventfd_rfiltops; - break; - case EVFILT_WRITE: - kn->kn_fop = &eventfd_wfiltops; - break; - default: - mtx_unlock(&efd->efd_lock); - return (EINVAL); - } - - kn->kn_hook = efd; - knlist_add(&efd->efd_sel.si_note, kn, 1); - mtx_unlock(&efd->efd_lock); - - return (0); -} - -static void -filt_eventfddetach(struct knote *kn) -{ - struct eventfd *efd = kn->kn_hook; - - mtx_lock(&efd->efd_lock); - knlist_remove(&efd->efd_sel.si_note, kn, 1); - mtx_unlock(&efd->efd_lock); -} - -/*ARGSUSED*/ -static int -filt_eventfdread(struct knote *kn, long hint) -{ - struct eventfd *efd = kn->kn_hook; - int ret; - - mtx_assert(&efd->efd_lock, MA_OWNED); - ret = (efd->efd_count > 0); - - return (ret); -} - -/*ARGSUSED*/ -static int -filt_eventfdwrite(struct knote *kn, long hint) -{ - struct eventfd *efd = kn->kn_hook; - int ret; - - mtx_assert(&efd->efd_lock, MA_OWNED); - ret = (UINT64_MAX - 1 > 
efd->efd_count); - - return (ret); -} - -/*ARGSUSED*/ -static int -eventfd_ioctl(struct file *fp, u_long cmd, void *data, - struct ucred *active_cred, struct thread *td) -{ - - if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD && - fp->f_type != DTYPE_LINUXTFD)) - return (EINVAL); - - switch (cmd) - { - case FIONBIO: - if ((*(int *)data)) - atomic_set_int(&fp->f_flag, FNONBLOCK); - else - atomic_clear_int(&fp->f_flag, FNONBLOCK); - case FIOASYNC: - return (0); - default: - return (ENXIO); - } -} - -/*ARGSUSED*/ -static int -eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, - struct thread *td) -{ - - return (ENXIO); -} - -/*ARGSUSED*/ -static int -eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) -{ - - kif->kf_type = KF_TYPE_UNKNOWN; - return (0); + return (eventfd_create(td, args->initval, flags)); } int @@ -1126,7 +810,6 @@ return (revents); } -/*ARGSUSED*/ static int timerfd_kqfilter(struct file *fp, struct knote *kn) { @@ -1157,7 +840,6 @@ mtx_unlock(&tfd->tfd_lock); } -/*ARGSUSED*/ static int filt_timerfdread(struct knote *kn, long hint) { @@ -1166,7 +848,23 @@ return (tfd->tfd_count > 0); } -/*ARGSUSED*/ +static int +timerfd_ioctl(struct file *fp, u_long cmd, void *data, + struct ucred *active_cred, struct thread *td) +{ + + if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD) + return (EINVAL); + + switch (cmd) { + case FIONBIO: + case FIOASYNC: + return (0); + } + + return (ENOTTY); +} + static int timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) @@ -1175,7 +873,6 @@ return (ENXIO); } -/*ARGSUSED*/ static int timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -3913,6 +3913,7 @@ kern/subr_vmem.c standard kern/subr_witness.c optional witness kern/sys_capability.c standard 
+kern/sys_eventfd.c standard kern/sys_generic.c standard kern/sys_getrandom.c standard kern/sys_pipe.c standard Index: sys/kern/capabilities.conf =================================================================== --- sys/kern/capabilities.conf +++ sys/kern/capabilities.conf @@ -393,6 +393,11 @@ ktimer_gettime ktimer_settime +## +## Allow creating eventfds. +## +eventfd + ## ## We can't allow ktrace(2) because it relies on a global namespace, but we ## might want to introduce an fktrace(2) of some sort. Index: sys/kern/kern_descrip.c =================================================================== --- sys/kern/kern_descrip.c +++ sys/kern/kern_descrip.c @@ -4393,8 +4393,8 @@ return ("dev"); case DTYPE_PROCDESC: return ("proc"); - case DTYPE_LINUXEFD: - return ("levent"); + case DTYPE_EVENTFD: + return ("eventfd"); case DTYPE_LINUXTFD: return ("ltimer"); default: Index: sys/kern/sys_eventfd.c =================================================================== --- /dev/null +++ sys/kern/sys_eventfd.c @@ -0,0 +1,365 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2007 Roman Divacky + * Copyright (c) 2014 Dmitry Chagin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +MALLOC_DEFINE(M_EVENTFD, "eventfd", "eventfd structures"); + +static fo_rdwr_t eventfd_read; +static fo_rdwr_t eventfd_write; +static fo_ioctl_t eventfd_ioctl; +static fo_poll_t eventfd_poll; +static fo_kqfilter_t eventfd_kqfilter; +static fo_stat_t eventfd_stat; +static fo_close_t eventfd_close; +static fo_fill_kinfo_t eventfd_fill_kinfo; + +static struct fileops eventfdops = { + .fo_read = eventfd_read, + .fo_write = eventfd_write, + .fo_truncate = invfo_truncate, + .fo_ioctl = eventfd_ioctl, + .fo_poll = eventfd_poll, + .fo_kqfilter = eventfd_kqfilter, + .fo_stat = eventfd_stat, + .fo_close = eventfd_close, + .fo_chmod = invfo_chmod, + .fo_chown = invfo_chown, + .fo_sendfile = invfo_sendfile, + .fo_fill_kinfo = eventfd_fill_kinfo, + .fo_flags = DFLAG_PASSABLE +}; + +static void filt_eventfddetach(struct knote *kn); +static int filt_eventfdread(struct knote *kn, long hint); +static int filt_eventfdwrite(struct knote *kn, long hint); + +static struct filterops eventfd_rfiltops = { + .f_isfd = 1, + .f_detach = filt_eventfddetach, + .f_event = filt_eventfdread +}; +static struct filterops eventfd_wfiltops = { + .f_isfd = 1, + .f_detach = filt_eventfddetach, + 
.f_event = filt_eventfdwrite +}; + +struct eventfd { + eventfd_t efd_count; + uint32_t efd_flags; + struct selinfo efd_sel; + struct mtx efd_lock; +}; + +int +eventfd_create(struct thread *td, uint32_t initval, int flags) +{ + struct eventfd *efd; + struct file *fp; + int fflags, fd, error; + + AUDIT_ARG_FFLAGS(flags); + AUDIT_ARG_VALUE(initval); + + fflags = 0; + if ((flags & EFD_CLOEXEC) != 0) + fflags |= O_CLOEXEC; + + error = falloc(td, &fp, &fd, fflags); + if (error != 0) + return (error); + + efd = malloc(sizeof(*efd), M_EVENTFD, M_WAITOK | M_ZERO); + efd->efd_flags = flags; + efd->efd_count = initval; + mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); + + knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); + + fflags = FREAD | FWRITE; + if ((flags & EFD_NONBLOCK) != 0) + fflags |= FNONBLOCK; + + finit(fp, fflags, DTYPE_EVENTFD, efd, &eventfdops); + fdrop(fp, td); + + td->td_retval[0] = fd; + return (0); +} + +static int +eventfd_close(struct file *fp, struct thread *td) +{ + struct eventfd *efd = fp->f_data; + + seldrain(&efd->efd_sel); + knlist_destroy(&efd->efd_sel.si_note); + + fp->f_ops = &badfileops; + mtx_destroy(&efd->efd_lock); + free(efd, M_EVENTFD); + + return (0); +} + +int +sys_eventfd(struct thread *td, struct eventfd_args *args) +{ + + if ((args->flags & ~(EFD_CLOEXEC|EFD_NONBLOCK|EFD_SEMAPHORE)) != 0) + return (EINVAL); + + return (eventfd_create(td, args->initval, args->flags)); +} + +static int +eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + struct eventfd *efd = fp->f_data; + eventfd_t count; + int error; + + if (uio->uio_resid < sizeof(eventfd_t)) + return (EINVAL); + + error = 0; + mtx_lock(&efd->efd_lock); + while (error == 0 && efd->efd_count == 0) { + if ((fp->f_flag & FNONBLOCK) != 0) { + mtx_unlock(&efd->efd_lock); + return (EAGAIN); + } + error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "efdrd", 0); + } + if (error == 0) { + if ((efd->efd_flags & 
EFD_SEMAPHORE) != 0) { + count = 1; + --efd->efd_count; + } else { + count = efd->efd_count; + efd->efd_count = 0; + } + KNOTE_LOCKED(&efd->efd_sel.si_note, 0); + selwakeup(&efd->efd_sel); + wakeup(&efd->efd_count); + mtx_unlock(&efd->efd_lock); + error = uiomove(&count, sizeof(eventfd_t), uio); + } else + mtx_unlock(&efd->efd_lock); + + return (error); +} + +static int +eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + struct eventfd *efd = fp->f_data; + eventfd_t count; + int error; + + if (uio->uio_resid < sizeof(eventfd_t)) + return (EINVAL); + + error = uiomove(&count, sizeof(eventfd_t), uio); + if (error != 0) + return (error); + if (count == UINT64_MAX) + return (EINVAL); + + mtx_lock(&efd->efd_lock); +retry: + if (UINT64_MAX - efd->efd_count <= count) { + if ((fp->f_flag & FNONBLOCK) != 0) { + mtx_unlock(&efd->efd_lock); + /* Do not return the number of bytes written */ + uio->uio_resid += sizeof(eventfd_t); + return (EAGAIN); + } + error = mtx_sleep(&efd->efd_count, &efd->efd_lock, + PCATCH, "efdwr", 0); + if (error == 0) + goto retry; + } + if (error == 0) { + efd->efd_count += count; + KNOTE_LOCKED(&efd->efd_sel.si_note, 0); + selwakeup(&efd->efd_sel); + wakeup(&efd->efd_count); + } + mtx_unlock(&efd->efd_lock); + + return (error); +} + +static int +eventfd_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + struct eventfd *efd = fp->f_data; + int revents = 0; + + mtx_lock(&efd->efd_lock); + if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0) + revents |= events & (POLLIN|POLLRDNORM); + if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count) + revents |= events & (POLLOUT|POLLWRNORM); + if (revents == 0) + selrecord(td, &efd->efd_sel); + mtx_unlock(&efd->efd_lock); + + return (revents); +} + +static int +eventfd_kqfilter(struct file *fp, struct knote *kn) +{ + struct eventfd *efd = fp->f_data; + + mtx_lock(&efd->efd_lock); + switch 
(kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &eventfd_rfiltops; + break; + case EVFILT_WRITE: + kn->kn_fop = &eventfd_wfiltops; + break; + default: + mtx_unlock(&efd->efd_lock); + return (EINVAL); + } + + kn->kn_hook = efd; + knlist_add(&efd->efd_sel.si_note, kn, 1); + mtx_unlock(&efd->efd_lock); + + return (0); +} + +static void +filt_eventfddetach(struct knote *kn) +{ + struct eventfd *efd = kn->kn_hook; + + mtx_lock(&efd->efd_lock); + knlist_remove(&efd->efd_sel.si_note, kn, 1); + mtx_unlock(&efd->efd_lock); +} + +static int +filt_eventfdread(struct knote *kn, long hint) +{ + struct eventfd *efd = kn->kn_hook; + int ret; + + mtx_assert(&efd->efd_lock, MA_OWNED); + kn->kn_data = (int64_t)efd->efd_count; + ret = (efd->efd_count > 0); + + return (ret); +} + +static int +filt_eventfdwrite(struct knote *kn, long hint) +{ + struct eventfd *efd = kn->kn_hook; + int ret; + + mtx_assert(&efd->efd_lock, MA_OWNED); + kn->kn_data = (int64_t)(UINT64_MAX - 1 - efd->efd_count); + ret = (UINT64_MAX - 1 > efd->efd_count); + + return (ret); +} + +static int +eventfd_ioctl(struct file *fp, u_long cmd, void *data, + struct ucred *active_cred, struct thread *td) +{ + + switch (cmd) { + case FIONBIO: + case FIOASYNC: + return (0); + } + + return (ENOTTY); +} + +static int +eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, + struct thread *td) +{ + + bzero((void *)st, sizeof *st); + st->st_mode = S_IFIFO; + return (0); +} + +static int +eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) +{ + struct eventfd *efd = fp->f_data; + + kif->kf_type = KF_TYPE_EVENTFD; + mtx_lock(&efd->efd_lock); + kif->kf_un.kf_eventfd.kf_eventfd_value = efd->efd_count; + kif->kf_un.kf_eventfd.kf_eventfd_flags = efd->efd_flags; + mtx_unlock(&efd->efd_lock); + return (0); +} Index: sys/kern/syscalls.master =================================================================== --- sys/kern/syscalls.master +++ sys/kern/syscalls.master @@ -3241,6 
+3241,12 @@ _In_z_ const char *path ); } +577 AUE_EVENTFD STD { + int eventfd( + unsigned int initval, + int flags + ); + } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/sys/eventfd.h =================================================================== --- /dev/null +++ sys/sys/eventfd.h @@ -0,0 +1,58 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Greg V + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _SYS_EVENTFD_H_ +#define _SYS_EVENTFD_H_ + +#include +#include + +typedef uint64_t eventfd_t; + +#define EFD_SEMAPHORE (1 << 0) +#define EFD_CLOEXEC O_CLOEXEC +#define EFD_NONBLOCK O_NONBLOCK + +#ifdef _KERNEL + +struct thread; + +int eventfd_create(struct thread *td, uint32_t initval, int flags); + +#else + +#include + +__BEGIN_DECLS +int eventfd(unsigned int initval, int flags); +int eventfd_read(int fd, eventfd_t *value); +int eventfd_write(int fd, eventfd_t value); +__END_DECLS + +#endif /* !_KERNEL */ + +#endif /* !_SYS_EVENTFD_H_ */ Index: sys/sys/file.h =================================================================== --- sys/sys/file.h +++ sys/sys/file.h @@ -69,7 +69,7 @@ #define DTYPE_PTS 10 /* pseudo teletype master device */ #define DTYPE_DEV 11 /* Device specific fd type */ #define DTYPE_PROCDESC 12 /* process descriptor */ -#define DTYPE_LINUXEFD 13 /* emulation eventfd type */ +#define DTYPE_EVENTFD 13 /* eventfd */ #define DTYPE_LINUXTFD 14 /* emulation timerfd type */ #ifdef _KERNEL Index: sys/sys/user.h =================================================================== --- sys/sys/user.h +++ sys/sys/user.h @@ -64,6 +64,7 @@ #include #endif #include +#include /* * KERN_PROC subtype ops return arrays of selected proc structure entries: @@ -263,6 +264,7 @@ #define KF_TYPE_PTS 10 #define KF_TYPE_PROCDESC 11 #define KF_TYPE_DEV 12 +#define KF_TYPE_EVENTFD 13 #define KF_TYPE_UNKNOWN 255 #define KF_VTYPE_VNON 0 @@ -435,6 +437,10 @@ uint64_t kf_spareint64[32]; pid_t kf_pid; } kf_proc; + struct { + eventfd_t kf_eventfd_value; + uint32_t kf_eventfd_flags; + } kf_eventfd; } kf_un; }; uint16_t kf_status; /* Status flags. 
*/ Index: usr.bin/procstat/procstat.1 =================================================================== --- usr.bin/procstat/procstat.1 +++ usr.bin/procstat/procstat.1 @@ -315,6 +315,8 @@ crypto .It e POSIX semaphore +.It E +eventfd .It f fifo .It h Index: usr.bin/procstat/procstat_files.c =================================================================== --- usr.bin/procstat/procstat_files.c +++ usr.bin/procstat/procstat_files.c @@ -419,6 +419,11 @@ xo_emit("{eq:fd_type/dev}"); break; + case PS_FST_TYPE_EVENTFD: + str = "E"; + xo_emit("{eq:fd_type/eventfd}"); + break; + case PS_FST_TYPE_NONE: str = "?"; xo_emit("{eq:fd_type/none}");