diff --git a/lib/libc/sys/kqueue.2 b/lib/libc/sys/kqueue.2 index ed737c626ef8..3ded4ae3d8f7 100644 --- a/lib/libc/sys/kqueue.2 +++ b/lib/libc/sys/kqueue.2 @@ -1,833 +1,848 @@ .\" Copyright (c) 2000 Jonathan Lemon .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd September 7, 2021 +.Dd September 23, 2021 .Dt KQUEUE 2 .Os .Sh NAME .Nm kqueue , .Nm kevent .Nd kernel event notification mechanism .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In sys/event.h .Ft int .Fn kqueue "void" .Ft int .Fo kevent .Fa "int kq" .Fa "const struct kevent *changelist" .Fa "int nchanges" .Fa "struct kevent *eventlist" .Fa "int nevents" .Fa "const struct timespec *timeout" .Fc .Fn EV_SET "kev" ident filter flags fflags data udata .Sh DESCRIPTION The .Fn kqueue system call provides a generic method of notifying the user when an event happens or a condition holds, based on the results of small pieces of kernel code termed filters. A kevent is identified by the (ident, filter) pair; there may only be one unique kevent per kqueue. .Pp The filter is executed upon the initial registration of a kevent in order to detect whether a preexisting condition is present, and is also executed whenever an event is passed to the filter for evaluation. If the filter determines that the condition should be reported, then the kevent is placed on the kqueue for the user to retrieve. .Pp The filter is also run when the user attempts to retrieve the kevent from the kqueue. If the filter indicates that the condition that triggered the event no longer holds, the kevent is removed from the kqueue and is not returned. .Pp Multiple events which trigger the filter do not result in multiple kevents being placed on the kqueue; instead, the filter will aggregate the events into a single struct kevent. Calling .Fn close on a file descriptor will remove any kevents that reference the descriptor. .Pp The .Fn kqueue system call creates a new kernel event queue and returns a descriptor. The queue is not inherited by a child created with .Xr fork 2 . However, if .Xr rfork 2 is called without the .Dv RFFDG flag, then the descriptor table is shared, which will allow sharing of the kqueue between two processes. 
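.Pp
In outline, an application creates a queue with
.Fn kqueue ,
registers its interest with
.Fn kevent ,
and then waits; a minimal sketch of that call sequence (error handling
omitted;
.Va fd
is assumed to be a descriptor already opened by the caller) is:
.Bd -literal
struct kevent change, event;
int kq;

kq = kqueue();                            /* create the event queue */
EV_SET(&change, fd, EVFILT_READ,          /* watch fd for readable data */
    EV_ADD | EV_ENABLE, 0, 0, NULL);
(void)kevent(kq, &change, 1, NULL, 0, NULL);  /* apply the change only */
(void)kevent(kq, NULL, 0, &event, 1, NULL);   /* block for one event */
.Ed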
.Pp The .Fn kevent system call is used to register events with the queue, and return any pending events to the user. The .Fa changelist argument is a pointer to an array of .Va kevent structures, as defined in .In sys/event.h . All changes contained in the .Fa changelist are applied before any pending events are read from the queue. The .Fa nchanges argument gives the size of .Fa changelist . The .Fa eventlist argument is a pointer to an array of kevent structures. The .Fa nevents argument determines the size of .Fa eventlist . When .Fa nevents is zero, .Fn kevent will return immediately even if there is a .Fa timeout specified, unlike .Xr select 2 . If .Fa timeout is a non-NULL pointer, it specifies a maximum interval to wait for an event, which will be interpreted as a struct timespec. If .Fa timeout is a NULL pointer, .Fn kevent waits indefinitely. To effect a poll, the .Fa timeout argument should be non-NULL, pointing to a zero-valued .Va timespec structure. The same array may be used for the .Fa changelist and .Fa eventlist . .Pp The .Fn EV_SET macro is provided for ease of initializing a kevent structure. .Pp The .Va kevent structure is defined as: .Bd -literal
struct kevent {
    uintptr_t  ident;     /* identifier for this event */
    short      filter;    /* filter for event */
    u_short    flags;     /* action flags for kqueue */
    u_int      fflags;    /* filter flag value */
    int64_t    data;      /* filter data value */
    void       *udata;    /* opaque user data identifier */
    uint64_t   ext[4];    /* extensions */
};
.Ed
.Pp The fields of .Fa struct kevent are: .Bl -tag -width "Fa filter" .It Fa ident Value used to identify this event. The exact interpretation is determined by the attached filter, but often is a file descriptor. .It Fa filter Identifies the kernel filter used to process this event. The pre-defined system filters are described below. .It Fa flags Actions to perform on the event. .It Fa fflags Filter-specific flags. .It Fa data Filter-specific data value. .It Fa udata Opaque user-defined value passed through the kernel unchanged. .It Fa ext Extended data passed to and from the kernel. The use of the .Fa ext[0] and .Fa ext[1] members is defined by the filter. If the filter does not use them, the members are copied unchanged. The .Fa ext[2] and .Fa ext[3] members are always passed through the kernel as-is, making additional context available to the application. .El .Pp The .Va flags field can contain the following values: .Bl -tag -width EV_DISPATCH .It Dv EV_ADD Adds the event to the kqueue. Re-adding an existing event will modify the parameters of the original event, and not result in a duplicate entry. Adding an event automatically enables it, unless overridden by the EV_DISABLE flag. .It Dv EV_ENABLE Permit .Fn kevent to return the event if it is triggered. .It Dv EV_DISABLE Disable the event so .Fn kevent will not return it. The filter itself is not disabled. .It Dv EV_DISPATCH Disable the event source immediately after delivery of an event. See .Dv EV_DISABLE above. .It Dv EV_DELETE Removes the event from the kqueue. Events which are attached to file descriptors are automatically deleted on the last close of the descriptor. .It Dv EV_RECEIPT This flag is useful for making bulk changes to a kqueue without draining any pending events. When passed as input, it forces .Dv EV_ERROR to always be returned. When a filter is successfully added, the .Va data field will be zero. Note that if this flag is encountered and there is no remaining space in .Fa eventlist to hold the .Dv EV_ERROR event, then subsequent changes will not get processed.
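.Pp
For example, a sketch of such a bulk update (assuming
.Va kq
is the kqueue descriptor and
.Va n
changes, each with
.Dv EV_RECEIPT
set, have been prepared in the array
.Va chlist )
might collect the per-change status as follows:
.Bd -literal
struct kevent results[n];
int i, ret;

ret = kevent(kq, chlist, n, results, n, NULL);
for (i = 0; i < ret; i++) {
    /* Every change is reported; data holds the error, 0 on success. */
    if ((results[i].flags & EV_ERROR) != 0 && results[i].data != 0)
        fprintf(stderr, "change %d: %s\en", i,
            strerror((int)results[i].data));
}
.Ed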
.It Dv EV_ONESHOT Causes the event to return only the first occurrence of the filter being triggered. After the user retrieves the event from the kqueue, it is deleted. .It Dv EV_CLEAR After the event is retrieved by the user, its state is reset. This is useful for filters which report state transitions instead of the current state. Note that some filters may automatically set this flag internally. .It Dv EV_EOF Filters may set this flag to indicate a filter-specific EOF condition. .It Dv EV_ERROR See .Sx RETURN VALUES below. +.It Dv EV_KEEPUDATA +Causes +.Fn kevent +to leave unchanged any +.Fa udata +associated with an existing event. This allows other aspects of the +event to be modified without requiring the caller to know the +.Fa udata +value currently associated with the event. +This is especially useful with +.Dv NOTE_TRIGGER +or flags like +.Dv EV_ENABLE . +This flag may not be used with +.Dv EV_ADD . .El .Pp The predefined system filters are listed below. Arguments may be passed to and from the filter via the .Va fflags and .Va data fields in the kevent structure. .Bl -tag -width "Dv EVFILT_PROCDESC" .It Dv EVFILT_READ Takes a descriptor as the identifier, and returns whenever there is data available to read. The behavior of the filter is slightly different depending on the descriptor type. .Bl -tag -width 2n .It Sockets Sockets which have previously been passed to .Fn listen return when there is an incoming connection pending. .Va data contains the size of the listen backlog. .Pp Other socket descriptors return when there is data to be read, subject to the .Dv SO_RCVLOWAT value of the socket buffer. This may be overridden with a per-filter low water mark at the time the filter is added by setting the .Dv NOTE_LOWAT flag in .Va fflags , and specifying the new low water mark in .Va data . On return, .Va data contains the number of bytes of protocol data available to read. .Pp If the read direction of the socket has been shut down, then the filter also sets .Dv EV_EOF in .Va flags , and returns the socket error (if any) in .Va fflags . It is possible for EOF to be returned (indicating the connection is gone) while there is still data pending in the socket buffer. .It Vnodes Returns when the file pointer is not at the end of file. .Va data contains the offset from the current position to the end of file, and may be negative. .Pp This behavior is different from .Xr poll 2 , where read events are triggered for regular files unconditionally. This event can be triggered unconditionally by setting the .Dv NOTE_FILE_POLL flag in .Va fflags . .It "Fifos, Pipes" Returns when there is data to read; .Va data contains the number of bytes available. .Pp When the last writer disconnects, the filter will set .Dv EV_EOF in .Va flags . This will be cleared by the filter when a new writer connects, at which point the filter will resume waiting for data to become available before returning. .It "BPF devices" Returns when the BPF buffer is full, the BPF timeout has expired, or when the BPF has .Dq immediate mode enabled and there is any data to read; .Va data contains the number of bytes available. .It Eventfds Returns when the counter is greater than 0; .Va data contains the counter value, which must be cast to .Vt uint64_t . .It Kqueues Returns when pending events are present on the queue; .Va data contains the number of events available. .El .It Dv EVFILT_WRITE Takes a descriptor as the identifier, and returns whenever it is possible to write to the descriptor.
For sockets, pipes and fifos, .Va data will contain the amount of space remaining in the write buffer. The filter will set .Dv EV_EOF when the reader disconnects, and for the fifo case, this will be cleared when a new reader connects. Note that this filter is not supported for vnodes or BPF devices. .Pp For sockets, the low water mark and socket error handling are identical to the .Dv EVFILT_READ case. .Pp For eventfds, .Va data will contain the maximum value that can be added to the counter without blocking. .It Dv EVFILT_EMPTY Takes a descriptor as the identifier, and returns whenever there is no remaining data in the write buffer. .It Dv EVFILT_AIO Events for this filter are not registered with .Fn kevent directly but are registered via the .Va aio_sigevent member of an asynchronous I/O request when it is scheduled via an asynchronous I/O system call such as .Fn aio_read . The filter returns under the same conditions as .Fn aio_error . For more details on this filter see .Xr sigevent 3 and .Xr aio 4 . .It Dv EVFILT_VNODE Takes a file descriptor as the identifier and the events to watch for in .Va fflags , and returns when one or more of the requested events occurs on the descriptor. The events to monitor are: .Bl -tag -width "Dv NOTE_CLOSE_WRITE" .It Dv NOTE_ATTRIB The file referenced by the descriptor had its attributes changed. .It Dv NOTE_CLOSE A file descriptor referencing the monitored file was closed. The closed file descriptor did not have write access. .It Dv NOTE_CLOSE_WRITE A file descriptor referencing the monitored file was closed. The closed file descriptor had write access. .Pp This note, as well as .Dv NOTE_CLOSE , is not activated when files are closed forcibly by .Xr unmount 2 or .Xr revoke 2 . Instead, .Dv NOTE_REVOKE is sent for such events. .It Dv NOTE_DELETE The .Fn unlink system call was called on the file referenced by the descriptor. .It Dv NOTE_EXTEND For a regular file, the file referenced by the descriptor was extended. .Pp For a directory, this reports that a directory entry was added or removed as the result of a rename operation. The .Dv NOTE_EXTEND event is not reported when a name is changed inside the directory. .It Dv NOTE_LINK The link count on the file changed. In particular, the .Dv NOTE_LINK event is reported if a subdirectory was created or deleted inside the directory referenced by the descriptor. .It Dv NOTE_OPEN The file referenced by the descriptor was opened. .It Dv NOTE_READ A read occurred on the file referenced by the descriptor. .It Dv NOTE_RENAME The file referenced by the descriptor was renamed. .It Dv NOTE_REVOKE Access to the file was revoked via .Xr revoke 2 or the underlying file system was unmounted. .It Dv NOTE_WRITE A write occurred on the file referenced by the descriptor. .El .Pp On return, .Va fflags contains the events which triggered the filter. .It Dv EVFILT_PROC Takes the process ID to monitor as the identifier and the events to watch for in .Va fflags , and returns when the process performs one or more of the requested events. If a process can normally see another process, it can attach an event to it. The events to monitor are: .Bl -tag -width "Dv NOTE_TRACKERR" .It Dv NOTE_EXIT The process has exited. The exit status will be stored in .Va data . .It Dv NOTE_FORK The process has called .Fn fork . .It Dv NOTE_EXEC The process has executed a new process via .Xr execve 2 or a similar call. .It Dv NOTE_TRACK Follow a process across .Fn fork calls.
The parent process registers a new kevent to monitor the child process using the same .Va fflags as the original event. The child process will signal an event with .Dv NOTE_CHILD set in .Va fflags and the parent PID in .Va data . .Pp If the parent process fails to register a new kevent .Pq usually due to resource limitations , it will signal an event with .Dv NOTE_TRACKERR set in .Va fflags , and the child process will not signal a .Dv NOTE_CHILD event. .El .Pp On return, .Va fflags contains the events which triggered the filter. .It Dv EVFILT_PROCDESC Takes the process descriptor created by .Xr pdfork 2 to monitor as the identifier and the events to watch for in .Va fflags , and returns when the associated process performs one or more of the requested events. The events to monitor are: .Bl -tag -width "Dv NOTE_EXIT" .It Dv NOTE_EXIT The process has exited. The exit status will be stored in .Va data . .El .Pp On return, .Va fflags contains the events which triggered the filter. .It Dv EVFILT_SIGNAL Takes the signal number to monitor as the identifier and returns when the given signal is delivered to the process. This coexists with the .Fn signal and .Fn sigaction facilities, and has a lower precedence. The filter will record all attempts to deliver a signal to a process, even if the signal has been marked as .Dv SIG_IGN , except for the .Dv SIGCHLD signal, which, if ignored, will not be recorded by the filter. Event notification happens after normal signal delivery processing. .Va data returns the number of times the signal has occurred since the last call to .Fn kevent . This filter automatically sets the .Dv EV_CLEAR flag internally. .It Dv EVFILT_TIMER Establishes an arbitrary timer identified by .Va ident . When adding a timer, .Va data specifies the moment to fire the timer (for .Dv NOTE_ABSTIME ) or the timeout period. The timer will be periodic unless .Dv EV_ONESHOT or .Dv NOTE_ABSTIME is specified. On return, .Va data contains the number of times the timeout has expired since the last call to .Fn kevent . For non-monotonic timers, this filter automatically sets the .Dv EV_CLEAR flag internally. .Pp The filter accepts the following flags in the .Va fflags argument: .Bl -tag -width "Dv NOTE_MSECONDS" .It Dv NOTE_SECONDS .Va data is in seconds. .It Dv NOTE_MSECONDS .Va data is in milliseconds. .It Dv NOTE_USECONDS .Va data is in microseconds. .It Dv NOTE_NSECONDS .Va data is in nanoseconds. .It Dv NOTE_ABSTIME The specified expiration time is absolute. .El .Pp If .Va fflags is not set, the default is milliseconds. On return, .Va fflags contains the events which triggered the filter. .Pp If an existing timer is re-added, the existing timer will be effectively canceled (throwing away any undelivered record of previous timer expiration) and re-started using the new parameters contained in .Va data and .Va fflags . .Pp There is a system wide limit on the number of timers which is controlled by the .Va kern.kq_calloutmax sysctl. .It Dv EVFILT_USER Establishes a user event identified by .Va ident which is not associated with any kernel mechanism but is triggered by user level code. The lower 24 bits of the .Va fflags may be used for user defined flags and manipulated using the following: .Bl -tag -width "Dv NOTE_FFLAGSMASK" .It Dv NOTE_FFNOP Ignore the input .Va fflags . .It Dv NOTE_FFAND Bitwise AND .Va fflags . .It Dv NOTE_FFOR Bitwise OR .Va fflags . .It Dv NOTE_FFCOPY Copy .Va fflags . .It Dv NOTE_FFCTRLMASK Control mask for .Va fflags . 
.It Dv NOTE_FFLAGSMASK User defined flag mask for .Va fflags . .El .Pp A user event is triggered for output with the following: .Bl -tag -width "Dv NOTE_FFLAGSMASK" .It Dv NOTE_TRIGGER Cause the event to be triggered. .El .Pp On return, .Va fflags contains the user-defined flags in the lower 24 bits. .El .Sh CANCELLATION BEHAVIOUR If .Fa nevents is non-zero, i.e., the function is potentially blocking, the call is a cancellation point. Otherwise, i.e., if .Fa nevents is zero, the call is not cancellable. Cancellation can only occur before any changes are made to the kqueue, or when the call was blocked and no changes to the queue were requested. .Sh RETURN VALUES The .Fn kqueue system call creates a new kernel event queue and returns a file descriptor. If there was an error creating the kernel event queue, a value of -1 is returned and errno is set. .Pp The .Fn kevent system call returns the number of events placed in the .Fa eventlist , up to the value given by .Fa nevents . If an error occurs while processing an element of the .Fa changelist and there is enough room in the .Fa eventlist , then the event will be placed in the .Fa eventlist with .Dv EV_ERROR set in .Va flags and the system error in .Va data . Otherwise, .Dv -1 will be returned, and .Dv errno will be set to indicate the error condition. If the time limit expires, then .Fn kevent returns 0. .Sh EXAMPLES .Bd -literal -compact
#include <sys/event.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(int argc, char **argv)
{
    struct kevent event;    /* Event we want to monitor */
    struct kevent tevent;   /* Event triggered */
    int kq, fd, ret;

    if (argc != 2)
        err(EXIT_FAILURE, "Usage: %s path\en", argv[0]);
    fd = open(argv[1], O_RDONLY);
    if (fd == -1)
        err(EXIT_FAILURE, "Failed to open '%s'", argv[1]);

    /* Create kqueue. */
    kq = kqueue();
    if (kq == -1)
        err(EXIT_FAILURE, "kqueue() failed");

    /* Initialize kevent structure. */
    EV_SET(&event, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR, NOTE_WRITE,
        0, NULL);
    /* Attach event to the kqueue. */
    ret = kevent(kq, &event, 1, NULL, 0, NULL);
    if (ret == -1)
        err(EXIT_FAILURE, "kevent register");
    if (event.flags & EV_ERROR)
        errx(EXIT_FAILURE, "Event error: %s", strerror(event.data));

    for (;;) {
        /* Sleep until something happens. */
        ret = kevent(kq, NULL, 0, &tevent, 1, NULL);
        if (ret == -1) {
            err(EXIT_FAILURE, "kevent wait");
        } else if (ret > 0) {
            printf("Something was written in '%s'\en", argv[1]);
        }
    }
}
.Ed
.Sh ERRORS The .Fn kqueue system call fails if: .Bl -tag -width Er .It Bq Er ENOMEM The kernel failed to allocate enough memory for the kernel queue. .It Bq Er ENOMEM The .Dv RLIMIT_KQUEUES rlimit (see .Xr getrlimit 2 ) for the current user would be exceeded. .It Bq Er EMFILE The per-process descriptor table is full. .It Bq Er ENFILE The system file table is full. .El .Pp The .Fn kevent system call fails if: .Bl -tag -width Er .It Bq Er EACCES The process does not have permission to register a filter. .It Bq Er EFAULT There was an error reading or writing the .Va kevent structure. .It Bq Er EBADF The specified descriptor is invalid. .It Bq Er EINTR A signal was delivered before the timeout expired and before any events were placed on the kqueue for return. .It Bq Er EINTR A cancellation request was delivered to the thread, but not yet handled. .It Bq Er EINVAL The specified time limit or filter is invalid. .It Bq Er EINVAL The specified length of the event or change lists is negative. .It Bq Er ENOENT The event could not be found to be modified or deleted.
.It Bq Er ENOMEM No memory was available to register the event or, in the special case of a timer, the maximum number of timers has been exceeded. This maximum is configurable via the .Va kern.kq_calloutmax sysctl. .It Bq Er ESRCH The specified process to attach to does not exist. .El .Pp When .Fn kevent call fails with .Er EINTR error, all changes in the .Fa changelist have been applied. .Sh SEE ALSO .Xr aio_error 2 , .Xr aio_read 2 , .Xr aio_return 2 , .Xr poll 2 , .Xr read 2 , .Xr select 2 , .Xr sigaction 2 , .Xr write 2 , .Xr pthread_setcancelstate 3 , .Xr signal 3 .Rs .%A Jonathan Lemon .%T "Kqueue: A Generic and Scalable Event Notification Facility" .%I USENIX Association .%B Proceedings of the FREENIX Track: 2001 USENIX Annual Technical Conference .%D June 25-30, 2001 .\".http://www.usenix.org/event/usenix01/freenix01/full_papers/lemon/lemon.pdf .Re .Sh HISTORY The .Fn kqueue and .Fn kevent system calls first appeared in .Fx 4.1 . .Sh AUTHORS The .Fn kqueue system and this manual page were written by .An Jonathan Lemon Aq Mt jlemon@FreeBSD.org . .Sh BUGS The .Fa timeout value is limited to 24 hours; longer timeouts will be silently reinterpreted as 24 hours. .Pp In versions older than .Fx 12.0 , .In sys/event.h failed to parse without including .In sys/types.h manually. diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index db505b234268..5fa5bf9cad06 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -1,2841 +1,2849 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * Copyright (c) 2009 Apple, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_kqueue.h" #ifdef COMPAT_FREEBSD11 #define _WANT_FREEBSD11_KEVENT #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. 
This possibly * should be made into a per proc lock. */ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue_ctx); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag); static int kqueue_acquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static void kqueue_destroy(struct kqueue *kq); static void kqueue_drain(struct kqueue *kq, struct thread *td); static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int mflag); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); struct g_kevent_args; static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name); static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static fo_fill_kinfo_t kqueue_fill_kinfo; static struct fileops kqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = kqueue_fill_kinfo, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_drop_detached(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(int mflag); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static void filt_timerexpire_l(struct knote *kn, bool proc_locked); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static void filt_timerstart(struct knote *kn, sbintime_t to); static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type); static int filt_timervalidate(struct knote *kn, sbintime_t *to); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); static void filt_userdetach(struct knote *kn); static int filt_user(struct knote *kn, long hint); static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type); static struct filterops file_filtops = { .f_isfd = 1, .f_attach = filt_fileattach, }; static struct filterops kqread_filtops = { .f_isfd = 1, .f_detach 
= filt_kqdetach, .f_event = filt_kqueue, }; /* XXX - move to kern_proc.c? */ static struct filterops proc_filtops = { .f_isfd = 0, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; static struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = filt_timer, .f_touch = filt_timertouch, }; static struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, }; static uma_zone_t knote_zone; static unsigned int __exclusive_cache_line kq_ncallouts; static unsigned int kq_calloutmax = 4 * 1024; SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not influx ? */ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while (0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) static struct knlist * kn_list_lock(struct knote *kn) { struct knlist *knl; knl = kn->kn_knlist; if (knl != NULL) knl->kl_lock(knl->kl_lockarg); return (knl); } static void kn_list_unlock(struct knlist *knl) { bool do_free; if (knl == NULL) return; do_free = knl->kl_autodestroy && knlist_empty(knl); knl->kl_unlock(knl->kl_lockarg); if (do_free) { knlist_destroy(knl); free(knl, M_KQUEUE); } } static bool kn_in_flux(struct knote *kn) { return (kn->kn_influx > 0); } static void kn_enter_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx < INT_MAX); kn->kn_influx++; } static bool kn_leave_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx > 0); kn->kn_influx--; return (kn->kn_influx == 0); } #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ knl->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ knl->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while (0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #ifndef KN_HASHSIZE #define KN_HASHSIZE 64 /* XXX should be tunable */ #endif #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { .f_isfd = 0, .f_attach = filt_nullattach, }; /* XXX - make SYSINIT to add these, and move into respective modules. */ extern struct filterops sig_filtops; extern struct filterops fs_filtops; /* * Table for for all system-defined filters. 
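 * Filters are negative constants (EVFILT_READ is -1, EVFILT_WRITE is -2,
 * and so on), so the entry for filter f lives at index ~f, i.e. -f - 1.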
*/ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { struct filterops *for_fop; int for_nolock; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { { &file_filtops, 1 }, /* EVFILT_READ */ { &file_filtops, 1 }, /* EVFILT_WRITE */ { &null_filtops }, /* EVFILT_AIO */ { &file_filtops, 1 }, /* EVFILT_VNODE */ { &proc_filtops, 1 }, /* EVFILT_PROC */ { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ { &timer_filtops, 1 }, /* EVFILT_TIMER */ { &file_filtops, 1 }, /* EVFILT_PROCDESC */ { &fs_filtops, 1 }, /* EVFILT_FS */ { &null_filtops }, /* EVFILT_LIO */ { &user_filtops, 1 }, /* EVFILT_USER */ { &null_filtops }, /* EVFILT_SENDFILE */ { &file_filtops, 1 }, /* EVFILT_EMPTY */ }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. */ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? */ static int filt_procattach(struct knote *kn) { struct proc *p; int error; bool exiting, immediate; exiting = immediate = false; if (kn->kn_sfflags & NOTE_EXIT) p = pfind_any(kn->kn_id); else p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if (p->p_flag & P_WEXIT) exiting = true; if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * Internal flag indicating registration done by kernel for the * purposes of getting a NOTE_CHILD notification. */ if (kn->kn_flags & EV_FLAG2) { kn->kn_flags &= ~EV_FLAG2; kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); immediate = true; /* Force immediate activation of child note. */ } /* * Internal flag indicating registration done by kernel (for other than * NOTE_CHILD). */ if (kn->kn_flags & EV_FLAG1) { kn->kn_flags &= ~EV_FLAG1; } knlist_add(p->p_klist, kn, 1); /* * Immediately activate any child notes or, in the case of a zombie * target process, exit notes. The latter is necessary to handle the * case where the target process, e.g. a child, dies before the kevent * is registered. */ if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { knlist_remove(kn->kn_knlist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? 
*/ static int filt_proc(struct knote *kn, long hint) { struct proc *p; u_int event; p = kn->kn_ptr.p_proc; if (p == NULL) /* already activated, from attach filter */ return (0); /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; kn->kn_ptr.p_proc = NULL; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } /* * Called when the process forked. It mostly does the same as the * knote(), activating all knotes registered to be activated when the * process forked. Additionally, for each knote attached to the * parent, check whether user wants to track the new process. If so * attach a new knote to it, and immediately report an event with the * child's pid. */ void knote_fork(struct knlist *list, int pid) { struct kqueue *kq; struct knote *kn; struct kevent kev; int error; MPASS(list != NULL); KNL_ASSERT_LOCKED(list); if (SLIST_EMPTY(&list->kl_list)) return; memset(&kev, 0, sizeof(kev)); SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { KQ_UNLOCK(kq); continue; } /* * The same as knote(), activate the event. */ if ((kn->kn_sfflags & NOTE_TRACK) == 0) { if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK(kq); continue; } /* * The NOTE_TRACK case. In addition to the activation * of the event, we need to register new events to * track the child. Drop the locks in preparation for * the call to kqueue_register(). */ kn_enter_flux(kn); KQ_UNLOCK(kq); list->kl_unlock(list->kl_lockarg); /* * Activate existing knote and register tracking knotes with * new process. * * First register a knote to get just the child notice. This * must be a separate note from a potential NOTE_EXIT * notification since both NOTE_CHILD and NOTE_EXIT are defined * to use the data field (in conflicting ways). */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; /* * Then register another knote to track other potential events * from the new process. */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 0); list->kl_lock(list->kl_lockarg); KQ_LOCK(kq); kn_leave_flux(kn); KQ_UNLOCK_FLUX(kq); } } /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. */ #define NOTE_TIMER_PRECMASK \ (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) static sbintime_t timer2sbintime(int64_t data, int flags) { int64_t secs; /* * Macros for converting to the fractional second portion of an * sbintime_t using 64bit multiplication to improve precision. 
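 *
 * Each macro effectively scales its argument by 2^32 divided by the
 * number of units per second, computed via a 64-bit reciprocal to
 * preserve precision; e.g. MS_TO_SBT(500) is roughly 500 * 2^32 / 1000
 * == 2^31, i.e. half a second in the 32.32 fixed-point fraction.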
*/ #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) switch (flags & NOTE_TIMER_PRECMASK) { case NOTE_SECONDS: #ifdef __LP64__ if (data > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return ((sbintime_t)data << 32); case NOTE_MSECONDS: /* FALLTHROUGH */ case 0: if (data >= 1000) { secs = data / 1000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | MS_TO_SBT(data % 1000)); } return (MS_TO_SBT(data)); case NOTE_USECONDS: if (data >= 1000000) { secs = data / 1000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000)); } return (US_TO_SBT(data)); case NOTE_NSECONDS: if (data >= 1000000000) { secs = data / 1000000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | NS_TO_SBT(data % 1000000000)); } return (NS_TO_SBT(data)); default: break; } return (-1); } struct kq_timer_cb_data { struct callout c; struct proc *p; struct knote *kn; int cpuid; int flags; TAILQ_ENTRY(kq_timer_cb_data) link; sbintime_t next; /* next timer event fires at */ sbintime_t to; /* precalculated timer period, 0 for abs */ }; #define KQ_TIMER_CB_ENQUEUED 0x01 static void kqtimer_sched_callout(struct kq_timer_cb_data *kc) { callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kc->kn, kc->cpuid, C_ABSOLUTE); } void kqtimer_proc_continue(struct proc *p) { struct kq_timer_cb_data *kc, *kc1; struct bintime bt; sbintime_t now; PROC_LOCK_ASSERT(p, MA_OWNED); getboottimebin(&bt); now = bttosbt(bt); TAILQ_FOREACH_SAFE(kc, &p->p_kqtim_stop, link, kc1) { TAILQ_REMOVE(&p->p_kqtim_stop, kc, link); kc->flags &= ~KQ_TIMER_CB_ENQUEUED; if (kc->next <= now) filt_timerexpire_l(kc->kn, true); else kqtimer_sched_callout(kc); } } static void filt_timerexpire_l(struct knote *kn, bool proc_locked) { struct kq_timer_cb_data *kc; struct proc *p; uint64_t delta; sbintime_t now; kc = kn->kn_ptr.p_v; if ((kn->kn_flags & EV_ONESHOT) != 0 || kc->to == 0) { kn->kn_data++; KNOTE_ACTIVATE(kn, 0); return; } now = sbinuptime(); if (now >= kc->next) { delta = (now - kc->next) / kc->to; if (delta == 0) delta = 1; kn->kn_data += delta; kc->next += (delta + 1) * kc->to; if (now >= kc->next) /* overflow */ kc->next = now + kc->to; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ } /* * Initial check for stopped kc->p is racy. It is fine to * miss the set of the stop flags, at worst we would schedule * one more callout. On the other hand, it is not fine to not * schedule when we we missed clearing of the flags, we * recheck them under the lock and observe consistent state. 
*/ p = kc->p; if (P_SHOULDSTOP(p) || P_KILLED(p)) { if (!proc_locked) PROC_LOCK(p); if (P_SHOULDSTOP(p) || P_KILLED(p)) { if ((kc->flags & KQ_TIMER_CB_ENQUEUED) == 0) { kc->flags |= KQ_TIMER_CB_ENQUEUED; TAILQ_INSERT_TAIL(&p->p_kqtim_stop, kc, link); } if (!proc_locked) PROC_UNLOCK(p); return; } if (!proc_locked) PROC_UNLOCK(p); } kqtimer_sched_callout(kc); } static void filt_timerexpire(void *knx) { filt_timerexpire_l(knx, false); } /* * data contains amount of time to sleep */ static int filt_timervalidate(struct knote *kn, sbintime_t *to) { struct bintime bt; sbintime_t sbt; if (kn->kn_sdata < 0) return (EINVAL); if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) kn->kn_sdata = 1; /* * The only fflags values supported are the timer unit * (precision) and the absolute time indicator. */ if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) return (EINVAL); *to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { getboottimebin(&bt); sbt = bttosbt(bt); *to -= sbt; } if (*to < 0) return (EINVAL); return (0); } static int filt_timerattach(struct knote *kn) { struct kq_timer_cb_data *kc; sbintime_t to; int error; error = filt_timervalidate(kn, &to); if (error != 0) return (error); if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) { atomic_subtract_int(&kq_ncallouts, 1); return (ENOMEM); } if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); kc->kn = kn; kc->p = curproc; kc->cpuid = PCPU_GET(cpuid); kc->flags = 0; callout_init(&kc->c, 1); filt_timerstart(kn, to); return (0); } static void filt_timerstart(struct knote *kn, sbintime_t to) { struct kq_timer_cb_data *kc; kc = kn->kn_ptr.p_v; if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { kc->next = to; kc->to = 0; } else { kc->next = to + sbinuptime(); kc->to = to; } kqtimer_sched_callout(kc); } static void filt_timerdetach(struct knote *kn) { struct kq_timer_cb_data *kc; unsigned int old __unused; bool pending; kc = kn->kn_ptr.p_v; do { callout_drain(&kc->c); /* * kqtimer_proc_continue() might have rescheduled this callout. * Double-check, using the process mutex as an interlock. */ PROC_LOCK(kc->p); if ((kc->flags & KQ_TIMER_CB_ENQUEUED) != 0) { kc->flags &= ~KQ_TIMER_CB_ENQUEUED; TAILQ_REMOVE(&kc->p->p_kqtim_stop, kc, link); } pending = callout_pending(&kc->c); PROC_UNLOCK(kc->p); } while (pending); free(kc, M_KQUEUE); old = atomic_fetchadd_int(&kq_ncallouts, -1); KASSERT(old > 0, ("Number of callouts cannot become negative")); kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ } static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type) { struct kq_timer_cb_data *kc; struct kqueue *kq; sbintime_t to; int error; switch (type) { case EVENT_REGISTER: /* Handle re-added timers that update data/fflags */ if (kev->flags & EV_ADD) { kc = kn->kn_ptr.p_v; /* Drain any existing callout. */ callout_drain(&kc->c); /* Throw away any existing undelivered record * of the timer expiration. This is done under * the presumption that if a process is * re-adding this timer with new parameters, * it is no longer interested in what may have * happened under the old parameters. If it is * interested, it can wait for the expiration, * delete the old timer definition, and then * add the new one. 
* * This has to be done while the kq is locked: * - if enqueued, dequeue * - make it no longer active * - clear the count of expiration events */ kq = kn->kn_kq; KQ_LOCK(kq); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); kn->kn_status &= ~KN_ACTIVE; kn->kn_data = 0; KQ_UNLOCK(kq); /* Reschedule timer based on new data/fflags */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; error = filt_timervalidate(kn, &to); if (error != 0) { kn->kn_flags |= EV_ERROR; kn->kn_data = error; } else filt_timerstart(kn, to); } break; case EVENT_PROCESS: *kev = kn->kn_kevent; if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_timertouch() - invalid type (%ld)", type); break; } } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } static int filt_userattach(struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) kn->kn_hookid = 1; else kn->kn_hookid = 0; return (0); } static void filt_userdetach(__unused struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ } static int filt_user(struct knote *kn, __unused long hint) { return (kn->kn_hookid); } static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) { u_int ffctrl; switch (type) { case EVENT_REGISTER: if (kev->fflags & NOTE_TRIGGER) kn->kn_hookid = 1; ffctrl = kev->fflags & NOTE_FFCTRLMASK; kev->fflags &= NOTE_FFLAGSMASK; switch (ffctrl) { case NOTE_FFNOP: break; case NOTE_FFAND: kn->kn_sfflags &= kev->fflags; break; case NOTE_FFOR: kn->kn_sfflags |= kev->fflags; break; case NOTE_FFCOPY: kn->kn_sfflags = kev->fflags; break; default: /* XXX Return error? */ break; } kn->kn_sdata = kev->data; if (kev->flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; case EVENT_PROCESS: *kev = kn->kn_kevent; kev->fflags = kn->kn_sfflags; kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_usertouch() - invalid type (%ld)", type); break; } } int sys_kqueue(struct thread *td, struct kqueue_args *uap) { return (kern_kqueue(td, 0, NULL)); } static void kqueue_init(struct kqueue *kq) { mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&kq->kq_head); knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); } int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; struct ucred *cred; int fd, error; fdp = td->td_proc->p_fd; cred = td->td_ucred; if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) return (ENOMEM); error = falloc_caps(td, &fp, &fd, flags, fcaps); if (error != 0) { chgkqcnt(cred->cr_ruidinfo, -1, 0); return (error); } /* An extra reference on `fp' has been held for us by falloc(). 
*/ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); kqueue_init(kq); kq->kq_fdp = fdp; kq->kq_cred = crhold(cred); FILEDESC_XLOCK(fdp); TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; return (0); } struct g_kevent_args { int fd; void *changelist; int nchanges; void *eventlist; int nevents; const struct timespec *timeout; }; int sys_kevent(struct thread *td, struct kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent_copyout, .k_copyin = kevent_copyin, .kevent_size = sizeof(struct kevent), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent")); } static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name) { struct timespec ts, *tsp; #ifdef KTRACE struct kevent *eventlist = uap->eventlist; #endif int error; if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist, uap->nchanges, k_ops->kevent_size); #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, k_ops, tsp); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, eventlist, td->td_retval[0], k_ops->kevent_size); #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. */ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } #ifdef COMPAT_FREEBSD11 static int kevent11_copyout(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { kev11.ident = kevp->ident; kev11.filter = kevp->filter; kev11.flags = kevp->flags; kev11.fflags = kevp->fflags; kev11.data = kevp->data; kev11.udata = kevp->udata; error = copyout(&kev11, uap->eventlist, sizeof(kev11)); if (error != 0) break; uap->eventlist++; kevp++; } return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. 
*/ static int kevent11_copyin(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { error = copyin(uap->changelist, &kev11, sizeof(kev11)); if (error != 0) break; kevp->ident = kev11.ident; kevp->filter = kev11.filter; kevp->flags = kev11.flags; kevp->fflags = kev11.fflags; kevp->data = (uintptr_t)kev11.data; kevp->udata = kev11.udata; bzero(&kevp->ext, sizeof(kevp->ext)); uap->changelist++; kevp++; } return (error); } int freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent11_copyout, .k_copyin = kevent11_copyin, .kevent_size = sizeof(struct kevent_freebsd11), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11")); } #endif int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { cap_rights_t rights; struct file *fp; int error; cap_rights_init_zero(&rights); if (nchanges > 0) cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE); if (nevents > 0) cap_rights_set_one(&rights, CAP_KQUEUE_EVENT); error = fget(td, fd, &rights, &fp); if (error != 0) return (error); error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); fdrop(fp, td); return (error); } static int kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; int i, n, nerrors, error; if (nchanges < 0) return (EINVAL); nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) return (error); changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, M_WAITOK); if (error || (kevp->flags & EV_RECEIPT)) { if (nevents == 0) return (error); kevp->flags = EV_ERROR; kevp->data = error; (void)k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; return (0); } return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); } int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kqueue *kq; int error; error = kqueue_acquire(fp, &kq); if (error != 0) return (error); error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); kqueue_release(kq, 0); return (error); } /* * Performs a kevent() call on a temporarily created kqueue. This can be * used to perform one-shot polling, similar to poll() and select(). 
*/ int kern_kevent_anonymous(struct thread *td, int nevents, struct kevent_copyops *k_ops) { struct kqueue kq = {}; int error; kqueue_init(&kq); kq.kq_refcnt = 1; error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); kqueue_drain(&kq, td); kqueue_destroy(&kq); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (error); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; if (sysfilt_ops[~filt].for_nolock) return sysfilt_ops[~filt].for_fop; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; if (sysfilt_ops[~filt].for_nolock) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_acquire) must be held. */ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag) { struct filterops *fops; struct file *fp; struct knote *kn, *tkn; struct knlist *knl; int error, filt, event; int haskqglobal, filedesc_unlock; if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) return (EINVAL); fp = NULL; kn = NULL; knl = NULL; error = 0; haskqglobal = 0; filedesc_unlock = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; if (kev->flags & EV_ADD) { + /* Reject an invalid flag pair early */ + if (kev->flags & EV_KEEPUDATA) { + tkn = NULL; + error = EINVAL; + goto done; + } + /* * Prevent waiting with locks. Non-sleepable * allocation failures are handled in the loop, only * if the spare knote appears to be actually required. */ tkn = knote_alloc(mflag); } else { tkn = NULL; } findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); if (kev->ident > INT_MAX) error = EBADF; else error = fget(td, kev->ident, &cap_event_rights, &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, M_NOWAIT) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, mflag); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * If we add some intelligence about what we are doing, * we should be able to support events on ourselves. 
* We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } /* * Pre-lock the filedesc before the global * lock mutex, see the comment in * kqueue_close(). */ FILEDESC_XLOCK(td->td_proc->p_fd); filedesc_unlock = 1; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) { error = kqueue_expand(kq, fops, kev->ident, mflag); if (error != 0) goto done; } KQ_LOCK(kq); /* * If possible, find an existing knote to use for this kevent. */ if (kev->filter == EVFILT_PROC && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { /* This is an internal creation of a process tracking * note. Don't attempt to coalesce this with an * existing note. */ ; } else if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stabilize. */ if (kn != NULL && kn_in_flux(kn)) { KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) { FILEDESC_XUNLOCK(td->td_proc->p_fd); filedesc_unlock = 0; } kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); if (fp != NULL) { fdrop(fp, td); fp = NULL; } goto findkn; } /* * kn now contains the matching knote, or NULL if no match */ if (kn == NULL) { if (kev->flags & EV_ADD) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); kn->kn_status = KN_DETACHED; if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; kn_enter_flux(kn); error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop_detached(kn, td); goto done; } knl = kn_list_lock(kn); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ KQ_UNLOCK(kq); error = ENOENT; goto done; } } if (kev->flags & EV_DELETE) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); goto done; } if (kev->flags & EV_FORCEONESHOT) { kn->kn_flags |= EV_ONESHOT; KNOTE_ACTIVATE(kn, 1); } if ((kev->flags & EV_ENABLE) != 0) kn->kn_status &= ~KN_DISABLED; else if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; /* * The user may change some filter values after the initial EV_ADD, * but doing so will not reset any filter which has already been * triggered. */ kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); knl = kn_list_lock(kn); - kn->kn_kevent.udata = kev->udata; + if ((kev->flags & EV_KEEPUDATA) == 0) + kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) { fops->f_touch(kn, kev, EVENT_REGISTER); } else { kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; } done_ev_add: /* * We can get here with kn->kn_knlist == NULL. 
This can happen when * the initial attach event decides that the event is "completed" * already, e.g., filt_procattach() is called on a zombie process. It * will call filt_proc() which will remove it from the list, and NULL * kn_knlist. * * KN_DISABLED will be stable while the knote is in flux, so the * unlocked read will not race with an update. */ if ((kn->kn_status & KN_DISABLED) == 0) event = kn->kn_fop->f_event(kn, 0); else event = 0; KQ_LOCK(kq); if (event) kn->kn_status |= KN_ACTIVE; if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == KN_ACTIVE) knote_enqueue(kn); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) FILEDESC_XUNLOCK(td->td_proc->p_fd); if (fp != NULL) fdrop(fp, td); knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_acquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) return (EBADF); *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); return (EBADF); } kq->kq_refcnt++; KQ_UNLOCK(kq); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } void kqueue_drain_schedtask(void) { taskqueue_quiesce(taskqueue_kqueue_ctx); } static void kqueue_schedtask(struct kqueue *kq) { struct thread *td; KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; td = curthread; thread_lock(td); td->td_flags |= TDF_ASTPENDING | TDF_KQTICKLED; thread_unlock(td); } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. */ static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int mflag) { struct klist *list, *tmp_knhash, *to_free; u_long tmp_knhashmask; int error, fd, size; KQ_NOTOWNED(kq); error = 0; to_free = NULL; if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; list = malloc(size * sizeof(*list), M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = list; error = EBADF; } else if (kq->kq_knlistsize > fd) { to_free = list; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof(*list)); to_free = kq->kq_knlist; kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof(*list), (size - kq->kq_knlistsize) * sizeof(*list)); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask, (mflag & M_WAITOK) != 0 ? 
HASH_WAITOK : HASH_NOWAIT); if (tmp_knhash == NULL) return (ENOMEM); KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = tmp_knhash; error = EBADF; } else if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { to_free = tmp_knhash; } KQ_UNLOCK(kq); } } free(to_free, M_KQUEUE); KQ_NOTOWNED(kq); return (error); } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. * We treat KN_MARKER knotes as if they are in flux. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct knote *kn, *marker; struct knlist *knl; sbintime_t asbt, rsbt; int count, error, haskqglobal, influx, nkev, touch; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; if (maxevents < 0) { error = EINVAL; goto done_nl; } rsbt = 0; if (tsp != NULL) { if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000) { error = EINVAL; goto done_nl; } if (timespecisset(tsp)) { if (tsp->tv_sec <= INT32_MAX) { rsbt = tstosbt(*tsp); if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = 0; rsbt >>= tc_precexp; } else asbt = 0; } else asbt = -1; } else asbt = 0; marker = knote_alloc(M_WAITOK); marker->kn_status = KN_MARKER; KQ_LOCK(kq); retry: kevp = keva; if (kq->kq_count == 0) { if (asbt == -1) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", asbt, rsbt, C_ABSOLUTE); } if (error == 0) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); influx = 0; while (count) { KQ_OWNED(kq); kn = TAILQ_FIRST(&kq->kq_head); if ((kn->kn_status == KN_MARKER && kn != marker) || kn_in_flux(kn)) { if (influx) { influx = 0; KQ_FLUX_WAKEUP(kq); } kq->kq_state |= KQ_FLUXWAIT; error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); continue; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if (kn == marker) { KQ_FLUX_WAKEUP(kq); if (count == maxevents) goto retry; goto done; } KASSERT(!kn_in_flux(kn), ("knote %p is unexpectedly in flux", kn)); if ((kn->kn_flags & EV_DROP) == EV_DROP) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked it as in flux. */ knote_drop(kn, td); KQ_LOCK(kq); continue; } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked the knote as being in flux. 
*/ *kevp = kn->kn_kevent; knote_drop(kn, td); KQ_LOCK(kq); kn = NULL; } else { kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); knl = kn_list_lock(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_SCAN); kn_leave_flux(kn); kq->kq_count--; kn_list_unlock(knl); influx = 1; continue; } touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL); if (touch) kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); else *kevp = kn->kn_kevent; KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { /* * Manually clear knotes who weren't * 'touch'ed. */ if (touch == 0 && kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } if (kn->kn_flags & EV_DISPATCH) kn->kn_status |= KN_DISABLED; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); influx = 1; } /* we are returning a copy to the user */ kevp++; nkev++; count--; if (nkev == KQ_NEVENTS) { influx = 0; KQ_UNLOCK_FLUX(kq); error = k_ops->k_copyout(k_ops->arg, keva, nkev); nkev = 0; kevp = keva; KQ_LOCK(kq); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: KQ_OWNED(kq); KQ_UNLOCK_FLUX(kq); knote_free(marker); done_nl: KQ_NOTOWNED(kq); if (nkev != 0) error = k_ops->k_copyout(k_ops->arg, keva, nkev); td->td_retval[0] = maxevents - count; return (error); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { /* * Enabling sigio causes two major problems: * 1) infinite recursion: * Synopsys: kevent is being used to track signals and have FIOASYNC * set. On receipt of a signal this will cause a kqueue to recurse * into itself over and over. Sending the sigio causes the kqueue * to become ready, which in turn posts sigio again, forever. * Solution: this can be solved by setting a flag in the kqueue that * we have a SIGIO in progress. * 2) locking problems: * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts * us above the proc and pgrp locks. * Solution: Post a signal using an async mechanism, being sure to * record a generation count in the delivery so that we do not deliver * a signal to the wrong process. * * Note, these two mechanisms are somewhat mutually exclusive! */ #if 0 struct kqueue *kq; kq = fp->f_data; switch (cmd) { case FIOASYNC: if (*(int *)data) { kq->kq_state |= KQ_ASYNC; } else { kq->kq_state &= ~KQ_ASYNC; } return (0); case FIOSETOWN: return (fsetown(*(int *)data, &kq->kq_sigio)); case FIOGETOWN: *(int *)data = fgetown(&kq->kq_sigio); return (0); } #endif return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct kqueue *kq; int revents = 0; int error; if ((error = kqueue_acquire(fp, &kq))) return POLLERR; KQ_LOCK(kq); if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); if (SEL_WAITING(&kq->kq_sel)) kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); KQ_UNLOCK(kq); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { bzero((void *)st, sizeof *st); /* * We no longer return kq_count because the unlocked value is useless. 
* If you spent all this time getting the count, why not spend your * syscall better by calling kevent? * * XXX - This is needed for libc_r. */ st->st_mode = S_IFIFO; return (0); } static void kqueue_drain(struct kqueue *kq, struct thread *td) { struct knote *kn; int i; KQ_LOCK(kq); KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, ("kqueue already closing")); kq->kq_state |= KQ_CLOSING; if (kq->kq_refcnt > 1) msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); KASSERT(knlist_empty(&kq->kq_sel.si_note), ("kqueue's knlist not empty")); for (i = 0; i < kq->kq_knlistsize; i++) { while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } if (kq->kq_knhashmask != 0) { for (i = 0; i <= kq->kq_knhashmask; i++) { while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo2", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } } if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { kq->kq_state |= KQ_TASKDRAIN; msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); } static void kqueue_destroy(struct kqueue *kq) { KASSERT(kq->kq_fdp == NULL, ("kqueue still attached to a file descriptor")); seldrain(&kq->kq_sel); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); if (kq->kq_knhash != NULL) free(kq->kq_knhash, M_KQUEUE); if (kq->kq_knlist != NULL) free(kq->kq_knlist, M_KQUEUE); funsetown(&kq->kq_sigio); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; struct filedesc *fdp; int error; int filedesc_unlock; if ((error = kqueue_acquire(fp, &kq))) return error; kqueue_drain(kq, td); /* * We could be called due to the knote_drop() doing fdrop(), * called from kqueue_register(). In this case the global * lock is owned, and filedesc sx is locked before, to not * take the sleepable lock after non-sleepable. */ fdp = kq->kq_fdp; kq->kq_fdp = NULL; if (!sx_xlocked(FILEDESC_LOCK(fdp))) { FILEDESC_XLOCK(fdp); filedesc_unlock = 1; } else filedesc_unlock = 0; TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); if (filedesc_unlock) FILEDESC_XUNLOCK(fdp); kqueue_destroy(kq); chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); crfree(kq->kq_cred); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static int kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_KQUEUE; return (0); } static void kqueue_wakeup(struct kqueue *kq) { KQ_OWNED(kq); if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { pgsigio(&kq->kq_sigio, SIGIO, 0); } } /* * Walk down a list of knotes, activating them if their event has triggered. * * There is a possibility to optimize in the case of one kq watching another. * Instead of scheduling a task to wake it up, you could pass enough state * down the chain to make up the parent kqueue. 
Make this code functional * first. */ void knote(struct knlist *list, long hint, int lockflags) { struct kqueue *kq; struct knote *kn, *tkn; int error; if (list == NULL) return; KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_lock(list->kl_lockarg); /* * If we unlock the list lock (and enter influx), we can * eliminate the kqueue scheduling, but this will introduce * four lock/unlock's for each knote to test. Also, marker * would be needed to keep iteration position, since filters * or other threads could remove events. */ SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { /* * Do not process the influx notes, except for * the influx coming from the kq unlock in the * kqueue_scan(). In the later case, we do * not interfere with the scan, since the code * fragment in kqueue_scan() locks the knlist, * and cannot proceed until we finished. */ KQ_UNLOCK(kq); } else if ((lockflags & KNF_NOKQLOCK) != 0) { kn_enter_flux(kn); KQ_UNLOCK(kq); error = kn->kn_fop->f_event(kn, hint); KQ_LOCK(kq); kn_leave_flux(kn); if (error) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK_FLUX(kq); } else { if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK(kq); } } if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_unlock(list->kl_lockarg); } /* * add a knote to a knlist */ void knlist_add(struct knlist *knl, struct knote *kn, int islocked) { KNL_ASSERT_LOCK(knl, islocked); KQ_NOTOWNED(kn->kn_kq); KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p was not detached", kn)); if (!islocked) knl->kl_lock(knl->kl_lockarg); SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); if (!islocked) knl->kl_unlock(knl->kl_lockarg); KQ_LOCK(kn->kn_kq); kn->kn_knlist = knl; kn->kn_status &= ~KN_DETACHED; KQ_UNLOCK(kn->kn_kq); } static void knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) { KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked")); KNL_ASSERT_LOCK(knl, knlislocked); mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? 
MA_OWNED : MA_NOTOWNED); KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) == 0, ("knote %p was already detached", kn)); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) kn_list_unlock(knl); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove knote from the specified knlist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return (SLIST_EMPTY(&knl->kl_list)); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static void knlist_mtx_assert_lock(void *arg, int what) { if (what == LA_LOCKED) mtx_assert((struct mtx *)arg, MA_OWNED); else mtx_assert((struct mtx *)arg, MA_NOTOWNED); } static void knlist_rw_rlock(void *arg) { rw_rlock((struct rwlock *)arg); } static void knlist_rw_runlock(void *arg) { rw_runlock((struct rwlock *)arg); } static void knlist_rw_assert_lock(void *arg, int what) { if (what == LA_LOCKED) rw_assert((struct rwlock *)arg, RA_LOCKED); else rw_assert((struct rwlock *)arg, RA_UNLOCKED); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), void (*kl_assert_lock)(void *, int)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_assert_lock == NULL) knl->kl_assert_lock = knlist_mtx_assert_lock; else knl->kl_assert_lock = kl_assert_lock; knl->kl_autodestroy = 0; SLIST_INIT(&knl->kl_list); } void knlist_init_mtx(struct knlist *knl, struct mtx *lock) { knlist_init(knl, lock, NULL, NULL, NULL); } struct knlist * knlist_alloc(struct mtx *lock) { struct knlist *knl; knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK); knlist_init_mtx(knl, lock); return (knl); } void knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock) { knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock, knlist_rw_assert_lock); } void knlist_destroy(struct knlist *knl) { KASSERT(KNLIST_EMPTY(knl), ("destroying knlist %p with knotes on it", knl)); } void knlist_detach(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); knl->kl_autodestroy = 1; if (knlist_empty(knl)) { knlist_destroy(knl); free(knl, M_KQUEUE); } } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". 
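 * Concretely, the loop below marks the kqueue KQ_FLUXWAIT and sleeps
 * ("kqkclr") until the remaining in-flux knotes leave flux, then retries.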
*/ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl)); if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reacquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop_detached(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= EV_EOF | EV_ONESHOT; KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still in flux knotes remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn_in_flux(kn), ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * Remove all knotes referencing a specified fd must be called with FILEDESC * lock. This prevents a race where a new fd comes along and occupies the * entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_XLOCK_ASSERT(fdp); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. */ TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn_in_flux(kn)) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn_enter_flux(kn); KQ_UNLOCK(kq); influx = 1; knote_drop(kn, td); KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn)); KQ_OWNED(kq); if ((kq->kq_state & KQ_CLOSING) != 0) return (EBADF); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return (ENOMEM); list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return (ENOMEM); list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return (0); } static void knote_drop(struct knote *kn, struct thread *td) { if ((kn->kn_status & KN_DETACHED) == 0) kn->kn_fop->f_detach(kn); knote_drop_detached(kn, td); } static void knote_drop_detached(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p still attached", kn)); KQ_NOTOWNED(kq); KQ_LOCK(kq); KASSERT(kn->kn_influx == 1, ("knote_drop called on %p with influx %d", kn, kn->kn_influx)); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = 
kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); static struct knote * knote_alloc(int mflag) { return (uma_zalloc(knote_zone, mflag | M_ZERO)); } static void knote_free(struct knote *kn) { uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag) { struct kqueue *kq; struct file *fp; cap_rights_t rights; int error; error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &fp); if (error != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; error = kqueue_register(kq, kev, td, mflag); kqueue_release(kq, 0); noacquire: fdrop(fp, td); return (error); } diff --git a/sys/sys/event.h b/sys/sys/event.h index 80ed1268c8a1..f1bdc7e2a80e 100644 --- a/sys/sys/event.h +++ b/sys/sys/event.h @@ -1,368 +1,369 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999,2000,2001 Jonathan Lemon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _SYS_EVENT_H_ #define _SYS_EVENT_H_ #include #include #define EVFILT_READ (-1) #define EVFILT_WRITE (-2) #define EVFILT_AIO (-3) /* attached to aio requests */ #define EVFILT_VNODE (-4) /* attached to vnodes */ #define EVFILT_PROC (-5) /* attached to struct proc */ #define EVFILT_SIGNAL (-6) /* attached to struct proc */ #define EVFILT_TIMER (-7) /* timers */ #define EVFILT_PROCDESC (-8) /* attached to process descriptors */ #define EVFILT_FS (-9) /* filesystem events */ #define EVFILT_LIO (-10) /* attached to lio requests */ #define EVFILT_USER (-11) /* User events */ #define EVFILT_SENDFILE (-12) /* attached to sendfile requests */ #define EVFILT_EMPTY (-13) /* empty send socket buf */ #define EVFILT_SYSCOUNT 13 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L #define EV_SET(kevp_, a, b, c, d, e, f) do { \ *(kevp_) = (struct kevent){ \ .ident = (a), \ .filter = (b), \ .flags = (c), \ .fflags = (d), \ .data = (e), \ .udata = (f), \ .ext = {0}, \ }; \ } while (0) #else /* Pre-C99 or not STDC (e.g., C++) */ /* * The definition of the local variable kevp could possibly conflict * with a user-defined value passed in parameters a-f. */ #define EV_SET(kevp_, a, b, c, d, e, f) do { \ struct kevent *kevp = (kevp_); \ (kevp)->ident = (a); \ (kevp)->filter = (b); \ (kevp)->flags = (c); \ (kevp)->fflags = (d); \ (kevp)->data = (e); \ (kevp)->udata = (f); \ (kevp)->ext[0] = 0; \ (kevp)->ext[1] = 0; \ (kevp)->ext[2] = 0; \ (kevp)->ext[3] = 0; \ } while (0) #endif struct kevent { __uintptr_t ident; /* identifier for this event */ short filter; /* filter for event */ unsigned short flags; /* action flags for kqueue */ unsigned int fflags; /* filter flag value */ __int64_t data; /* filter data value */ void *udata; /* opaque user data identifier */ __uint64_t ext[4]; /* extensions */ }; #if defined(_WANT_FREEBSD11_KEVENT) /* Older structure used in FreeBSD 11.x and older. 
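 * The FreeBSD 11 compat shims in kern_event.c (e.g. kevent11_copyin())
 * translate this layout to the current struct kevent, widening the
 * __intptr_t data field to int64_t and zeroing the ext[] array that this
 * structure lacks.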
*/ struct kevent_freebsd11 { __uintptr_t ident; /* identifier for this event */ short filter; /* filter for event */ unsigned short flags; unsigned int fflags; __intptr_t data; void *udata; /* opaque user data identifier */ }; #endif #if defined(_WANT_KEVENT32) || (defined(_KERNEL) && defined(__LP64__)) struct kevent32 { uint32_t ident; /* identifier for this event */ short filter; /* filter for event */ u_short flags; u_int fflags; #ifndef __amd64__ uint32_t pad0; #endif uint32_t data1, data2; uint32_t udata; /* opaque user data identifier */ #ifndef __amd64__ uint32_t pad1; #endif uint32_t ext64[8]; }; #ifdef _WANT_FREEBSD11_KEVENT struct kevent32_freebsd11 { u_int32_t ident; /* identifier for this event */ short filter; /* filter for event */ u_short flags; u_int fflags; int32_t data; u_int32_t udata; /* opaque user data identifier */ }; #endif #endif /* actions */ #define EV_ADD 0x0001 /* add event to kq (implies enable) */ #define EV_DELETE 0x0002 /* delete event from kq */ #define EV_ENABLE 0x0004 /* enable event */ #define EV_DISABLE 0x0008 /* disable event (not reported) */ #define EV_FORCEONESHOT 0x0100 /* enable _ONESHOT and force trigger */ +#define EV_KEEPUDATA 0x0200 /* do not update the udata field */ /* flags */ #define EV_ONESHOT 0x0010 /* only report one occurrence */ #define EV_CLEAR 0x0020 /* clear event state after reporting */ #define EV_RECEIPT 0x0040 /* force EV_ERROR on success, data=0 */ #define EV_DISPATCH 0x0080 /* disable event after reporting */ #define EV_SYSFLAGS 0xF000 /* reserved by system */ #define EV_DROP 0x1000 /* note should be dropped */ #define EV_FLAG1 0x2000 /* filter-specific flag */ #define EV_FLAG2 0x4000 /* filter-specific flag */ /* returned values */ #define EV_EOF 0x8000 /* EOF detected */ #define EV_ERROR 0x4000 /* error, data contains errno */ /* * data/hint flags/masks for EVFILT_USER, shared with userspace * * On input, the top two bits of fflags specifies how the lower twenty four * bits should be applied to the stored value of fflags. * * On output, the top two bits will always be set to NOTE_FFNOP and the * remaining twenty four bits will contain the stored fflags value. */ #define NOTE_FFNOP 0x00000000 /* ignore input fflags */ #define NOTE_FFAND 0x40000000 /* AND fflags */ #define NOTE_FFOR 0x80000000 /* OR fflags */ #define NOTE_FFCOPY 0xc0000000 /* copy fflags */ #define NOTE_FFCTRLMASK 0xc0000000 /* masks for operations */ #define NOTE_FFLAGSMASK 0x00ffffff #define NOTE_TRIGGER 0x01000000 /* Cause the event to be triggered for output. 
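 * For example (illustrative only), a registered EVFILT_USER event can be
 * fired without disturbing its stored udata via:
 *	EV_SET(&kev, ident, EVFILT_USER, EV_KEEPUDATA, NOTE_TRIGGER, 0, NULL);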
*/ /* * data/hint flags for EVFILT_{READ|WRITE}, shared with userspace */ #define NOTE_LOWAT 0x0001 /* low water mark */ #define NOTE_FILE_POLL 0x0002 /* behave like poll() */ /* * data/hint flags for EVFILT_VNODE, shared with userspace */ #define NOTE_DELETE 0x0001 /* vnode was removed */ #define NOTE_WRITE 0x0002 /* data contents changed */ #define NOTE_EXTEND 0x0004 /* size increased */ #define NOTE_ATTRIB 0x0008 /* attributes changed */ #define NOTE_LINK 0x0010 /* link count changed */ #define NOTE_RENAME 0x0020 /* vnode was renamed */ #define NOTE_REVOKE 0x0040 /* vnode access was revoked */ #define NOTE_OPEN 0x0080 /* vnode was opened */ #define NOTE_CLOSE 0x0100 /* file closed, fd did not allowed write */ #define NOTE_CLOSE_WRITE 0x0200 /* file closed, fd did allowed write */ #define NOTE_READ 0x0400 /* file was read */ /* * data/hint flags for EVFILT_PROC and EVFILT_PROCDESC, shared with userspace */ #define NOTE_EXIT 0x80000000 /* process exited */ #define NOTE_FORK 0x40000000 /* process forked */ #define NOTE_EXEC 0x20000000 /* process exec'd */ #define NOTE_PCTRLMASK 0xf0000000 /* mask for hint bits */ #define NOTE_PDATAMASK 0x000fffff /* mask for pid */ /* additional flags for EVFILT_PROC */ #define NOTE_TRACK 0x00000001 /* follow across forks */ #define NOTE_TRACKERR 0x00000002 /* could not track child */ #define NOTE_CHILD 0x00000004 /* am a child process */ /* additional flags for EVFILT_TIMER */ #define NOTE_SECONDS 0x00000001 /* data is seconds */ #define NOTE_MSECONDS 0x00000002 /* data is milliseconds */ #define NOTE_USECONDS 0x00000004 /* data is microseconds */ #define NOTE_NSECONDS 0x00000008 /* data is nanoseconds */ #define NOTE_ABSTIME 0x00000010 /* timeout is absolute */ struct knote; SLIST_HEAD(klist, knote); struct kqueue; TAILQ_HEAD(kqlist, kqueue); struct knlist { struct klist kl_list; void (*kl_lock)(void *); /* lock function */ void (*kl_unlock)(void *); void (*kl_assert_lock)(void *, int); void *kl_lockarg; /* argument passed to lock functions */ int kl_autodestroy; }; #ifdef _KERNEL /* * Flags for knote call */ #define KNF_LISTLOCKED 0x0001 /* knlist is locked */ #define KNF_NOKQLOCK 0x0002 /* do not keep KQ_LOCK */ #define KNOTE(list, hint, flags) knote(list, hint, flags) #define KNOTE_LOCKED(list, hint) knote(list, hint, KNF_LISTLOCKED) #define KNOTE_UNLOCKED(list, hint) knote(list, hint, 0) #define KNLIST_EMPTY(list) SLIST_EMPTY(&(list)->kl_list) /* * Flag indicating hint is a signal. Used by EVFILT_SIGNAL, and also * shared by EVFILT_PROC (all knotes attached to p->p_klist) */ #define NOTE_SIGNAL 0x08000000 /* * Hint values for the optional f_touch event filter. If f_touch is not set * to NULL and f_isfd is zero the f_touch filter will be called with the type * argument set to EVENT_REGISTER during a kevent() system call. It is also * called under the same conditions with the type argument set to EVENT_PROCESS * when the event has been triggered. */ #define EVENT_REGISTER 1 #define EVENT_PROCESS 2 struct filterops { int f_isfd; /* true if ident == filedescriptor */ int (*f_attach)(struct knote *kn); void (*f_detach)(struct knote *kn); int (*f_event)(struct knote *kn, long hint); void (*f_touch)(struct knote *kn, struct kevent *kev, u_long type); }; /* * An in-flux knote cannot be dropped from its kq while the kq is * unlocked. If the KN_SCAN flag is not set, a thread can only set * kn_influx when it is exclusive owner of the knote state, and can * modify kn_status as if it had the KQ lock. KN_SCAN must not be set * on a knote which is already in flux. 
* * kn_sfflags, kn_sdata, and kn_kevent are protected by the knlist lock. */ struct knote { SLIST_ENTRY(knote) kn_link; /* for kq */ SLIST_ENTRY(knote) kn_selnext; /* for struct selinfo */ struct knlist *kn_knlist; /* f_attach populated */ TAILQ_ENTRY(knote) kn_tqe; struct kqueue *kn_kq; /* which queue we are on */ struct kevent kn_kevent; void *kn_hook; int kn_hookid; int kn_status; /* protected by kq lock */ #define KN_ACTIVE 0x01 /* event has been triggered */ #define KN_QUEUED 0x02 /* event is on queue */ #define KN_DISABLED 0x04 /* event is disabled */ #define KN_DETACHED 0x08 /* knote is detached */ #define KN_MARKER 0x20 /* ignore this knote */ #define KN_KQUEUE 0x40 /* this knote belongs to a kq */ #define KN_SCAN 0x100 /* flux set in kqueue_scan() */ int kn_influx; int kn_sfflags; /* saved filter flags */ int64_t kn_sdata; /* saved data field */ union { struct file *p_fp; /* file data pointer */ struct proc *p_proc; /* proc pointer */ struct kaiocb *p_aio; /* AIO job pointer */ struct aioliojob *p_lio; /* LIO job pointer */ void *p_v; /* generic other pointer */ } kn_ptr; struct filterops *kn_fop; #define kn_id kn_kevent.ident #define kn_filter kn_kevent.filter #define kn_flags kn_kevent.flags #define kn_fflags kn_kevent.fflags #define kn_data kn_kevent.data #define kn_fp kn_ptr.p_fp }; struct kevent_copyops { void *arg; int (*k_copyout)(void *arg, struct kevent *kevp, int count); int (*k_copyin)(void *arg, struct kevent *kevp, int count); size_t kevent_size; }; struct thread; struct proc; struct knlist; struct mtx; struct rwlock; void knote(struct knlist *list, long hint, int lockflags); void knote_fork(struct knlist *list, int pid); struct knlist *knlist_alloc(struct mtx *lock); void knlist_detach(struct knlist *knl); void knlist_add(struct knlist *knl, struct knote *kn, int islocked); void knlist_remove(struct knlist *knl, struct knote *kn, int islocked); int knlist_empty(struct knlist *knl); void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), void (*kl_assert_lock)(void *, int)); void knlist_init_mtx(struct knlist *knl, struct mtx *lock); void knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock); void knlist_destroy(struct knlist *knl); void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn); #define knlist_clear(knl, islocked) \ knlist_cleardel((knl), NULL, (islocked), 0) #define knlist_delete(knl, td, islocked) \ knlist_cleardel((knl), (td), (islocked), 1) void knote_fdclose(struct thread *p, int fd); int kqfd_register(int fd, struct kevent *kev, struct thread *p, int mflag); int kqueue_add_filteropts(int filt, struct filterops *filtops); int kqueue_del_filteropts(int filt); void kqueue_drain_schedtask(void); #else /* !_KERNEL */ #include struct timespec; __BEGIN_DECLS int kqueue(void); int kevent(int kq, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_EVENT_H_ */ diff --git a/tests/sys/kqueue/libkqueue/user.c b/tests/sys/kqueue/libkqueue/user.c index 3844251ff4ba..1f66234c4cda 100644 --- a/tests/sys/kqueue/libkqueue/user.c +++ b/tests/sys/kqueue/libkqueue/user.c @@ -1,128 +1,188 @@ /* * Copyright (c) 2009 Mark Heily * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. 
* * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * $FreeBSD$ */ #include "common.h" static void add_and_delete(void) { const char *test_id = "kevent(EVFILT_USER, EV_ADD and EV_DELETE)"; struct kevent kev; test_begin(test_id); kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_ADD, 0, 0, NULL); test_no_kevents(); kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_DELETE, 0, 0, NULL); test_no_kevents(); success(); } static void event_wait(void) { const char *test_id = "kevent(EVFILT_USER, wait)"; struct kevent kev; test_begin(test_id); test_no_kevents(); /* Add the event, and then trigger it */ kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL); kevent_add(kqfd, &kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL); kev.fflags &= ~NOTE_FFCTRLMASK; kev.fflags &= ~NOTE_TRIGGER; kev.flags = EV_CLEAR; kevent_cmp(&kev, kevent_get(kqfd)); test_no_kevents(); success(); } +static void +event_wait_keepudata(void) +{ + const char *test_id = "kevent(EVFILT_USER, wait w/ EV_KEEPUDATA)"; + struct kevent kev; + + test_begin(test_id); + + test_no_kevents(); + + kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, &kev); + kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_KEEPUDATA, NOTE_TRIGGER, 0, + NULL); + + kev.fflags &= ~NOTE_FFCTRLMASK; + kev.fflags &= ~NOTE_TRIGGER; + kev.flags = EV_CLEAR; + kev.udata = &kev; + kevent_cmp(&kev, kevent_get(kqfd)); + + test_no_kevents(); + + success(); +} + + static void disable_and_enable(void) { const char *test_id = "kevent(EVFILT_USER, EV_DISABLE and EV_ENABLE)"; struct kevent kev; test_begin(test_id); test_no_kevents(); kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_ADD, 0, 0, NULL); kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_DISABLE, 0, 0, NULL); /* Trigger the event, but since it is disabled, nothing will happen. */ kevent_add(kqfd, &kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL); test_no_kevents(); kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_ENABLE, 0, 0, NULL); kevent_add(kqfd, &kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL); kev.flags = EV_CLEAR; kev.fflags &= ~NOTE_FFCTRLMASK; kev.fflags &= ~NOTE_TRIGGER; kevent_cmp(&kev, kevent_get(kqfd)); success(); } +static void +disable_and_enable_keepudata(void) +{ + const char *test_id = + "kevent(EVFILT_USER, EV_DISABLE and EV_ENABLE w/ EV_KEEPUDATA)"; + struct kevent kev; + + test_begin(test_id); + + test_no_kevents(); + + kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_ADD, 0, 0, &kev); + kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_DISABLE | EV_KEEPUDATA, 0, 0, + NULL); + + /* Trigger the event, but since it is disabled, nothing will happen. 
*/ + kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_KEEPUDATA, NOTE_TRIGGER, 0, NULL); + test_no_kevents(); + + kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_ENABLE | EV_KEEPUDATA, 0, 0, + NULL); + kevent_add(kqfd, &kev, 1, EVFILT_USER, EV_KEEPUDATA, NOTE_TRIGGER, 0, NULL); + + kev.flags = EV_CLEAR; + kev.fflags &= ~NOTE_FFCTRLMASK; + kev.fflags &= ~NOTE_TRIGGER; + kev.udata = &kev; + kevent_cmp(&kev, kevent_get(kqfd)); + + success(); +} + static void oneshot(void) { const char *test_id = "kevent(EVFILT_USER, EV_ONESHOT)"; struct kevent kev; test_begin(test_id); test_no_kevents(); kevent_add(kqfd, &kev, 2, EVFILT_USER, EV_ADD | EV_ONESHOT, 0, 0, NULL); puts(" -- event 1"); kevent_add(kqfd, &kev, 2, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL); kev.flags = EV_ONESHOT; kev.fflags &= ~NOTE_FFCTRLMASK; kev.fflags &= ~NOTE_TRIGGER; kevent_cmp(&kev, kevent_get(kqfd)); test_no_kevents(); success(); } void test_evfilt_user(void) { kqfd = kqueue(); add_and_delete(); event_wait(); + event_wait_keepudata(); disable_and_enable(); + disable_and_enable_keepudata(); oneshot(); /* TODO: try different fflags operations */ close(kqfd); }
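/*
 * Standalone illustration (not part of the patch itself): a minimal
 * userspace sketch of EV_KEEPUDATA with EVFILT_USER, mirroring the
 * event_wait_keepudata() test above.  Assumes a system whose
 * <sys/event.h> already defines EV_KEEPUDATA (i.e. with this change applied).
 */
#include <sys/types.h>
#include <sys/event.h>

#include <err.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	void *token = &kev;		/* arbitrary udata value to preserve */
	int kq, n;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");

	/* Register a user event and stash a udata pointer with it. */
	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, token);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent(EV_ADD)");

	/* Trigger it without supplying udata; EV_KEEPUDATA keeps the old one. */
	EV_SET(&kev, 1, EVFILT_USER, EV_KEEPUDATA, NOTE_TRIGGER, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent(NOTE_TRIGGER)");

	/* The delivered event still carries the originally registered udata. */
	n = kevent(kq, NULL, 0, &kev, 1, NULL);
	if (n != 1 || kev.udata != token)
		errx(1, "udata was not preserved");

	close(kq);
	return (0);
}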