diff --git a/lib/libc/gen/Makefile.inc b/lib/libc/gen/Makefile.inc --- a/lib/libc/gen/Makefile.inc +++ b/lib/libc/gen/Makefile.inc @@ -89,6 +89,7 @@ glob.c \ glob-compat11.c \ initgroups.c \ + inotify.c \ isatty.c \ isinf.c \ isnan.c \ diff --git a/lib/libc/gen/Symbol.map b/lib/libc/gen/Symbol.map --- a/lib/libc/gen/Symbol.map +++ b/lib/libc/gen/Symbol.map @@ -460,6 +460,9 @@ execvpe; fts_open_b; glob_b; + inotify_add_watch; + inotify_init; + inotify_init1; psiginfo; rtld_get_var; rtld_set_var; diff --git a/lib/libc/gen/inotify.c b/lib/libc/gen/inotify.c new file mode 100644 --- /dev/null +++ b/lib/libc/gen/inotify.c @@ -0,0 +1,48 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Klara, Inc. + */ + +#include "namespace.h" +#include +#include +#include +#include "un-namespace.h" +#include "libc_private.h" + +/* + * Provide compatibility with libinotify, which uses different values for these + * flags. + */ +#define IN_NONBLOCK_OLD 0x80000 +#define IN_CLOEXEC_OLD 0x00800 + +int +inotify_add_watch(int fd, const char *pathname, uint32_t mask) +{ + return (inotify_add_watch_at(fd, AT_FDCWD, pathname, mask)); +} + +int +inotify_init1(int flags) +{ + struct specialfd_inotify args; + + if ((flags & IN_NONBLOCK_OLD) != 0) { + flags &= ~IN_NONBLOCK_OLD; + flags |= IN_NONBLOCK; + } + if ((flags & IN_CLOEXEC_OLD) != 0) { + flags &= ~IN_CLOEXEC_OLD; + flags |= IN_CLOEXEC; + } + args.flags = flags; + return (__sys___specialfd(SPECIALFD_INOTIFY, &args, sizeof(args))); +} + +int +inotify_init(void) +{ + return (inotify_init1(0)); +} diff --git a/lib/libsys/Makefile.sys b/lib/libsys/Makefile.sys --- a/lib/libsys/Makefile.sys +++ b/lib/libsys/Makefile.sys @@ -224,6 +224,7 @@ getsockopt.2 \ gettimeofday.2 \ getuid.2 \ + inotify.2 \ intro.2 \ ioctl.2 \ issetugid.2 \ @@ -448,6 +449,11 @@ MLINKS+=getsockopt.2 setsockopt.2 MLINKS+=gettimeofday.2 settimeofday.2 MLINKS+=getuid.2 geteuid.2 +MLINKS+=inotify.2 inotify_init.2 \ + inotify.2 inotify_init1.2 \ + inotify.2 inotify_add_watch.2 \ + inotify.2 inotify_add_watch_at.2 \ + inotify.2 inotify_rm_watch.2 MLINKS+=intro.2 errno.2 MLINKS+=jail.2 jail_attach.2 \ jail.2 jail_get.2 \ diff --git a/lib/libsys/Symbol.sys.map b/lib/libsys/Symbol.sys.map --- a/lib/libsys/Symbol.sys.map +++ b/lib/libsys/Symbol.sys.map @@ -381,6 +381,8 @@ exterrctl; fchroot; getrlimitusage; + inotify_add_watch_at; + inotify_rm_watch; kcmp; setcred; }; diff --git a/lib/libsys/inotify.2 b/lib/libsys/inotify.2 new file mode 100644 --- /dev/null +++ b/lib/libsys/inotify.2 @@ -0,0 +1,379 @@ +.\" +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2025 Klara, Inc. +.\" +.Dd May 19, 2025 +.Dt INOTIFY 2 +.Os +.Sh NAME +.Nm inotify_init , +.Nm inotify_init1 , +.Nm inotify_add_watch , +.Nm inotify_add_watch_at , +.Nm inotify_rm_watch +.Nd monitor file system events +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/inotify.h +.Ft int +.Fo inotify_init +.Fc +.Ft int +.Fo inotify_init1 +.Fa int flags +.Fc +.Ft int +.Fo inotify_add_watch +.Fa int fd +.Fa const char *pathname +.Fa uint32_t mask +.Fc +.Ft int +.Fo inotify_add_watch_at +.Fa int fd +.Fa int dfd +.Fa const char *pathname +.Fa uint32_t mask +.Fc +.Ft int +.Fo inotify_rm_watch +.Fa int fd +.Fa uint32_t wd +.Fc +.Bd -literal +struct inotify_event { + int wd; /* Watch descriptor */ + uint32_t mask; /* Event and flags */ + uint32_t cookie; /* Unique ID which links rename events */ + uint32_t len; /* Name field size, including nul bytes */ + char name[0]; /* Filename (nul-terminated) */ +}; +.Ed +.Sh DESCRIPTION +The inotify system calls provide an interface to monitor file system events. +They aim to be compatible with the Linux inotify interface. +The provided functionality is similar to the +.Dv EVFILT_VNODE +filter of the +.Xr kevent 2 +system call, but further allows monitoring of a directory without needing to +open each object in that directory. +This avoids races and reduces the number of file descriptors needed to monitor +a large file hierarchy. +.Pp +inotify allows one or more file system objects, generally files or directories, +to be watched for events, such as file open or close. +Watched objects are associated with a file descriptor returned +by +.Fn inotify_init +or +.Fn inotify_init1 . +When an event occurs, a record describing the event becomes available for +reading from the inotify file descriptor. +Each inotify descriptor thus refers to a queue of events waiting to be read. +inotify descriptors are inherited across +.Xr fork 2 +calls and may be passed to other processes via +.Xr unix 4 +sockets. +.Pp +The +.Fn inotify_init1 +system call accepts two flags. +The +.Dv IN_NONBLOCK +flag causes the inotify descriptor to be opened in non-blocking mode, such that +.Xr read 2 +calls will not block if no records are available to consume, and will instead +return +.Er EWOULDBLOCK . +The +.Dv IN_CLOEXEC +flag causes the inotify descriptor to be closed automatically when +.Xr execve 2 +is called. +.Pp +To watch a file or directory, the +.Fn inotify_add_watch +or +.Fn inotify_add_watch_at +system calls must be used. +They take a path and a mask of events to watch for, and return a +.Dq watch descriptor , +a non-negative integer which uniquely identifies the watched object within the +inotify descriptor. +.Pp +The +.Fn inotify_rm_watch +system call removes a watch from an inotify descriptor. +.Pp +When watching a directory, objects within the directory are monitored for events +as well as the directory itself. +A record describing an inotify event consists of a +.Dq struct inotify_event +followed by the name of the object in the directory being watched. +If the watched object itself generates an event, no name is present. +Extra nul bytes may follow the file name in order to provide alignment for a +subsequent record. +.Pp +The following events are defined: +.Bl -tag -width IN_CLOSE_NOWRITE +.It Dv IN_ACCESS +A file's contents were accessed, e.g., by +.Xr read 2 +.Xr copy_file_range 2 , +.Xr sendfile 2 , +or +.Xr getdirentries 2 . +.It Dv IN_ATTRIB +A file's metadata was changed, e.g., by +.Xr chmod 2 +or +.Xr unlink 2 . +.It Dv IN_CLOSE_WRITE +A file that was previously opened for writing was closed. +.It Dv IN_CLOSE_NOWRITE +A file that was previously opened read-only was closed. +.It Dv IN_CREATE +A file within a watched directory was created, e.g., by +.Xr open 2 , +.Xr mkdir 2 , +.Xr symlink 2 , +.Xr mknod 2 , +or +.Xr bind 2 . +.It Dv IN_DELETE +A file or directory within a watched directory was removed. +.It Dv IN_DELETE_SELF +The watched file or directory itself was deleted. +This event is generated only when the link count of the file drops +to zero. +.It Dv IN_MODIFY +A file's contents were modified, e.g., by +.Xr write 2 +or +.Xr copy_file_range 2 . +.It Dv IN_MOVE_SELF +The watched file or directory itself was renamed. +.It Dv IN_MOVED_FROM +A file or directory was moved from a watched directory. +.It Dv IN_MOVED_TO +A file or directory was moved into a watched directory. +A +.Xr rename 2 +call thus may generate two events, one for the old name and one for the new +name. +These are linked together by the +.Ar cookie +field in the inotify record, which can be compared to link the two records +to the same event. +.It Dv IN_OPEN +A file was opened. +.El +.Pp +Some additional flags may be set in inotify event records: +.Bl -tag -width IN_Q_OVERFLOW +.It Dv IN_IGNORED +When a watch is removed from a file, for example because it was created with the +.Dv IN_ONESHOT +flag, the file was deleted, or the watch was explicitly removed with +.Xr inotify_rm_watch 2 , +an event with this mask is generated to indicate that the watch will not +generate any more events. +Once this event is generated, the watch is automatically removed, and in +particular should not be removed manually with +.Xr inotify_rm_watch 2 . +.It Dv IN_ISDIR +When the subject of an event is a directory, this flag is set in the +.Ar mask +.It Dv IN_Q_OVERFLOW +One or more events were dropped, for example because of a kernel memory allocation +failure or because the event queue size hit a limit. +.It Dv IN_UNMOUNT +The filesystem containing the watched object was unmounted. +.El +.Pp +A number of flags may also be specified in the +.Ar mask +given to +.Fn inotify_add_watch +and +.Fn inotify_add_watch_at : +.Bl -tag -width IN_DONT_FOLLOW +.It Dv IN_DONT_FOLLOW +If +.Ar pathname +is a symbolic link, do not follow it. +.It Dv IN_EXCL_UNLINK +This currently has no effect, see the +.Sx BUGS +section. +.In Dv IN_MASK_ADD +When adding a watch to an object, and that object is already watched by the +same inotify descriptor, by default the mask of the existing watch is +overwritten. +When +.Dv IN_MASK_ADD +is specified, the mask of the existing watch is instead logically ORed with +the new mask. +.In Dv IN_MASK_CREATE +When +.Fn inotify_add watch +is used to add a watch to an object, +.Dv IN_MASK_CREATE +is specified, and that object is already watched by the same inotify descriptor, +return an error instead of updating the existing watch. +.In Dv IN_ONESHOT +Monitor the object for a single event, after which the watch is automatically +removed. +As part of removal, a +.Dv IN_IGNORED +event is generated. +.In Dv IN_ONLYDIR +When creating a watch, fail with +.Er ENOTDIR +if the path does not refer to a directory. +.El +.Sh SYSCTL VARIABLES +The following variables are available as both +.Xr sysctl 8 +variables and +.Xr loader 8 +tunables: +.Bl -tag -width 15 +.It Va vfs.inotify.max_events +The maximum number of inotify records that can be queued for a single +inotify descriptor. +Records in excess of this limit are discarded, and a single event with +mask equal to +.Dv IN_Q_OVERFLOW +will be present in the queue. +.It Va vfs.inotify.max_user_instances +The maximum number of inotify descriptors that can be created by a single +user. +.It Va vfs.inotify.max_user_watches +The maximum number of inotify watches per user. +.El +.Sh EXAMPLES +See the example program in +.Pa /usr/share/examples/inotify/inotify.c . +.Sh ERRORS +The +.Fn inotify_init +and +.Fn inotify_init1 +functions will fail if: +.Bl -tag -width Er +.It Bq Er ENFILE +The system limit on the total number of open files has been reached. +.It Bq Er EMFILE +A per-process limit on the number of open files has been reached. +.It Bq Er EMFILE +The system limit on the number of inotify descriptors has been reached. +.It Bq Er EINVAL +An unrecognized flag was passed to +.Fn inotify_init1 . +.El +.Pp +The +.Fn inotify_add_watch +and +.Fn inotify_add_watch_at +system calls will fail if: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Ar fd +parameter is not a valid file descriptor. +.It Bq Er EINVAL +The +.Ar fd +parameter is not an inotify descriptor. +.It Bq Er EINVAL +The +.Ar mask +parameter does not specify an event, or +the +.Dv IN_MASK_CREATE +and +.Dv IN_MASK_ADD +flags are both set, or an unrecognized flag was passed. +.It Bq Er ENOTDIR +The +.Ar pathname +parameter refers to a file that is not a directory, and the +.Dv IN_ONLYDIR +flag was specified. +.It Bq Er ENOSPC +The per-user limit on the total number of inotify watches has been reached. +.It Bq Er ECAPMODE +The process is in capability mode and +.Fn inotify_add_watch +was called, or +.Fn inotify_add_watch_at +was called with +.Dv AT_FDCWD +as the directory file descriptor +.Ar dfd . +.It Bq Er ENOTCAPABLE +The process is in capability mode and +.Ar pathname +contains a +.Dq .. +component leading to a directory outside the directory hierarchy specified +by +.Ar dfd . +.El +.Pp +The +.Fn inotify_rm_watch +system call will fail if: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Ar fd +parameter is not a valid file descriptor. +.It Bq Er EINVAL +The +.Ar fd +parameter is not an inotify descriptor. +.It Bq Er EINVAL +The +.Ar wd +parameter is not a valid watch descriptor. +.El +.Sh SEE ALSO +.Xr kevent 2 , +.Xr capsicum 4 +.Sh STANDARDS +The +.Nm +interface originates from Linux and is non-standard. +This implementation aims to be compatible with that of Linux and is based +on the documentation available at +.Pa https://man7.org/linux/man-pages/man7/inotify.7.html . +.Sh HISTORY +The inotify system calls first appeared in +.Fx 15.0 . +.Sh BUGS +If a file in a watched directory has multiple hard links, +an access via any hard link for that file will generate an event, even +if the accessed link belongs to an unwatched directory. +This is not the case for the Linux implementation, where only accesses +via the hard link in the watched directory will generate an event. +.Pp +If a watched directory contains multiple hard links of a file, an event +on one of the hard links will generate an inotify record for each link +in the directory. +.Pp +When a file is unlinked, no more events will be generated for that file, +even if it continues to be accessed. +By default, the Linux implementation will continue to generate events in +this case. +Thus, the +.Fx +implementation behaves as though +.Dv IN_EXCL_UNLINK +is always set. diff --git a/share/examples/Makefile b/share/examples/Makefile --- a/share/examples/Makefile +++ b/share/examples/Makefile @@ -15,6 +15,7 @@ find_interface \ flua \ indent \ + inotify \ ipfw \ jails \ kld \ diff --git a/share/examples/inotify/Makefile b/share/examples/inotify/Makefile new file mode 100644 --- /dev/null +++ b/share/examples/inotify/Makefile @@ -0,0 +1,6 @@ +PROG= inotify +MAN= + +LIBADD= xo + +.include diff --git a/share/examples/inotify/inotify.c b/share/examples/inotify/inotify.c new file mode 100644 --- /dev/null +++ b/share/examples/inotify/inotify.c @@ -0,0 +1,172 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Klara, Inc. + */ + +/* + * A simple program to demonstrate inotify. Given one or more paths, it watches + * all events on those paths and prints them to standard output. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +static void +usage(void) +{ + xo_errx(1, "usage: inotify [ ...]"); +} + +static const char * +ev2str(uint32_t event) +{ + switch (event & IN_ALL_EVENTS) { + case IN_ACCESS: + return ("IN_ACCESS"); + case IN_ATTRIB: + return ("IN_ATTRIB"); + case IN_CLOSE_WRITE: + return ("IN_CLOSE_WRITE"); + case IN_CLOSE_NOWRITE: + return ("IN_CLOSE_NOWRITE"); + case IN_CREATE: + return ("IN_CREATE"); + case IN_DELETE: + return ("IN_DELETE"); + case IN_DELETE_SELF: + return ("IN_DELETE_SELF"); + case IN_MODIFY: + return ("IN_MODIFY"); + case IN_MOVE_SELF: + return ("IN_MOVE_SELF"); + case IN_MOVED_FROM: + return ("IN_MOVED_FROM"); + case IN_MOVED_TO: + return ("IN_MOVED_TO"); + case IN_OPEN: + return ("IN_OPEN"); + default: + switch (event) { + case IN_IGNORED: + return ("IN_IGNORED"); + case IN_Q_OVERFLOW: + return ("IN_Q_OVERFLOW"); + case IN_UNMOUNT: + return ("IN_UNMOUNT"); + } + warnx("unknown event %#x", event); + assert(0); + } +} + +static void +set_handler(int kq, int sig) +{ + struct kevent kev; + + (void)signal(sig, SIG_IGN); + EV_SET(&kev, sig, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL); + if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0) + xo_err(1, "kevent"); +} + +int +main(int argc, char **argv) +{ + struct inotify_event *iev, *iev1; + struct kevent kev; + size_t ievsz; + int ifd, kq; + + argc = xo_parse_args(argc, argv); + if (argc < 2) + usage(); + argc--; + argv++; + + ifd = inotify_init1(IN_NONBLOCK); + if (ifd < 0) + xo_err(1, "inotify"); + for (int i = 0; i < argc; i++) { + int wd; + + wd = inotify_add_watch(ifd, argv[i], IN_ALL_EVENTS); + if (wd < 0) + xo_err(1, "inotify_add_watch(%s)", argv[i]); + } + + xo_set_version("1"); + xo_open_list("events"); + + kq = kqueue(); + if (kq < 0) + xo_err(1, "kqueue"); + + /* + * Handle signals in the event loop so that we can close the xo list. + */ + set_handler(kq, SIGINT); + set_handler(kq, SIGTERM); + set_handler(kq, SIGHUP); + set_handler(kq, SIGQUIT); + + /* + * Monitor the inotify descriptor for events. + */ + EV_SET(&kev, ifd, EVFILT_READ, EV_ADD, 0, 0, NULL); + if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0) + xo_err(1, "kevent"); + + ievsz = sizeof(*iev) + NAME_MAX + 1; + iev = malloc(ievsz); + if (iev == NULL) + err(1, "malloc"); + + for (;;) { + ssize_t n; + const char *ev; + + if (kevent(kq, NULL, 0, &kev, 1, NULL) < 0) + xo_err(1, "kevent"); + if (kev.filter == EVFILT_SIGNAL) + break; + + n = read(ifd, iev, ievsz); + if (n < 0) + xo_err(1, "read"); + assert(n >= (ssize_t)sizeof(*iev)); + + for (iev1 = iev; n > 0;) { + assert(n >= (ssize_t)sizeof(*iev1)); + + ev = ev2str(iev1->mask); + xo_open_instance("event"); + xo_emit("{:wd/%3d} {:event/%16s} {:name/%s}\n", + iev1->wd, ev, iev1->name); + xo_close_instance("event"); + + n -= sizeof(*iev1) + iev1->len; + iev1 = (struct inotify_event *)(void *) + ((char *)iev1 + sizeof(*iev1) + iev1->len); + } + (void)xo_flush(); + } + + xo_close_list("events"); + + if (xo_finish() < 0) + xo_err(1, "stdout"); + exit(0); +} diff --git a/share/man/man4/rights.4 b/share/man/man4/rights.4 --- a/share/man/man4/rights.4 +++ b/share/man/man4/rights.4 @@ -30,7 +30,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd May 1, 2024 +.Dd May 22, 2025 .Dt RIGHTS 4 .Os .Sh NAME @@ -319,6 +319,14 @@ .It Dv CAP_GETSOCKOPT Permit .Xr getsockopt 2 . +.It Dv CAP_INOTIFY_ADD +Permit +.Xr inotify_add_watch 2 +and +.Xr inotify_add_watch_at 2 . +.It Dv CAP_INOTIFY_RM +Permit +.Xr inotify_rm_watch 2 . .It Dv CAP_IOCTL Permit .Xr ioctl 2 . diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile --- a/share/man/man9/Makefile +++ b/share/man/man9/Makefile @@ -435,6 +435,7 @@ VOP_GETEXTATTR.9 \ VOP_GETPAGES.9 \ VOP_INACTIVE.9 \ + VOP_INOTIFY.9 \ VOP_IOCTL.9 \ VOP_LINK.9 \ VOP_LISTEXTATTR.9 \ @@ -2461,6 +2462,7 @@ MLINKS+=VOP_FSYNC.9 VOP_FDATASYNC.9 MLINKS+=VOP_GETPAGES.9 VOP_PUTPAGES.9 MLINKS+=VOP_INACTIVE.9 VOP_RECLAIM.9 +MLINKS+=VOP_INOTIFY.9 VOP_INOTIFY_ADD_WATCH.9 MLINKS+=VOP_LOCK.9 vn_lock.9 \ VOP_LOCK.9 VOP_ISLOCKED.9 \ VOP_LOCK.9 VOP_UNLOCK.9 diff --git a/share/man/man9/VOP_INOTIFY.9 b/share/man/man9/VOP_INOTIFY.9 new file mode 100644 --- /dev/null +++ b/share/man/man9/VOP_INOTIFY.9 @@ -0,0 +1,60 @@ +.\"- +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2025 Klara, Inc. +.\" +.Dd May 27, 2025 +.Dt VOP_INOTIFY 9 +.Os +.Sh NAME +.Nm VOP_INOTIFY +.Nd vnode inotify interface +.Sh SYNOPSIS +.In sys/param.h +.In sys/vnode.h +.Ft int +.Fo VOP_INOTIFY +.Fa struct vnode *vp +.Fa struct vnode *dvp +.Fa struct componentname *cnp +.Fa int event +.Fa uint32_t cookie +.Fc +.Ft int +.Fo VOP_INOTIFY_ADD_WATCH +.Fa struct vnode *vp +.Fa struct inotify_softc *sc +.Fa uint32_t mask +.Fa uint32_t *wdp +.Fa struct thread *td +.Fc +.Sh DESCRIPTION +The +.Fn VOP_INOTIFY +operation notifies the +.Xr inotify 2 +subsystem of a file system event on a vnode. +The +.Fa dvp +and +.Fa cnp +arguments are optional and are only used to obtain a file name for the event. +If they are omitted, the inotify subsystem will use the file name cache to +find a name for the vnode, but this is more expensive. +.Pp +The +.Fn VOP_INOTIFY_ADD_WATCH +operation is for internal use by the inotify subsystem to add a watch to a +vnode. +.Sh LOCKS +The +.Fn VOP_INOTIFY +operation does not assume any particular vnode lock state. +The +.Fn VOP_INOTIFY_ADD_WATCH +operation should be called with the vnode locked. +.Sh RETURN VALUES +Zero is returned on success, otherwise an error code is returned. +.Sh SEE ALSO +.Xr inotify 2 , +.Xr vnode 9 diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h --- a/sys/bsm/audit_kevents.h +++ b/sys/bsm/audit_kevents.h @@ -663,6 +663,7 @@ #define AUE_FSPACECTL 43269 /* FreeBSD-specific. */ #define AUE_TIMERFD 43270 /* FreeBSD/Linux. */ #define AUE_SETCRED 43271 /* FreeBSD-specific. */ +#define AUE_INOTIFY 43272 /* FreeBSD/Linux. */ /* * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the diff --git a/sys/conf/files b/sys/conf/files --- a/sys/conf/files +++ b/sys/conf/files @@ -3979,6 +3979,7 @@ kern/vfs_extattr.c standard kern/vfs_hash.c standard kern/vfs_init.c standard +kern/vfs_inotify.c standard kern/vfs_lookup.c standard kern/vfs_mount.c standard kern/vfs_mountroot.c standard diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c --- a/sys/fs/nullfs/null_subr.c +++ b/sys/fs/nullfs/null_subr.c @@ -245,6 +245,10 @@ vp->v_object = lowervp->v_object; vn_irflag_set(vp, VIRF_PGREAD); } + if ((vn_irflag_read(lowervp) & VIRF_INOTIFY) != 0) + vn_irflag_set(vp, VIRF_INOTIFY); + if ((vn_irflag_read(lowervp) & VIRF_INOTIFY_PARENT) != 0) + vn_irflag_set(vp, VIRF_INOTIFY_PARENT); if (lowervp == MOUNTTONULLMOUNT(mp)->nullm_lowerrootvp) vp->v_vflag |= VV_ROOT; diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c --- a/sys/fs/nullfs/null_vnops.c +++ b/sys/fs/nullfs/null_vnops.c @@ -189,6 +189,26 @@ SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW, &null_bug_bypass, 0, ""); +/* + * Synchronize inotify flags with the lower vnode: + * - If the upper vnode has the flag set and the lower does not, then the lower + * vnode is unwatched and the upper vnode does not need to go through + * VOP_INOTIFY. + * - If the lower vnode is watched, then the upper vnode should go through + * VOP_INOTIFY, so copy the flag up. + */ +static void +null_copy_inotify(struct vnode *vp, struct vnode *lvp, short flag) +{ + if ((vn_irflag_read(vp) & flag) != 0) { + if (__predict_false((vn_irflag_read(lvp) & flag) == 0)) + vn_irflag_unset(vp, flag); + } else if ((vn_irflag_read(lvp) & flag) != 0) { + if (__predict_false((vn_irflag_read(vp) & flag) == 0)) + vn_irflag_set(vp, flag); + } +} + /* * This is the 10-Apr-92 bypass routine. * This version has been optimized for speed, throwing away some @@ -305,7 +325,10 @@ lvp = *(vps_p[i]); /* - * Get rid of the transient hold on lvp. + * Get rid of the transient hold on lvp. Copy inotify + * flags up in case something is watching the lower + * layer. + * * If lowervp was unlocked during VOP * operation, nullfs upper vnode could have * been reclaimed, which changes its v_vnlock @@ -314,6 +337,10 @@ * upper (reclaimed) vnode. */ if (lvp != NULLVP) { + null_copy_inotify(old_vps[i], lvp, + VIRF_INOTIFY); + null_copy_inotify(old_vps[i], lvp, + VIRF_INOTIFY_PARENT); if (VOP_ISLOCKED(lvp) == LK_EXCLUSIVE && old_vps[i]->v_vnlock != lvp->v_vnlock) { VOP_UNLOCK(lvp); diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -26,7 +26,6 @@ * SUCH DAMAGE. */ -#include #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" @@ -44,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -1883,8 +1883,10 @@ * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); - if (error == 0) + if (error == 0) { + INOTIFY(vp, IN_OPEN); imgp->opened = true; + } return (error); } @@ -1893,6 +1895,7 @@ { if (imgp->opened) { VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); + INOTIFY(imgp->vp, IN_CLOSE); imgp->opened = false; } if (imgp->textset) { diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -1637,6 +1637,12 @@ if (uip->ui_pipecnt != 0) printf("freeing uidinfo: uid = %d, pipecnt = %ld\n", uip->ui_uid, uip->ui_pipecnt); + if (uip->ui_inotifycnt != 0) + printf("freeing uidinfo: uid = %d, inotifycnt = %ld\n", + uip->ui_uid, uip->ui_inotifycnt); + if (uip->ui_inotifywatchcnt != 0) + printf("freeing uidinfo: uid = %d, inotifywatchcnt = %ld\n", + uip->ui_uid, uip->ui_inotifywatchcnt); free(uip, M_UIDINFO); } @@ -1742,6 +1748,21 @@ return (chglimit(uip, &uip->ui_pipecnt, diff, max, "pipecnt")); } +int +chginotifycnt(struct uidinfo *uip, int diff, rlim_t max) +{ + + return (chglimit(uip, &uip->ui_inotifycnt, diff, max, "inotifycnt")); +} + +int +chginotifywatchcnt(struct uidinfo *uip, int diff, rlim_t max) +{ + + return (chglimit(uip, &uip->ui_inotifywatchcnt, diff, max, + "inotifywatchcnt")); +} + static int sysctl_kern_proc_rlimit_usage(SYSCTL_HANDLER_ARGS) { diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c --- a/sys/kern/kern_sendfile.c +++ b/sys/kern/kern_sendfile.c @@ -27,12 +27,12 @@ * SUCH DAMAGE. */ -#include #include "opt_kern_tls.h" #include #include #include +#include #include #include #include @@ -1242,6 +1242,8 @@ */ if (error == 0) { td->td_retval[0] = 0; + if (sbytes > 0 && vp != NULL) + INOTIFY(vp, IN_ACCESS); } if (sent != NULL) { (*sent) = sbytes; diff --git a/sys/kern/subr_capability.c b/sys/kern/subr_capability.c --- a/sys/kern/subr_capability.c +++ b/sys/kern/subr_capability.c @@ -74,6 +74,10 @@ CAP_RIGHTS_INITIALIZER(CAP_GETSOCKOPT); const cap_rights_t cap_getsockname_rights = CAP_RIGHTS_INITIALIZER(CAP_GETSOCKNAME); +const cap_rights_t cap_inotify_add_rights = + CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_ADD); +const cap_rights_t cap_inotify_rm_rights = + CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_RM); const cap_rights_t cap_ioctl_rights = CAP_RIGHTS_INITIALIZER(CAP_IOCTL); const cap_rights_t cap_listen_rights = CAP_RIGHTS_INITIALIZER(CAP_LISTEN); const cap_rights_t cap_linkat_source_rights = diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -42,11 +42,12 @@ #include #include #include +#include #include #include #include #include -#include +#include #include #include #include @@ -939,7 +940,6 @@ kern_specialfd(struct thread *td, int type, void *arg) { struct file *fp; - struct specialfd_eventfd *ae; int error, fd, fflags; fflags = 0; @@ -948,12 +948,22 @@ return (error); switch (type) { - case SPECIALFD_EVENTFD: + case SPECIALFD_EVENTFD: { + struct specialfd_eventfd *ae; + ae = arg; if ((ae->flags & EFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; error = eventfd_create_file(td, fp, ae->initval, ae->flags); break; + } + case SPECIALFD_INOTIFY: { + struct specialfd_inotify *si; + + si = arg; + error = inotify_create_file(td, fp, si->flags, &fflags); + break; + } default: error = EINVAL; break; @@ -970,11 +980,12 @@ int sys___specialfd(struct thread *td, struct __specialfd_args *args) { - struct specialfd_eventfd ae; int error; switch (args->type) { - case SPECIALFD_EVENTFD: + case SPECIALFD_EVENTFD: { + struct specialfd_eventfd ae; + if (args->len != sizeof(struct specialfd_eventfd)) { error = EINVAL; break; @@ -989,6 +1000,20 @@ } error = kern_specialfd(td, args->type, &ae); break; + } + case SPECIALFD_INOTIFY: { + struct specialfd_inotify si; + + if (args->len != sizeof(si)) { + error = EINVAL; + break; + } + error = copyin(args->req, &si, sizeof(si)); + if (error != 0) + break; + error = kern_specialfd(td, args->type, &si); + break; + } default: error = EINVAL; break; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -3356,4 +3356,19 @@ _In_reads_bytes_(4) void *ptr ); } +593 AUE_INOTIFY STD|CAPENABLED { + int inotify_add_watch_at( + int fd, + int dfd, + _In_z_ const char *path, + uint32_t mask + ); + } +594 AUE_INOTIFY STD|CAPENABLED { + int inotify_rm_watch( + int fd, + int wd + ); + } + ; vim: syntax=off diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -2636,6 +2637,14 @@ atomic_thread_fence_rel(); atomic_store_ptr(&dvp->v_cache_dd, ncp); } else if (vp != NULL) { + /* + * Take the slow path in INOTIFY(). This flag will be lazily + * cleared by cache_vop_inotify() once all directories referring + * to vp are unwatched. + */ + if (__predict_false((vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) + vn_irflag_set_cond(vp, VIRF_INOTIFY_PARENT); + /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the @@ -4061,6 +4070,56 @@ return (error); } +void +cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie) +{ + struct mtx *vlp; + struct namecache *ncp; + int isdir; + bool logged, self; + + isdir = vp->v_type == VDIR ? IN_ISDIR : 0; + self = (vn_irflag_read(vp) & VIRF_INOTIFY) != 0 && + (vp->v_type != VDIR || (event & ~_IN_DIR_EVENTS) != 0); + + if (self) { + int selfevent; + + if (event == _IN_ATTRIB_LINKCOUNT) + selfevent = IN_ATTRIB; + else + selfevent = event; + inotify_log(vp, NULL, 0, selfevent | isdir, cookie); + } + if ((event & IN_ALL_EVENTS) == 0) + return; + + logged = false; + vlp = VP2VNODELOCK(vp); + mtx_lock(vlp); + TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) { + if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) + continue; + if ((vn_irflag_read(ncp->nc_dvp) & VIRF_INOTIFY) != 0) { + /* + * XXX-MJ if the vnode has two links in the same + * dir, we'll log the same event twice. + */ + inotify_log(ncp->nc_dvp, ncp->nc_name, ncp->nc_nlen, + event | isdir, cookie); + logged = true; + } + } + if (!logged && (vn_irflag_read(vp) & VIRF_INOTIFY_PARENT) != 0) { + /* + * We didn't find a watched directory that contains this vnode, + * so stop calling VOP_INOTIFY for operations on the vnode. + */ + vn_irflag_unset(vp, VIRF_INOTIFY_PARENT); + } + mtx_unlock(vlp); +} + #ifdef DDB static void db_print_vpath(struct vnode *vp) diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -119,6 +120,8 @@ .vop_getwritemount = vop_stdgetwritemount, .vop_inactive = VOP_NULL, .vop_need_inactive = vop_stdneed_inactive, + .vop_inotify = vop_stdinotify, + .vop_inotify_add_watch = vop_stdinotify_add_watch, .vop_ioctl = vop_stdioctl, .vop_kqfilter = vop_stdkqfilter, .vop_islocked = vop_stdislocked, @@ -1305,6 +1308,20 @@ return (1); } +int +vop_stdinotify(struct vop_inotify_args *ap) +{ + vn_inotify(ap->a_vp, ap->a_dvp, ap->a_cnp, ap->a_event, ap->a_cookie); + return (0); +} + +int +vop_stdinotify_add_watch(struct vop_inotify_add_watch_args *ap) +{ + return (vn_inotify_add_watch(ap->a_vp, ap->a_sc, ap->a_mask, + ap->a_wdp, ap->a_td)); +} + int vop_stdioctl(struct vop_ioctl_args *ap) { diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c new file mode 100644 --- /dev/null +++ b/sys/kern/vfs_inotify.c @@ -0,0 +1,949 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Klara, Inc. + */ + +#include "opt_ktrace.h" + +#include +#include +#include +#include +#define EXTERR_CATEGORY EXTERR_CAT_INOTIFY +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +uint32_t inotify_rename_cookie; + +static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "inotify configuration"); + +static int inotify_max_queued_events = 16384; +SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN, + &inotify_max_queued_events, 0, + "Maximum number of events to queue on an inotify descriptor"); + +static int inotify_max_user_instances = 1024; +SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN, + &inotify_max_user_instances, 0, + "Maximum number of inotify descriptors per user"); + +static int inotify_max_user_watches = 128 * 1024; +SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN, + &inotify_max_user_watches, 0, + "Maximum number of inotify watches per user"); + +static SYSCTL_NODE(_debug, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "inotify debugging"); + +static int inotify_coalesce = 1; +SYSCTL_INT(_debug_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN, + &inotify_coalesce, 0, + "Coalesce inotify events when possible"); + +static int inotify_foo = 0; +SYSCTL_INT(_debug_inotify, OID_AUTO, foo, CTLFLAG_RWTUN, + &inotify_foo, 0, + "Debugging variable for inotify"); + +static fo_rdwr_t inotify_read; +static fo_ioctl_t inotify_ioctl; +static fo_poll_t inotify_poll; +static fo_kqfilter_t inotify_kqfilter; +static fo_stat_t inotify_stat; +static fo_close_t inotify_close; +static fo_fill_kinfo_t inotify_fill_kinfo; + +static const struct fileops inotifyfdops = { + .fo_read = inotify_read, + .fo_write = invfo_rdwr, + .fo_truncate = invfo_truncate, + .fo_ioctl = inotify_ioctl, + .fo_poll = inotify_poll, + .fo_kqfilter = inotify_kqfilter, + .fo_stat = inotify_stat, + .fo_close = inotify_close, + .fo_chmod = invfo_chmod, + .fo_chown = invfo_chown, + .fo_sendfile = invfo_sendfile, + .fo_fill_kinfo = inotify_fill_kinfo, + .fo_cmp = file_kcmp_generic, + .fo_flags = DFLAG_PASSABLE, +}; + +static void filt_inotifydetach(struct knote *kn); +static int filt_inotifyevent(struct knote *kn, long hint); + +static const struct filterops inotify_rfiltops = { + .f_isfd = 1, + .f_detach = filt_inotifydetach, + .f_event = filt_inotifyevent, +}; + +static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures"); + +struct inotify_record { + STAILQ_ENTRY(inotify_record) link; + struct inotify_event ev; +}; + +static uint64_t inotify_ino = 1; + +struct inotify_watch { + struct inotify_softc *sc; /* back-pointer */ + int wd; /* unique ID */ + uint32_t mask; /* event mask */ + struct vnode *vp; /* vnode being watched, refed */ + RB_ENTRY(inotify_watch) ilink; /* inotify linkage */ + TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */ +}; + +static int +inotify_watch_cmp(const struct inotify_watch *a, + const struct inotify_watch *b) +{ + if (a->wd < b->wd) + return (-1); + else if (a->wd > b->wd) + return (1); + else + return (0); +} +RB_HEAD(inotify_watch_tree, inotify_watch); +RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp); + +struct inotify_softc { + struct mtx lock; /* serialize all softc writes */ + STAILQ_HEAD(, inotify_record) pending; /* events waiting to be read */ + struct inotify_record overflow; /* preallocated record */ + int nextwatch; /* next watch ID to try */ + int npending; /* number of pending events */ + size_t nbpending; /* bytes available to read */ + uint64_t ino; /* unique identifier */ + struct inotify_watch_tree watches; /* active watches */ + struct selinfo sel; /* select/poll/kevent info */ + struct ucred *cred; /* credential ref */ +}; + +static struct inotify_record * +inotify_dequeue(struct inotify_softc *sc) +{ + struct inotify_record *rec; + + mtx_assert(&sc->lock, MA_OWNED); + KASSERT(!STAILQ_EMPTY(&sc->pending), + ("%s: queue for %p is empty", __func__, sc)); + + rec = STAILQ_FIRST(&sc->pending); + STAILQ_REMOVE_HEAD(&sc->pending, link); + sc->npending--; + sc->nbpending -= sizeof(rec->ev) + rec->ev.len; + return (rec); +} + +static void +inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head) +{ + mtx_assert(&sc->lock, MA_OWNED); + + if (head) + STAILQ_INSERT_HEAD(&sc->pending, rec, link); + else + STAILQ_INSERT_TAIL(&sc->pending, rec, link); + sc->npending++; + sc->nbpending += sizeof(rec->ev) + rec->ev.len; +} + +static int +inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags, + struct thread *td) +{ + struct inotify_softc *sc; + struct inotify_record *rec; + int error; + bool first; + + sc = fp->f_data; + error = 0; + + mtx_lock(&sc->lock); + while (STAILQ_EMPTY(&sc->pending)) { + if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) { + mtx_unlock(&sc->lock); + return (EWOULDBLOCK); + } + error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0); + if (error != 0) { + mtx_unlock(&sc->lock); + return (error); + } + } + for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) { + size_t len; + + rec = inotify_dequeue(sc); + len = sizeof(rec->ev) + rec->ev.len; + if (uio->uio_resid < (ssize_t)len) { + inotify_enqueue(sc, rec, true); + if (first) { + SET_ERROR0(EINVAL, "read buffer is too small"); + error = EINVAL; + } + break; + } + mtx_unlock(&sc->lock); + error = uiomove(&rec->ev, len, uio); +#ifdef KTRACE + if (error == 0 && KTRPOINT(td, KTR_STRUCT)) + ktrstruct("inotify", &rec->ev, len); +#endif + mtx_lock(&sc->lock); + if (error != 0) { + inotify_enqueue(sc, rec, true); + mtx_unlock(&sc->lock); + return (error); + } + if (rec == &sc->overflow) { + /* + * Signal to inotify_queue_record() that the overflow + * record can be reused. + */ + memset(rec, 0, sizeof(*rec)); + } else { + free(rec, M_INOTIFY); + } + } + mtx_unlock(&sc->lock); + return (error); +} + +static int +inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred, + struct thread *td) +{ + struct inotify_softc *sc; + + sc = fp->f_data; + + switch (com) { + case FIONREAD: + *(int *)data = (int)sc->nbpending; + return (0); + case FIONBIO: + case FIOASYNC: + return (0); + default: + return (ENOTTY); + } + + return (0); +} + +static int +inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td) +{ + struct inotify_softc *sc; + int revents; + + sc = fp->f_data; + revents = 0; + + mtx_lock(&sc->lock); + if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(td, &sc->sel); + mtx_unlock(&sc->lock); + return (revents); +} + +static void +filt_inotifydetach(struct knote *kn) +{ + struct inotify_softc *sc; + + sc = kn->kn_hook; + knlist_remove(&sc->sel.si_note, kn, 0); +} + +static int +filt_inotifyevent(struct knote *kn, long hint) +{ + struct inotify_softc *sc; + + sc = kn->kn_hook; + mtx_assert(&sc->lock, MA_OWNED); + kn->kn_data = sc->nbpending; + return (kn->kn_data > 0); +} + +static int +inotify_kqfilter(struct file *fp, struct knote *kn) +{ + struct inotify_softc *sc; + + if (kn->kn_filter != EVFILT_READ) + return (EINVAL); + sc = fp->f_data; + kn->kn_fop = &inotify_rfiltops; + kn->kn_hook = sc; + knlist_add(&sc->sel.si_note, kn, 0); + return (0); +} + +static int +inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred) +{ + struct inotify_softc *sc; + + sc = fp->f_data; + + memset(sb, 0, sizeof(*sb)); + sb->st_mode = S_IFREG | S_IRUSR; + sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX); + mtx_lock(&sc->lock); + sb->st_size = sc->nbpending; + sb->st_blocks = sc->npending; + sb->st_uid = sc->cred->cr_ruid; + sb->st_gid = sc->cred->cr_rgid; + sb->st_ino = sc->ino; + mtx_unlock(&sc->lock); + return (0); +} + +static void +inotify_unlink_watch_locked(struct inotify_watch *watch) +{ + struct vnode *vp; + + vp = watch->vp; + mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED); + + TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink); + if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify)) + vn_irflag_unset_locked(vp, VIRF_INOTIFY); +} + +/* + * Assumes that the watch has already been removed from its softc. + */ +static void +inotify_remove_watch(struct inotify_watch *watch) +{ + struct inotify_softc *sc; + struct vnode *vp; + + sc = watch->sc; + (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0); + + vp = watch->vp; + mtx_lock(&vp->v_pollinfo->vpi_lock); + inotify_unlink_watch_locked(watch); + mtx_unlock(&vp->v_pollinfo->vpi_lock); + + vrele(vp); + free(watch, M_INOTIFY); +} + +static int +inotify_close(struct file *fp, struct thread *td) +{ + struct inotify_softc *sc; + struct inotify_record *rec; + struct inotify_watch *watch; + + sc = fp->f_data; + + mtx_lock(&sc->lock); + (void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0); + while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) { + RB_REMOVE(inotify_watch_tree, &sc->watches, watch); + mtx_unlock(&sc->lock); + inotify_remove_watch(watch); + mtx_lock(&sc->lock); + } + while (!STAILQ_EMPTY(&sc->pending)) { + rec = inotify_dequeue(sc); + if (rec != &sc->overflow) + free(rec, M_INOTIFY); + } + mtx_unlock(&sc->lock); + seldrain(&sc->sel); + knlist_destroy(&sc->sel.si_note); + mtx_destroy(&sc->lock); + crfree(sc->cred); + free(sc, M_INOTIFY); + return (0); +} + +static int +inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif, + struct filedesc *fdp) +{ + struct inotify_softc *sc; + + sc = fp->f_data; + + kif->kf_type = KF_TYPE_INOTIFY; + kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending; + kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending; + return (0); +} + +int +inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp) +{ + struct inotify_softc *sc; + int fflags; + + if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0) + return (EINVAL); + + if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1, + inotify_max_user_instances)) + return (EMFILE); + + sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO); + sc->nextwatch = 1; /* Required for compatibility. */ + STAILQ_INIT(&sc->pending); + RB_INIT(&sc->watches); + mtx_init(&sc->lock, "inotify", NULL, MTX_DEF); + knlist_init_mtx(&sc->sel.si_note, &sc->lock); + sc->cred = crhold(td->td_ucred); + sc->ino = atomic_fetchadd_64(&inotify_ino, 1); + + fflags = FREAD; + if ((flags & IN_NONBLOCK) != 0) + fflags |= FNONBLOCK; + if ((flags & IN_CLOEXEC) != 0) + *fflagsp |= O_CLOEXEC; + finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops); + + return (0); +} + +static struct inotify_record * +inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event, + uint32_t cookie, int waitok) +{ + struct inotify_event *evp; + struct inotify_record *rec; + + rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY, + waitok | M_ZERO); + if (rec == NULL) + return (NULL); + evp = &rec->ev; + evp->wd = wd; + evp->mask = event; + evp->cookie = cookie; + evp->len = _IN_NAMESIZE(namelen); + if (name != NULL) + memcpy(evp->name, name, namelen); + return (rec); +} + +/* + * Put an event record on the queue for an inotify desscriptor. Return false if + * the record was not enqueued for some reason, true otherwise. + */ +static bool +inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec) +{ + struct inotify_record *prev; + struct inotify_event *evp; + + mtx_assert(&sc->lock, MA_OWNED); + + /* Try to coalesce duplicate events. */ + evp = &rec->ev; + if (inotify_coalesce && rec != &sc->overflow && + (prev = STAILQ_LAST(&sc->pending, inotify_record, link)) != NULL && + prev->ev.wd == evp->wd && + prev->ev.mask == evp->mask && + prev->ev.cookie == evp->cookie && + prev->ev.len == evp->len && + (evp->len == 0 || strcmp(prev->ev.name, evp->name) == 0)) + return (false); + /* Would this one overflow the queue? */ + if (sc->npending >= inotify_max_queued_events || rec == &sc->overflow) { + /* Have we already recorded a drop? */ + if (sc->overflow.ev.mask == IN_Q_OVERFLOW) + return (false); + evp->wd = -1; + evp->mask = IN_Q_OVERFLOW; + evp->cookie = 0; + evp->len = 0; + } + inotify_enqueue(sc, rec, false); + selwakeup(&sc->sel); + KNOTE_LOCKED(&sc->sel.si_note, 0); + wakeup(&sc->pending); + return (true); +} + +static int +inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen, + int event, uint32_t cookie) +{ + struct inotify_watch key; + struct inotify_softc *sc; + struct inotify_record *rec; + int relecount; + bool allocfail; + + relecount = 0; + + sc = watch->sc; + rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie, + M_NOWAIT); + if (rec == NULL) { + rec = &sc->overflow; + allocfail = true; + } else { + allocfail = false; + } + + mtx_lock(&sc->lock); + if (!inotify_queue_record(sc, rec) && !allocfail) + free(rec, M_INOTIFY); + if ((watch->mask & IN_ONESHOT) != 0 || + (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) { + if (!allocfail) { + rec = inotify_alloc_record(watch->wd, NULL, 0, + IN_IGNORED, 0, M_NOWAIT); + if (rec != NULL && !inotify_queue_record(sc, rec)) + free(rec, M_INOTIFY); + } + + /* + * Remove the watch, taking care to handle races with + * inotify_close(). + */ + key.wd = watch->wd; + if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) { + RB_REMOVE(inotify_watch_tree, &sc->watches, watch); + inotify_unlink_watch_locked(watch); + free(watch, M_INOTIFY); + + /* Defer vrele() to until locks are dropped. */ + relecount++; + } + } + mtx_unlock(&sc->lock); + return (relecount); +} + +void +inotify_log(struct vnode *vp, const char *name, size_t namelen, int event, + uint32_t cookie) +{ + struct inotify_watch *watch, *tmp; + int relecount; + + KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0, + ("inotify_log: invalid event %#x", event)); + + relecount = 0; + mtx_lock(&vp->v_pollinfo->vpi_lock); + TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) { + KASSERT(watch->vp == vp, + ("inotify_log: watch %p vp != vp", watch)); + if ((watch->mask & event) != 0 || event == IN_UNMOUNT) { + relecount += inotify_log_one(watch, name, namelen, event, + cookie); + } + } + mtx_unlock(&vp->v_pollinfo->vpi_lock); + + for (int i = 0; i < relecount; i++) + vrele(vp); +} + +/* + * An inotify event occurred on a watched vnode. + */ +void +vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp, + int event, uint32_t cookie) +{ + int isdir; + + VNPASS(vp->v_holdcnt > 0, vp); + + isdir = vp->v_type == VDIR ? IN_ISDIR : 0; + + if (dvp != NULL) { + VNPASS(dvp->v_holdcnt > 0, dvp); + + /* + * Should we log an event for the vnode itself? + */ + if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) { + int selfevent; + + switch (event) { + case _IN_MOVE_DELETE: + case IN_DELETE: + /* + * IN_DELETE_SELF is only generated when the + * last hard link of a file is removed. + */ + selfevent = IN_DELETE_SELF; + if (vp->v_type != VDIR) { + struct vattr va; + int error; + + error = VOP_GETATTR(vp, &va, cnp->cn_cred); + if (error == 0 && va.va_nlink != 0) + selfevent = 0; + } + break; + case IN_MOVED_FROM: + cookie = 0; + selfevent = IN_MOVE_SELF; + break; + case _IN_ATTRIB_LINKCOUNT: + selfevent = IN_ATTRIB; + break; + default: + selfevent = event; + break; + } + + if ((selfevent & ~_IN_DIR_EVENTS) != 0) { + inotify_log(vp, NULL, 0, selfevent | isdir, + cookie); + } + } + + /* + * Something is watching the directory through which this vnode + * was referenced, so we may need to log the event. + */ + if ((event & IN_ALL_EVENTS) != 0 && + (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) { + inotify_log(dvp, cnp->cn_nameptr, + cnp->cn_namelen, event | isdir, cookie); + } + } else { + /* + * We don't know which watched directory might contain the + * vnode, so we have to fall back to searching the name cache. + */ + cache_vop_inotify(vp, event, cookie); + } +} + +int +vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask, + uint32_t *wdp, struct thread *td) +{ + struct inotify_watch *watch, *watch1; + uint32_t wd; + + /* + * If this is a directory, make sure all of its entries are present in + * the name cache so that we're able to look them up if an event occurs. + * The persistent reference on the directory prevents the outgoing name + * cache entries from being reclaimed. + */ + if (vp->v_type == VDIR) { + struct dirent *dp; + char *buf; + off_t off; + size_t buflen, len; + int eof, error; + + buflen = 128 * sizeof(struct dirent); + buf = malloc(buflen, M_TEMP, M_WAITOK); + + error = 0; + len = off = eof = 0; + for (;;) { + struct nameidata nd; + + error = vn_dir_next_dirent(vp, td, buf, buflen, &dp, + &len, &off, &eof); + if (error != 0) + break; + if (len == 0) + /* Finished reading. */ + break; + if (strcmp(dp->d_name, ".") == 0 || + strcmp(dp->d_name, "..") == 0) + continue; + + /* + * namei() consumes a reference on the starting + * directory if it's specified as a vnode. + */ + vrefact(vp); + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, + dp->d_name, vp); + error = namei(&nd); + if (error != 0) + break; + vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT); + vrele(nd.ni_vp); + } + free(buf, M_TEMP); + if (error != 0) + return (error); + } + + /* + * The vnode referenced in kern_inotify_add_watch() might be different + * than this one if nullfs is in the picture. + */ + vrefact(vp); + watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO); + watch->sc = sc; + watch->vp = vp; + watch->mask = mask; + + /* + * Are we updating an existing watch? Search the vnode's list rather + * than that of the softc, as the former is likely to be shorter. + */ + v_addpollinfo(vp); + mtx_lock(&vp->v_pollinfo->vpi_lock); + TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) { + if (watch1->sc == sc) + break; + } + mtx_lock(&sc->lock); + if (watch1 != NULL) { + mtx_unlock(&vp->v_pollinfo->vpi_lock); + + /* + * We found an existing watch, update it based on our flags. + */ + if ((mask & IN_MASK_CREATE) != 0) { + mtx_unlock(&sc->lock); + vrele(vp); + free(watch, M_INOTIFY); + return (EEXIST); + } + if ((mask & IN_MASK_ADD) != 0) + watch1->mask |= mask; + else + watch1->mask = mask; + *wdp = watch1->wd; + mtx_unlock(&sc->lock); + vrele(vp); + free(watch, M_INOTIFY); + return (EJUSTRETURN); + } + + /* + * We're creating a new watch. Add it to the softc and vnode watch + * lists. + */ + do { + struct inotify_watch key; + + /* + * Search for the next available watch descriptor. This is + * implemented so as to avoid reusing watch descriptors for as + * long as possible. + */ + key.wd = wd = sc->nextwatch++; + watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key); + } while (watch1 != NULL || wd == 0); + watch->wd = wd; + RB_INSERT(inotify_watch_tree, &sc->watches, watch); + TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink); + mtx_unlock(&sc->lock); + mtx_unlock(&vp->v_pollinfo->vpi_lock); + vn_irflag_set_cond(vp, VIRF_INOTIFY); + + *wdp = wd; + + return (0); +} + +void +vn_inotify_revoke(struct vnode *vp) +{ + if (vp->v_pollinfo == NULL) { + /* This is a nullfs vnode which shadows a watched vnode. */ + return; + } + inotify_log(vp, NULL, 0, IN_UNMOUNT, 0); +} + +static int +fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp, + struct file **fpp) +{ + struct file *fp; + int error; + + error = fget(td, fd, needrightsp, &fp); + if (error != 0) + return (error); + if (fp->f_type != DTYPE_INOTIFY) { + fdrop(fp, td); + return (EINVAL); + } + *fpp = fp; + return (0); +} + +static int +kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask, + struct thread *td) +{ + struct nameidata nd; + struct file *fp; + struct inotify_softc *sc; + struct vnode *vp; + uint32_t wd; + int error; + + fp = NULL; + vp = NULL; + + if ((mask & IN_ALL_EVENTS) == 0) { + SET_ERROR0(EINVAL, "no events specified"); + return (EINVAL); + } + if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) == + (IN_MASK_ADD | IN_MASK_CREATE)) { + SET_ERROR0(EINVAL, + "IN_MASK_ADD and IN_MASK_CREATE are mutually exclusive"); + return (EINVAL); + } + if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0) { + SET_ERROR0(EINVAL, "unrecognized flag"); + return (EINVAL); + } + + error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp); + if (error != 0) + return (error); + sc = fp->f_data; + + NDINIT_AT(&nd, LOOKUP, + ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF | + LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd); + error = namei(&nd); + if (error != 0) + goto out; + NDFREE_PNBUF(&nd); + vp = nd.ni_vp; + + error = VOP_ACCESS(vp, VREAD, td->td_ucred, td); + if (error != 0) + goto out; + + if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + + if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1, + inotify_max_user_watches)) { + error = ENOSPC; + goto out; + } + error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td); + if (error != 0) { + chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0); + if (error == EJUSTRETURN) { + /* We updated an existing watch, everything is ok. */ + error = 0; + } else { + goto out; + } + } + td->td_retval[0] = wd; + +out: + if (vp != NULL) + vput(vp); + fdrop(fp, td); + return (error); +} + +int +sys_inotify_add_watch_at(struct thread *td, + struct inotify_add_watch_at_args *uap) +{ + return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path, + uap->mask, td)); +} + +static int +kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td) +{ + struct file *fp; + struct inotify_softc *sc; + struct inotify_record *rec; + struct inotify_watch key, *watch; + int error; + + error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp); + if (error != 0) + return (error); + sc = fp->f_data; + + rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK); + + /* + * For compatibility with Linux, we do not remove pending events + * associated with the watch. Watch descriptors are implemented so as + * to avoid being reused for as long as possible, so one hopes that any + * pending events from the removed watch descriptor will be removed + * before the watch descriptor is recycled. + */ + key.wd = wd; + mtx_lock(&sc->lock); + watch = RB_FIND(inotify_watch_tree, &sc->watches, &key); + if (watch == NULL) { + free(rec, M_INOTIFY); + error = EINVAL; + } else { + RB_REMOVE(inotify_watch_tree, &sc->watches, watch); + if (!inotify_queue_record(sc, rec)) { + free(rec, M_INOTIFY); + error = 0; + } + } + mtx_unlock(&sc->lock); + if (watch != NULL) + inotify_remove_watch(watch); + fdrop(fp, td); + return (error); +} + +int +sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap) +{ + return (kern_inotify_rm_watch(uap->fd, uap->wd, td)); +} diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -38,7 +38,6 @@ * External virtual filesystem routines */ -#include #include "opt_ddb.h" #include "opt_watchdog.h" @@ -57,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -5247,7 +5247,8 @@ static void destroy_vpollinfo(struct vpollinfo *vi) { - + KASSERT(TAILQ_EMPTY(&vi->vpi_inotify), + ("%s: pollinfo %p has lingering watches", __func__, vi)); knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); @@ -5261,12 +5262,13 @@ { struct vpollinfo *vi; - if (vp->v_pollinfo != NULL) + if (atomic_load_ptr(&vp->v_pollinfo) != NULL) return; vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_lock); + TAILQ_INIT(&vi->vpi_inotify); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); @@ -6057,6 +6059,28 @@ } #endif +void +vop_allocate_post(void *ap, int rc) +{ + struct vop_allocate_args *a; + + a = ap; + if (rc == 0) + INOTIFY(a->a_vp, IN_MODIFY); +} + +void +vop_copy_file_range_post(void *ap, int rc) +{ + struct vop_copy_file_range_args *a; + + a = ap; + if (rc == 0) { + INOTIFY(a->a_invp, IN_ACCESS); + INOTIFY(a->a_outvp, IN_MODIFY); + } +} + void vop_create_pre(void *ap) { @@ -6077,8 +6101,20 @@ a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } +} + +void +vop_deallocate_post(void *ap, int rc) +{ + struct vop_deallocate_args *a; + + a = ap; + if (rc == 0) + INOTIFY(a->a_vp, IN_MODIFY); } void @@ -6123,8 +6159,10 @@ a = ap; vp = a->a_vp; vn_seqc_write_end(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); + INOTIFY(vp, IN_ATTRIB); + } } void @@ -6154,6 +6192,8 @@ if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); + INOTIFY_NAME(vp, tdvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT); + INOTIFY_NAME(vp, tdvp, a->a_cnp, IN_CREATE); } } @@ -6177,8 +6217,10 @@ a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } } #ifdef DEBUG_VFS_LOCKS @@ -6213,8 +6255,10 @@ a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } } void @@ -6226,8 +6270,10 @@ a = ap; vp = a->a_vp; ASSERT_VOP_IN_SEQC(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); + INOTIFY_REVOKE(vp); + } } void @@ -6258,6 +6304,8 @@ if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); + INOTIFY_NAME(vp, dvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT); + INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE); } } @@ -6289,6 +6337,8 @@ VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); + INOTIFY_MOVE(a->a_fvp, a->a_fdvp, a->a_fcnp, a->a_tvp, + a->a_tdvp, a->a_tcnp); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); @@ -6328,6 +6378,7 @@ vp->v_vflag |= VV_UNLINKED; VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); + INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE); } } @@ -6351,8 +6402,10 @@ a = ap; vp = a->a_vp; vn_seqc_write_end(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); + INOTIFY(vp, IN_ATTRIB); + } } void @@ -6397,8 +6450,10 @@ a = ap; vp = a->a_vp; vn_seqc_write_end(vp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); + INOTIFY(vp, IN_ATTRIB); + } } void @@ -6421,8 +6476,10 @@ a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); + INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE); + } } void @@ -6430,8 +6487,10 @@ { struct vop_open_args *a = ap; - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); + INOTIFY(a->a_vp, IN_OPEN); + } } void @@ -6443,6 +6502,8 @@ !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); + INOTIFY(a->a_vp, (a->a_fflag & FWRITE) != 0 ? + IN_CLOSE_WRITE : IN_CLOSE_NOWRITE); } } @@ -6451,8 +6512,10 @@ { struct vop_read_args *a = ap; - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); + INOTIFY(a->a_vp, IN_ACCESS); + } } void @@ -6469,8 +6532,10 @@ { struct vop_readdir_args *a = ap; - if (!rc) + if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); + INOTIFY(a->a_vp, IN_ACCESS); + } } static struct knlist fs_knlist; diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -303,7 +304,8 @@ NDREINIT(ndp); goto restart; } - if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) + if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0 || + (vn_irflag_read(ndp->ni_dvp) & VIRF_INOTIFY) != 0) ndp->ni_cnd.cn_flags |= MAKEENTRY; #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, @@ -479,6 +481,7 @@ if (vp->v_type != VFIFO && vp->v_type != VSOCK && VOP_ACCESS(vp, VREAD, cred, td) == 0) fp->f_flag |= FKQALLOWED; + INOTIFY(vp, IN_OPEN); return (0); } @@ -1741,6 +1744,8 @@ vattr.va_vaflags |= VA_SYNC; error = VOP_SETATTR(vp, &vattr, cred); VOP_ADD_WRITECOUNT_CHECKED(vp, -1); + if (error == 0) + INOTIFY(vp, IN_MODIFY); } return (error); } diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -702,6 +702,7 @@ %% allocate vp E E E +%! allocate post vop_allocate_post vop_allocate { IN struct vnode *vp; @@ -786,6 +787,7 @@ %% copy_file_range invp U U U %% copy_file_range outvp U U U +%! copy_file_range post vop_copy_file_range_post vop_copy_file_range { IN struct vnode *invp; @@ -810,6 +812,7 @@ %% deallocate vp L L L +%! deallocate post vop_deallocate_post vop_deallocate { IN struct vnode *vp; @@ -821,6 +824,27 @@ }; +%% inotify vp - - - + +vop_inotify { + IN struct vnode *vp; + IN struct vnode *dvp; + IN struct componentname *cnp; + IN int event; + IN uint32_t cookie; +}; + + +%% inotify_add_watch vp L L L + +vop_inotify_add_watch { + IN struct vnode *vp; + IN struct inotify_softc *sc; + IN uint32_t mask; + OUT uint32_t *wdp; + IN struct thread *td; +}; + # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares. When merging a new VOP to a stable branch, diff --git a/sys/sys/caprights.h b/sys/sys/caprights.h --- a/sys/sys/caprights.h +++ b/sys/sys/caprights.h @@ -79,6 +79,8 @@ extern const cap_rights_t cap_getpeername_rights; extern const cap_rights_t cap_getsockopt_rights; extern const cap_rights_t cap_getsockname_rights; +extern const cap_rights_t cap_inotify_add_rights; +extern const cap_rights_t cap_inotify_rm_rights; extern const cap_rights_t cap_ioctl_rights; extern const cap_rights_t cap_linkat_source_rights; extern const cap_rights_t cap_linkat_target_rights; diff --git a/sys/sys/capsicum.h b/sys/sys/capsicum.h --- a/sys/sys/capsicum.h +++ b/sys/sys/capsicum.h @@ -279,11 +279,15 @@ #define CAP_KQUEUE (CAP_KQUEUE_EVENT | CAP_KQUEUE_CHANGE) +/* Allows operations on inotify descriptors. */ +#define CAP_INOTIFY_ADD CAPRIGHT(1, 0x0000000000200000ULL) +#define CAP_INOTIFY_RM CAPRIGHT(1, 0x0000000000400000ULL) + /* All used bits for index 1. */ -#define CAP_ALL1 CAPRIGHT(1, 0x00000000001FFFFFULL) +#define CAP_ALL1 CAPRIGHT(1, 0x00000000007FFFFFULL) /* Available bits for index 1. */ -#define CAP_UNUSED1_22 CAPRIGHT(1, 0x0000000000200000ULL) +#define CAP_UNUSED1_22 CAPRIGHT(1, 0x0000000000800000ULL) /* ... */ #define CAP_UNUSED1_57 CAPRIGHT(1, 0x0100000000000000ULL) diff --git a/sys/sys/exterr_cat.h b/sys/sys/exterr_cat.h --- a/sys/sys/exterr_cat.h +++ b/sys/sys/exterr_cat.h @@ -15,6 +15,7 @@ #define EXTERR_CAT_FILEDESC 2 #define EXTERR_KTRACE 3 /* To allow inclusion of this file into kern_ktrace.c */ +#define EXTERR_CAT_INOTIFY 4 #endif diff --git a/sys/sys/file.h b/sys/sys/file.h --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -71,6 +71,7 @@ #define DTYPE_PROCDESC 12 /* process descriptor */ #define DTYPE_EVENTFD 13 /* eventfd */ #define DTYPE_TIMERFD 14 /* timerfd */ +#define DTYPE_INOTIFY 15 /* inotify descriptor */ #ifdef _KERNEL diff --git a/sys/sys/inotify.h b/sys/sys/inotify.h new file mode 100644 --- /dev/null +++ b/sys/sys/inotify.h @@ -0,0 +1,146 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Klara, Inc. + */ + +#ifndef _INOTIFY_H_ +#define _INOTIFY_H_ + +#include + +/* Flags for inotify_init1(). */ +#define IN_NONBLOCK 0x00000004 /* O_NONBLOCK */ +#define IN_CLOEXEC 0x00100000 /* O_CLOEXEC */ + +struct inotify_event { + int wd; + __uint32_t mask; + __uint32_t cookie; + __uint32_t len; + char name[0]; +}; + +/* Events, set in the mask field. */ +#define IN_ACCESS 0x00000001 +#define IN_MODIFY 0x00000002 +#define IN_ATTRIB 0x00000004 +#define IN_CLOSE_WRITE 0x00000008 +#define IN_CLOSE_NOWRITE 0x00000010 +#define IN_CLOSE (IN_CLOSE_WRITE | IN_CLOSE_NOWRITE) +#define IN_OPEN 0x00000020 +#define IN_MOVED_FROM 0x00000040 +#define IN_MOVED_TO 0x00000080 +#define IN_MOVE (IN_MOVED_FROM | IN_MOVED_TO) +#define IN_CREATE 0x00000100 +#define IN_DELETE 0x00000200 +#define IN_DELETE_SELF 0x00000400 +#define IN_MOVE_SELF 0x00000800 +#define IN_ALL_EVENTS 0x00000fff + +/* Events report only for entries in a watched dir, not the dir itself. */ +#define _IN_DIR_EVENTS (IN_CLOSE_WRITE | IN_DELETE | IN_MODIFY | \ + IN_MOVED_FROM | IN_MOVED_TO) + +#ifdef _KERNEL +/* + * An unlink that's done as part of a rename only records IN_DELETE if the + * unlinked vnode itself is watched, and not when the containing directory is + * watched. + */ +#define _IN_MOVE_DELETE 0x40000000 +/* + * Inode link count changes only trigger IN_ATTRIB events if the inode itself is + * watched, and not when the containing directory is watched. + */ +#define _IN_ATTRIB_LINKCOUNT 0x80000000 +#endif + +/* Flags, set in the mask field. */ +#define IN_ONLYDIR 0x01000000 +#define IN_DONT_FOLLOW 0x02000000 +#define IN_EXCL_UNLINK 0x04000000 +#define IN_MASK_CREATE 0x10000000 +#define IN_MASK_ADD 0x20000000 +#define IN_ONESHOT 0x80000000 +#define _IN_ALL_FLAGS (IN_ONLYDIR | IN_DONT_FOLLOW | \ + IN_EXCL_UNLINK | IN_MASK_CREATE | \ + IN_MASK_ADD | IN_ONESHOT) + +/* Flags returned by the kernel. */ +#define IN_UNMOUNT 0x00002000 +#define IN_Q_OVERFLOW 0x00004000 +#define IN_IGNORED 0x00008000 +#define IN_ISDIR 0x40000000 +#define _IN_ALL_RETFLAGS (IN_Q_OVERFLOW | IN_UNMOUNT | IN_IGNORED | \ + IN_ISDIR) + +#define _IN_ALIGN _Alignof(struct inotify_event) +#define _IN_NAMESIZE(namelen) \ + ((namelen) == 0 ? 0 : __align_up((namelen) + 1, _IN_ALIGN)) + +#ifdef _KERNEL +struct componentname; +struct file; +struct inotify_softc; +struct thread; +struct vnode; + +int inotify_create_file(struct thread *, struct file *, int, int *); +void inotify_log(struct vnode *, const char *, size_t, int, __uint32_t); + +void vn_inotify(struct vnode *, struct vnode *, struct componentname *, int, + uint32_t); +int vn_inotify_add_watch(struct vnode *, struct inotify_softc *, + __uint32_t, __uint32_t *, struct thread *); +void vn_inotify_revoke(struct vnode *); + +/* Log an inotify event. */ +#define INOTIFY(vp, ev) do { \ + if (__predict_false((vn_irflag_read(vp) & (VIRF_INOTIFY | \ + VIRF_INOTIFY_PARENT)) != 0)) \ + VOP_INOTIFY((vp), NULL, NULL, (ev), 0); \ +} while (0) + +/* Log an inotify event using a specific name for the vnode. */ +#define INOTIFY_NAME(vp, dvp, cnp, ev) do { \ + if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0 || \ + (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) \ + VOP_INOTIFY((vp), (dvp), (cnp), (ev), 0); \ +} while (0) + +extern __uint32_t inotify_rename_cookie; + +#define INOTIFY_MOVE(vp, fdvp, fcnp, tvp, tdvp, tcnp) do { \ + if (__predict_false((vn_irflag_read(fdvp) & VIRF_INOTIFY) != 0 || \ + (vn_irflag_read(tdvp) & VIRF_INOTIFY) != 0 || \ + (vn_irflag_read(vp) & VIRF_INOTIFY) != 0)) { \ + __uint32_t cookie; \ + \ + cookie = atomic_fetchadd_32(&inotify_rename_cookie, 1); \ + VOP_INOTIFY((vp), (fdvp), (fcnp), IN_MOVED_FROM, cookie); \ + VOP_INOTIFY((vp), (tdvp), (tcnp), IN_MOVED_TO, cookie); \ + } \ + if ((tvp) != NULL) \ + INOTIFY_NAME((tvp), (tdvp), (tcnp), _IN_MOVE_DELETE); \ +} while (0) + +#define INOTIFY_REVOKE(vp) do { \ + if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0)) \ + vn_inotify_revoke((vp)); \ +} while (0) + +#else +#include + +__BEGIN_DECLS +int inotify_init(void); +int inotify_init1(int flags); +int inotify_add_watch(int fd, const char *pathname, __uint32_t mask); +int inotify_add_watch_at(int fd, int dfd, const char *pathname, + __uint32_t mask); +int inotify_rm_watch(int fd, int wd); +__END_DECLS +#endif /* !_KERNEL */ + +#endif /* !_INOTIFY_H_ */ diff --git a/sys/sys/resourcevar.h b/sys/sys/resourcevar.h --- a/sys/sys/resourcevar.h +++ b/sys/sys/resourcevar.h @@ -122,6 +122,8 @@ long ui_kqcnt; /* (b) number of kqueues */ long ui_umtxcnt; /* (b) number of shared umtxs */ long ui_pipecnt; /* (b) consumption of pipe buffers */ + long ui_inotifycnt; /* (b) number of inotify descriptors */ + long ui_inotifywatchcnt; /* (b) number of inotify watches */ uid_t ui_uid; /* (a) uid */ u_int ui_ref; /* (b) reference count */ #ifdef RACCT @@ -144,6 +146,8 @@ int chgptscnt(struct uidinfo *uip, int diff, rlim_t maxval); int chgumtxcnt(struct uidinfo *uip, int diff, rlim_t maxval); int chgpipecnt(struct uidinfo *uip, int diff, rlim_t max); +int chginotifycnt(struct uidinfo *uip, int diff, rlim_t maxval); +int chginotifywatchcnt(struct uidinfo *uip, int diff, rlim_t maxval); int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which, struct rlimit *limp); struct plimit diff --git a/sys/sys/specialfd.h b/sys/sys/specialfd.h --- a/sys/sys/specialfd.h +++ b/sys/sys/specialfd.h @@ -30,6 +30,7 @@ enum specialfd_type { SPECIALFD_EVENTFD = 1, + SPECIALFD_INOTIFY = 2, }; struct specialfd_eventfd { @@ -37,4 +38,8 @@ int flags; }; +struct specialfd_inotify { + int flags; +}; + #endif /* !_SYS_SPECIALFD_H_ */ diff --git a/sys/sys/user.h b/sys/sys/user.h --- a/sys/sys/user.h +++ b/sys/sys/user.h @@ -265,6 +265,7 @@ #define KF_TYPE_DEV 12 #define KF_TYPE_EVENTFD 13 #define KF_TYPE_TIMERFD 14 +#define KF_TYPE_INOTIFY 15 #define KF_TYPE_UNKNOWN 255 #define KF_VTYPE_VNON 0 @@ -456,6 +457,10 @@ int32_t kf_kqueue_count; int32_t kf_kqueue_state; } kf_kqueue; + struct { + uint64_t kf_inotify_npending; + uint64_t kf_inotify_nbpending; + } kf_inotify; } kf_un; }; uint16_t kf_status; /* Status flags. */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -86,11 +86,13 @@ * it from v_data. If non-null, this area is freed in getnewvnode(). */ -struct namecache; struct cache_fpl; +struct inotify_watch; +struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ + TAILQ_HEAD(, inotify_watch) vpi_inotify; /* list of inotify watchers */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ @@ -248,6 +250,9 @@ #define VIRF_CROSSMP 0x0010 /* Cross-mp vnode, no locking */ #define VIRF_NAMEDDIR 0x0020 /* Named attribute directory */ #define VIRF_NAMEDATTR 0x0040 /* Named attribute */ +#define VIRF_INOTIFY 0x0080 /* This vnode is being watched */ +#define VIRF_INOTIFY_PARENT 0x0100 /* A parent of this vnode may be being + watched */ #define VI_UNUSED0 0x0001 /* unused */ #define VI_MOUNT 0x0002 /* Mount in progress */ @@ -667,6 +672,7 @@ void cache_symlink_free(char *string, size_t size); int cache_symlink_resolve(struct cache_fpl *fpl, const char *string, size_t len); +void cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie); void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp); void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp); @@ -869,8 +875,10 @@ int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); -int vop_stdioctl(struct vop_ioctl_args *); int vop_stdneed_inactive(struct vop_need_inactive_args *); +int vop_stdinotify(struct vop_inotify_args *); +int vop_stdinotify_add_watch(struct vop_inotify_add_watch_args *); +int vop_stdioctl(struct vop_ioctl_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); int vop_stdunlock(struct vop_unlock_args *); @@ -910,9 +918,12 @@ int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. */ +void vop_allocate_post(void *a, int rc); +void vop_copy_file_range_post(void *ap, int rc); void vop_close_post(void *a, int rc); void vop_create_pre(void *a); void vop_create_post(void *a, int rc); +void vop_deallocate_post(void *a, int rc); void vop_whiteout_pre(void *a); void vop_whiteout_post(void *a, int rc); void vop_deleteextattr_pre(void *a); @@ -1020,9 +1031,12 @@ #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ - if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ - VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ - | (noffset > osize ? NOTE_EXTEND : 0)); \ + if (noffset > ooffset) { \ + if (VN_KNLIST_EMPTY((ap)->a_vp)) { \ + VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE | \ + (noffset > osize ? NOTE_EXTEND : 0)); \ + } \ + INOTIFY((ap)->a_vp, IN_MODIFY); \ } #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) diff --git a/sys/tools/vnode_if.awk b/sys/tools/vnode_if.awk --- a/sys/tools/vnode_if.awk +++ b/sys/tools/vnode_if.awk @@ -193,6 +193,7 @@ printc(common_head \ "#include \n" \ "#include \n" \ + "#include \n" \ "#include \n" \ "#include \n" \ "#include \n" \ diff --git a/tests/sys/kern/Makefile b/tests/sys/kern/Makefile --- a/tests/sys/kern/Makefile +++ b/tests/sys/kern/Makefile @@ -19,6 +19,7 @@ TEST_METADATA.kern_descrip_test+= is_exclusive="true" ATF_TESTS_C+= fdgrowtable_test ATF_TESTS_C+= jail_lookup_root +ATF_TESTS_C+= inotify_test ATF_TESTS_C+= kill_zombie .if ${MK_OPENSSL} != "no" ATF_TESTS_C+= ktls_test @@ -85,6 +86,7 @@ LIBADD.sys_getrandom+= pthread LIBADD.ptrace_test+= pthread LIBADD.unix_seqpacket_test+= pthread +LIBADD.inotify_test+= util LIBADD.kcov+= pthread CFLAGS.ktls_test+= -DOPENSSL_API_COMPAT=0x10100000L LIBADD.ktls_test+= crypto util diff --git a/tests/sys/kern/inotify_test.c b/tests/sys/kern/inotify_test.c new file mode 100644 --- /dev/null +++ b/tests/sys/kern/inotify_test.c @@ -0,0 +1,862 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Klara, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static const char * +ev2name(int event) +{ + switch (event) { + case IN_ACCESS: + return ("IN_ACCESS"); + case IN_ATTRIB: + return ("IN_ATTRIB"); + case IN_CLOSE_WRITE: + return ("IN_CLOSE_WRITE"); + case IN_CLOSE_NOWRITE: + return ("IN_CLOSE_NOWRITE"); + case IN_CREATE: + return ("IN_CREATE"); + case IN_DELETE: + return ("IN_DELETE"); + case IN_DELETE_SELF: + return ("IN_DELETE_SELF"); + case IN_MODIFY: + return ("IN_MODIFY"); + case IN_MOVE_SELF: + return ("IN_MOVE_SELF"); + case IN_MOVED_FROM: + return ("IN_MOVED_FROM"); + case IN_MOVED_TO: + return ("IN_MOVED_TO"); + case IN_OPEN: + return ("IN_OPEN"); + default: + return (NULL); + } +} + +static void +close_checked(int fd) +{ + ATF_REQUIRE(close(fd) == 0); +} + +/* + * Make sure that no other events are pending, and close the inotify descriptor. + */ +static void +close_inotify(int fd) +{ + int n; + + ATF_REQUIRE(ioctl(fd, FIONREAD, &n) == 0); + ATF_REQUIRE(n == 0); + close_checked(fd); +} + +static uint32_t +consume_event_cookie(int ifd, int wd, int event, int flags, const char *name) +{ + struct inotify_event *ev; + size_t evsz, namelen; + ssize_t n; + uint32_t cookie; + + /* Only read one record. */ + namelen = name == NULL ? 0 : strlen(name); + evsz = sizeof(*ev) + _IN_NAMESIZE(namelen); + ev = malloc(evsz); + ATF_REQUIRE(ev != NULL); + + n = read(ifd, ev, evsz); + ATF_REQUIRE_MSG(n >= 0, "failed to read event %s", ev2name(event)); + ATF_REQUIRE((size_t)n >= sizeof(*ev)); + ATF_REQUIRE((size_t)n == sizeof(*ev) + ev->len); + ATF_REQUIRE((size_t)n == evsz); + + ATF_REQUIRE_MSG((ev->mask & IN_ALL_EVENTS) == event, + "expected event %#x, got %#x", event, ev->mask); + ATF_REQUIRE_MSG((ev->mask & _IN_ALL_RETFLAGS) == flags, + "expected flags %#x, got %#x", flags, ev->mask); + ATF_REQUIRE_MSG(ev->wd == wd, + "expected wd %d, got %d", wd, ev->wd); + ATF_REQUIRE_MSG(name == NULL || strcmp(name, ev->name) == 0, + "expected name '%s', got '%s'", name, ev->name); + cookie = ev->cookie; + if ((ev->mask & (IN_MOVED_FROM | IN_MOVED_TO)) == 0) + ATF_REQUIRE(cookie == 0); + free(ev); + return (cookie); +} + +/* + * Read an event from the inotify file descriptor and check that it + * matches the expected values. + */ +static void +consume_event(int ifd, int wd, int event, int flags, const char *name) +{ + (void)consume_event_cookie(ifd, wd, event, flags, name); +} + +static int +inotify(int flags) +{ + int ifd; + + ifd = inotify_init1(flags); + ATF_REQUIRE(ifd != -1); + return (ifd); +} + +static void +mount_nullfs(char *dir, char *src) +{ + struct iovec *iov; + char errmsg[1024]; + int error, iovlen; + + iov = NULL; + iovlen = 0; + + build_iovec(&iov, &iovlen, "fstype", "nullfs", (size_t)-1); + build_iovec(&iov, &iovlen, "fspath", dir, (size_t)-1); + build_iovec(&iov, &iovlen, "target", src, (size_t)-1); + build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg)); + + errmsg[0] = '\0'; + error = nmount(iov, iovlen, 0); + ATF_REQUIRE_MSG(error == 0, + "mount nullfs %s %s: %s", src, dir, + errmsg[0] == '\0' ? strerror(errno) : errmsg); + + free_iovec(&iov, &iovlen); +} + +static void +mount_tmpfs(const char *dir) +{ + struct iovec *iov; + char errmsg[1024]; + int error, iovlen; + + iov = NULL; + iovlen = 0; + + build_iovec(&iov, &iovlen, "fstype", "tmpfs", (size_t)-1); + build_iovec(&iov, &iovlen, "fspath", __DECONST(char *, dir), + (size_t)-1); + build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg)); + + errmsg[0] = '\0'; + error = nmount(iov, iovlen, 0); + ATF_REQUIRE_MSG(error == 0, + "mount tmpfs %s: %s", dir, + errmsg[0] == '\0' ? strerror(errno) : errmsg); + + free_iovec(&iov, &iovlen); +} + +static int +watch_file(int ifd, int events, char *path) +{ + int fd, wd; + + strncpy(path, "test.XXXXXX", PATH_MAX); + fd = mkstemp(path); + ATF_REQUIRE(fd != -1); + close_checked(fd); + + wd = inotify_add_watch(ifd, path, events); + ATF_REQUIRE(wd != -1); + + return (wd); +} + +static int +watch_dir(int ifd, int events, char *path) +{ + char *p; + int wd; + + strlcpy(path, "test.XXXXXX", PATH_MAX); + p = mkdtemp(path); + ATF_REQUIRE(p == path); + + wd = inotify_add_watch(ifd, path, events); + ATF_REQUIRE(wd != -1); + + return (wd); +} + +/* + * Verify that Capsicum restrictions are applied as expected. + */ +ATF_TC_WITHOUT_HEAD(inotify_capsicum); +ATF_TC_BODY(inotify_capsicum, tc) +{ + int error, dfd, ifd, wd; + + ifd = inotify(IN_NONBLOCK); + ATF_REQUIRE(ifd != -1); + + dfd = open(".", O_RDONLY | O_DIRECTORY); + ATF_REQUIRE(dfd != -1); + + error = mkdirat(dfd, "testdir", 0755); + ATF_REQUIRE(error == 0); + + error = cap_enter(); + ATF_REQUIRE(error == 0); + + /* + * Plain inotify_add_watch() is disallowed. + */ + wd = inotify_add_watch(ifd, ".", IN_DELETE_SELF); + ATF_REQUIRE_ERRNO(ECAPMODE, wd == -1); + wd = inotify_add_watch_at(ifd, dfd, "testdir", IN_DELETE_SELF); + ATF_REQUIRE(wd >= 0); + + /* + * Generate a record and consume it. + */ + error = unlinkat(dfd, "testdir", AT_REMOVEDIR); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd, IN_DELETE_SELF, IN_ISDIR, NULL); + consume_event(ifd, wd, 0, IN_IGNORED, NULL); + + close_checked(dfd); + close_inotify(ifd); +} + +/* + * Make sure that duplicate, back-to-back events are coalesced. + */ +ATF_TC_WITHOUT_HEAD(inotify_coalesce); +ATF_TC_BODY(inotify_coalesce, tc) +{ + char file[PATH_MAX], path[PATH_MAX]; + int fd, fd1, ifd, n, wd; + + ifd = inotify(IN_NONBLOCK); + + /* Create a directory and watch it. */ + wd = watch_dir(ifd, IN_OPEN, path); + /* Create a file in the directory and open it. */ + snprintf(file, sizeof(file), "%s/file", path); + fd = open(file, O_RDWR | O_CREAT, 0644); + ATF_REQUIRE(fd != -1); + close_checked(fd); + fd = open(file, O_RDWR); + ATF_REQUIRE(fd != -1); + fd1 = open(file, O_RDONLY); + ATF_REQUIRE(fd1 != -1); + close_checked(fd1); + close_checked(fd); + + consume_event(ifd, wd, IN_OPEN, 0, "file"); + ATF_REQUIRE(ioctl(ifd, FIONREAD, &n) == 0); + ATF_REQUIRE(n == 0); + + close_inotify(ifd); +} + +/* + * Check handling of IN_MASK_CREATE. + */ +ATF_TC_WITHOUT_HEAD(inotify_mask_create); +ATF_TC_BODY(inotify_mask_create, tc) +{ + char path[PATH_MAX]; + int ifd, wd, wd1; + + ifd = inotify(IN_NONBLOCK); + + /* Create a directory and watch it. */ + wd = watch_dir(ifd, IN_CREATE, path); + /* Updating the watch with IN_MASK_CREATE should result in an error. */ + wd1 = inotify_add_watch(ifd, path, IN_MODIFY | IN_MASK_CREATE); + ATF_REQUIRE_ERRNO(EEXIST, wd1 == -1); + /* It's an error to specify IN_MASK_ADD with IN_MASK_CREATE. */ + wd1 = inotify_add_watch(ifd, path, IN_MODIFY | IN_MASK_ADD | + IN_MASK_CREATE); + ATF_REQUIRE_ERRNO(EINVAL, wd1 == -1); + /* Updating the watch without IN_MASK_CREATE should work. */ + wd1 = inotify_add_watch(ifd, path, IN_MODIFY); + ATF_REQUIRE(wd1 != -1); + ATF_REQUIRE_EQ(wd, wd1); + + close_inotify(ifd); +} + +/* + * Make sure that inotify cooperates with nullfs: if a lower vnode is the + * subject of an event, the upper vnode should be notified, and if the upper + * vnode is the subject of an event, the lower vnode should be notified. + */ +ATF_TC_WITH_CLEANUP(inotify_nullfs); +ATF_TC_HEAD(inotify_nullfs, tc) +{ + atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(inotify_nullfs, tc) +{ + char path[PATH_MAX], *p; + int dfd, error, fd, ifd, mask, wd; + + mask = IN_CREATE | IN_OPEN; + + ifd = inotify(IN_NONBLOCK); + + strlcpy(path, "./test.XXXXXX", sizeof(path)); + p = mkdtemp(path); + ATF_REQUIRE(p == path); + + error = mkdir("./mnt", 0755); + ATF_REQUIRE(error == 0); + + /* Mount the testdir onto ./mnt. */ + mount_nullfs("./mnt", path); + + wd = inotify_add_watch(ifd, "./mnt", mask); + ATF_REQUIRE(wd != -1); + + /* Create a file in the lower directory and open it. */ + dfd = open(path, O_RDONLY | O_DIRECTORY); + ATF_REQUIRE(dfd != -1); + fd = openat(dfd, "file", O_RDWR | O_CREAT, 0644); + close_checked(fd); + close_checked(dfd); + + /* We should see events via the nullfs mount. */ + consume_event(ifd, wd, IN_OPEN, IN_ISDIR, NULL); + consume_event(ifd, wd, IN_CREATE, 0, "file"); + consume_event(ifd, wd, IN_OPEN, 0, "file"); + + error = inotify_rm_watch(ifd, wd); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd, 0, IN_IGNORED, NULL); + + /* Watch the lower directory. */ + wd = inotify_add_watch(ifd, path, mask); + ATF_REQUIRE(wd != -1); + /* ... and create a file in the upper directory and open it. */ + dfd = open("./mnt", O_RDONLY | O_DIRECTORY); + ATF_REQUIRE(dfd != -1); + fd = openat(dfd, "file2", O_RDWR | O_CREAT, 0644); + ATF_REQUIRE(fd != -1); + close_checked(fd); + close_checked(dfd); + + /* We should see events via the lower directory. */ + consume_event(ifd, wd, IN_OPEN, IN_ISDIR, NULL); + consume_event(ifd, wd, IN_CREATE, 0, "file2"); + consume_event(ifd, wd, IN_OPEN, 0, "file2"); + + close_inotify(ifd); +} +ATF_TC_CLEANUP(inotify_nullfs, tc) +{ + int error; + + error = unmount("./mnt", 0); + if (error != 0) { + perror("unmount"); + exit(1); + } +} + +/* + * Make sure that exceeding max_events pending events results in an overflow + * event. + */ +ATF_TC_WITHOUT_HEAD(inotify_queue_overflow); +ATF_TC_BODY(inotify_queue_overflow, tc) +{ + char path[PATH_MAX]; + size_t size; + int error, dfd, ifd, max, wd; + + size = sizeof(max); + error = sysctlbyname("vfs.inotify.max_queued_events", &max, &size, NULL, + 0); + ATF_REQUIRE(error == 0); + + ifd = inotify(IN_NONBLOCK); + + /* Create a directory and watch it for file creation events. */ + wd = watch_dir(ifd, IN_CREATE, path); + dfd = open(path, O_DIRECTORY); + ATF_REQUIRE(dfd != -1); + /* Generate max+1 file creation events. */ + for (int i = 0; i < max + 1; i++) { + char name[NAME_MAX]; + int fd; + + (void)snprintf(name, sizeof(name), "file%d", i); + fd = openat(dfd, name, O_CREAT | O_RDWR, 0644); + ATF_REQUIRE(fd != -1); + close_checked(fd); + } + + /* + * Read our events. We should see files 0..max-1 and then an overflow + * event. + */ + for (int i = 0; i < max; i++) { + char name[NAME_MAX]; + + (void)snprintf(name, sizeof(name), "file%d", i); + consume_event(ifd, wd, IN_CREATE, 0, name); + } + + /* Look for an overflow event. */ + consume_event(ifd, -1, 0, IN_Q_OVERFLOW, NULL); + + close_checked(dfd); + close_inotify(ifd); +} + +ATF_TC_WITHOUT_HEAD(inotify_event_access_file); +ATF_TC_BODY(inotify_event_access_file, tc) +{ + char path[PATH_MAX], buf[16]; + off_t nb; + ssize_t n; + int error, fd, fd1, ifd, s[2], wd; + + ifd = inotify(IN_NONBLOCK); + + wd = watch_file(ifd, IN_ACCESS, path); + + fd = open(path, O_RDWR); + n = write(fd, "test", 4); + ATF_REQUIRE(n == 4); + + /* A simple read(2) should generate an access. */ + ATF_REQUIRE(lseek(fd, 0, SEEK_SET) == 0); + n = read(fd, buf, sizeof(buf)); + ATF_REQUIRE(n == 4); + ATF_REQUIRE(memcmp(buf, "test", 4) == 0); + consume_event(ifd, wd, IN_ACCESS, 0, NULL); + + /* copy_file_range(2) should as well. */ + ATF_REQUIRE(lseek(fd, 0, SEEK_SET) == 0); + fd1 = open("sink", O_RDWR | O_CREAT, 0644); + ATF_REQUIRE(fd1 != -1); + n = copy_file_range(fd, NULL, fd1, NULL, 4, 0); + ATF_REQUIRE(n == 4); + close_checked(fd1); + consume_event(ifd, wd, IN_ACCESS, 0, NULL); + + /* As should sendfile(2). */ + error = socketpair(AF_UNIX, SOCK_STREAM, 0, s); + ATF_REQUIRE(error == 0); + error = sendfile(fd, s[0], 0, 4, NULL, &nb, 0); + ATF_REQUIRE(error == 0); + ATF_REQUIRE(nb == 4); + consume_event(ifd, wd, IN_ACCESS, 0, NULL); + close_checked(s[0]); + close_checked(s[1]); + + close_checked(fd); + + close_inotify(ifd); +} + +ATF_TC_WITHOUT_HEAD(inotify_event_access_dir); +ATF_TC_BODY(inotify_event_access_dir, tc) +{ + char root[PATH_MAX], path[PATH_MAX]; + struct dirent *ent; + DIR *dir; + int error, ifd, wd; + + ifd = inotify(IN_NONBLOCK); + + wd = watch_dir(ifd, IN_ACCESS, root); + snprintf(path, sizeof(path), "%s/dir", root); + error = mkdir(path, 0755); + ATF_REQUIRE(error == 0); + + /* Read an entry and generate an access. */ + dir = opendir(path); + ATF_REQUIRE(dir != NULL); + ent = readdir(dir); + ATF_REQUIRE(ent != NULL); + ATF_REQUIRE(strcmp(ent->d_name, ".") == 0 || + strcmp(ent->d_name, "..") == 0); + ATF_REQUIRE(closedir(dir) == 0); + consume_event(ifd, wd, IN_ACCESS, IN_ISDIR, "dir"); + + /* + * Reading the watched directory should generate an access event. + * This is contrary to Linux's inotify man page, which states that + * IN_ACCESS is only generated for accesses to objects in a watched + * directory. + */ + dir = opendir(root); + ATF_REQUIRE(dir != NULL); + ent = readdir(dir); + ATF_REQUIRE(ent != NULL); + ATF_REQUIRE(strcmp(ent->d_name, ".") == 0 || + strcmp(ent->d_name, "..") == 0); + ATF_REQUIRE(closedir(dir) == 0); + consume_event(ifd, wd, IN_ACCESS, IN_ISDIR, NULL); + + close_inotify(ifd); +} + +ATF_TC_WITHOUT_HEAD(inotify_event_attrib); +ATF_TC_BODY(inotify_event_attrib, tc) +{ + char path[PATH_MAX]; + int error, ifd, fd, wd; + + ifd = inotify(IN_NONBLOCK); + + wd = watch_file(ifd, IN_ATTRIB, path); + + fd = open(path, O_RDWR); + ATF_REQUIRE(fd != -1); + error = fchmod(fd, 0600); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd, IN_ATTRIB, 0, NULL); + + error = fchown(fd, 0, 0); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd, IN_ATTRIB, 0, NULL); + + close_checked(fd); + close_inotify(ifd); +} + +ATF_TC_WITHOUT_HEAD(inotify_event_close_nowrite); +ATF_TC_BODY(inotify_event_close_nowrite, tc) +{ + char file[PATH_MAX], file1[PATH_MAX], dir[PATH_MAX]; + int ifd, fd, wd1, wd2; + + ifd = inotify(IN_NONBLOCK); + + wd1 = watch_dir(ifd, IN_CLOSE_NOWRITE, dir); + wd2 = watch_file(ifd, IN_CLOSE_NOWRITE | IN_CLOSE_WRITE, file); + + fd = open(dir, O_DIRECTORY); + ATF_REQUIRE(fd != -1); + close_checked(fd); + consume_event(ifd, wd1, IN_CLOSE_NOWRITE, IN_ISDIR, NULL); + + fd = open(file, O_RDONLY); + ATF_REQUIRE(fd != -1); + close_checked(fd); + consume_event(ifd, wd2, IN_CLOSE_NOWRITE, 0, NULL); + + snprintf(file1, sizeof(file1), "%s/file", dir); + fd = open(file1, O_RDONLY | O_CREAT, 0644); + ATF_REQUIRE(fd != -1); + close_checked(fd); + consume_event(ifd, wd1, IN_CLOSE_NOWRITE, 0, "file"); + + close_inotify(ifd); +} + +ATF_TC_WITHOUT_HEAD(inotify_event_close_write); +ATF_TC_BODY(inotify_event_close_write, tc) +{ + char path[PATH_MAX]; + int ifd, fd, wd; + + ifd = inotify(IN_NONBLOCK); + + wd = watch_file(ifd, IN_CLOSE_NOWRITE | IN_CLOSE_WRITE, path); + + fd = open(path, O_RDWR); + ATF_REQUIRE(fd != -1); + close_checked(fd); + consume_event(ifd, wd, IN_CLOSE_WRITE, 0, NULL); + + close_inotify(ifd); +} + +/* Verify that various operations in a directory generate IN_CREATE events. */ +ATF_TC_WITHOUT_HEAD(inotify_event_create); +ATF_TC_BODY(inotify_event_create, tc) +{ + struct sockaddr_un sun; + char path[PATH_MAX], path1[PATH_MAX], root[PATH_MAX]; + ssize_t n; + int error, ifd, ifd1, fd, s, wd, wd1; + char b; + + ifd = inotify(IN_NONBLOCK); + + wd = watch_dir(ifd, IN_CREATE, root); + + /* Regular file. */ + snprintf(path, sizeof(path), "%s/file", root); + fd = open(path, O_RDWR | O_CREAT, 0644); + ATF_REQUIRE(fd != -1); + /* + * Make sure we get an event triggered by the fd used to create the + * file. + */ + ifd1 = inotify(IN_NONBLOCK); + wd1 = inotify_add_watch(ifd1, root, IN_MODIFY); + b = 42; + n = write(fd, &b, sizeof(b)); + ATF_REQUIRE(n == sizeof(b)); + close_checked(fd); + consume_event(ifd, wd, IN_CREATE, 0, "file"); + consume_event(ifd1, wd1, IN_MODIFY, 0, "file"); + close_inotify(ifd1); + + /* Hard link. */ + snprintf(path1, sizeof(path1), "%s/link", root); + error = link(path, path1); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd, IN_CREATE, 0, "link"); + + /* Directory. */ + snprintf(path, sizeof(path), "%s/dir", root); + error = mkdir(path, 0755); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd, IN_CREATE, IN_ISDIR, "dir"); + + /* Symbolic link. */ + snprintf(path1, sizeof(path1), "%s/symlink", root); + error = symlink(path, path1); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd, IN_CREATE, 0, "symlink"); + + /* FIFO. */ + snprintf(path, sizeof(path), "%s/fifo", root); + error = mkfifo(path, 0644); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd, IN_CREATE, 0, "fifo"); + + /* Binding a socket. */ + s = socket(AF_UNIX, SOCK_STREAM, 0); + memset(&sun, 0, sizeof(sun)); + sun.sun_family = AF_UNIX; + sun.sun_len = sizeof(sun); + snprintf(sun.sun_path, sizeof(sun.sun_path), "%s/socket", root); + error = bind(s, (struct sockaddr *)&sun, sizeof(sun)); + ATF_REQUIRE(error == 0); + close_checked(s); + consume_event(ifd, wd, IN_CREATE, 0, "socket"); + + close_inotify(ifd); +} + +ATF_TC_WITHOUT_HEAD(inotify_event_delete); +ATF_TC_BODY(inotify_event_delete, tc) +{ + char root[PATH_MAX], path[PATH_MAX], file[PATH_MAX]; + int error, fd, ifd, wd, wd2; + + ifd = inotify(IN_NONBLOCK); + + wd = watch_dir(ifd, IN_DELETE | IN_DELETE_SELF, root); + + snprintf(path, sizeof(path), "%s/file", root); + fd = open(path, O_RDWR | O_CREAT, 0644); + ATF_REQUIRE(fd != -1); + error = unlink(path); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd, IN_DELETE, 0, "file"); + close_checked(fd); + + /* + * Make sure that renaming over a file generates a delete event when and + * only when that file is watched. + */ + fd = open(path, O_RDWR | O_CREAT, 0644); + ATF_REQUIRE(fd != -1); + close_checked(fd); + wd2 = inotify_add_watch(ifd, path, IN_DELETE | IN_DELETE_SELF); + ATF_REQUIRE(wd2 != -1); + snprintf(file, sizeof(file), "%s/file2", root); + fd = open(file, O_RDWR | O_CREAT, 0644); + ATF_REQUIRE(fd != -1); + close_checked(fd); + error = rename(file, path); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd2, IN_DELETE_SELF, 0, NULL); + consume_event(ifd, wd2, 0, IN_IGNORED, NULL); + + error = unlink(path); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd, IN_DELETE, 0, "file"); + error = rmdir(root); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd, IN_DELETE_SELF, IN_ISDIR, NULL); + consume_event(ifd, wd, 0, IN_IGNORED, NULL); + + close_inotify(ifd); +} + +ATF_TC_WITHOUT_HEAD(inotify_event_move); +ATF_TC_BODY(inotify_event_move, tc) +{ + char dir1[PATH_MAX], dir2[PATH_MAX], path1[PATH_MAX], path2[PATH_MAX]; + char path3[PATH_MAX]; + int error, ifd, fd, wd1, wd2, wd3; + uint32_t cookie1, cookie2; + + ifd = inotify(IN_NONBLOCK); + + wd1 = watch_dir(ifd, IN_MOVE | IN_MOVE_SELF, dir1); + wd2 = watch_dir(ifd, IN_MOVE | IN_MOVE_SELF, dir2); + + snprintf(path1, sizeof(path1), "%s/file", dir1); + fd = open(path1, O_RDWR | O_CREAT, 0644); + ATF_REQUIRE(fd != -1); + close_checked(fd); + snprintf(path2, sizeof(path2), "%s/file2", dir2); + error = rename(path1, path2); + ATF_REQUIRE(error == 0); + cookie1 = consume_event_cookie(ifd, wd1, IN_MOVED_FROM, 0, "file"); + cookie2 = consume_event_cookie(ifd, wd2, IN_MOVED_TO, 0, "file2"); + ATF_REQUIRE_MSG(cookie1 == cookie2, + "expected cookie %u, got %u", cookie1, cookie2); + + snprintf(path2, sizeof(path2), "%s/dir", dir2); + error = rename(dir1, path2); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd1, IN_MOVE_SELF, IN_ISDIR, NULL); + consume_event(ifd, wd2, IN_MOVED_TO, IN_ISDIR, "dir"); + + wd3 = watch_file(ifd, IN_MOVE_SELF, path3); + error = rename(path3, "foo"); + ATF_REQUIRE(error == 0); + consume_event(ifd, wd3, IN_MOVE_SELF, 0, NULL); + + close_inotify(ifd); +} + +ATF_TC_WITHOUT_HEAD(inotify_event_open); +ATF_TC_BODY(inotify_event_open, tc) +{ + char root[PATH_MAX], path[PATH_MAX]; + int error, ifd, fd, wd; + + ifd = inotify(IN_NONBLOCK); + + wd = watch_dir(ifd, IN_OPEN, root); + + snprintf(path, sizeof(path), "%s/file", root); + fd = open(path, O_RDWR | O_CREAT, 0644); + ATF_REQUIRE(fd != -1); + close_checked(fd); + consume_event(ifd, wd, IN_OPEN, 0, "file"); + + fd = open(path, O_PATH); + ATF_REQUIRE(fd != -1); + close_checked(fd); + consume_event(ifd, wd, IN_OPEN, 0, "file"); + + fd = open(root, O_DIRECTORY); + ATF_REQUIRE(fd != -1); + close_checked(fd); + consume_event(ifd, wd, IN_OPEN, IN_ISDIR, NULL); + + snprintf(path, sizeof(path), "%s/fifo", root); + error = mkfifo(path, 0644); + ATF_REQUIRE(error == 0); + fd = open(path, O_RDWR); + ATF_REQUIRE(fd != -1); + close_checked(fd); + consume_event(ifd, wd, IN_OPEN, 0, "fifo"); + + close_inotify(ifd); +} + +ATF_TC_WITH_CLEANUP(inotify_event_unmount); +ATF_TC_HEAD(inotify_event_unmount, tc) +{ + atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(inotify_event_unmount, tc) +{ + int error, fd, ifd, wd; + + ifd = inotify(IN_NONBLOCK); + + error = mkdir("./root", 0755); + ATF_REQUIRE(error == 0); + + mount_tmpfs("./root"); + + error = mkdir("./root/dir", 0755); + ATF_REQUIRE(error == 0); + wd = inotify_add_watch(ifd, "./root/dir", IN_OPEN); + ATF_REQUIRE(wd >= 0); + + fd = open("./root/dir", O_RDONLY | O_DIRECTORY); + ATF_REQUIRE(fd != -1); + consume_event(ifd, wd, IN_OPEN, IN_ISDIR, NULL); + close_checked(fd); + + /* A regular unmount should fail, as inotify holds a vnode reference. */ + error = unmount("./root", 0); + ATF_REQUIRE_ERRNO(EBUSY, error == -1); + error = unmount("./root", MNT_FORCE); + ATF_REQUIRE_MSG(error == 0, + "unmounting ./root failed: %s", strerror(errno)); + + consume_event(ifd, wd, 0, IN_UNMOUNT, NULL); + consume_event(ifd, wd, 0, IN_IGNORED, NULL); + + close_inotify(ifd); +} +ATF_TC_CLEANUP(inotify_event_unmount, tc) +{ + (void)unmount("./root", MNT_FORCE); +} + +ATF_TP_ADD_TCS(tp) +{ + /* Tests for the inotify syscalls. */ + ATF_TP_ADD_TC(tp, inotify_capsicum); + ATF_TP_ADD_TC(tp, inotify_coalesce); + ATF_TP_ADD_TC(tp, inotify_mask_create); + ATF_TP_ADD_TC(tp, inotify_nullfs); + ATF_TP_ADD_TC(tp, inotify_queue_overflow); + /* Tests for the various inotify event types. */ + ATF_TP_ADD_TC(tp, inotify_event_access_file); + ATF_TP_ADD_TC(tp, inotify_event_access_dir); + ATF_TP_ADD_TC(tp, inotify_event_attrib); + ATF_TP_ADD_TC(tp, inotify_event_close_nowrite); + ATF_TP_ADD_TC(tp, inotify_event_close_write); + ATF_TP_ADD_TC(tp, inotify_event_create); + ATF_TP_ADD_TC(tp, inotify_event_delete); + ATF_TP_ADD_TC(tp, inotify_event_move); + ATF_TP_ADD_TC(tp, inotify_event_open); + ATF_TP_ADD_TC(tp, inotify_event_unmount); + return (atf_no_error()); +}