D50315.id156124.diff
No OneTemporary
Actions

Size

95 KB

Referenced Files

None

Subscribers

None

D50315.id156124.diff
View Options

	diff --git a/lib/libc/gen/Makefile.inc b/lib/libc/gen/Makefile.inc
	--- a/lib/libc/gen/Makefile.inc
	+++ b/lib/libc/gen/Makefile.inc
	@@ -89,6 +89,7 @@
	glob.c \
	glob-compat11.c \
	initgroups.c \
	+ inotify.c \
	isatty.c \
	isinf.c \
	isnan.c \
	diff --git a/lib/libc/gen/Symbol.map b/lib/libc/gen/Symbol.map
	--- a/lib/libc/gen/Symbol.map
	+++ b/lib/libc/gen/Symbol.map
	@@ -459,6 +459,11 @@
	aio_write2;
	execvpe;
	fts_open_b;
	+ inotify_add_watch;
	+ inotify_add_watch_at;
	+ inotify_init;
	+ inotify_init1;
	+ inotify_rm_watch;
	psiginfo;
	rtld_get_var;
	rtld_set_var;
	diff --git a/lib/libc/gen/inotify.c b/lib/libc/gen/inotify.c
	new file mode 100644
	--- /dev/null
	+++ b/lib/libc/gen/inotify.c
	@@ -0,0 +1,33 @@
	+/*-
	+ * SPDX-License-Identifier: BSD-2-Clause
	+ *
	+ * Copyright (c) 2025 Klara, Inc.
	+ */
	+
	+#include "namespace.h"
	+#include <sys/fcntl.h>
	+#include <sys/inotify.h>
	+#include <sys/specialfd.h>
	+#include "un-namespace.h"
	+#include "libc_private.h"
	+
	+/*
	+ * Add a watch for "pathname" to the inotify descriptor "fd".  This is a
	+ * thin wrapper that passes AT_FDCWD as the directory descriptor to
	+ * inotify_add_watch_at() and returns its result unchanged.
	+ */
	+int
	+inotify_add_watch(int fd, const char *pathname, uint32_t mask)
	+{
	+	int wd;
	+
	+	wd = inotify_add_watch_at(fd, AT_FDCWD, pathname, mask);
	+	return (wd);
	+}
	+
	+/*
	+ * Create an inotify descriptor.  The caller's flags are handed to the
	+ * __specialfd system call in a struct specialfd_inotify request of type
	+ * SPECIALFD_INOTIFY; the system call's return value is returned as-is.
	+ */
	+int
	+inotify_init1(int flags)
	+{
	+	struct specialfd_inotify req;
	+	int fd;
	+
	+	req.flags = flags;
	+	fd = __sys___specialfd(SPECIALFD_INOTIFY, &req, sizeof(req));
	+	return (fd);
	+}
	+
	+/*
	+ * Create an inotify descriptor with no flags set; equivalent to
	+ * inotify_init1(0).
	+ */
	+int
	+inotify_init(void)
	+{
	+	const int noflags = 0;
	+
	+	return (inotify_init1(noflags));
	+}
	diff --git a/lib/libsys/Makefile.sys b/lib/libsys/Makefile.sys
	--- a/lib/libsys/Makefile.sys
	+++ b/lib/libsys/Makefile.sys
	@@ -224,6 +224,7 @@
	getsockopt.2 \
	gettimeofday.2 \
	getuid.2 \
	+ inotify.2 \
	intro.2 \
	ioctl.2 \
	issetugid.2 \
	@@ -448,6 +449,11 @@
	MLINKS+=getsockopt.2 setsockopt.2
	MLINKS+=gettimeofday.2 settimeofday.2
	MLINKS+=getuid.2 geteuid.2
	+MLINKS+=inotify.2 inotify_init.2 \
	+ inotify.2 inotify_init1.2 \
	+ inotify.2 inotify_add_watch.2 \
	+ inotify.2 inotify_add_watch_at.2 \
	+ inotify.2 inotify_rm_watch.2
	MLINKS+=intro.2 errno.2
	MLINKS+=jail.2 jail_attach.2 \
	jail.2 jail_get.2 \
	diff --git a/lib/libsys/_libsys.h b/lib/libsys/_libsys.h
	--- a/lib/libsys/_libsys.h
	+++ b/lib/libsys/_libsys.h
	@@ -465,6 +465,8 @@
	typedef int (__sys_getrlimitusage_t)(u_int, int, rlim_t *);
	typedef int (__sys_fchroot_t)(int);
	typedef int (__sys_setcred_t)(u_int, const struct setcred *, size_t);
	+typedef int (__sys_inotify_add_watch_t)(int, const char *, uint32_t);
	+typedef int (__sys_inotify_rm_watch_t)(int, int);

	void __sys_exit(int rval);
	int __sys_fork(void);
	@@ -866,6 +868,8 @@
	int __sys_getrlimitusage(u_int which, int flags, rlim_t * res);
	int __sys_fchroot(int fd);
	int __sys_setcred(u_int flags, const struct setcred * wcred, size_t size);
	+int __sys_inotify_add_watch(int fd, const char * path, uint32_t mask);
	+int __sys_inotify_rm_watch(int fd, int wd);
	__END_DECLS

	#endif /* __LIBSYS_H_ */
	diff --git a/lib/libsys/inotify.2 b/lib/libsys/inotify.2
	new file mode 100644
	--- /dev/null
	+++ b/lib/libsys/inotify.2
	@@ -0,0 +1,373 @@
	+.\"
	+.\" SPDX-License-Identifier: BSD-2-Clause
	+.\"
	+.\" Copyright (c) 2025 Klara, Inc.
	+.\"
	+.Dd May 19, 2025
	+.Dt INOTIFY 2
	+.Os
	+.Sh NAME
	+.Nm inotify_init ,
	+.Nm inotify_init1 ,
	+.Nm inotify_add_watch ,
	+.Nm inotify_add_watch_at ,
	+.Nm inotify_rm_watch
	+.Nd monitor file system events
	+.Sh LIBRARY
	+.Lb libc
	+.Sh SYNOPSIS
	+.In sys/inotify.h
	+.Ft int
	+.Fo inotify_init
	+.Fc
	+.Ft int
	+.Fo inotify_init1
	+.Fa "int flags"
	+.Fc
	+.Ft int
	+.Fo inotify_add_watch
	+.Fa "int fd"
	+.Fa "const char *pathname"
	+.Fa "uint32_t mask"
	+.Fc
	+.Ft int
	+.Fo inotify_add_watch_at
	+.Fa "int fd"
	+.Fa "int dfd"
	+.Fa "const char *pathname"
	+.Fa "uint32_t mask"
	+.Fc
	+.Ft int
	+.Fo inotify_rm_watch
	+.Fa "int fd"
	+.Fa "int wd"
	+.Fc
	+.Bd -literal
	+struct inotify_event {
	+ int wd; /* Watch descriptor */
	+ uint32_t mask; /* Event and flags */
	+ uint32_t cookie; /* Unique ID which links rename events */
	+ uint32_t len; /* Name field size, including nul bytes */
	+ char name[0]; /* Filename (nul-terminated) */
	+};
	+.Ed
	+.Sh DESCRIPTION
	+The inotify system calls provide an interface to monitor file system events.
	+They aim to be compatible with the Linux inotify interface.
	+The provided functionality is similar to the
	+.Dv EVFILT_VNODE
	+filter of the
	+.Xr kevent 2
	+system call, but further allows monitoring of a directory without needing to
	+open each object in that directory.
	+This avoids races and reduces the number of file descriptors needed to monitor
	+a large file hierarchy.
	+.Pp
	+inotify allows one or more file system objects, generally files or directories,
	+to be watched for events, such as file open or close.
	+Watched objects are associated with a file descriptor returned
	+by
	+.Fn inotify_init
	+or
	+.Fn inotify_init1 .
	+When an event occurs, a record describing the event becomes available for
	+reading from the inotify file descriptor.
	+Each inotify descriptor thus refers to a queue of events waiting to be read.
	+.Pp
	+The
	+.Fn inotify_init1
	+system call accepts two flags.
	+The
	+.Dv IN_NONBLOCK
	+flag causes the inotify descriptor to be opened in non-blocking mode, such that
	+.Xr read 2
	+calls will not block if no records are available to consume, and will instead
	+return
	+.Er EWOULDBLOCK .
	+The
	+.Dv IN_CLOEXEC
	+flag causes the inotify descriptor to be closed automatically when
	+.Xr execve 2
	+is called.
	+.Pp
	+To watch a file or directory, the
	+.Fn inotify_add_watch
	+or
	+.Fn inotify_add_watch_at
	+system calls must be used.
	+They take a path and a mask of events to watch for, and return a
	+.Dq watch descriptor ,
	+a non-negative integer which uniquely identifies the watched object within the
	+inotify descriptor.
	+.Pp
	+The
	+.Fn inotify_rm_watch
	+system call removes a watch from an inotify descriptor.
	+.Pp
	+When watching a directory, objects within the directory are monitored for events
	+as well as the directory itself.
	+A record describing an inotify event consists of a
	+.Dq struct inotify_event
	+followed by the name of the object in the directory being watched.
	+If the watched object itself generates an event, no name is present.
	+Extra nul bytes may follow the file name in order to provide alignment for a
	+subsequent record.
	+.Pp
	+The following events are defined:
	+.Bl -tag -width IN_CLOSE_NOWRITE
	+.It Dv IN_ACCESS
	+A file's contents were accessed, e.g., by
	+.Xr read 2 ,
	+.Xr copy_file_range 2 ,
	+.Xr sendfile 2 ,
	+or
	+.Xr getdirentries 2 .
	+.It Dv IN_ATTRIB
	+A file's metadata was changed, e.g., by
	+.Xr chmod 2
	+or
	+.Xr unlink 2 .
	+.It Dv IN_CLOSE_WRITE
	+A file that was previously opened for writing was closed.
	+.It Dv IN_CLOSE_NOWRITE
	+A file that was previously opened read-only was closed.
	+.It Dv IN_CREATE
	+A file within a watched directory was created, e.g., by
	+.Xr open 2 ,
	+.Xr mkdir 2 ,
	+.Xr symlink 2 ,
	+or
	+.Xr bind 2 .
	+.It Dv IN_DELETE
	+A file or directory within a watched directory was removed.
	+.It Dv IN_DELETE_SELF
	+The watched file or directory itself was deleted.
	+This event is generated only when the link count of the file drops
	+to zero.
	+.It Dv IN_MODIFY
	+A file's contents were modified, e.g., by
	+.Xr write 2
	+or
	+.Xr copy_file_range 2 .
	+.It Dv IN_MOVE_SELF
	+The watched file or directory itself was renamed.
	+.It Dv IN_MOVED_FROM
	+A file or directory was moved from a watched directory.
	+.It Dv IN_MOVED_TO
	+A file or directory was moved into a watched directory.
	+A
	+.Xr rename 2
	+call thus may generate two events, one for the old name and one for the new
	+name.
	+These are linked together by the
	+.Ar cookie
	+field in the inotify record, which can be compared to link the two records
	+to the same event.
	+.It Dv IN_OPEN
	+A file was opened.
	+.El
	+.Pp
	+Some additional flags may be set in inotify event records:
	+.Bl -tag -width IN_Q_OVERFLOW
	+.It Dv IN_IGNORED
	+When a watch is removed from a file, for example because it was created with the
	+.Dv IN_ONESHOT
	+flag, the file was deleted, or the watch was explicitly removed with
	+.Xr inotify_rm_watch 2 ,
	+an event with this mask is generated to indicate that the watch will not
	+generate any more events.
	+Once this event is generated, the watch is automatically removed, and in
	+particular should not be removed manually with
	+.Xr inotify_rm_watch 2 .
	+.It Dv IN_ISDIR
	+When the subject of an event is a directory, this flag is set in the
	+.Ar mask
	+field of the event record.
	+.It Dv IN_Q_OVERFLOW
	+One or more events were dropped, for example because of a kernel memory allocation
	+failure or because the event queue size hit a limit.
	+.It Dv IN_UNMOUNT
	+The filesystem containing the watched object was unmounted.
	+.El
	+.Pp
	+A number of flags may also be specified in the
	+.Ar mask
	+given to
	+.Fn inotify_add_watch
	+and
	+.Fn inotify_add_watch_at :
	+.Bl -tag -width IN_DONT_FOLLOW
	+.It Dv IN_DONT_FOLLOW
	+If
	+.Ar pathname
	+is a symbolic link, do not follow it.
	+.It Dv IN_EXCL_UNLINK
	+This currently has no effect, see the
	+.Sx BUGS
	+section.
	+.It Dv IN_MASK_ADD
	+When adding a watch to an object, and that object is already watched by the
	+same inotify descriptor, by default the mask of the existing watch is
	+overwritten.
	+When
	+.Dv IN_MASK_ADD
	+is specified, the mask of the existing watch is instead logically ORed with
	+the new mask.
	+.It Dv IN_MASK_CREATE
	+When
	+.Fn inotify_add_watch
	+is used to add a watch to an object,
	+.Dv IN_MASK_CREATE
	+is specified, and that object is already watched by the same inotify descriptor,
	+return an error instead of updating the existing watch.
	+.It Dv IN_ONESHOT
	+Monitor the object for a single event, after which the watch is automatically
	+removed.
	+As part of removal, a
	+.Dv IN_IGNORED
	+event is generated.
	+.It Dv IN_ONLYDIR
	+When creating a watch, fail with
	+.Er ENOTDIR
	+if the path does not refer to a directory.
	+.El
	+.Sh SYSCTL VARIABLES
	+The following variables are available as both
	+.Xr sysctl 8
	+variables and
	+.Xr loader 8
	+tunables:
	+.Bl -tag -width 15
	+.It Va vfs.inotify.max_events
	+The maximum number of inotify records that can be queued for a single
	+inotify descriptor.
	+Records in excess of this limit are discarded, and a single event with
	+mask equal to
	+.Dv IN_Q_OVERFLOW
	+will be present in the queue.
	+.It Va vfs.inotify.max_user_instances
	+The maximum number of inotify descriptors that can be created by a single
	+user.
	+.It Va vfs.inotify.max_user_watches
	+The maximum number of inotify watches per user.
	+.El
	+.Sh EXAMPLES
	+See the example program in
	+.Pa /usr/share/examples/inotify/inotify.c .
	+.Sh ERRORS
	+The
	+.Fn inotify_init
	+and
	+.Fn inotify_init1
	+functions will fail if:
	+.Bl -tag -width Er
	+.It Bq Er ENFILE
	+The system limit on the total number of open files has been reached.
	+.It Bq Er EMFILE
	+A per-process limit on the number of open files has been reached.
	+.It Bq Er EMFILE
	+The per-user limit on the number of inotify descriptors has been reached.
	+.It Bq Er EINVAL
	+An unrecognized flag was passed to
	+.Fn inotify_init1 .
	+.El
	+.Pp
	+The
	+.Fn inotify_add_watch
	+and
	+.Fn inotify_add_watch_at
	+system calls will fail if:
	+.Bl -tag -width Er
	+.It Bq Er EBADF
	+The
	+.Ar fd
	+parameter is not a valid file descriptor.
	+.It Bq Er EINVAL
	+The
	+.Ar fd
	+parameter is not an inotify descriptor.
	+.It Bq Er EINVAL
	+The
	+.Ar mask
	+parameter does not specify an event, or
	+the
	+.Dv IN_MASK_CREATE
	+and
	+.Dv IN_MASK_ADD
	+flags are both set, or an unrecognized flag was passed.
	+.It Bq Er ENOTDIR
	+The
	+.Ar pathname
	+parameter refers to a file that is not a directory, and the
	+.Dv IN_ONLYDIR
	+flag was specified.
	+.It Bq Er ENOSPC
	+The per-user limit on the total number of inotify watches has been reached.
	+.It Bq Er ECAPMODE
	+The process is in capability mode and
	+.Fn inotify_add_watch
	+was called, or
	+.Fn inotify_add_watch_at
	+was called with
	+.Dv AT_FDCWD
	+as the directory file descriptor
	+.Ar dfd .
	+.It Bq Er ENOTCAPABLE
	+The process is in capability mode and
	+.Ar pathname
	+contains a
	+.Dq ..
	+component leading to a directory outside the directory hierarchy specified
	+by
	+.Ar dfd .
	+.El
	+.Pp
	+The
	+.Fn inotify_rm_watch
	+system call will fail if:
	+.Bl -tag -width Er
	+.It Bq Er EBADF
	+The
	+.Ar fd
	+parameter is not a valid file descriptor.
	+.It Bq Er EINVAL
	+The
	+.Ar fd
	+parameter is not an inotify descriptor.
	+.It Bq Er EINVAL
	+The
	+.Ar wd
	+parameter is not a valid watch descriptor.
	+.El
	+.Sh SEE ALSO
	+.Xr kevent 2 ,
	+.Xr capsicum 4
	+.Sh STANDARDS
	+The
	+.Nm
	+interface originates from Linux and is non-standard.
	+This implementation aims to be compatible with that of Linux and is based
	+on the documentation available at
	+.Pa https://man7.org/linux/man-pages/man7/inotify.7.html .
	+.Sh HISTORY
	+The inotify system calls first appeared in
	+.Fx 15.0 .
	+.Sh BUGS
	+If a file in a watched directory has multiple hard links,
	+an access via any hard link for that file will generate an event, even
	+if the accessed link belongs to an unwatched directory.
	+This is not the case for the Linux implementation, where only accesses
	+via the hard link in the watched directory will generate an event.
	+.Pp
	+If a watched directory contains multiple hard links of a file, an event
	+on one of the hard links will generate an inotify record for each link
	+in the directory.
	+.Pp
	+When a file is unlinked, no more events will be generated for that file,
	+even if it continues to be accessed.
	+By default, the Linux implementation will continue to generate events in
	+this case.
	+Thus, the
	+.Fx
	+implementation behaves as though
	+.Dv IN_EXCL_UNLINK
	+is always set.
	diff --git a/share/examples/Makefile b/share/examples/Makefile
	--- a/share/examples/Makefile
	+++ b/share/examples/Makefile
	@@ -15,6 +15,7 @@
	find_interface \
	flua \
	indent \
	+ inotify \
	ipfw \
	jails \
	kld \
	diff --git a/share/examples/inotify/Makefile b/share/examples/inotify/Makefile
	new file mode 100644
	--- /dev/null
	+++ b/share/examples/inotify/Makefile
	@@ -0,0 +1,6 @@
	+PROG= inotify
	+MAN=
	+
	+LIBADD= xo
	+
	+.include <bsd.prog.mk>
	diff --git a/share/examples/inotify/inotify.c b/share/examples/inotify/inotify.c
	new file mode 100644
	--- /dev/null
	+++ b/share/examples/inotify/inotify.c
	@@ -0,0 +1,172 @@
	+/*-
	+ * SPDX-License-Identifier: BSD-2-Clause
	+ *
	+ * Copyright (c) 2025 Klara, Inc.
	+ */
	+
	+/*
	+ * A simple program to demonstrate inotify. Given one or more paths, it watches
	+ * all events on those paths and prints them to standard output.
	+ */
	+
	+#include <sys/types.h>
	+#include <sys/event.h>
	+#include <sys/inotify.h>
	+
	+#include <assert.h>
	+#include <err.h>
	+#include <limits.h>
	+#include <signal.h>
	+#include <stdio.h>
	+#include <stdlib.h>
	+#include <unistd.h>
	+
	+#include <libxo/xo.h>
	+
	+/*
	+ * Report the expected command line through xo_errx(), passing an exit
	+ * status of 1.
	+ */
	+static void
	+usage(void)
	+{
	+	const int status = 1;
	+
	+	xo_errx(status, "usage: inotify <path1> [<path2> ...]");
	+}
	+
	+/*
	+ * Translate the mask of an inotify record into the name of the event it
	+ * carries.
	+ *
	+ * The bits covered by IN_ALL_EVENTS are examined first; if none of the
	+ * named events matches, the complete mask is compared against the
	+ * informational events IN_IGNORED, IN_Q_OVERFLOW and IN_UNMOUNT.
	+ *
	+ * An unrecognized mask produces a warning and trips an assertion.  When
	+ * assertions are compiled out (NDEBUG) the function returns "UNKNOWN"
	+ * rather than running off the end of a non-void function.
	+ */
	+static const char *
	+ev2str(uint32_t event)
	+{
	+	switch (event & IN_ALL_EVENTS) {
	+	case IN_ACCESS:
	+		return ("IN_ACCESS");
	+	case IN_ATTRIB:
	+		return ("IN_ATTRIB");
	+	case IN_CLOSE_WRITE:
	+		return ("IN_CLOSE_WRITE");
	+	case IN_CLOSE_NOWRITE:
	+		return ("IN_CLOSE_NOWRITE");
	+	case IN_CREATE:
	+		return ("IN_CREATE");
	+	case IN_DELETE:
	+		return ("IN_DELETE");
	+	case IN_DELETE_SELF:
	+		return ("IN_DELETE_SELF");
	+	case IN_MODIFY:
	+		return ("IN_MODIFY");
	+	case IN_MOVE_SELF:
	+		return ("IN_MOVE_SELF");
	+	case IN_MOVED_FROM:
	+		return ("IN_MOVED_FROM");
	+	case IN_MOVED_TO:
	+		return ("IN_MOVED_TO");
	+	case IN_OPEN:
	+		return ("IN_OPEN");
	+	default:
	+		break;
	+	}
	+
	+	/* Not a regular event: check the informational masks. */
	+	switch (event) {
	+	case IN_IGNORED:
	+		return ("IN_IGNORED");
	+	case IN_Q_OVERFLOW:
	+		return ("IN_Q_OVERFLOW");
	+	case IN_UNMOUNT:
	+		return ("IN_UNMOUNT");
	+	default:
	+		break;
	+	}
	+
	+	warnx("unknown event %#x", event);
	+	assert(0);
	+	/* Only reached when assert() is disabled. */
	+	return ("UNKNOWN");
	+}
	+
	+/*
	+ * Arrange for signal "sig" to be reported through the kqueue "kq": the
	+ * signal's disposition is set to SIG_IGN and an EVFILT_SIGNAL filter for it
	+ * is added to the queue.  Exits via xo_err() if the filter cannot be
	+ * registered.
	+ */
	+static void
	+set_handler(int kq, int sig)
	+{
	+	struct kevent sigev;
	+	int rc;
	+
	+	(void)signal(sig, SIG_IGN);
	+	EV_SET(&sigev, sig, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
	+	rc = kevent(kq, &sigev, 1, NULL, 0, NULL);
	+	if (rc < 0)
	+		xo_err(1, "kevent");
	+}
	+
	+/*
	+ * Watch every path named on the command line for all inotify events and
	+ * emit one libxo "event" instance per record until SIGINT, SIGTERM,
	+ * SIGHUP or SIGQUIT is received.
	+ *
	+ * Both the inotify descriptor and the signals are multiplexed through a
	+ * single kqueue so that the "events" list can be closed cleanly on exit.
	+ */
	+int
	+main(int argc, char **argv)
	+{
	+	struct inotify_event *iev, *iev1;
	+	struct kevent kev;
	+	size_t ievsz;
	+	int ifd, kq;
	+
	+	argc = xo_parse_args(argc, argv);
	+	if (argc < 2)
	+		usage();
	+	argc--;
	+	argv++;
	+
	+	ifd = inotify_init1(IN_NONBLOCK);
	+	if (ifd < 0)
	+		xo_err(1, "inotify");
	+	for (int i = 0; i < argc; i++) {
	+		int wd;
	+
	+		wd = inotify_add_watch(ifd, argv[i], IN_ALL_EVENTS);
	+		if (wd < 0)
	+			xo_err(1, "inotify_add_watch(%s)", argv[i]);
	+	}
	+
	+	xo_set_version("1");
	+	xo_open_list("events");
	+
	+	kq = kqueue();
	+	if (kq < 0)
	+		xo_err(1, "kqueue");
	+
	+	/*
	+	 * Handle signals in the event loop so that we can close the xo list.
	+	 */
	+	set_handler(kq, SIGINT);
	+	set_handler(kq, SIGTERM);
	+	set_handler(kq, SIGHUP);
	+	set_handler(kq, SIGQUIT);
	+
	+	/*
	+	 * Monitor the inotify descriptor for events.
	+	 */
	+	EV_SET(&kev, ifd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	+	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
	+		xo_err(1, "kevent");
	+
	+	/* Room for one record header plus a maximum-length file name. */
	+	ievsz = sizeof(*iev) + NAME_MAX + 1;
	+	iev = malloc(ievsz);
	+	if (iev == NULL)
	+		xo_err(1, "malloc");
	+
	+	for (;;) {
	+		ssize_t n;
	+		const char *ev;
	+
	+		if (kevent(kq, NULL, 0, &kev, 1, NULL) < 0)
	+			xo_err(1, "kevent");
	+		if (kev.filter == EVFILT_SIGNAL)
	+			break;
	+
	+		n = read(ifd, iev, ievsz);
	+		if (n < 0)
	+			xo_err(1, "read");
	+		assert(n >= (ssize_t)sizeof(*iev));
	+
	+		/* A single read may return several packed records. */
	+		for (iev1 = iev; n > 0;) {
	+			size_t reclen;
	+
	+			assert(n >= (ssize_t)sizeof(*iev1));
	+			reclen = sizeof(*iev1) + iev1->len;
	+
	+			ev = ev2str(iev1->mask);
	+			xo_open_instance("event");
	+			/*
	+			 * A record with len == 0 carries no name; do not
	+			 * read past the end of the record in that case.
	+			 */
	+			xo_emit("{:wd/%3d} {:event/%16s} {:name/%s}\n",
	+			    iev1->wd, ev, iev1->len > 0 ? iev1->name : "");
	+			xo_close_instance("event");
	+
	+			n -= (ssize_t)reclen;
	+			iev1 = (struct inotify_event *)(void *)
	+			    ((char *)iev1 + reclen);
	+		}
	+		(void)xo_flush();
	+	}
	+
	+	free(iev);
	+	xo_close_list("events");
	+
	+	if (xo_finish() < 0)
	+		xo_err(1, "stdout");
	+	exit(0);
	+}
	diff --git a/share/man/man4/rights.4 b/share/man/man4/rights.4
	--- a/share/man/man4/rights.4
	+++ b/share/man/man4/rights.4
	@@ -30,7 +30,7 @@
	.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	.\" SUCH DAMAGE.
	.\"
	-.Dd May 1, 2024
	+.Dd May 22, 2025
	.Dt RIGHTS 4
	.Os
	.Sh NAME
	@@ -319,6 +319,14 @@
	.It Dv CAP_GETSOCKOPT
	Permit
	.Xr getsockopt 2 .
	+.It Dv CAP_INOTIFY_ADD
	+Permit
	+.Xr inotify_add_watch 2
	+and
	+.Xr inotify_add_watch_at 2 .
	+.It Dv CAP_INOTIFY_RM
	+Permit
	+.Xr inotify_rm_watch 2 .
	.It Dv CAP_IOCTL
	Permit
	.Xr ioctl 2 .
	diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile
	--- a/share/man/man9/Makefile
	+++ b/share/man/man9/Makefile
	@@ -435,6 +435,7 @@
	VOP_GETEXTATTR.9 \
	VOP_GETPAGES.9 \
	VOP_INACTIVE.9 \
	+ VOP_INOTIFY.9 \
	VOP_IOCTL.9 \
	VOP_LINK.9 \
	VOP_LISTEXTATTR.9 \
	@@ -2461,6 +2462,7 @@
	MLINKS+=VOP_FSYNC.9 VOP_FDATASYNC.9
	MLINKS+=VOP_GETPAGES.9 VOP_PUTPAGES.9
	MLINKS+=VOP_INACTIVE.9 VOP_RECLAIM.9
	+MLINKS+=VOP_INOTIFY.9 VOP_INOTIFY_ADD_WATCH.9
	MLINKS+=VOP_LOCK.9 vn_lock.9 \
	VOP_LOCK.9 VOP_ISLOCKED.9 \
	VOP_LOCK.9 VOP_UNLOCK.9
	diff --git a/share/man/man9/VOP_INOTIFY.9 b/share/man/man9/VOP_INOTIFY.9
	new file mode 100644
	--- /dev/null
	+++ b/share/man/man9/VOP_INOTIFY.9
	@@ -0,0 +1,60 @@
	+.\"-
	+.\" SPDX-License-Identifier: BSD-2-Clause
	+.\"
	+.\" Copyright (c) 2025 Klara, Inc.
	+.\"
	+.Dd May 27, 2025
	+.Dt VOP_INOTIFY 9
	+.Os
	+.Sh NAME
	+.Nm VOP_INOTIFY
	+.Nd vnode inotify interface
	+.Sh SYNOPSIS
	+.In sys/param.h
	+.In sys/vnode.h
	+.Ft int
	+.Fo VOP_INOTIFY
	+.Fa "struct vnode *vp"
	+.Fa "struct vnode *dvp"
	+.Fa "struct componentname *cnp"
	+.Fa "int event"
	+.Fa "uint32_t cookie"
	+.Fc
	+.Ft int
	+.Fo VOP_INOTIFY_ADD_WATCH
	+.Fa "struct vnode *vp"
	+.Fa "struct inotify_softc *sc"
	+.Fa "uint32_t mask"
	+.Fa "uint32_t *wdp"
	+.Fa "struct thread *td"
	+.Fc
	+.Sh DESCRIPTION
	+The
	+.Fn VOP_INOTIFY
	+operation notifies the
	+.Xr inotify 2
	+subsystem of a file system event on a vnode.
	+The
	+.Fa dvp
	+and
	+.Fa cnp
	+arguments are optional and are only used to obtain a file name for the event.
	+If they are omitted, the inotify subsystem will use the file name cache to
	+find a name for the vnode, but this is more expensive.
	+.Pp
	+The
	+.Fn VOP_INOTIFY_ADD_WATCH
	+operation is for internal use by the inotify subsystem to add a watch to a
	+vnode.
	+.Sh LOCKS
	+The
	+.Fn VOP_INOTIFY
	+operation does not assume any particular vnode lock state.
	+The
	+.Fn VOP_INOTIFY_ADD_WATCH
	+operation should be called with the vnode locked.
	+.Sh RETURN VALUES
	+Zero is returned on success, otherwise an error code is returned.
	+.Sh SEE ALSO
	+.Xr inotify 2 ,
	+.Xr vnode 9
	diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
	--- a/sys/bsm/audit_kevents.h
	+++ b/sys/bsm/audit_kevents.h
	@@ -663,6 +663,7 @@
	#define AUE_FSPACECTL 43269 /* FreeBSD-specific. */
	#define AUE_TIMERFD 43270 /* FreeBSD/Linux. */
	#define AUE_SETCRED 43271 /* FreeBSD-specific. */
	+#define AUE_INOTIFY 43272 /* FreeBSD/Linux. */

	/*
	* Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
	diff --git a/sys/conf/files b/sys/conf/files
	--- a/sys/conf/files
	+++ b/sys/conf/files
	@@ -3979,6 +3979,7 @@
	kern/vfs_extattr.c standard
	kern/vfs_hash.c standard
	kern/vfs_init.c standard
	+kern/vfs_inotify.c standard
	kern/vfs_lookup.c standard
	kern/vfs_mount.c standard
	kern/vfs_mountroot.c standard
	diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c
	--- a/sys/fs/nullfs/null_subr.c
	+++ b/sys/fs/nullfs/null_subr.c
	@@ -245,6 +245,10 @@
	vp->v_object = lowervp->v_object;
	vn_irflag_set(vp, VIRF_PGREAD);
	}
	+ if ((vn_irflag_read(lowervp) & VIRF_INOTIFY) != 0)
	+ vn_irflag_set(vp, VIRF_INOTIFY);
	+ if ((vn_irflag_read(lowervp) & VIRF_INOTIFY_PARENT) != 0)
	+ vn_irflag_set(vp, VIRF_INOTIFY_PARENT);
	if (lowervp == MOUNTTONULLMOUNT(mp)->nullm_lowerrootvp)
	vp->v_vflag \|= VV_ROOT;

	diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c
	--- a/sys/fs/nullfs/null_vnops.c
	+++ b/sys/fs/nullfs/null_vnops.c
	@@ -189,6 +189,26 @@
	SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW,
	&null_bug_bypass, 0, "");

	+/*
	+ * Synchronize one inotify flag of the upper (nullfs) vnode "vp" with the
	+ * lower vnode "lvp":
	+ * - If the upper vnode has the flag set and the lower does not, then the
	+ *   lower vnode is unwatched and the upper vnode does not need to go
	+ *   through VOP_INOTIFY, so clear the flag.
	+ * - If the lower vnode is watched, then the upper vnode should go through
	+ *   VOP_INOTIFY, so copy the flag up.
	+ *
	+ * "flag" is the v_irflag bit to synchronize; callers pass VIRF_INOTIFY or
	+ * VIRF_INOTIFY_PARENT.
	+ */
	+static void
	+null_copy_inotify(struct vnode *vp, struct vnode *lvp, short flag)
	+{
	+	if ((vn_irflag_read(vp) & flag) != 0) {
	+		/* Upper is flagged: drop the flag if the lower lost it. */
	+		if (__predict_false((vn_irflag_read(lvp) & flag) == 0))
	+			vn_irflag_unset(vp, flag);
	+	} else if ((vn_irflag_read(lvp) & flag) != 0) {
	+		/* Lower is flagged and upper is not: copy the flag up. */
	+		if (__predict_false((vn_irflag_read(vp) & flag) == 0))
	+			vn_irflag_set(vp, flag);
	+	}
	+}
	+
	/*
	* This is the 10-Apr-92 bypass routine.
	* This version has been optimized for speed, throwing away some
	@@ -305,7 +325,10 @@
	lvp = *(vps_p[i]);

	/*
	- * Get rid of the transient hold on lvp.
	+ * Get rid of the transient hold on lvp. Copy inotify
	+ * flags up in case something is watching the lower
	+ * layer.
	+ *
	* If lowervp was unlocked during VOP
	* operation, nullfs upper vnode could have
	* been reclaimed, which changes its v_vnlock
	@@ -314,6 +337,10 @@
	* upper (reclaimed) vnode.
	*/
	if (lvp != NULLVP) {
	+ null_copy_inotify(old_vps[i], lvp,
	+ VIRF_INOTIFY);
	+ null_copy_inotify(old_vps[i], lvp,
	+ VIRF_INOTIFY_PARENT);
	if (VOP_ISLOCKED(lvp) == LK_EXCLUSIVE &&
	old_vps[i]->v_vnlock != lvp->v_vnlock) {
	VOP_UNLOCK(lvp);
	diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
	--- a/sys/kern/kern_exec.c
	+++ b/sys/kern/kern_exec.c
	@@ -26,7 +26,6 @@
	* SUCH DAMAGE.
	*/

	-#include <sys/cdefs.h>
	#include "opt_capsicum.h"
	#include "opt_hwpmc_hooks.h"
	#include "opt_ktrace.h"
	@@ -44,6 +43,7 @@
	#include <sys/filedesc.h>
	#include <sys/imgact.h>
	#include <sys/imgact_elf.h>
	+#include <sys/inotify.h>
	#include <sys/kernel.h>
	#include <sys/lock.h>
	#include <sys/malloc.h>
	@@ -1882,8 +1882,10 @@
	* general case).
	*/
	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
	- if (error == 0)
	+ if (error == 0) {
	+ INOTIFY(vp, IN_OPEN);
	imgp->opened = true;
	+ }
	return (error);
	}

	@@ -1892,6 +1894,7 @@
	{
	if (imgp->opened) {
	VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
	+ INOTIFY(imgp->vp, IN_CLOSE);
	imgp->opened = false;
	}
	if (imgp->textset) {
	diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
	--- a/sys/kern/kern_resource.c
	+++ b/sys/kern/kern_resource.c
	@@ -1637,6 +1637,12 @@
	if (uip->ui_pipecnt != 0)
	printf("freeing uidinfo: uid = %d, pipecnt = %ld\n",
	uip->ui_uid, uip->ui_pipecnt);
	+ if (uip->ui_inotifycnt != 0)
	+ printf("freeing uidinfo: uid = %d, inotifycnt = %ld\n",
	+ uip->ui_uid, uip->ui_inotifycnt);
	+ if (uip->ui_inotifywatchcnt != 0)
	+ printf("freeing uidinfo: uid = %d, inotifywatchcnt = %ld\n",
	+ uip->ui_uid, uip->ui_inotifywatchcnt);
	free(uip, M_UIDINFO);
	}

	@@ -1742,6 +1748,21 @@
	return (chglimit(uip, &uip->ui_pipecnt, diff, max, "pipecnt"));
	}

	+/*
	+ * Adjust the inotify descriptor counter (ui_inotifycnt) of "uip" by "diff".
	+ * The work is delegated to chglimit(), which is also given "max" and the
	+ * counter name "inotifycnt"; its result is returned unchanged.
	+ */
	+int
	+chginotifycnt(struct uidinfo *uip, int diff, rlim_t max)
	+{
	+	int rv;
	+
	+	rv = chglimit(uip, &uip->ui_inotifycnt, diff, max, "inotifycnt");
	+	return (rv);
	+}
	+
	+/*
	+ * Adjust the inotify watch counter (ui_inotifywatchcnt) of "uip" by "diff".
	+ * The work is delegated to chglimit(), which is also given "max" and the
	+ * counter name "inotifywatchcnt"; its result is returned unchanged.
	+ */
	+int
	+chginotifywatchcnt(struct uidinfo *uip, int diff, rlim_t max)
	+{
	+	int rv;
	+
	+	rv = chglimit(uip, &uip->ui_inotifywatchcnt, diff, max,
	+	    "inotifywatchcnt");
	+	return (rv);
	+}
	+
	static int
	sysctl_kern_proc_rlimit_usage(SYSCTL_HANDLER_ARGS)
	{
	diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
	--- a/sys/kern/kern_sendfile.c
	+++ b/sys/kern/kern_sendfile.c
	@@ -27,12 +27,12 @@
	* SUCH DAMAGE.
	*/

	-#include <sys/cdefs.h>
	#include "opt_kern_tls.h"

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/capsicum.h>
	+#include <sys/inotify.h>
	#include <sys/kernel.h>
	#include <sys/lock.h>
	#include <sys/ktls.h>
	@@ -1242,6 +1242,8 @@
	*/
	if (error == 0) {
	td->td_retval[0] = 0;
	+ if (sbytes > 0 && vp != NULL)
	+ INOTIFY(vp, IN_ACCESS);
	}
	if (sent != NULL) {
	(*sent) = sbytes;
	diff --git a/sys/kern/subr_capability.c b/sys/kern/subr_capability.c
	--- a/sys/kern/subr_capability.c
	+++ b/sys/kern/subr_capability.c
	@@ -74,6 +74,10 @@
	CAP_RIGHTS_INITIALIZER(CAP_GETSOCKOPT);
	const cap_rights_t cap_getsockname_rights =
	CAP_RIGHTS_INITIALIZER(CAP_GETSOCKNAME);
	+const cap_rights_t cap_inotify_add_rights =
	+ CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_ADD);
	+const cap_rights_t cap_inotify_rm_rights =
	+ CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_RM);
	const cap_rights_t cap_ioctl_rights = CAP_RIGHTS_INITIALIZER(CAP_IOCTL);
	const cap_rights_t cap_listen_rights = CAP_RIGHTS_INITIALIZER(CAP_LISTEN);
	const cap_rights_t cap_linkat_source_rights =
	diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
	--- a/sys/kern/sys_generic.c
	+++ b/sys/kern/sys_generic.c
	@@ -34,7 +34,6 @@
	* SUCH DAMAGE.
	*/

	-#include <sys/cdefs.h>
	#include "opt_capsicum.h"
	#include "opt_ktrace.h"

	@@ -46,6 +45,7 @@
	#include <sys/filio.h>
	#include <sys/fcntl.h>
	#include <sys/file.h>
	+#include <sys/inotify.h>
	#include <sys/lock.h>
	#include <sys/proc.h>
	#include <sys/signalvar.h>
	@@ -938,7 +938,6 @@
	kern_specialfd(struct thread td, int type, void arg)
	{
	struct file *fp;
	- struct specialfd_eventfd *ae;
	int error, fd, fflags;

	fflags = 0;
	@@ -947,12 +946,22 @@
	return (error);

	switch (type) {
	- case SPECIALFD_EVENTFD:
	+ case SPECIALFD_EVENTFD: {
	+ struct specialfd_eventfd *ae;
	+
	ae = arg;
	if ((ae->flags & EFD_CLOEXEC) != 0)
	fflags \|= O_CLOEXEC;
	error = eventfd_create_file(td, fp, ae->initval, ae->flags);
	break;
	+ }
	+ case SPECIALFD_INOTIFY: {
	+ struct specialfd_inotify *si;
	+
	+ si = arg;
	+ error = inotify_create_file(td, fp, si->flags, &fflags);
	+ break;
	+ }
	default:
	error = EINVAL;
	break;
	@@ -969,11 +978,12 @@
	int
	sys___specialfd(struct thread td, struct __specialfd_args args)
	{
	- struct specialfd_eventfd ae;
	int error;

	switch (args->type) {
	- case SPECIALFD_EVENTFD:
	+ case SPECIALFD_EVENTFD: {
	+ struct specialfd_eventfd ae;
	+
	if (args->len != sizeof(struct specialfd_eventfd)) {
	error = EINVAL;
	break;
	@@ -988,6 +998,20 @@
	}
	error = kern_specialfd(td, args->type, &ae);
	break;
	+ }
	+ case SPECIALFD_INOTIFY: {
	+ struct specialfd_inotify si;
	+
	+ if (args->len != sizeof(si)) {
	+ error = EINVAL;
	+ break;
	+ }
	+ error = copyin(args->req, &si, sizeof(si));
	+ if (error != 0)
	+ break;
	+ error = kern_specialfd(td, args->type, &si);
	+ break;
	+ }
	default:
	error = EINVAL;
	break;
	diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
	--- a/sys/kern/syscalls.master
	+++ b/sys/kern/syscalls.master
	@@ -3349,5 +3349,19 @@
	size_t size
	);
	}
	+592 AUE_INOTIFY STD\|CAPENABLED {
	+ int inotify_add_watch_at(
	+ int fd,
	+ int dfd,
	+ _In_z_ const char *path,
	+ uint32_t mask
	+ );
	+ }
	+593 AUE_INOTIFY STD\|CAPENABLED {
	+ int inotify_rm_watch(
	+ int fd,
	+ int wd
	+ );
	+ }

	; vim: syntax=off
	diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
	--- a/sys/kern/vfs_cache.c
	+++ b/sys/kern/vfs_cache.c
	@@ -41,6 +41,7 @@
	#include <sys/counter.h>
	#include <sys/filedesc.h>
	#include <sys/fnv_hash.h>
	+#include <sys/inotify.h>
	#include <sys/kernel.h>
	#include <sys/ktr.h>
	#include <sys/lock.h>
	@@ -2636,6 +2637,14 @@
	atomic_thread_fence_rel();
	atomic_store_ptr(&dvp->v_cache_dd, ncp);
	} else if (vp != NULL) {
	+ /*
	+ * Take the slow path in INOTIFY(). This flag will be lazily
	+ * cleared by cache_vop_inotify() once all directories referring
	+ * to vp are unwatched.
	+ */
	+ if (__predict_false((vn_irflag_read(dvp) & VIRF_INOTIFY) != 0))
	+ vn_irflag_set_cond(vp, VIRF_INOTIFY_PARENT);
	+
	/*
	* For this case, the cache entry maps both the
	* directory name in it and the name ".." for the
	@@ -4061,6 +4070,56 @@
	return (error);
	}

	+/*
	+ * Log an inotify event for vnode vp, using the name cache to find the
	+ * watched directories that contain it.
	+ *
	+ * The event is logged against vp itself (without a name) when vp carries
	+ * VIRF_INOTIFY, and against every flagged directory that has a name cache
	+ * entry pointing at vp, using the cached entry name.  IN_ISDIR is added to
	+ * the mask when vp is a directory.  If no watched parent is found, the
	+ * VIRF_INOTIFY_PARENT flag on vp is cleared.
	+ */
	+void
	+cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie)
	+{
	+ struct mtx *vlp;
	+ struct namecache *ncp;
	+ int isdir;
	+ bool logged, self;
	+
	+ isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
	+ /*
	+ * Log against vp itself only if it is flagged; for a directory,
	+ * additionally require that the event has bits outside _IN_DIR_EVENTS.
	+ */
	+ self = (vn_irflag_read(vp) & VIRF_INOTIFY) != 0 &&
	+ (vp->v_type != VDIR \|\| (event & ~_IN_DIR_EVENTS) != 0);
	+
	+ if (self) {
	+ int selfevent;
	+
	+ /* _IN_ATTRIB_LINKCOUNT is reported on vp itself as IN_ATTRIB. */
	+ if (event == _IN_ATTRIB_LINKCOUNT)
	+ selfevent = IN_ATTRIB;
	+ else
	+ selfevent = event;
	+ inotify_log(vp, NULL, 0, selfevent \| isdir, cookie);
	+ }
	+ /* Nothing to log against parents if no IN_ALL_EVENTS bit is set. */
	+ if ((event & IN_ALL_EVENTS) == 0)
	+ return;
	+
	+ logged = false;
	+ vlp = VP2VNODELOCK(vp);
	+ mtx_lock(vlp);
	+ /* Walk the name cache entries that resolve to vp, skipping ".." ones. */
	+ TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
	+ if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
	+ continue;
	+ if ((vn_irflag_read(ncp->nc_dvp) & VIRF_INOTIFY) != 0) {
	+ /*
	+ * XXX-MJ if the vnode has two links in the same
	+ * dir, we'll log the same event twice.
	+ */
	+ inotify_log(ncp->nc_dvp, ncp->nc_name, ncp->nc_nlen,
	+ event \| isdir, cookie);
	+ logged = true;
	+ }
	+ }
	+ if (!logged && (vn_irflag_read(vp) & VIRF_INOTIFY_PARENT) != 0) {
	+ /*
	+ * We didn't find a watched directory that contains this vnode,
	+ * so stop calling VOP_INOTIFY for operations on the vnode.
	+ */
	+ vn_irflag_unset(vp, VIRF_INOTIFY_PARENT);
	+ }
	+ mtx_unlock(vlp);
	+}
	+
	#ifdef DDB
	static void
	db_print_vpath(struct vnode *vp)
	diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
	--- a/sys/kern/vfs_default.c
	+++ b/sys/kern/vfs_default.c
	@@ -39,6 +39,7 @@
	#include <sys/conf.h>
	#include <sys/event.h>
	#include <sys/filio.h>
	+#include <sys/inotify.h>
	#include <sys/kernel.h>
	#include <sys/limits.h>
	#include <sys/lock.h>
	@@ -119,6 +120,8 @@
	.vop_getwritemount = vop_stdgetwritemount,
	.vop_inactive = VOP_NULL,
	.vop_need_inactive = vop_stdneed_inactive,
	+ .vop_inotify = vop_stdinotify,
	+ .vop_inotify_add_watch = vop_stdinotify_add_watch,
	.vop_ioctl = vop_stdioctl,
	.vop_kqfilter = vop_stdkqfilter,
	.vop_islocked = vop_stdislocked,
	@@ -1305,6 +1308,20 @@
	return (1);
	}

	+/*
	+ * Default VOP_INOTIFY implementation: unpack the VOP arguments, pass them
	+ * to vn_inotify(), and report success.
	+ */
	+int
	+vop_stdinotify(struct vop_inotify_args *ap)
	+{
	+	struct vnode *vp, *dvp;
	+
	+	vp = ap->a_vp;
	+	dvp = ap->a_dvp;
	+	vn_inotify(vp, dvp, ap->a_cnp, ap->a_event, ap->a_cookie);
	+	return (0);
	+}
	+
	+/*
	+ * Default VOP_INOTIFY_ADD_WATCH implementation: hand the request to
	+ * vn_inotify_add_watch() and return its result.
	+ */
	+int
	+vop_stdinotify_add_watch(struct vop_inotify_add_watch_args *ap)
	+{
	+ return (vn_inotify_add_watch(ap->a_vp, ap->a_sc, ap->a_mask,
	+ ap->a_wdp, ap->a_td));
	+}
	+
	int
	vop_stdioctl(struct vop_ioctl_args *ap)
	{
	diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
	new file mode 100644
	--- /dev/null
	+++ b/sys/kern/vfs_inotify.c
	@@ -0,0 +1,891 @@
	+/*-
	+ * SPDX-License-Identifier: BSD-2-Clause
	+ *
	+ * Copyright (c) 2025 Klara, Inc.
	+ */
	+
	+#include "opt_ktrace.h"
	+
	+#include <sys/param.h>
	+#include <sys/systm.h>
	+#include <sys/caprights.h>
	+#include <sys/dirent.h>
	+#include <sys/fcntl.h>
	+#include <sys/file.h>
	+#include <sys/filio.h>
	+#include <sys/inotify.h>
	+#include <sys/kernel.h>
	+#include <sys/lock.h>
	+#include <sys/ktrace.h>
	+#include <sys/malloc.h>
	+#include <sys/mutex.h>
	+#include <sys/namei.h>
	+#include <sys/poll.h>
	+#include <sys/proc.h>
	+#include <sys/queue.h>
	+#include <sys/resourcevar.h>
	+#include <sys/selinfo.h>
	+#include <sys/stat.h>
	+#include <sys/syscallsubr.h>
	+#include <sys/sysctl.h>
	+#include <sys/sysent.h>
	+#include <sys/syslimits.h>
	+#include <sys/sysproto.h>
	+#include <sys/tree.h>
	+#include <sys/user.h>
	+#include <sys/vnode.h>
	+
	+/* NOTE(review): presumably the source of rename cookies; confirm at its users. */
	+uint32_t inotify_rename_cookie;
	+
	+/* Limits exported under vfs.inotify. */
	+static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
	+ "inotify configuration");
	+
	+static int inotify_max_events = 512;
	+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_events, CTLFLAG_RWTUN,
	+ &inotify_max_events, 0,
	+ "Maximum number of events to queue on an inotify descriptor");
	+
	+static int inotify_max_user_instances = 1024;
	+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
	+ &inotify_max_user_instances, 0,
	+ "Maximum number of inotify descriptors per user");
	+
	+static int inotify_max_user_watches = 128 * 1024;
	+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
	+ &inotify_max_user_watches, 0,
	+ "Maximum number of inotify watches per user");
	+
	+/* Debugging knobs exported under debug.inotify. */
	+static SYSCTL_NODE(_debug, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
	+ "inotify debugging");
	+
	+static int inotify_coalesce = 1;
	+SYSCTL_INT(_debug_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
	+ &inotify_coalesce, 0,
	+ "Coalesce inotify events when possible");
	+
	+/* File operations implemented for inotify descriptors. */
	+static fo_rdwr_t inotify_read;
	+static fo_ioctl_t inotify_ioctl;
	+static fo_poll_t inotify_poll;
	+static fo_kqfilter_t inotify_kqfilter;
	+static fo_stat_t inotify_stat;
	+static fo_close_t inotify_close;
	+static fo_fill_kinfo_t inotify_fill_kinfo;
	+
	+/*
	+ * Operations vector for inotify descriptors.  Write, truncate, chmod, chown
	+ * and sendfile are routed to the generic invfo_* handlers.
	+ */
	+static const struct fileops inotifyfdops = {
	+ .fo_read = inotify_read,
	+ .fo_write = invfo_rdwr,
	+ .fo_truncate = invfo_truncate,
	+ .fo_ioctl = inotify_ioctl,
	+ .fo_poll = inotify_poll,
	+ .fo_kqfilter = inotify_kqfilter,
	+ .fo_stat = inotify_stat,
	+ .fo_close = inotify_close,
	+ .fo_chmod = invfo_chmod,
	+ .fo_chown = invfo_chown,
	+ .fo_sendfile = invfo_sendfile,
	+ .fo_fill_kinfo = inotify_fill_kinfo,
	+ .fo_cmp = file_kcmp_generic,
	+};
	+
	+static void filt_inotifydetach(struct knote *kn);
	+static int filt_inotifyevent(struct knote *kn, long hint);
	+
	+/* kqueue filter operations; installed for EVFILT_READ by inotify_kqfilter(). */
	+static const struct filterops inotify_rfiltops = {
	+ .f_isfd = 1,
	+ .f_detach = filt_inotifydetach,
	+ .f_event = filt_inotifyevent,
	+};
	+
	+/* Malloc type used for softcs, watches and event records. */
	+static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");
	+
	+/*
	+ * A queued event.  The event is the last member so that the name bytes
	+ * allocated by inotify_alloc_record() directly follow it.
	+ */
	+struct inotify_record {
	+ STAILQ_ENTRY(inotify_record) link;
	+ struct inotify_event ev;
	+};
	+
	+/*
	+ * A single watch, tying one inotify descriptor (softc) to one watched vnode.
	+ * It is linked both into the softc's tree, ordered by wd, and into the watched
	+ * vnode's pollinfo list.
	+ */
	+struct inotify_watch {
	+ struct inotify_softc *sc; /* back-pointer */
	+ int wd; /* unique ID */
	+ uint32_t mask; /* event mask */
	+ struct vnode *vp; /* vnode being watched, refed */
	+ RB_ENTRY(inotify_watch) ilink; /* inotify linkage */
	+ TAILQ_ENTRY(inotify_watch) vlink; /* vnode linkage */
	+};
	+
	+/*
	+ * Three-way comparison of two watches by watch descriptor: negative, zero or
	+ * positive when a sorts before, equal to, or after b.
	+ */
	+static int
	+inotify_watch_cmp(const struct inotify_watch *a,
	+ const struct inotify_watch *b)
	+{
	+ return ((a->wd > b->wd) - (a->wd < b->wd));
	+}
	+RB_HEAD(inotify_watch_tree, inotify_watch);
	+RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);
	+
	+/*
	+ * Per-descriptor state: the queue of pending events and the set of active
	+ * watches.  Functions that touch the queue or the tree assert or take "lock".
	+ */
	+struct inotify_softc {
	+ struct mtx lock;
	+ STAILQ_HEAD(, inotify_record) pending; /* events waiting to be read */
	+ struct inotify_record overflow; /* preallocated record */
	+ int nextwatch; /* next watch ID to try */
	+ int npending; /* number of pending events */
	+ size_t nbpending; /* bytes available to read */
	+ struct inotify_watch_tree watches; /* active watches */
	+ struct selinfo sel; /* select/poll/kevent info */
	+ struct ucred *cred; /* credential ref */
	+};
	+
	+/*
	+ * Read handler: copy out as many whole pending event records as fit in the
	+ * caller's buffer.
	+ *
	+ * While the queue is empty the call sleeps interruptibly, unless the request
	+ * or the descriptor is non-blocking, in which case EWOULDBLOCK is returned.
	+ * EINVAL is returned when the buffer is too small for the first pending
	+ * record.  A record whose copyout fails is put back at the head of the queue.
	+ */
	+static int
	+inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
	+ struct thread *td)
	+{
	+ struct inotify_softc *sc;
	+ struct inotify_record *rec;
	+ int error;
	+ bool first;
	+
	+ sc = fp->f_data;
	+ error = 0;
	+
	+ mtx_lock(&sc->lock);
	+ while (STAILQ_EMPTY(&sc->pending)) {
	+ if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
	+ mtx_unlock(&sc->lock);
	+ return (EWOULDBLOCK);
	+ }
	+ error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
	+ if (error != 0) {
	+ mtx_unlock(&sc->lock);
	+ return (error);
	+ }
	+ }
	+ for (first = true; (rec = STAILQ_FIRST(&sc->pending)) != NULL;
	+ first = false) {
	+ size_t len;
	+
	+ len = sizeof(rec->ev) + rec->ev.len;
	+ if (uio->uio_resid < (ssize_t)len) {
	+ /* Only an error if nothing at all could be returned. */
	+ if (first)
	+ error = EINVAL;
	+ break;
	+ }
	+ STAILQ_REMOVE_HEAD(&sc->pending, link);
	+ sc->npending--;
	+ sc->nbpending -= len;
	+ /* The copy is done with the softc lock dropped. */
	+ mtx_unlock(&sc->lock);
	+ error = uiomove(&rec->ev, len, uio);
	+#ifdef KTRACE
	+ if (error == 0 && KTRPOINT(td, KTR_STRUCT))
	+ ktrstruct("inotify", &rec->ev, len);
	+#endif
	+ mtx_lock(&sc->lock);
	+ if (error != 0) {
	+ /* Undo the dequeue so that the event is not lost. */
	+ sc->npending++;
	+ sc->nbpending += len;
	+ STAILQ_INSERT_HEAD(&sc->pending, rec, link);
	+ mtx_unlock(&sc->lock);
	+ return (error);
	+ }
	+ /* The overflow record is embedded in the softc: reset, don't free. */
	+ if (rec == &sc->overflow)
	+ memset(rec, 0, sizeof(*rec));
	+ else
	+ free(rec, M_INOTIFY);
	+ }
	+ mtx_unlock(&sc->lock);
	+ return (error);
	+}
	+
	+/*
	+ * ioctl handler.  FIONREAD stores the number of bytes available to read in
	+ * the int pointed to by data; FIONBIO and FIOASYNC are accepted and do
	+ * nothing here; every other command fails with ENOTTY.
	+ */
	+static int
	+inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
	+ struct thread *td)
	+{
	+ struct inotify_softc *sc;
	+
	+ sc = fp->f_data;
	+
	+ switch (com) {
	+ case FIONREAD:
	+ *(int *)data = (int)sc->nbpending;
	+ return (0);
	+ case FIONBIO:
	+ case FIOASYNC:
	+ return (0);
	+ default:
	+ return (ENOTTY);
	+ }
	+}
	+
	+/*
	+ * poll handler: report POLLIN/POLLRDNORM (as requested) when at least one
	+ * event is pending, otherwise register the thread for a later wakeup.
	+ */
	+static int
	+inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
	+{
	+ struct inotify_softc *sc;
	+ int revents;
	+
	+ sc = fp->f_data;
	+ revents = 0;
	+
	+ mtx_lock(&sc->lock);
	+ if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
	+ revents |= events & (POLLIN | POLLRDNORM);
	+ else
	+ selrecord(td, &sc->sel);
	+ mtx_unlock(&sc->lock);
	+ return (revents);
	+}
	+
	+/*
	+ * kqueue detach: remove the knote from the knlist of the softc stored in
	+ * kn_hook by inotify_kqfilter().
	+ */
	+static void
	+filt_inotifydetach(struct knote *kn)
	+{
	+ struct inotify_softc *sc;
	+
	+ sc = kn->kn_hook;
	+ knlist_remove(&sc->sel.si_note, kn, 0);
	+}
	+
	+/*
	+ * kqueue event filter: publish the number of readable bytes in kn_data and
	+ * report the knote as active when that number is non-zero.  Must be called
	+ * with the softc lock held.
	+ */
	+static int
	+filt_inotifyevent(struct knote *kn, long hint)
	+{
	+ struct inotify_softc *sc;
	+
	+ sc = kn->kn_hook;
	+ mtx_assert(&sc->lock, MA_OWNED);
	+ kn->kn_data = sc->nbpending;
	+ return (kn->kn_data > 0);
	+}
	+
	+/*
	+ * Attach a knote to the descriptor.  Only EVFILT_READ is supported; any other
	+ * filter is rejected with EINVAL.
	+ */
	+static int
	+inotify_kqfilter(struct file *fp, struct knote *kn)
	+{
	+ struct inotify_softc *sc;
	+
	+ if (kn->kn_filter != EVFILT_READ)
	+ return (EINVAL);
	+ sc = fp->f_data;
	+ kn->kn_fop = &inotify_rfiltops;
	+ /* The filter callbacks find the softc through kn_hook. */
	+ kn->kn_hook = sc;
	+ knlist_add(&sc->sel.si_note, kn, 0);
	+ return (0);
	+}
	+
	+/*
	+ * stat handler: currently a placeholder that returns an all-zero stat
	+ * structure.
	+ */
	+static int
	+inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
	+{
	+ /* XXX-MJ */
	+ memset(sb, 0, sizeof(*sb));
	+ return (0);
	+}
	+
	+/*
	+ * Unlink a watch from its vnode's watch list and clear VIRF_INOTIFY on the
	+ * vnode once the list becomes empty.  The vnode's pollinfo lock must be held.
	+ * The watch itself is not freed and the vnode reference is not dropped.
	+ */
	+static void
	+inotify_unlink_watch_locked(struct inotify_watch *watch)
	+{
	+ struct vnode *vp;
	+
	+ vp = watch->vp;
	+ mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);
	+
	+ TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
	+ if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
	+ vn_irflag_unset_locked(vp, VIRF_INOTIFY);
	+}
	+
	+/*
	+ * Tear down a watch: return its per-user watch charge, unlink it from the
	+ * vnode, drop the vnode reference and free it.
	+ *
	+ * Assumes that the watch has already been removed from its softc.
	+ */
	+static void
	+inotify_remove_watch(struct inotify_watch *watch)
	+{
	+ struct inotify_softc *sc;
	+ struct vnode *vp;
	+
	+ sc = watch->sc;
	+ (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
	+
	+ vp = watch->vp;
	+ mtx_lock(&vp->v_pollinfo->vpi_lock);
	+ inotify_unlink_watch_locked(watch);
	+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
	+
	+ vrele(vp);
	+ free(watch, M_INOTIFY);
	+}
	+
	+/*
	+ * Close handler: return the per-user instance charge, discard all pending
	+ * events, remove every watch and free the softc.
	+ */
	+static int
	+inotify_close(struct file *fp, struct thread *td)
	+{
	+ struct inotify_softc *sc;
	+ struct inotify_record *rec;
	+ struct inotify_watch *watch;
	+
	+ sc = fp->f_data;
	+
	+ mtx_lock(&sc->lock);
	+ (void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
	+ while ((rec = STAILQ_FIRST(&sc->pending)) != NULL) {
	+ STAILQ_REMOVE_HEAD(&sc->pending, link);
	+ /* The overflow record is part of the softc itself. */
	+ if (rec != &sc->overflow)
	+ free(rec, M_INOTIFY);
	+ }
	+ while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
	+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
	+ /*
	+ * inotify_remove_watch() takes the vnode's pollinfo lock, which
	+ * is acquired before the softc lock elsewhere in this file, so
	+ * drop the softc lock around the call.
	+ */
	+ mtx_unlock(&sc->lock);
	+ inotify_remove_watch(watch);
	+ mtx_lock(&sc->lock);
	+ }
	+ mtx_unlock(&sc->lock);
	+ seldrain(&sc->sel);
	+ knlist_destroy(&sc->sel.si_note);
	+ mtx_destroy(&sc->lock);
	+ crfree(sc->cred);
	+ free(sc, M_INOTIFY);
	+ return (0);
	+}
	+
	+/*
	+ * Fill in the kinfo_file for an inotify descriptor: its type and the number
	+ * of pending events and bytes.  The counters are read without the softc lock.
	+ */
	+static int
	+inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
	+ struct filedesc *fdp)
	+{
	+ struct inotify_softc *sc;
	+
	+ sc = fp->f_data;
	+
	+ kif->kf_type = KF_TYPE_INOTIFY;
	+ kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
	+ kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
	+ return (0);
	+}
	+
	+/*
	+ * Turn fp into a new inotify descriptor.
	+ *
	+ * flags may contain only IN_NONBLOCK and IN_CLOEXEC, otherwise EINVAL is
	+ * returned.  EMFILE is returned when the calling user already holds
	+ * inotify_max_user_instances descriptors.  IN_CLOEXEC is reported back to the
	+ * caller by setting O_CLOEXEC in *fflagsp.
	+ */
	+int
	+inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
	+{
	+ struct inotify_softc *sc;
	+ int fflags;
	+
	+ if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
	+ return (EINVAL);
	+
	+ if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
	+ inotify_max_user_instances))
	+ return (EMFILE);
	+
	+ sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
	+ sc->nextwatch = 1; /* Required for compatibility. */
	+ STAILQ_INIT(&sc->pending);
	+ RB_INIT(&sc->watches);
	+ mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
	+ knlist_init_mtx(&sc->sel.si_note, &sc->lock);
	+ /* Kept so that limits are later returned to the same user. */
	+ sc->cred = crhold(td->td_ucred);
	+
	+ fflags = FREAD;
	+ if ((flags & IN_NONBLOCK) != 0)
	+ fflags |= FNONBLOCK;
	+ if ((flags & IN_CLOEXEC) != 0)
	+ *fflagsp |= O_CLOEXEC;
	+ finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);
	+
	+ return (0);
	+}
	+
	+/*
	+ * Allocate and fill in an event record.  The record is zeroed and sized to
	+ * hold _IN_NAMESIZE(namelen) name bytes; name may be NULL.  waitok is passed
	+ * to malloc(), so NULL can be returned when it does not permit sleeping.
	+ */
	+static struct inotify_record *
	+inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
	+ uint32_t cookie, int waitok)
	+{
	+ struct inotify_event *evp;
	+ struct inotify_record *rec;
	+
	+ rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
	+ waitok | M_ZERO);
	+ if (rec == NULL)
	+ return (NULL);
	+ evp = &rec->ev;
	+ evp->wd = wd;
	+ evp->mask = event;
	+ evp->cookie = cookie;
	+ evp->len = _IN_NAMESIZE(namelen);
	+ if (name != NULL)
	+ memcpy(evp->name, name, namelen);
	+ return (rec);
	+}
	+
	+/*
	+ * Put an event record on the queue for an inotify descriptor.  Return false if
	+ * the record was not enqueued for some reason, true otherwise.  When false is
	+ * returned the record still belongs to the caller.
	+ *
	+ * A record that would exceed inotify_max_events, or the preallocated overflow
	+ * record itself, is turned into an IN_Q_OVERFLOW event.  No overflow event is
	+ * appended directly after another one, so a full queue stops growing.
	+ *
	+ * The softc lock must be held.
	+ */
	+static bool
	+inotify_queue_record(struct inotify_softc *sc, struct inotify_record *rec)
	+{
	+ struct inotify_record *prev;
	+ struct inotify_event *evp;
	+
	+ mtx_assert(&sc->lock, MA_OWNED);
	+
	+ evp = &rec->ev;
	+ prev = STAILQ_LAST(&sc->pending, inotify_record, link);
	+
	+ /* Try to coalesce duplicate events. */
	+ if (inotify_coalesce && rec != &sc->overflow && prev != NULL &&
	+ prev->ev.wd == evp->wd &&
	+ prev->ev.mask == evp->mask &&
	+ prev->ev.cookie == evp->cookie &&
	+ prev->ev.len == evp->len &&
	+ (evp->len == 0 || strcmp(prev->ev.name, evp->name) == 0))
	+ return (false);
	+ /* Would this one overflow the queue? */
	+ if (sc->npending >= inotify_max_events || rec == &sc->overflow) {
	+ /*
	+ * Have we already recorded a drop?  Either the preallocated
	+ * record is still queued, or the newest queued event is an
	+ * overflow marker made from a regular record.  Without the
	+ * second test every event arriving on a full queue would be
	+ * converted and appended, growing the queue without bound.
	+ */
	+ if (sc->overflow.ev.mask == IN_Q_OVERFLOW ||
	+ (prev != NULL && prev->ev.mask == IN_Q_OVERFLOW))
	+ return (false);
	+ evp->wd = -1;
	+ evp->mask = IN_Q_OVERFLOW;
	+ evp->cookie = 0;
	+ evp->len = 0;
	+ }
	+ STAILQ_INSERT_TAIL(&sc->pending, rec, link);
	+ sc->npending++;
	+ sc->nbpending += sizeof(rec->ev) + evp->len;
	+ /* Notify select/poll waiters, kevent listeners and blocked readers. */
	+ selwakeup(&sc->sel);
	+ KNOTE_LOCKED(&sc->sel.si_note, 0);
	+ wakeup(&sc->pending);
	+ return (true);
	+}
	+
	+/*
	+ * Queue one event for one watch.  The caller holds the pollinfo lock of the
	+ * watched vnode.
	+ *
	+ * If the record cannot be allocated, the softc's preallocated overflow record
	+ * is queued instead.  One-shot watches, and watches receiving IN_DELETE_SELF
	+ * or IN_UNMOUNT, are removed after the event is queued and an IN_IGNORED event
	+ * is generated for them.
	+ *
	+ * Returns the number of vnode references (0 or 1) that the caller must
	+ * release once it has dropped its locks.
	+ */
	+static int
	+inotify_log_one(struct inotify_watch *watch, const char *name, size_t namelen,
	+ int event, uint32_t cookie)
	+{
	+ struct inotify_watch key;
	+ struct inotify_softc *sc;
	+ struct inotify_record *rec;
	+ int relecount;
	+ bool allocfail;
	+
	+ relecount = 0;
	+
	+ sc = watch->sc;
	+ rec = inotify_alloc_record(watch->wd, name, namelen, event, cookie,
	+ M_NOWAIT);
	+ if (rec == NULL) {
	+ rec = &sc->overflow;
	+ allocfail = true;
	+ } else {
	+ allocfail = false;
	+ }
	+
	+ mtx_lock(&sc->lock);
	+ /* The overflow record is embedded in the softc and is never freed. */
	+ if (!inotify_queue_record(sc, rec) && !allocfail)
	+ free(rec, M_INOTIFY);
	+ if ((watch->mask & IN_ONESHOT) != 0 ||
	+ (event & (IN_DELETE_SELF | IN_UNMOUNT)) != 0) {
	+ if (!allocfail) {
	+ rec = inotify_alloc_record(watch->wd, NULL, 0,
	+ IN_IGNORED, 0, M_NOWAIT);
	+ if (rec != NULL && !inotify_queue_record(sc, rec))
	+ free(rec, M_INOTIFY);
	+ }
	+
	+ /*
	+ * Remove the watch, taking care to handle races with
	+ * inotify_close().
	+ */
	+ key.wd = watch->wd;
	+ if (RB_FIND(inotify_watch_tree, &sc->watches, &key) != NULL) {
	+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
	+ inotify_unlink_watch_locked(watch);
	+ /*
	+ * Return the per-user watch charge, as
	+ * inotify_remove_watch() does; otherwise every
	+ * automatically removed watch would count against the
	+ * user's limit forever.
	+ */
	+ (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
	+ free(watch, M_INOTIFY);
	+
	+ /* Defer vrele() until locks are dropped. */
	+ relecount++;
	+ }
	+ }
	+ mtx_unlock(&sc->lock);
	+ return (relecount);
	+}
	+
	+/*
	+ * Deliver an event on vp to every watch on that vnode whose mask matches.
	+ * IN_UNMOUNT is delivered to all watches regardless of their mask.  name and
	+ * namelen describe the directory entry involved, if any.
	+ */
	+void
	+inotify_log(struct vnode *vp, const char *name, size_t namelen, int event,
	+ uint32_t cookie)
	+{
	+ struct inotify_watch *watch, *tmp;
	+ int relecount;
	+
	+ KASSERT((event & ~(IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT)) == 0,
	+ ("inotify_log: invalid event %#x", event));
	+
	+ relecount = 0;
	+ mtx_lock(&vp->v_pollinfo->vpi_lock);
	+ /* The _SAFE variant is needed: inotify_log_one() may free the watch. */
	+ TAILQ_FOREACH_SAFE(watch, &vp->v_pollinfo->vpi_inotify, vlink, tmp) {
	+ KASSERT(watch->vp == vp,
	+ ("inotify_log: watch %p vp != vp", watch));
	+ if ((watch->mask & event) != 0 || event == IN_UNMOUNT) {
	+ relecount += inotify_log_one(watch, name, namelen, event,
	+ cookie);
	+ }
	+ }
	+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
	+
	+ /* Drop the references of removed watches now that no lock is held. */
	+ for (int i = 0; i < relecount; i++)
	+ vrele(vp);
	+}
	+
	+/*
	+ * An inotify event occurred on a watched vnode.
	+ *
	+ * When the parent directory dvp is known, an event is logged for vp itself
	+ * if vp is watched (translated to the matching *_SELF or IN_ATTRIB event where
	+ * applicable), and a named event is logged for dvp if dvp is watched.  When
	+ * dvp is NULL, the lookup of watching directories is left to
	+ * cache_vop_inotify().
	+ */
	+void
	+vn_inotify(struct vnode *vp, struct vnode *dvp, struct componentname *cnp,
	+ int event, uint32_t cookie)
	+{
	+ int isdir;
	+
	+ VNPASS(vp->v_holdcnt > 0, vp);
	+
	+ isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
	+
	+ if (dvp != NULL) {
	+ VNPASS(dvp->v_holdcnt > 0, dvp);
	+
	+ /*
	+ * Should we log an event for the vnode itself?
	+ */
	+ if ((vn_irflag_read(vp) & VIRF_INOTIFY) != 0) {
	+ int selfevent;
	+
	+ switch (event) {
	+ case _IN_MOVE_DELETE:
	+ case IN_DELETE:
	+ /*
	+ * IN_DELETE_SELF is only generated when the
	+ * last hard link of a file is removed.
	+ */
	+ selfevent = IN_DELETE_SELF;
	+ if (vp->v_type != VDIR) {
	+ struct vattr va;
	+ int error;
	+
	+ error = VOP_GETATTR(vp, &va, cnp->cn_cred);
	+ if (error == 0 && va.va_nlink != 0)
	+ selfevent = 0;
	+ }
	+ break;
	+ case IN_MOVED_FROM:
	+ cookie = 0;
	+ selfevent = IN_MOVE_SELF;
	+ break;
	+ case _IN_ATTRIB_LINKCOUNT:
	+ selfevent = IN_ATTRIB;
	+ break;
	+ default:
	+ selfevent = event;
	+ break;
	+ }
	+
	+ /* Skip events that only make sense for a directory entry. */
	+ if ((selfevent & ~_IN_DIR_EVENTS) != 0) {
	+ inotify_log(vp, NULL, 0, selfevent | isdir,
	+ cookie);
	+ }
	+ }
	+
	+ /*
	+ * Something is watching the directory through which this vnode
	+ * was referenced, so we may need to log the event.
	+ */
	+ if ((event & IN_ALL_EVENTS) != 0 &&
	+ (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0) {
	+ inotify_log(dvp, cnp->cn_nameptr,
	+ cnp->cn_namelen, event | isdir, cookie);
	+ }
	+ } else {
	+ /*
	+ * We don't know which watched directory might contain the
	+ * vnode, so we have to fall back to searching the name cache.
	+ */
	+ cache_vop_inotify(vp, event, cookie);
	+ }
	+}
	+
	+/*
	+ * Create a watch on vp for the inotify descriptor sc, or update the existing
	+ * one, and store the watch descriptor in *wdp.
	+ *
	+ * Returns 0 on success, EEXIST if IN_MASK_CREATE was given and sc already
	+ * watches vp, or an error from reading or looking up the entries of a watched
	+ * directory.
	+ */
	+int
	+vn_inotify_add_watch(struct vnode *vp, struct inotify_softc *sc, uint32_t mask,
	+ uint32_t *wdp, struct thread *td)
	+{
	+ struct inotify_watch *watch, *watch1;
	+ uint32_t wd;
	+ int error;
	+
	+ /*
	+ * If this is a directory, make sure all of its entries are present in
	+ * the name cache so that we're able to look them up if an event occurs.
	+ * The persistent reference on the directory prevents the outgoing name
	+ * cache entries from being reclaimed.
	+ */
	+ if (vp->v_type == VDIR) {
	+ struct dirent *dp;
	+ char *buf;
	+ off_t off;
	+ size_t buflen, len;
	+ int eof;
	+
	+ buflen = 128 * sizeof(struct dirent);
	+ buf = malloc(buflen, M_TEMP, M_WAITOK);
	+
	+ len = off = eof = 0;
	+ for (;;) {
	+ struct nameidata nd;
	+
	+ error = vn_dir_next_dirent(vp, td, buf, buflen, &dp,
	+ &len, &off, &eof);
	+ if (error != 0) {
	+ free(buf, M_TEMP);
	+ return (error);
	+ }
	+ if (len == 0)
	+ /* Finished reading. */
	+ break;
	+ if (strcmp(dp->d_name, ".") == 0 ||
	+ strcmp(dp->d_name, "..") == 0)
	+ continue;
	+
	+ /*
	+ * namei() consumes a reference on the starting
	+ * directory if it's specified as a vnode.
	+ */
	+ vrefact(vp);
	+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE,
	+ dp->d_name, vp);
	+ error = namei(&nd);
	+ if (error != 0) {
	+ free(buf, M_TEMP);
	+ return (error);
	+ }
	+ /*
	+ * Release the pathname buffer of the successful lookup,
	+ * as kern_inotify_add_watch() does; otherwise one
	+ * buffer is leaked per directory entry.
	+ */
	+ NDFREE_PNBUF(&nd);
	+ vn_irflag_set_cond(nd.ni_vp, VIRF_INOTIFY_PARENT);
	+ vrele(nd.ni_vp);
	+ }
	+ free(buf, M_TEMP);
	+ }
	+
	+ /*
	+ * The vnode referenced in kern_inotify_add_watch() might be different
	+ * than this one if nullfs is in the picture.
	+ */
	+ vref(vp);
	+ watch = malloc(sizeof(*watch), M_INOTIFY, M_WAITOK | M_ZERO);
	+ watch->sc = sc;
	+ watch->vp = vp;
	+ watch->mask = mask;
	+
	+ /*
	+ * Are we updating an existing watch? Search the vnode's list rather
	+ * than that of the softc, as the former is likely to be shorter.
	+ */
	+ v_addpollinfo(vp);
	+ mtx_lock(&vp->v_pollinfo->vpi_lock);
	+ TAILQ_FOREACH(watch1, &vp->v_pollinfo->vpi_inotify, vlink) {
	+ if (watch1->sc == sc)
	+ break;
	+ }
	+ mtx_lock(&sc->lock);
	+ if (watch1 != NULL) {
	+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
	+
	+ /*
	+ * We found an existing watch, update it based on our flags.
	+ */
	+ if ((mask & IN_MASK_CREATE) != 0) {
	+ mtx_unlock(&sc->lock);
	+ vrele(vp);
	+ free(watch, M_INOTIFY);
	+ return (EEXIST);
	+ }
	+ if ((mask & IN_MASK_ADD) != 0)
	+ watch1->mask |= mask;
	+ else
	+ watch1->mask = mask;
	+ *wdp = watch1->wd;
	+ /*
	+ * No new watch was created.  kern_inotify_add_watch() charged
	+ * the per-user watch limit before calling us and only returns
	+ * the charge when we fail, so return it here.
	+ */
	+ (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
	+ mtx_unlock(&sc->lock);
	+ vrele(vp);
	+ free(watch, M_INOTIFY);
	+ return (0);
	+ }
	+
	+ /*
	+ * We're creating a new watch. Add it to the softc and vnode watch
	+ * lists.
	+ */
	+ do {
	+ struct inotify_watch key;
	+
	+ /*
	+ * Search for the next available watch descriptor. This is
	+ * implemented so as to avoid reusing watch descriptors for as
	+ * long as possible.
	+ */
	+ key.wd = wd = sc->nextwatch++;
	+ watch1 = RB_FIND(inotify_watch_tree, &sc->watches, &key);
	+ } while (watch1 != NULL || wd == 0);
	+ watch->wd = wd;
	+ RB_INSERT(inotify_watch_tree, &sc->watches, watch);
	+ TAILQ_INSERT_TAIL(&vp->v_pollinfo->vpi_inotify, watch, vlink);
	+ mtx_unlock(&sc->lock);
	+ mtx_unlock(&vp->v_pollinfo->vpi_lock);
	+ vn_irflag_set_cond(vp, VIRF_INOTIFY);
	+
	+ *wdp = wd;
	+
	+ return (0);
	+}
	+
	+/*
	+ * Log IN_UNMOUNT on vp; inotify_log() delivers that event to every watch on
	+ * the vnode.  Vnodes without pollinfo are skipped.
	+ */
	+void
	+vn_inotify_revoke(struct vnode *vp)
	+{
	+ if (vp->v_pollinfo == NULL) {
	+ /* This is a nullfs vnode which shadows a watched vnode. */
	+ return;
	+ }
	+ inotify_log(vp, NULL, 0, IN_UNMOUNT, 0);
	+}
	+
	+/*
	+ * Look up fd with the given capability rights and make sure it refers to an
	+ * inotify descriptor.  On success the referenced file is stored in *fpp and
	+ * the caller must fdrop() it.  Returns EINVAL for any other descriptor type.
	+ */
	+static int
	+fget_inotify(struct thread *td, int fd, const cap_rights_t *needrightsp,
	+ struct file **fpp)
	+{
	+ struct file *fp;
	+ int error;
	+
	+ error = fget(td, fd, needrightsp, &fp);
	+ if (error != 0)
	+ return (error);
	+ if (fp->f_type != DTYPE_INOTIFY) {
	+ fdrop(fp, td);
	+ return (EINVAL);
	+ }
	+ *fpp = fp;
	+ return (0);
	+}
	+
	+/*
	+ * Add a watch for path (resolved relative to dfd) to the inotify descriptor
	+ * fd, or update the existing one.  The watch descriptor is returned in
	+ * td_retval[0].
	+ *
	+ * Errors: EINVAL for an invalid mask or a descriptor that is not an inotify
	+ * descriptor, ENOTDIR when IN_ONLYDIR is set and the target is not a
	+ * directory, ENOSPC when the per-user watch limit is reached, plus any error
	+ * from the lookup, the read access check or the watch VOP.
	+ */
	+static int
	+kern_inotify_add_watch(int fd, int dfd, const char *path, uint32_t mask,
	+ struct thread *td)
	+{
	+ struct nameidata nd;
	+ struct file *fp;
	+ struct inotify_softc *sc;
	+ struct vnode *vp;
	+ uint32_t wd;
	+ int error;
	+
	+ fp = NULL;
	+ vp = NULL;
	+
	+ /* At least one event is required; ADD and CREATE are exclusive. */
	+ if ((mask & IN_ALL_EVENTS) == 0)
	+ return (EINVAL);
	+ if ((mask & (IN_MASK_ADD | IN_MASK_CREATE)) ==
	+ (IN_MASK_ADD | IN_MASK_CREATE))
	+ return (EINVAL);
	+ if ((mask & ~(IN_ALL_EVENTS | _IN_ALL_FLAGS | IN_UNMOUNT)) != 0)
	+ return (EINVAL);
	+
	+ error = fget_inotify(td, fd, &cap_inotify_add_rights, &fp);
	+ if (error != 0)
	+ return (error);
	+ sc = fp->f_data;
	+
	+ NDINIT_AT(&nd, LOOKUP,
	+ ((mask & IN_DONT_FOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF |
	+ LOCKSHARED | AUDITVNODE1, UIO_USERSPACE, path, dfd);
	+ error = namei(&nd);
	+ if (error != 0)
	+ goto out;
	+ NDFREE_PNBUF(&nd);
	+ vp = nd.ni_vp;
	+
	+ error = VOP_ACCESS(vp, VREAD, td->td_ucred, td);
	+ if (error != 0)
	+ goto out;
	+
	+ if ((mask & IN_ONLYDIR) != 0 && vp->v_type != VDIR) {
	+ error = ENOTDIR;
	+ goto out;
	+ }
	+
	+ /* Charge the watch to the user who created the descriptor. */
	+ if (!chginotifywatchcnt(sc->cred->cr_ruidinfo, 1,
	+ inotify_max_user_watches)) {
	+ error = ENOSPC;
	+ goto out;
	+ }
	+ error = VOP_INOTIFY_ADD_WATCH(vp, sc, mask, &wd, td);
	+ if (error != 0)
	+ goto outlim;
	+ vput(vp);
	+ fdrop(fp, td);
	+
	+ td->td_retval[0] = wd;
	+ return (0);
	+
	+outlim:
	+ /* Return the charge taken above. */
	+ (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
	+
	+out:
	+ if (vp != NULL)
	+ vput(vp);
	+ if (fp != NULL)
	+ fdrop(fp, td);
	+ return (error);
	+}
	+
	+/*
	+ * System call entry point: unpack the arguments and call
	+ * kern_inotify_add_watch().
	+ */
	+int
	+sys_inotify_add_watch_at(struct thread *td,
	+ struct inotify_add_watch_at_args *uap)
	+{
	+ return (kern_inotify_add_watch(uap->fd, uap->dfd, uap->path,
	+ uap->mask, td));
	+}
	+
	+/*
	+ * Remove the watch identified by wd from the inotify descriptor fd and queue
	+ * an IN_IGNORED event for it.  Returns EINVAL if the descriptor has no such
	+ * watch.
	+ */
	+static int
	+kern_inotify_rm_watch(int fd, uint32_t wd, struct thread *td)
	+{
	+ struct inotify_watch key, *watch;
	+ struct inotify_record *rec;
	+ struct inotify_softc *sc;
	+ struct file *fp;
	+ int error;
	+
	+ error = fget_inotify(td, fd, &cap_inotify_rm_rights, &fp);
	+ if (error != 0)
	+ return (error);
	+ sc = fp->f_data;
	+
	+ /* Allocate the notification before taking the softc lock. */
	+ rec = inotify_alloc_record(wd, NULL, 0, IN_IGNORED, 0, M_WAITOK);
	+ key.wd = wd;
	+
	+ /*
	+ * For compatibility with Linux, we do not remove pending events
	+ * associated with the watch. Watch descriptors are implemented so as
	+ * to avoid being reused for as long as possible, so one hopes that any
	+ * pending events from the removed watch descriptor will be removed
	+ * before the watch descriptor is recycled.
	+ */
	+ mtx_lock(&sc->lock);
	+ watch = RB_FIND(inotify_watch_tree, &sc->watches, &key);
	+ if (watch != NULL) {
	+ RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
	+ /* error is still 0 here, from the successful fget_inotify(). */
	+ if (!inotify_queue_record(sc, rec))
	+ free(rec, M_INOTIFY);
	+ } else {
	+ free(rec, M_INOTIFY);
	+ error = EINVAL;
	+ }
	+ mtx_unlock(&sc->lock);
	+ if (watch != NULL)
	+ inotify_remove_watch(watch);
	+ fdrop(fp, td);
	+ return (error);
	+}
	+
	+/*
	+ * System call entry point: unpack the arguments and call
	+ * kern_inotify_rm_watch().
	+ */
	+int
	+sys_inotify_rm_watch(struct thread *td, struct inotify_rm_watch_args *uap)
	+{
	+ return (kern_inotify_rm_watch(uap->fd, uap->wd, td));
	+}
	diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
	--- a/sys/kern/vfs_subr.c
	+++ b/sys/kern/vfs_subr.c
	@@ -38,7 +38,6 @@
	* External virtual filesystem routines
	*/

	-#include <sys/cdefs.h>
	#include "opt_ddb.h"
	#include "opt_watchdog.h"

	@@ -57,6 +56,7 @@
	#include <sys/extattr.h>
	#include <sys/file.h>
	#include <sys/fcntl.h>
	+#include <sys/inotify.h>
	#include <sys/jail.h>
	#include <sys/kdb.h>
	#include <sys/kernel.h>
	@@ -5247,7 +5247,8 @@
	static void
	destroy_vpollinfo(struct vpollinfo *vi)
	{
	-
	+ KASSERT(TAILQ_EMPTY(&vi->vpi_inotify),
	+ ("%s: pollinfo %p has lingering watches", __func__, vi));
	knlist_clear(&vi->vpi_selinfo.si_note, 1);
	seldrain(&vi->vpi_selinfo);
	destroy_vpollinfo_free(vi);
	@@ -5261,12 +5262,13 @@
	{
	struct vpollinfo *vi;

	- if (vp->v_pollinfo != NULL)
	+ if (atomic_load_ptr(&vp->v_pollinfo) != NULL)
	return;
	vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK \| M_ZERO);
	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
	vfs_knlunlock, vfs_knl_assert_lock);
	+ TAILQ_INIT(&vi->vpi_inotify);
	VI_LOCK(vp);
	if (vp->v_pollinfo != NULL) {
	VI_UNLOCK(vp);
	@@ -6057,6 +6059,28 @@
	}
	#endif

	+/*
	+ * Post-hook for VOP_ALLOCATE: on success (rc == 0), raise IN_MODIFY for the
	+ * vnode through the INOTIFY() macro.
	+ */
	+void
	+vop_allocate_post(void *ap, int rc)
	+{
	+ struct vop_allocate_args *a;
	+
	+ a = ap;
	+ if (rc == 0)
	+ INOTIFY(a->a_vp, IN_MODIFY);
	+}
	+
	+/*
	+ * Post-hook for VOP_COPY_FILE_RANGE: on success (rc == 0), raise IN_ACCESS
	+ * for the input vnode and IN_MODIFY for the output vnode.
	+ */
	+void
	+vop_copy_file_range_post(void *ap, int rc)
	+{
	+ struct vop_copy_file_range_args *a;
	+
	+ a = ap;
	+ if (rc == 0) {
	+ INOTIFY(a->a_invp, IN_ACCESS);
	+ INOTIFY(a->a_outvp, IN_MODIFY);
	+ }
	+}
	+
	void
	vop_create_pre(void *ap)
	{
	@@ -6077,8 +6101,20 @@
	a = ap;
	dvp = a->a_dvp;
	vn_seqc_write_end(dvp);
	- if (!rc)
	+ if (!rc) {
	VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
	+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
	+ }
	+}
	+
	+/*
	+ * Post-hook for VOP_DEALLOCATE: on success (rc == 0), raise IN_MODIFY for the
	+ * vnode through the INOTIFY() macro.
	+ */
	+void
	+vop_deallocate_post(void *ap, int rc)
	+{
	+ struct vop_deallocate_args *a;
	+
	+ a = ap;
	+ if (rc == 0)
	+ INOTIFY(a->a_vp, IN_MODIFY);
	}

	void
	@@ -6123,8 +6159,10 @@
	a = ap;
	vp = a->a_vp;
	vn_seqc_write_end(vp);
	- if (!rc)
	+ if (!rc) {
	VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
	+ INOTIFY(vp, IN_ATTRIB);
	+ }
	}

	void
	@@ -6154,6 +6192,8 @@
	if (!rc) {
	VFS_KNOTE_LOCKED(vp, NOTE_LINK);
	VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE);
	+ INOTIFY_NAME(vp, tdvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT);
	+ INOTIFY_NAME(vp, tdvp, a->a_cnp, IN_CREATE);
	}
	}

	@@ -6177,8 +6217,10 @@
	a = ap;
	dvp = a->a_dvp;
	vn_seqc_write_end(dvp);
	- if (!rc)
	+ if (!rc) {
	VFS_KNOTE_LOCKED(dvp, NOTE_WRITE \| NOTE_LINK);
	+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
	+ }
	}

	#ifdef DEBUG_VFS_LOCKS
	@@ -6213,8 +6255,10 @@
	a = ap;
	dvp = a->a_dvp;
	vn_seqc_write_end(dvp);
	- if (!rc)
	+ if (!rc) {
	VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
	+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
	+ }
	}

	void
	@@ -6226,8 +6270,10 @@
	a = ap;
	vp = a->a_vp;
	ASSERT_VOP_IN_SEQC(vp);
	- if (!rc)
	+ if (!rc) {
	VFS_KNOTE_LOCKED(vp, NOTE_REVOKE);
	+ INOTIFY_REVOKE(vp);
	+ }
	}

	void
	@@ -6258,6 +6304,8 @@
	if (!rc) {
	VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
	VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
	+ INOTIFY_NAME(vp, dvp, a->a_cnp, _IN_ATTRIB_LINKCOUNT);
	+ INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE);
	}
	}

	@@ -6289,6 +6337,8 @@
	VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
	if (a->a_tvp)
	VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
	+ INOTIFY_MOVE(a->a_fvp, a->a_fdvp, a->a_fcnp, a->a_tvp,
	+ a->a_tdvp, a->a_tcnp);
	}
	if (a->a_tdvp != a->a_fdvp)
	vdrop(a->a_fdvp);
	@@ -6328,6 +6378,7 @@
	vp->v_vflag \|= VV_UNLINKED;
	VFS_KNOTE_LOCKED(dvp, NOTE_WRITE \| NOTE_LINK);
	VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
	+ INOTIFY_NAME(vp, dvp, a->a_cnp, IN_DELETE);
	}
	}

	@@ -6351,8 +6402,10 @@
	a = ap;
	vp = a->a_vp;
	vn_seqc_write_end(vp);
	- if (!rc)
	+ if (!rc) {
	VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
	+ INOTIFY(vp, IN_ATTRIB);
	+ }
	}

	void
	@@ -6397,8 +6450,10 @@
	a = ap;
	vp = a->a_vp;
	vn_seqc_write_end(vp);
	- if (!rc)
	+ if (!rc) {
	VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
	+ INOTIFY(vp, IN_ATTRIB);
	+ }
	}

	void
	@@ -6421,8 +6476,10 @@
	a = ap;
	dvp = a->a_dvp;
	vn_seqc_write_end(dvp);
	- if (!rc)
	+ if (!rc) {
	VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
	+ INOTIFY_NAME(*a->a_vpp, dvp, a->a_cnp, IN_CREATE);
	+ }
	}

	void
	@@ -6430,8 +6487,10 @@
	{
	struct vop_open_args *a = ap;

	- if (!rc)
	+ if (!rc) {
	VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
	+ INOTIFY(a->a_vp, IN_OPEN);
	+ }
	}

	void
	@@ -6443,6 +6502,8 @@
	!VN_IS_DOOMED(a->a_vp))) {
	VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
	NOTE_CLOSE_WRITE : NOTE_CLOSE);
	+ INOTIFY(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
	+ IN_CLOSE_WRITE : IN_CLOSE_NOWRITE);
	}
	}

	@@ -6451,8 +6512,10 @@
	{
	struct vop_read_args *a = ap;

	- if (!rc)
	+ if (!rc) {
	VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
	+ INOTIFY(a->a_vp, IN_ACCESS);
	+ }
	}

	void
	@@ -6469,8 +6532,10 @@
	{
	struct vop_readdir_args *a = ap;

	- if (!rc)
	+ if (!rc) {
	VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
	+ INOTIFY(a->a_vp, IN_ACCESS);
	+ }
	}

	static struct knlist fs_knlist;
	diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
	--- a/sys/kern/vfs_syscalls.c
	+++ b/sys/kern/vfs_syscalls.c
	@@ -50,6 +50,7 @@
	#include <sys/file.h>
	#include <sys/filedesc.h>
	#include <sys/filio.h>
	+#include <sys/inotify.h>
	#include <sys/jail.h>
	#include <sys/kernel.h>
	#ifdef KTRACE
	diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
	--- a/sys/kern/vfs_vnops.c
	+++ b/sys/kern/vfs_vnops.c
	@@ -51,6 +51,7 @@
	#include <sys/fcntl.h>
	#include <sys/file.h>
	#include <sys/filio.h>
	+#include <sys/inotify.h>
	#include <sys/ktr.h>
	#include <sys/ktrace.h>
	#include <sys/limits.h>
	@@ -296,7 +297,8 @@
	NDREINIT(ndp);
	goto restart;
	}
	- if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
	+ if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0 \|\|
	+ (vn_irflag_read(ndp->ni_dvp) & VIRF_INOTIFY) != 0)
	ndp->ni_cnd.cn_flags \|= MAKEENTRY;
	#ifdef MAC
	error = mac_vnode_check_create(cred, ndp->ni_dvp,
	@@ -306,12 +308,14 @@
	error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
	&ndp->ni_cnd, vap);
	vp = ndp->ni_vp;
	- if (error == 0 && (fmode & O_EXCL) != 0 &&
	- (fmode & (O_EXLOCK \| O_SHLOCK)) != 0) {
	- VI_LOCK(vp);
	- vp->v_iflag \|= VI_FOPENING;
	- VI_UNLOCK(vp);
	- first_open = true;
	+ if (error == 0) {
	+ if ((fmode & O_EXCL) != 0 &&
	+ (fmode & (O_EXLOCK \| O_SHLOCK)) != 0) {
	+ VI_LOCK(vp);
	+ vp->v_iflag \|= VI_FOPENING;
	+ VI_UNLOCK(vp);
	+ first_open = true;
	+ }
	}
	VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL,
	false);
	@@ -473,6 +477,7 @@
	if (vp->v_type != VFIFO && vp->v_type != VSOCK &&
	VOP_ACCESS(vp, VREAD, cred, td) == 0)
	fp->f_flag \|= FKQALLOWED;
	+ INOTIFY(vp, IN_OPEN);
	return (0);
	}

	@@ -1174,8 +1179,8 @@
	if (error == 0)
	#endif
	error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	- fp->f_nextoff[UIO_READ] = uio->uio_offset;
	VOP_UNLOCK(vp);
	+ fp->f_nextoff[UIO_READ] = uio->uio_offset;
	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
	orig_offset != uio->uio_offset)
	/*
	@@ -1244,8 +1249,8 @@
	if (error == 0)
	#endif
	error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	- fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
	VOP_UNLOCK(vp);
	+ fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
	if (need_finished_write)
	vn_finished_write(mp);
	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
	@@ -1735,6 +1740,8 @@
	vattr.va_vaflags \|= VA_SYNC;
	error = VOP_SETATTR(vp, &vattr, cred);
	VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
	+ if (error == 0)
	+ INOTIFY(vp, IN_MODIFY);
	}
	return (error);
	}
	diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
	--- a/sys/kern/vnode_if.src
	+++ b/sys/kern/vnode_if.src
	@@ -702,6 +702,7 @@


	%% allocate vp E E E
	+%! allocate post vop_allocate_post

	vop_allocate {
	IN struct vnode *vp;
	@@ -786,6 +787,7 @@

	%% copy_file_range invp U U U
	%% copy_file_range outvp U U U
	+%! copy_file_range post vop_copy_file_range_post

	vop_copy_file_range {
	IN struct vnode *invp;
	@@ -810,6 +812,7 @@


	%% deallocate vp L L L
	+%! deallocate post vop_deallocate_post

	vop_deallocate {
	IN struct vnode *vp;
	@@ -821,6 +824,27 @@
	};


	+%% inotify vp - - -
	+
	+vop_inotify {
	+ IN struct vnode *vp;
	+ IN struct vnode *dvp;
	+ IN struct componentname *cnp;
	+ IN int event;
	+ IN uint32_t cookie;
	+};
	+
	+
	+%% inotify_add_watch vp L L L
	+
	+vop_inotify_add_watch {
	+ IN struct vnode *vp;
	+ IN struct inotify_softc *sc;
	+ IN uint32_t mask;
	+ OUT uint32_t *wdp;
	+ IN struct thread *td;
	+};
	+
	# The VOPs below are spares at the end of the table to allow new VOPs to be
	# added in stable branches without breaking the KBI. New VOPs in HEAD should
	# be added above these spares. When merging a new VOP to a stable branch,
	diff --git a/sys/sys/caprights.h b/sys/sys/caprights.h
	--- a/sys/sys/caprights.h
	+++ b/sys/sys/caprights.h
	@@ -79,6 +79,8 @@
	extern const cap_rights_t cap_getpeername_rights;
	extern const cap_rights_t cap_getsockopt_rights;
	extern const cap_rights_t cap_getsockname_rights;
	+extern const cap_rights_t cap_inotify_add_rights;
	+extern const cap_rights_t cap_inotify_rm_rights;
	extern const cap_rights_t cap_ioctl_rights;
	extern const cap_rights_t cap_linkat_source_rights;
	extern const cap_rights_t cap_linkat_target_rights;
	diff --git a/sys/sys/capsicum.h b/sys/sys/capsicum.h
	--- a/sys/sys/capsicum.h
	+++ b/sys/sys/capsicum.h
	@@ -279,11 +279,15 @@

	#define CAP_KQUEUE (CAP_KQUEUE_EVENT \| CAP_KQUEUE_CHANGE)

	+/* Allows operations on inotify descriptors. */
	+#define CAP_INOTIFY_ADD CAPRIGHT(1, 0x0000000000200000ULL)
	+#define CAP_INOTIFY_RM CAPRIGHT(1, 0x0000000000400000ULL)
	+
	/* All used bits for index 1. */
	-#define CAP_ALL1 CAPRIGHT(1, 0x00000000001FFFFFULL)
	+#define CAP_ALL1 CAPRIGHT(1, 0x00000000007FFFFFULL)

	/* Available bits for index 1. */
	-#define CAP_UNUSED1_22 CAPRIGHT(1, 0x0000000000200000ULL)
	+#define CAP_UNUSED1_22 CAPRIGHT(1, 0x0000000000800000ULL)
	/* ... */
	#define CAP_UNUSED1_57 CAPRIGHT(1, 0x0100000000000000ULL)

	diff --git a/sys/sys/file.h b/sys/sys/file.h
	--- a/sys/sys/file.h
	+++ b/sys/sys/file.h
	@@ -71,6 +71,7 @@
	#define DTYPE_PROCDESC 12 /* process descriptor */
	#define DTYPE_EVENTFD 13 /* eventfd */
	#define DTYPE_TIMERFD 14 /* timerfd */
	+#define DTYPE_INOTIFY 15 /* inotify descriptor */

	#ifdef _KERNEL

	diff --git a/sys/sys/inotify.h b/sys/sys/inotify.h
	new file mode 100644
	--- /dev/null
	+++ b/sys/sys/inotify.h
	@@ -0,0 +1,141 @@
	+/*-
	+ * SPDX-License-Identifier: BSD-2-Clause
	+ *
	+ * Copyright (c) 2025 Klara, Inc.
	+ */
	+
	+#ifndef _INOTIFY_H_
	+#define _INOTIFY_H_
	+
	+#include <sys/_types.h>
	+
	+/* Flags for inotify_init1(). */
	+#define IN_NONBLOCK 0x00000004 /* O_NONBLOCK */
	+#define IN_CLOEXEC 0x00100000 /* O_CLOEXEC */
	+
	+/*
	+ * Record read from an inotify descriptor.  The fixed header is followed by
	+ * len bytes of name data.
	+ */
	+struct inotify_event {
	+ /* Watch descriptor the record belongs to. */
	+ int wd;
	+ /* Event bits (IN_ACCESS ... IN_OPEN) plus kernel-returned flag bits. */
	+ __uint32_t mask;
	+ /* Ties an IN_MOVED_FROM record to its IN_MOVED_TO record. */
	+ __uint32_t cookie;
	+ /* Number of bytes in name[], see _IN_NAMESIZE(); 0 if there is no name. */
	+ __uint32_t len;
	+ /* Name of the entry within the watched directory, if any. */
	+ char name[0];
	+};
	+
	+/* Events, set in the mask field. */
	+#define IN_ACCESS 0x00000001
	+#define IN_ATTRIB 0x00000002
	+#define IN_CLOSE_WRITE 0x00000004
	+#define IN_CLOSE_NOWRITE 0x00000008
	+#define IN_CLOSE (IN_CLOSE_WRITE \| IN_CLOSE_NOWRITE)
	+#define IN_CREATE 0x00000010
	+#define IN_DELETE 0x00000020
	+#define IN_DELETE_SELF 0x00000040
	+#define IN_MODIFY 0x00000080
	+#define IN_MOVE_SELF 0x00000100
	+#define IN_MOVED_FROM 0x00000200
	+#define IN_MOVED_TO 0x00000400
	+#define IN_MOVE (IN_MOVED_FROM \| IN_MOVED_TO)
	+#define IN_OPEN 0x00000800
	+#define IN_ALL_EVENTS 0x00000fff
	+
	+/* Events report only for entries in a watched dir, not the dir itself. */
	+#define _IN_DIR_EVENTS (IN_CLOSE_WRITE \| IN_DELETE \| IN_MODIFY \| \
	+ IN_MOVED_FROM \| IN_MOVED_TO)
	+
	+/* Flags, set in the mask field. */
	+#define IN_DONT_FOLLOW 0x00010000
	+#define IN_EXCL_UNLINK 0x00020000
	+#define IN_MASK_ADD 0x00040000
	+#define IN_ONESHOT 0x00080000
	+#define IN_ONLYDIR 0x00100000
	+#define IN_MASK_CREATE 0x00200000
	+#define _IN_ALL_FLAGS 0x003f0000
	+
	+/* Flags returned by the kernel. */
	+#define IN_IGNORED 0x01000000
	+#define IN_ISDIR 0x02000000
	+#define IN_Q_OVERFLOW 0x04000000
	+#define IN_UNMOUNT 0x08000000
	+#define _IN_ALL_RETFLAGS 0x0f000000
	+
	+/*
	+ * _IN_NAMESIZE() gives the size of the name field for a name of namelen
	+ * bytes: 0 when there is no name, otherwise namelen plus one byte (presumably
	+ * the NUL terminator) rounded up to the alignment of struct inotify_event.
	+ */
	+#define _IN_ALIGN _Alignof(struct inotify_event)
	+#define _IN_NAMESIZE(namelen) \
	+ ((namelen) == 0 ? 0 : __align_up((namelen) + 1, _IN_ALIGN))
	+
	+#ifdef _KERNEL
	+/*
	+ * An unlink that's done as part of a rename only records IN_DELETE if the
	+ * unlinked vnode itself is watched, and not when the containing directory is
	+ * watched.
	+ */
	+#define _IN_MOVE_DELETE 0x40000000
	+/*
	+ * Inode link count changes only trigger IN_ATTRIB events if the inode itself is
	+ * watched, and not when the containing directory is watched.
	+ */
	+#define _IN_ATTRIB_LINKCOUNT 0x80000000
	+
	+struct componentname;
	+struct file;
	+struct inotify_softc;
	+struct thread;
	+struct vnode;
	+
	+/*
	+ * Kernel-internal inotify entry points.  Every thread, file, vnode, softc
	+ * and componentname argument is a pointer: those structures are only
	+ * forward-declared above and cannot be passed by value.  The fourth
	+ * argument of vn_inotify_add_watch() matches the OUT wdp argument of
	+ * VOP_INOTIFY_ADD_WATCH().
	+ */
	+int inotify_create_file(struct thread *, struct file *, int, int *);
	+void inotify_log(struct vnode *, const char *, size_t, int, __uint32_t);
	+
	+void vn_inotify(struct vnode *, struct vnode *, struct componentname *, int,
	+    __uint32_t);
	+int vn_inotify_add_watch(struct vnode *, struct inotify_softc *,
	+    __uint32_t, __uint32_t *, struct thread *);
	+void vn_inotify_revoke(struct vnode *);
	+
	+/*
	+ * Log an inotify event for vp without a name or cookie.  VOP_INOTIFY() is
	+ * only called when vp is flagged as watched (VIRF_INOTIFY) or as possibly
	+ * having a watched parent (VIRF_INOTIFY_PARENT); the test is marked as
	+ * unlikely.  vp is evaluated more than once.
	+ */
	+#define INOTIFY(vp, ev) do { \
	+ if (__predict_false((vn_irflag_read(vp) & (VIRF_INOTIFY \| \
	+ VIRF_INOTIFY_PARENT)) != 0)) \
	+ VOP_INOTIFY((vp), NULL, NULL, (ev), 0); \
	+} while (0)
	+
	+/*
	+ * Log an inotify event using a specific name for the vnode.  The event is
	+ * forwarded to VOP_INOTIFY() with a zero cookie when either vp or the
	+ * directory dvp is flagged VIRF_INOTIFY.  vp and dvp are evaluated more than
	+ * once.
	+ */
	+#define INOTIFY_NAME(vp, dvp, cnp, ev) do { \
	+ if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0 \|\| \
	+ (vn_irflag_read(dvp) & VIRF_INOTIFY) != 0)) \
	+ VOP_INOTIFY((vp), (dvp), (cnp), (ev), 0); \
	+} while (0)
	+
	+extern __uint32_t inotify_rename_cookie;
	+
	+/*
	+ * Log a rename of vp from (fdvp, fcnp) to (tdvp, tcnp).  If vp or either
	+ * directory is flagged VIRF_INOTIFY, a cookie is taken from the global
	+ * inotify_rename_cookie counter and attached to both the IN_MOVED_FROM and
	+ * the IN_MOVED_TO event so that the two can be paired.  When tvp is not NULL
	+ * (presumably the vnode that the rename replaces -- confirm with callers),
	+ * _IN_MOVE_DELETE is additionally logged for it under the target name.
	+ */
	+#define INOTIFY_MOVE(vp, fdvp, fcnp, tvp, tdvp, tcnp) do { \
	+ if (__predict_false((vn_irflag_read(fdvp) & VIRF_INOTIFY) != 0 \|\| \
	+ (vn_irflag_read(tdvp) & VIRF_INOTIFY) != 0 \|\| \
	+ (vn_irflag_read(vp) & VIRF_INOTIFY) != 0)) { \
	+ __uint32_t cookie; \
	+ \
	+ cookie = atomic_fetchadd_32(&inotify_rename_cookie, 1); \
	+ VOP_INOTIFY((vp), (fdvp), (fcnp), IN_MOVED_FROM, cookie); \
	+ VOP_INOTIFY((vp), (tdvp), (tcnp), IN_MOVED_TO, cookie); \
	+ } \
	+ if ((tvp) != NULL) \
	+ INOTIFY_NAME((tvp), (tdvp), (tcnp), _IN_MOVE_DELETE); \
	+} while (0)
	+
	+/*
	+ * Call vn_inotify_revoke() on vp, but only if vp is flagged VIRF_INOTIFY.
	+ * Presumably this tears down the watches on vp; see vn_inotify_revoke().
	+ */
	+#define INOTIFY_REVOKE(vp) do { \
	+ if (__predict_false((vn_irflag_read(vp) & VIRF_INOTIFY) != 0)) \
	+ vn_inotify_revoke((vp)); \
	+} while (0)
	+
	+#else
	+#include <sys/cdefs.h>
	+
	+__BEGIN_DECLS
	+int inotify_init(void);
	+int inotify_init1(int flags);
	+int inotify_add_watch(int fd, const char *pathname, __uint32_t mask);
	+int inotify_add_watch_at(int fd, int dfd, const char *pathname,
	+ __uint32_t mask);
	+int inotify_rm_watch(int fd, int wd);
	+__END_DECLS
	+#endif /* !_KERNEL */
	+
	+#endif /* !_INOTIFY_H_ */
	diff --git a/sys/sys/resourcevar.h b/sys/sys/resourcevar.h
	--- a/sys/sys/resourcevar.h
	+++ b/sys/sys/resourcevar.h
	@@ -122,6 +122,8 @@
	long ui_kqcnt; /* (b) number of kqueues */
	long ui_umtxcnt; /* (b) number of shared umtxs */
	long ui_pipecnt; /* (b) consumption of pipe buffers */
	+ long ui_inotifycnt; /* (b) number of inotify descriptors */
	+ long ui_inotifywatchcnt; /* (b) number of inotify watches */
	uid_t ui_uid; /* (a) uid */
	u_int ui_ref; /* (b) reference count */
	#ifdef RACCT
	@@ -144,6 +146,8 @@
	int chgptscnt(struct uidinfo *uip, int diff, rlim_t maxval);
	int chgumtxcnt(struct uidinfo *uip, int diff, rlim_t maxval);
	int chgpipecnt(struct uidinfo *uip, int diff, rlim_t max);
	+int chginotifycnt(struct uidinfo *uip, int diff, rlim_t maxval);
	+int chginotifywatchcnt(struct uidinfo *uip, int diff, rlim_t maxval);
	int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which,
	struct rlimit *limp);
	struct plimit
	diff --git a/sys/sys/specialfd.h b/sys/sys/specialfd.h
	--- a/sys/sys/specialfd.h
	+++ b/sys/sys/specialfd.h
	@@ -30,6 +30,7 @@

	enum specialfd_type {
	SPECIALFD_EVENTFD = 1,
	+ SPECIALFD_INOTIFY = 2,
	};

	struct specialfd_eventfd {
	@@ -37,4 +38,8 @@
	int flags;
	};

	+/*
	+ * Argument block for __specialfd(SPECIALFD_INOTIFY).  flags carries the
	+ * flags given to inotify_init1().
	+ */
	+struct specialfd_inotify {
	+ int flags;
	+};
	+
	#endif /* !_SYS_SPECIALFD_H_ */
	diff --git a/sys/sys/user.h b/sys/sys/user.h
	--- a/sys/sys/user.h
	+++ b/sys/sys/user.h
	@@ -264,6 +264,7 @@
	#define KF_TYPE_DEV 12
	#define KF_TYPE_EVENTFD 13
	#define KF_TYPE_TIMERFD 14
	+#define KF_TYPE_INOTIFY 15
	#define KF_TYPE_UNKNOWN 255

	#define KF_VTYPE_VNON 0
	@@ -455,6 +456,10 @@
	int32_t kf_kqueue_count;
	int32_t kf_kqueue_state;
	} kf_kqueue;
	+ struct {
	+ uint64_t kf_inotify_npending;
	+ uint64_t kf_inotify_nbpending;
	+ } kf_inotify;
	} kf_un;
	};
	uint16_t kf_status; /* Status flags. */
	diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
	--- a/sys/sys/vnode.h
	+++ b/sys/sys/vnode.h
	@@ -86,11 +86,13 @@
	* it from v_data. If non-null, this area is freed in getnewvnode().
	*/

	-struct namecache;
	struct cache_fpl;
	+struct inotify_watch;
	+struct namecache;

	struct vpollinfo {
	struct mtx vpi_lock; /* lock to protect below */
	+ TAILQ_HEAD(, inotify_watch) vpi_inotify; /* list of inotify watchers */
	struct selinfo vpi_selinfo; /* identity of poller(s) */
	short vpi_events; /* what they are looking for */
	short vpi_revents; /* what has happened */
	@@ -248,6 +250,9 @@
	#define VIRF_CROSSMP 0x0010 /* Cross-mp vnode, no locking */
	#define VIRF_NAMEDDIR 0x0020 /* Named attribute directory */
	#define VIRF_NAMEDATTR 0x0040 /* Named attribute */
	+#define VIRF_INOTIFY 0x0080 /* This vnode is being watched */
	+#define VIRF_INOTIFY_PARENT 0x0100 /* A parent of this vnode may be being
	+ watched */

	#define VI_UNUSED0 0x0001 /* unused */
	#define VI_MOUNT 0x0002 /* Mount in progress */
	@@ -667,6 +672,7 @@
	void cache_symlink_free(char *string, size_t size);
	int cache_symlink_resolve(struct cache_fpl *fpl, const char *string,
	size_t len);
	+void cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie);
	void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
	struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp);
	void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp);
	@@ -869,8 +875,10 @@
	int vop_stdgetwritemount(struct vop_getwritemount_args *);
	int vop_stdgetpages(struct vop_getpages_args *);
	int vop_stdinactive(struct vop_inactive_args *);
	-int vop_stdioctl(struct vop_ioctl_args *);
	int vop_stdneed_inactive(struct vop_need_inactive_args *);
	+int vop_stdinotify(struct vop_inotify_args *);
	+int vop_stdinotify_add_watch(struct vop_inotify_add_watch_args *);
	+int vop_stdioctl(struct vop_ioctl_args *);
	int vop_stdkqfilter(struct vop_kqfilter_args *);
	int vop_stdlock(struct vop_lock1_args *);
	int vop_stdunlock(struct vop_unlock_args *);
	@@ -910,9 +918,12 @@
	int dead_write(struct vop_write_args *ap);

	/* These are called from within the actual VOPS. */
	+void vop_allocate_post(void *a, int rc);
	+void vop_copy_file_range_post(void *ap, int rc);
	void vop_close_post(void *a, int rc);
	void vop_create_pre(void *a);
	void vop_create_post(void *a, int rc);
	+void vop_deallocate_post(void *a, int rc);
	void vop_whiteout_pre(void *a);
	void vop_whiteout_post(void *a, int rc);
	void vop_deleteextattr_pre(void *a);
	@@ -1020,9 +1031,12 @@

	+/*
	+ * Post-write hook.  If the uio offset advanced past ooffset, post NOTE_WRITE
	+ * (plus NOTE_EXTEND when the write went beyond osize) to any knote listeners
	+ * and log IN_MODIFY for inotify.  The knote is only posted when the vnode's
	+ * knlist is NOT empty; the inotify event is logged regardless of kqueue
	+ * listeners.  noffset, ooffset and osize must be in scope at the expansion
	+ * site.
	+ */
	#define VOP_WRITE_POST(ap, ret) \
	noffset = (ap)->a_uio->uio_offset; \
	- if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \
	- VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \
	- \| (noffset > osize ? NOTE_EXTEND : 0)); \
	+ if (noffset > ooffset) { \
	+ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \
	+ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \| \
	+ (noffset > osize ? NOTE_EXTEND : 0)); \
	+ } \
	+ INOTIFY((ap)->a_vp, IN_MODIFY); \
	}

	#define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__)
	diff --git a/sys/tools/vnode_if.awk b/sys/tools/vnode_if.awk
	--- a/sys/tools/vnode_if.awk
	+++ b/sys/tools/vnode_if.awk
	@@ -193,6 +193,7 @@
	printc(common_head \
	"#include <sys/param.h>\n" \
	"#include <sys/event.h>\n" \
	+ "#include <sys/inotify.h>\n" \
	"#include <sys/kernel.h>\n" \
	"#include <sys/mount.h>\n" \
	"#include <sys/sdt.h>\n" \
	diff --git a/tests/sys/kern/Makefile b/tests/sys/kern/Makefile
	--- a/tests/sys/kern/Makefile
	+++ b/tests/sys/kern/Makefile
	@@ -18,6 +18,7 @@
	# One test modifies the maxfiles limit, which can cause spurious test failures.
	TEST_METADATA.kern_descrip_test+= is_exclusive="true"
	ATF_TESTS_C+= fdgrowtable_test
	+ATF_TESTS_C+= inotify_test
	ATF_TESTS_C+= kill_zombie
	.if ${MK_OPENSSL} != "no"
	ATF_TESTS_C+= ktls_test
	@@ -83,6 +84,7 @@
	LIBADD.sys_getrandom+= pthread
	LIBADD.ptrace_test+= pthread
	LIBADD.unix_seqpacket_test+= pthread
	+LIBADD.inotify_test+= util
	LIBADD.kcov+= pthread
	CFLAGS.ktls_test+= -DOPENSSL_API_COMPAT=0x10100000L
	LIBADD.ktls_test+= crypto util
	diff --git a/tests/sys/kern/inotify_test.c b/tests/sys/kern/inotify_test.c
	new file mode 100644
	--- /dev/null
	+++ b/tests/sys/kern/inotify_test.c
	@@ -0,0 +1,861 @@
	+/*-
	+ * SPDX-License-Identifier: BSD-2-Clause
	+ *
	+ * Copyright (c) 2025 Klara, Inc.
	+ */
	+
	+#include <sys/capsicum.h>
	+#include <sys/filio.h>
	+#include <sys/inotify.h>
	+#include <sys/ioccom.h>
	+#include <sys/mount.h>
	+#include <sys/socket.h>
	+#include <sys/stat.h>
	+#include <sys/sysctl.h>
	+#include <sys/un.h>
	+
	+#include <dirent.h>
	+#include <errno.h>
	+#include <fcntl.h>
	+#include <limits.h>
	+#include <mntopts.h>
	+#include <stdio.h>
	+#include <stdlib.h>
	+#include <string.h>
	+#include <unistd.h>
	+
	+#include <atf-c.h>
	+
	+/*
	+ * Map a single IN_* event bit to its symbolic name, for use in diagnostics.
	+ * Returns NULL when the value is not exactly one of the known event bits
	+ * (this includes 0 and any combination of bits).
	+ */
	+static const char *
	+ev2name(int event)
	+{
	+	static const struct {
	+		int		bit;
	+		const char	*label;
	+	} known[] = {
	+		{ IN_ACCESS,		"IN_ACCESS" },
	+		{ IN_ATTRIB,		"IN_ATTRIB" },
	+		{ IN_CLOSE_WRITE,	"IN_CLOSE_WRITE" },
	+		{ IN_CLOSE_NOWRITE,	"IN_CLOSE_NOWRITE" },
	+		{ IN_CREATE,		"IN_CREATE" },
	+		{ IN_DELETE,		"IN_DELETE" },
	+		{ IN_DELETE_SELF,	"IN_DELETE_SELF" },
	+		{ IN_MODIFY,		"IN_MODIFY" },
	+		{ IN_MOVE_SELF,		"IN_MOVE_SELF" },
	+		{ IN_MOVED_FROM,	"IN_MOVED_FROM" },
	+		{ IN_MOVED_TO,		"IN_MOVED_TO" },
	+		{ IN_OPEN,		"IN_OPEN" },
	+	};
	+
	+	for (size_t i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
	+		if (known[i].bit == event)
	+			return (known[i].label);
	+	}
	+	return (NULL);
	+}
	+
	+/* Close a descriptor, failing the test case if close(2) reports an error. */
	+static void
	+close_checked(int fd)
	+{
	+ ATF_REQUIRE(close(fd) == 0);
	+}
	+
	+/*
	+ * Make sure that no other events are pending, and close the inotify descriptor.
	+ * FIONREAD reports the number of bytes still queued, so a record that the
	+ * test did not consume shows up here as a non-zero count and fails the test.
	+ */
	+static void
	+close_inotify(int fd)
	+{
	+ int n;
	+
	+ ATF_REQUIRE(ioctl(fd, FIONREAD, &n) == 0);
	+ ATF_REQUIRE(n == 0);
	+ close_checked(fd);
	+}
	+
	+/*
	+ * Read exactly one record from the inotify descriptor ifd and check it
	+ * against the expected watch descriptor, event bit, returned-flag bits and
	+ * (optionally) name.  Pass name == NULL when the record carries no name.
	+ * Returns the record's cookie, which is required to be 0 for anything other
	+ * than IN_MOVED_FROM/IN_MOVED_TO.
	+ */
	+static uint32_t
	+consume_event_cookie(int ifd, int wd, int event, int flags, const char *name)
	+{
	+	struct inotify_event *ev;
	+	const char *evname;
	+	size_t evsz, namelen;
	+	ssize_t n;
	+	uint32_t cookie;
	+
	+	/*
	+	 * ev2name() returns NULL for flag-only records (event == 0); never
	+	 * hand a NULL pointer to a %s conversion.
	+	 */
	+	evname = ev2name(event);
	+	if (evname == NULL)
	+		evname = "(none)";
	+
	+	/* Only read one record. */
	+	namelen = name == NULL ? 0 : strlen(name);
	+	evsz = sizeof(*ev) + _IN_NAMESIZE(namelen);
	+	ev = malloc(evsz);
	+	ATF_REQUIRE(ev != NULL);
	+
	+	n = read(ifd, ev, evsz);
	+	ATF_REQUIRE_MSG(n >= 0, "failed to read event %s: %s", evname,
	+	    strerror(errno));
	+	/* The header must be complete before ev->len can be trusted. */
	+	ATF_REQUIRE((size_t)n >= sizeof(*ev));
	+	ATF_REQUIRE((size_t)n == sizeof(*ev) + ev->len);
	+	ATF_REQUIRE((size_t)n == evsz);
	+
	+	ATF_REQUIRE_MSG((ev->mask & IN_ALL_EVENTS) == event,
	+	    "expected event %#x, got %#x", event, ev->mask);
	+	ATF_REQUIRE_MSG((ev->mask & _IN_ALL_RETFLAGS) == flags,
	+	    "expected flags %#x, got %#x", flags, ev->mask);
	+	ATF_REQUIRE_MSG(ev->wd == wd,
	+	    "expected wd %d, got %d", wd, ev->wd);
	+	ATF_REQUIRE_MSG(name == NULL || strcmp(name, ev->name) == 0,
	+	    "expected name '%s', got '%s'", name, ev->name);
	+	cookie = ev->cookie;
	+	if ((ev->mask & (IN_MOVED_FROM | IN_MOVED_TO)) == 0)
	+		ATF_REQUIRE(cookie == 0);
	+	free(ev);
	+	return (cookie);
	+}
	+
	+/*
	+ * Read an event from the inotify file descriptor and check that it
	+ * matches the expected values.  This is consume_event_cookie() for callers
	+ * that have no use for the record's cookie.
	+ */
	+static void
	+consume_event(int ifd, int wd, int event, int flags, const char *name)
	+{
	+ (void)consume_event_cookie(ifd, wd, event, flags, name);
	+}
	+
	+/*
	+ * Create an inotify descriptor with the given inotify_init1() flags.  The
	+ * test case fails if the descriptor cannot be created, so the return value
	+ * is always valid.
	+ */
	+static int
	+inotify(int flags)
	+{
	+ int ifd;
	+
	+ ifd = inotify_init1(flags);
	+ ATF_REQUIRE(ifd != -1);
	+ return (ifd);
	+}
	+
	+/*
	+ * Mount the directory src onto dir using nullfs.  The test case fails with
	+ * the kernel-provided error message (or strerror(errno) when none was
	+ * supplied) if nmount(2) fails.  Both arguments are path strings.
	+ */
	+static void
	+mount_nullfs(char *dir, char *src)
	+{
	+	struct iovec *iov;
	+	char errmsg[1024];
	+	int error, iovlen;
	+
	+	iov = NULL;
	+	iovlen = 0;
	+
	+	build_iovec(&iov, &iovlen, "fstype", "nullfs", (size_t)-1);
	+	build_iovec(&iov, &iovlen, "fspath", dir, (size_t)-1);
	+	build_iovec(&iov, &iovlen, "target", src, (size_t)-1);
	+	build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg));
	+
	+	/* An empty buffer afterwards means the kernel gave no message. */
	+	errmsg[0] = '\0';
	+	error = nmount(iov, iovlen, 0);
	+	ATF_REQUIRE_MSG(error == 0,
	+	    "mount nullfs %s %s: %s", src, dir,
	+	    errmsg[0] == '\0' ? strerror(errno) : errmsg);
	+
	+	free_iovec(&iov, &iovlen);
	+}
	+
	+/*
	+ * Mount a fresh tmpfs instance on dir.  On failure the test case is aborted
	+ * with the kernel's error message, falling back to strerror(errno) when the
	+ * kernel did not provide one.
	+ */
	+static void
	+mount_tmpfs(const char *dir)
	+{
	+	char errmsg[1024];
	+	struct iovec *iov = NULL;
	+	int iovlen = 0;
	+	int rc;
	+
	+	build_iovec(&iov, &iovlen, "fstype", "tmpfs", (size_t)-1);
	+	build_iovec(&iov, &iovlen, "fspath", __DECONST(char *, dir),
	+	    (size_t)-1);
	+	build_iovec(&iov, &iovlen, "errmsg", errmsg, sizeof(errmsg));
	+
	+	/* Start with an empty message so "no message" can be detected. */
	+	errmsg[0] = '\0';
	+	rc = nmount(iov, iovlen, 0);
	+	ATF_REQUIRE_MSG(rc == 0,
	+	    "mount tmpfs %s: %s", dir,
	+	    errmsg[0] != '\0' ? errmsg : strerror(errno));
	+
	+	free_iovec(&iov, &iovlen);
	+}
	+
	+/*
	+ * Create a temporary regular file from the template "test.XXXXXX" in the
	+ * current directory and add a watch for `events` on it via ifd.  The
	+ * generated name is stored in path, which must be at least PATH_MAX bytes.
	+ * Returns the watch descriptor; the test case fails if any step fails.
	+ */
	+static int
	+watch_file(int ifd, int events, char *path)
	+{
	+	int fd, wd;
	+
	+	/* strlcpy() always terminates the template and matches watch_dir(). */
	+	strlcpy(path, "test.XXXXXX", PATH_MAX);
	+	fd = mkstemp(path);
	+	ATF_REQUIRE(fd != -1);
	+	close_checked(fd);
	+
	+	wd = inotify_add_watch(ifd, path, events);
	+	ATF_REQUIRE(wd != -1);
	+
	+	return (wd);
	+}
	+
	+/*
	+ * Create a temporary directory from the template "test.XXXXXX" in the
	+ * current directory and add a watch for `events` on it via ifd.  The
	+ * generated name is stored in path (the copy is bounded by PATH_MAX).
	+ * Returns the watch descriptor; the test case fails if any step fails.
	+ */
	+static int
	+watch_dir(int ifd, int events, char *path)
	+{
	+ char *p;
	+ int wd;
	+
	+ strlcpy(path, "test.XXXXXX", PATH_MAX);
	+ p = mkdtemp(path);
	+ ATF_REQUIRE(p == path);
	+
	+ wd = inotify_add_watch(ifd, path, events);
	+ ATF_REQUIRE(wd != -1);
	+
	+ return (wd);
	+}
	+
	+/*
	+ * Verify that Capsicum restrictions are applied as expected: in capability
	+ * mode a watch can only be added relative to a directory descriptor.
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_capsicum);
	+ATF_TC_BODY(inotify_capsicum, tc)
	+{
	+ int error, dfd, ifd, wd;
	+
	+ ifd = inotify(IN_NONBLOCK);
	+ ATF_REQUIRE(ifd != -1);
	+
	+ /*
	+ * The directory descriptor and the test directory are set up before
	+ * cap_enter(); dfd is what inotify_add_watch_at() resolves against below.
	+ */
	+ dfd = open(".", O_RDONLY \| O_DIRECTORY);
	+ ATF_REQUIRE(dfd != -1);
	+
	+ error = mkdirat(dfd, "testdir", 0755);
	+ ATF_REQUIRE(error == 0);
	+
	+ error = cap_enter();
	+ ATF_REQUIRE(error == 0);
	+
	+ /*
	+ * Plain inotify_add_watch() is disallowed.
	+ */
	+ wd = inotify_add_watch(ifd, ".", IN_DELETE_SELF);
	+ ATF_REQUIRE_ERRNO(ECAPMODE, wd == -1);
	+ wd = inotify_add_watch_at(ifd, dfd, "testdir", IN_DELETE_SELF);
	+ ATF_REQUIRE(wd >= 0);
	+
	+ /*
	+ * Generate a record and consume it.
	+ */
	+ error = unlinkat(dfd, "testdir", AT_REMOVEDIR);
	+ ATF_REQUIRE(error == 0);
	+ consume_event(ifd, wd, IN_DELETE_SELF, IN_ISDIR, NULL);
	+ /* Removing the watched directory also ends the watch. */
	+ consume_event(ifd, wd, 0, IN_IGNORED, NULL);
	+
	+ close_checked(dfd);
	+ close_inotify(ifd);
	+}
	+
	+/*
	+ * Make sure that duplicate, back-to-back events are coalesced.
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_coalesce);
	+ATF_TC_BODY(inotify_coalesce, tc)
	+{
	+ char file[PATH_MAX], path[PATH_MAX];
	+ int fd, fd1, ifd, n, wd;
	+
	+ ifd = inotify(IN_NONBLOCK);
	+
	+ /* Create a directory and watch it. */
	+ wd = watch_dir(ifd, IN_OPEN, path);
	+ /* Create a file in the directory and open it. */
	+ snprintf(file, sizeof(file), "%s/file", path);
	+ fd = open(file, O_RDWR \| O_CREAT, 0644);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ fd = open(file, O_RDWR);
	+ ATF_REQUIRE(fd != -1);
	+ fd1 = open(file, O_RDONLY);
	+ ATF_REQUIRE(fd1 != -1);
	+ close_checked(fd1);
	+ close_checked(fd);
	+
	+ /*
	+ * The file was opened three times without the queue being read in
	+ * between; only a single IN_OPEN record may be queued.
	+ */
	+ consume_event(ifd, wd, IN_OPEN, 0, "file");
	+ ATF_REQUIRE(ioctl(ifd, FIONREAD, &n) == 0);
	+ ATF_REQUIRE(n == 0);
	+
	+ close_inotify(ifd);
	+}
	+
	+/*
	+ * Check handling of IN_MASK_CREATE: it must refuse to touch an existing
	+ * watch (EEXIST), cannot be combined with IN_MASK_ADD (EINVAL), and a plain
	+ * re-add of the same path must return the existing watch descriptor.
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_mask_create);
	+ATF_TC_BODY(inotify_mask_create, tc)
	+{
	+ char path[PATH_MAX];
	+ int ifd, wd, wd1;
	+
	+ ifd = inotify(IN_NONBLOCK);
	+
	+ /* Create a directory and watch it. */
	+ wd = watch_dir(ifd, IN_CREATE, path);
	+ /* Updating the watch with IN_MASK_CREATE should result in an error. */
	+ wd1 = inotify_add_watch(ifd, path, IN_MODIFY \| IN_MASK_CREATE);
	+ ATF_REQUIRE_ERRNO(EEXIST, wd1 == -1);
	+ /* It's an error to specify IN_MASK_ADD with IN_MASK_CREATE. */
	+ wd1 = inotify_add_watch(ifd, path, IN_MODIFY \| IN_MASK_ADD \|
	+ IN_MASK_CREATE);
	+ ATF_REQUIRE_ERRNO(EINVAL, wd1 == -1);
	+ /* Updating the watch without IN_MASK_CREATE should work. */
	+ wd1 = inotify_add_watch(ifd, path, IN_MODIFY);
	+ ATF_REQUIRE(wd1 != -1);
	+ ATF_REQUIRE_EQ(wd, wd1);
	+
	+ close_inotify(ifd);
	+}
	+
	+/*
	+ * Make sure that inotify cooperates with nullfs: if a lower vnode is the
	+ * subject of an event, the upper vnode should be notified, and if the upper
	+ * vnode is the subject of an event, the lower vnode should be notified.
	+ *
	+ * Mounting requires root.  The nullfs mount on ./mnt is left in place by the
	+ * body and removed by the cleanup routine.
	+ */
	+ATF_TC_WITH_CLEANUP(inotify_nullfs);
	+ATF_TC_HEAD(inotify_nullfs, tc)
	+{
	+	atf_tc_set_md_var(tc, "require.user", "root");
	+}
	+ATF_TC_BODY(inotify_nullfs, tc)
	+{
	+	char path[PATH_MAX], *p;
	+	int dfd, error, fd, ifd, mask, wd;
	+
	+	mask = IN_CREATE | IN_OPEN;
	+
	+	ifd = inotify(IN_NONBLOCK);
	+
	+	strlcpy(path, "./test.XXXXXX", sizeof(path));
	+	p = mkdtemp(path);
	+	ATF_REQUIRE(p == path);
	+
	+	error = mkdir("./mnt", 0755);
	+	ATF_REQUIRE(error == 0);
	+
	+	/* Mount the testdir onto ./mnt. */
	+	mount_nullfs("./mnt", path);
	+
	+	wd = inotify_add_watch(ifd, "./mnt", mask);
	+	ATF_REQUIRE(wd != -1);
	+
	+	/* Create a file in the lower directory and open it. */
	+	dfd = open(path, O_RDONLY | O_DIRECTORY);
	+	ATF_REQUIRE(dfd != -1);
	+	fd = openat(dfd, "file", O_RDWR | O_CREAT, 0644);
	+	/* Report a failed create here, not as a failed close(-1). */
	+	ATF_REQUIRE(fd != -1);
	+	close_checked(fd);
	+	close_checked(dfd);
	+
	+	/* We should see events via the nullfs mount. */
	+	consume_event(ifd, wd, IN_OPEN, IN_ISDIR, NULL);
	+	consume_event(ifd, wd, IN_CREATE, 0, "file");
	+	consume_event(ifd, wd, IN_OPEN, 0, "file");
	+
	+	error = inotify_rm_watch(ifd, wd);
	+	ATF_REQUIRE(error == 0);
	+	consume_event(ifd, wd, 0, IN_IGNORED, NULL);
	+
	+	/* Watch the lower directory. */
	+	wd = inotify_add_watch(ifd, path, mask);
	+	ATF_REQUIRE(wd != -1);
	+	/* ... and create a file in the upper directory and open it. */
	+	dfd = open("./mnt", O_RDONLY | O_DIRECTORY);
	+	ATF_REQUIRE(dfd != -1);
	+	fd = openat(dfd, "file2", O_RDWR | O_CREAT, 0644);
	+	ATF_REQUIRE(fd != -1);
	+	close_checked(fd);
	+	close_checked(dfd);
	+
	+	/* We should see events via the lower directory. */
	+	consume_event(ifd, wd, IN_OPEN, IN_ISDIR, NULL);
	+	consume_event(ifd, wd, IN_CREATE, 0, "file2");
	+	consume_event(ifd, wd, IN_OPEN, 0, "file2");
	+
	+	close_inotify(ifd);
	+}
	+/*
	+ * Detach whatever the test body mounted on ./mnt.  A failed unmount is
	+ * reported on stderr and turns into a non-zero exit status.
	+ */
	+ATF_TC_CLEANUP(inotify_nullfs, tc)
	+{
	+	if (unmount("./mnt", 0) == 0)
	+		return;
	+
	+	perror("unmount");
	+	exit(1);
	+}
	+
	+/*
	+ * Make sure that exceeding max_events pending events results in an overflow
	+ * event.  The limit is read from the vfs.inotify.max_events sysctl.
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_queue_overflow);
	+ATF_TC_BODY(inotify_queue_overflow, tc)
	+{
	+ char path[PATH_MAX];
	+ size_t size;
	+ int error, dfd, ifd, max, wd;
	+
	+ size = sizeof(max);
	+ error = sysctlbyname("vfs.inotify.max_events", &max, &size, NULL, 0);
	+ ATF_REQUIRE(error == 0);
	+
	+ ifd = inotify(IN_NONBLOCK);
	+
	+ /* Create a directory and watch it for file creation events. */
	+ wd = watch_dir(ifd, IN_CREATE, path);
	+ dfd = open(path, O_DIRECTORY);
	+ ATF_REQUIRE(dfd != -1);
	+ /* Generate max+1 file creation events. */
	+ for (int i = 0; i < max + 1; i++) {
	+ char name[NAME_MAX];
	+ int fd;
	+
	+ (void)snprintf(name, sizeof(name), "file%d", i);
	+ fd = openat(dfd, name, O_CREAT \| O_RDWR, 0644);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ }
	+
	+ /*
	+ * Read our events. We should see files 0..max-1 and then an overflow
	+ * event.
	+ */
	+ for (int i = 0; i < max; i++) {
	+ char name[NAME_MAX];
	+
	+ (void)snprintf(name, sizeof(name), "file%d", i);
	+ consume_event(ifd, wd, IN_CREATE, 0, name);
	+ }
	+
	+ /*
	+ * Look for an overflow event.  It is expected with a watch descriptor
	+ * of -1 and no event bits, only IN_Q_OVERFLOW.
	+ */
	+ consume_event(ifd, -1, 0, IN_Q_OVERFLOW, NULL);
	+
	+ close_checked(dfd);
	+ close_inotify(ifd);
	+}
	+
	+/*
	+ * Verify that reading a watched regular file generates IN_ACCESS, whether
	+ * the data is read with read(2), copy_file_range(2) or sendfile(2).
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_event_access_file);
	+ATF_TC_BODY(inotify_event_access_file, tc)
	+{
	+	char path[PATH_MAX], buf[16];
	+	off_t nb;
	+	ssize_t n;
	+	int error, fd, fd1, ifd, s[2], wd;
	+
	+	ifd = inotify(IN_NONBLOCK);
	+
	+	wd = watch_file(ifd, IN_ACCESS, path);
	+
	+	fd = open(path, O_RDWR);
	+	/* Fail on the open itself rather than on write(2) with a bad fd. */
	+	ATF_REQUIRE(fd != -1);
	+	n = write(fd, "test", 4);
	+	ATF_REQUIRE(n == 4);
	+
	+	/* A simple read(2) should generate an access. */
	+	ATF_REQUIRE(lseek(fd, 0, SEEK_SET) == 0);
	+	n = read(fd, buf, sizeof(buf));
	+	ATF_REQUIRE(n == 4);
	+	ATF_REQUIRE(memcmp(buf, "test", 4) == 0);
	+	consume_event(ifd, wd, IN_ACCESS, 0, NULL);
	+
	+	/* copy_file_range(2) should as well. */
	+	ATF_REQUIRE(lseek(fd, 0, SEEK_SET) == 0);
	+	fd1 = open("sink", O_RDWR | O_CREAT, 0644);
	+	ATF_REQUIRE(fd1 != -1);
	+	n = copy_file_range(fd, NULL, fd1, NULL, 4, 0);
	+	ATF_REQUIRE(n == 4);
	+	close_checked(fd1);
	+	consume_event(ifd, wd, IN_ACCESS, 0, NULL);
	+
	+	/* As should sendfile(2). */
	+	error = socketpair(AF_UNIX, SOCK_STREAM, 0, s);
	+	ATF_REQUIRE(error == 0);
	+	error = sendfile(fd, s[0], 0, 4, NULL, &nb, 0);
	+	ATF_REQUIRE(error == 0);
	+	ATF_REQUIRE(nb == 4);
	+	consume_event(ifd, wd, IN_ACCESS, 0, NULL);
	+	close_checked(s[0]);
	+	close_checked(s[1]);
	+
	+	close_checked(fd);
	+
	+	close_inotify(ifd);
	+}
	+
	+/*
	+ * Verify that reading directory entries generates IN_ACCESS, both for a
	+ * subdirectory of a watched directory and for the watched directory itself.
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_event_access_dir);
	+ATF_TC_BODY(inotify_event_access_dir, tc)
	+{
	+ char root[PATH_MAX], path[PATH_MAX];
	+ struct dirent *ent;
	+ DIR *dir;
	+ int error, ifd, wd;
	+
	+ ifd = inotify(IN_NONBLOCK);
	+
	+ wd = watch_dir(ifd, IN_ACCESS, root);
	+ snprintf(path, sizeof(path), "%s/dir", root);
	+ error = mkdir(path, 0755);
	+ ATF_REQUIRE(error == 0);
	+
	+ /* Read an entry and generate an access. */
	+ dir = opendir(path);
	+ ATF_REQUIRE(dir != NULL);
	+ ent = readdir(dir);
	+ ATF_REQUIRE(ent != NULL);
	+ ATF_REQUIRE(strcmp(ent->d_name, ".") == 0 \|\|
	+ strcmp(ent->d_name, "..") == 0);
	+ ATF_REQUIRE(closedir(dir) == 0);
	+ /* The subdirectory is reported by name, relative to the watched root. */
	+ consume_event(ifd, wd, IN_ACCESS, IN_ISDIR, "dir");
	+
	+ /*
	+ * Reading the watched directory should generate an access event.
	+ * This is contrary to Linux's inotify man page, which states that
	+ * IN_ACCESS is only generated for accesses to objects in a watched
	+ * directory.
	+ */
	+ dir = opendir(root);
	+ ATF_REQUIRE(dir != NULL);
	+ ent = readdir(dir);
	+ ATF_REQUIRE(ent != NULL);
	+ ATF_REQUIRE(strcmp(ent->d_name, ".") == 0 \|\|
	+ strcmp(ent->d_name, "..") == 0);
	+ ATF_REQUIRE(closedir(dir) == 0);
	+ consume_event(ifd, wd, IN_ACCESS, IN_ISDIR, NULL);
	+
	+ close_inotify(ifd);
	+}
	+
	+/*
	+ * Verify that mode and ownership changes on a watched file each generate an
	+ * IN_ATTRIB event.
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_event_attrib);
	+ATF_TC_BODY(inotify_event_attrib, tc)
	+{
	+ char path[PATH_MAX];
	+ int error, ifd, fd, wd;
	+
	+ ifd = inotify(IN_NONBLOCK);
	+
	+ wd = watch_file(ifd, IN_ATTRIB, path);
	+
	+ fd = open(path, O_RDWR);
	+ ATF_REQUIRE(fd != -1);
	+ error = fchmod(fd, 0600);
	+ ATF_REQUIRE(error == 0);
	+ consume_event(ifd, wd, IN_ATTRIB, 0, NULL);
	+
	+ /*
	+ * NOTE(review): this test case declares no require.user; fchown() to
	+ * uid/gid 0 presumably fails when run unprivileged -- confirm.
	+ */
	+ error = fchown(fd, 0, 0);
	+ ATF_REQUIRE(error == 0);
	+ consume_event(ifd, wd, IN_ATTRIB, 0, NULL);
	+
	+ close_checked(fd);
	+ close_inotify(ifd);
	+}
	+
	+/*
	+ * Verify that closing a descriptor that was not opened for writing generates
	+ * IN_CLOSE_NOWRITE: for a watched directory, for a watched file, and for a
	+ * file inside a watched directory.
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_event_close_nowrite);
	+ATF_TC_BODY(inotify_event_close_nowrite, tc)
	+{
	+ char file[PATH_MAX], file1[PATH_MAX], dir[PATH_MAX];
	+ int ifd, fd, wd1, wd2;
	+
	+ ifd = inotify(IN_NONBLOCK);
	+
	+ wd1 = watch_dir(ifd, IN_CLOSE_NOWRITE, dir);
	+ wd2 = watch_file(ifd, IN_CLOSE_NOWRITE \| IN_CLOSE_WRITE, file);
	+
	+ /* The watched directory itself. */
	+ fd = open(dir, O_DIRECTORY);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ consume_event(ifd, wd1, IN_CLOSE_NOWRITE, IN_ISDIR, NULL);
	+
	+ /* The watched file: a read-only open must not report IN_CLOSE_WRITE. */
	+ fd = open(file, O_RDONLY);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ consume_event(ifd, wd2, IN_CLOSE_NOWRITE, 0, NULL);
	+
	+ /* A file created read-only inside the watched directory. */
	+ snprintf(file1, sizeof(file1), "%s/file", dir);
	+ fd = open(file1, O_RDONLY \| O_CREAT, 0644);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ consume_event(ifd, wd1, IN_CLOSE_NOWRITE, 0, "file");
	+
	+ close_inotify(ifd);
	+}
	+
	+/*
	+ * Verify that closing a descriptor opened read-write generates
	+ * IN_CLOSE_WRITE, and only that, even though the watch also asks for
	+ * IN_CLOSE_NOWRITE.
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_event_close_write);
	+ATF_TC_BODY(inotify_event_close_write, tc)
	+{
	+ char path[PATH_MAX];
	+ int ifd, fd, wd;
	+
	+ ifd = inotify(IN_NONBLOCK);
	+
	+ wd = watch_file(ifd, IN_CLOSE_NOWRITE \| IN_CLOSE_WRITE, path);
	+
	+ fd = open(path, O_RDWR);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ consume_event(ifd, wd, IN_CLOSE_WRITE, 0, NULL);
	+
	+ close_inotify(ifd);
	+}
	+
	+/*
	+ * Verify that various operations in a directory generate IN_CREATE events:
	+ * creating a regular file, a hard link, a directory, a symbolic link, a
	+ * FIFO, and binding a UNIX-domain socket.
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_event_create);
	+ATF_TC_BODY(inotify_event_create, tc)
	+{
	+	struct sockaddr_un sun;
	+	char path[PATH_MAX], path1[PATH_MAX], root[PATH_MAX];
	+	ssize_t n;
	+	int error, ifd, ifd1, fd, s, wd, wd1;
	+	char b;
	+
	+	ifd = inotify(IN_NONBLOCK);
	+
	+	wd = watch_dir(ifd, IN_CREATE, root);
	+
	+	/* Regular file. */
	+	snprintf(path, sizeof(path), "%s/file", root);
	+	fd = open(path, O_RDWR | O_CREAT, 0644);
	+	ATF_REQUIRE(fd != -1);
	+	/*
	+	 * Make sure we get an event triggered by the fd used to create the
	+	 * file.
	+	 */
	+	ifd1 = inotify(IN_NONBLOCK);
	+	wd1 = inotify_add_watch(ifd1, root, IN_MODIFY);
	+	/* Without a valid watch the IN_MODIFY check below is meaningless. */
	+	ATF_REQUIRE(wd1 != -1);
	+	b = 42;
	+	n = write(fd, &b, sizeof(b));
	+	ATF_REQUIRE(n == sizeof(b));
	+	close_checked(fd);
	+	consume_event(ifd, wd, IN_CREATE, 0, "file");
	+	consume_event(ifd1, wd1, IN_MODIFY, 0, "file");
	+	close_inotify(ifd1);
	+
	+	/* Hard link. */
	+	snprintf(path1, sizeof(path1), "%s/link", root);
	+	error = link(path, path1);
	+	ATF_REQUIRE(error == 0);
	+	consume_event(ifd, wd, IN_CREATE, 0, "link");
	+
	+	/* Directory. */
	+	snprintf(path, sizeof(path), "%s/dir", root);
	+	error = mkdir(path, 0755);
	+	ATF_REQUIRE(error == 0);
	+	consume_event(ifd, wd, IN_CREATE, IN_ISDIR, "dir");
	+
	+	/* Symbolic link. */
	+	snprintf(path1, sizeof(path1), "%s/symlink", root);
	+	error = symlink(path, path1);
	+	ATF_REQUIRE(error == 0);
	+	consume_event(ifd, wd, IN_CREATE, 0, "symlink");
	+
	+	/* FIFO. */
	+	snprintf(path, sizeof(path), "%s/fifo", root);
	+	error = mkfifo(path, 0644);
	+	ATF_REQUIRE(error == 0);
	+	consume_event(ifd, wd, IN_CREATE, 0, "fifo");
	+
	+	/* Binding a socket. */
	+	s = socket(AF_UNIX, SOCK_STREAM, 0);
	+	ATF_REQUIRE(s != -1);
	+	memset(&sun, 0, sizeof(sun));
	+	sun.sun_family = AF_UNIX;
	+	sun.sun_len = sizeof(sun);
	+	snprintf(sun.sun_path, sizeof(sun.sun_path), "%s/socket", root);
	+	error = bind(s, (struct sockaddr *)&sun, sizeof(sun));
	+	ATF_REQUIRE(error == 0);
	+	close_checked(s);
	+	consume_event(ifd, wd, IN_CREATE, 0, "socket");
	+
	+	close_inotify(ifd);
	+}
	+
	+/*
	+ * Verify IN_DELETE and IN_DELETE_SELF reporting for unlink(2), for a rename
	+ * that replaces a watched file, and for rmdir(2) of the watched directory.
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_event_delete);
	+ATF_TC_BODY(inotify_event_delete, tc)
	+{
	+ char root[PATH_MAX], path[PATH_MAX], file[PATH_MAX];
	+ int error, fd, ifd, wd, wd2;
	+
	+ ifd = inotify(IN_NONBLOCK);
	+
	+ wd = watch_dir(ifd, IN_DELETE \| IN_DELETE_SELF, root);
	+
	+ /* The event is expected although the file is still open. */
	+ snprintf(path, sizeof(path), "%s/file", root);
	+ fd = open(path, O_RDWR \| O_CREAT, 0644);
	+ ATF_REQUIRE(fd != -1);
	+ error = unlink(path);
	+ ATF_REQUIRE(error == 0);
	+ consume_event(ifd, wd, IN_DELETE, 0, "file");
	+ close_checked(fd);
	+
	+ /*
	+ * Make sure that renaming over a file generates a delete event when and
	+ * only when that file is watched.
	+ */
	+ fd = open(path, O_RDWR \| O_CREAT, 0644);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ wd2 = inotify_add_watch(ifd, path, IN_DELETE \| IN_DELETE_SELF);
	+ ATF_REQUIRE(wd2 != -1);
	+ snprintf(file, sizeof(file), "%s/file2", root);
	+ fd = open(file, O_RDWR \| O_CREAT, 0644);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ error = rename(file, path);
	+ ATF_REQUIRE(error == 0);
	+ /* Only the file's own watch reports; the directory watch stays quiet. */
	+ consume_event(ifd, wd2, IN_DELETE_SELF, 0, NULL);
	+ consume_event(ifd, wd2, 0, IN_IGNORED, NULL);
	+
	+ error = unlink(path);
	+ ATF_REQUIRE(error == 0);
	+ consume_event(ifd, wd, IN_DELETE, 0, "file");
	+ error = rmdir(root);
	+ ATF_REQUIRE(error == 0);
	+ consume_event(ifd, wd, IN_DELETE_SELF, IN_ISDIR, NULL);
	+ consume_event(ifd, wd, 0, IN_IGNORED, NULL);
	+
	+ close_inotify(ifd);
	+}
	+
	+/*
	+ * Verify IN_MOVED_FROM, IN_MOVED_TO and IN_MOVE_SELF reporting for renames
	+ * between two watched directories, of a watched directory, and of a watched
	+ * file.
	+ */
	+ATF_TC_WITHOUT_HEAD(inotify_event_move);
	+ATF_TC_BODY(inotify_event_move, tc)
	+{
	+ char dir1[PATH_MAX], dir2[PATH_MAX], path1[PATH_MAX], path2[PATH_MAX];
	+ char path3[PATH_MAX];
	+ int error, ifd, fd, wd1, wd2, wd3;
	+ uint32_t cookie1, cookie2;
	+
	+ ifd = inotify(IN_NONBLOCK);
	+
	+ wd1 = watch_dir(ifd, IN_MOVE \| IN_MOVE_SELF, dir1);
	+ wd2 = watch_dir(ifd, IN_MOVE \| IN_MOVE_SELF, dir2);
	+
	+ /* Move a file from dir1 to dir2; both halves must carry one cookie. */
	+ snprintf(path1, sizeof(path1), "%s/file", dir1);
	+ fd = open(path1, O_RDWR \| O_CREAT, 0644);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ snprintf(path2, sizeof(path2), "%s/file2", dir2);
	+ error = rename(path1, path2);
	+ ATF_REQUIRE(error == 0);
	+ cookie1 = consume_event_cookie(ifd, wd1, IN_MOVED_FROM, 0, "file");
	+ cookie2 = consume_event_cookie(ifd, wd2, IN_MOVED_TO, 0, "file2");
	+ ATF_REQUIRE_MSG(cookie1 == cookie2,
	+ "expected cookie %u, got %u", cookie1, cookie2);
	+
	+ /* Move the watched dir1 itself into the watched dir2. */
	+ snprintf(path2, sizeof(path2), "%s/dir", dir2);
	+ error = rename(dir1, path2);
	+ ATF_REQUIRE(error == 0);
	+ consume_event(ifd, wd1, IN_MOVE_SELF, IN_ISDIR, NULL);
	+ consume_event(ifd, wd2, IN_MOVED_TO, IN_ISDIR, "dir");
	+
	+ /* Rename a watched regular file whose directory is not watched. */
	+ wd3 = watch_file(ifd, IN_MOVE_SELF, path3);
	+ error = rename(path3, "foo");
	+ ATF_REQUIRE(error == 0);
	+ consume_event(ifd, wd3, IN_MOVE_SELF, 0, NULL);
	+
	+ close_inotify(ifd);
	+}
	+
	+ATF_TC_WITHOUT_HEAD(inotify_event_open); /* IN_OPEN for files, O_PATH opens, the watched dir and FIFOs. */
	+ATF_TC_BODY(inotify_event_open, tc)
	+{
	+ char root[PATH_MAX], path[PATH_MAX];
	+ int error, ifd, fd, wd;
	+
	+ ifd = inotify(IN_NONBLOCK);
	+ /* NOTE(review): watch_dir() is defined elsewhere; presumably it stores the directory path in root. */
	+ wd = watch_dir(ifd, IN_OPEN, root);
	+ /* Create and open a regular file inside the watched directory. */
	+ snprintf(path, sizeof(path), "%s/file", root);
	+ fd = open(path, O_RDWR \| O_CREAT, 0644);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ consume_event(ifd, wd, IN_OPEN, 0, "file");
	+ /* An O_PATH open of the same file is expected to be reported as well. */
	+ fd = open(path, O_PATH);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ consume_event(ifd, wd, IN_OPEN, 0, "file");
	+ /* Open the watched directory itself: IN_ISDIR is expected, with a NULL name argument. */
	+ fd = open(root, O_DIRECTORY);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ consume_event(ifd, wd, IN_OPEN, IN_ISDIR, NULL);
	+ /* Open a FIFO created inside the watched directory. */
	+ snprintf(path, sizeof(path), "%s/fifo", root);
	+ error = mkfifo(path, 0644);
	+ ATF_REQUIRE(error == 0);
	+ fd = open(path, O_RDWR);
	+ ATF_REQUIRE(fd != -1);
	+ close_checked(fd);
	+ consume_event(ifd, wd, IN_OPEN, 0, "fifo");
	+
	+ close_inotify(ifd);
	+}
	+
	+ATF_TC_WITH_CLEANUP(inotify_event_unmount); /* IN_UNMOUNT delivery on a forced unmount. */
	+ATF_TC_HEAD(inotify_event_unmount, tc)
	+{
	+ atf_tc_set_md_var(tc, "require.user", "root");
	+}
	+ATF_TC_BODY(inotify_event_unmount, tc)
	+{
	+ int error, fd, ifd, wd;
	+
	+ ifd = inotify(IN_NONBLOCK);
	+
	+ error = mkdir("./root", 0755);
	+ ATF_REQUIRE(error == 0);
	+ /* NOTE(review): mount_tmpfs() is defined elsewhere; presumably it mounts a tmpfs on the path. */
	+ mount_tmpfs("./root");
	+ /* Watch a directory created under ./root for IN_OPEN. */
	+ error = mkdir("./root/dir", 0755);
	+ ATF_REQUIRE(error == 0);
	+ wd = inotify_add_watch(ifd, "./root/dir", IN_OPEN);
	+ ATF_REQUIRE(wd >= 0);
	+ /* Check that the watch delivers an event before attempting to unmount. */
	+ fd = open("./root/dir", O_RDONLY \| O_DIRECTORY);
	+ ATF_REQUIRE(fd != -1);
	+ consume_event(ifd, wd, IN_OPEN, IN_ISDIR, NULL);
	+ close_checked(fd);
	+
	+ /* A regular unmount should fail, as inotify holds a vnode reference. */
	+ error = unmount("./root", 0);
	+ ATF_REQUIRE_ERRNO(EBUSY, error == -1);
	+ error = unmount("./root", MNT_FORCE);
	+ ATF_REQUIRE_MSG(error == 0,
	+ "unmounting ./root failed: %s", strerror(errno));
	+ /* After the forced unmount, expect IN_UNMOUNT and then IN_IGNORED on the watch. */
	+ consume_event(ifd, wd, 0, IN_UNMOUNT, NULL);
	+ consume_event(ifd, wd, 0, IN_IGNORED, NULL);
	+
	+ close_inotify(ifd);
	+}
	+ATF_TC_CLEANUP(inotify_event_unmount, tc)
	+{
	+ (void)unmount("./root", MNT_FORCE); /* Result ignored: the body may already have unmounted it. */
	+}
	+
	+ATF_TP_ADD_TCS(tp) /* Register the test cases with the test program. */
	+{
	+ /* Tests for the inotify syscalls. */
	+ ATF_TP_ADD_TC(tp, inotify_capsicum);
	+ ATF_TP_ADD_TC(tp, inotify_coalesce);
	+ ATF_TP_ADD_TC(tp, inotify_mask_create);
	+ ATF_TP_ADD_TC(tp, inotify_nullfs);
	+ ATF_TP_ADD_TC(tp, inotify_queue_overflow);
	+ /* Tests for the various inotify event types. */
	+ ATF_TP_ADD_TC(tp, inotify_event_access_file);
	+ ATF_TP_ADD_TC(tp, inotify_event_access_dir);
	+ ATF_TP_ADD_TC(tp, inotify_event_attrib);
	+ ATF_TP_ADD_TC(tp, inotify_event_close_nowrite);
	+ ATF_TP_ADD_TC(tp, inotify_event_close_write);
	+ ATF_TP_ADD_TC(tp, inotify_event_create);
	+ ATF_TP_ADD_TC(tp, inotify_event_delete);
	+ ATF_TP_ADD_TC(tp, inotify_event_move);
	+ ATF_TP_ADD_TC(tp, inotify_event_open);
	+ ATF_TP_ADD_TC(tp, inotify_event_unmount);
	+ return (atf_no_error());
	+}

File Metadata

Mime Type: text/plain
Expires: Mon, Dec 15, 7:43 AM (11 h, 13 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 26978849
Default Alt Text: D50315.id156124.diff (95 KB)

D50315.id156124.diffNo OneTemporaryActions

D50315.id156124.diffView Options

File Metadata

Event Timeline

D50315.id156124.diff
No OneTemporary
Actions

D50315.id156124.diff
View Options