diff --git a/lib/libc/sys/open.2 b/lib/libc/sys/open.2 index 06a877e34460..a7806df69daf 100644 --- a/lib/libc/sys/open.2 +++ b/lib/libc/sys/open.2 @@ -1,666 +1,669 @@ .\" Copyright (c) 1980, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)open.2 8.2 (Berkeley) 11/16/93 .\" $FreeBSD$ .\" .Dd March 18, 2021 .Dt OPEN 2 .Os .Sh NAME .Nm open , openat .Nd open or create a file for reading, writing or executing .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In fcntl.h .Ft int .Fn open "const char *path" "int flags" "..." .Ft int .Fn openat "int fd" "const char *path" "int flags" "..." .Sh DESCRIPTION The file name specified by .Fa path is opened for either execution or reading and/or writing as specified by the argument .Fa flags and the file descriptor returned to the calling process. The .Fa flags argument may indicate the file is to be created if it does not exist (by specifying the .Dv O_CREAT flag). In this case .Fn open and .Fn openat require an additional argument .Fa "mode_t mode" , and the file is created with mode .Fa mode as described in .Xr chmod 2 and modified by the process' umask value (see .Xr umask 2 ) . .Pp The .Fn openat function is equivalent to the .Fn open function except in the case where the .Fa path specifies a relative path. For .Fn openat and relative .Fa path , the file to be opened is determined relative to the directory associated with the file descriptor .Fa fd instead of the current working directory. The .Fa flag parameter and the optional fourth parameter correspond exactly to the parameters of .Fn open . If .Fn openat is passed the special value .Dv AT_FDCWD in the .Fa fd parameter, the current working directory is used and the behavior is identical to a call to .Fn open . .Pp When .Fn openat is called with an absolute .Fa path , it ignores the .Fa fd argument. .Pp In .Xr capsicum 4 capability mode, .Fn open is not permitted. The .Fa path argument to .Fn openat must be strictly relative to a file descriptor .Fa fd . .Fa path must not be an absolute path and must not contain ".." components which cause the path resolution to escape the directory hierarchy starting at .Fa fd . Additionally, no symbolic link in .Fa path may target absolute path or contain escaping ".." components. .Fa fd must not be .Dv AT_FDCWD . .Pp If the .Dv vfs.lookup_cap_dotdot .Xr sysctl 3 MIB is set to zero, ".." components in the paths, used in capability mode, are completely disabled. If the .Dv vfs.lookup_cap_dotdot_nonlocal MIB is set to zero, ".." is not allowed if found on non-local filesystem. .Pp The flags specified are formed by .Em or Ns 'ing the following values .Pp .Bd -literal -offset indent -compact O_RDONLY open for reading only O_WRONLY open for writing only O_RDWR open for reading and writing O_EXEC open for execute only O_SEARCH open for search only, an alias for O_EXEC O_NONBLOCK do not block on open O_APPEND append on each write O_CREAT create file if it does not exist O_TRUNC truncate size to 0 O_EXCL error if create and file exists O_SHLOCK atomically obtain a shared lock O_EXLOCK atomically obtain an exclusive lock O_DIRECT eliminate or reduce cache effects O_FSYNC synchronous writes (historical synonym for O_SYNC) O_SYNC synchronous writes O_DSYNC synchronous data writes O_NOFOLLOW do not follow symlinks O_NOCTTY ignored O_TTY_INIT ignored O_DIRECTORY error if file is not a directory O_CLOEXEC set FD_CLOEXEC upon open O_VERIFY verify the contents of the file O_RESOLVE_BENEATH path resolution must not cross the fd directory O_PATH record only the target path in the opened descriptor .Ed .Pp Opening a file with .Dv O_APPEND set causes each write on the file to be appended to the end. If .Dv O_TRUNC is specified and the file exists, the file is truncated to zero length. If .Dv O_EXCL is set with .Dv O_CREAT and the file already exists, .Fn open returns an error. This may be used to implement a simple exclusive access locking mechanism. If .Dv O_EXCL is set and the last component of the pathname is a symbolic link, .Fn open will fail even if the symbolic link points to a non-existent name. If the .Dv O_NONBLOCK flag is specified and the .Fn open system call would result in the process being blocked for some reason (e.g., waiting for carrier on a dialup line), .Fn open returns immediately. The descriptor remains in non-blocking mode for subsequent operations. .Pp If .Dv O_SYNC is used in the mask, all writes will immediately and synchronously be written to disk. .Dv O_FSYNC is an historical synonym for .Dv O_SYNC . .Pp If .Dv O_DSYNC is used in the mask, all data and metadata required to read the data will be synchronously written to disk, but changes to metadata such as file access and modification timestamps may be written later. .Pp If .Dv O_NOFOLLOW is used in the mask and the target file passed to .Fn open is a symbolic link then the .Fn open will fail. .Pp When opening a file, a lock with .Xr flock 2 semantics can be obtained by setting .Dv O_SHLOCK for a shared lock, or .Dv O_EXLOCK for an exclusive lock. If creating a file with .Dv O_CREAT , the request for the lock will never fail (provided that the underlying file system supports locking). .Pp .Dv O_DIRECT may be used to minimize or eliminate the cache effects of reading and writing. The system will attempt to avoid caching the data you read or write. If it cannot avoid caching the data, it will minimize the impact the data has on the cache. Use of this flag can drastically reduce performance if not used with care. .Pp .Dv O_NOCTTY may be used to ensure the OS does not assign this file as the controlling terminal when it opens a tty device. This is the default on .Fx , but is present for .Tn POSIX compatibility. The .Fn open system call will not assign controlling terminals on .Fx . .Pp .Dv O_TTY_INIT may be used to ensure the OS restores the terminal attributes when initially opening a TTY. This is the default on .Fx , but is present for .Tn POSIX compatibility. The initial call to .Fn open on a TTY will always restore default terminal attributes on .Fx . .Pp .Dv O_DIRECTORY may be used to ensure the resulting file descriptor refers to a directory. This flag can be used to prevent applications with elevated privileges from opening files which are even unsafe to open with .Dv O_RDONLY , such as device nodes. .Pp .Dv O_CLOEXEC may be used to set .Dv FD_CLOEXEC flag for the newly returned file descriptor. .Pp .Dv O_VERIFY may be used to indicate to the kernel that the contents of the file should be verified before allowing the open to proceed. The details of what .Dq verified means is implementation specific. The run-time linker (rtld) uses this flag to ensure shared objects have been verified before operating on them. .Pp .Dv O_RESOLVE_BENEATH returns .Er ENOTCAPABLE if any intermediate component of the specified relative path does not reside in the directory hierarchy beneath the starting directory. Absolute paths or even the temporal escape from beneath of the starting directory is not allowed. .Pp When .Fa fd is opened with .Dv O_SEARCH , execute permissions are checked at open time. The .Fa fd may not be used for any read operations like .Xr getdirentries 2 . The primary use for this descriptor will be as the lookup descriptor for the .Fn *at family of functions. .Pp .Dv O_PATH returns a file descriptor that can be used as a directory file descriptor for .Xr openat 2 and other system calls taking a file descriptor argument, like .Xr fstatat 2 and others. The other functionality of the returned file descriptor is limited to the descriptor-level operations. It can be used for .Bl -tag -width SCM_RIGHTS -offset indent -compact .It Xr fcntl 2 but advisory locking is not allowed .It Xr dup 2 .It Xr close 2 .It Xr fstat 2 .It Xr fexecve 2 requires that .Dv O_EXEC was also specified at open time .It Dv SCM_RIGHTS can be passed over a .Xr unix 4 socket using a .Dv SCM_RIGHTS message +.It Xr kqueue 2 +using for +.Dv EVFILT_VNODE .El But operations like .Xr read 2 , .Xr ftruncate 2 , and any other that operate on file and not on file descriptor (except .Xr fstat 2 ), are not allowed. File opened with the .Dv O_PATH flag does not prevent non-forced unmount of the volume it belongs to. See also the description of .Dv AT_EMPTY_PATH flag for .Xr fstatat 2 and related syscalls. .Pp If successful, .Fn open returns a non-negative integer, termed a file descriptor. It returns \-1 on failure. The file pointer used to mark the current position within the file is set to the beginning of the file. .Pp If a sleeping open of a device node from .Xr devfs 5 is interrupted by a signal, the call always fails with .Er EINTR , even if the .Dv SA_RESTART flag is set for the signal. A sleeping open of a fifo (see .Xr mkfifo 2 ) is restarted as normal. .Pp When a new file is created it is given the group of the directory which contains it. .Pp Unless .Dv O_CLOEXEC flag was specified, the new descriptor is set to remain open across .Xr execve 2 system calls; see .Xr close 2 , .Xr fcntl 2 and .Dv O_CLOEXEC description. .Pp The system imposes a limit on the number of file descriptors open simultaneously by one process. The .Xr getdtablesize 2 system call returns the current system limit. .Sh RETURN VALUES If successful, .Fn open and .Fn openat return a non-negative integer, termed a file descriptor. They return \-1 on failure, and set .Va errno to indicate the error. .Sh ERRORS The named file is opened unless: .Bl -tag -width Er .It Bq Er ENOTDIR A component of the path prefix is not a directory. .It Bq Er ENAMETOOLONG A component of a pathname exceeded 255 characters, or an entire path name exceeded 1023 characters. .It Bq Er ENOENT .Dv O_CREAT is not set and the named file does not exist. .It Bq Er ENOENT A component of the path name that must exist does not exist. .It Bq Er EACCES Search permission is denied for a component of the path prefix. .It Bq Er EACCES The required permissions (for reading and/or writing) are denied for the given flags. .It Bq Er EACCES .Dv O_TRUNC is specified and write permission is denied. .It Bq Er EACCES .Dv O_CREAT is specified, the file does not exist, and the directory in which it is to be created does not permit writing. .It Bq Er EPERM .Dv O_CREAT is specified, the file does not exist, and the directory in which it is to be created has its immutable flag set, see the .Xr chflags 2 manual page for more information. .It Bq Er EPERM The named file has its immutable flag set and the file is to be modified. .It Bq Er EPERM The named file has its append-only flag set, the file is to be modified, and .Dv O_TRUNC is specified or .Dv O_APPEND is not specified. .It Bq Er ELOOP Too many symbolic links were encountered in translating the pathname. .It Bq Er EISDIR The named file is a directory, and the arguments specify it is to be modified. .It Bq Er EISDIR The named file is a directory, and the flags specified .Dv O_CREAT without .Dv O_DIRECTORY . .It Bq Er EROFS The named file resides on a read-only file system, and the file is to be modified. .It Bq Er EROFS .Dv O_CREAT is specified and the named file would reside on a read-only file system. .It Bq Er EMFILE The process has already reached its limit for open file descriptors. .It Bq Er ENFILE The system file table is full. .It Bq Er EMLINK .Dv O_NOFOLLOW was specified and the target is a symbolic link. .It Bq Er ENXIO The named file is a character special or block special file, and the device associated with this special file does not exist. .It Bq Er ENXIO .Dv O_NONBLOCK is set, the named file is a fifo, .Dv O_WRONLY is set, and no process has the file open for reading. .It Bq Er EINTR The .Fn open operation was interrupted by a signal. .It Bq Er EOPNOTSUPP .Dv O_SHLOCK or .Dv O_EXLOCK is specified but the underlying file system does not support locking. .It Bq Er EOPNOTSUPP The named file is a special file mounted through a file system that does not support access to it (e.g.\& NFS). .It Bq Er EWOULDBLOCK .Dv O_NONBLOCK and one of .Dv O_SHLOCK or .Dv O_EXLOCK is specified and the file is locked. .It Bq Er ENOSPC .Dv O_CREAT is specified, the file does not exist, and the directory in which the entry for the new file is being placed cannot be extended because there is no space left on the file system containing the directory. .It Bq Er ENOSPC .Dv O_CREAT is specified, the file does not exist, and there are no free inodes on the file system on which the file is being created. .It Bq Er EDQUOT .Dv O_CREAT is specified, the file does not exist, and the directory in which the entry for the new file is being placed cannot be extended because the user's quota of disk blocks on the file system containing the directory has been exhausted. .It Bq Er EDQUOT .Dv O_CREAT is specified, the file does not exist, and the user's quota of inodes on the file system on which the file is being created has been exhausted. .It Bq Er EIO An I/O error occurred while making the directory entry or allocating the inode for .Dv O_CREAT . .It Bq Er EINTEGRITY Corrupted data was detected while reading from the file system. .It Bq Er ETXTBSY The file is a pure procedure (shared text) file that is being executed and the .Fn open system call requests write access. .It Bq Er EFAULT The .Fa path argument points outside the process's allocated address space. .It Bq Er EEXIST .Dv O_CREAT and .Dv O_EXCL were specified and the file exists. .It Bq Er EOPNOTSUPP An attempt was made to open a socket (not currently implemented). .It Bq Er EINVAL An attempt was made to open a descriptor with an illegal combination of .Dv O_RDONLY , .Dv O_WRONLY , or .Dv O_RDWR , and .Dv O_EXEC or .Dv O_SEARCH . .It Bq Er EINVAL The .Dv O_RESOLVE_BENEATH flag is specified and .Dv path is absolute. .It Bq Er EBADF The .Fa path argument does not specify an absolute path and the .Fa fd argument is neither .Dv AT_FDCWD nor a valid file descriptor open for searching. .It Bq Er ENOTDIR The .Fa path argument is not an absolute path and .Fa fd is neither .Dv AT_FDCWD nor a file descriptor associated with a directory. .It Bq Er ENOTDIR .Dv O_DIRECTORY is specified and the file is not a directory. .It Bq Er ECAPMODE .Dv AT_FDCWD is specified and the process is in capability mode. .It Bq Er ECAPMODE .Fn open was called and the process is in capability mode. .It Bq Er ENOTCAPABLE .Fa path is an absolute path, or contained a ".." component leading to a directory outside of the directory hierarchy specified by .Fa fd , and the process is in capability mode. .It Bq Er ENOTCAPABLE The .Dv O_RESOLVE_BENEATH flag was provided, and the relative .Fa path escapes the .Ar fd directory. .El .Sh SEE ALSO .Xr chmod 2 , .Xr close 2 , .Xr dup 2 , .Xr fexecve 2 , .Xr fhopen 2 , .Xr getdtablesize 2 , .Xr getfh 2 , .Xr lgetfh 2 , .Xr lseek 2 , .Xr read 2 , .Xr umask 2 , .Xr write 2 , .Xr fopen 3 , .Xr capsicum 4 .Sh STANDARDS These functions are specified by .St -p1003.1-2008 . .Fx sets .Va errno to .Er EMLINK instead of .Er ELOOP as specified by .Tn POSIX when .Dv O_NOFOLLOW is set in flags and the final component of pathname is a symbolic link to distinguish it from the case of too many symbolic link traversals in one of its non-final components. .Sh HISTORY The .Fn open function appeared in .At v1 . The .Fn openat function was introduced in .Fx 8.0 . .Dv O_DSYNC appeared in 13.0. .Sh BUGS The Open Group Extended API Set 2 specification requires that the test for whether .Fa fd is searchable is based on whether .Fa fd is open for searching, not whether the underlying directory currently permits searches. The present implementation of the .Fa openat checks the current permissions of directory instead. .Pp The .Fa mode argument is variadic and may result in different calling conventions than might otherwise be expected. diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index a28e94634326..30ac40ffe296 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -1,5117 +1,5117 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes"); static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); MALLOC_DECLARE(M_FADVISE); static __read_mostly uma_zone_t file_zone; static __read_mostly uma_zone_t filedesc0_zone; __read_mostly uma_zone_t pwd_zone; VFS_SMR_DECLARE; static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit); static int fd_first_free(struct filedesc *fdp, int low, int size); static void fdgrowtable(struct filedesc *fdp, int nfd); static void fdgrowtable_exp(struct filedesc *fdp, int nfd); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); static int getmaxfd(struct thread *td); static u_long *filecaps_copy_prep(const struct filecaps *src); static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls); static u_long *filecaps_free_prep(struct filecaps *fcaps); static void filecaps_free_finish(u_long *ioctls); static struct pwd *pwd_alloc(void); /* * Each process has: * * - An array of open file descriptors (fd_ofiles) * - An array of file flags (fd_ofileflags) * - A bitmap recording which descriptors are in use (fd_map) * * A process starts out with NDFILE descriptors. The value of NDFILE has * been selected based the historical limit of 20 open files, and an * assumption that the majority of processes, especially short-lived * processes like shells, will never need more. * * If this initial allocation is exhausted, a larger descriptor table and * map are allocated dynamically, and the pointers in the process's struct * filedesc are updated to point to those. This is repeated every time * the process runs out of file descriptors (provided it hasn't hit its * resource limit). * * Since threads may hold references to individual descriptor table * entries, the tables are never freed. Instead, they are placed on a * linked list and freed only when the struct filedesc is released. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * SLIST entry used to keep track of ofiles which must be reclaimed when * the process exits. */ struct freetable { struct fdescenttbl *ft_table; SLIST_ENTRY(freetable) ft_next; }; /* * Initial allocation: a filedesc structure + the head of SLIST used to * keep track of old ofiles + enough space for NDFILE descriptors. */ struct fdescenttbl0 { int fdt_nfiles; struct filedescent fdt_ofiles[NDFILE]; }; struct filedesc0 { struct filedesc fd_fd; SLIST_HEAD(, freetable) fd_free; struct fdescenttbl0 fd_dfiles; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ static int __exclusive_cache_line openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* * If low >= size, just return low. Otherwise find the first zero bit in the * given bitmap, starting at low and not exceeding size - 1. Return size if * not found. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the last used fd. * * Call this variant if fdp can't be modified by anyone else (e.g, during exec). * Otherwise use fdlastfile. */ int fdlastfile_single(struct filedesc *fdp) { NDSLOTTYPE *map = fdp->fd_map; int off, minoff; off = NDSLOT(fdp->fd_nfiles - 1); for (minoff = NDSLOT(0); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (-1); } int fdlastfile(struct filedesc *fdp) { FILEDESC_LOCK_ASSERT(fdp); return (fdlastfile_single(fdp)); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused_init(struct filedesc *fdp, int fd) { KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); } static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); fdused_init(fdp, fd); if (fd == fdp->fd_freefile) fdp->fd_freefile++; } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("fd=%d is still in use", fd)); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; } /* * Free a file descriptor. * * Avoid some work if fdp is about to be destroyed. */ static inline void fdefree_last(struct filedescent *fde) { filecaps_free(&fde->fde_caps); } static inline void fdfree(struct filedesc *fdp, int fd) { struct filedescent *fde; FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = NULL; #ifdef CAPABILITIES seqc_write_end(&fde->fde_seqc); #endif fdefree_last(fde); fdunused(fdp, fd); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) { #ifdef RACCT uint64_t lim; #endif td->td_retval[0] = getmaxfd(td); #ifdef RACCT PROC_LOCK(td->td_proc); lim = racct_get_limit(td->td_proc, RACCT_NOFILE); PROC_UNLOCK(td->td_proc); if (lim < td->td_retval[0]) td->td_retval[0] = lim; #endif return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int sys_dup2(struct thread *td, struct dup2_args *uap) { return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int sys_dup(struct thread *td, struct dup_args *uap) { return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int sys_fcntl(struct thread *td, struct fcntl_args *uap) { return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); } int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) { struct flock fl; struct __oflock ofl; intptr_t arg1; int error, newcmd; error = 0; newcmd = cmd; switch (cmd) { case F_OGETLK: case F_OSETLK: case F_OSETLKW: /* * Convert old flock structure to new. */ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); fl.l_start = ofl.l_start; fl.l_len = ofl.l_len; fl.l_pid = ofl.l_pid; fl.l_type = ofl.l_type; fl.l_whence = ofl.l_whence; fl.l_sysid = 0; switch (cmd) { case F_OGETLK: newcmd = F_GETLK; break; case F_OSETLK: newcmd = F_SETLK; break; case F_OSETLKW: newcmd = F_SETLKW; break; } arg1 = (intptr_t)&fl; break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_SETLK_REMOTE: error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); arg1 = (intptr_t)&fl; break; default: arg1 = arg; break; } if (error) return (error); error = kern_fcntl(td, fd, newcmd, arg1); if (error) return (error); if (cmd == F_OGETLK) { ofl.l_start = fl.l_start; ofl.l_len = fl.l_len; ofl.l_pid = fl.l_pid; ofl.l_type = fl.l_type; ofl.l_whence = fl.l_whence; error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); } else if (cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); } return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp, *fp2; struct filedescent *fde; struct proc *p; struct vnode *vp; struct mount *mp; int error, flg, seals, tmp; uint64_t bsize; off_t foffset; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; AUDIT_ARG_FD(cmd); AUDIT_ARG_CMD(cmd); switch (cmd) { case F_DUPFD: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp); break; case F_DUPFD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_DUP2FD: tmp = arg; error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); break; case F_DUP2FD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_GETFD: error = EBADF; FILEDESC_SLOCK(fdp); fde = fdeget_locked(fdp, fd); if (fde != NULL) { td->td_retval[0] = (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; error = 0; } FILEDESC_SUNLOCK(fdp); break; case F_SETFD: error = EBADF; FILEDESC_XLOCK(fdp); fde = fdeget_locked(fdp, fd); if (fde != NULL) { fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); error = 0; } FILEDESC_XUNLOCK(fdp); break; case F_GETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp); if (error != 0) break; td->td_retval[0] = OFLAGS(fp->f_flag); fdrop(fp, td); break; case F_SETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp); if (error != 0) break; if (fp->f_ops == &path_fileops) { fdrop(fp, td); error = EBADF; break; } do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; } while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error != 0) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp); if (error != 0) break; error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp); if (error != 0) break; tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLK_REMOTE: error = priv_check(td, PRIV_NFS_LOCKD); if (error != 0) return (error); flg = F_REMOTE; goto do_setlk; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: do_setlk: flp = (struct flock *)arg; if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) { error = EINVAL; break; } error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if (foffset < 0 || (flp->l_start > 0 && foffset > OFF_MAX - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, flg); break; case F_UNLCKSYS: if (flg != F_REMOTE) { error = EINVAL; break; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } if (error != 0 || flp->l_type == F_UNLCK || flp->l_type == F_UNLCKSYS) { fdrop(fp, td); break; } /* * Check for a race with close. * * The vnode is now advisory locked (or unlocked, but this case * is not really important) as the caller requested. * We had to drop the filedesc lock, so we need to recheck if * the descriptor is still valid, because if it was closed * in the meantime we need to remove advisory lock from the * vnode - close on any descriptor leading to an advisory * locked vnode, removes that lock. * We will return 0 on purpose in that case, as the result of * successful advisory lock might have been externally visible * already. This is fine - effectively we pretend to the caller * that the closing thread was a bit slower and that the * advisory lock succeeded before the close. */ error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2); if (error != 0) { fdrop(fp, td); break; } if (fp != fp2) { flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } fdrop(fp, td); fdrop(fp2, td); break; case F_GETLK: error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { error = EINVAL; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if ((flp->l_start > 0 && foffset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && foffset < OFF_MIN - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; case F_ADD_SEALS: error = fget_unlocked(fdp, fd, &cap_no_rights, &fp); if (error != 0) break; error = fo_add_seals(fp, arg); fdrop(fp, td); break; case F_GET_SEALS: error = fget_unlocked(fdp, fd, &cap_no_rights, &fp); if (error != 0) break; if (fo_get_seals(fp, &seals) == 0) td->td_retval[0] = seals; else error = EINVAL; fdrop(fp, td); break; case F_RDAHEAD: arg = arg ? 128 * 1024: 0; /* FALLTHROUGH */ case F_READAHEAD: error = fget_unlocked(fdp, fd, &cap_no_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; if (vp->v_type != VREG) { fdrop(fp, td); error = ENOTTY; break; } /* * Exclusive lock synchronizes against f_seqcount reads and * writes in sequential_heuristic(). */ error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { fdrop(fp, td); break; } if (arg >= 0) { bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; arg = MIN(arg, INT_MAX - bsize + 1); fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX, (arg + bsize - 1) / bsize); atomic_set_int(&fp->f_flag, FRDAHEAD); } else { atomic_clear_int(&fp->f_flag, FRDAHEAD); } VOP_UNLOCK(vp); fdrop(fp, td); break; case F_ISUNIONSTACK: /* * Check if the vnode is part of a union stack (either the * "union" flag from mount(2) or unionfs). * * Prior to introduction of this op libc's readdir would call * fstatfs(2), in effect unnecessarily copying kilobytes of * data just to check fs name and a mount flag. * * Fixing the code to handle everything in the kernel instead * is a non-trivial endeavor and has low priority, thus this * horrible kludge facilitates the current behavior in a much * cheaper manner until someone(tm) sorts this out. */ error = fget_unlocked(fdp, fd, &cap_no_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; /* * Since we don't prevent dooming the vnode even non-null mp * found can become immediately stale. This is tolerable since * mount points are type-stable (providing safe memory access) * and any vfs op on this vnode going forward will return an * error (meaning return value in this case is meaningless). */ mp = atomic_load_ptr(&vp->v_mount); if (__predict_false(mp == NULL)) { fdrop(fp, td); error = EBADF; break; } td->td_retval[0] = 0; if (mp->mnt_kern_flag & MNTK_UNIONFS || mp->mnt_flag & MNT_UNION) td->td_retval[0] = 1; fdrop(fp, td); break; default: error = EINVAL; break; } return (error); } static int getmaxfd(struct thread *td) { return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); } /* * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). */ int kern_dup(struct thread *td, u_int mode, int flags, int old, int new) { struct filedesc *fdp; struct filedescent *oldfde, *newfde; struct proc *p; struct file *delfp, *oldfp; u_long *oioctls, *nioctls; int error, maxfd; p = td->td_proc; fdp = p->p_fd; oioctls = NULL; MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); MPASS(mode < FDDUP_LASTMODE); AUDIT_ARG_FD(old); /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */ /* * Verify we have a valid descriptor to dup from and possibly to * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should * return EINVAL when the new descriptor is out of bounds. */ if (old < 0) return (EBADF); if (new < 0) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); maxfd = getmaxfd(td); if (new >= maxfd) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); error = EBADF; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, old) == NULL) goto unlock; if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) { td->td_retval[0] = new; if (flags & FDDUP_FLAG_CLOEXEC) fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; error = 0; goto unlock; } oldfde = &fdp->fd_ofiles[old]; oldfp = oldfde->fde_file; if (!fhold(oldfp)) goto unlock; /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. */ switch (mode) { case FDDUP_NORMAL: case FDDUP_FCNTL: if ((error = fdalloc(td, new, &new)) != 0) { fdrop(oldfp, td); goto unlock; } break; case FDDUP_MUSTREPLACE: /* Target file descriptor must exist. */ if (fget_locked(fdp, new) == NULL) { fdrop(oldfp, td); goto unlock; } break; case FDDUP_FIXED: if (new >= fdp->fd_nfiles) { /* * The resource limits are here instead of e.g. * fdalloc(), because the file descriptor table may be * shared between processes, so we can't really use * racct_add()/racct_sub(). Instead of counting the * number of actually allocated descriptors, just put * the limit on the size of the file descriptor table. */ #ifdef RACCT if (RACCT_ENABLED()) { error = racct_set_unlocked(p, RACCT_NOFILE, new + 1); if (error != 0) { error = EMFILE; fdrop(oldfp, td); goto unlock; } } #endif fdgrowtable_exp(fdp, new + 1); } if (!fdisused(fdp, new)) fdused(fdp, new); break; default: KASSERT(0, ("%s unsupported mode %d", __func__, mode)); } KASSERT(old != new, ("new fd is same as old")); /* Refetch oldfde because the table may have grown and old one freed. */ oldfde = &fdp->fd_ofiles[old]; KASSERT(oldfp == oldfde->fde_file, ("fdt_ofiles shift from growth observed at fd %d", old)); newfde = &fdp->fd_ofiles[new]; delfp = newfde->fde_file; nioctls = filecaps_copy_prep(&oldfde->fde_caps); /* * Duplicate the source descriptor. */ #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif oioctls = filecaps_free_prep(&newfde->fde_caps); memcpy(newfde, oldfde, fde_change_size); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, nioctls); if ((flags & FDDUP_FLAG_CLOEXEC) != 0) newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; else newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif td->td_retval[0] = new; error = 0; if (delfp != NULL) { (void) closefp(fdp, new, delfp, td, true, false); FILEDESC_UNLOCK_ASSERT(fdp); } else { unlock: FILEDESC_XUNLOCK(fdp); } filecaps_free_finish(oioctls); return (error); } static void sigiofree(struct sigio *sigio) { crfree(sigio->sio_ucred); free(sigio, M_SIGIO); } static struct sigio * funsetown_locked(struct sigio *sigio) { struct proc *p; struct pgrp *pg; SIGIO_ASSERT_LOCKED(); if (sigio == NULL) return (NULL); *(sigio->sio_myref) = NULL; if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { p = sigio->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } return (sigio); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; /* Racy check, consumers must provide synchronization. */ if (*sigiop == NULL) return; SIGIO_LOCK(); sigio = funsetown_locked(*sigiop); SIGIO_UNLOCK(); if (sigio != NULL) sigiofree(sigio); } /* * Free a list of sigio structures. The caller must ensure that new sigio * structures cannot be added after this point. For process groups this is * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves * as an interlock. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio, *tmp; /* Racy check. */ sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; SIGIO_LOCK(); sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) { SIGIO_UNLOCK(); return; } /* * Every entry of the list should belong to a single proc or pgrp. */ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK(pg); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK(p); KASSERT((p->p_flag & P_WEXIT) != 0, ("%s: process %p is not exiting", __func__, p)); } SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) { *sigio->sio_myref = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); } } if (pg != NULL) PGRP_UNLOCK(pg); else PROC_UNLOCK(p); SIGIO_UNLOCK(); SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp) sigiofree(sigio); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *osigio, *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } ret = 0; sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; sx_slock(&proctree_lock); SIGIO_LOCK(); osigio = funsetown_locked(*sigiop); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (proc->p_session != curthread->td_proc->p_session) { PROC_UNLOCK(proc); ret = EPERM; goto fail; } sigio->sio_proc = proc; SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); PROC_UNLOCK(proc); } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) { PGRP_UNLOCK(pgrp); ret = EPERM; goto fail; } SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); *sigiop = sigio; SIGIO_UNLOCK(); if (osigio != NULL) sigiofree(osigio); return (0); fail: SIGIO_UNLOCK(); sx_sunlock(&proctree_lock); sigiofree(sigio); if (osigio != NULL) sigiofree(osigio); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(struct sigio **sigiop) { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } static int closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool audit) { int error; FILEDESC_XLOCK_ASSERT(fdp); /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleteing a knote for the new fd. */ if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist))) knote_fdclose(td, fd); /* * We need to notify mqueue if the object is of type mqueue. */ if (__predict_false(fp->f_type == DTYPE_MQUEUE)) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); #ifdef AUDIT if (AUDITING_TD(td) && audit) audit_sysclose(td, fd, fp); #endif error = closef(fp, td); /* * All paths leading up to closefp() will have already removed or * replaced the fd in the filedesc table, so a restart would not * operate on the same file. */ if (error == ERESTART) error = EINTR; return (error); } static int closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit) { int error; FILEDESC_XLOCK_ASSERT(fdp); if (holdleaders) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; } else { holdleaders = false; } } error = closefp_impl(fdp, fd, fp, td, audit); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit) { FILEDESC_XLOCK_ASSERT(fdp); if (__predict_false(td->td_proc->p_fdtol != NULL)) { return (closefp_hl(fdp, fd, fp, td, holdleaders, audit)); } else { return (closefp_impl(fdp, fd, fp, td, audit)); } } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int sys_close(struct thread *td, struct close_args *uap) { return (kern_close(td, uap->fd)); } int kern_close(struct thread *td, int fd) { struct filedesc *fdp; struct file *fp; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdfree(fdp, fd); /* closefp() drops the FILEDESC lock for us. */ return (closefp(fdp, fd, fp, td, true, true)); } int kern_close_range(struct thread *td, u_int lowfd, u_int highfd) { struct filedesc *fdp; const struct fdescenttbl *fdt; struct file *fp; int fd; /* * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2 * open should not be a usage error. From a close_range() perspective, * close_range(3, ~0U, 0) in the same scenario should also likely not * be a usage error as all fd above 3 are in-fact already closed. */ if (highfd < lowfd) { return (EINVAL); } fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); highfd = MIN(highfd, fdt->fdt_nfiles - 1); fd = lowfd; if (__predict_false(fd > highfd)) { goto out_locked; } for (;;) { fp = fdt->fdt_ofiles[fd].fde_file; if (fp == NULL) { if (fd == highfd) goto out_locked; } else { fdfree(fdp, fd); (void) closefp(fdp, fd, fp, td, true, true); if (fd == highfd) goto out_unlocked; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); } fd++; } out_locked: FILEDESC_XUNLOCK(fdp); out_unlocked: return (0); } #ifndef _SYS_SYSPROTO_H_ struct close_range_args { u_int lowfd; u_int highfd; int flags; }; #endif int sys_close_range(struct thread *td, struct close_range_args *uap) { AUDIT_ARG_FD(uap->lowfd); AUDIT_ARG_CMD(uap->highfd); AUDIT_ARG_FFLAGS(uap->flags); /* No flags currently defined */ if (uap->flags != 0) return (EINVAL); return (kern_close_range(td, uap->lowfd, uap->highfd)); } #ifdef COMPAT_FREEBSD12 /* * Close open file descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd12_closefrom_args { int lowfd; }; #endif /* ARGSUSED */ int freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap) { u_int lowfd; AUDIT_ARG_FD(uap->lowfd); /* * Treat negative starting file descriptor values identical to * closefrom(0) which closes all files. */ lowfd = MAX(0, uap->lowfd); return (kern_close_range(td, lowfd, ~0U)); } #endif /* COMPAT_FREEBSD12 */ #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) int freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_fstat(td, uap->fd, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int sys_fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_fstat_rights, &fp); if (__predict_false(error != 0)) return (error); AUDIT_ARG_FILE(td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred, td); fdrop(fp, td); #ifdef __STAT_TIME_T_EXT sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; #endif #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { freebsd11_cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int sys_fpathconf(struct thread *td, struct fpathconf_args *uap) { long value; int error; error = kern_fpathconf(td, uap->fd, uap->name, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_fpathconf(struct thread *td, int fd, int name, long *valuep) { struct file *fp; struct vnode *vp; int error; error = fget(td, fd, &cap_fpathconf_rights, &fp); if (error != 0) return (error); if (name == _PC_ASYNC_IO) { *valuep = _POSIX_ASYNCHRONOUS_IO; goto out; } vp = fp->f_vnode; if (vp != NULL) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_PATHCONF(vp, name, valuep); VOP_UNLOCK(vp); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (name != _PC_PIPE_BUF) { error = EINVAL; } else { *valuep = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Copy filecaps structure allocating memory for ioctls array if needed. * * The last parameter indicates whether the fdtable is locked. If it is not and * ioctls are encountered, copying fails and the caller must lock the table. * * Note that if the table was not locked, the caller has to check the relevant * sequence counter to determine whether the operation was successful. */ bool filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked) { size_t size; if (src->fc_ioctls != NULL && !locked) return (false); memcpy(dst, src, sizeof(*src)); if (src->fc_ioctls == NULL) return (true); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); memcpy(dst->fc_ioctls, src->fc_ioctls, size); return (true); } static u_long * filecaps_copy_prep(const struct filecaps *src) { u_long *ioctls; size_t size; if (__predict_true(src->fc_ioctls == NULL)) return (NULL); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; ioctls = malloc(size, M_FILECAPS, M_WAITOK); return (ioctls); } static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls) { size_t size; *dst = *src; if (__predict_true(src->fc_ioctls == NULL)) { MPASS(ioctls == NULL); return; } size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = ioctls; bcopy(src->fc_ioctls, dst->fc_ioctls, size); } /* * Move filecaps structure to the new place and clear the old place. */ void filecaps_move(struct filecaps *src, struct filecaps *dst) { *dst = *src; bzero(src, sizeof(*src)); } /* * Fill the given filecaps structure with full rights. */ static void filecaps_fill(struct filecaps *fcaps) { CAP_ALL(&fcaps->fc_rights); fcaps->fc_ioctls = NULL; fcaps->fc_nioctls = -1; fcaps->fc_fcntls = CAP_FCNTL_ALL; } /* * Free memory allocated within filecaps structure. */ void filecaps_free(struct filecaps *fcaps) { free(fcaps->fc_ioctls, M_FILECAPS); bzero(fcaps, sizeof(*fcaps)); } static u_long * filecaps_free_prep(struct filecaps *fcaps) { u_long *ioctls; ioctls = fcaps->fc_ioctls; bzero(fcaps, sizeof(*fcaps)); return (ioctls); } static void filecaps_free_finish(u_long *ioctls) { free(ioctls, M_FILECAPS); } /* * Validate the given filecaps structure. */ static void filecaps_validate(const struct filecaps *fcaps, const char *func) { KASSERT(cap_rights_is_valid(&fcaps->fc_rights), ("%s: invalid rights", func)); KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, ("%s: invalid fcntls", func)); KASSERT(fcaps->fc_fcntls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), ("%s: fcntls without CAP_FCNTL", func)); KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 : (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), ("%s: invalid ioctls", func)); KASSERT(fcaps->fc_nioctls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), ("%s: ioctls without CAP_IOCTL", func)); } static void fdgrowtable_exp(struct filedesc *fdp, int nfd) { int nfd1; FILEDESC_XLOCK_ASSERT(fdp); nfd1 = fdp->fd_nfiles * 2; if (nfd1 < nfd) nfd1 = nfd; fdgrowtable(fdp, nfd1); } /* * Grow the file table to accommodate (at least) nfd descriptors. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct filedesc0 *fdp0; struct freetable *ft; struct fdescenttbl *ntable; struct fdescenttbl *otable; int nnfiles, onfiles; NDSLOTTYPE *nmap, *omap; KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* save old values */ onfiles = fdp->fd_nfiles; otable = fdp->fd_files; omap = fdp->fd_map; /* compute the size of the new table */ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* * Allocate a new table. We need enough space for the number of * entries, file entries themselves and the struct freetable we will use * when we decommission the table and place it on the freelist. * We place the struct freetable in the middle so we don't have * to worry about padding. */ ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + nnfiles * sizeof(ntable->fdt_ofiles[0]) + sizeof(struct freetable), M_FILEDESC, M_ZERO | M_WAITOK); /* copy the old data */ ntable->fdt_nfiles = nnfiles; memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, onfiles * sizeof(ntable->fdt_ofiles[0])); /* * Allocate a new map only if the old is not large enough. It will * grow at a slower rate than the table as it can map more * entries than the table can hold. */ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); /* copy over the old data and update the pointer */ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); fdp->fd_map = nmap; } /* * Make sure that ntable is correctly initialized before we replace * fd_files poiner. Otherwise fget_unlocked() may see inconsistent * data. */ atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); /* * Free the old file table when not shared by other threads or processes. * The old file table is considered to be shared when either are true: * - The process has more than one thread. * - The file descriptor table has been shared via fdshare(). * * When shared, the old file table will be placed on a freelist * which will be processed when the struct filedesc is released. * * Note that if onfiles == NDFILE, we're dealing with the original * static allocation contained within (struct filedesc0 *)fdp, * which must not be freed. */ if (onfiles > NDFILE) { /* * Note we may be called here from fdinit while allocating a * table for a new process in which case ->p_fd points * elsewhere. */ if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) { free(otable, M_FILEDESC); } else { ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; fdp0 = (struct filedesc0 *)fdp; ft->ft_table = otable; SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); } } /* * The map does not have the same possibility of threads still * holding references to it. So always free it as long as it * does not reference the original static allocation. */ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(omap, M_FILEDESC); } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd, maxfd, allocfd; #ifdef RACCT int error; #endif FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; maxfd = getmaxfd(td); /* * Search the bitmap for a free descriptor starting at minfd. * If none is found, grow the file table. */ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (__predict_false(fd >= maxfd)) return (EMFILE); if (__predict_false(fd >= fdp->fd_nfiles)) { allocfd = min(fd * 2, maxfd); #ifdef RACCT if (RACCT_ENABLED()) { error = racct_set_unlocked(p, RACCT_NOFILE, allocfd); if (error != 0) return (EMFILE); } #endif /* * fd is already equal to first free descriptor >= minfd, so * we only need to grow the table and we are done. */ fdgrowtable_exp(fdp, allocfd); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), ("invalid descriptor %d", fd)); KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("file descriptor isn't free")); fdused(fdp, fd); *result = fd; return (0); } /* * Allocate n file descriptors for the process. */ int fdallocn(struct thread *td, int minfd, int *fds, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int i; FILEDESC_XLOCK_ASSERT(fdp); for (i = 0; i < n; i++) if (fdalloc(td, 0, &fds[i]) != 0) break; if (i < n) { for (i--; i >= 0; i--) fdunused(fdp, fds[i]); return (EMFILE); } return (0); } /* * Create a new open file structure and allocate a file descriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { struct file *fp; int error, fd; MPASS(resultfp != NULL); MPASS(resultfd != NULL); error = _falloc_noinstall(td, &fp, 2); if (__predict_false(error != 0)) { return (error); } error = finstall_refed(td, fp, &fd, flags, fcaps); if (__predict_false(error != 0)) { falloc_abort(td, fp); return (error); } *resultfp = fp; *resultfd = fd; return (0); } /* * Create a new open file structure without allocating a file descriptor. */ int _falloc_noinstall(struct thread *td, struct file **resultfp, u_int n) { struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); int openfiles_new; static struct timeval lastfail; static int curfail; KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); MPASS(n > 0); openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; if ((openfiles_new >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles_new >= maxfiles) { atomic_subtract_int(&openfiles, 1); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, (%s) " "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); } return (ENFILE); } fp = uma_zalloc(file_zone, M_WAITOK); bzero(fp, sizeof(*fp)); refcount_init(&fp->f_count, n); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; *resultfp = fp; return (0); } void falloc_abort(struct thread *td, struct file *fp) { /* * For assertion purposes. */ refcount_init(&fp->f_count, 0); _fdrop(fp, td); } /* * Install a file in a file descriptor table. */ void _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, struct filecaps *fcaps) { struct filedescent *fde; MPASS(fp != NULL); if (fcaps != NULL) filecaps_validate(fcaps, __func__); FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = fp; fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else filecaps_fill(&fde->fde_caps); #ifdef CAPABILITIES seqc_write_end(&fde->fde_seqc); #endif } int finstall_refed(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { struct filedesc *fdp = td->td_proc->p_fd; int error; MPASS(fd != NULL); FILEDESC_XLOCK(fdp); error = fdalloc(td, 0, fd); if (__predict_true(error == 0)) { _finstall(fdp, fp, *fd, flags, fcaps); } FILEDESC_XUNLOCK(fdp); return (error); } int finstall(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { int error; MPASS(fd != NULL); if (!fhold(fp)) return (EBADF); error = finstall_refed(td, fp, fd, flags, fcaps); if (__predict_false(error != 0)) { fdrop(fp, td); } return (error); } /* * Build a new filedesc structure from another. * * If fdp is not NULL, return with it shared locked. */ struct filedesc * fdinit(struct filedesc *fdp, bool prepfiles, int *lastfile) { struct filedesc0 *newfdp0; struct filedesc *newfdp; if (prepfiles) MPASS(lastfile != NULL); else MPASS(lastfile == NULL); newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); newfdp = &newfdp0->fd_fd; /* Create the file descriptor table. */ FILEDESC_LOCK_INIT(newfdp); refcount_init(&newfdp->fd_refcnt, 1); refcount_init(&newfdp->fd_holdcnt, 1); newfdp->fd_map = newfdp0->fd_dmap; newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; newfdp->fd_files->fdt_nfiles = NDFILE; if (fdp == NULL) return (newfdp); FILEDESC_SLOCK(fdp); if (!prepfiles) { FILEDESC_SUNLOCK(fdp); return (newfdp); } for (;;) { *lastfile = fdlastfile(fdp); if (*lastfile < newfdp->fd_nfiles) break; FILEDESC_SUNLOCK(fdp); fdgrowtable(newfdp, *lastfile + 1); FILEDESC_SLOCK(fdp); } return (newfdp); } /* * Build a pwddesc structure from another. * Copy the current, root, and jail root vnode references. * * If pdp is not NULL, return with it shared locked. */ struct pwddesc * pdinit(struct pwddesc *pdp, bool keeplock) { struct pwddesc *newpdp; struct pwd *newpwd; newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO); PWDDESC_LOCK_INIT(newpdp); refcount_init(&newpdp->pd_refcount, 1); newpdp->pd_cmask = CMASK; if (pdp == NULL) { newpwd = pwd_alloc(); smr_serialized_store(&newpdp->pd_pwd, newpwd, true); return (newpdp); } PWDDESC_XLOCK(pdp); newpwd = pwd_hold_pwddesc(pdp); smr_serialized_store(&newpdp->pd_pwd, newpwd, true); if (!keeplock) PWDDESC_XUNLOCK(pdp); return (newpdp); } static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = p->p_fd; if (fdp != NULL) refcount_acquire(&fdp->fd_holdcnt); return (fdp); } static struct pwddesc * pdhold(struct proc *p) { struct pwddesc *pdp; PROC_LOCK_ASSERT(p, MA_OWNED); pdp = p->p_pd; if (pdp != NULL) refcount_acquire(&pdp->pd_refcount); return (pdp); } static void fddrop(struct filedesc *fdp) { if (refcount_load(&fdp->fd_holdcnt) > 1) { if (refcount_release(&fdp->fd_holdcnt) == 0) return; } FILEDESC_LOCK_DESTROY(fdp); uma_zfree(filedesc0_zone, fdp); } static void pddrop(struct pwddesc *pdp) { struct pwd *pwd; if (refcount_release_if_not_last(&pdp->pd_refcount)) return; PWDDESC_XLOCK(pdp); if (refcount_release(&pdp->pd_refcount) == 0) { PWDDESC_XUNLOCK(pdp); return; } pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); pwd_set(pdp, NULL); PWDDESC_XUNLOCK(pdp); pwd_drop(pwd); PWDDESC_LOCK_DESTROY(pdp); free(pdp, M_PWDDESC); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { refcount_acquire(&fdp->fd_refcnt); return (fdp); } /* * Share a pwddesc structure. */ struct pwddesc * pdshare(struct pwddesc *pdp) { refcount_acquire(&pdp->pd_refcount); return (pdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct thread *td) { struct filedesc *tmp; struct proc *p = td->td_proc; if (refcount_load(&p->p_fd->fd_refcnt) == 1) return; tmp = fdcopy(p->p_fd); fdescfree(td); p->p_fd = tmp; } /* * Unshare a pwddesc structure. */ void pdunshare(struct thread *td) { struct pwddesc *pdp; struct proc *p; p = td->td_proc; /* Not shared. */ if (p->p_pd->pd_refcount == 1) return; pdp = pdcopy(p->p_pd); pdescfree(td); p->p_pd = pdp; } void fdinstall_remapped(struct thread *td, struct filedesc *fdp) { fdescfree(td); td->td_proc->p_fd = fdp; } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int i, lastfile; MPASS(fdp != NULL); newfdp = fdinit(fdp, true, &lastfile); /* copy all passable descriptors (i.e. not kqueue) */ newfdp->fd_freefile = -1; for (i = 0; i <= lastfile; ++i) { ofde = &fdp->fd_ofiles[i]; if (ofde->fde_file == NULL || (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || !fhold(ofde->fde_file)) { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; continue; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fdused_init(newfdp, i); } if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; FILEDESC_SUNLOCK(fdp); return (newfdp); } /* * Copy a pwddesc structure. */ struct pwddesc * pdcopy(struct pwddesc *pdp) { struct pwddesc *newpdp; MPASS(pdp != NULL); newpdp = pdinit(pdp, true); newpdp->pd_cmask = pdp->pd_cmask; PWDDESC_XUNLOCK(pdp); return (newpdp); } /* * Copies a filedesc structure, while remapping all file descriptors * stored inside using a translation table. * * File descriptors are copied over to the new file descriptor table, * regardless of whether the close-on-exec flag is set. */ int fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds, struct filedesc **ret) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int error, i, lastfile; MPASS(fdp != NULL); newfdp = fdinit(fdp, true, &lastfile); if (nfds > lastfile + 1) { /* New table cannot be larger than the old one. */ error = E2BIG; goto bad; } /* Copy all passable descriptors (i.e. not kqueue). */ newfdp->fd_freefile = nfds; for (i = 0; i < nfds; ++i) { if (fds[i] < 0 || fds[i] > lastfile) { /* File descriptor out of bounds. */ error = EBADF; goto bad; } ofde = &fdp->fd_ofiles[fds[i]]; if (ofde->fde_file == NULL) { /* Unused file descriptor. */ error = EBADF; goto bad; } if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) { /* File descriptor cannot be passed. */ error = EINVAL; goto bad; } if (!fhold(ofde->fde_file)) { error = EBADF; goto bad; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fdused_init(newfdp, i); } FILEDESC_SUNLOCK(fdp); *ret = newfdp; return (0); bad: FILEDESC_SUNLOCK(fdp); fdescfree_remapped(newfdp); return (error); } /* * Clear POSIX style locks. This is only used when fdp looses a reference (i.e. * one of processes using it exits) and the table used to be shared. */ static void fdclearlocks(struct thread *td) { struct filedesc *fdp; struct filedesc_to_leader *fdtol; struct flock lf; struct file *fp; struct proc *p; struct vnode *vp; int i, lastfile; p = td->td_proc; fdp = p->p_fd; fdtol = p->p_fdtol; MPASS(fdtol != NULL); FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { lastfile = fdlastfile(fdp); for (i = 0; i <= lastfile; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp == NULL || fp->f_type != DTYPE_VNODE || !fhold(fp)) continue; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdrop(fp, td); } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or kern_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). */ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; p->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) free(fdtol, M_FILEDESC_TO_LEADER); } /* * Release a filedesc structure. */ static void fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose) { struct filedesc0 *fdp0; struct freetable *ft, *tft; struct filedescent *fde; struct file *fp; int i, lastfile; KASSERT(refcount_load(&fdp->fd_refcnt) == 0, ("%s: fd table %p carries references", __func__, fdp)); /* * Serialize with threads iterating over the table, if any. */ if (refcount_load(&fdp->fd_holdcnt) > 1) { FILEDESC_XLOCK(fdp); FILEDESC_XUNLOCK(fdp); } lastfile = fdlastfile_single(fdp); for (i = 0; i <= lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL) { fdefree_last(fde); if (needclose) (void) closef(fp, td); else fdrop(fp, td); } } if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); if (fdp->fd_nfiles > NDFILE) free(fdp->fd_files, M_FILEDESC); fdp0 = (struct filedesc0 *)fdp; SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) free(ft->ft_table, M_FILEDESC); fddrop(fdp); } void fdescfree(struct thread *td) { struct proc *p; struct filedesc *fdp; p = td->td_proc; fdp = p->p_fd; MPASS(fdp != NULL); #ifdef RACCT if (RACCT_ENABLED()) racct_set_unlocked(p, RACCT_NOFILE, 0); #endif if (p->p_fdtol != NULL) fdclearlocks(td); PROC_LOCK(p); p->p_fd = NULL; PROC_UNLOCK(p); if (refcount_release(&fdp->fd_refcnt) == 0) return; fdescfree_fds(td, fdp, 1); } void pdescfree(struct thread *td) { struct proc *p; struct pwddesc *pdp; p = td->td_proc; pdp = p->p_pd; MPASS(pdp != NULL); PROC_LOCK(p); p->p_pd = NULL; PROC_UNLOCK(p); pddrop(pdp); } void fdescfree_remapped(struct filedesc *fdp) { #ifdef INVARIANTS /* fdescfree_fds() asserts that fd_refcnt == 0. */ if (!refcount_release(&fdp->fd_refcnt)) panic("%s: fd table %p has extra references", __func__, fdp); #endif fdescfree_fds(curthread, fdp, 0); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static bool is_unsafe(struct file *fp) { struct vnode *vp; if (fp->f_type != DTYPE_VNODE) return (false); vp = fp->f_vnode; return ((vp->v_vflag & VV_PROCDEP) != 0); } /* * Make this setguid thing safe, if at all possible. */ void fdsetugidsafety(struct thread *td) { struct filedesc *fdp; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); for (i = 0; i <= 2; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { FILEDESC_XLOCK(fdp); knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); } } } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct thread *td, struct file *fp, int idx) { struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx].fde_file == fp) { fdfree(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; struct filedescent *fde; struct file *fp; int i, lastfile; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); lastfile = fdlastfile_single(fdp); for (i = 0; i <= lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE))) { FILEDESC_XLOCK(fdp); fdfree(fdp, i); (void) closefp(fdp, i, fp, td, false, false); FILEDESC_UNLOCK_ASSERT(fdp); } } } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t save; int i, error, devnull; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); devnull = -1; for (i = 0; i <= 2; i++) { if (fdp->fd_ofiles[i].fde_file != NULL) continue; save = td->td_retval[0]; if (devnull != -1) { error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); } else { error = kern_openat(td, AT_FDCWD, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); if (error == 0) { devnull = td->td_retval[0]; KASSERT(devnull == i, ("we didn't get our fd")); } } td->td_retval[0] = save; if (error != 0) return (error); } return (0); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; MPASS(td != NULL); /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } } return (fdrop_close(fp, td)); } /* * Hack for file descriptor passing code. */ void closef_nothread(struct file *fp) { fdrop(fp, NULL); } /* * Initialize the file pointer with the specified properties. * * The ops are set with release semantics to be certain that the flags, type, * and data are visible when ops is. This is to prevent ops methods from being * called with bad data. */ void finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) { fp->f_data = data; fp->f_flag = flag; fp->f_type = type; atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } void finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops) { fp->f_seqcount[UIO_READ] = 1; fp->f_seqcount[UIO_WRITE] = 1; finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, data, ops); } int fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedescent *fde; int error; FILEDESC_LOCK_ASSERT(fdp); fde = fdeget_locked(fdp, fd); if (fde == NULL) { error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights_fde_inline(fde), needrightsp); if (error != 0) goto out; #endif if (havecapsp != NULL) filecaps_copy(&fde->fde_caps, havecapsp, true); *fpp = fde->fde_file; error = 0; out: return (error); } int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedesc *fdp = td->td_proc->p_fd; int error; #ifndef CAPABILITIES error = fget_unlocked(fdp, fd, needrightsp, fpp); if (havecapsp != NULL && error == 0) filecaps_fill(havecapsp); #else struct file *fp; seqc_t seq; *fpp = NULL; for (;;) { error = fget_unlocked_seq(fdp, fd, needrightsp, &fp, &seq); if (error != 0) return (error); if (havecapsp != NULL) { if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecapsp, false)) { fdrop(fp, td); goto get_locked; } } if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } *fpp = fp; return (0); get_locked: FILEDESC_SLOCK(fdp); error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp); if (error == 0 && !fhold(*fpp)) error = EBADF; FILEDESC_SUNLOCK(fdp); #endif return (error); } #ifdef CAPABILITIES int fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) { const struct filedescent *fde; const struct fdescenttbl *fdt; struct filedesc *fdp; struct file *fp; struct vnode *vp; const cap_rights_t *haverights; cap_rights_t rights; seqc_t seq; VFS_SMR_ASSERT_ENTERED(); rights = *ndp->ni_rightsneeded; cap_rights_set_one(&rights, CAP_LOOKUP); fdp = curproc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = cap_rights_fde_inline(fde); fp = fde->fde_file; if (__predict_false(fp == NULL)) return (EAGAIN); if (__predict_false(cap_check_inline_transient(haverights, &rights))) return (EAGAIN); *fsearch = ((fp->f_flag & FSEARCH) != 0); vp = fp->f_vnode; if (__predict_false(vp == NULL || vp->v_type != VDIR)) { return (EAGAIN); } if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) { return (EAGAIN); } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq))) return (EAGAIN); /* * If file descriptor doesn't have all rights, * all lookups relative to it must also be * strictly relative. * * Not yet supported by fast path. */ CAP_ALL(&rights); if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || ndp->ni_filecaps.fc_nioctls != -1) { #ifdef notyet ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; #else return (EAGAIN); #endif } *vpp = vp; return (0); } #else int fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch) { const struct fdescenttbl *fdt; struct filedesc *fdp; struct file *fp; struct vnode *vp; VFS_SMR_ASSERT_ENTERED(); fdp = curproc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); fp = fdt->fdt_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EAGAIN); *fsearch = ((fp->f_flag & FSEARCH) != 0); vp = fp->f_vnode; if (__predict_false(vp == NULL || vp->v_type != VDIR)) { return (EAGAIN); } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) return (EAGAIN); filecaps_fill(&ndp->ni_filecaps); *vpp = vp; return (0); } #endif int fget_unlocked_seq(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp) { #ifdef CAPABILITIES const struct filedescent *fde; #endif const struct fdescenttbl *fdt; struct file *fp; #ifdef CAPABILITIES seqc_t seq; cap_rights_t haverights; int error; #endif fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); /* * Fetch the descriptor locklessly. We avoid fdrop() races by * never raising a refcount above 0. To accomplish this we have * to use a cmpset loop rather than an atomic_add. The descriptor * must be re-verified once we acquire a reference to be certain * that the identity is still correct and we did not lose a race * due to preemption. */ for (;;) { #ifdef CAPABILITIES seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = *cap_rights_fde_inline(fde); fp = fde->fde_file; if (!seqc_consistent(fd_seqc(fdt, fd), seq)) continue; #else fp = fdt->fdt_ofiles[fd].fde_file; #endif if (fp == NULL) return (EBADF); #ifdef CAPABILITIES error = cap_check_inline(&haverights, needrightsp); if (error != 0) return (error); #endif if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { /* * Force a reload. Other thread could reallocate the * table before this fd was closed, so it is possible * that there is a stale fp pointer in cached version. */ fdt = atomic_load_ptr(&fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; #ifdef CAPABILITIES if (seqc_consistent_nomb(fd_seqc(fdt, fd), seq)) #else if (fp == fdt->fdt_ofiles[fd].fde_file) #endif break; fdrop(fp, curthread); } *fpp = fp; if (seqp != NULL) { #ifdef CAPABILITIES *seqp = seq; #endif } return (0); } /* * See the comments in fget_unlocked_seq for an explanation of how this works. * * This is a simplified variant which bails out to the aforementioned routine * if anything goes wrong. In practice this only happens when userspace is * racing with itself. */ int fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { #ifdef CAPABILITIES const struct filedescent *fde; #endif const struct fdescenttbl *fdt; struct file *fp; #ifdef CAPABILITIES seqc_t seq; const cap_rights_t *haverights; #endif fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); #ifdef CAPABILITIES seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = cap_rights_fde_inline(fde); fp = fde->fde_file; #else fp = fdt->fdt_ofiles[fd].fde_file; #endif if (__predict_false(fp == NULL)) goto out_fallback; #ifdef CAPABILITIES if (__predict_false(cap_check_inline_transient(haverights, needrightsp))) goto out_fallback; #endif if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) goto out_fallback; /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; #ifdef CAPABILITIES if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq))) #else if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) #endif goto out_fdrop; *fpp = fp; return (0); out_fdrop: fdrop(fp, curthread); out_fallback: return (fget_unlocked_seq(fdp, fd, needrightsp, fpp, NULL)); } /* * Translate fd -> file when the caller guarantees the file descriptor table * can't be changed by others. * * Note this does not mean the file object itself is only visible to the caller, * merely that it wont disappear without having to be referenced. * * Must be paired with fput_only_user. */ #ifdef CAPABILITIES int fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { const struct filedescent *fde; const struct fdescenttbl *fdt; const cap_rights_t *haverights; struct file *fp; int error; MPASS(FILEDESC_IS_ONLY_USER(fdp)); if (__predict_false(fd >= fdp->fd_nfiles)) return (EBADF); fdt = fdp->fd_files; fde = &fdt->fdt_ofiles[fd]; fp = fde->fde_file; if (__predict_false(fp == NULL)) return (EBADF); MPASS(refcount_load(&fp->f_count) > 0); haverights = cap_rights_fde_inline(fde); error = cap_check_inline(haverights, needrightsp); if (__predict_false(error != 0)) return (error); *fpp = fp; return (0); } #else int fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { struct file *fp; MPASS(FILEDESC_IS_ONLY_USER(fdp)); if (__predict_false(fd >= fdp->fd_nfiles)) return (EBADF); fp = fdp->fd_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EBADF); MPASS(refcount_load(&fp->f_count) > 0); *fpp = fp; return (0); } #endif /* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * * File's rights will be checked against the capability rights mask. * * If an error occurred the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, cap_rights_t *needrightsp) { struct filedesc *fdp; struct file *fp; int error; *fpp = NULL; fdp = td->td_proc->p_fd; error = fget_unlocked(fdp, fd, needrightsp, &fp); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { fdrop(fp, td); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. */ error = 0; switch (flags) { case FREAD: case FWRITE: if ((fp->f_flag & flags) == 0) error = EBADF; break; case FEXEC: if ((fp->f_flag & (FREAD | FEXEC)) == 0 || ((fp->f_flag & FWRITE) != 0)) error = EBADF; break; case 0: break; default: KASSERT(0, ("wrong flags")); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, 0, rightsp)); } int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, struct file **fpp) { int error; #ifndef CAPABILITIES error = _fget(td, fd, fpp, 0, rightsp); if (maxprotp != NULL) *maxprotp = VM_PROT_ALL; return (error); #else cap_rights_t fdrights; struct filedesc *fdp; struct file *fp; seqc_t seq; *fpp = NULL; fdp = td->td_proc->p_fd; MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); for (;;) { error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { fdrop(fp, td); return (EBADF); } if (maxprotp != NULL) fdrights = *cap_rights(fdp, fd); if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } /* * If requested, convert capability rights to access flags. */ if (maxprotp != NULL) *maxprotp = cap_rights_to_vmprot(&fdrights); *fpp = fp; return (0); #endif } int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FREAD, rightsp)); } int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FWRITE, rightsp)); } int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp) { struct filedesc *fdp = td->td_proc->p_fd; #ifndef CAPABILITIES return (fget_unlocked(fdp, fd, rightsp, fpp)); #else struct file *fp; int error; seqc_t seq; *fpp = NULL; MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); for (;;) { error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq); if (error != 0) return (error); error = cap_fcntl_check(fdp, fd, needfcntl); if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); #endif } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, struct vnode **vpp) { struct file *fp; int error; *vpp = NULL; error = _fget(td, fd, &fp, flags, needrightsp); if (error != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vref(*vpp); } fdrop(fp, td); return (error); } int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, 0, rightsp, vpp)); } int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp) { struct filecaps caps; struct file *fp; int error; error = fget_cap(td, fd, needrightsp, &fp, &caps); if (error != 0) return (error); if (fp->f_ops == &badfileops) { error = EBADF; goto out; } if (fp->f_vnode == NULL) { error = EINVAL; goto out; } *havecaps = caps; *vpp = fp->f_vnode; vref(*vpp); fdrop(fp, td); return (0); out: filecaps_free(&caps); fdrop(fp, td); return (error); } int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FREAD, rightsp, vpp)); } int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); } #endif /* * Handle the last reference to a file being closed. * * Without the noinline attribute clang keeps inlining the func thorough this * file when fdrop is used. */ int __noinline _fdrop(struct file *fp, struct thread *td) { int error; #ifdef INVARIANTS int count; count = refcount_load(&fp->f_count); if (count != 0) panic("fdrop: fp %p count %d", fp, count); #endif error = fo_close(fp, td); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); free(fp->f_advice, M_FADVISE); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int sys_flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; int error; error = fget(td, uap->fd, &cap_flock_rights, &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { fdrop(fp, td); return (EOPNOTSUPP); } vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: fdrop(fp, td); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp) { struct filedescent *newfde, *oldfde; struct file *fp; u_long *ioctls; int error, indx; KASSERT(openerror == ENODEV || openerror == ENXIO, ("unexpected error %d in %s", openerror, __func__)); /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, dfd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = fdalloc(td, 0, &indx); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. */ switch (openerror) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EACCES); } if (!fhold(fp)) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EBADF); } newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; ioctls = filecaps_copy_prep(&oldfde->fde_caps); #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif memcpy(newfde, oldfde, fde_change_size); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, ioctls); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif break; case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif memcpy(newfde, oldfde, fde_change_size); oldfde->fde_file = NULL; fdunused(fdp, dfd); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif break; } FILEDESC_XUNLOCK(fdp); *indxp = indx; return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, "Allow a process to chroot(2) if it has a directory open"); /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(struct filedesc *fdp) { struct vnode *vp; struct file *fp; int fd, lastfile; FILEDESC_LOCK_ASSERT(fdp); lastfile = fdlastfile(fdp); for (fd = 0; fd <= lastfile; fd++) { fp = fget_locked(fdp, fd); if (fp == NULL) continue; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } static void pwd_fill(struct pwd *oldpwd, struct pwd *newpwd) { if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) { vrefact(oldpwd->pwd_cdir); newpwd->pwd_cdir = oldpwd->pwd_cdir; } if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) { vrefact(oldpwd->pwd_rdir); newpwd->pwd_rdir = oldpwd->pwd_rdir; } if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) { vrefact(oldpwd->pwd_jdir); newpwd->pwd_jdir = oldpwd->pwd_jdir; } } struct pwd * pwd_hold_pwddesc(struct pwddesc *pdp) { struct pwd *pwd; PWDDESC_ASSERT_XLOCKED(pdp); pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (pwd != NULL) refcount_acquire(&pwd->pwd_refcount); return (pwd); } bool pwd_hold_smr(struct pwd *pwd) { MPASS(pwd != NULL); if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) { return (true); } return (false); } struct pwd * pwd_hold(struct thread *td) { struct pwddesc *pdp; struct pwd *pwd; pdp = td->td_proc->p_pd; vfs_smr_enter(); pwd = vfs_smr_entered_load(&pdp->pd_pwd); if (pwd_hold_smr(pwd)) { vfs_smr_exit(); return (pwd); } vfs_smr_exit(); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); MPASS(pwd != NULL); PWDDESC_XUNLOCK(pdp); return (pwd); } static struct pwd * pwd_alloc(void) { struct pwd *pwd; pwd = uma_zalloc_smr(pwd_zone, M_WAITOK); bzero(pwd, sizeof(*pwd)); refcount_init(&pwd->pwd_refcount, 1); return (pwd); } void pwd_drop(struct pwd *pwd) { if (!refcount_release(&pwd->pwd_refcount)) return; if (pwd->pwd_cdir != NULL) vrele(pwd->pwd_cdir); if (pwd->pwd_rdir != NULL) vrele(pwd->pwd_rdir); if (pwd->pwd_jdir != NULL) vrele(pwd->pwd_jdir); uma_zfree_smr(pwd_zone, pwd); } /* * The caller is responsible for invoking priv_check() and * mac_vnode_check_chroot() to authorize this operation. */ int pwd_chroot(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct filedesc *fdp; struct pwd *newpwd, *oldpwd; int error; fdp = td->td_proc->p_fd; pdp = td->td_proc->p_pd; newpwd = pwd_alloc(); FILEDESC_SLOCK(fdp); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && oldpwd->pwd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); FILEDESC_SUNLOCK(fdp); if (error != 0) { PWDDESC_XUNLOCK(pdp); pwd_drop(newpwd); return (error); } } else { FILEDESC_SUNLOCK(fdp); } vrefact(vp); newpwd->pwd_rdir = vp; if (oldpwd->pwd_jdir == NULL) { vrefact(vp); newpwd->pwd_jdir = vp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); return (0); } void pwd_chdir(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; VNPASS(vp->v_usecount > 0, vp); newpwd = pwd_alloc(); pdp = td->td_proc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); newpwd->pwd_cdir = vp; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * jail_attach(2) changes both root and working directories. */ int pwd_chroot_chdir(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct filedesc *fdp; struct pwd *newpwd, *oldpwd; int error; fdp = td->td_proc->p_fd; pdp = td->td_proc->p_pd; newpwd = pwd_alloc(); FILEDESC_SLOCK(fdp); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); error = chroot_refuse_vdir_fds(fdp); FILEDESC_SUNLOCK(fdp); if (error != 0) { PWDDESC_XUNLOCK(pdp); pwd_drop(newpwd); return (error); } vrefact(vp); newpwd->pwd_rdir = vp; vrefact(vp); newpwd->pwd_cdir = vp; if (oldpwd->pwd_jdir == NULL) { vrefact(vp); newpwd->pwd_jdir = vp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); return (0); } void pwd_ensure_dirs(void) { struct pwddesc *pdp; struct pwd *oldpwd, *newpwd; pdp = curproc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) { PWDDESC_XUNLOCK(pdp); return; } PWDDESC_XUNLOCK(pdp); newpwd = pwd_alloc(); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); pwd_fill(oldpwd, newpwd); if (newpwd->pwd_cdir == NULL) { vrefact(rootvnode); newpwd->pwd_cdir = rootvnode; } if (newpwd->pwd_rdir == NULL) { vrefact(rootvnode); newpwd->pwd_rdir = rootvnode; } pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } void pwd_set_rootvnode(void) { struct pwddesc *pdp; struct pwd *oldpwd, *newpwd; pdp = curproc->p_pd; newpwd = pwd_alloc(); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); vrefact(rootvnode); newpwd->pwd_cdir = rootvnode; vrefact(rootvnode); newpwd->pwd_rdir = rootvnode; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * Scan all active processes and prisons to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; struct prison *pr; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; nrele = 0; newpwd = pwd_alloc(); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); pdp = pdhold(p); PROC_UNLOCK(p); if (pdp == NULL) continue; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (oldpwd == NULL || (oldpwd->pwd_cdir != olddp && oldpwd->pwd_rdir != olddp && oldpwd->pwd_jdir != olddp)) { PWDDESC_XUNLOCK(pdp); pddrop(pdp); continue; } if (oldpwd->pwd_cdir == olddp) { vrefact(newdp); newpwd->pwd_cdir = newdp; } if (oldpwd->pwd_rdir == olddp) { vrefact(newdp); newpwd->pwd_rdir = newdp; } if (oldpwd->pwd_jdir == olddp) { vrefact(newdp); newpwd->pwd_jdir = newdp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); pddrop(pdp); newpwd = pwd_alloc(); } sx_sunlock(&allproc_lock); pwd_drop(newpwd); if (rootvnode == olddp) { vrefact(newdp); rootvnode = newdp; nrele++; } mtx_lock(&prison0.pr_mtx); if (prison0.pr_root == olddp) { vrefact(newdp); prison0.pr_root = newdp; nrele++; } mtx_unlock(&prison0.pr_mtx); sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { mtx_lock(&pr->pr_mtx); if (pr->pr_root == olddp) { vrefact(newdp); pr->pr_root = newdp; nrele++; } mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); while (nrele--) vrele(olddp); } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; fdtol = malloc(sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } static int sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) { NDSLOTTYPE *map; struct filedesc *fdp; int count, off, minoff; if (*(int *)arg1 != 0) return (EINVAL); fdp = curproc->p_fd; count = 0; FILEDESC_SLOCK(fdp); map = fdp->fd_map; off = NDSLOT(fdp->fd_nfiles - 1); for (minoff = NDSLOT(0); off >= minoff; --off) count += bitcountl(map[off]); FILEDESC_SUNLOCK(fdp); return (SYSCTL_OUT(req, &count, sizeof(count))); } static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, "Number of open file descriptors"); /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n, lastfile; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; /* overestimates sparse tables. */ n += fdp->fd_nfiles; fddrop(fdp); } sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); lastfile = fdlastfile(fdp); for (n = 0; refcount_load(&fdp->fd_refcnt) > 0 && n <= lastfile; n++) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; xf.xf_fd = n; xf.xf_file = (uintptr_t)fp; xf.xf_data = (uintptr_t)fp->f_data; xf.xf_vnode = (uintptr_t)fp->f_vnode; xf.xf_type = (uintptr_t)fp->f_type; xf.xf_count = refcount_load(&fp->f_count); xf.xf_msgcount = 0; xf.xf_offset = foffset_get(fp); xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static int xlate_fflags(int fflags) { static const struct { int fflag; int kf_fflag; } fflags_table[] = { { FAPPEND, KF_FLAG_APPEND }, { FASYNC, KF_FLAG_ASYNC }, { FFSYNC, KF_FLAG_FSYNC }, { FHASLOCK, KF_FLAG_HASLOCK }, { FNONBLOCK, KF_FLAG_NONBLOCK }, { FREAD, KF_FLAG_READ }, { FWRITE, KF_FLAG_WRITE }, { O_CREAT, KF_FLAG_CREAT }, { O_DIRECT, KF_FLAG_DIRECT }, { O_EXCL, KF_FLAG_EXCL }, { O_EXEC, KF_FLAG_EXEC }, { O_EXLOCK, KF_FLAG_EXLOCK }, { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, { O_SHLOCK, KF_FLAG_SHLOCK }, { O_TRUNC, KF_FLAG_TRUNC } }; unsigned int i; int kflags; kflags = 0; for (i = 0; i < nitems(fflags_table); i++) if (fflags & fflags_table[i].fflag) kflags |= fflags_table[i].kf_fflag; return (kflags); } /* Trim unused data from kf_path by truncating the structure size. */ void pack_kinfo(struct kinfo_file *kif) { kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + strlen(kif->kf_path) + 1; kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); } static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp, int flags) { int error; bzero(kif, sizeof(*kif)); /* Set a default type to allow for empty fill_kinfo() methods. */ kif->kf_type = KF_TYPE_UNKNOWN; kif->kf_flags = xlate_fflags(fp->f_flag); if (rightsp != NULL) kif->kf_cap_rights = *rightsp; else cap_rights_init_zero(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = refcount_load(&fp->f_count); kif->kf_offset = foffset_get(fp); /* * This may drop the filedesc lock, so the 'fp' cannot be * accessed after this call. */ error = fo_fill_kinfo(fp, kif, fdp); if (error == 0) kif->kf_status |= KF_ATTR_VALID; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); } static void export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, struct kinfo_file *kif, int flags) { int error; bzero(kif, sizeof(*kif)); kif->kf_type = KF_TYPE_VNODE; error = vn_fill_kinfo_vnode(vp, kif); if (error == 0) kif->kf_status |= KF_ATTR_VALID; kif->kf_flags = xlate_fflags(fflags); cap_rights_init_zero(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = -1; kif->kf_offset = -1; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); vrele(vp); } struct export_fd_buf { struct filedesc *fdp; struct pwddesc *pdp; struct sbuf *sb; ssize_t remainder; struct kinfo_file kif; int flags; }; static int export_kinfo_to_sb(struct export_fd_buf *efbuf) { struct kinfo_file *kif; kif = &efbuf->kif; if (efbuf->remainder != -1) { if (efbuf->remainder < kif->kf_structsize) { /* Terminate export. */ efbuf->remainder = 0; return (0); } efbuf->remainder -= kif->kf_structsize; } return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM); } static int export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, efbuf->flags); FILEDESC_SUNLOCK(efbuf->fdp); error = export_kinfo_to_sb(efbuf); FILEDESC_SLOCK(efbuf->fdp); return (error); } static int export_vnode_to_sb(struct vnode *vp, int fd, int fflags, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); if (efbuf->pdp != NULL) PWDDESC_XUNLOCK(efbuf->pdp); export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); error = export_kinfo_to_sb(efbuf); if (efbuf->pdp != NULL) PWDDESC_XLOCK(efbuf->pdp); return (error); } /* * Store a process file descriptor information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags) { struct file *fp; struct filedesc *fdp; struct pwddesc *pdp; struct export_fd_buf *efbuf; struct vnode *cttyvp, *textvp, *tracevp; struct pwd *pwd; int error, i, lastfile; cap_rights_t rights; PROC_LOCK_ASSERT(p, MA_OWNED); /* ktrace vnode */ tracevp = p->p_tracevp; if (tracevp != NULL) vrefact(tracevp); /* text vnode */ textvp = p->p_textvp; if (textvp != NULL) vrefact(textvp); /* Controlling tty. */ cttyvp = NULL; if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { cttyvp = p->p_pgrp->pg_session->s_ttyvp; if (cttyvp != NULL) vrefact(cttyvp); } fdp = fdhold(p); pdp = pdhold(p); PROC_UNLOCK(p); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->pdp = NULL; efbuf->sb = sb; efbuf->remainder = maxlen; efbuf->flags = flags; if (tracevp != NULL) export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, efbuf); if (textvp != NULL) export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); if (cttyvp != NULL) export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, efbuf); error = 0; if (pdp == NULL || fdp == NULL) goto fail; efbuf->fdp = fdp; efbuf->pdp = pdp; PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); if (pwd != NULL) { /* working directory */ if (pwd->pwd_cdir != NULL) { vrefact(pwd->pwd_cdir); export_vnode_to_sb(pwd->pwd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } /* root directory */ if (pwd->pwd_rdir != NULL) { vrefact(pwd->pwd_rdir); export_vnode_to_sb(pwd->pwd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); } /* jail directory */ if (pwd->pwd_jdir != NULL) { vrefact(pwd->pwd_jdir); export_vnode_to_sb(pwd->pwd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); } } PWDDESC_XUNLOCK(pdp); if (pwd != NULL) pwd_drop(pwd); FILEDESC_SLOCK(fdp); lastfile = fdlastfile(fdp); for (i = 0; refcount_load(&fdp->fd_refcnt) > 0 && i <= lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; #ifdef CAPABILITIES rights = *cap_rights(fdp, i); #else /* !CAPABILITIES */ rights = cap_no_rights; #endif /* * Create sysctl entry. It is OK to drop the filedesc * lock inside of export_file_to_sb() as we will * re-validate and re-evaluate its properties when the * loop continues. */ error = export_file_to_sb(fp, i, &rights, efbuf); if (error != 0 || efbuf->remainder == 0) break; } FILEDESC_SUNLOCK(fdp); fail: if (fdp != NULL) fddrop(fdp); if (pdp != NULL) pddrop(pdp); free(efbuf, M_TEMP); return (error); } #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_filedesc_out(p, &sb, maxlen, KERN_FILEDESC_PACK_KINFO); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } #ifdef COMPAT_FREEBSD7 #ifdef KINFO_OFILE_SIZE CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); #endif static void kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) { okif->kf_structsize = sizeof(*okif); okif->kf_type = kif->kf_type; okif->kf_fd = kif->kf_fd; okif->kf_ref_count = kif->kf_ref_count; okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | KF_FLAG_DIRECT | KF_FLAG_HASLOCK); okif->kf_offset = kif->kf_offset; if (kif->kf_type == KF_TYPE_VNODE) okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type; else okif->kf_vnode_type = KF_VTYPE_VNON; strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); if (kif->kf_type == KF_TYPE_SOCKET) { okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0; okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0; okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0; okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local; okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer; } else { okif->kf_sa_local.ss_family = AF_UNSPEC; okif->kf_sa_peer.ss_family = AF_UNSPEC; } } static int export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req) { int error; vrefact(vp); PWDDESC_XUNLOCK(pdp); export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); PWDDESC_XLOCK(pdp); return (error); } /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) { struct kinfo_ofile *okif; struct kinfo_file *kif; struct filedesc *fdp; struct pwddesc *pdp; struct pwd *pwd; int error, i, lastfile, *name; struct file *fp; struct proc *p; name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) return (error); fdp = fdhold(p); if (fdp != NULL) pdp = pdhold(p); PROC_UNLOCK(p); if (fdp == NULL || pdp == NULL) { if (fdp != NULL) fddrop(fdp); return (ENOENT); } kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); if (pwd != NULL) { if (pwd->pwd_cdir != NULL) export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif, okif, pdp, req); if (pwd->pwd_rdir != NULL) export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif, okif, pdp, req); if (pwd->pwd_jdir != NULL) export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif, okif, pdp, req); } PWDDESC_XUNLOCK(pdp); if (pwd != NULL) pwd_drop(pwd); FILEDESC_SLOCK(fdp); lastfile = fdlastfile(fdp); for (i = 0; refcount_load(&fdp->fd_refcnt) > 0 && i <= lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; export_file_to_kinfo(fp, i, NULL, kif, fdp, KERN_FILEDESC_PACK_KINFO); FILEDESC_SUNLOCK(fdp); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); pddrop(pdp); free(kif, M_TEMP); free(okif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); #endif /* COMPAT_FREEBSD7 */ int vntype_to_kinfo(int vtype) { struct { int vtype; int kf_vtype; } vtypes_table[] = { { VBAD, KF_VTYPE_VBAD }, { VBLK, KF_VTYPE_VBLK }, { VCHR, KF_VTYPE_VCHR }, { VDIR, KF_VTYPE_VDIR }, { VFIFO, KF_VTYPE_VFIFO }, { VLNK, KF_VTYPE_VLNK }, { VNON, KF_VTYPE_VNON }, { VREG, KF_VTYPE_VREG }, { VSOCK, KF_VTYPE_VSOCK } }; unsigned int i; /* * Perform vtype translation. */ for (i = 0; i < nitems(vtypes_table); i++) if (vtypes_table[i].vtype == vtype) return (vtypes_table[i].kf_vtype); return (KF_VTYPE_UNKNOWN); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, "Process filedesc entries"); /* * Store a process current working directory information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) { struct pwddesc *pdp; struct pwd *pwd; struct export_fd_buf *efbuf; struct vnode *cdir; int error; PROC_LOCK_ASSERT(p, MA_OWNED); pdp = pdhold(p); PROC_UNLOCK(p); if (pdp == NULL) return (EINVAL); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->pdp = pdp; efbuf->sb = sb; efbuf->remainder = maxlen; PWDDESC_XLOCK(pdp); pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); cdir = pwd->pwd_cdir; if (cdir == NULL) { error = EINVAL; } else { vrefact(cdir); error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } PWDDESC_XUNLOCK(pdp); pddrop(pdp); free(efbuf, M_TEMP); return (error); } /* * Get per-process current working directory. */ static int sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_cwd_out(p, &sb, maxlen); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_cwd, "Process current working directory"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnode"); case DTYPE_SOCKET: return ("socket"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kqueue"); case DTYPE_CRYPTO: return ("crypto"); case DTYPE_MQUEUE: return ("mqueue"); case DTYPE_SHM: return ("shm"); case DTYPE_SEM: return ("ksem"); case DTYPE_PTS: return ("pts"); case DTYPE_DEV: return ("dev"); case DTYPE_PROCDESC: return ("proc"); case DTYPE_EVENTFD: return ("eventfd"); case DTYPE_LINUXTFD: return ("ltimer"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n < fdp->fd_nfiles; n++) { if (fp == fdp->fd_ofiles[n].fde_file) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4)) struct proc *p; if (header) db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n", XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag", "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH, fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data, fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); #undef XPTRWIDTH } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND(files, db_show_files) { struct filedesc *fdp; struct file *fp; struct proc *p; int header; int n; header = 1; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; if ((fdp = p->p_fd) == NULL) continue; for (n = 0; n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; db_print_file(fp, header); header = 0; } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, &openfiles, 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR); /* * XXXMJG this is a temporary hack due to boot ordering issues against * the vnode zone. */ vfs_smr = uma_zone_get_smr(pwd_zone); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (0); } static int badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EBADF); } static int badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return (0); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = badfo_fill_kinfo, }; static int path_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (POLLNVAL); } static int path_close(struct file *fp, struct thread *td) { MPASS(fp->f_type == DTYPE_VNODE); fp->f_ops = &badfileops; vdrop(fp->f_vnode); return (0); } struct fileops path_fileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = path_poll, - .fo_kqfilter = badfo_kqfilter, + .fo_kqfilter = vn_kqfilter_opath, .fo_stat = vn_statfile, .fo_close = path_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = vn_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; int invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } int invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } int invfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (poll_no_poll(events)); } int invfo_kqfilter(struct file *fp, struct knote *kn) { return (EINVAL); } int invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EINVAL); } /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 9130843f6761..43104a472b5b 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1,4990 +1,4991 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information"); static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag); static int setfflags(struct thread *td, struct vnode *, u_long); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int getutimens(const struct timespec *, enum uio_seg, struct timespec *, int *); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); static int kern_fhlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp); static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, struct thread *td); static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, enum uio_seg segflag); static uint64_t at2cnpflags(u_int at_flags, u_int mask) { u_int64_t res; MPASS((at_flags & (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)) != (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)); res = 0; at_flags &= mask; if ((at_flags & AT_RESOLVE_BENEATH) != 0) res |= RBENEATH; if ((at_flags & AT_SYMLINK_FOLLOW) != 0) res |= FOLLOW; /* NOFOLLOW is pseudo flag */ if ((mask & AT_SYMLINK_NOFOLLOW) != 0) { res |= (at_flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW : FOLLOW; } if ((mask & AT_EMPTY_PATH) != 0 && (at_flags & AT_EMPTY_PATH) != 0) res |= EMPTYPATH; return (res); } int kern_sync(struct thread *td) { struct mount *mp, *nmp; int save; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { save = curthread_pflags_set(TDP_SYNCIO); vfs_periodic(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT); curthread_pflags_restore(save); vn_finished_write(mp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); return (0); } /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif /* ARGSUSED */ int sys_sync(struct thread *td, struct sync_args *uap) { return (kern_sync(td)); } /* * Change filesystem quotas. */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif int sys_quotactl(struct thread *td, struct quotactl_args *uap) { struct mount *mp; struct nameidata nd; int error; AUDIT_ARG_CMD(uap->cmd); AUDIT_ARG_UID(uap->uid); if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS)) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); mp = nd.ni_vp->v_mount; vfs_ref(mp); vput(nd.ni_vp); error = vfs_busy(mp, 0); if (error != 0) { vfs_rel(mp); return (error); } error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg); /* * Since quota on operation typically needs to open quota * file, the Q_QUOTAON handler needs to unbusy the mount point * before calling into namei. Otherwise, unmount might be * started between two vfs_busy() invocations (first is our, * second is from mount point cross-walk code in lookup()), * causing deadlock. * * Require that Q_QUOTAON handles the vfs_busy() reference on * its own, always returning with ubusied mount point. */ if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON && (uap->cmd >> SUBCMDSHIFT) != Q_QUOTAOFF) vfs_unbusy(mp); vfs_rel(mp); return (error); } /* * Used by statfs conversion routines to scale the block size up if * necessary so that all of the block counts are <= 'max_size'. Note * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero * value of 'n'. */ void statfs_scale_blocks(struct statfs *sf, long max_size) { uint64_t count; int shift; KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__)); /* * Attempt to scale the block counts to give a more accurate * overview to userland of the ratio of free space to used * space. To do this, find the largest block count and compute * a divisor that lets it fit into a signed integer <= max_size. */ if (sf->f_bavail < 0) count = -sf->f_bavail; else count = sf->f_bavail; count = MAX(sf->f_blocks, MAX(sf->f_bfree, count)); if (count <= max_size) return; count >>= flsl(max_size); shift = 0; while (count > 0) { shift++; count >>=1; } sf->f_bsize <<= shift; sf->f_blocks >>= shift; sf->f_bfree >>= shift; sf->f_bavail >>= shift; } static int kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf) { int error; if (mp == NULL) return (EBADF); error = vfs_busy(mp, 0); vfs_rel(mp); if (error != 0) return (error); #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif error = VFS_STATFS(mp, buf); if (error != 0) goto out; if (priv_check_cred_vfs_generation(td->td_ucred)) { buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, buf); } out: vfs_unbusy(mp); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif int sys_statfs(struct thread *td, struct statfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_statfs(struct thread *td, const char *path, enum uio_seg pathseg, struct statfs *buf) { struct mount *mp; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); error = namei(&nd); if (error != 0) return (error); mp = vfs_ref_from_vp(nd.ni_vp); NDFREE_NOTHING(&nd); vrele(nd.ni_vp); return (kern_do_statfs(td, mp, buf)); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif int sys_fstatfs(struct thread *td, struct fstatfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_fstatfs(struct thread *td, int fd, struct statfs *buf) { struct file *fp; struct mount *mp; struct vnode *vp; int error; AUDIT_ARG_FD(fd); error = getvnode_path(td, fd, &cap_fstatfs_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; #ifdef AUDIT if (AUDITING_TD(td)) { vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); } #endif mp = vfs_ref_from_vp(vp); fdrop(fp, td); return (kern_do_statfs(td, mp, buf)); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int mode; }; #endif int sys_getfsstat(struct thread *td, struct getfsstat_args *uap) { size_t count; int error; if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX) return (EINVAL); error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count, UIO_USERSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; return (error); } /* * If (bufsize > 0 && bufseg == UIO_SYSSPACE) * The caller is responsible for freeing memory which will be allocated * in '*buf'. */ int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, size_t *countp, enum uio_seg bufseg, int mode) { struct mount *mp, *nmp; struct statfs *sfsp, *sp, *sptmp, *tofree; size_t count, maxcount; int error; switch (mode) { case MNT_WAIT: case MNT_NOWAIT: break; default: if (bufseg == UIO_SYSSPACE) *buf = NULL; return (EINVAL); } restart: maxcount = bufsize / sizeof(struct statfs); if (bufsize == 0) { sfsp = NULL; tofree = NULL; } else if (bufseg == UIO_USERSPACE) { sfsp = *buf; tofree = NULL; } else /* if (bufseg == UIO_SYSSPACE) */ { count = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { count++; } mtx_unlock(&mountlist_mtx); if (maxcount > count) maxcount = count; tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_STATFS, M_WAITOK); } count = 0; /* * If there is no target buffer they only want the count. * * This could be TAILQ_FOREACH but it is open-coded to match the original * code below. */ if (sfsp == NULL) { mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif count++; nmp = TAILQ_NEXT(mp, mnt_list); } mtx_unlock(&mountlist_mtx); *countp = count; return (0); } /* * They want the entire thing. * * Short-circuit the corner case of no room for anything, avoids * relocking below. */ if (maxcount < 1) { goto out; } mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (mode == MNT_WAIT) { if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) { /* * If vfs_busy() failed, and MBF_NOWAIT * wasn't passed, then the mp is gone. * Furthermore, because of MBF_MNTLSTLOCK, * the mountlist_mtx was dropped. We have * no other choice than to start over. */ mtx_unlock(&mountlist_mtx); free(tofree, M_STATFS); goto restart; } } else { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } } sp = &mp->mnt_stat; /* * If MNT_NOWAIT is specified, do not refresh * the fsstat cache. */ if (mode != MNT_NOWAIT) { error = VFS_STATFS(mp, sp); if (error != 0) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); continue; } } if (priv_check_cred_vfs_generation(td->td_ucred)) { sptmp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); *sptmp = *sp; sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, sptmp); sp = sptmp; } else sptmp = NULL; if (bufseg == UIO_SYSSPACE) { bcopy(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); } else /* if (bufseg == UIO_USERSPACE) */ { error = copyout(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); if (error != 0) { vfs_unbusy(mp); return (error); } } sfsp++; count++; if (count == maxcount) { vfs_unbusy(mp); goto out; } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); out: *countp = count; return (0); } #ifdef COMPAT_FREEBSD4 /* * Get old format filesystem statistics. */ static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *); #ifndef _SYS_SYSPROTO_H_ struct freebsd4_statfs_args { char *path; struct ostatfs *buf; }; #endif int freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fstatfs_args { int fd; struct ostatfs *buf; }; #endif int freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_getfsstat_args { struct ostatfs *buf; long bufsize; int mode; }; #endif int freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap) { struct statfs *buf, *sp; struct ostatfs osb; size_t count, size; int error; if (uap->bufsize < 0) return (EINVAL); count = uap->bufsize / sizeof(struct ostatfs); if (count > SIZE_MAX / sizeof(struct statfs)) return (EINVAL); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; if (size != 0) { sp = buf; while (count != 0 && error == 0) { freebsd4_cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fhstatfs_args { struct fhandle *u_fhp; struct ostatfs *buf; }; #endif int freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp) { statfs_scale_blocks(nsp, LONG_MAX); bzero(osp, sizeof(*osp)); osp->f_bsize = nsp->f_bsize; osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = MIN(nsp->f_files, LONG_MAX); osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); osp->f_owner = nsp->f_owner; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, OMFSNAMELEN)); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, OMNAMELEN)); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, OMNAMELEN)); osp->f_fsid = nsp->f_fsid; } #endif /* COMPAT_FREEBSD4 */ #if defined(COMPAT_FREEBSD11) /* * Get old format filesystem statistics. */ static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *); int freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ int freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ int freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap) { struct freebsd11_statfs osb; struct statfs *buf, *sp; size_t count, size; int error; count = uap->bufsize / sizeof(struct ostatfs); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; if (size > 0) { sp = buf; while (count > 0 && error == 0) { freebsd11_cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ int freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp) { bzero(osp, sizeof(*osp)); osp->f_version = FREEBSD11_STATFS_VERSION; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_bsize = nsp->f_bsize; osp->f_iosize = nsp->f_iosize; osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = nsp->f_files; osp->f_ffree = nsp->f_ffree; osp->f_syncwrites = nsp->f_syncwrites; osp->f_asyncwrites = nsp->f_asyncwrites; osp->f_syncreads = nsp->f_syncreads; osp->f_asyncreads = nsp->f_asyncreads; osp->f_namemax = nsp->f_namemax; osp->f_owner = nsp->f_owner; osp->f_fsid = nsp->f_fsid; strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, sizeof(osp->f_fstypename))); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, sizeof(osp->f_mntonname))); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, sizeof(osp->f_mntfromname))); } #endif /* COMPAT_FREEBSD11 */ /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif int sys_fchdir(struct thread *td, struct fchdir_args *uap) { struct vnode *vp, *tdp; struct mount *mp; struct file *fp; int error; AUDIT_ARG_FD(uap->fd); error = getvnode_path(td, uap->fd, &cap_fchdir_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; vref(vp); fdrop(fp, td); vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = change_dir(vp, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0)) continue; error = VFS_ROOT(mp, LK_SHARED, &tdp); vfs_unbusy(mp); if (error != 0) break; vput(vp); vp = tdp; } if (error != 0) { vput(vp); return (error); } VOP_UNLOCK(vp); pwd_chdir(td, vp); return (0); } /* * Change current working directory (``.''). */ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif int sys_chdir(struct thread *td, struct chdir_args *uap) { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, const char *path, enum uio_seg pathseg) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); NDFREE_NOTHING(&nd); return (error); } VOP_UNLOCK(nd.ni_vp); NDFREE_NOTHING(&nd); pwd_chdir(td, nd.ni_vp); return (0); } /* * Change notion of root (``/'') directory. */ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif int sys_chroot(struct thread *td, struct chroot_args *uap) { struct nameidata nd; int error; error = priv_check(td, PRIV_VFS_CHROOT); if (error != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error != 0) goto error; error = change_dir(nd.ni_vp, td); if (error != 0) goto e_vunlock; #ifdef MAC error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp); if (error != 0) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp); error = pwd_chroot(td, nd.ni_vp); vrele(nd.ni_vp); NDFREE_NOTHING(&nd); return (error); e_vunlock: vput(nd.ni_vp); error: NDFREE_NOTHING(&nd); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. */ int change_dir(struct vnode *vp, struct thread *td) { #ifdef MAC int error; #endif ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_vnode_check_chdir(td->td_ucred, vp); if (error != 0) return (error); #endif return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td)); } static __inline void flags_to_rights(int flags, cap_rights_t *rightsp) { if (flags & O_EXEC) { cap_rights_set_one(rightsp, CAP_FEXECVE); if (flags & O_PATH) return; } else { switch ((flags & O_ACCMODE)) { case O_RDONLY: cap_rights_set_one(rightsp, CAP_READ); break; case O_RDWR: cap_rights_set_one(rightsp, CAP_READ); /* FALLTHROUGH */ case O_WRONLY: cap_rights_set_one(rightsp, CAP_WRITE); if (!(flags & (O_APPEND | O_TRUNC))) cap_rights_set_one(rightsp, CAP_SEEK); break; } } if (flags & O_CREAT) cap_rights_set_one(rightsp, CAP_CREATE); if (flags & O_TRUNC) cap_rights_set_one(rightsp, CAP_FTRUNCATE); if (flags & (O_SYNC | O_FSYNC)) cap_rights_set_one(rightsp, CAP_FSYNC); if (flags & (O_EXLOCK | O_SHLOCK)) cap_rights_set_one(rightsp, CAP_FLOCK); } /* * Check permissions, allocate an open file structure, and call the device * open routine if any. */ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int sys_open(struct thread *td, struct open_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct openat_args { int fd; char *path; int flag; int mode; }; #endif int sys_openat(struct thread *td, struct openat_args *uap) { AUDIT_ARG_FD(uap->fd); return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->mode)); } int kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp; struct pwddesc *pdp; struct file *fp; struct vnode *vp; struct nameidata nd; cap_rights_t rights; int cmode, error, indx; indx = -1; fdp = p->p_fd; pdp = p->p_pd; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); cap_rights_init_one(&rights, CAP_LOOKUP); flags_to_rights(flags, &rights); /* * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags * may be specified. On the other hand, for O_PATH any mode * except O_EXEC is ignored. */ if ((flags & O_PATH) != 0) { flags &= ~(O_CREAT | O_ACCMODE); } else if ((flags & O_EXEC) != 0) { if (flags & O_ACCMODE) return (EINVAL); } else if ((flags & O_ACCMODE) == O_ACCMODE) { return (EINVAL); } else { flags = FFLAGS(flags); } /* * Allocate a file structure. The descriptor to reference it * is allocated and used by finstall_refed() below. */ error = falloc_noinstall(td, &fp); if (error != 0) return (error); /* Set the flags early so the finit in devfs can pick them up. */ fp->f_flag = flags & FMASK; cmode = ((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT; NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, &rights, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open(&nd, &flags, cmode, fp); if (error != 0) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) { MPASS((flags & O_PATH) == 0); goto success; } /* * Handle special fdopen() case. bleh. * * Don't do this for relative (capability) lookups; we don't * understand exactly what would happen, and we don't think * that it ever should. */ if ((nd.ni_resflags & NIRES_STRICTREL) == 0 && (error == ENODEV || error == ENXIO) && td->td_dupfd >= 0) { error = dupfdopen(td, fdp, td->td_dupfd, flags, error, &indx); if (error == 0) goto success; } goto bad; } td->td_dupfd = 0; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * Store the vnode, for any f_type. Typically, the vnode use * count is decremented by direct call to vn_closefile() for * files that switched type in the cdevsw fdopen() method. */ fp->f_vnode = vp; /* * If the file wasn't claimed by devfs bind it to the normal * vnode operations here. */ if (fp->f_ops == &badfileops) { KASSERT(vp->v_type != VFIFO || (flags & O_PATH) != 0, ("Unexpected fifo fp %p vp %p", fp, vp)); if ((flags & O_PATH) != 0) { - finit_vnode(fp, flags, NULL, &path_fileops); + finit(fp, (flags & FMASK) | (fp->f_flag & FKQALLOWED), + DTYPE_VNODE, NULL, &path_fileops); vhold(vp); vunref(vp); } else { finit_vnode(fp, flags, NULL, &vnops); } } VOP_UNLOCK(vp); if (flags & O_TRUNC) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } success: /* * If we haven't already installed the FD (for dupfdopen), do so now. */ if (indx == -1) { struct filecaps *fcaps; #ifdef CAPABILITIES if ((nd.ni_resflags & NIRES_STRICTREL) != 0) fcaps = &nd.ni_filecaps; else #endif fcaps = NULL; error = finstall_refed(td, fp, &indx, flags, fcaps); /* On success finstall_refed() consumes fcaps. */ if (error != 0) { filecaps_free(&nd.ni_filecaps); goto bad; } } else { filecaps_free(&nd.ni_filecaps); falloc_abort(td, fp); } td->td_retval[0] = indx; return (0); bad: KASSERT(indx == -1, ("indx=%d, should be -1", indx)); falloc_abort(td, fp); return (error); } #ifdef COMPAT_43 /* * Create a file. */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(struct thread *td, struct ocreat_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknodat_args { int fd; char *path; mode_t mode; dev_t dev; }; #endif int sys_mknodat(struct thread *td, struct mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #if defined(COMPAT_FREEBSD11) int freebsd11_mknod(struct thread *td, struct freebsd11_mknod_args *uap) { return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int freebsd11_mknodat(struct thread *td, struct freebsd11_mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #endif /* COMPAT_FREEBSD11 */ int kern_mknodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int mode, dev_t dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; struct nameidata nd; int error, whiteout = 0; AUDIT_ARG_MODE(mode); AUDIT_ARG_DEV(dev); switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = priv_check(td, PRIV_VFS_MKNOD_DEV); if (error == 0 && dev == VNOVAL) error = EINVAL; break; case S_IFWHT: error = priv_check(td, PRIV_VFS_MKNOD_WHT); break; case S_IFIFO: if (dev == 0) return (kern_mkfifoat(td, fd, path, pathseg, mode)); /* FALLTHROUGH */ default: error = EINVAL; break; } if (error != 0) return (error); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, pathseg, path, fd, &cap_mknodat_rights, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); return (EEXIST); } else { VATTR_NULL(&vattr); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask; vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: panic("kern_mknod: invalid mode"); } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) { if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); } } VOP_VPUT_PAIR(nd.ni_dvp, error == 0 && !whiteout ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ERELOOKUP) goto restart; return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif int sys_mkfifo(struct thread *td, struct mkfifo_args *uap) { return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkfifoat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap) { return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifoat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; struct nameidata nd; int error; AUDIT_ARG_MODE(mode); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ERELOOKUP) goto restart; return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif int sys_link(struct thread *td, struct link_args *uap) { return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link, UIO_USERSPACE, FOLLOW)); } #ifndef _SYS_SYSPROTO_H_ struct linkat_args { int fd1; char *path1; int fd2; char *path2; int flag; }; #endif int sys_linkat(struct thread *td, struct linkat_args *uap) { int flag; flag = uap->flag; if ((flag & ~(AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2, UIO_USERSPACE, at2cnpflags(flag, AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH))); } int hardlink_check_uid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, &hardlink_check_uid, 0, "Unprivileged processes cannot create hard links to files owned by other " "users"); static int hardlink_check_gid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, &hardlink_check_gid, 0, "Unprivileged processes cannot create hard links to files owned by other " "groups"); static int can_hardlink(struct vnode *vp, struct ucred *cred) { struct vattr va; int error; if (!hardlink_check_uid && !hardlink_check_gid) return (0); error = VOP_GETATTR(vp, &va, cred); if (error != 0) return (error); if (hardlink_check_uid && cred->cr_uid != va.va_uid) { error = priv_check_cred(cred, PRIV_VFS_LINK); if (error != 0) return (error); } if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { error = priv_check_cred(cred, PRIV_VFS_LINK); if (error != 0) return (error); } return (0); } int kern_linkat(struct thread *td, int fd1, int fd2, const char *path1, const char *path2, enum uio_seg segflag, int follow) { struct nameidata nd; int error; do { bwillwrite(); NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflag, path1, fd1, &cap_linkat_source_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag); } while (error == EAGAIN || error == ERELOOKUP); return (error); } static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path, enum uio_seg segflag) { struct nameidata nd; struct mount *mp; int error; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd, &cap_linkat_target_rights, td); if ((error = namei(&nd)) == 0) { if ((nd.ni_resflags & NIRES_EMPTYPATH) != 0) { error = priv_check(td, PRIV_VFS_FHOPEN); if (error != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp != NULL) { if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); } else { vput(nd.ni_dvp); } vrele(vp); return (error); } } if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); vrele(vp); return (EEXIST); } else if (nd.ni_dvp->v_mount != vp->v_mount) { /* * Cross-device link. No need to recheck * vp->v_type, since it cannot change, except * to VBAD. */ NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vrele(vp); return (EXDEV); } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) { error = can_hardlink(vp, td->td_ucred); #ifdef MAC if (error == 0) error = mac_vnode_check_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); #endif if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } error = vn_start_write(vp, &mp, V_NOWAIT); if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); return (EAGAIN); } error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_VPUT_PAIR(nd.ni_dvp, &vp, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); vp = NULL; } else { vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); return (EAGAIN); } } if (vp != NULL) vrele(vp); return (error); } /* * Make a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif int sys_symlink(struct thread *td, struct symlink_args *uap) { return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct symlinkat_args { char *path; int fd; char *path2; }; #endif int sys_symlinkat(struct thread *td, struct symlinkat_args *uap) { return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2, UIO_USERSPACE)); } int kern_symlinkat(struct thread *td, const char *path1, int fd, const char *path2, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; const char *syspath; char *tmppath; struct nameidata nd; int error; if (segflg == UIO_SYSSPACE) { syspath = path1; } else { tmppath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path1, tmppath, MAXPATHLEN, NULL)) != 0) goto out; syspath = tmppath; } AUDIT_ARG_TEXT(syspath); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, segflg, path2, fd, &cap_symlinkat_rights, td); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); nd.ni_vp = NULL; error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; goto restart; } VATTR_NULL(&vattr); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_pd->pd_cmask; #ifdef MAC vattr.va_type = VLNK; error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out2; #endif error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); #ifdef MAC out2: #endif VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ERELOOKUP) goto restart; out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, tmppath); return (error); } /* * Delete a whiteout from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct undelete_args { char *path; }; #endif int sys_undelete(struct thread *td, struct undelete_args *uap) { struct mount *mp; struct nameidata nd; int error; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error != 0) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); if (error == ERELOOKUP) goto restart; return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif int sys_unlink(struct thread *td, struct unlink_args *uap) { return (kern_funlinkat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE, 0, 0)); } static int kern_funlinkat_ex(struct thread *td, int dfd, const char *path, int fd, int flag, enum uio_seg pathseg, ino_t oldinum) { if ((flag & ~(AT_REMOVEDIR | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); if ((flag & AT_REMOVEDIR) != 0) return (kern_frmdirat(td, dfd, path, fd, UIO_USERSPACE, 0)); return (kern_funlinkat(td, dfd, path, fd, UIO_USERSPACE, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct unlinkat_args { int fd; char *path; int flag; }; #endif int sys_unlinkat(struct thread *td, struct unlinkat_args *uap) { return (kern_funlinkat_ex(td, uap->fd, uap->path, FD_NONE, uap->flag, UIO_USERSPACE, 0)); } #ifndef _SYS_SYSPROTO_H_ struct funlinkat_args { int dfd; const char *path; int fd; int flag; }; #endif int sys_funlinkat(struct thread *td, struct funlinkat_args *uap) { return (kern_funlinkat_ex(td, uap->dfd, uap->path, uap->fd, uap->flag, UIO_USERSPACE, 0)); } int kern_funlinkat(struct thread *td, int dfd, const char *path, int fd, enum uio_seg pathseg, int flag, ino_t oldinum) { struct mount *mp; struct file *fp; struct vnode *vp; struct nameidata nd; struct stat sb; int error; fp = NULL; if (fd != FD_NONE) { error = getvnode_path(td, fd, &cap_no_rights, &fp); if (error != 0) return (error); } restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH), pathseg, path, dfd, &cap_unlinkat_rights, td); if ((error = namei(&nd)) != 0) { if (error == EINVAL) error = EPERM; goto fdout; } vp = nd.ni_vp; if (vp->v_type == VDIR && oldinum == 0) { error = EPERM; /* POSIX */ } else if (oldinum != 0 && ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) && sb.st_ino != oldinum) { error = EIDRM; /* Identifier removed */ } else if (fp != NULL && fp->f_vnode != vp) { if (VN_IS_DOOMED(fp->f_vnode)) error = EBADF; else error = EDEADLK; } else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? */ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) { goto fdout; } goto restart; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); if (error == ERELOOKUP) goto restart; fdout: if (fp != NULL) fdrop(fp, td); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int sys_lseek(struct thread *td, struct lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } int kern_lseek(struct thread *td, int fd, off_t offset, int whence) { struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_seek_rights, &fp); if (error != 0) return (error); error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ? fo_seek(fp, offset, whence, td) : ESPIPE; fdrop(fp, td); return (error); } #if defined(COMPAT_43) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(struct thread *td, struct olseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Version with the 'pad' argument */ int freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* * Check access permissions using passed credentials. */ static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td) { accmode_t accmode; int error; /* Flags == 0 means only check for existence. */ if (user_flags == 0) return (0); accmode = 0; if (user_flags & R_OK) accmode |= VREAD; if (user_flags & W_OK) accmode |= VWRITE; if (user_flags & X_OK) accmode |= VEXEC; #ifdef MAC error = mac_vnode_check_access(cred, vp, accmode); if (error != 0) return (error); #endif if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, accmode, cred, td); return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int amode; }; #endif int sys_access(struct thread *td, struct access_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0, uap->amode)); } #ifndef _SYS_SYSPROTO_H_ struct faccessat_args { int dirfd; char *path; int amode; int flag; } #endif int sys_faccessat(struct thread *td, struct faccessat_args *uap) { return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->amode)); } int kern_accessat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int flag, int amode) { struct ucred *cred, *usecred; struct vnode *vp; struct nameidata nd; int error; if ((flag & ~(AT_EACCESS | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0) return (EINVAL); /* * Create and modify a temporary credential instead of one that * is potentially shared (if we need one). */ cred = td->td_ucred; if ((flag & AT_EACCESS) == 0 && ((cred->cr_uid != cred->cr_ruid || cred->cr_rgid != cred->cr_groups[0]))) { usecred = crdup(cred); usecred->cr_uid = cred->cr_ruid; usecred->cr_groups[0] = cred->cr_rgid; td->td_ucred = usecred; } else usecred = cred; AUDIT_ARG_VALUE(amode); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH | AT_EMPTY_PATH), pathseg, path, fd, &cap_fstat_rights, td); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; error = vn_access(vp, amode, usecred, td); NDFREE_NOTHING(&nd); vput(vp); out: if (usecred != cred) { td->td_ucred = cred; crfree(usecred); } return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int amode; }; #endif int sys_eaccess(struct thread *td, struct eaccess_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, AT_EACCESS, uap->amode)); } #if defined(COMPAT_43) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif int ostat(struct thread *td, struct ostat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif int olstat(struct thread *td, struct olstat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Convert from an old to a new stat structure. * XXX: many values are blindly truncated. */ void cvtstat(struct stat *st, struct ostat *ost) { bzero(ost, sizeof(*ost)); ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; ost->st_size = MIN(st->st_size, INT32_MAX); ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 */ #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int ino64_trunc_error; SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW, &ino64_trunc_error, 0, "Error on truncation of device, file or inode number, or link count"); int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost) { ost->st_dev = st->st_dev; if (ost->st_dev != st->st_dev) { switch (ino64_trunc_error) { default: /* * Since dev_t is almost raw, don't clamp to the * maximum for case 2, but ignore the error. */ break; case 1: return (EOVERFLOW); } } ost->st_ino = st->st_ino; if (ost->st_ino != st->st_ino) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_ino = UINT32_MAX; break; } } ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; if (ost->st_nlink != st->st_nlink) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_nlink = UINT16_MAX; break; } } ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (ost->st_rdev != st->st_rdev) { switch (ino64_trunc_error) { default: break; case 1: return (EOVERFLOW); } } ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_size = st->st_size; ost->st_blocks = st->st_blocks; ost->st_blksize = st->st_blksize; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; ost->st_lspare = 0; ost->st_birthtim = st->st_birthtim; bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim), sizeof(*ost) - offsetof(struct freebsd11_stat, st_birthtim) - sizeof(ost->st_birthtim)); return (0); } int freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap) { struct fhandle fh; struct stat sb; struct freebsd11_stat osb; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } int freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->buf, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Get file status */ #ifndef _SYS_SYSPROTO_H_ struct fstatat_args { int fd; char *path; struct stat *buf; int flag; } #endif int sys_fstatat(struct thread *td, struct fstatat_args *uap) { struct stat sb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error == 0) error = copyout(&sb, uap->buf, sizeof (sb)); return (error); } int kern_statat(struct thread *td, int flag, int fd, const char *path, enum uio_seg pathseg, struct stat *sbp, void (*hook)(struct vnode *vp, struct stat *sbp)) { struct nameidata nd; int error; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_RESOLVE_BENEATH | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights, td); if ((error = namei(&nd)) != 0) return (error); error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED, td); if (error == 0) { if (__predict_false(hook != NULL)) hook(nd.ni_vp, sbp); } NDFREE_NOTHING(&nd); vput(nd.ni_vp); #ifdef __STAT_TIME_T_EXT sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; #endif #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Implementation of the NetBSD [l]stat() functions. */ void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb) { bzero(nsb, sizeof(*nsb)); nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atim = sb->st_atim; nsb->st_mtim = sb->st_mtim; nsb->st_ctim = sb->st_ctim; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_birthtim = sb->st_birthtim; } #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); freebsd11_cvtnstat(&sb, &nsb); return (copyout(&nsb, uap->ub, sizeof (nsb))); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nlstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); freebsd11_cvtnstat(&sb, &nsb); return (copyout(&nsb, uap->ub, sizeof (nsb))); } #endif /* COMPAT_FREEBSD11 */ /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif int sys_pathconf(struct thread *td, struct pathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } #ifndef _SYS_SYSPROTO_H_ struct lpathconf_args { char *path; int name; }; #endif int sys_lpathconf(struct thread *td, struct lpathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_pathconf(struct thread *td, const char *path, enum uio_seg pathseg, int name, u_long flags, long *valuep) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = VOP_PATHCONF(nd.ni_vp, name, valuep); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; size_t count; }; #endif int sys_readlink(struct thread *td, struct readlink_args *uap) { return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } #ifndef _SYS_SYSPROTO_H_ struct readlinkat_args { int fd; char *path; char *buf; size_t bufsize; }; #endif int sys_readlinkat(struct thread *td, struct readlinkat_args *uap) { return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->bufsize)); } int kern_readlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count) { struct vnode *vp; struct nameidata nd; int error; if (count > IOSIZE_MAX) return (EINVAL); NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); vp = nd.ni_vp; error = kern_readlink_vp(vp, buf, bufseg, count, td); vput(vp); return (error); } /* * Helper function to readlink from a vnode */ static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count, struct thread *td) { struct iovec aiov; struct uio auio; int error; ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked"); #ifdef MAC error = mac_vnode_check_readlink(td->td_ucred, vp); if (error != 0) return (error); #endif if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0) return (EINVAL); aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); td->td_retval[0] = count - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(struct thread *td, struct vnode *vp, u_long flags) { struct mount *mp; struct vattr vattr; int error; /* We can't support the value matching VNOVAL. */ if (flags == VNOVAL) return (EOPNOTSUPP); /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); if (error != 0) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VATTR_NULL(&vattr); vattr.va_flags = flags; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { const char *path; u_long flags; }; #endif int sys_chflags(struct thread *td, struct chflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, 0)); } #ifndef _SYS_SYSPROTO_H_ struct chflagsat_args { int fd; const char *path; u_long flags; int atflag; } #endif int sys_chflagsat(struct thread *td, struct chflagsat_args *uap) { if ((uap->atflag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_chflagsat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flags, uap->atflag)); } /* * Same as chflags() but doesn't follow symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchflags_args { const char *path; u_long flags; }; #endif int sys_lchflags(struct thread *td, struct lchflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, AT_SYMLINK_NOFOLLOW)); } static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag) { struct nameidata nd; int error; AUDIT_ARG_FFLAGS(flags); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(atflag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchflags_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setfflags(td, nd.ni_vp, flags); vrele(nd.ni_vp); return (error); } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; u_long flags; }; #endif int sys_fchflags(struct thread *td, struct fchflags_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_FFLAGS(uap->flags); error = getvnode(td, uap->fd, &cap_fchflags_rights, &fp); if (error != 0) return (error); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); #endif error = setfflags(td, fp->f_vnode, uap->flags); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_vnode_check_setmode(cred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif int sys_chmod(struct thread *td, struct chmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchmodat_args { int dirfd; char *path; mode_t mode; int flag; } #endif int sys_fchmodat(struct thread *td, struct fchmodat_args *uap) { if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_fchmodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->flag)); } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif int sys_lchmod(struct thread *td, struct lchmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, AT_SYMLINK_NOFOLLOW)); } int kern_fchmodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, mode_t mode, int flag) { struct nameidata nd; int error; AUDIT_ARG_MODE(mode); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchmod_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setfmode(td, td->td_ucred, nd.ni_vp, mode); vrele(nd.ni_vp); return (error); } /* * Change mode of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif int sys_fchmod(struct thread *td, struct fchmod_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_MODE(uap->mode); error = fget(td, uap->fd, &cap_fchmod_rights, &fp); if (error != 0) return (error); error = fo_chmod(fp, uap->mode, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_vnode_check_setowner(cred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif int sys_chown(struct thread *td, struct chown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchownat_args { int fd; const char * path; uid_t uid; gid_t gid; int flag; }; #endif int sys_fchownat(struct thread *td, struct fchownat_args *uap) { if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid, uap->gid, uap->flag)); } int kern_fchownat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int uid, int gid, int flag) { struct nameidata nd; int error; AUDIT_ARG_OWNER(uid, gid); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_fchown_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif int sys_lchown(struct thread *td, struct lchown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW)); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif int sys_fchown(struct thread *td, struct fchown_args *uap) { struct file *fp; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_OWNER(uap->uid, uap->gid); error = fget(td, uap->fd, &cap_fchown_rights, &fp); if (error != 0) return (error); error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg, struct timespec *tsp) { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { vfs_timestamp(&tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) return (EINVAL); TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return (0); } /* * Common implementation code for futimens(), utimensat(). */ #define UTIMENS_NULL 0x1 #define UTIMENS_EXIT 0x2 static int getutimens(const struct timespec *usrtsp, enum uio_seg tspseg, struct timespec *tsp, int *retflags) { struct timespec tsnow; int error; vfs_timestamp(&tsnow); *retflags = 0; if (usrtsp == NULL) { tsp[0] = tsnow; tsp[1] = tsnow; *retflags |= UTIMENS_NULL; return (0); } if (tspseg == UIO_SYSSPACE) { tsp[0] = usrtsp[0]; tsp[1] = usrtsp[1]; } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0) return (error); if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT) *retflags |= UTIMENS_EXIT; if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW) *retflags |= UTIMENS_NULL; if (tsp[0].tv_nsec == UTIME_OMIT) tsp[0].tv_sec = VNOVAL; else if (tsp[0].tv_nsec == UTIME_NOW) tsp[0] = tsnow; else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L) return (EINVAL); if (tsp[1].tv_nsec == UTIME_OMIT) tsp[1].tv_sec = VNOVAL; else if (tsp[1].tv_nsec == UTIME_NOW) tsp[1] = tsnow; else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L) return (EINVAL); return (0); } /* * Common implementation code for utimes(), lutimes(), futimes(), futimens(), * and utimensat(). */ static int setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts, int numtimes, int nullflag) { struct mount *mp; struct vattr vattr; int error, setbirthtime; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); setbirthtime = 0; if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = 1; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif int sys_utimes(struct thread *td, struct utimes_args *uap) { return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct futimesat_args { int fd; const char * path; const struct timeval * times; }; #endif int sys_futimesat(struct thread *td, struct futimesat_args *uap) { return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE)); } int kern_utimesat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct nameidata nd; struct timespec ts[2]; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, &cap_futimes_rights, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif int sys_lutimes(struct thread *td, struct lutimes_args *uap) { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, const char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct nameidata nd; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE_NOTHING(&nd); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif int sys_futimes(struct thread *td, struct futimes_args *uap) { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error; AUDIT_ARG_FD(fd); error = getutimes(tptr, tptrseg, ts); if (error != 0) return (error); error = getvnode(td, fd, &cap_futimes_rights, &fp); if (error != 0) return (error); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); #endif error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); fdrop(fp, td); return (error); } int sys_futimens(struct thread *td, struct futimens_args *uap) { return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE)); } int kern_futimens(struct thread *td, int fd, struct timespec *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; int error, flags; AUDIT_ARG_FD(fd); error = getutimens(tptr, tptrseg, ts, &flags); if (error != 0) return (error); if (flags & UTIMENS_EXIT) return (0); error = getvnode(td, fd, &cap_futimes_rights, &fp); if (error != 0) return (error); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode); #endif error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL); fdrop(fp, td); return (error); } int sys_utimensat(struct thread *td, struct utimensat_args *uap) { return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE, uap->flag)); } int kern_utimensat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg, int flag) { struct nameidata nd; struct timespec ts[2]; int error, flags; if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0) return (EINVAL); if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path, fd, &cap_futimes_rights, td); if ((error = namei(&nd)) != 0) return (error); /* * We are allowed to call namei() regardless of 2xUTIME_OMIT. * POSIX states: * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected." * "Search permission is denied by a component of the path prefix." */ NDFREE_NOTHING(&nd); if ((flags & UTIMENS_EXIT) == 0) error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL); vrele(nd.ni_vp); return (error); } /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif int sys_truncate(struct thread *td, struct truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; void *rl_cookie; struct vattr vattr; struct nameidata nd; int error; if (length < 0) return (EINVAL); retry: NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vn_rangelock_unlock(vp, rl_cookie); vrele(vp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, td->td_ucred); } VOP_UNLOCK(vp); vn_finished_write(mp); vn_rangelock_unlock(vp, rl_cookie); vrele(vp); if (error == ERELOOKUP) goto retry; return (error); } #if defined(COMPAT_43) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif int otruncate(struct thread *td, struct otruncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Versions with the pad argument */ int freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap) { return (kern_ftruncate(td, uap->fd, uap->length)); } #endif int kern_fsync(struct thread *td, int fd, bool fullsync) { struct vnode *vp; struct mount *mp; struct file *fp; int error, lock_flags; AUDIT_ARG_FD(fd); error = getvnode(td, fd, &cap_fsync_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; #if 0 if (!fullsync) /* XXXKIB: compete outstanding aio writes */; #endif retry: error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto drop; if (MNT_SHARED_WRITES(mp) || ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) { lock_flags = LK_SHARED; } else { lock_flags = LK_EXCLUSIVE; } vn_lock(vp, lock_flags | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td); VOP_UNLOCK(vp); vn_finished_write(mp); if (error == ERELOOKUP) goto retry; drop: fdrop(fp, td); return (error); } /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif int sys_fsync(struct thread *td, struct fsync_args *uap) { return (kern_fsync(td, uap->fd, true)); } int sys_fdatasync(struct thread *td, struct fdatasync_args *uap) { return (kern_fsync(td, uap->fd, false)); } /* * Rename files. Source and destination must either both be directories, or * both not be directories. If target is a directory, it must be empty. */ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif int sys_rename(struct thread *td, struct rename_args *uap) { return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD, uap->to, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct renameat_args { int oldfd; char *old; int newfd; char *new; }; #endif int sys_renameat(struct thread *td, struct renameat_args *uap) { return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new, UIO_USERSPACE)); } #ifdef MAC static int kern_renameat_mac(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg, struct nameidata *fromnd) { int error; NDINIT_ATRIGHTS(fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td); if ((error = namei(fromnd)) != 0) return (error); error = mac_vnode_check_rename_from(td->td_ucred, fromnd->ni_dvp, fromnd->ni_vp, &fromnd->ni_cnd); VOP_UNLOCK(fromnd->ni_dvp); if (fromnd->ni_dvp != fromnd->ni_vp) VOP_UNLOCK(fromnd->ni_vp); if (error != 0) { NDFREE(fromnd, NDF_ONLY_PNBUF); vrele(fromnd->ni_dvp); vrele(fromnd->ni_vp); if (fromnd->ni_startdir) vrele(fromnd->ni_startdir); } return (error); } #endif int kern_renameat(struct thread *td, int oldfd, const char *old, int newfd, const char *new, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; u_int64_t tondflags; int error; again: bwillwrite(); #ifdef MAC if (mac_vnode_check_rename_from_enabled()) { error = kern_renameat_mac(td, oldfd, old, newfd, new, pathseg, &fromnd); if (error != 0) return (error); } else { #endif NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td); if ((error = namei(&fromnd)) != 0) return (error); #ifdef MAC } #endif fvp = fromnd.ni_vp; tondflags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNODE2; if (fromnd.ni_vp->v_type == VDIR) tondflags |= WILLBEDIR; NDINIT_ATRIGHTS(&tond, RENAME, tondflags, pathseg, new, newfd, &cap_renameat_target_rights, td); if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; error = vn_start_write(fvp, &mp, V_NOWAIT); if (error != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); vrele(tond.ni_startdir); if (fromnd.ni_startdir != NULL) vrele(fromnd.ni_startdir); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); goto again; } if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef CAPABILITIES if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) { /* * If the target already exists we require CAP_UNLINKAT * from 'newfd', when newfd was used for the lookup. */ error = cap_check(&tond.ni_filecaps.fc_rights, &cap_unlinkat_rights); if (error != 0) goto out; } #endif } if (fvp == tdvp) { error = EINVAL; goto out; } /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. */ if (fvp == tvp) error = ERESTART; #ifdef MAC else error = mac_vnode_check_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (error == 0) { error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == ERESTART) return (0); if (error == ERELOOKUP) goto again; return (error); } /* * Make a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif int sys_mkdir(struct thread *td, struct mkdir_args *uap) { return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkdirat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkdirat(struct thread *td, struct mkdirat_args *uap) { return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdirat(struct thread *td, int fd, const char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vattr vattr; struct nameidata nd; int error; AUDIT_ARG_MODE(mode); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NC_NOMAKEENTRY | NC_KEEPPOSENTRY | FAILIFEXISTS | WILLBEDIR, segflg, path, fd, &cap_mkdirat_rights, td); if ((error = namei(&nd)) != 0) return (error); if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_pd->pd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true); vn_finished_write(mp); if (error == ERELOOKUP) goto restart; return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif int sys_rmdir(struct thread *td, struct rmdir_args *uap) { return (kern_frmdirat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE, 0)); } int kern_frmdirat(struct thread *td, int dfd, const char *path, int fd, enum uio_seg pathseg, int flag) { struct mount *mp; struct vnode *vp; struct file *fp; struct nameidata nd; cap_rights_t rights; int error; fp = NULL; if (fd != FD_NONE) { error = getvnode(td, fd, cap_rights_init_one(&rights, CAP_LOOKUP), &fp); if (error != 0) return (error); } restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH), pathseg, path, dfd, &cap_unlinkat_rights, td); if ((error = namei(&nd)) != 0) goto fdout; vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } if (fp != NULL && fp->f_vnode != vp) { if (VN_IS_DOOMED(fp->f_vnode)) error = EBADF; else error = EDEADLK; goto out; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto fdout; goto restart; } vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (error == ERELOOKUP) goto restart; fdout: if (fp != NULL) fdrop(fp, td); return (error); } #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count, long *basep, void (*func)(struct freebsd11_dirent *)) { struct freebsd11_dirent dstdp; struct dirent *dp, *edp; char *dirbuf; off_t base; ssize_t resid, ucount; int error; /* XXX arbitrary sanity limit on `count'. */ count = min(count, 64 * 1024); dirbuf = malloc(count, M_TEMP, M_WAITOK); error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid, UIO_SYSSPACE); if (error != 0) goto done; if (basep != NULL) *basep = base; ucount = 0; for (dp = (struct dirent *)dirbuf, edp = (struct dirent *)&dirbuf[count - resid]; ucount < count && dp < edp; ) { if (dp->d_reclen == 0) break; MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0)); if (dp->d_namlen >= sizeof(dstdp.d_name)) continue; dstdp.d_type = dp->d_type; dstdp.d_namlen = dp->d_namlen; dstdp.d_fileno = dp->d_fileno; /* truncate */ if (dstdp.d_fileno != dp->d_fileno) { switch (ino64_trunc_error) { default: case 0: break; case 1: error = EOVERFLOW; goto done; case 2: dstdp.d_fileno = UINT32_MAX; break; } } dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) + ((dp->d_namlen + 1 + 3) &~ 3); bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); bzero(dstdp.d_name + dstdp.d_namlen, dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) - dstdp.d_namlen); MPASS(dstdp.d_reclen <= dp->d_reclen); MPASS(ucount + dstdp.d_reclen <= count); if (func != NULL) func(&dstdp); error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen); if (error != 0) break; dp = (struct dirent *)((char *)dp + dp->d_reclen); ucount += dstdp.d_reclen; } done: free(dirbuf, M_TEMP); if (error == 0) td->td_retval[0] = ucount; return (error); } #endif /* COMPAT */ #ifdef COMPAT_43 static void ogetdirentries_cvt(struct freebsd11_dirent *dp) { #if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; #else /* * The dp->d_type is the high byte of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; #endif } /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(struct thread *td, struct ogetdirentries_args *uap) { long loff; int error; error = kern_ogetdirentries(td, uap, &loff); if (error == 0) error = copyout(&loff, uap->basep, sizeof(long)); return (error); } int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, long *ploff) { long base; int error; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, ogetdirentries_cvt); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) #ifndef _SYS_SYSPROTO_H_ struct freebsd11_getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int freebsd11_getdirentries(struct thread *td, struct freebsd11_getdirentries_args *uap) { long base; int error; error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } int freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap) { struct freebsd11_getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return (freebsd11_getdirentries(td, &ap)); } #endif /* COMPAT_FREEBSD11 */ /* * Read a block of directory entries in a filesystem independent format. */ int sys_getdirentries(struct thread *td, struct getdirentries_args *uap) { off_t base; int error; error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL, UIO_USERSPACE); if (error != 0) return (error); if (uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(off_t)); return (error); } int kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, off_t *basep, ssize_t *residp, enum uio_seg bufseg) { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; off_t loff; int error, eofflag; off_t foffset; AUDIT_ARG_FD(fd); if (count > IOSIZE_MAX) return (EINVAL); auio.uio_resid = count; error = getvnode(td, fd, &cap_read_rights, &fp); if (error != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; foffset = foffset_lock(fp, 0); unionread: if (vp->v_type != VDIR) { error = EINVAL; goto fail; } aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); loff = auio.uio_offset = foffset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); foffset = auio.uio_offset; if (error != 0) { VOP_UNLOCK(vp); goto fail; } if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; foffset = 0; vput(tvp); goto unionread; } VOP_UNLOCK(vp); *basep = loff; if (residp != NULL) *residp = auio.uio_resid; td->td_retval[0] = count - auio.uio_resid; fail: foffset_unlock(fp, foffset, 0); fdrop(fp, td); return (error); } /* * Set the mode mask for creation of filesystem nodes. */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int sys_umask(struct thread *td, struct umask_args *uap) { struct pwddesc *pdp; pdp = td->td_proc->p_pd; PWDDESC_XLOCK(pdp); td->td_retval[0] = pdp->pd_cmask; pdp->pd_cmask = uap->newmask & ALLPERMS; PWDDESC_XUNLOCK(pdp); return (0); } /* * Void all references to file by ripping underlying filesystem away from * vnode. */ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif int sys_revoke(struct thread *td, struct revoke_args *uap) { struct vnode *vp; struct vattr vattr; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE_NOTHING(&nd); if (vp->v_type != VCHR || vp->v_rdev == NULL) { error = EINVAL; goto out; } #ifdef MAC error = mac_vnode_check_revoke(td->td_ucred, vp); if (error != 0) goto out; #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) goto out; if (td->td_ucred->cr_uid != vattr.va_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto out; } if (devfs_usecount(vp) > 0) VOP_REVOKE(vp, REVOKEALL); out: vput(vp); return (error); } /* * This variant of getvnode() allows O_PATH files. Caller should * ensure that returned file and vnode are only used for compatible * semantics. */ int getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { struct file *fp; int error; error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp); if (error != 0) return (error); /* * The file could be not of the vnode type, or it may be not * yet fully initialized, in which case the f_vnode pointer * may be set, but f_ops is still badfileops. E.g., * devfs_open() transiently create such situation to * facilitate csw d_fdopen(). * * Dupfdopen() handling in kern_openat() installs the * half-baked file into the process descriptor table, allowing * other thread to dereference it. Guard against the race by * checking f_ops. */ if (fp->f_vnode == NULL || fp->f_ops == &badfileops) { fdrop(fp, td); return (EINVAL); } *fpp = fp; return (0); } /* * Convert a user file descriptor to a kernel file entry and check * that, if it is a capability, the correct rights are present. * A reference on the file entry is held upon returning. */ int getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { int error; error = getvnode_path(td, fd, rightsp, fpp); /* * Filter out O_PATH file descriptors, most getvnode() callers * do not call fo_ methods. */ if (error == 0 && (*fpp)->f_ops == &path_fileops) { fdrop(*fpp, td); error = EBADF; } return (error); } /* * Get an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct lgetfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_lgetfh(struct thread *td, struct lgetfh_args *uap) { return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_getfh(struct thread *td, struct getfh_args *uap) { return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } /* * syscall for the rpc.lockd to use to translate an open descriptor into * a NFS file handle. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct getfhat_args { int fd; char *path; fhandle_t *fhp; int flags; }; #endif int sys_getfhat(struct thread *td, struct getfhat_args *uap) { if ((uap->flags & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0) return (EINVAL); return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE, uap->fhp, UIO_USERSPACE)); } int kern_getfhat(struct thread *td, int flags, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp, enum uio_seg fhseg) { struct nameidata nd; fhandle_t fh; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); NDINIT_AT(&nd, LOOKUP, at2cnpflags(flags, AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH) | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, td); error = namei(&nd); if (error != 0) return (error); NDFREE_NOTHING(&nd); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error == 0) { if (fhseg == UIO_USERSPACE) error = copyout(&fh, fhp, sizeof (fh)); else memcpy(fhp, &fh, sizeof(fh)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct fhlink_args { fhandle_t *fhp; const char *to; }; #endif int sys_fhlink(struct thread *td, struct fhlink_args *uap) { return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp)); } #ifndef _SYS_SYSPROTO_H_ struct fhlinkat_args { fhandle_t *fhp; int tofd; const char *to; }; #endif int sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap) { return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp)); } static int kern_fhlinkat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, fhandle_t *fhp) { fhandle_t fh; struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); error = copyin(fhp, &fh, sizeof(fh)); if (error != 0) return (error); do { bwillwrite(); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); vfs_unbusy(mp); if (error != 0) return (error); VOP_UNLOCK(vp); error = kern_linkat_vp(td, vp, fd, path, pathseg); } while (error == EAGAIN || error == ERELOOKUP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fhreadlink_args { fhandle_t *fhp; char *buf; size_t bufsize; }; #endif int sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap) { fhandle_t fh; struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); if (uap->bufsize > IOSIZE_MAX) return (EINVAL); error = copyin(uap->fhp, &fh, sizeof(fh)); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td); vput(vp); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into an * open descriptor. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int sys_fhopen(struct thread *td, struct fhopen_args *uap) { return (kern_fhopen(td, uap->u_fhp, uap->flags)); } int kern_fhopen(struct thread *td, const struct fhandle *u_fhp, int flags) { struct mount *mp; struct vnode *vp; struct fhandle fhp; struct file *fp; int fmode, error; int indx; error = priv_check(td, PRIV_VFS_FHOPEN); if (error != 0) return (error); indx = -1; fmode = FFLAGS(flags); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(u_fhp, &fhp, sizeof(fhp)); if (error != 0) return(error); /* find the mount point */ mp = vfs_busyfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = falloc_noinstall(td, &fp); if (error != 0) { vput(vp); return (error); } /* * An extra reference on `fp' has been held for us by * falloc_noinstall(). */ #ifdef INVARIANTS td->td_dupfd = -1; #endif error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp); if (error != 0) { KASSERT(fp->f_ops == &badfileops, ("VOP_OPEN in fhopen() set f_ops")); KASSERT(td->td_dupfd < 0, ("fhopen() encountered fdopen()")); vput(vp); goto bad; } #ifdef INVARIANTS td->td_dupfd = 0; #endif fp->f_vnode = vp; finit_vnode(fp, fmode, NULL, &vnops); VOP_UNLOCK(vp); if ((fmode & O_TRUNC) != 0) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } error = finstall(td, fp, &indx, fmode, NULL); bad: fdrop(fp, td); td->td_retval[0] = indx; return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int sys_fhstat(struct thread *td, struct fhstat_args *uap) { struct stat sb; struct fhandle fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fh)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error == 0) error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } int kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb) { struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTAT); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td); vput(vp); return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap) { struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(*sfp)); free(sfp, M_STATFS); return (error); } int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) { struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTATFS); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); if (error != 0) { vfs_unbusy(mp); return (error); } vput(vp); error = prison_canseemount(td->td_ucred, mp); if (error != 0) goto out; #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif error = VFS_STATFS(mp, buf); out: vfs_unbusy(mp); return (error); } /* * Unlike madvise(2), we do not make a best effort to remember every * possible caching hint. Instead, we remember the last setting with * the exception that we will allow POSIX_FADV_NORMAL to adjust the * region of any current setting. */ int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, int advice) { struct fadvise_info *fa, *new; struct file *fp; struct vnode *vp; off_t end; int error; if (offset < 0 || len < 0 || offset > OFF_MAX - len) return (EINVAL); AUDIT_ARG_VALUE(advice); switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK); break; case POSIX_FADV_NORMAL: case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: new = NULL; break; default: return (EINVAL); } /* XXX: CAP_POSIX_FADVISE? */ AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_no_rights, &fp); if (error != 0) goto out; AUDIT_ARG_FILE(td->td_proc, fp); if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { error = ESPIPE; goto out; } if (fp->f_type != DTYPE_VNODE) { error = ENODEV; goto out; } vp = fp->f_vnode; if (vp->v_type != VREG) { error = ENODEV; goto out; } if (len == 0) end = OFF_MAX; else end = offset + len - 1; switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: /* * Try to merge any existing non-standard region with * this new region if possible, otherwise create a new * non-standard region for this request. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL && fa->fa_advice == advice && ((fa->fa_start <= end && fa->fa_end >= offset) || (end != OFF_MAX && fa->fa_start == end + 1) || (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) { if (offset < fa->fa_start) fa->fa_start = offset; if (end > fa->fa_end) fa->fa_end = end; } else { new->fa_advice = advice; new->fa_start = offset; new->fa_end = end; fp->f_advice = new; new = fa; } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_NORMAL: /* * If a the "normal" region overlaps with an existing * non-standard region, trim or remove the * non-standard region. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL) { if (offset <= fa->fa_start && end >= fa->fa_end) { new = fa; fp->f_advice = NULL; } else if (offset <= fa->fa_start && end >= fa->fa_start) fa->fa_start = end + 1; else if (offset <= fa->fa_end && end >= fa->fa_end) fa->fa_end = offset - 1; else if (offset >= fa->fa_start && end <= fa->fa_end) { /* * If the "normal" region is a middle * portion of the existing * non-standard region, just remove * the whole thing rather than picking * one side or the other to * preserve. */ new = fa; fp->f_advice = NULL; } } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: error = VOP_ADVISE(vp, offset, end, advice); break; } out: if (fp != NULL) fdrop(fp, td); free(new, M_FADVISE); return (error); } int sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) { int error; error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, uap->advice); return (kern_posix_error(td, error)); } int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd, off_t *outoffp, size_t len, unsigned int flags) { struct file *infp, *outfp; struct vnode *invp, *outvp; int error; size_t retlen; void *rl_rcookie, *rl_wcookie; off_t savinoff, savoutoff; infp = outfp = NULL; rl_rcookie = rl_wcookie = NULL; savinoff = -1; error = 0; retlen = 0; if (flags != 0) { error = EINVAL; goto out; } if (len > SSIZE_MAX) /* * Although the len argument is size_t, the return argument * is ssize_t (which is signed). Therefore a size that won't * fit in ssize_t can't be returned. */ len = SSIZE_MAX; /* Get the file structures for the file descriptors. */ error = fget_read(td, infd, &cap_read_rights, &infp); if (error != 0) goto out; if (infp->f_ops == &badfileops) { error = EBADF; goto out; } if (infp->f_vnode == NULL) { error = EINVAL; goto out; } error = fget_write(td, outfd, &cap_write_rights, &outfp); if (error != 0) goto out; if (outfp->f_ops == &badfileops) { error = EBADF; goto out; } if (outfp->f_vnode == NULL) { error = EINVAL; goto out; } /* Set the offset pointers to the correct place. */ if (inoffp == NULL) inoffp = &infp->f_offset; if (outoffp == NULL) outoffp = &outfp->f_offset; savinoff = *inoffp; savoutoff = *outoffp; invp = infp->f_vnode; outvp = outfp->f_vnode; /* Sanity check the f_flag bits. */ if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE || (infp->f_flag & FREAD) == 0) { error = EBADF; goto out; } /* If len == 0, just return 0. */ if (len == 0) goto out; /* * If infp and outfp refer to the same file, the byte ranges cannot * overlap. */ if (invp == outvp && ((savinoff <= savoutoff && savinoff + len > savoutoff) || (savinoff > savoutoff && savoutoff + len > savinoff))) { error = EINVAL; goto out; } /* Range lock the byte ranges for both invp and outvp. */ for (;;) { rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp + len); rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp + len); if (rl_rcookie != NULL) break; vn_rangelock_unlock(outvp, rl_wcookie); rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len); vn_rangelock_unlock(invp, rl_rcookie); } retlen = len; error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen, flags, infp->f_cred, outfp->f_cred, td); out: if (rl_rcookie != NULL) vn_rangelock_unlock(invp, rl_rcookie); if (rl_wcookie != NULL) vn_rangelock_unlock(outvp, rl_wcookie); if (savinoff != -1 && (error == EINTR || error == ERESTART)) { *inoffp = savinoff; *outoffp = savoutoff; } if (outfp != NULL) fdrop(outfp, td); if (infp != NULL) fdrop(infp, td); td->td_retval[0] = retlen; return (error); } int sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap) { off_t inoff, outoff, *inoffp, *outoffp; int error; inoffp = outoffp = NULL; if (uap->inoffp != NULL) { error = copyin(uap->inoffp, &inoff, sizeof(off_t)); if (error != 0) return (error); inoffp = &inoff; } if (uap->outoffp != NULL) { error = copyin(uap->outoffp, &outoff, sizeof(off_t)); if (error != 0) return (error); outoffp = &outoff; } error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd, outoffp, uap->len, uap->flags); if (error == 0 && uap->inoffp != NULL) error = copyout(inoffp, uap->inoffp, sizeof(off_t)); if (error == 0 && uap->outoffp != NULL) error = copyout(outoffp, uap->outoffp, sizeof(off_t)); return (error); } diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index bb9ee2cceb79..9da35721def4 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1,3460 +1,3472 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2012 Konstantin Belousov * Copyright (c) 2013, 2014 The FreeBSD Foundation * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_rdwr_t vn_io_fault; static fo_truncate_t vn_truncate; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; static fo_kqfilter_t vn_kqfilter; static fo_close_t vn_closefile; static fo_mmap_t vn_mmap; static fo_fallocate_t vn_fallocate; struct fileops vnops = { .fo_read = vn_io_fault, .fo_write = vn_io_fault, .fo_truncate = vn_truncate, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, .fo_kqfilter = vn_kqfilter, .fo_stat = vn_statfile, .fo_close = vn_closefile, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = vn_mmap, .fo_fallocate = vn_fallocate, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; const u_int io_hold_cnt = 16; static int vn_io_fault_enable = 1; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN, &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance"); static int vn_io_fault_prefault = 0; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN, &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting"); static int vn_io_pgcache_read_enable = 1; SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN, &vn_io_pgcache_read_enable, 0, "Enable copying from page cache for reads, avoiding fs"); static u_long vn_io_faults_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD, &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers"); static int vfs_allow_read_dir = 0; SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW, &vfs_allow_read_dir, 0, "Enable read(2) of directory by root for filesystems that support it"); /* * Returns true if vn_io_fault mode of handling the i/o request should * be used. */ static bool do_vn_io_fault(struct vnode *vp, struct uio *uio) { struct mount *mp; return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG && (mp = vp->v_mount) != NULL && (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable); } /* * Structure used to pass arguments to vn_io_fault1(), to do either * file- or vnode-based I/O calls. */ struct vn_io_fault_args { enum { VN_IO_FAULT_FOP, VN_IO_FAULT_VOP } kind; struct ucred *cred; int flags; union { struct fop_args_tag { struct file *fp; fo_rdwr_t *doio; } fop_args; struct vop_args_tag { struct vnode *vp; } vop_args; } args; }; static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td); int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp) { struct thread *td = ndp->ni_cnd.cn_thread; return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp)); } static uint64_t open2nameif(int fmode, u_int vn_open_flags) { uint64_t res; res = ISOPEN | LOCKLEAF; if ((fmode & O_RESOLVE_BENEATH) != 0) res |= RBENEATH; if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0) res |= AUDITVNODE1; if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0) res |= NOCAPCHECK; return (res); } /* * Common code for vnode open operations via a name lookup. * Lookup the vnode and invoke VOP_CREATE if needed. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. */ int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp) { struct vnode *vp; struct mount *mp; struct thread *td = ndp->ni_cnd.cn_thread; struct vattr vat; struct vattr *vap = &vat; int fmode, error; bool first_open; restart: first_open = false; fmode = *flagp; if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT | O_EXCL | O_DIRECTORY)) return (EINVAL); else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags); /* * Set NOCACHE to avoid flushing the cache when * rolling in many files at once. * * Set NC_KEEPPOSENTRY to keep positive entries if they already * exist despite NOCACHE. */ ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; if ((vn_open_flags & VN_OPEN_INVFS) == 0) bwillwrite(); if ((error = namei(ndp)) != 0) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(ndp->ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); NDREINIT(ndp); goto restart; } if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) ndp->ni_cnd.cn_flags |= MAKEENTRY; #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, &ndp->ni_cnd, vap); if (error == 0) #endif error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); vp = ndp->ni_vp; if (error == 0 && (fmode & O_EXCL) != 0 && (fmode & (O_EXLOCK | O_SHLOCK)) != 0) { VI_LOCK(vp); vp->v_iflag |= VI_FOPENING; VI_UNLOCK(vp); first_open = true; } VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL, false); vn_finished_write(mp); if (error) { NDFREE(ndp, NDF_ONLY_PNBUF); if (error == ERELOOKUP) { NDREINIT(ndp); goto restart; } return (error); } fmode &= ~O_TRUNC; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } if (vp->v_type == VDIR) { error = EISDIR; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags); ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW : FOLLOW; if ((fmode & FWRITE) == 0) ndp->ni_cnd.cn_flags |= LOCKSHARED; if ((error = namei(ndp)) != 0) return (error); vp = ndp->ni_vp; } error = vn_open_vnode(vp, fmode, cred, td, fp); if (first_open) { VI_LOCK(vp); vp->v_iflag &= ~VI_FOPENING; wakeup(vp); VI_UNLOCK(vp); } if (error) goto bad; *flagp = fmode; return (0); bad: NDFREE(ndp, NDF_ONLY_PNBUF); vput(vp); *flagp = fmode; ndp->ni_vp = NULL; return (error); } static int vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp) { struct flock lf; int error, lock_flags, type; ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock"); if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0) return (0); KASSERT(fp != NULL, ("open with flock requires fp")); if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) return (EOPNOTSUPP); lock_flags = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) type |= F_FIRSTOPEN; error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type); if (error == 0) fp->f_flag |= FHASLOCK; vn_lock(vp, lock_flags | LK_RETRY); return (error); } /* * Common code for vnode open operations once a vnode is located. * Check permissions, and call the VOP_OPEN routine. */ int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp) { accmode_t accmode; int error; if (vp->v_type == VLNK) { if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0) return (EMLINK); } if (vp->v_type == VSOCK) return (EOPNOTSUPP); if (vp->v_type != VDIR && fmode & O_DIRECTORY) return (ENOTDIR); accmode = 0; if ((fmode & O_PATH) == 0) { if ((fmode & (FWRITE | O_TRUNC)) != 0) { if (vp->v_type == VDIR) return (EISDIR); accmode |= VWRITE; } if ((fmode & FREAD) != 0) accmode |= VREAD; if ((fmode & O_APPEND) && (fmode & FWRITE)) accmode |= VAPPEND; #ifdef MAC if ((fmode & O_CREAT) != 0) accmode |= VCREAT; #endif } if ((fmode & FEXEC) != 0) accmode |= VEXEC; #ifdef MAC if ((fmode & O_VERIFY) != 0) accmode |= VVERIFY; error = mac_vnode_check_open(cred, vp, accmode); if (error != 0) return (error); accmode &= ~(VCREAT | VVERIFY); #endif if ((fmode & O_CREAT) == 0 && accmode != 0) { error = VOP_ACCESS(vp, accmode, cred, td); if (error != 0) return (error); } - if ((fmode & O_PATH) != 0) + if ((fmode & O_PATH) != 0) { + error = VOP_ACCESS(vp, VREAD, cred, td); + if (error == 0) + fp->f_flag |= FKQALLOWED; return (0); + } if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); error = VOP_OPEN(vp, fmode, cred, td, fp); if (error != 0) return (error); error = vn_open_vnode_advlock(vp, fmode, fp); if (error == 0 && (fmode & FWRITE) != 0) { error = VOP_ADD_WRITECOUNT(vp, 1); if (error == 0) { CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } } /* * Error from advlock or VOP_ADD_WRITECOUNT() still requires * calling VOP_CLOSE() to pair with earlier VOP_OPEN(). * Arrange for that by having fdrop() to use vn_closefile(). */ if (error != 0) { fp->f_flag |= FOPENFAILED; fp->f_vnode = vp; if (fp->f_ops == &badfileops) { fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; } vref(vp); } ASSERT_VOP_LOCKED(vp, "vn_open_vnode"); return (error); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. * It is racy. */ int vn_writechk(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, "vn_writechk"); /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (VOP_IS_TEXT(vp)) return (ETXTBSY); return (0); } /* * Vnode close call */ static int vn_close1(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td, bool keep_ref) { struct mount *mp; int error, lock_flags; if (vp->v_type != VFIFO && (flags & FWRITE) == 0 && MNT_EXTENDED_SHARED(vp->v_mount)) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, lock_flags | LK_RETRY); AUDIT_ARG_VNODE1(vp); if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) { VOP_ADD_WRITECOUNT_CHECKED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } error = VOP_CLOSE(vp, flags, file_cred, td); if (keep_ref) VOP_UNLOCK(vp); else vput(vp); vn_finished_write(mp); return (error); } int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td) { return (vn_close1(vp, flags, file_cred, td, false)); } /* * Heuristic to detect sequential operation. */ static int sequential_heuristic(struct uio *uio, struct file *fp) { enum uio_rw rw; ASSERT_VOP_LOCKED(fp->f_vnode, __func__); rw = uio->uio_rw; if (fp->f_flag & FRDAHEAD) return (fp->f_seqcount[rw] << IO_SEQSHIFT); /* * Offset 0 is handled specially. open() sets f_seqcount to 1 so * that the first I/O is normally considered to be slightly * sequential. Seeking to offset 0 doesn't change sequentiality * unless previous seeks have reduced f_seqcount to 0, in which * case offset 0 is not special. */ if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) || uio->uio_offset == fp->f_nextoff[rw]) { /* * f_seqcount is in units of fixed-size blocks so that it * depends mainly on the amount of sequential I/O and not * much on the number of sequential I/O's. The fixed size * of 16384 is hard-coded here since it is (not quite) just * a magic size that works well here. This size is more * closely related to the best I/O size for real disks than * to any block size used by software. */ if (uio->uio_resid >= IO_SEQMAX * 16384) fp->f_seqcount[rw] = IO_SEQMAX; else { fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384); if (fp->f_seqcount[rw] > IO_SEQMAX) fp->f_seqcount[rw] = IO_SEQMAX; } return (fp->f_seqcount[rw] << IO_SEQSHIFT); } /* Not sequential. Quickly draw-down sequentiality. */ if (fp->f_seqcount[rw] > 1) fp->f_seqcount[rw] = 1; else fp->f_seqcount[rw] = 0; return (0); } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; void *rl_cookie; struct vn_io_fault_args args; int error, lock_flags; if (offset < 0 && vp->v_type != VCHR) return (EINVAL); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; error = 0; if ((ioflg & IO_NODELOCKED) == 0) { if ((ioflg & IO_RANGELOCKED) == 0) { if (rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, offset, offset + len); } else if ((ioflg & IO_APPEND) != 0) { rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); } else { rl_cookie = vn_rangelock_wlock(vp, offset, offset + len); } } else rl_cookie = NULL; mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (MNT_SHARED_WRITES(mp) || ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) lock_flags = LK_SHARED; else lock_flags = LK_EXCLUSIVE; } else lock_flags = LK_SHARED; vn_lock(vp, lock_flags | LK_RETRY); } else rl_cookie = NULL; ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) error = mac_vnode_check_read(active_cred, file_cred, vp); else error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { if (file_cred != NULL) cred = file_cred; else cred = active_cred; if (do_vn_io_fault(vp, &auio)) { args.kind = VN_IO_FAULT_VOP; args.cred = cred; args.flags = ioflg; args.args.vop_args.vp = vp; error = vn_io_fault1(vp, &auio, &args, td); } else if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else /* if (rw == UIO_WRITE) */ { error = VOP_WRITE(vp, &auio, ioflg, cred); } } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp); if (mp != NULL) vn_finished_write(mp); } out: if (rl_cookie != NULL) vn_rangelock_unlock(vp, rl_cookie); return (error); } /* * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call kern_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). */ int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td) { int error = 0; ssize_t iaresid; do { int chunk; /* * Force `offset' to a multiple of MAXBSIZE except possibly * for the first chunk, so that filesystems only need to * write full blocks except possibly for the first and last * chunks. */ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; if (chunk > len) chunk = len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); iaresid = 0; error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, active_cred, file_cred, &iaresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base = (char *)base + chunk; kern_yield(PRI_USER); } while (len); if (aresid) *aresid = len + iaresid; return (error); } #if OFF_MAX <= LONG_MAX off_t foffset_lock(struct file *fp, int flags) { volatile short *flagsp; off_t res; short state; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); if ((flags & FOF_NOLOCK) != 0) return (atomic_load_long(&fp->f_offset)); /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ flagsp = &fp->f_vnread_flags; if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED)) return (atomic_load_long(&fp->f_offset)); sleepq_lock(&fp->f_vnread_flags); state = atomic_load_16(flagsp); for (;;) { if ((state & FOFFSET_LOCKED) == 0) { if (!atomic_fcmpset_acq_16(flagsp, &state, FOFFSET_LOCKED)) continue; break; } if ((state & FOFFSET_LOCK_WAITING) == 0) { if (!atomic_fcmpset_acq_16(flagsp, &state, state | FOFFSET_LOCK_WAITING)) continue; } DROP_GIANT(); sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0); sleepq_wait(&fp->f_vnread_flags, PUSER -1); PICKUP_GIANT(); sleepq_lock(&fp->f_vnread_flags); state = atomic_load_16(flagsp); } res = atomic_load_long(&fp->f_offset); sleepq_release(&fp->f_vnread_flags); return (res); } void foffset_unlock(struct file *fp, off_t val, int flags) { volatile short *flagsp; short state; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); if ((flags & FOF_NOUPDATE) == 0) atomic_store_long(&fp->f_offset, val); if ((flags & FOF_NEXTOFF_R) != 0) fp->f_nextoff[UIO_READ] = val; if ((flags & FOF_NEXTOFF_W) != 0) fp->f_nextoff[UIO_WRITE] = val; if ((flags & FOF_NOLOCK) != 0) return; flagsp = &fp->f_vnread_flags; state = atomic_load_16(flagsp); if ((state & FOFFSET_LOCK_WAITING) == 0 && atomic_cmpset_rel_16(flagsp, state, 0)) return; sleepq_lock(&fp->f_vnread_flags); MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0); MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0); fp->f_vnread_flags = 0; sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0); sleepq_release(&fp->f_vnread_flags); } #else off_t foffset_lock(struct file *fp, int flags) { struct mtx *mtxp; off_t res; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOLOCK) == 0) { while (fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; msleep(&fp->f_vnread_flags, mtxp, PUSER -1, "vofflock", 0); } fp->f_vnread_flags |= FOFFSET_LOCKED; } res = fp->f_offset; mtx_unlock(mtxp); return (res); } void foffset_unlock(struct file *fp, off_t val, int flags) { struct mtx *mtxp; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOUPDATE) == 0) fp->f_offset = val; if ((flags & FOF_NEXTOFF_R) != 0) fp->f_nextoff[UIO_READ] = val; if ((flags & FOF_NEXTOFF_W) != 0) fp->f_nextoff[UIO_WRITE] = val; if ((flags & FOF_NOLOCK) == 0) { KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0, ("Lost FOFFSET_LOCKED")); if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) wakeup(&fp->f_vnread_flags); fp->f_vnread_flags = 0; } mtx_unlock(mtxp); } #endif void foffset_lock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) uio->uio_offset = foffset_lock(fp, flags); } void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) foffset_unlock(fp, uio->uio_offset, flags); } static int get_advice(struct file *fp, struct uio *uio) { struct mtx *mtxp; int ret; ret = POSIX_FADV_NORMAL; if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG) return (ret); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if (fp->f_advice != NULL && uio->uio_offset >= fp->f_advice->fa_start && uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) ret = fp->f_advice->fa_advice; mtx_unlock(mtxp); return (ret); } int vn_read_from_obj(struct vnode *vp, struct uio *uio) { vm_object_t obj; vm_page_t ma[io_hold_cnt + 2]; off_t off, vsz; ssize_t resid; int error, i, j; MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2)); obj = atomic_load_ptr(&vp->v_object); if (obj == NULL) return (EJUSTRETURN); /* * Depends on type stability of vm_objects. */ vm_object_pip_add(obj, 1); if ((obj->flags & OBJ_DEAD) != 0) { /* * Note that object might be already reused from the * vnode, and the OBJ_DEAD flag cleared. This is fine, * we recheck for DOOMED vnode state after all pages * are busied, and retract then. * * But we check for OBJ_DEAD to ensure that we do not * busy pages while vm_object_terminate_pages() * processes the queue. */ error = EJUSTRETURN; goto out_pip; } resid = uio->uio_resid; off = uio->uio_offset; for (i = 0; resid > 0; i++) { MPASS(i < io_hold_cnt + 2); ma[i] = vm_page_grab_unlocked(obj, atop(off), VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOWAIT); if (ma[i] == NULL) break; /* * Skip invalid pages. Valid mask can be partial only * at EOF, and we clip later. */ if (vm_page_none_valid(ma[i])) { vm_page_sunbusy(ma[i]); break; } resid -= PAGE_SIZE; off += PAGE_SIZE; } if (i == 0) { error = EJUSTRETURN; goto out_pip; } /* * Check VIRF_DOOMED after we busied our pages. Since * vgonel() terminates the vnode' vm_object, it cannot * process past pages busied by us. */ if (VN_IS_DOOMED(vp)) { error = EJUSTRETURN; goto out; } resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1); if (resid > uio->uio_resid) resid = uio->uio_resid; /* * Unlocked read of vnp_size is safe because truncation cannot * pass busied page. But we load vnp_size into a local * variable so that possible concurrent extension does not * break calculation. */ #if defined(__powerpc__) && !defined(__powerpc64__) vsz = obj->un_pager.vnp.vnp_size; #else vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size); #endif if (uio->uio_offset >= vsz) { error = EJUSTRETURN; goto out; } if (uio->uio_offset + resid > vsz) resid = vsz - uio->uio_offset; error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio); out: for (j = 0; j < i; j++) { if (error == 0) vm_page_reference(ma[j]); vm_page_sunbusy(ma[j]); } out_pip: vm_object_pip_wakeup(obj); if (error != 0) return (error); return (uio->uio_resid == 0 ? 0 : EJUSTRETURN); } /* * File table vnode read routine. */ static int vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct vnode *vp; off_t orig_offset; int error, ioflag; int advice; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; /* * Try to read from page cache. VIRF_DOOMED check is racy but * allows us to avoid unneeded work outright. */ if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() && (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) { error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred); if (error == 0) { fp->f_nextoff[UIO_READ] = uio->uio_offset; return (0); } if (error != EJUSTRETURN) return (error); } advice = get_advice(fp, uio); vn_lock(vp, LK_SHARED | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* Disable read-ahead for random I/O. */ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_READ(vp, uio, ioflag, fp->f_cred); fp->f_nextoff[UIO_READ] = uio->uio_offset; VOP_UNLOCK(vp); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * read(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); return (error); } /* * File table vnode write routine. */ static int vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct vnode *vp; struct mount *mp; off_t orig_offset; int error, ioflag, lock_flags; int advice; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; if (vp->v_type == VREG) bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; /* * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE() * implementations that don't understand IO_DATASYNC fall back to full * O_SYNC behavior. */ if (fp->f_flag & O_DSYNC) ioflag |= IO_SYNC | IO_DATASYNC; mp = NULL; if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto unlock; advice = get_advice(fp, uio); if (MNT_SHARED_WRITES(mp) || (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) { lock_flags = LK_SHARED; } else { lock_flags = LK_EXCLUSIVE; } vn_lock(vp, lock_flags | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* XXX: Is this correct? */ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); fp->f_nextoff[UIO_WRITE] = uio->uio_offset; VOP_UNLOCK(vp); if (vp->v_type != VCHR) vn_finished_write(mp); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * write(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); unlock: return (error); } /* * The vn_io_fault() is a wrapper around vn_read() and vn_write() to * prevent the following deadlock: * * Assume that the thread A reads from the vnode vp1 into userspace * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is * currently not resident, then system ends up with the call chain * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) * which establishes lock order vp1->vn_lock, then vp2->vn_lock. * If, at the same time, thread B reads from vnode vp2 into buffer buf2 * backed by the pages of vnode vp1, and some page in buf2 is not * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. * * To prevent the lock order reversal and deadlock, vn_io_fault() does * not allow page faults to happen during VOP_READ() or VOP_WRITE(). * Instead, it first tries to do the whole range i/o with pagefaults * disabled. If all pages in the i/o buffer are resident and mapped, * VOP will succeed (ignoring the genuine filesystem errors). * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do * i/o in chunks, with all pages in the chunk prefaulted and held * using vm_fault_quick_hold_pages(). * * Filesystems using this deadlock avoidance scheme should use the * array of the held pages from uio, saved in the curthread->td_ma, * instead of doing uiomove(). A helper function * vn_io_fault_uiomove() converts uiomove request into * uiomove_fromphys() over td_ma array. * * Since vnode locks do not cover the whole i/o anymore, rangelocks * make the current i/o request atomic with respect to other i/os and * truncations. */ /* * Decode vn_io_fault_args and perform the corresponding i/o. */ static int vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio, struct thread *td) { int error, save; error = 0; save = vm_fault_disable_pagefaults(); switch (args->kind) { case VN_IO_FAULT_FOP: error = (args->args.fop_args.doio)(args->args.fop_args.fp, uio, args->cred, args->flags, td); break; case VN_IO_FAULT_VOP: if (uio->uio_rw == UIO_READ) { error = VOP_READ(args->args.vop_args.vp, uio, args->flags, args->cred); } else if (uio->uio_rw == UIO_WRITE) { error = VOP_WRITE(args->args.vop_args.vp, uio, args->flags, args->cred); } break; default: panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind, uio->uio_rw); } vm_fault_enable_pagefaults(save); return (error); } static int vn_io_fault_touch(char *base, const struct uio *uio) { int r; r = fubyte(base); if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1)) return (EFAULT); return (0); } static int vn_io_fault_prefault_user(const struct uio *uio) { char *base; const struct iovec *iov; size_t len; ssize_t resid; int error, i; KASSERT(uio->uio_segflg == UIO_USERSPACE, ("vn_io_fault_prefault userspace")); error = i = 0; iov = uio->uio_iov; resid = uio->uio_resid; base = iov->iov_base; len = iov->iov_len; while (resid > 0) { error = vn_io_fault_touch(base, uio); if (error != 0) break; if (len < PAGE_SIZE) { if (len != 0) { error = vn_io_fault_touch(base + len - 1, uio); if (error != 0) break; resid -= len; } if (++i >= uio->uio_iovcnt) break; iov = uio->uio_iov + i; base = iov->iov_base; len = iov->iov_len; } else { len -= PAGE_SIZE; base += PAGE_SIZE; resid -= PAGE_SIZE; } } return (error); } /* * Common code for vn_io_fault(), agnostic to the kind of i/o request. * Uses vn_io_fault_doio() to make the call to an actual i/o function. * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request * into args and call vn_io_fault1() to handle faults during the user * mode buffer accesses. */ static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td) { vm_page_t ma[io_hold_cnt + 2]; struct uio *uio_clone, short_uio; struct iovec short_iovec[1]; vm_page_t *prev_td_ma; vm_prot_t prot; vm_offset_t addr, end; size_t len, resid; ssize_t adv; int error, cnt, saveheld, prev_td_ma_cnt; if (vn_io_fault_prefault) { error = vn_io_fault_prefault_user(uio); if (error != 0) return (error); /* Or ignore ? */ } prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; /* * The UFS follows IO_UNIT directive and replays back both * uio_offset and uio_resid if an error is encountered during the * operation. But, since the iovec may be already advanced, * uio is still in an inconsistent state. * * Cache a copy of the original uio, which is advanced to the redo * point using UIO_NOCOPY below. */ uio_clone = cloneuio(uio); resid = uio->uio_resid; short_uio.uio_segflg = UIO_USERSPACE; short_uio.uio_rw = uio->uio_rw; short_uio.uio_td = uio->uio_td; error = vn_io_fault_doio(args, uio, td); if (error != EFAULT) goto out; atomic_add_long(&vn_io_faults_cnt, 1); uio_clone->uio_segflg = UIO_NOCOPY; uiomove(NULL, resid - uio->uio_resid, uio_clone); uio_clone->uio_segflg = uio->uio_segflg; saveheld = curthread_pflags_set(TDP_UIOHELD); prev_td_ma = td->td_ma; prev_td_ma_cnt = td->td_ma_cnt; while (uio_clone->uio_resid != 0) { len = uio_clone->uio_iov->iov_len; if (len == 0) { KASSERT(uio_clone->uio_iovcnt >= 1, ("iovcnt underflow")); uio_clone->uio_iov++; uio_clone->uio_iovcnt--; continue; } if (len > ptoa(io_hold_cnt)) len = ptoa(io_hold_cnt); addr = (uintptr_t)uio_clone->uio_iov->iov_base; end = round_page(addr + len); if (end < addr) { error = EFAULT; break; } cnt = atop(end - trunc_page(addr)); /* * A perfectly misaligned address and length could cause * both the start and the end of the chunk to use partial * page. +2 accounts for such a situation. */ cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, addr, len, prot, ma, io_hold_cnt + 2); if (cnt == -1) { error = EFAULT; break; } short_uio.uio_iov = &short_iovec[0]; short_iovec[0].iov_base = (void *)addr; short_uio.uio_iovcnt = 1; short_uio.uio_resid = short_iovec[0].iov_len = len; short_uio.uio_offset = uio_clone->uio_offset; td->td_ma = ma; td->td_ma_cnt = cnt; error = vn_io_fault_doio(args, &short_uio, td); vm_page_unhold_pages(ma, cnt); adv = len - short_uio.uio_resid; uio_clone->uio_iov->iov_base = (char *)uio_clone->uio_iov->iov_base + adv; uio_clone->uio_iov->iov_len -= adv; uio_clone->uio_resid -= adv; uio_clone->uio_offset += adv; uio->uio_resid -= adv; uio->uio_offset += adv; if (error != 0 || adv == 0) break; } td->td_ma = prev_td_ma; td->td_ma_cnt = prev_td_ma_cnt; curthread_pflags_restore(saveheld); out: free(uio_clone, M_IOV); return (error); } static int vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { fo_rdwr_t *doio; struct vnode *vp; void *rl_cookie; struct vn_io_fault_args args; int error; doio = uio->uio_rw == UIO_READ ? vn_read : vn_write; vp = fp->f_vnode; /* * The ability to read(2) on a directory has historically been * allowed for all users, but this can and has been the source of * at least one security issue in the past. As such, it is now hidden * away behind a sysctl for those that actually need it to use it, and * restricted to root when it's turned on to make it relatively safe to * leave on for longer sessions of need. */ if (vp->v_type == VDIR) { KASSERT(uio->uio_rw == UIO_READ, ("illegal write attempted on a directory")); if (!vfs_allow_read_dir) return (EISDIR); if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0) return (EISDIR); } foffset_lock_uio(fp, uio, flags); if (do_vn_io_fault(vp, uio)) { args.kind = VN_IO_FAULT_FOP; args.args.fop_args.fp = fp; args.args.fop_args.doio = doio; args.cred = active_cred; args.flags = flags | FOF_OFFSET; if (uio->uio_rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } else if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0) { /* For appenders, punt and lock the whole range. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); } else { rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } error = vn_io_fault1(vp, uio, &args, td); vn_rangelock_unlock(vp, rl_cookie); } else { error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td); } foffset_unlock_uio(fp, uio, flags); return (error); } /* * Helper function to perform the requested uiomove operation using * the held pages for io->uio_iov[0].iov_base buffer instead of * copyin/copyout. Access to the pages with uiomove_fromphys() * instead of iov_base prevents page faults that could occur due to * pmap_collect() invalidating the mapping created by * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or * object cleanup revoking the write access from page mappings. * * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() * instead of plain uiomove(). */ int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) { struct uio transp_uio; struct iovec transp_iov[1]; struct thread *td; size_t adv; int error, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove(data, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); transp_iov[0].iov_base = data; transp_uio.uio_iov = &transp_iov[0]; transp_uio.uio_iovcnt = 1; if (xfersize > uio->uio_resid) xfersize = uio->uio_resid; transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; transp_uio.uio_offset = 0; transp_uio.uio_segflg = UIO_SYSSPACE; /* * Since transp_iov points to data, and td_ma page array * corresponds to original uio->uio_iov, we need to invert the * direction of the i/o operation as passed to * uiomove_fromphys(). */ switch (uio->uio_rw) { case UIO_WRITE: transp_uio.uio_rw = UIO_READ; break; case UIO_READ: transp_uio.uio_rw = UIO_WRITE; break; } transp_uio.uio_td = uio->uio_td; error = uiomove_fromphys(td->td_ma, ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, xfersize, &transp_uio); adv = xfersize - transp_uio.uio_resid; pgadv = (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; uio->uio_iov->iov_len -= adv; uio->uio_resid -= adv; uio->uio_offset += adv; return (error); } int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio) { struct thread *td; vm_offset_t iov_base; int cnt, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove_fromphys(ma, offset, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize; iov_base = (vm_offset_t)uio->uio_iov->iov_base; switch (uio->uio_rw) { case UIO_WRITE: pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, offset, cnt); break; case UIO_READ: pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, cnt); break; } pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)(iov_base + cnt); uio->uio_iov->iov_len -= cnt; uio->uio_resid -= cnt; uio->uio_offset += cnt; return (0); } /* * File table truncate routine. */ static int vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct mount *mp; struct vnode *vp; void *rl_cookie; int error; vp = fp->f_vnode; retry: /* * Lock the whole range for truncation. Otherwise split i/o * might happen partly before and partly after the truncation. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error) goto out1; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error) goto out; #endif error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0, fp->f_cred); out: VOP_UNLOCK(vp); vn_finished_write(mp); out1: vn_rangelock_unlock(vp, rl_cookie); if (error == ERELOOKUP) goto retry; return (error); } /* * Truncate a file that is already locked. */ int vn_truncate_locked(struct vnode *vp, off_t length, bool sync, struct ucred *cred) { struct vattr vattr; int error; error = VOP_ADD_WRITECOUNT(vp, 1); if (error == 0) { VATTR_NULL(&vattr); vattr.va_size = length; if (sync) vattr.va_vaflags |= VA_SYNC; error = VOP_SETATTR(vp, &vattr, cred); VOP_ADD_WRITECOUNT_CHECKED(vp, -1); } return (error); } /* * File table vnode stat routine. */ int vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct vnode *vp = fp->f_vnode; int error; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_STAT(vp, sb, active_cred, fp->f_cred, td); VOP_UNLOCK(vp); return (error); } /* * File table vnode ioctl routine. */ static int vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { struct vattr vattr; struct vnode *vp; struct fiobmap2_arg *bmarg; int error; vp = fp->f_vnode; switch (vp->v_type) { case VDIR: case VREG: switch (com) { case FIONREAD: vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, active_cred); VOP_UNLOCK(vp); if (error == 0) *(int *)data = vattr.va_size - fp->f_offset; return (error); case FIOBMAP2: bmarg = (struct fiobmap2_arg *)data; vn_lock(vp, LK_SHARED | LK_RETRY); #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_BMAP(vp, bmarg->bn, NULL, &bmarg->bn, &bmarg->runp, &bmarg->runb); VOP_UNLOCK(vp); return (error); case FIONBIO: case FIOASYNC: return (0); default: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); } break; case VCHR: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); default: return (ENOTTY); } } /* * File table vnode poll routine. */ static int vn_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct vnode *vp; int error; vp = fp->f_vnode; #if defined(MAC) || defined(AUDIT) if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); VOP_UNLOCK(vp); if (error != 0) return (error); } #endif error = VOP_POLL(vp, events, fp->f_cred, td); return (error); } /* * Acquire the requested lock and then check for validity. LK_RETRY * permits vn_lock to return doomed vnodes. */ static int __noinline _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line, int error) { KASSERT((flags & LK_RETRY) == 0 || error == 0, ("vn_lock: error %d incompatible with flags %#x", error, flags)); if (error == 0) VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed")); if ((flags & LK_RETRY) == 0) { if (error == 0) { VOP_UNLOCK(vp); error = ENOENT; } return (error); } /* * LK_RETRY case. * * Nothing to do if we got the lock. */ if (error == 0) return (0); /* * Interlock was dropped by the call in _vn_lock. */ flags &= ~LK_INTERLOCK; do { error = VOP_LOCK1(vp, flags, file, line); } while (error != 0); return (0); } int _vn_lock(struct vnode *vp, int flags, const char *file, int line) { int error; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vn_lock: no locktype (%d passed)", flags)); VNPASS(vp->v_holdcnt > 0, vp); error = VOP_LOCK1(vp, flags, file, line); if (__predict_false(error != 0 || VN_IS_DOOMED(vp))) return (_vn_lock_fallback(vp, flags, file, line, error)); return (0); } /* * File table vnode close routine. */ static int vn_closefile(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; int error; bool ref; vp = fp->f_vnode; fp->f_ops = &badfileops; ref = (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE; error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref); if (__predict_false(ref)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); vrele(vp); } return (error); } /* * Preparing to start a filesystem write operation. If the operation is * permitted, then we bump the count of operations in progress and * proceed. If a suspend request is in progress, we wait until the * suspension is over, and then proceed. */ static int vn_start_write_refed(struct mount *mp, int flags, bool mplocked) { struct mount_pcpu *mpcpu; int error, mflags; if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 && vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1); vfs_op_thread_exit(mp, mpcpu); return (0); } if (mplocked) mtx_assert(MNT_MTX(mp), MA_OWNED); else MNT_ILOCK(mp); error = 0; /* * Check on status of suspension. */ if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || mp->mnt_susp_owner != curthread) { mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0) | (PUSER - 1); while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { if (flags & V_NOWAIT) { error = EWOULDBLOCK; goto unlock; } error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0); if (error) goto unlock; } } if (flags & V_XSLEEP) goto unlock; mp->mnt_writeopcount++; unlock: if (error != 0 || (flags & V_XSLEEP) != 0) MNT_REL(mp); MNT_IUNLOCK(mp); return (error); } int vn_start_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), ("V_MNTREF requires mp")); error = 0; /* * If a vnode is provided, get and return the mount point that * to which it will write. */ if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } if ((mp = *mpp) == NULL) return (0); /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ if (vp == NULL && (flags & V_MNTREF) == 0) vfs_ref(mp); return (vn_start_write_refed(mp, flags, false)); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL), ("V_MNTREF requires mp")); retry: if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if ((mp = *mpp) == NULL) return (0); /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ MNT_ILOCK(mp); if (vp == NULL && (flags & V_MNTREF) == 0) MNT_REF(mp); if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { mp->mnt_secondary_writes++; mp->mnt_secondary_accwrites++; MNT_IUNLOCK(mp); return (0); } if (flags & V_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP | ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0), "suspfs", 0); vfs_rel(mp); if (error == 0) goto retry; return (error); } /* * Filesystem write operation has completed. If we are suspending and this * operation is the last one, notify the suspender that the suspension is * now in effect. */ void vn_finished_write(struct mount *mp) { struct mount_pcpu *mpcpu; int c; if (mp == NULL) return; if (vfs_op_thread_enter(mp, mpcpu)) { vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1); vfs_mp_count_sub_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_writeopcount; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); } /* * Filesystem secondary write operation has completed. If we are * suspending and this operation is the last one, notify the suspender * that the suspension is now in effect. */ void vn_finished_secondary_write(struct mount *mp) { if (mp == NULL) return; MNT_ILOCK(mp); MNT_REL(mp); mp->mnt_secondary_writes--; if (mp->mnt_secondary_writes < 0) panic("vn_finished_secondary_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_secondary_writes <= 0) wakeup(&mp->mnt_secondary_writes); MNT_IUNLOCK(mp); } /* * Request a filesystem to suspend write operations. */ int vfs_write_suspend(struct mount *mp, int flags) { int error; vfs_op_enter(mp); MNT_ILOCK(mp); vfs_assert_mount_counters(mp); if (mp->mnt_susp_owner == curthread) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); return (EALREADY); } while (mp->mnt_kern_flag & MNTK_SUSPEND) msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); /* * Unmount holds a write reference on the mount point. If we * own busy reference and drain for writers, we deadlock with * the reference draining in the unmount path. Callers of * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if * vfs_busy() reference is owned and caller is not in the * unmount context. */ if ((flags & VS_SKIP_UNMOUNT) != 0 && (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); return (EBUSY); } mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = curthread; if (mp->mnt_writeopcount > 0) (void) msleep(&mp->mnt_writeopcount, MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); else MNT_IUNLOCK(mp); if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) { vfs_write_resume(mp, 0); /* vfs_write_resume does vfs_op_exit() for us */ } return (error); } /* * Request a filesystem to resume write operations. */ void vfs_write_resume(struct mount *mp, int flags) { MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED); mp->mnt_susp_owner = NULL; wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); curthread->td_pflags &= ~TDP_IGNSUSP; if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); mp->mnt_writeopcount++; } MNT_IUNLOCK(mp); if ((flags & VR_NO_SUSPCLR) == 0) VFS_SUSP_CLEAN(mp); vfs_op_exit(mp); } else if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); vn_start_write_refed(mp, 0, true); } else { MNT_IUNLOCK(mp); } } /* * Helper loop around vfs_write_suspend() for filesystem unmount VFS * methods. */ int vfs_write_suspend_umnt(struct mount *mp) { int error; KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, ("vfs_write_suspend_umnt: recursed")); /* dounmount() already called vn_start_write(). */ for (;;) { vn_finished_write(mp); error = vfs_write_suspend(mp, 0); if (error != 0) { vn_start_write(NULL, &mp, V_WAIT); return (error); } MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) break; MNT_IUNLOCK(mp); vn_start_write(NULL, &mp, V_WAIT); } mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); curthread->td_pflags |= TDP_IGNSUSP; return (0); } /* * Implement kqueues for files by translating it to vnode operation. */ static int vn_kqfilter(struct file *fp, struct knote *kn) { return (VOP_KQFILTER(fp->f_vnode, kn)); } +int +vn_kqfilter_opath(struct file *fp, struct knote *kn) +{ + if ((fp->f_flag & FKQALLOWED) == 0) + return (EBADF); + return (vn_kqfilter(fp, kn)); +} + /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_SHARED | LK_RETRY); ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? */ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute removal as kernel */ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp); } return (error); } static int vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); } int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) { return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, lkflags, rvp)); } int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp) { struct mount *mp; int ltype, error; ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get"); mp = vp->v_mount; ltype = VOP_ISLOCKED(vp); KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, ("vn_vget_ino: vp not locked")); error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp); error = vfs_busy(mp, 0); vn_lock(vp, ltype | LK_RETRY); vfs_rel(mp); if (error != 0) return (ENOENT); if (VN_IS_DOOMED(vp)) { vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp); error = alloc(mp, alloc_arg, lkflags, rvp); vfs_unbusy(mp); if (error != 0 || *rvp != vp) vn_lock(vp, ltype | LK_RETRY); if (VN_IS_DOOMED(vp)) { if (error == 0) { if (*rvp == vp) vunref(vp); else vput(*rvp); } error = ENOENT; } return (error); } int vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, struct thread *td) { if (vp->v_type != VREG || td == NULL) return (0); if ((uoff_t)uio->uio_offset + uio->uio_resid > lim_cur(td, RLIMIT_FSIZE)) { PROC_LOCK(td->td_proc); kern_psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } return (0); } int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); #endif return (setfmode(td, active_cred, vp, mode)); } int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); #endif return (setfown(td, active_cred, vp, uid, gid)); } void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) { vm_object_t object; if ((object = vp->v_object) == NULL) return; VM_OBJECT_WLOCK(object); vm_object_page_remove(object, start, end, 0); VM_OBJECT_WUNLOCK(object); } int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) { struct vattr va; daddr_t bn, bnp; uint64_t bsize; off_t noff; int error; KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, ("Wrong command %lu", cmd)); if (vn_lock(vp, LK_SHARED) != 0) return (EBADF); if (vp->v_type != VREG) { error = ENOTTY; goto unlock; } error = VOP_GETATTR(vp, &va, cred); if (error != 0) goto unlock; noff = *off; if (noff >= va.va_size) { error = ENXIO; goto unlock; } bsize = vp->v_mount->mnt_stat.f_iosize; for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize - noff % bsize) { error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); if (error == EOPNOTSUPP) { error = ENOTTY; goto unlock; } if ((bnp == -1 && cmd == FIOSEEKHOLE) || (bnp != -1 && cmd == FIOSEEKDATA)) { noff = bn * bsize; if (noff < *off) noff = *off; goto unlock; } } if (noff > va.va_size) noff = va.va_size; /* noff == va.va_size. There is an implicit hole at the end of file. */ if (cmd == FIOSEEKDATA) error = ENXIO; unlock: VOP_UNLOCK(vp); if (error == 0) *off = noff; return (error); } int vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct ucred *cred; struct vnode *vp; struct vattr vattr; off_t foffset, size; int error, noneg; cred = td->td_ucred; vp = fp->f_vnode; foffset = foffset_lock(fp, 0); noneg = (vp->v_type != VCHR); error = 0; switch (whence) { case L_INCR: if (noneg && (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); if (error) break; /* * If the file references a disk device, then fetch * the media size and use that to determine the ending * offset. */ if (vattr.va_size == 0 && vp->v_type == VCHR && fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) vattr.va_size = size; if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += vattr.va_size; break; case L_SET: break; case SEEK_DATA: error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); if (error == ENOTTY) error = EINVAL; break; case SEEK_HOLE: error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); if (error == ENOTTY) error = EINVAL; break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) goto drop; VFS_KNOTE_UNLOCKED(vp, 0); td->td_uretoff.tdu_off = offset; drop: foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { int error; /* * Grant permission if the caller is the owner of the file, or * the super-user, or has ACL_WRITE_ATTRIBUTES permission on * on the file. If the time pointer is null, then write * permission on the file is also sufficient. * * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES * will be allowed to set the times [..] to the current * server time. */ error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) error = VOP_ACCESS(vp, VWRITE, cred, td); return (error); } int vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct vnode *vp; int error; if (fp->f_type == DTYPE_FIFO) kif->kf_type = KF_TYPE_FIFO; else kif->kf_type = KF_TYPE_VNODE; vp = fp->f_vnode; vref(vp); FILEDESC_SUNLOCK(fdp); error = vn_fill_kinfo_vnode(vp, kif); vrele(vp); FILEDESC_SLOCK(fdp); return (error); } static inline void vn_fill_junk(struct kinfo_file *kif) { size_t len, olen; /* * Simulate vn_fullpath returning changing values for a given * vp during e.g. coredump. */ len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1; olen = strlen(kif->kf_path); if (len < olen) strcpy(&kif->kf_path[len - 1], "$"); else for (; olen < len; olen++) strcpy(&kif->kf_path[olen], "A"); } int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) { struct vattr va; char *fullpath, *freepath; int error; kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type); freepath = NULL; fullpath = "-"; error = vn_fullpath(vp, &fullpath, &freepath); if (error == 0) { strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); } if (freepath != NULL) free(freepath, M_TEMP); KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path, vn_fill_junk(kif); ); /* * Retrieve vnode attributes. */ va.va_fsid = VNOVAL; va.va_rdev = NODEV; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, curthread->td_ucred); VOP_UNLOCK(vp); if (error != 0) return (error); if (va.va_fsid != VNOVAL) kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; else kif->kf_un.kf_file.kf_file_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; kif->kf_un.kf_file.kf_file_fsid_freebsd11 = kif->kf_un.kf_file.kf_file_fsid; /* truncate */ kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); kif->kf_un.kf_file.kf_file_size = va.va_size; kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; kif->kf_un.kf_file.kf_file_rdev_freebsd11 = kif->kf_un.kf_file.kf_file_rdev; /* truncate */ return (0); } int vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { #ifdef HWPMC_HOOKS struct pmckern_map_in pkm; #endif struct mount *mp; struct vnode *vp; vm_object_t object; vm_prot_t maxprot; boolean_t writecounted; int error; #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support * read(2)/write(2) -- or even open(2). Thus, we can * use MAP_ASYNC to trade on-disk coherence for speed. * The shm_open(3) library routine turns on the FPOSIXSHM * flag to request this behavior. */ if ((fp->f_flag & FPOSIXSHM) != 0) flags |= MAP_NOSYNC; #endif vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } else { maxprot |= VM_PROT_WRITE; cap_maxprot |= VM_PROT_WRITE; } maxprot &= cap_maxprot; /* * For regular files and shared memory, POSIX requires that * the value of foff be a legitimate offset within the data * object. In particular, negative offsets are invalid. * Blocking negative offsets and overflows here avoids * possible wraparound or user-level access into reserved * ranges of the data object later. In contrast, POSIX does * not dictate how offsets are used by device drivers, so in * the case of a device mapping a negative offset is passed * on. */ if ( #ifdef _LP64 size > OFF_MAX || #endif foff > OFF_MAX - size) return (EINVAL); writecounted = FALSE; error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, &foff, &object, &writecounted); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, writecounted, td); if (error != 0) { /* * If this mapping was accounted for in the vnode's * writecount, then undo that now. */ if (writecounted) vm_pager_release_writecount(object, 0, size); vm_object_deallocate(object); } #ifdef HWPMC_HOOKS /* Inform hwpmc(4) if an executable is being mapped. */ if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) { if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) { pkm.pm_file = vp; pkm.pm_address = (uintptr_t) *addr; PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm); } } #endif return (error); } void vn_fsid(struct vnode *vp, struct vattr *va) { fsid_t *f; f = &vp->v_mount->mnt_stat.f_fsid; va->va_fsid = (uint32_t)f->val[1]; va->va_fsid <<= sizeof(f->val[1]) * NBBY; va->va_fsid += (uint32_t)f->val[0]; } int vn_fsync_buf(struct vnode *vp, int waitfor) { struct buf *bp, *nbp; struct bufobj *bo; struct mount *mp; int error, maxretry; error = 0; maxretry = 10000; /* large, arbitrarily chosen */ mp = NULL; if (vp->v_type == VCHR) { VI_LOCK(vp); mp = vp->v_rdev->si_mountpt; VI_UNLOCK(vp); } bo = &vp->v_bufobj; BO_LOCK(bo); loop1: /* * MARK/SCAN initialization to avoid infinite loops. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { bp->b_vflags &= ~BV_SCANNED; bp->b_error = 0; } /* * Flush all dirty buffers associated with a vnode. */ loop2: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { if (waitfor != MNT_WAIT) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL, BO_LOCKPTR(bo)) != 0) { BO_LOCK(bo); goto loop1; } BO_LOCK(bo); } BO_UNLOCK(bo); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if ((bp->b_flags & B_DELWRI) == 0) panic("fsync: not dirty"); if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { vfs_bio_awrite(bp); } else { bremfree(bp); bawrite(bp); } if (maxretry < 1000) pause("dirty", hz < 1000 ? 1 : hz / 1000); BO_LOCK(bo); goto loop2; } /* * If synchronous the caller expects us to completely resolve all * dirty buffers in the system. Wait for in-progress I/O to * complete (which could include background bitmap writes), then * retry if dirty blocks still exist. */ if (waitfor == MNT_WAIT) { bufobj_wwait(bo, 0, 0); if (bo->bo_dirty.bv_cnt > 0) { /* * If we are unable to write any of these buffers * then we fail now rather than trying endlessly * to write them out. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) if ((error = bp->b_error) != 0) break; if ((mp != NULL && mp->mnt_secondary_writes > 0) || (error == 0 && --maxretry >= 0)) goto loop1; if (error == 0) error = EAGAIN; } } BO_UNLOCK(bo); if (error != 0) vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error); return (error); } /* * Copies a byte range from invp to outvp. Calls VOP_COPY_FILE_RANGE() * or vn_generic_copy_file_range() after rangelocking the byte ranges, * to do the actual copy. * vn_generic_copy_file_range() is factored out, so it can be called * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from * different file systems. */ int vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) { int error; size_t len; uint64_t uval; len = *lenp; *lenp = 0; /* For error returns. */ error = 0; /* Do some sanity checks on the arguments. */ if (invp->v_type == VDIR || outvp->v_type == VDIR) error = EISDIR; else if (*inoffp < 0 || *outoffp < 0 || invp->v_type != VREG || outvp->v_type != VREG) error = EINVAL; if (error != 0) goto out; /* Ensure offset + len does not wrap around. */ uval = *inoffp; uval += len; if (uval > INT64_MAX) len = INT64_MAX - *inoffp; uval = *outoffp; uval += len; if (uval > INT64_MAX) len = INT64_MAX - *outoffp; if (len == 0) goto out; /* * If the two vnode are for the same file system, call * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range() * which can handle copies across multiple file systems. */ *lenp = len; if (invp->v_mount == outvp->v_mount) error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp, lenp, flags, incred, outcred, fsize_td); else error = vn_generic_copy_file_range(invp, inoffp, outvp, outoffp, lenp, flags, incred, outcred, fsize_td); out: return (error); } /* * Test len bytes of data starting at dat for all bytes == 0. * Return true if all bytes are zero, false otherwise. * Expects dat to be well aligned. */ static bool mem_iszero(void *dat, int len) { int i; const u_int *p; const char *cp; for (p = dat; len > 0; len -= sizeof(*p), p++) { if (len >= sizeof(*p)) { if (*p != 0) return (false); } else { cp = (const char *)p; for (i = 0; i < len; i++, cp++) if (*cp != '\0') return (false); } } return (true); } /* * Look for a hole in the output file and, if found, adjust *outoffp * and *xferp to skip past the hole. * *xferp is the entire hole length to be written and xfer2 is how many bytes * to be written as 0's upon return. */ static off_t vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp, off_t *dataoffp, off_t *holeoffp, struct ucred *cred) { int error; off_t delta; if (*holeoffp == 0 || *holeoffp <= *outoffp) { *dataoffp = *outoffp; error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred, curthread); if (error == 0) { *holeoffp = *dataoffp; error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred, curthread); } if (error != 0 || *holeoffp == *dataoffp) { /* * Since outvp is unlocked, it may be possible for * another thread to do a truncate(), lseek(), write() * creating a hole at startoff between the above * VOP_IOCTL() calls, if the other thread does not do * rangelocking. * If that happens, *holeoffp == *dataoffp and finding * the hole has failed, so disable vn_skip_hole(). */ *holeoffp = -1; /* Disable use of vn_skip_hole(). */ return (xfer2); } KASSERT(*dataoffp >= *outoffp, ("vn_skip_hole: dataoff=%jd < outoff=%jd", (intmax_t)*dataoffp, (intmax_t)*outoffp)); KASSERT(*holeoffp > *dataoffp, ("vn_skip_hole: holeoff=%jd <= dataoff=%jd", (intmax_t)*holeoffp, (intmax_t)*dataoffp)); } /* * If there is a hole before the data starts, advance *outoffp and * *xferp past the hole. */ if (*dataoffp > *outoffp) { delta = *dataoffp - *outoffp; if (delta >= *xferp) { /* Entire *xferp is a hole. */ *outoffp += *xferp; *xferp = 0; return (0); } *xferp -= delta; *outoffp += delta; xfer2 = MIN(xfer2, *xferp); } /* * If a hole starts before the end of this xfer2, reduce this xfer2 so * that the write ends at the start of the hole. * *holeoffp should always be greater than *outoffp, but for the * non-INVARIANTS case, check this to make sure xfer2 remains a sane * value. */ if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2) xfer2 = *holeoffp - *outoffp; return (xfer2); } /* * Write an xfer sized chunk to outvp in blksize blocks from dat. * dat is a maximum of blksize in length and can be written repeatedly in * the chunk. * If growfile == true, just grow the file via vn_truncate_locked() instead * of doing actual writes. * If checkhole == true, a hole is being punched, so skip over any hole * already in the output file. */ static int vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer, u_long blksize, bool growfile, bool checkhole, struct ucred *cred) { struct mount *mp; off_t dataoff, holeoff, xfer2; int error, lckf; /* * Loop around doing writes of blksize until write has been completed. * Lock/unlock on each loop iteration so that a bwillwrite() can be * done for each iteration, since the xfer argument can be very * large if there is a large hole to punch in the output file. */ error = 0; holeoff = 0; do { xfer2 = MIN(xfer, blksize); if (checkhole) { /* * Punching a hole. Skip writing if there is * already a hole in the output file. */ xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer, &dataoff, &holeoff, cred); if (xfer == 0) break; if (holeoff < 0) checkhole = false; KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd", (intmax_t)xfer2)); } bwillwrite(); mp = NULL; error = vn_start_write(outvp, &mp, V_WAIT); if (error != 0) break; if (growfile) { error = vn_lock(outvp, LK_EXCLUSIVE); if (error == 0) { error = vn_truncate_locked(outvp, outoff + xfer, false, cred); VOP_UNLOCK(outvp); } } else { if (MNT_SHARED_WRITES(mp)) lckf = LK_SHARED; else lckf = LK_EXCLUSIVE; error = vn_lock(outvp, lckf); if (error == 0) { error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2, outoff, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, cred, NULL, curthread); outoff += xfer2; xfer -= xfer2; VOP_UNLOCK(outvp); } } if (mp != NULL) vn_finished_write(mp); } while (!growfile && xfer > 0 && error == 0); return (error); } /* * Copy a byte range of one file to another. This function can handle the * case where invp and outvp are on different file systems. * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there * is no better file system specific way to do it. */ int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) { struct vattr va; struct mount *mp; struct uio io; off_t startoff, endoff, xfer, xfer2; u_long blksize; int error, interrupted; bool cantseek, readzeros, eof, lastblock; ssize_t aresid; size_t copylen, len, rem, savlen; char *dat; long holein, holeout; holein = holeout = 0; savlen = len = *lenp; error = 0; interrupted = 0; dat = NULL; error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0) holein = 0; VOP_UNLOCK(invp); mp = NULL; error = vn_start_write(outvp, &mp, V_WAIT); if (error == 0) error = vn_lock(outvp, LK_EXCLUSIVE); if (error == 0) { /* * If fsize_td != NULL, do a vn_rlimit_fsize() call, * now that outvp is locked. */ if (fsize_td != NULL) { io.uio_offset = *outoffp; io.uio_resid = len; error = vn_rlimit_fsize(outvp, &io, fsize_td); if (error != 0) error = EFBIG; } if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0) holeout = 0; /* * Holes that are past EOF do not need to be written as a block * of zero bytes. So, truncate the output file as far as * possible and then use va.va_size to decide if writing 0 * bytes is necessary in the loop below. */ if (error == 0) error = VOP_GETATTR(outvp, &va, outcred); if (error == 0 && va.va_size > *outoffp && va.va_size <= *outoffp + len) { #ifdef MAC error = mac_vnode_check_write(curthread->td_ucred, outcred, outvp); if (error == 0) #endif error = vn_truncate_locked(outvp, *outoffp, false, outcred); if (error == 0) va.va_size = *outoffp; } VOP_UNLOCK(outvp); } if (mp != NULL) vn_finished_write(mp); if (error != 0) goto out; /* * Set the blksize to the larger of the hole sizes for invp and outvp. * If hole sizes aren't available, set the blksize to the larger * f_iosize of invp and outvp. * This code expects the hole sizes and f_iosizes to be powers of 2. * This value is clipped at 4Kbytes and 1Mbyte. */ blksize = MAX(holein, holeout); /* Clip len to end at an exact multiple of hole size. */ if (blksize > 1) { rem = *inoffp % blksize; if (rem > 0) rem = blksize - rem; if (len > rem && len - rem > blksize) len = savlen = rounddown(len - rem, blksize) + rem; } if (blksize <= 1) blksize = MAX(invp->v_mount->mnt_stat.f_iosize, outvp->v_mount->mnt_stat.f_iosize); if (blksize < 4096) blksize = 4096; else if (blksize > 1024 * 1024) blksize = 1024 * 1024; dat = malloc(blksize, M_TEMP, M_WAITOK); /* * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA * to find holes. Otherwise, just scan the read block for all 0s * in the inner loop where the data copying is done. * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may * support holes on the server, but do not support FIOSEEKHOLE. */ eof = false; while (len > 0 && error == 0 && !eof && interrupted == 0) { endoff = 0; /* To shut up compilers. */ cantseek = true; startoff = *inoffp; copylen = len; /* * Find the next data area. If there is just a hole to EOF, * FIOSEEKDATA should fail and then we drop down into the * inner loop and create the hole on the outvp file. * (I do not know if any file system will report a hole to * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA * will fail for those file systems.) * * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE, * the code just falls through to the inner copy loop. */ error = EINVAL; if (holein > 0) error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0, incred, curthread); if (error == 0) { endoff = startoff; error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0, incred, curthread); /* * Since invp is unlocked, it may be possible for * another thread to do a truncate(), lseek(), write() * creating a hole at startoff between the above * VOP_IOCTL() calls, if the other thread does not do * rangelocking. * If that happens, startoff == endoff and finding * the hole has failed, so set an error. */ if (error == 0 && startoff == endoff) error = EINVAL; /* Any error. Reset to 0. */ } if (error == 0) { if (startoff > *inoffp) { /* Found hole before data block. */ xfer = MIN(startoff - *inoffp, len); if (*outoffp < va.va_size) { /* Must write 0s to punch hole. */ xfer2 = MIN(va.va_size - *outoffp, xfer); memset(dat, 0, MIN(xfer2, blksize)); error = vn_write_outvp(outvp, dat, *outoffp, xfer2, blksize, false, holeout > 0, outcred); } if (error == 0 && *outoffp + xfer > va.va_size && xfer == len) /* Grow last block. */ error = vn_write_outvp(outvp, dat, *outoffp, xfer, blksize, true, false, outcred); if (error == 0) { *inoffp += xfer; *outoffp += xfer; len -= xfer; if (len < savlen) interrupted = sig_intr(); } } copylen = MIN(len, endoff - startoff); cantseek = false; } else { cantseek = true; startoff = *inoffp; copylen = len; error = 0; } xfer = blksize; if (cantseek) { /* * Set first xfer to end at a block boundary, so that * holes are more likely detected in the loop below via * the for all bytes 0 method. */ xfer -= (*inoffp % blksize); } /* Loop copying the data block. */ while (copylen > 0 && error == 0 && !eof && interrupted == 0) { if (copylen < xfer) xfer = copylen; error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; error = vn_rdwr(UIO_READ, invp, dat, xfer, startoff, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, incred, &aresid, curthread); VOP_UNLOCK(invp); lastblock = false; if (error == 0 && aresid > 0) { /* Stop the copy at EOF on the input file. */ xfer -= aresid; eof = true; lastblock = true; } if (error == 0) { /* * Skip the write for holes past the initial EOF * of the output file, unless this is the last * write of the output file at EOF. */ readzeros = cantseek ? mem_iszero(dat, xfer) : false; if (xfer == len) lastblock = true; if (!cantseek || *outoffp < va.va_size || lastblock || !readzeros) error = vn_write_outvp(outvp, dat, *outoffp, xfer, blksize, readzeros && lastblock && *outoffp >= va.va_size, false, outcred); if (error == 0) { *inoffp += xfer; startoff += xfer; *outoffp += xfer; copylen -= xfer; len -= xfer; if (len < savlen) interrupted = sig_intr(); } } xfer = blksize; } } out: *lenp = savlen - len; free(dat, M_TEMP); return (error); } static int vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) { struct mount *mp; struct vnode *vp; off_t olen, ooffset; int error; #ifdef AUDIT int audited_vnode1 = 0; #endif vp = fp->f_vnode; if (vp->v_type != VREG) return (ENODEV); /* Allocating blocks may take a long time, so iterate. */ for (;;) { olen = len; ooffset = offset; bwillwrite(); mp = NULL; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) break; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { vn_finished_write(mp); break; } #ifdef AUDIT if (!audited_vnode1) { AUDIT_ARG_VNODE1(vp); audited_vnode1 = 1; } #endif #ifdef MAC error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif error = VOP_ALLOCATE(vp, &offset, &len); VOP_UNLOCK(vp); vn_finished_write(mp); if (olen + ooffset != offset + len) { panic("offset + len changed from %jx/%jx to %jx/%jx", ooffset, olen, offset, len); } if (error != 0 || len == 0) break; KASSERT(olen > len, ("Iteration did not make progress?")); maybe_yield(); } return (error); } static u_long vn_lock_pair_pause_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD, &vn_lock_pair_pause_cnt, 0, "Count of vn_lock_pair deadlocks"); u_int vn_lock_pair_pause_max; SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW, &vn_lock_pair_pause_max, 0, "Max ticks for vn_lock_pair deadlock avoidance sleep"); static void vn_lock_pair_pause(const char *wmesg) { atomic_add_long(&vn_lock_pair_pause_cnt, 1); pause(wmesg, prng32_bounded(vn_lock_pair_pause_max)); } /* * Lock pair of vnodes vp1, vp2, avoiding lock order reversal. * vp1_locked indicates whether vp1 is exclusively locked; if not, vp1 * must be unlocked. Same for vp2 and vp2_locked. One of the vnodes * can be NULL. * * The function returns with both vnodes exclusively locked, and * guarantees that it does not create lock order reversal with other * threads during its execution. Both vnodes could be unlocked * temporary (and reclaimed). */ void vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2, bool vp2_locked) { int error; if (vp1 == NULL && vp2 == NULL) return; if (vp1 != NULL) { if (vp1_locked) ASSERT_VOP_ELOCKED(vp1, "vp1"); else ASSERT_VOP_UNLOCKED(vp1, "vp1"); } else { vp1_locked = true; } if (vp2 != NULL) { if (vp2_locked) ASSERT_VOP_ELOCKED(vp2, "vp2"); else ASSERT_VOP_UNLOCKED(vp2, "vp2"); } else { vp2_locked = true; } if (!vp1_locked && !vp2_locked) { vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY); vp1_locked = true; } for (;;) { if (vp1_locked && vp2_locked) break; if (vp1_locked && vp2 != NULL) { if (vp1 != NULL) { error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT, __FILE__, __LINE__); if (error == 0) break; VOP_UNLOCK(vp1); vp1_locked = false; vn_lock_pair_pause("vlp1"); } vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY); vp2_locked = true; } if (vp2_locked && vp1 != NULL) { if (vp2 != NULL) { error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT, __FILE__, __LINE__); if (error == 0) break; VOP_UNLOCK(vp2); vp2_locked = false; vn_lock_pair_pause("vlp2"); } vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY); vp1_locked = true; } } if (vp1 != NULL) ASSERT_VOP_ELOCKED(vp1, "vp1 ret"); if (vp2 != NULL) ASSERT_VOP_ELOCKED(vp2, "vp2 ret"); } diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h index c328abaa02af..58d46ae26338 100644 --- a/sys/sys/fcntl.h +++ b/sys/sys/fcntl.h @@ -1,370 +1,372 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1983, 1990, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fcntl.h 8.3 (Berkeley) 1/21/94 * $FreeBSD$ */ #ifndef _SYS_FCNTL_H_ #define _SYS_FCNTL_H_ /* * This file includes the definitions for open and fcntl * described by POSIX for ; it also includes * related kernel definitions. */ #include #include #ifndef _MODE_T_DECLARED typedef __mode_t mode_t; #define _MODE_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _PID_T_DECLARED typedef __pid_t pid_t; #define _PID_T_DECLARED #endif /* * File status flags: these are used by open(2), fcntl(2). * They are also used (indirectly) in the kernel file structure f_flags, * which is a superset of the open/fcntl flags. Open flags and f_flags * are inter-convertible using OFLAGS(fflags) and FFLAGS(oflags). * Open/fcntl flags begin with O_; kernel-internal flags begin with F. */ /* open-only flags */ #define O_RDONLY 0x0000 /* open for reading only */ #define O_WRONLY 0x0001 /* open for writing only */ #define O_RDWR 0x0002 /* open for reading and writing */ #define O_ACCMODE 0x0003 /* mask for above modes */ /* * Kernel encoding of open mode; separate read and write bits that are * independently testable: 1 greater than the above. * * XXX * FREAD and FWRITE are excluded from the #ifdef _KERNEL so that TIOCFLUSH, * which was documented to use FREAD/FWRITE, continues to work. */ #if __BSD_VISIBLE #define FREAD 0x0001 #define FWRITE 0x0002 #endif #define O_NONBLOCK 0x0004 /* no delay */ #define O_APPEND 0x0008 /* set append mode */ #if __BSD_VISIBLE #define O_SHLOCK 0x0010 /* open with shared file lock */ #define O_EXLOCK 0x0020 /* open with exclusive file lock */ #define O_ASYNC 0x0040 /* signal pgrp when data ready */ #define O_FSYNC 0x0080 /* synchronous writes */ #endif #define O_SYNC 0x0080 /* POSIX synonym for O_FSYNC */ #if __POSIX_VISIBLE >= 200809 #define O_NOFOLLOW 0x0100 /* don't follow symlinks */ #endif #define O_CREAT 0x0200 /* create if nonexistent */ #define O_TRUNC 0x0400 /* truncate to zero length */ #define O_EXCL 0x0800 /* error if already exists */ #ifdef _KERNEL #define FHASLOCK 0x4000 /* descriptor holds advisory lock */ #endif /* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */ #define O_NOCTTY 0x8000 /* don't assign controlling terminal */ #if __BSD_VISIBLE /* Attempt to bypass buffer cache */ #define O_DIRECT 0x00010000 #endif #if __POSIX_VISIBLE >= 200809 #define O_DIRECTORY 0x00020000 /* Fail if not directory */ #define O_EXEC 0x00040000 /* Open for execute only */ #define O_SEARCH O_EXEC #endif #ifdef _KERNEL #define FEXEC O_EXEC #define FSEARCH O_SEARCH #endif #if __POSIX_VISIBLE >= 200809 /* Defined by POSIX 1003.1-2008; BSD default, but reserve for future use. */ #define O_TTY_INIT 0x00080000 /* Restore default termios attributes */ #define O_CLOEXEC 0x00100000 #endif #if __BSD_VISIBLE #define O_VERIFY 0x00200000 /* open only after verification */ #define O_PATH 0x00400000 /* fd is only a path */ #define O_RESOLVE_BENEATH 0x00800000 /* Do not allow name resolution to walk out of cwd */ #endif #define O_DSYNC 0x01000000 /* POSIX data sync */ /* * XXX missing O_RSYNC. */ #ifdef _KERNEL /* Only for devfs d_close() flags. */ #define FLASTCLOSE O_DIRECTORY #define FREVOKE O_VERIFY /* Only for fo_close() from half-succeeded open */ #define FOPENFAILED O_TTY_INIT +/* Only for O_PATH files which passed ACCESS FREAD check on open */ +#define FKQALLOWED O_RESOLVE_BENEATH /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ #define FFLAGS(oflags) ((oflags) & O_EXEC ? (oflags) : (oflags) + 1) #define OFLAGS(fflags) \ (((fflags) & (O_EXEC | O_PATH)) != 0 ? (fflags) : (fflags) - 1) /* bits to save after open */ #define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FDSYNC|FNONBLOCK| \ O_DIRECT|FEXEC|O_PATH) /* bits settable by fcntl(F_SETFL, ...) */ #define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FDSYNC|FNONBLOCK|FRDAHEAD|O_DIRECT) #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) /* * Set by shm_open(3) in older libc's to get automatic MAP_ASYNC * behavior for POSIX shared memory objects (which are otherwise * implemented as plain files). */ #define FPOSIXSHM O_NOFOLLOW #undef FCNTLFLAGS #define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|FRDAHEAD| \ O_DIRECT) #endif #endif /* * The O_* flags used to have only F* names, which were used in the kernel * and by fcntl. We retain the F* names for the kernel f_flag field * and for backward compatibility for fcntl. These flags are deprecated. */ #if __BSD_VISIBLE #define FAPPEND O_APPEND /* kernel/compat */ #define FASYNC O_ASYNC /* kernel/compat */ #define FFSYNC O_FSYNC /* kernel */ #define FDSYNC O_DSYNC /* kernel */ #define FNONBLOCK O_NONBLOCK /* kernel */ #define FNDELAY O_NONBLOCK /* compat */ #define O_NDELAY O_NONBLOCK /* compat */ #endif /* * Historically, we ran out of bits in f_flag (which was once a short). * However, the flag bits not set in FMASK are only meaningful in the * initial open syscall. Those bits were thus given a * different meaning for fcntl(2). */ #if __BSD_VISIBLE /* Read ahead */ #define FRDAHEAD O_CREAT #endif #if __POSIX_VISIBLE >= 200809 /* * Magic value that specify the use of the current working directory * to determine the target of relative file paths in the openat() and * similar syscalls. */ #define AT_FDCWD -100 /* * Miscellaneous flags for the *at() syscalls. */ #define AT_EACCESS 0x0100 /* Check access using effective user and group ID */ #define AT_SYMLINK_NOFOLLOW 0x0200 /* Do not follow symbolic links */ #define AT_SYMLINK_FOLLOW 0x0400 /* Follow symbolic link */ #define AT_REMOVEDIR 0x0800 /* Remove directory instead of file */ #endif /* __POSIX_VISIBLE >= 200809 */ #if __BSD_VISIBLE /* #define AT_UNUSED1 0x1000 *//* Was AT_BENEATH */ #define AT_RESOLVE_BENEATH 0x2000 /* Do not allow name resolution to walk out of dirfd */ #define AT_EMPTY_PATH 0x4000 /* Operate on dirfd if path is empty */ #endif /* __BSD_VISIBLE */ /* * Constants used for fcntl(2) */ /* command values */ #define F_DUPFD 0 /* duplicate file descriptor */ #define F_GETFD 1 /* get file descriptor flags */ #define F_SETFD 2 /* set file descriptor flags */ #define F_GETFL 3 /* get file status flags */ #define F_SETFL 4 /* set file status flags */ #if __XSI_VISIBLE || __POSIX_VISIBLE >= 200112 #define F_GETOWN 5 /* get SIGIO/SIGURG proc/pgrp */ #define F_SETOWN 6 /* set SIGIO/SIGURG proc/pgrp */ #endif #if __BSD_VISIBLE #define F_OGETLK 7 /* get record locking information */ #define F_OSETLK 8 /* set record locking information */ #define F_OSETLKW 9 /* F_SETLK; wait if blocked */ #define F_DUP2FD 10 /* duplicate file descriptor to arg */ #endif #define F_GETLK 11 /* get record locking information */ #define F_SETLK 12 /* set record locking information */ #define F_SETLKW 13 /* F_SETLK; wait if blocked */ #if __BSD_VISIBLE #define F_SETLK_REMOTE 14 /* debugging support for remote locks */ #define F_READAHEAD 15 /* read ahead */ #define F_RDAHEAD 16 /* Darwin compatible read ahead */ #endif #if __POSIX_VISIBLE >= 200809 #define F_DUPFD_CLOEXEC 17 /* Like F_DUPFD, but FD_CLOEXEC is set */ #endif #if __BSD_VISIBLE #define F_DUP2FD_CLOEXEC 18 /* Like F_DUP2FD, but FD_CLOEXEC is set */ #define F_ADD_SEALS 19 #define F_GET_SEALS 20 #define F_ISUNIONSTACK 21 /* Kludge for libc, don't use it. */ /* Seals (F_ADD_SEALS, F_GET_SEALS). */ #define F_SEAL_SEAL 0x0001 /* Prevent adding sealings */ #define F_SEAL_SHRINK 0x0002 /* May not shrink */ #define F_SEAL_GROW 0x0004 /* May not grow */ #define F_SEAL_WRITE 0x0008 /* May not write */ #endif /* __BSD_VISIBLE */ /* file descriptor flags (F_GETFD, F_SETFD) */ #define FD_CLOEXEC 1 /* close-on-exec flag */ /* record locking flags (F_GETLK, F_SETLK, F_SETLKW) */ #define F_RDLCK 1 /* shared or read lock */ #define F_UNLCK 2 /* unlock */ #define F_WRLCK 3 /* exclusive or write lock */ #if __BSD_VISIBLE #define F_UNLCKSYS 4 /* purge locks for a given system ID */ #define F_CANCEL 5 /* cancel an async lock request */ #endif #ifdef _KERNEL #define F_WAIT 0x010 /* Wait until lock is granted */ #define F_FLOCK 0x020 /* Use flock(2) semantics for lock */ #define F_POSIX 0x040 /* Use POSIX semantics for lock */ #define F_REMOTE 0x080 /* Lock owner is remote NFS client */ #define F_NOINTR 0x100 /* Ignore signals when waiting */ #define F_FIRSTOPEN 0x200 /* First right to advlock file */ #endif /* * Advisory file segment locking data type - * information passed to system by user */ struct flock { off_t l_start; /* starting offset */ off_t l_len; /* len = 0 means until end of file */ pid_t l_pid; /* lock owner */ short l_type; /* lock type: read/write, etc. */ short l_whence; /* type of l_start */ int l_sysid; /* remote system id or zero for local */ }; #if __BSD_VISIBLE /* * Old advisory file segment locking data type, * before adding l_sysid. */ struct __oflock { off_t l_start; /* starting offset */ off_t l_len; /* len = 0 means until end of file */ pid_t l_pid; /* lock owner */ short l_type; /* lock type: read/write, etc. */ short l_whence; /* type of l_start */ }; #endif #if __BSD_VISIBLE /* lock operations for flock(2) */ #define LOCK_SH 0x01 /* shared file lock */ #define LOCK_EX 0x02 /* exclusive file lock */ #define LOCK_NB 0x04 /* don't block when locking */ #define LOCK_UN 0x08 /* unlock file */ #endif #if __POSIX_VISIBLE >= 200112 /* * Advice to posix_fadvise */ #define POSIX_FADV_NORMAL 0 /* no special treatment */ #define POSIX_FADV_RANDOM 1 /* expect random page references */ #define POSIX_FADV_SEQUENTIAL 2 /* expect sequential page references */ #define POSIX_FADV_WILLNEED 3 /* will need these pages */ #define POSIX_FADV_DONTNEED 4 /* dont need these pages */ #define POSIX_FADV_NOREUSE 5 /* access data only once */ #endif #ifdef __BSD_VISIBLE /* * Magic value that specify that corresponding file descriptor to filename * is unknown and sanitary check should be omitted in the funlinkat() and * similar syscalls. */ #define FD_NONE -200 #endif #ifndef _KERNEL __BEGIN_DECLS int open(const char *, int, ...); int creat(const char *, mode_t); int fcntl(int, int, ...); #if __BSD_VISIBLE int flock(int, int); #endif #if __POSIX_VISIBLE >= 200809 int openat(int, const char *, int, ...); #endif #if __POSIX_VISIBLE >= 200112 int posix_fadvise(int, off_t, off_t, int); int posix_fallocate(int, off_t, off_t); #endif __END_DECLS #endif #endif /* !_SYS_FCNTL_H_ */ diff --git a/sys/sys/file.h b/sys/sys/file.h index 9237ee5ceb9d..b16e23bdfbcf 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -1,476 +1,477 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)file.h 8.3 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_FILE_H_ #define _SYS_FILE_H_ #ifndef _KERNEL #include /* XXX */ #include #include #else #include #include #include #include #include struct filedesc; struct stat; struct thread; struct uio; struct knote; struct vnode; struct nameidata; #endif /* _KERNEL */ #define DTYPE_NONE 0 /* not yet initialized */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ #define DTYPE_CRYPTO 6 /* crypto */ #define DTYPE_MQUEUE 7 /* posix message queue */ #define DTYPE_SHM 8 /* swap-backed shared memory */ #define DTYPE_SEM 9 /* posix semaphore */ #define DTYPE_PTS 10 /* pseudo teletype master device */ #define DTYPE_DEV 11 /* Device specific fd type */ #define DTYPE_PROCDESC 12 /* process descriptor */ #define DTYPE_EVENTFD 13 /* eventfd */ #define DTYPE_LINUXTFD 14 /* emulation timerfd type */ #ifdef _KERNEL struct file; struct filecaps; struct kaiocb; struct kinfo_file; struct ucred; #define FOF_OFFSET 0x01 /* Use the offset in uio argument */ #define FOF_NOLOCK 0x02 /* Do not take FOFFSET_LOCK */ #define FOF_NEXTOFF_R 0x04 /* Also update f_nextoff[UIO_READ] */ #define FOF_NEXTOFF_W 0x08 /* Also update f_nextoff[UIO_WRITE] */ #define FOF_NOUPDATE 0x10 /* Do not update f_offset */ off_t foffset_lock(struct file *fp, int flags); void foffset_lock_uio(struct file *fp, struct uio *uio, int flags); void foffset_unlock(struct file *fp, off_t val, int flags); void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags); static inline off_t foffset_get(struct file *fp) { return (foffset_lock(fp, FOF_NOLOCK)); } typedef int fo_rdwr_t(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); typedef int fo_truncate_t(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td); typedef int fo_ioctl_t(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td); typedef int fo_poll_t(struct file *fp, int events, struct ucred *active_cred, struct thread *td); typedef int fo_kqfilter_t(struct file *fp, struct knote *kn); typedef int fo_stat_t(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td); typedef int fo_close_t(struct file *fp, struct thread *td); typedef int fo_chmod_t(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); typedef int fo_chown_t(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); typedef int fo_sendfile_t(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td); typedef int fo_seek_t(struct file *fp, off_t offset, int whence, struct thread *td); typedef int fo_fill_kinfo_t(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp); typedef int fo_mmap_t(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td); typedef int fo_aio_queue_t(struct file *fp, struct kaiocb *job); typedef int fo_add_seals_t(struct file *fp, int flags); typedef int fo_get_seals_t(struct file *fp, int *flags); typedef int fo_fallocate_t(struct file *fp, off_t offset, off_t len, struct thread *td); typedef int fo_flags_t; struct fileops { fo_rdwr_t *fo_read; fo_rdwr_t *fo_write; fo_truncate_t *fo_truncate; fo_ioctl_t *fo_ioctl; fo_poll_t *fo_poll; fo_kqfilter_t *fo_kqfilter; fo_stat_t *fo_stat; fo_close_t *fo_close; fo_chmod_t *fo_chmod; fo_chown_t *fo_chown; fo_sendfile_t *fo_sendfile; fo_seek_t *fo_seek; fo_fill_kinfo_t *fo_fill_kinfo; fo_mmap_t *fo_mmap; fo_aio_queue_t *fo_aio_queue; fo_add_seals_t *fo_add_seals; fo_get_seals_t *fo_get_seals; fo_fallocate_t *fo_fallocate; fo_flags_t fo_flags; /* DFLAG_* below */ }; #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. */ #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_FILE) /* * Kernel descriptor table. * One entry for each open kernel vnode and socket. * * Below is the list of locks that protects members in struct file. * * (a) f_vnode lock required (shared allows both reads and writes) * (f) updated with atomics and blocking on sleepq * (d) cdevpriv_mtx * none not locked */ struct fadvise_info { int fa_advice; /* (f) FADV_* type. */ off_t fa_start; /* (f) Region start. */ off_t fa_end; /* (f) Region end. */ }; struct file { volatile u_int f_flag; /* see fcntl.h */ volatile u_int f_count; /* reference count */ void *f_data; /* file descriptor specific data */ struct fileops *f_ops; /* File operations */ struct vnode *f_vnode; /* NULL or applicable vnode */ struct ucred *f_cred; /* associated credentials. */ short f_type; /* descriptor type */ short f_vnread_flags; /* (f) Sleep lock for f_offset */ /* * DTYPE_VNODE specific fields. */ union { int16_t f_seqcount[2]; /* (a) Count of seq. reads and writes. */ int f_pipegen; }; off_t f_nextoff[2]; /* next expected read/write offset. */ union { struct cdev_privdata *fvn_cdevpriv; /* (d) Private data for the cdev. */ struct fadvise_info *fvn_advice; } f_vnun; /* * DFLAG_SEEKABLE specific fields */ off_t f_offset; }; #define f_cdevpriv f_vnun.fvn_cdevpriv #define f_advice f_vnun.fvn_advice #define FOFFSET_LOCKED 0x1 #define FOFFSET_LOCK_WAITING 0x2 #endif /* _KERNEL || _WANT_FILE */ /* * Userland version of struct file, for sysctl */ struct xfile { ksize_t xf_size; /* size of struct xfile */ pid_t xf_pid; /* owning process */ uid_t xf_uid; /* effective uid of owning process */ int xf_fd; /* descriptor number */ int _xf_int_pad1; kvaddr_t xf_file; /* address of struct file */ short xf_type; /* descriptor type */ short _xf_short_pad1; int xf_count; /* reference count */ int xf_msgcount; /* references from message queue */ int _xf_int_pad2; off_t xf_offset; /* file offset */ kvaddr_t xf_data; /* file descriptor specific data */ kvaddr_t xf_vnode; /* vnode pointer */ u_int xf_flag; /* flags (see fcntl.h) */ int _xf_int_pad3; int64_t _xf_int64_pad[6]; }; #ifdef _KERNEL extern struct fileops vnops; extern struct fileops badfileops; extern struct fileops path_fileops; extern struct fileops socketops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, struct file **fpp); int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp); int _fdrop(struct file *fp, struct thread *td); fo_rdwr_t invfo_rdwr; fo_truncate_t invfo_truncate; fo_ioctl_t invfo_ioctl; fo_poll_t invfo_poll; fo_kqfilter_t invfo_kqfilter; fo_chmod_t invfo_chmod; fo_chown_t invfo_chown; fo_sendfile_t invfo_sendfile; fo_stat_t vn_statfile; fo_sendfile_t vn_sendfile; fo_seek_t vn_seek; fo_fill_kinfo_t vn_fill_kinfo; +fo_kqfilter_t vn_kqfilter_opath; int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif); void finit(struct file *, u_int, short, void *, struct fileops *); void finit_vnode(struct file *, u_int, void *, struct fileops *); int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp); int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch); static __inline __result_use_check bool fhold(struct file *fp) { return (refcount_acquire_checked(&fp->f_count)); } #define fdrop(fp, td) ({ \ struct file *_fp; \ int _error; \ \ _error = 0; \ _fp = (fp); \ if (__predict_false(refcount_release(&_fp->f_count))) \ _error = _fdrop(_fp, td); \ _error; \ }) #define fdrop_close(fp, td) ({ \ struct file *_fp; \ int _error; \ \ _error = 0; \ _fp = (fp); \ if (__predict_true(refcount_release(&_fp->f_count))) \ _error = _fdrop(_fp, td); \ _error; \ }) static __inline fo_rdwr_t fo_read; static __inline fo_rdwr_t fo_write; static __inline fo_truncate_t fo_truncate; static __inline fo_ioctl_t fo_ioctl; static __inline fo_poll_t fo_poll; static __inline fo_kqfilter_t fo_kqfilter; static __inline fo_stat_t fo_stat; static __inline fo_close_t fo_close; static __inline fo_chmod_t fo_chmod; static __inline fo_chown_t fo_chown; static __inline fo_sendfile_t fo_sendfile; static __inline int fo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_read)(fp, uio, active_cred, flags, td)); } static __inline int fo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_write)(fp, uio, active_cred, flags, td)); } static __inline int fo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_truncate)(fp, length, active_cred, td)); } static __inline int fo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_ioctl)(fp, com, data, active_cred, td)); } static __inline int fo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_poll)(fp, events, active_cred, td)); } static __inline int fo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_stat)(fp, sb, active_cred, td)); } static __inline int fo_close(struct file *fp, struct thread *td) { return ((*fp->f_ops->fo_close)(fp, td)); } static __inline int fo_kqfilter(struct file *fp, struct knote *kn) { return ((*fp->f_ops->fo_kqfilter)(fp, kn)); } static __inline int fo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chmod)(fp, mode, active_cred, td)); } static __inline int fo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chown)(fp, uid, gid, active_cred, td)); } static __inline int fo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return ((*fp->f_ops->fo_sendfile)(fp, sockfd, hdr_uio, trl_uio, offset, nbytes, sent, flags, td)); } static __inline int fo_seek(struct file *fp, off_t offset, int whence, struct thread *td) { return ((*fp->f_ops->fo_seek)(fp, offset, whence, td)); } static __inline int fo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return ((*fp->f_ops->fo_fill_kinfo)(fp, kif, fdp)); } static __inline int fo_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { if (fp->f_ops->fo_mmap == NULL) return (ENODEV); return ((*fp->f_ops->fo_mmap)(fp, map, addr, size, prot, cap_maxprot, flags, foff, td)); } static __inline int fo_aio_queue(struct file *fp, struct kaiocb *job) { return ((*fp->f_ops->fo_aio_queue)(fp, job)); } static __inline int fo_add_seals(struct file *fp, int seals) { if (fp->f_ops->fo_add_seals == NULL) return (EINVAL); return ((*fp->f_ops->fo_add_seals)(fp, seals)); } static __inline int fo_get_seals(struct file *fp, int *seals) { if (fp->f_ops->fo_get_seals == NULL) return (EINVAL); return ((*fp->f_ops->fo_get_seals)(fp, seals)); } static __inline int fo_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) { if (fp->f_ops->fo_fallocate == NULL) return (ENODEV); return ((*fp->f_ops->fo_fallocate)(fp, offset, len, td)); } #endif /* _KERNEL */ #endif /* !SYS_FILE_H */