Index: head/sys/cddl/compat/opensolaris/sys/file.h =================================================================== --- head/sys/cddl/compat/opensolaris/sys/file.h (revision 333424) +++ head/sys/cddl/compat/opensolaris/sys/file.h (revision 333425) @@ -1,65 +1,64 @@ /*- * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _OPENSOLARIS_SYS_FILE_H_ #define _OPENSOLARIS_SYS_FILE_H_ #include_next #define FKIOCTL 0x80000000 /* ioctl addresses are from kernel */ #ifdef _KERNEL typedef struct file file_t; #include static __inline file_t * getf(int fd, cap_rights_t *rightsp) { struct file *fp; if (fget(curthread, fd, rightsp, &fp) == 0) return (fp); return (NULL); } static __inline void releasef(int fd) { struct file *fp; - cap_rights_t rights; /* No CAP_ rights required, as we're only releasing. */ - if (fget(curthread, fd, cap_rights_init(&rights), &fp) == 0) { + if (fget(curthread, fd, &cap_no_rights, &fp) == 0) { fdrop(fp, curthread); fdrop(fp, curthread); } } #endif /* _KERNEL */ #endif /* !_OPENSOLARIS_SYS_FILE_H_ */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 333424) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 333425) @@ -1,7060 +1,7054 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. * Copyright 2014 Xin Li . All rights reserved. * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Datto Inc. */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * The ioctl numbers can change from release to release, because * the caller (libzfs) must be matched to the kernel. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. */ #ifdef __FreeBSD__ #include "opt_kstack_pages.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include "zfs_ioctl_compat.h" #include "lua.h" #include "lauxlib.h" static struct cdev *zfsdev; extern void zfs_init(void); extern void zfs_fini(void); uint_t zfs_fsyncer_key; extern uint_t rrw_tsd_key; static uint_t zfs_allow_log_key; extern uint_t zfs_geom_probe_vdev_key; typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); typedef enum { NO_NAME, POOL_NAME, DATASET_NAME } zfs_ioc_namecheck_t; typedef enum { POOL_CHECK_NONE = 1 << 0, POOL_CHECK_SUSPENDED = 1 << 1, POOL_CHECK_READONLY = 1 << 2, } zfs_ioc_poolcheck_t; typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void zfsdev_close(void *data); static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature); /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; char buf[512]; va_list adx; /* * Get rid of annoying "../common/" prefix to filename. */ newfile = strrchr(file, '/'); if (newfile != NULL) { newfile = newfile + 1; /* Get rid of leading / */ } else { newfile = file; } va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); /* * To get this data, use the zfs-dprintf probe as so: * dtrace -q -n 'zfs-dprintf \ * /stringof(arg0) == "dbuf.c"/ \ * {printf("%s: %s", stringof(arg1), stringof(arg3))}' * arg0 = file name * arg1 = function name * arg2 = line number * arg3 = message */ DTRACE_PROBE4(zfs__dprintf, char *, newfile, char *, func, int, line, char *, buf); } static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Check to see if the named dataset is currently defined as bootable */ static boolean_t zfs_is_bootfs(const char *name) { objset_t *os; if (dmu_objset_hold(name, FTAG, &os) == 0) { boolean_t ret; ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); dmu_objset_rele(os, FTAG); return (ret); } return (B_FALSE); } /* * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ /* ARGSUSED */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ /* ARGSUSED */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (INGLOBALZONE(curthread) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curthread) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curthread)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, "jailed", &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; /* * First do a quick check for root in the global zone, which * is allowed to do all write_perms. This ensures that zfs_ioc_* * will get to handle nonexistent datasets. */ if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0) return (0); error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } #ifdef SECLABEL /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) { char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); } #endif /* SECLABEL */ static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { char *strval; /* * Check permissions for special properties. */ switch (prop) { case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curthread)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curthread)) { uint64_t zoned; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, "jailed", &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: #ifdef SECLABEL if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } #else return (EOPNOTSUPP); #endif break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } /* ARGSUSED */ static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; error = zfs_dozonecheck(zc->zc_name, cr); if (error != 0) return (error); /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ return (0); } /* ARGSUSED */ static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } /* ARGSUSED */ static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *ds; char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. */ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } /* ARGSUSED */ static int zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { vnode_t *vp; int error; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); return (SET_ERROR(EPERM)); } VN_RELE(vp); return (dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_SHARE, cr)); } int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (!INGLOBALZONE(curthread)) return (SET_ERROR(EPERM)); if (secpolicy_nfs(cr) == 0) { return (0); } else { return (zfs_secpolicy_deleg_share(zc, innvl, cr)); } } int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (!INGLOBALZONE(curthread)) return (SET_ERROR(EPERM)); if (secpolicy_smb(cr) == 0) { return (0); } else { return (zfs_secpolicy_deleg_share(zc, innvl, cr)); } } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strncpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } /* ARGSUSED */ static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ /* ARGSUSED */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). */ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } /* ARGSUSED */ static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char *at = NULL; int error; if ((zc->zc_cookie & 1) != 0) { /* * This is recursive rename, so the starting snapshot might * not exist. Check file system or volume permission instead. */ at = strchr(zc->zc_name, '@'); if (at == NULL) return (EINVAL); *at = '\0'; } error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr); if (at != NULL) *at = '@'; return (error); } /* ARGSUSED */ static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; int error; nvpair_t *pair; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_remap(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_REMAP, cr)); } /* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ /* ARGSUSED */ static int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ /* ARGSUSED */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. */ /* ARGSUSED */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (secpolicy_zinject(cr)); } /* ARGSUSED */ static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else { if (groupmember(zc->zc_guid, cr)) return (0); } } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } /* ARGSUSED */ static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } /* ARGSUSED */ static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; nvlist_t *holds; int error; error = nvlist_lookup_nvlist(innvl, "holds", &holds); if (error != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } /* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. */ int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr)) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); return (error); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = kmem_alloc(size, KM_SLEEP); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { kmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { kmem_free(packed, size); return (error); } kmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { /* * Solaris returns ENOMEM here, because even if an error is * returned from an ioctl(2), new zc_nvlist_dst_size will be * passed to the userland. This is not the case for FreeBSD. * We need to return 0, so the kernel will copy the * zc_nvlist_dst_size back and the userland can discover that a * bigger buffer is needed. */ error = 0; } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } int getzfsvfs_impl(objset_t *os, vfs_t **vfsp) { zfsvfs_t *zfvp; int error = 0; if (dmu_objset_type(os) != DMU_OST_ZFS) { return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); if (zfvp) { *vfsp = zfvp->z_vfs; vfs_ref(zfvp->z_vfs); } else { error = SET_ERROR(ESRCH); } mutex_exit(&os->os_user_ptr_lock); return (error); } int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; vfs_t *vfsp; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, &vfsp); dmu_objset_rele(os, FTAG); if (error != 0) return (error); error = vfs_busy(vfsp, 0); vfs_rel(vfsp); if (error != 0) { *zfvp = NULL; error = SET_ERROR(ESRCH); } else { *zfvp = vfsp->vfs_data; } return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_vfs will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all vnode ops from running. */ static int zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, zfvp); if (error == 0) { rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : RW_READER, tag); #ifdef illumos if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ rrm_exit(&(*zfvp)->z_teardown_lock, tag); return (SET_ERROR(EBUSY)); } #else /* * vfs_busy() ensures that the filesystem is not and * can not be unmounted. */ ASSERT(!(*zfvp)->z_unmounted); #endif } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) { rrm_exit(&zfsvfs->z_teardown_lock, tag); if (zfsvfs->z_vfs) { #ifdef illumos VFS_RELE(zfsvfs->z_vfs); #else vfs_unbusy(zfsvfs->z_vfs); #endif } else { dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; uint64_t version = SPA_VERSION; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) { nvlist_free(config); nvlist_free(props); return (error); } (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; } error = spa_create(zc->zc_name, config, props, zplprops); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) (void) spa_destroy(zc->zc_name); pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); if (error == 0) zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); if (error == 0) zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (SET_ERROR(EEXIST)); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if (zc->zc_flags == POOL_SCRUB_PAUSE) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); else if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = kmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); kmem_free(hist_buf, size); return (error); } static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele(os, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config, **l2cache, **spares; uint_t nl2cache = 0, nspares = 0; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, &spares, &nspares); #ifdef illumos /* * A root pool with concatenated devices is not supported. * Thus, can not add a device to a root pool. * * Intent log device can not be added to a rootpool because * during mountroot, zil is replayed, a seperated log device * can not be accessed during the mountroot time. * * l2cache and spare devices are ok to be added to a rootpool. */ if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { nvlist_free(config); spa_close(spa, FTAG); return (SET_ERROR(EDOM)); } #endif /* illumos */ if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_guid guid of vdev to remove * zc_cookie cancel removal */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); if (zc->zc_cookie != 0) { error = spa_vdev_remove_cancel(spa); } else { error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; int replacing = zc->zc_cookie; nvlist_t *config; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); VERIFY0(error); } error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } if (error == ENOMEM) error = 0; return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. */ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if (err = dmu_objset_hold(zc->zc_name, FTAG, &os)) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. */ if (zc->zc_nvlist_dst != 0 && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } boolean_t dataset_name_hidden(const char *name) { /* * Skip over datasets that are not visible in this zone, * internal datasets (which have a $ in their name), and * temporary datasets (which have a % in their name). */ if (strchr(name, '$') != NULL) return (B_TRUE); if (strchr(name, '%') != NULL) return (B_TRUE); if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL)) return (B_TRUE); return (B_FALSE); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. */ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * zc_simple when set, only name is requested * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? ESRCH : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == 0 && !zc->zc_simple) { dsl_dataset_t *ds; dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); if (error == 0) { objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); if (error == 0) error = zfs_ioc_objset_stats_impl(zc, ossnap); dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { error = SET_ERROR(ESRCH); } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *domain; char *dash; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. * * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). */ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; int err = -1; if (prop == ZPROP_INVAL) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) return (-1); VERIFY(0 == nvpair_value_uint64(pair, &intval)); switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dsname); (void) zfs_ioc_userspace_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; uint64_t intval; char *strval; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); int err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(EINVAL); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (!nvlist_empty(genericnvl) && dsl_props_set(dsname, source, genericnvl) != 0) { /* * If this fails, we still want to set as many properties as we * can, so try setting them individually. */ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); int err = 0; propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } } nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(const char *fsname, nvlist_t *nvl) { nvpair_t *pair = NULL; int error = 0; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_USERPROP, CRED())) return (error); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (E2BIG); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ if (received) { nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ if (prop == ZPROP_INVAL) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: nvlist_free(dummy); return (SET_ERROR(EINVAL)); } pair = nvlist_next_nvpair(dummy, NULL); err = zfs_prop_set_special(zc->zc_name, source, pair); nvlist_free(dummy); if (err != -1) return (err); /* special property already handled */ } else { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } /* property name has been validated by zfs_secpolicy_inherit_prop() */ return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props)) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. */ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_props(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *nvp = NULL; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) error = spa_prop_get(spa, &nvp); mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); spa_close(spa, FTAG); } if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else error = SET_ERROR(EFAULT); nvlist_free(nvp); return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } /* ARGSUSED */ static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; ASSERT(zplprops != NULL); if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) return (SET_ERROR(EINVAL)); /* * Pull out creator prop choices, if any. */ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. */ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[ZFS_MAX_DATASET_NAME_LEN]; char *cp; spa_t *spa; uint64_t spa_vers; int error; (void) strlcpy(parentname, dataset, sizeof (parentname)); cp = strrchr(parentname, '/'); ASSERT(cp != NULL); cp[0] = '\0'; if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. */ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); int32_t type32; dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; if (nvlist_lookup_int32(innvl, "type", &type32) != 0) return (SET_ERROR(EINVAL)); type = type32; (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize( volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); nvlist_free(zct.zct_zplprops); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } #ifdef __FreeBSD__ if (error == 0 && type == DMU_OST_ZVOL) zvol_create_minors(fsname); #endif return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; char *origin_name; if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0) return (SET_ERROR(EINVAL)); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); if (error != 0) return (error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } #ifdef __FreeBSD__ if (error == 0) zvol_create_minors(fsname); #endif return (error); } /* ARGSUSED */ static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); return (dmu_objset_remap_indirects(fsname)); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if ((error = zfs_check_userprops(poolname, props)) != 0) return (error); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* This must be the only snap of this fs. */ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ /* ARGSUSED */ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { char *message; spa_t *spa; int error; char *poolname; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); strfree(poolname); if (error != 0) return (error); if (nvlist_lookup_string(innvl, "message", &message) != 0) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } #ifdef __FreeBSD__ static int zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { char name[MAXNAMELEN]; spa_t *spa; vdev_t *vd; char *command; uint64_t pool_guid; uint64_t vdev_guid; int error; if (nvlist_lookup_uint64(innvl, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) return (EINVAL); if (nvlist_lookup_uint64(innvl, ZPOOL_CONFIG_GUID, &vdev_guid) != 0) return (EINVAL); if (nvlist_lookup_string(innvl, "command", &command) != 0) return (EINVAL); mutex_enter(&spa_namespace_lock); spa = spa_by_guid(pool_guid, vdev_guid); if (spa != NULL) strcpy(name, spa_name(spa)); mutex_exit(&spa_namespace_lock); if (spa == NULL) return (ENOENT); if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); if (vd == NULL) { (void) spa_vdev_state_exit(spa, NULL, ENXIO); spa_close(spa, FTAG); return (ENODEV); } error = vdev_label_write_pad2(vd, command, strlen(command)); (void) spa_vdev_state_exit(spa, NULL, 0); txg_wait_synced(spa->spa_dsl_pool, 0); spa_close(spa, FTAG); return (error); } #endif /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ void zfs_unmount_snap(const char *snapname) { vfs_t *vfsp = NULL; zfsvfs_t *zfsvfs = NULL; if (strchr(snapname, '@') == NULL) return; int err = getzfsvfs(snapname, &zfsvfs); if (err != 0) { ASSERT3P(zfsvfs, ==, NULL); return; } vfsp = zfsvfs->z_vfs; ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); #ifdef illumos err = vn_vfswlock(vfsp->vfs_vnodecovered); VFS_RELE(vfsp); if (err != 0) return; #endif /* * Always force the unmount for snapshots. */ #ifdef illumos (void) dounmount(vfsp, MS_FORCE, kcred); #else vfs_ref(vfsp); vfs_unbusy(vfsp); (void) dounmount(vfsp, MS_FORCE, curthread); #endif } /* ARGSUSED */ static int zfs_unmount_snap_cb(const char *snapname, void *arg) { zfs_unmount_snap(snapname); return (0); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. This routine will do that unmount * if necessary. */ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) * */ /* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); zfs_unmount_snap(nvpair_name(pair)); #if defined(__FreeBSD__) zvol_remove_minors(name); #endif } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. Bookmark names are of the form #. * All bookmarks must be in the same pool. * * innvl: { * bookmark1 -> snapshot1, bookmark2 -> snapshot2 * } * * outnvl: bookmark -> error code (int32) * */ /* ARGSUSED */ static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *snap_name; /* * Verify the snapshot argument. */ if (nvpair_value_string(pair, &snap_name) != 0) return (SET_ERROR(EINVAL)); /* Verify that the keys (bookmarks) are unique */ for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) { if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) return (SET_ERROR(EINVAL)); } } return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; if (0 != nvlist_lookup_string(innvl, ZCP_ARG_PROGRAM, &program)) { return (EINVAL); } if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { instrlimit = ZCP_DEFAULT_INSTRLIMIT; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } if (0 != nvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST, &nvarg)) { return (EINVAL); } if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) return (EINVAL); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) return (EINVAL); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); } /* * innvl: unused * outnvl: empty */ /* ARGSUSED */ static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (spa_checkpoint(poolname)); } /* * innvl: unused * outnvl: empty */ /* ARGSUSED */ static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (spa_checkpoint_discard(poolname)); } /* * inputs: * zc_name name of dataset to destroy * zc_objset_type type of objset * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { int err; if (zc->zc_objset_type == DMU_OST_ZFS) zfs_unmount_snap(zc->zc_name); if (strchr(zc->zc_name, '@')) err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); else err = dsl_destroy_head(zc->zc_name); if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) #ifdef __FreeBSD__ zvol_remove_minors(zc->zc_name); #else (void) zvol_remove_minor(zc->zc_name); #endif return (err); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot * * outnvl: "target" -> name of most recent snapshot * } */ /* ARGSUSED */ static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); if (target != NULL) { const char *cp = strchr(target, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); } if (getzfsvfs(fsname, &zfsvfs) == 0) { dsl_dataset_t *ds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, target, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? error : resume_err; } #ifdef illumos VFS_RELE(zfsvfs->z_vfs); #else vfs_unbusy(zfsvfs->z_vfs); #endif } else { error = dsl_dataset_rollback(fsname, target, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char fullname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname); zfs_unmount_snap(fullname); return (0); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { boolean_t recursive = zc->zc_cookie & 1; char *at; boolean_t allow_mounted = B_TRUE; #ifdef __FreeBSD__ allow_mounted = (zc->zc_cookie & 2) != 0; #endif /* "zfs rename" from and to ...%recv datasets should both fail */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (zc->zc_objset_type == DMU_OST_ZFS && !allow_mounted) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { #ifdef illumos if (zc->zc_objset_type == DMU_OST_ZVOL) (void) zvol_remove_minor(zc->zc_name); #endif return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; int err; if (prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr)) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else { /* USERUSED and GROUPUSED are read-only */ return (SET_ERROR(EINVAL)); } if (err = zfs_secpolicy_write_perms(dsname, perm, cr)) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. */ if (nvpair_value_uint64(pair, &intval) == 0) { if (intval >= ZIO_COMPRESS_GZIP_1 && intval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (intval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (intval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } /* * If this is a bootable dataset then * verify that the compression algorithm * is supported for booting. We must return * something other than ENOTSUP since it * implies a downrev pool version. */ if (zfs_is_bootfs(dsname) && !BOOTFS_COMPRESS_VALID(intval)) { return (SET_ERROR(ERANGE)); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_value_uint64(pair, &intval) != 0) return (SET_ERROR(EINVAL)); /* check prop value is enabled in features */ feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); /* * Salted checksums are not supported on root pools. */ if (spa_bootfs(spa) != 0 && intval < ZIO_CHECKSUM_FUNCTIONS && (zio_checksum_table[intval].ci_flags & ZCHECKSUM_FLAG_SALTED)) { spa_close(spa, FTAG); return (SET_ERROR(ERANGE)); } if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); break; } } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Checks for a race condition to make sure we don't increment a feature flag * multiple times. */ static int zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_feature_t *featurep = arg; if (!spa_feature_is_active(spa, *featurep)) return (0); else return (SET_ERROR(EBUSY)); } /* * The callback invoked on feature activation in the sync task caused by * zfs_prop_activate_feature. */ static void zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_feature_t *featurep = arg; spa_feature_incr(spa, *featurep, tx); } /* * Activates a feature on a pool in response to a property setting. This * creates a new sync task which modifies the pool to reflect the feature * as being active. */ static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature) { int err; /* EBUSY here indicates that the feature is already active */ err = dsl_sync_task(spa_name(spa), zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, &feature, 2, ZFS_SPACE_CHECK_RESERVED); if (err != 0 && err != EBUSY) return (err); else return (0); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. * * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dataset); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strcpy(zc->zc_value, nvpair_name(pair)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. * * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); VERIFY(nvlist_remove_nvpair(props, nvp) == 0); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } #ifdef DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * inputs: * zc_name name of containing filesystem * zc_nvlist_src{_size} nvlist of properties to apply * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * zc_cleanup_fd cleanup-on-exit file descriptor * zc_action_handle handle for this guid/ds mapping (or zero on first call) * zc_resumable if data is incomplete assume sender will resume * * outputs: * zc_cookie number of bytes read * zc_nvlist_dst{_size} error for each unapplied received property * zc_obj zprop_errflags_t * zc_action_handle handle for this guid/ds mapping */ static int zfs_ioc_recv(zfs_cmd_t *zc) { file_t *fp; dmu_recv_cookie_t drc; boolean_t force = (boolean_t)zc->zc_guid; int fd; int error = 0; int props_error = 0; nvlist_t *errors; offset_t off; nvlist_t *props = NULL; /* sent properties */ nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *delayprops = NULL; /* sent properties applied post-receive */ char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; - cap_rights_t rights; boolean_t first_recvd_props = B_FALSE; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); (void) strcpy(tofs, zc->zc_value); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props)) != 0) return (error); fd = zc->zc_cookie; #ifdef illumos fp = getf(fd); #else - fget_read(curthread, fd, cap_rights_init(&rights, CAP_PREAD), &fp); + fget_read(curthread, fd, &cap_pread_rights, &fp); #endif if (fp == NULL) { nvlist_free(props); return (SET_ERROR(EBADF)); } errors = fnvlist_alloc(); if (zc->zc_string[0]) origin = zc->zc_string; error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc); if (error != 0) goto out; /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (props != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, so stash * away the existing ones. */ if (dsl_prop_get_received(tofs, &origprops) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. */ if (!first_recvd_props) props_reduce(props, origprops); if (zfs_check_clearable(tofs, origprops, &errlist) != 0) (void) nvlist_merge(errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origprops, first_recvd_props ? NULL : props) != 0) zc->zc_obj |= ZPROP_ERR_NOCLEAR; } else { zc->zc_obj |= ZPROP_ERR_NOCLEAR; } } if (props != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { delayprops = extract_delay_props(props); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, props, errors); } } off = fp->f_offset; error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd, &zc->zc_action_handle); if (error == 0) { zfsvfs_t *zfsvfs = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int end_err; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); if (error == 0) error = zfs_resume_fs(zfsvfs, ds); error = error ? error : end_err; #ifdef illumos VFS_RELE(zfsvfs->z_vfs); #else vfs_unbusy(zfsvfs->z_vfs); #endif } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. */ if (delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, delayprops, errors); } } if (delayprops != NULL) { /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. */ ASSERT(nvlist_merge(props, delayprops, 0) == 0); nvlist_free(delayprops); } /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. */ props_error = SET_ERROR(EINVAL); } zc->zc_cookie = off - fp->f_offset; if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; #ifdef DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif #ifdef __FreeBSD__ if (error == 0) zvol_create_minors(tofs); #endif /* * On error, restore the original props. */ if (error != 0 && props != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, props, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origprops == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explictly if we're restoring local properties cleared in the * first new-style receive. */ if (origprops != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origprops, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } } out: nvlist_free(props); nvlist_free(origprops); nvlist_free(errors); releasef(fd); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set */ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate(tosnap, fromsnap, compressok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { file_t *fp; - cap_rights_t rights; #ifdef illumos fp = getf(zc->zc_cookie); #else - fget_write(curthread, zc->zc_cookie, - cap_rights_init(&rights, CAP_WRITE), &fp); + fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp); #endif if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, #ifdef illumos zc->zc_cookie, fp->f_vnode, &off); #else zc->zc_cookie, fp, &off); #endif if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; releasef(zc->zc_cookie); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendarg_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dsa_outfd == zc->zc_cookie && dsp->dsa_proc == curproc) break; } if (dsp != NULL) zc->zc_cookie = *(dsp->dsa_off); else error = SET_ERROR(ENOENT); mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; size_t count = (size_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &count); if (error == 0) zc->zc_nvlist_dst_size = count; else zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { (void) spa_vdev_state_exit(spa, NULL, ENODEV); spa_close(spa, FTAG); return (SET_ERROR(ENODEV)); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, NULL, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_reopen(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If a resilver is already in progress then set the * spa_scrub_reopen flag to B_TRUE so that we don't restart * the scan as a side effect of the reopen. Otherwise, let * vdev_open() decided if a resilver is required. */ spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds, *ods; char origin[ZFS_MAX_DATASET_NAME_LEN]; char *cp; int error; zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || strchr(zc->zc_name, '%')) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); if (error != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ods, origin); dsl_dataset_rele(ods, FTAG); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(origin, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(origin, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. "S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = kmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = ddi_copyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size, zc->zc_iflags); } kmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { objset_t *os; int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ dsl_dataset_t *ds, *newds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } if (error == 0) error = dmu_objset_userspace_upgrade(zfsvfs->z_os); #ifdef illumos VFS_RELE(zfsvfs->z_vfs); #else vfs_unbusy(zfsvfs->z_vfs); #endif } else { /* XXX kind of reading contents without owning */ error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_objset_userspace_upgrade(os); dmu_objset_rele(os, FTAG); } return (error); } #ifdef illumos /* * We don't want to have a hard dependency * against some special symbols in sharefs * nfs, and smbsrv. Determine them if needed when * the first file system is shared. * Neither sharefs, nfs or smbsrv are unloadable modules. */ int (*znfsexport_fs)(void *arg); int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); int (*zsmbexport_fs)(void *arg, boolean_t add_share); int zfs_nfsshare_inited; int zfs_smbshare_inited; ddi_modhandle_t nfs_mod; ddi_modhandle_t sharefs_mod; ddi_modhandle_t smbsrv_mod; #endif /* illumos */ kmutex_t zfs_share_lock; #ifdef illumos static int zfs_init_sharefs() { int error; ASSERT(MUTEX_HELD(&zfs_share_lock)); /* Both NFS and SMB shares also require sharetab support. */ if (sharefs_mod == NULL && ((sharefs_mod = ddi_modopen("fs/sharefs", KRTLD_MODE_FIRST, &error)) == NULL)) { return (SET_ERROR(ENOSYS)); } if (zshare_fs == NULL && ((zshare_fs = (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { return (SET_ERROR(ENOSYS)); } return (0); } #endif /* illumos */ static int zfs_ioc_share(zfs_cmd_t *zc) { #ifdef illumos int error; int opcode; switch (zc->zc_share.z_sharetype) { case ZFS_SHARE_NFS: case ZFS_UNSHARE_NFS: if (zfs_nfsshare_inited == 0) { mutex_enter(&zfs_share_lock); if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } if (znfsexport_fs == NULL && ((znfsexport_fs = (int (*)(void *)) ddi_modsym(nfs_mod, "nfs_export", &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } error = zfs_init_sharefs(); if (error != 0) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } zfs_nfsshare_inited = 1; mutex_exit(&zfs_share_lock); } break; case ZFS_SHARE_SMB: case ZFS_UNSHARE_SMB: if (zfs_smbshare_inited == 0) { mutex_enter(&zfs_share_lock); if (smbsrv_mod == NULL && ((smbsrv_mod = ddi_modopen("drv/smbsrv", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } if (zsmbexport_fs == NULL && ((zsmbexport_fs = (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, "smb_server_share", &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } error = zfs_init_sharefs(); if (error != 0) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } zfs_smbshare_inited = 1; mutex_exit(&zfs_share_lock); } break; default: return (SET_ERROR(EINVAL)); } switch (zc->zc_share.z_sharetype) { case ZFS_SHARE_NFS: case ZFS_UNSHARE_NFS: if (error = znfsexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata)) return (error); break; case ZFS_SHARE_SMB: case ZFS_UNSHARE_SMB: if (error = zsmbexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata, zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? B_TRUE: B_FALSE)) { return (error); } break; } opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? SHAREFS_ADD : SHAREFS_REMOVE; /* * Add or remove share from sharetab */ error = zshare_fs(opcode, (void *)(uintptr_t)zc->zc_share.z_sharedata, zc->zc_share.z_sharemax); return (error); #else /* !illumos */ return (ENOSYS); #endif /* illumos */ } ace_t full_access[] = { {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} }; /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; int error; minor_t minor; error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (error != 0) return (error); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strcpy(zc->zc_value, snap_name); strfree(snap_name); strfree(hold_name); zfs_onexit_fd_rele(zc->zc_cleanup_fd); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { file_t *fp; - cap_rights_t rights; offset_t off; int error; #ifdef illumos fp = getf(zc->zc_cookie); #else - fget_write(curthread, zc->zc_cookie, - cap_rights_init(&rights, CAP_WRITE), &fp); + fget_write(curthread, zc->zc_cookie, &cap_write_rights, &fp); #endif if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; #ifdef illumos error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); #else error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); #endif if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; releasef(zc->zc_cookie); return (error); } #ifdef illumos /* * Remove all ACL files in shares dir */ static int zfs_smb_acl_purge(znode_t *dzp) { zap_cursor_t zc; zap_attribute_t zap; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int error; for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); (error = zap_cursor_retrieve(&zc, &zap)) == 0; zap_cursor_advance(&zc)) { if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, NULL, 0)) != 0) break; } zap_cursor_fini(&zc); return (error); } #endif /* illumos */ static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { #ifdef illumos vnode_t *vp; znode_t *dzp; vnode_t *resourcevp = NULL; znode_t *sharedir; zfsvfs_t *zfsvfs; nvlist_t *nvlist; char *src, *target; vattr_t vattr; vsecattr_t vsec; int error = 0; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); return (SET_ERROR(EINVAL)); } dzp = VTOZ(vp); zfsvfs = dzp->z_zfsvfs; ZFS_ENTER(zfsvfs); /* * Create share dir if its missing. */ mutex_enter(&zfsvfs->z_lock); if (zfsvfs->z_shares_dir == 0) { dmu_tx_t *tx; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, ZFS_SHARES_DIR); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { error = zfs_create_share_dir(zfsvfs, tx); dmu_tx_commit(tx); } if (error != 0) { mutex_exit(&zfsvfs->z_lock); VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } } mutex_exit(&zfsvfs->z_lock); ASSERT(zfsvfs->z_shares_dir); if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) { VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } switch (zc->zc_cookie) { case ZFS_SMB_ACL_ADD: vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VREG; vattr.va_mode = S_IFREG|0777; vattr.va_uid = 0; vattr.va_gid = 0; vsec.vsa_mask = VSA_ACE; vsec.vsa_aclentp = &full_access; vsec.vsa_aclentsz = sizeof (full_access); vsec.vsa_aclcnt = 1; error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); if (resourcevp) VN_RELE(resourcevp); break; case ZFS_SMB_ACL_REMOVE: error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, NULL, 0); break; case ZFS_SMB_ACL_RENAME: if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); return (error); } if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, &target)) { VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); nvlist_free(nvlist); return (error); } error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, kcred, NULL, 0); nvlist_free(nvlist); break; case ZFS_SMB_ACL_PURGE: error = zfs_smb_acl_purge(sharedir); break; default: error = SET_ERROR(EINVAL); break; } VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); return (error); #else /* !illumos */ return (EOPNOTSUPP); #endif /* illumos */ } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ /* ARGSUSED */ static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; error = nvlist_lookup_nvlist(args, "holds", &holds); if (error != 0) return (SET_ERROR(EINVAL)); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { error = zfs_onexit_fd_hold(cleanup_fd, &minor); if (error != 0) return (error); } error = dsl_dataset_user_hold(holds, minor, errlist); if (minor != 0) zfs_onexit_fd_rele(cleanup_fd); return (error); } /* * innvl is not used. * * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... * } */ /* ARGSUSED */ static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ /* ARGSUSED */ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_name name of new filesystem or snapshot * zc_value full name of old snapshot * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; char *firstsnap; uint64_t used, comp, uncomp; if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } static int zfs_ioc_jail(zfs_cmd_t *zc) { return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, (int)zc->zc_jailid)); } static int zfs_ioc_unjail(zfs_cmd_t *zc) { return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, (int)zc->zc_jailid)); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * } * * outnvl is unused */ /* ARGSUSED */ static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { - cap_rights_t rights; file_t *fp; int error; offset_t off; char *fromname = NULL; int fd; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) return (SET_ERROR(EINVAL)); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); #ifdef illumos file_t *fp = getf(fd); #else - fget_write(curthread, fd, cap_rights_init(&rights, CAP_WRITE), &fp); + fget_write(curthread, fd, &cap_write_rights, &fp); #endif if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, #ifdef illumos fd, resumeobj, resumeoff, fp->f_vnode, &off); #else fd, resumeobj, resumeoff, fp, &off); #endif #ifdef illumos if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; #else fp->f_offset = off; #endif releasef(fd); return (error); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; int error; char *fromname; boolean_t compressok; uint64_t space; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } compressok = nvlist_exists(innvl, "compressok"); error = nvlist_lookup_string(innvl, "from", &fromname); if (error == 0) { if (strchr(fromname, '@') != NULL) { /* * If from is a snapshot, hold it and use the more * efficient dmu_send_estimate to estimate send space * size using deadlists. */ dsl_dataset_t *fromsnap; error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) goto out; error = dmu_send_estimate(tosnap, fromsnap, compressok, &space); dsl_dataset_rele(fromsnap, FTAG); } else if (strchr(fromname, '#') != NULL) { /* * If from is a bookmark, fetch the creation TXG of the * snapshot it was created from and use that to find * blocks that were born after it. */ zfs_bookmark_phys_t frombm; error = dsl_bookmark_lookup(dp, fromname, tosnap, &frombm); if (error != 0) goto out; error = dmu_send_estimate_from_txg(tosnap, frombm.zbm_creation_txg, compressok, &space); } else { /* * from is not properly formatted as a snapshot or * bookmark */ error = SET_ERROR(EINVAL); goto out; } } else { /* * If estimating the size of a full send, use dmu_send_estimate. */ error = dmu_send_estimate(tosnap, NULL, compressok, &space); } fnvlist_add_uint64(outnvl, "space", space); out: dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the beginning of this file for details on * each argument to this function. */ static void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } static void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("remap", ZFS_IOC_REMAP, zfs_ioc_remap, zfs_secpolicy_remap, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("zpool_discard_checkpoint", ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. */ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); #ifdef __FreeBSD__ zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, zfs_secpolicy_config, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, zfs_secpolicy_config, POOL_CHECK_NONE); zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT, zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME, POOL_CHECK_NONE, B_FALSE, B_FALSE); #endif } int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } /* * Find a free minor number. */ minor_t zfsdev_minor_alloc(void) { static minor_t last_minor; minor_t m; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (ddi_get_soft_state(zfsdev_state, m) == NULL) { last_minor = m; return (m); } } return (0); } static int zfs_ctldev_init(struct cdev *devp) { minor_t minor; zfs_soft_state_t *zs; ASSERT(MUTEX_HELD(&spa_namespace_lock)); minor = zfsdev_minor_alloc(); if (minor == 0) return (SET_ERROR(ENXIO)); if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) return (SET_ERROR(EAGAIN)); devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close); zs = ddi_get_soft_state(zfsdev_state, minor); zs->zss_type = ZSST_CTLDEV; zfs_onexit_init((zfs_onexit_t **)&zs->zss_data); return (0); } static void zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); zfs_onexit_destroy(zo); ddi_soft_state_free(zfsdev_state, minor); } void * zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which) { zfs_soft_state_t *zp; zp = ddi_get_soft_state(zfsdev_state, minor); if (zp == NULL || zp->zss_type != which) return (NULL); return (zp->zss_data); } static int zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td) { int error = 0; #ifdef illumos if (getminor(*devp) != 0) return (zvol_open(devp, flag, otyp, cr)); #endif /* This is the control device. Allocate a new minor if requested. */ if (flag & FEXCL) { mutex_enter(&spa_namespace_lock); error = zfs_ctldev_init(devp); mutex_exit(&spa_namespace_lock); } return (error); } static void zfsdev_close(void *data) { zfs_onexit_t *zo; minor_t minor = (minor_t)(uintptr_t)data; if (minor == 0) return; mutex_enter(&spa_namespace_lock); zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); if (zo == NULL) { mutex_exit(&spa_namespace_lock); return; } zfs_ctldev_destroy(zo, minor); mutex_exit(&spa_namespace_lock); } static int zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, struct thread *td) { zfs_cmd_t *zc; uint_t vecnum; int error, rc, len; #ifdef illumos minor_t minor = getminor(dev); #else zfs_iocparm_t *zc_iocparm; int cflag, cmd, oldvecnum; boolean_t newioc, compat; void *compat_zc = NULL; cred_t *cr = td->td_ucred; #endif const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; nvlist_t *innvl = NULL; cflag = ZFS_CMD_COMPAT_NONE; compat = B_FALSE; newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */ len = IOCPARM_LEN(zcmd); vecnum = cmd = zcmd & 0xff; /* * Check if we are talking to supported older binaries * and translate zfs_cmd if necessary */ if (len != sizeof(zfs_iocparm_t)) { newioc = B_FALSE; compat = B_TRUE; vecnum = cmd; switch (len) { case sizeof(zfs_cmd_zcmd_t): cflag = ZFS_CMD_COMPAT_LZC; break; case sizeof(zfs_cmd_deadman_t): cflag = ZFS_CMD_COMPAT_DEADMAN; break; case sizeof(zfs_cmd_v28_t): cflag = ZFS_CMD_COMPAT_V28; break; case sizeof(zfs_cmd_v15_t): if (cmd >= sizeof(zfs_ioctl_v15_to_v28) / sizeof(zfs_ioctl_v15_to_v28[0])) return (EINVAL); cflag = ZFS_CMD_COMPAT_V15; vecnum = zfs_ioctl_v15_to_v28[cmd]; /* * Return without further handling * if the command is blacklisted. */ if (vecnum == ZFS_IOC_COMPAT_PASS) return (0); else if (vecnum == ZFS_IOC_COMPAT_FAIL) return (ENOTSUP); break; default: return (EINVAL); } } #ifdef illumos vecnum = cmd - ZFS_IOC_FIRST; ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); #endif if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(EINVAL)); vec = &zfs_ioc_vec[vecnum]; zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); #ifdef illumos error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); if (error != 0) { error = SET_ERROR(EFAULT); goto out; } #else /* !illumos */ bzero(zc, sizeof(zfs_cmd_t)); if (newioc) { zc_iocparm = (void *)arg; switch (zc_iocparm->zfs_ioctl_version) { case ZFS_IOCVER_CURRENT: if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) { error = SET_ERROR(EINVAL); goto out; } break; case ZFS_IOCVER_INLANES: if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) { error = SET_ERROR(EFAULT); goto out; } compat = B_TRUE; cflag = ZFS_CMD_COMPAT_INLANES; break; case ZFS_IOCVER_RESUME: if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) { error = SET_ERROR(EFAULT); goto out; } compat = B_TRUE; cflag = ZFS_CMD_COMPAT_RESUME; break; case ZFS_IOCVER_EDBP: if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) { error = SET_ERROR(EFAULT); goto out; } compat = B_TRUE; cflag = ZFS_CMD_COMPAT_EDBP; break; case ZFS_IOCVER_ZCMD: if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) || zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) { error = SET_ERROR(EFAULT); goto out; } compat = B_TRUE; cflag = ZFS_CMD_COMPAT_ZCMD; break; default: error = SET_ERROR(EINVAL); goto out; /* NOTREACHED */ } if (compat) { ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); bzero(compat_zc, sizeof(zfs_cmd_t)); error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, compat_zc, zc_iocparm->zfs_cmd_size, flag); if (error != 0) { error = SET_ERROR(EFAULT); goto out; } } else { error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, zc, zc_iocparm->zfs_cmd_size, flag); if (error != 0) { error = SET_ERROR(EFAULT); goto out; } } } if (compat) { if (newioc) { ASSERT(compat_zc != NULL); zfs_cmd_compat_get(zc, compat_zc, cflag); } else { ASSERT(compat_zc == NULL); zfs_cmd_compat_get(zc, arg, cflag); } oldvecnum = vecnum; error = zfs_ioctl_compat_pre(zc, &vecnum, cflag); if (error != 0) goto out; if (oldvecnum != vecnum) vec = &zfs_ioc_vec[vecnum]; } #endif /* !illumos */ zc->zc_iflags = flag & FKIOCTL; if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* rewrite innvl for backwards compatibility */ if (compat) innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag); /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case NO_NAME: break; } if (error == 0) error = vec->zvec_secpolicy(zc, innvl, cr); if (error != 0) goto out; /* legacy ioctls can modify zc_name */ len = strcspn(zc->zc_name, "/@#") + 1; saved_poolname = kmem_alloc(len, KM_SLEEP); (void) strlcpy(saved_poolname, zc->zc_name, len); if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); /* * Some commands can partially execute, modfiy state, and still * return an error. In these cases, attempt to record what * was modified. */ if ((error == 0 || (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); /* rewrite outnvl for backwards compatibility */ if (compat) outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum, cflag); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { error = vec->zvec_legacy_func(zc); } out: nvlist_free(innvl); #ifdef illumos rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); #else if (compat) { zfs_ioctl_compat_post(zc, cmd, cflag); if (newioc) { ASSERT(compat_zc != NULL); ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag); rc = ddi_copyout(compat_zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd, zc_iocparm->zfs_cmd_size, flag); if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); kmem_free(compat_zc, sizeof (zfs_cmd_t)); } else { zfs_cmd_compat_put(zc, arg, vecnum, cflag); } } else { ASSERT(newioc); rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd, sizeof (zfs_cmd_t), flag); if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); } #endif if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) strfree(s); (void) tsd_set(zfs_allow_log_key, saved_poolname); } else { if (saved_poolname != NULL) strfree(saved_poolname); } kmem_free(zc, sizeof (zfs_cmd_t)); return (error); } #ifdef illumos static int zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { if (cmd != DDI_ATTACH) return (DDI_FAILURE); if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, DDI_PSEUDO, 0) == DDI_FAILURE) return (DDI_FAILURE); zfs_dip = dip; ddi_report_dev(dip); return (DDI_SUCCESS); } static int zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { if (spa_busy() || zfs_busy() || zvol_busy()) return (DDI_FAILURE); if (cmd != DDI_DETACH) return (DDI_FAILURE); zfs_dip = NULL; ddi_prop_remove_all(dip); ddi_remove_minor_node(dip, NULL); return (DDI_SUCCESS); } /*ARGSUSED*/ static int zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { switch (infocmd) { case DDI_INFO_DEVT2DEVINFO: *result = zfs_dip; return (DDI_SUCCESS); case DDI_INFO_DEVT2INSTANCE: *result = (void *)0; return (DDI_SUCCESS); } return (DDI_FAILURE); } #endif /* illumos */ /* * OK, so this is a little weird. * * /dev/zfs is the control node, i.e. minor 0. * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. * * /dev/zfs has basically nothing to do except serve up ioctls, * so most of the standard driver entry points are in zvol.c. */ #ifdef illumos static struct cb_ops zfs_cb_ops = { zfsdev_open, /* open */ zfsdev_close, /* close */ zvol_strategy, /* strategy */ nodev, /* print */ zvol_dump, /* dump */ zvol_read, /* read */ zvol_write, /* write */ zfsdev_ioctl, /* ioctl */ nodev, /* devmap */ nodev, /* mmap */ nodev, /* segmap */ nochpoll, /* poll */ ddi_prop_op, /* prop_op */ NULL, /* streamtab */ D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ CB_REV, /* version */ nodev, /* async read */ nodev, /* async write */ }; static struct dev_ops zfs_dev_ops = { DEVO_REV, /* version */ 0, /* refcnt */ zfs_info, /* info */ nulldev, /* identify */ nulldev, /* probe */ zfs_attach, /* attach */ zfs_detach, /* detach */ nodev, /* reset */ &zfs_cb_ops, /* driver operations */ NULL, /* no bus operations */ NULL, /* power */ ddi_quiesce_not_needed, /* quiesce */ }; static struct modldrv zfs_modldrv = { &mod_driverops, "ZFS storage pool", &zfs_dev_ops }; static struct modlinkage modlinkage = { MODREV_1, (void *)&zfs_modlfs, (void *)&zfs_modldrv, NULL }; #endif /* illumos */ static struct cdevsw zfs_cdevsw = { .d_version = D_VERSION, .d_open = zfsdev_open, .d_ioctl = zfsdev_ioctl, .d_name = ZFS_DEV_NAME }; static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; strfree(poolname); } static void zfsdev_init(void) { zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, ZFS_DEV_NAME); } static void zfsdev_fini(void) { if (zfsdev != NULL) destroy_dev(zfsdev); } static struct root_hold_token *zfs_root_token; struct proc *zfsproc; #ifdef illumos int _init(void) { int error; spa_init(FREAD | FWRITE); zfs_init(); zvol_init(); zfs_ioctl_init(); if ((error = mod_install(&modlinkage)) != 0) { zvol_fini(); zfs_fini(); spa_fini(); return (error); } tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); error = ldi_ident_from_mod(&modlinkage, &zfs_li); ASSERT(error == 0); mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } int _fini(void) { int error; if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) return (SET_ERROR(EBUSY)); if ((error = mod_remove(&modlinkage)) != 0) return (error); zvol_fini(); zfs_fini(); spa_fini(); if (zfs_nfsshare_inited) (void) ddi_modclose(nfs_mod); if (zfs_smbshare_inited) (void) ddi_modclose(smbsrv_mod); if (zfs_nfsshare_inited || zfs_smbshare_inited) (void) ddi_modclose(sharefs_mod); tsd_destroy(&zfs_fsyncer_key); ldi_ident_release(zfs_li); zfs_li = NULL; mutex_destroy(&zfs_share_lock); return (error); } int _info(struct modinfo *modinfop) { return (mod_info(&modlinkage, modinfop)); } #endif /* illumos */ static int zfs__init(void); static int zfs__fini(void); static void zfs_shutdown(void *, int); static eventhandler_tag zfs_shutdown_event_tag; #ifdef __FreeBSD__ #define ZFS_MIN_KSTACK_PAGES 4 #endif int zfs__init(void) { #ifdef __FreeBSD__ #if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " "overflow panic!\nPlease consider adding " "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, ZFS_MIN_KSTACK_PAGES); #endif #endif zfs_root_token = root_mount_hold("ZFS"); mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); spa_init(FREAD | FWRITE); zfs_init(); zvol_init(); zfs_ioctl_init(); tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); tsd_create(&zfs_geom_probe_vdev_key, NULL); printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n"); root_mount_rel(zfs_root_token); zfsdev_init(); return (0); } int zfs__fini(void) { if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) { return (EBUSY); } zfsdev_fini(); zvol_fini(); zfs_fini(); spa_fini(); tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); mutex_destroy(&zfs_share_lock); return (0); } static void zfs_shutdown(void *arg __unused, int howto __unused) { /* * ZFS fini routines can not properly work in a panic-ed system. */ if (panicstr == NULL) (void)zfs__fini(); } static int zfs_modevent(module_t mod, int type, void *unused __unused) { int err; switch (type) { case MOD_LOAD: err = zfs__init(); if (err == 0) zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( shutdown_post_sync, zfs_shutdown, NULL, SHUTDOWN_PRI_FIRST); return (err); case MOD_UNLOAD: err = zfs__fini(); if (err == 0 && zfs_shutdown_event_tag != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, zfs_shutdown_event_tag); return (err); case MOD_SHUTDOWN: return (0); default: break; } return (EOPNOTSUPP); } static moduledata_t zfs_mod = { "zfsctrl", zfs_modevent, 0 }; DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(zfsctrl, 1); MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1); MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c (revision 333424) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c (revision 333425) @@ -1,254 +1,254 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * ZFS kernel routines may add/delete callback routines to be invoked * upon process exit (triggered via the close operation from the /dev/zfs * driver). * * These cleanup callbacks are intended to allow for the accumulation * of kernel state across multiple ioctls. User processes participate * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a * clone-open, generating a unique minor number. The process then passes * along that file descriptor to each ioctl that might have a cleanup operation. * * Consumers of the onexit routines should call zfs_onexit_fd_hold() early * on to validate the given fd and add a reference to its file table entry. * This allows the consumer to do its work and then add a callback, knowing * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers * should call zfs_onexit_fd_rele(). * * A simple example is zfs_ioc_recv(), where we might create an AVL tree * with dataset/GUID mappings and then reuse that tree on subsequent * zfs_ioc_recv() calls. * * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc() * the AVL tree and pass it along with a callback function to * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the * callback and return an action handle. * * The action handle is then passed from user space to subsequent * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree * by calling zfs_onexit_cb_data() with the device minor number and * action handle. * * If the user process exits abnormally, the callback is invoked implicitly * as part of the driver close operation. Once the user space process is * finished with the accumulated kernel state, it can also just call close(2) * on the cleanup fd to trigger the cleanup callback. */ void zfs_onexit_init(zfs_onexit_t **zop) { zfs_onexit_t *zo; zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP); mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t), offsetof(zfs_onexit_action_node_t, za_link)); } void zfs_onexit_destroy(zfs_onexit_t *zo) { zfs_onexit_action_node_t *ap; mutex_enter(&zo->zo_lock); while ((ap = list_head(&zo->zo_actions)) != NULL) { list_remove(&zo->zo_actions, ap); mutex_exit(&zo->zo_lock); ap->za_func(ap->za_data); kmem_free(ap, sizeof (zfs_onexit_action_node_t)); mutex_enter(&zo->zo_lock); } mutex_exit(&zo->zo_lock); list_destroy(&zo->zo_actions); mutex_destroy(&zo->zo_lock); kmem_free(zo, sizeof (zfs_onexit_t)); } static int zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) { *zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); if (*zo == NULL) return (SET_ERROR(EBADF)); return (0); } /* * Consumers might need to operate by minor number instead of fd, since * they might be running in another thread (e.g. txg_sync_thread). Callers * of this function must call zfs_onexit_fd_rele() when they're finished * using the minor number. */ int zfs_onexit_fd_hold(int fd, minor_t *minorp) { file_t *fp, *tmpfp; zfs_onexit_t *zo; cap_rights_t rights; void *data; int error; - fp = getf(fd, cap_rights_init(&rights)); + fp = getf(fd, &cap_no_rights); if (fp == NULL) return (SET_ERROR(EBADF)); tmpfp = curthread->td_fpop; curthread->td_fpop = fp; error = devfs_get_cdevpriv(&data); if (error == 0) *minorp = (minor_t)(uintptr_t)data; curthread->td_fpop = tmpfp; if (error != 0) return (SET_ERROR(EBADF)); return (zfs_onexit_minor_to_state(*minorp, &zo)); } void zfs_onexit_fd_rele(int fd) { releasef(fd); } /* * Add a callback to be invoked when the calling process exits. */ int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, uint64_t *action_handle) { zfs_onexit_t *zo; zfs_onexit_action_node_t *ap; int error; error = zfs_onexit_minor_to_state(minor, &zo); if (error) return (error); ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP); list_link_init(&ap->za_link); ap->za_func = func; ap->za_data = data; mutex_enter(&zo->zo_lock); list_insert_tail(&zo->zo_actions, ap); mutex_exit(&zo->zo_lock); if (action_handle) *action_handle = (uint64_t)(uintptr_t)ap; return (0); } static zfs_onexit_action_node_t * zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle) { zfs_onexit_action_node_t *match; zfs_onexit_action_node_t *ap; list_t *l; ASSERT(MUTEX_HELD(&zo->zo_lock)); match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle; l = &zo->zo_actions; for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) { if (match == ap) break; } return (ap); } /* * Delete the callback, triggering it first if 'fire' is set. */ int zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) { zfs_onexit_t *zo; zfs_onexit_action_node_t *ap; int error; error = zfs_onexit_minor_to_state(minor, &zo); if (error) return (error); mutex_enter(&zo->zo_lock); ap = zfs_onexit_find_cb(zo, action_handle); if (ap != NULL) { list_remove(&zo->zo_actions, ap); mutex_exit(&zo->zo_lock); if (fire) ap->za_func(ap->za_data); kmem_free(ap, sizeof (zfs_onexit_action_node_t)); } else { mutex_exit(&zo->zo_lock); error = SET_ERROR(ENOENT); } return (error); } /* * Return the data associated with this callback. This allows consumers * of the cleanup-on-exit interfaces to stash kernel data across system * calls, knowing that it will be cleaned up if the calling process exits. */ int zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) { zfs_onexit_t *zo; zfs_onexit_action_node_t *ap; int error; *data = NULL; error = zfs_onexit_minor_to_state(minor, &zo); if (error) return (error); mutex_enter(&zo->zo_lock); ap = zfs_onexit_find_cb(zo, action_handle); if (ap != NULL) *data = ap->za_data; else error = SET_ERROR(ENOENT); mutex_exit(&zo->zo_lock); return (error); } Index: head/sys/compat/cloudabi/cloudabi_file.c =================================================================== --- head/sys/compat/cloudabi/cloudabi_file.c (revision 333424) +++ head/sys/compat/cloudabi/cloudabi_file.c (revision 333425) @@ -1,762 +1,760 @@ /*- * Copyright (c) 2015 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_CLOUDABI_PATH, "cloudabipath", "CloudABI pathnames"); /* * Copying pathnames from userspace to kernelspace. * * Unlike most operating systems, CloudABI doesn't use null-terminated * pathname strings. Processes always pass pathnames to the kernel by * providing a base pointer and a length. This has a couple of reasons: * * - It makes it easier to use CloudABI in combination with programming * languages other than C, that may use non-null terminated strings. * - It allows for calling system calls on individual components of the * pathname without modifying the input string. * * The function below copies in pathname strings and null-terminates it. * It also ensure that the string itself does not contain any null * bytes. * * TODO(ed): Add an abstraction to vfs_lookup.c that allows us to pass * in unterminated pathname strings, so we can do away with * the copying. */ static int copyin_path(const char *uaddr, size_t len, char **result) { char *buf; int error; if (len >= PATH_MAX) return (ENAMETOOLONG); buf = malloc(len + 1, M_CLOUDABI_PATH, M_WAITOK); error = copyin(uaddr, buf, len); if (error != 0) { free(buf, M_CLOUDABI_PATH); return (error); } if (memchr(buf, '\0', len) != NULL) { free(buf, M_CLOUDABI_PATH); return (EINVAL); } buf[len] = '\0'; *result = buf; return (0); } static void cloudabi_freestr(char *buf) { free(buf, M_CLOUDABI_PATH); } int cloudabi_sys_file_advise(struct thread *td, struct cloudabi_sys_file_advise_args *uap) { int advice; switch (uap->advice) { case CLOUDABI_ADVICE_DONTNEED: advice = POSIX_FADV_DONTNEED; break; case CLOUDABI_ADVICE_NOREUSE: advice = POSIX_FADV_NOREUSE; break; case CLOUDABI_ADVICE_NORMAL: advice = POSIX_FADV_NORMAL; break; case CLOUDABI_ADVICE_RANDOM: advice = POSIX_FADV_RANDOM; break; case CLOUDABI_ADVICE_SEQUENTIAL: advice = POSIX_FADV_SEQUENTIAL; break; case CLOUDABI_ADVICE_WILLNEED: advice = POSIX_FADV_WILLNEED; break; default: return (EINVAL); } return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, advice)); } int cloudabi_sys_file_allocate(struct thread *td, struct cloudabi_sys_file_allocate_args *uap) { return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len)); } int cloudabi_sys_file_create(struct thread *td, struct cloudabi_sys_file_create_args *uap) { char *path; int error; error = copyin_path(uap->path, uap->path_len, &path); if (error != 0) return (error); /* * CloudABI processes cannot interact with UNIX credentials and * permissions. Depend on the umask that is set prior to * execution to restrict the file permissions. */ switch (uap->type) { case CLOUDABI_FILETYPE_DIRECTORY: error = kern_mkdirat(td, uap->fd, path, UIO_SYSSPACE, 0777); break; default: error = EINVAL; break; } cloudabi_freestr(path); return (error); } int cloudabi_sys_file_link(struct thread *td, struct cloudabi_sys_file_link_args *uap) { char *path1, *path2; int error; error = copyin_path(uap->path1, uap->path1_len, &path1); if (error != 0) return (error); error = copyin_path(uap->path2, uap->path2_len, &path2); if (error != 0) { cloudabi_freestr(path1); return (error); } error = kern_linkat(td, uap->fd1.fd, uap->fd2, path1, path2, UIO_SYSSPACE, (uap->fd1.flags & CLOUDABI_LOOKUP_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW); cloudabi_freestr(path1); cloudabi_freestr(path2); return (error); } int cloudabi_sys_file_open(struct thread *td, struct cloudabi_sys_file_open_args *uap) { cloudabi_fdstat_t fds; cap_rights_t rights; struct filecaps fcaps = {}; struct nameidata nd; struct file *fp; struct vnode *vp; char *path; int error, fd, fflags; bool read, write; error = copyin(uap->fds, &fds, sizeof(fds)); if (error != 0) return (error); /* All the requested rights should be set on the descriptor. */ error = cloudabi_convert_rights( fds.fs_rights_base | fds.fs_rights_inheriting, &rights); if (error != 0) return (error); cap_rights_set(&rights, CAP_LOOKUP); /* Convert rights to corresponding access mode. */ read = (fds.fs_rights_base & (CLOUDABI_RIGHT_FD_READ | CLOUDABI_RIGHT_FILE_READDIR | CLOUDABI_RIGHT_MEM_MAP_EXEC)) != 0; write = (fds.fs_rights_base & (CLOUDABI_RIGHT_FD_DATASYNC | CLOUDABI_RIGHT_FD_WRITE | CLOUDABI_RIGHT_FILE_ALLOCATE | CLOUDABI_RIGHT_FILE_STAT_FPUT_SIZE)) != 0; fflags = write ? read ? FREAD | FWRITE : FWRITE : FREAD; /* Convert open flags. */ if ((uap->oflags & CLOUDABI_O_CREAT) != 0) { fflags |= O_CREAT; cap_rights_set(&rights, CAP_CREATE); } if ((uap->oflags & CLOUDABI_O_DIRECTORY) != 0) fflags |= O_DIRECTORY; if ((uap->oflags & CLOUDABI_O_EXCL) != 0) fflags |= O_EXCL; if ((uap->oflags & CLOUDABI_O_TRUNC) != 0) { fflags |= O_TRUNC; cap_rights_set(&rights, CAP_FTRUNCATE); } if ((fds.fs_flags & CLOUDABI_FDFLAG_APPEND) != 0) fflags |= O_APPEND; if ((fds.fs_flags & CLOUDABI_FDFLAG_NONBLOCK) != 0) fflags |= O_NONBLOCK; if ((fds.fs_flags & (CLOUDABI_FDFLAG_SYNC | CLOUDABI_FDFLAG_DSYNC | CLOUDABI_FDFLAG_RSYNC)) != 0) { fflags |= O_SYNC; cap_rights_set(&rights, CAP_FSYNC); } if ((uap->dirfd.flags & CLOUDABI_LOOKUP_SYMLINK_FOLLOW) == 0) fflags |= O_NOFOLLOW; if (write && (fflags & (O_APPEND | O_TRUNC)) == 0) cap_rights_set(&rights, CAP_SEEK); /* Allocate new file descriptor. */ error = falloc_noinstall(td, &fp); if (error != 0) return (error); fp->f_flag = fflags & FMASK; /* Open path. */ error = copyin_path(uap->path, uap->path_len, &path); if (error != 0) { fdrop(fp, td); return (error); } NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, uap->dirfd.fd, &rights, td); error = vn_open(&nd, &fflags, 0777 & ~td->td_proc->p_fd->fd_cmask, fp); cloudabi_freestr(path); if (error != 0) { /* Custom operations provided. */ if (error == ENXIO && fp->f_ops != &badfileops) goto success; /* * POSIX compliance: return ELOOP in case openat() is * called on a symbolic link and O_NOFOLLOW is set. */ if (error == EMLINK) error = ELOOP; fdrop(fp, td); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); filecaps_free(&nd.ni_filecaps); fp->f_vnode = vp = nd.ni_vp; /* Install vnode operations if no custom operations are provided. */ if (fp->f_ops == &badfileops) { fp->f_seqcount = 1; finit(fp, (fflags & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp, &vnops); } VOP_UNLOCK(vp, 0); /* Truncate file. */ if (fflags & O_TRUNC) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) { fdrop(fp, td); return (error); } } success: /* Determine which Capsicum rights to set on the file descriptor. */ cloudabi_remove_conflicting_rights(cloudabi_convert_filetype(fp), &fds.fs_rights_base, &fds.fs_rights_inheriting); cloudabi_convert_rights(fds.fs_rights_base | fds.fs_rights_inheriting, &fcaps.fc_rights); if (cap_rights_is_set(&fcaps.fc_rights)) fcaps.fc_fcntls = CAP_FCNTL_SETFL; error = finstall(td, fp, &fd, fflags, &fcaps); fdrop(fp, td); if (error != 0) return (error); td->td_retval[0] = fd; return (0); } /* Converts a FreeBSD directory entry structure and writes it to userspace. */ static int write_dirent(struct dirent *bde, cloudabi_dircookie_t cookie, struct uio *uio) { cloudabi_dirent_t cde = { .d_next = cookie, .d_ino = bde->d_fileno, .d_namlen = bde->d_namlen, }; size_t len; int error; /* Convert file type. */ switch (bde->d_type) { case DT_BLK: cde.d_type = CLOUDABI_FILETYPE_BLOCK_DEVICE; break; case DT_CHR: cde.d_type = CLOUDABI_FILETYPE_CHARACTER_DEVICE; break; case DT_DIR: cde.d_type = CLOUDABI_FILETYPE_DIRECTORY; break; case DT_FIFO: cde.d_type = CLOUDABI_FILETYPE_SOCKET_STREAM; break; case DT_LNK: cde.d_type = CLOUDABI_FILETYPE_SYMBOLIC_LINK; break; case DT_REG: cde.d_type = CLOUDABI_FILETYPE_REGULAR_FILE; break; case DT_SOCK: /* The exact socket type cannot be derived. */ cde.d_type = CLOUDABI_FILETYPE_SOCKET_STREAM; break; default: cde.d_type = CLOUDABI_FILETYPE_UNKNOWN; break; } /* Write directory entry structure. */ len = sizeof(cde) < uio->uio_resid ? sizeof(cde) : uio->uio_resid; error = uiomove(&cde, len, uio); if (error != 0) return (error); /* Write filename. */ len = bde->d_namlen < uio->uio_resid ? bde->d_namlen : uio->uio_resid; return (uiomove(bde->d_name, len, uio)); } int cloudabi_sys_file_readdir(struct thread *td, struct cloudabi_sys_file_readdir_args *uap) { struct iovec iov = { .iov_base = uap->buf, .iov_len = uap->buf_len }; struct uio uio = { .uio_iov = &iov, .uio_iovcnt = 1, .uio_resid = iov.iov_len, .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ, .uio_td = td }; struct file *fp; struct vnode *vp; void *readbuf; - cap_rights_t rights; cloudabi_dircookie_t offset; int error; /* Obtain directory vnode. */ - error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_READ), &fp); + error = getvnode(td, uap->fd, &cap_read_rights, &fp); if (error != 0) { if (error == EINVAL) return (ENOTDIR); return (error); } if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } /* * Call VOP_READDIR() and convert resulting data until the user * provided buffer is filled. */ readbuf = malloc(MAXBSIZE, M_TEMP, M_WAITOK); offset = uap->cookie; vp = fp->f_vnode; while (uio.uio_resid > 0) { struct iovec readiov = { .iov_base = readbuf, .iov_len = MAXBSIZE }; struct uio readuio = { .uio_iov = &readiov, .uio_iovcnt = 1, .uio_rw = UIO_READ, .uio_segflg = UIO_SYSSPACE, .uio_td = td, .uio_resid = MAXBSIZE, .uio_offset = offset }; struct dirent *bde; unsigned long *cookies, *cookie; size_t readbuflen; int eof, ncookies; /* Validate file type. */ vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_type != VDIR) { VOP_UNLOCK(vp, 0); error = ENOTDIR; goto done; } #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error != 0) { VOP_UNLOCK(vp, 0); goto done; } #endif /* MAC */ /* Read new directory entries. */ cookies = NULL; ncookies = 0; error = VOP_READDIR(vp, &readuio, fp->f_cred, &eof, &ncookies, &cookies); VOP_UNLOCK(vp, 0); if (error != 0) goto done; /* Convert entries to CloudABI's format. */ readbuflen = MAXBSIZE - readuio.uio_resid; bde = readbuf; cookie = cookies; while (readbuflen >= offsetof(struct dirent, d_name) && uio.uio_resid > 0 && ncookies > 0) { /* Ensure that the returned offset always increases. */ if (readbuflen >= bde->d_reclen && bde->d_fileno != 0 && *cookie > offset) { error = write_dirent(bde, *cookie, &uio); if (error != 0) { free(cookies, M_TEMP); goto done; } } if (offset < *cookie) offset = *cookie; ++cookie; --ncookies; readbuflen -= bde->d_reclen; bde = (struct dirent *)((char *)bde + bde->d_reclen); } free(cookies, M_TEMP); if (eof) break; } done: fdrop(fp, td); free(readbuf, M_TEMP); if (error != 0) return (error); /* Return number of bytes copied to userspace. */ td->td_retval[0] = uap->buf_len - uio.uio_resid; return (0); } int cloudabi_sys_file_readlink(struct thread *td, struct cloudabi_sys_file_readlink_args *uap) { char *path; int error; error = copyin_path(uap->path, uap->path_len, &path); if (error != 0) return (error); error = kern_readlinkat(td, uap->fd, path, UIO_SYSSPACE, uap->buf, UIO_USERSPACE, uap->buf_len); cloudabi_freestr(path); return (error); } int cloudabi_sys_file_rename(struct thread *td, struct cloudabi_sys_file_rename_args *uap) { char *old, *new; int error; error = copyin_path(uap->path1, uap->path1_len, &old); if (error != 0) return (error); error = copyin_path(uap->path2, uap->path2_len, &new); if (error != 0) { cloudabi_freestr(old); return (error); } error = kern_renameat(td, uap->fd1, old, uap->fd2, new, UIO_SYSSPACE); cloudabi_freestr(old); cloudabi_freestr(new); return (error); } /* Converts a FreeBSD stat structure to a CloudABI stat structure. */ static void convert_stat(const struct stat *sb, cloudabi_filestat_t *csb) { cloudabi_filestat_t res = { .st_dev = sb->st_dev, .st_ino = sb->st_ino, .st_nlink = sb->st_nlink, .st_size = sb->st_size, }; cloudabi_convert_timespec(&sb->st_atim, &res.st_atim); cloudabi_convert_timespec(&sb->st_mtim, &res.st_mtim); cloudabi_convert_timespec(&sb->st_ctim, &res.st_ctim); *csb = res; } int cloudabi_sys_file_stat_fget(struct thread *td, struct cloudabi_sys_file_stat_fget_args *uap) { struct stat sb; cloudabi_filestat_t csb; struct file *fp; - cap_rights_t rights; cloudabi_filetype_t filetype; int error; memset(&csb, 0, sizeof(csb)); /* Fetch file descriptor attributes. */ - error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FSTAT), &fp); + error = fget(td, uap->fd, &cap_fstat_rights, &fp); if (error != 0) return (error); error = fo_stat(fp, &sb, td->td_ucred, td); if (error != 0) { fdrop(fp, td); return (error); } filetype = cloudabi_convert_filetype(fp); fdrop(fp, td); /* Convert attributes to CloudABI's format. */ convert_stat(&sb, &csb); csb.st_filetype = filetype; return (copyout(&csb, uap->buf, sizeof(csb))); } /* Converts timestamps to arguments to futimens() and utimensat(). */ static void convert_utimens_arguments(const cloudabi_filestat_t *fs, cloudabi_fsflags_t flags, struct timespec *ts) { if ((flags & CLOUDABI_FILESTAT_ATIM_NOW) != 0) { ts[0].tv_nsec = UTIME_NOW; } else if ((flags & CLOUDABI_FILESTAT_ATIM) != 0) { ts[0].tv_sec = fs->st_atim / 1000000000; ts[0].tv_nsec = fs->st_atim % 1000000000; } else { ts[0].tv_nsec = UTIME_OMIT; } if ((flags & CLOUDABI_FILESTAT_MTIM_NOW) != 0) { ts[1].tv_nsec = UTIME_NOW; } else if ((flags & CLOUDABI_FILESTAT_MTIM) != 0) { ts[1].tv_sec = fs->st_mtim / 1000000000; ts[1].tv_nsec = fs->st_mtim % 1000000000; } else { ts[1].tv_nsec = UTIME_OMIT; } } int cloudabi_sys_file_stat_fput(struct thread *td, struct cloudabi_sys_file_stat_fput_args *uap) { cloudabi_filestat_t fs; struct timespec ts[2]; int error; error = copyin(uap->buf, &fs, sizeof(fs)); if (error != 0) return (error); /* * Only support truncation and timestamp modification separately * for now, to prevent unnecessary code duplication. */ if ((uap->flags & CLOUDABI_FILESTAT_SIZE) != 0) { /* Call into kern_ftruncate() for file truncation. */ if ((uap->flags & ~CLOUDABI_FILESTAT_SIZE) != 0) return (EINVAL); return (kern_ftruncate(td, uap->fd, fs.st_size)); } else if ((uap->flags & (CLOUDABI_FILESTAT_ATIM | CLOUDABI_FILESTAT_ATIM_NOW | CLOUDABI_FILESTAT_MTIM | CLOUDABI_FILESTAT_MTIM_NOW)) != 0) { /* Call into kern_futimens() for timestamp modification. */ if ((uap->flags & ~(CLOUDABI_FILESTAT_ATIM | CLOUDABI_FILESTAT_ATIM_NOW | CLOUDABI_FILESTAT_MTIM | CLOUDABI_FILESTAT_MTIM_NOW)) != 0) return (EINVAL); convert_utimens_arguments(&fs, uap->flags, ts); return (kern_futimens(td, uap->fd, ts, UIO_SYSSPACE)); } return (EINVAL); } int cloudabi_sys_file_stat_get(struct thread *td, struct cloudabi_sys_file_stat_get_args *uap) { struct stat sb; cloudabi_filestat_t csb; char *path; int error; memset(&csb, 0, sizeof(csb)); error = copyin_path(uap->path, uap->path_len, &path); if (error != 0) return (error); error = kern_statat(td, (uap->fd.flags & CLOUDABI_LOOKUP_SYMLINK_FOLLOW) != 0 ? 0 : AT_SYMLINK_NOFOLLOW, uap->fd.fd, path, UIO_SYSSPACE, &sb, NULL); cloudabi_freestr(path); if (error != 0) return (error); /* Convert results and return them. */ convert_stat(&sb, &csb); if (S_ISBLK(sb.st_mode)) csb.st_filetype = CLOUDABI_FILETYPE_BLOCK_DEVICE; else if (S_ISCHR(sb.st_mode)) csb.st_filetype = CLOUDABI_FILETYPE_CHARACTER_DEVICE; else if (S_ISDIR(sb.st_mode)) csb.st_filetype = CLOUDABI_FILETYPE_DIRECTORY; else if (S_ISFIFO(sb.st_mode)) csb.st_filetype = CLOUDABI_FILETYPE_SOCKET_STREAM; else if (S_ISREG(sb.st_mode)) csb.st_filetype = CLOUDABI_FILETYPE_REGULAR_FILE; else if (S_ISSOCK(sb.st_mode)) { /* Inaccurate, but the best that we can do. */ csb.st_filetype = CLOUDABI_FILETYPE_SOCKET_STREAM; } else if (S_ISLNK(sb.st_mode)) csb.st_filetype = CLOUDABI_FILETYPE_SYMBOLIC_LINK; else csb.st_filetype = CLOUDABI_FILETYPE_UNKNOWN; return (copyout(&csb, uap->buf, sizeof(csb))); } int cloudabi_sys_file_stat_put(struct thread *td, struct cloudabi_sys_file_stat_put_args *uap) { cloudabi_filestat_t fs; struct timespec ts[2]; char *path; int error; /* * Only support timestamp modification for now, as there is no * truncateat(). */ if ((uap->flags & ~(CLOUDABI_FILESTAT_ATIM | CLOUDABI_FILESTAT_ATIM_NOW | CLOUDABI_FILESTAT_MTIM | CLOUDABI_FILESTAT_MTIM_NOW)) != 0) return (EINVAL); error = copyin(uap->buf, &fs, sizeof(fs)); if (error != 0) return (error); error = copyin_path(uap->path, uap->path_len, &path); if (error != 0) return (error); convert_utimens_arguments(&fs, uap->flags, ts); error = kern_utimensat(td, uap->fd.fd, path, UIO_SYSSPACE, ts, UIO_SYSSPACE, (uap->fd.flags & CLOUDABI_LOOKUP_SYMLINK_FOLLOW) ? 0 : AT_SYMLINK_NOFOLLOW); cloudabi_freestr(path); return (error); } int cloudabi_sys_file_symlink(struct thread *td, struct cloudabi_sys_file_symlink_args *uap) { char *path1, *path2; int error; error = copyin_path(uap->path1, uap->path1_len, &path1); if (error != 0) return (error); error = copyin_path(uap->path2, uap->path2_len, &path2); if (error != 0) { cloudabi_freestr(path1); return (error); } error = kern_symlinkat(td, path1, uap->fd, path2, UIO_SYSSPACE); cloudabi_freestr(path1); cloudabi_freestr(path2); return (error); } int cloudabi_sys_file_unlink(struct thread *td, struct cloudabi_sys_file_unlink_args *uap) { char *path; int error; error = copyin_path(uap->path, uap->path_len, &path); if (error != 0) return (error); if (uap->flags & CLOUDABI_UNLINK_REMOVEDIR) error = kern_rmdirat(td, uap->fd, path, UIO_SYSSPACE); else error = kern_unlinkat(td, uap->fd, path, UIO_SYSSPACE, 0); cloudabi_freestr(path); return (error); } Index: head/sys/compat/linux/linux_event.c =================================================================== --- head/sys/compat/linux/linux_event.c (revision 333424) +++ head/sys/compat/linux/linux_event.c (revision 333425) @@ -1,1324 +1,1322 @@ /*- * Copyright (c) 2007 Roman Divacky * Copyright (c) 2014 Dmitry Chagin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include /* * epoll defines 'struct epoll_event' with the field 'data' as 64 bits * on all architectures. But on 32 bit architectures BSD 'struct kevent' only * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied * data verbatuim. Therefore we allocate 64-bit memory block to pass * user supplied data for every file descriptor. */ typedef uint64_t epoll_udata_t; struct epoll_emuldata { uint32_t fdc; /* epoll udata max index */ epoll_udata_t udata[1]; /* epoll user data vector */ }; #define EPOLL_DEF_SZ 16 #define EPOLL_SIZE(fdn) \ (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) struct epoll_event { uint32_t events; epoll_udata_t data; } #if defined(__amd64__) __attribute__((packed)) #endif ; #define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); static int epoll_to_kevent(struct thread *td, struct file *epfp, int fd, struct epoll_event *l_event, int *kev_flags, struct kevent *kevent, int *nkevents); static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); static int epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter); static int epoll_delete_all_events(struct thread *td, struct file *epfp, int fd); struct epoll_copyin_args { struct kevent *changelist; }; struct epoll_copyout_args { struct epoll_event *leventlist; struct proc *p; uint32_t count; int error; }; /* eventfd */ typedef uint64_t eventfd_t; static fo_rdwr_t eventfd_read; static fo_rdwr_t eventfd_write; static fo_ioctl_t eventfd_ioctl; static fo_poll_t eventfd_poll; static fo_kqfilter_t eventfd_kqfilter; static fo_stat_t eventfd_stat; static fo_close_t eventfd_close; static fo_fill_kinfo_t eventfd_fill_kinfo; static struct fileops eventfdops = { .fo_read = eventfd_read, .fo_write = eventfd_write, .fo_truncate = invfo_truncate, .fo_ioctl = eventfd_ioctl, .fo_poll = eventfd_poll, .fo_kqfilter = eventfd_kqfilter, .fo_stat = eventfd_stat, .fo_close = eventfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = eventfd_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_eventfddetach(struct knote *kn); static int filt_eventfdread(struct knote *kn, long hint); static int filt_eventfdwrite(struct knote *kn, long hint); static struct filterops eventfd_rfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdread }; static struct filterops eventfd_wfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdwrite }; /* timerfd */ typedef uint64_t timerfd_t; static fo_rdwr_t timerfd_read; static fo_poll_t timerfd_poll; static fo_kqfilter_t timerfd_kqfilter; static fo_stat_t timerfd_stat; static fo_close_t timerfd_close; static fo_fill_kinfo_t timerfd_fill_kinfo; static struct fileops timerfdops = { .fo_read = timerfd_read, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = eventfd_ioctl, .fo_poll = timerfd_poll, .fo_kqfilter = timerfd_kqfilter, .fo_stat = timerfd_stat, .fo_close = timerfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = timerfd_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_timerfddetach(struct knote *kn); static int filt_timerfdread(struct knote *kn, long hint); static struct filterops timerfd_rfiltops = { .f_isfd = 1, .f_detach = filt_timerfddetach, .f_event = filt_timerfdread }; struct eventfd { eventfd_t efd_count; uint32_t efd_flags; struct selinfo efd_sel; struct mtx efd_lock; }; struct timerfd { clockid_t tfd_clockid; struct itimerspec tfd_time; struct callout tfd_callout; timerfd_t tfd_count; bool tfd_canceled; struct selinfo tfd_sel; struct mtx tfd_lock; }; static int eventfd_create(struct thread *td, uint32_t initval, int flags); static void linux_timerfd_expire(void *); static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) { struct linux_pemuldata *pem; struct epoll_emuldata *emd; struct proc *p; p = td->td_proc; pem = pem_find(p); KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); LINUX_PEM_XLOCK(pem); if (pem->epoll == NULL) { emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); emd->fdc = fd; pem->epoll = emd; } else { emd = pem->epoll; if (fd > emd->fdc) { emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); emd->fdc = fd; pem->epoll = emd; } } emd->udata[fd] = udata; LINUX_PEM_XUNLOCK(pem); } static int epoll_create_common(struct thread *td, int flags) { int error; error = kern_kqueue(td, flags, NULL); if (error != 0) return (error); epoll_fd_install(td, EPOLL_DEF_SZ, 0); return (0); } int linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) { /* * args->size is unused. Linux just tests it * and then forgets it as well. */ if (args->size <= 0) return (EINVAL); return (epoll_create_common(td, 0)); } int linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) { int flags; if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) return (EINVAL); flags = 0; if ((args->flags & LINUX_O_CLOEXEC) != 0) flags |= O_CLOEXEC; return (epoll_create_common(td, flags)); } /* Structure converting function from epoll to kevent. */ static int epoll_to_kevent(struct thread *td, struct file *epfp, int fd, struct epoll_event *l_event, int *kev_flags, struct kevent *kevent, int *nkevents) { uint32_t levents = l_event->events; struct linux_pemuldata *pem; struct proc *p; /* flags related to how event is registered */ if ((levents & LINUX_EPOLLONESHOT) != 0) *kev_flags |= EV_ONESHOT; if ((levents & LINUX_EPOLLET) != 0) *kev_flags |= EV_CLEAR; if ((levents & LINUX_EPOLLERR) != 0) *kev_flags |= EV_ERROR; if ((levents & LINUX_EPOLLRDHUP) != 0) *kev_flags |= EV_EOF; /* flags related to what event is registered */ if ((levents & LINUX_EPOLL_EVRD) != 0) { EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0); ++(*nkevents); } if ((levents & LINUX_EPOLL_EVWR) != 0) { EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0); ++(*nkevents); } if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { p = td->td_proc; pem = pem_find(p); KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n")); LINUX_PEM_XLOCK(pem); if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) { pem->flags |= LINUX_XUNSUP_EPOLL; LINUX_PEM_XUNLOCK(pem); linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n", levents); } else LINUX_PEM_XUNLOCK(pem); return (EINVAL); } return (0); } /* * Structure converting function from kevent to epoll. In a case * this is called on error in registration we store the error in * event->data and pick it up later in linux_epoll_ctl(). */ static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) { if ((kevent->flags & EV_ERROR) != 0) { l_event->events = LINUX_EPOLLERR; return; } /* XXX EPOLLPRI, EPOLLHUP */ switch (kevent->filter) { case EVFILT_READ: l_event->events = LINUX_EPOLLIN; if ((kevent->flags & EV_EOF) != 0) l_event->events |= LINUX_EPOLLRDHUP; break; case EVFILT_WRITE: l_event->events = LINUX_EPOLLOUT; break; } } /* * Copyout callback used by kevent. This converts kevent * events to epoll events and copies them back to the * userspace. This is also called on error on registering * of the filter. */ static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count) { struct epoll_copyout_args *args; struct linux_pemuldata *pem; struct epoll_emuldata *emd; struct epoll_event *eep; int error, fd, i; args = (struct epoll_copyout_args*) arg; eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); pem = pem_find(args->p); KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); LINUX_PEM_SLOCK(pem); emd = pem->epoll; KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); for (i = 0; i < count; i++) { kevent_to_epoll(&kevp[i], &eep[i]); fd = kevp[i].ident; KASSERT(fd <= emd->fdc, ("epoll user data vector" " is too small.\n")); eep[i].data = emd->udata[fd]; } LINUX_PEM_SUNLOCK(pem); error = copyout(eep, args->leventlist, count * sizeof(*eep)); if (error == 0) { args->leventlist += count; args->count += count; } else if (args->error == 0) args->error = error; free(eep, M_EPOLL); return (error); } /* * Copyin callback used by kevent. This copies already * converted filters from kernel memory to the kevent * internal kernel memory. Hence the memcpy instead of * copyin. */ static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count) { struct epoll_copyin_args *args; args = (struct epoll_copyin_args*) arg; memcpy(kevp, args->changelist, count * sizeof(*kevp)); args->changelist += count; return (0); } /* * Load epoll filter, convert it to kevent filter * and load it into kevent subsystem. */ int linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) { struct file *epfp, *fp; struct epoll_copyin_args ciargs; struct kevent kev[2]; struct kevent_copyops k_ops = { &ciargs, NULL, epoll_kev_copyin}; struct epoll_event le; cap_rights_t rights; int kev_flags; int nchanges = 0; int error; if (args->op != LINUX_EPOLL_CTL_DEL) { error = copyin(args->event, &le, sizeof(le)); if (error != 0) return (error); } error = fget(td, args->epfd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp); if (error != 0) return (error); if (epfp->f_type != DTYPE_KQUEUE) { error = EINVAL; goto leave1; } /* Protect user data vector from incorrectly supplied fd. */ error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp); if (error != 0) goto leave1; /* Linux disallows spying on himself */ if (epfp == fp) { error = EINVAL; goto leave0; } ciargs.changelist = kev; if (args->op != LINUX_EPOLL_CTL_DEL) { kev_flags = EV_ADD | EV_ENABLE; error = epoll_to_kevent(td, epfp, args->fd, &le, &kev_flags, kev, &nchanges); if (error != 0) goto leave0; } switch (args->op) { case LINUX_EPOLL_CTL_MOD: error = epoll_delete_all_events(td, epfp, args->fd); if (error != 0) goto leave0; break; case LINUX_EPOLL_CTL_ADD: /* * kqueue_register() return ENOENT if event does not exists * and the EV_ADD flag is not set. */ kev[0].flags &= ~EV_ADD; error = kqfd_register(args->epfd, &kev[0], td, 1); if (error != ENOENT) { error = EEXIST; goto leave0; } error = 0; kev[0].flags |= EV_ADD; break; case LINUX_EPOLL_CTL_DEL: /* CTL_DEL means unregister this fd with this epoll */ error = epoll_delete_all_events(td, epfp, args->fd); goto leave0; default: error = EINVAL; goto leave0; } epoll_fd_install(td, args->fd, le.data); error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); leave0: fdrop(fp, td); leave1: fdrop(epfp, td); return (error); } /* * Wait for a filter to be triggered on the epoll file descriptor. */ static int linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, int maxevents, int timeout, sigset_t *uset) { struct epoll_copyout_args coargs; struct kevent_copyops k_ops = { &coargs, epoll_kev_copyout, NULL}; struct timespec ts, *tsp; cap_rights_t rights; struct file *epfp; sigset_t omask; int error; if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) return (EINVAL); error = fget(td, epfd, cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); if (error != 0) return (error); if (epfp->f_type != DTYPE_KQUEUE) { error = EINVAL; goto leave1; } if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &omask, 0); if (error != 0) goto leave1; td->td_pflags |= TDP_OLDMASK; /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. */ thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } coargs.leventlist = events; coargs.p = td->td_proc; coargs.count = 0; coargs.error = 0; if (timeout != -1) { if (timeout < 0) { error = EINVAL; goto leave0; } /* Convert from milliseconds to timespec. */ ts.tv_sec = timeout / 1000; ts.tv_nsec = (timeout % 1000) * 1000000; tsp = &ts; } else { tsp = NULL; } error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); if (error == 0 && coargs.error != 0) error = coargs.error; /* * kern_kevent might return ENOMEM which is not expected from epoll_wait. * Maybe we should translate that but I don't think it matters at all. */ if (error == 0) td->td_retval[0] = coargs.count; leave0: if (uset != NULL) error = kern_sigprocmask(td, SIG_SETMASK, &omask, NULL, 0); leave1: fdrop(epfp, td); return (error); } int linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) { return (linux_epoll_wait_common(td, args->epfd, args->events, args->maxevents, args->timeout, NULL)); } int linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) { sigset_t mask, *pmask; l_sigset_t lmask; int error; if (args->mask != NULL) { if (args->sigsetsize != sizeof(l_sigset_t)) return (EINVAL); error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); if (error != 0) return (error); linux_to_bsd_sigset(&lmask, &mask); pmask = &mask; } else pmask = NULL; return (linux_epoll_wait_common(td, args->epfd, args->events, args->maxevents, args->timeout, pmask)); } static int epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter) { struct epoll_copyin_args ciargs; struct kevent kev; struct kevent_copyops k_ops = { &ciargs, NULL, epoll_kev_copyin}; ciargs.changelist = &kev; EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0); return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL)); } static int epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) { int error1, error2; error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ); error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE); /* return 0 if at least one result positive */ return (error1 == 0 ? 0 : error2); } static int eventfd_create(struct thread *td, uint32_t initval, int flags) { struct filedesc *fdp; struct eventfd *efd; struct file *fp; int fflags, fd, error; fflags = 0; if ((flags & LINUX_O_CLOEXEC) != 0) fflags |= O_CLOEXEC; fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd, fflags); if (error != 0) return (error); efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO); efd->efd_flags = flags; efd->efd_count = initval; mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); fflags = FREAD | FWRITE; if ((flags & LINUX_O_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops); fdrop(fp, td); td->td_retval[0] = fd; return (error); } int linux_eventfd(struct thread *td, struct linux_eventfd_args *args) { return (eventfd_create(td, args->initval, 0)); } int linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) { if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0) return (EINVAL); return (eventfd_create(td, args->initval, args->flags)); } static int eventfd_close(struct file *fp, struct thread *td) { struct eventfd *efd; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (EINVAL); seldrain(&efd->efd_sel); knlist_destroy(&efd->efd_sel.si_note); fp->f_ops = &badfileops; mtx_destroy(&efd->efd_lock); free(efd, M_EPOLL); return (0); } static int eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (EINVAL); if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = 0; mtx_lock(&efd->efd_lock); retry: if (efd->efd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0); if (error == 0) goto retry; } if (error == 0) { if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) { count = 1; --efd->efd_count; } else { count = efd->efd_count; efd->efd_count = 0; } KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); mtx_unlock(&efd->efd_lock); error = uiomove(&count, sizeof(eventfd_t), uio); } else mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (EINVAL); if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = uiomove(&count, sizeof(eventfd_t), uio); if (error != 0) return (error); if (count == UINT64_MAX) return (EINVAL); mtx_lock(&efd->efd_lock); retry: if (UINT64_MAX - efd->efd_count <= count) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); /* Do not not return the number of bytes written */ uio->uio_resid += sizeof(eventfd_t); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdwr", 0); if (error == 0) goto retry; } if (error == 0) { efd->efd_count += count; KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); } mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct eventfd *efd; int revents = 0; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (POLLERR); mtx_lock(&efd->efd_lock); if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0) revents |= events & (POLLIN|POLLRDNORM); if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count) revents |= events & (POLLOUT|POLLWRNORM); if (revents == 0) selrecord(td, &efd->efd_sel); mtx_unlock(&efd->efd_lock); return (revents); } /*ARGSUSED*/ static int eventfd_kqfilter(struct file *fp, struct knote *kn) { struct eventfd *efd; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (EINVAL); mtx_lock(&efd->efd_lock); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &eventfd_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &eventfd_wfiltops; break; default: mtx_unlock(&efd->efd_lock); return (EINVAL); } kn->kn_hook = efd; knlist_add(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); return (0); } static void filt_eventfddetach(struct knote *kn) { struct eventfd *efd = kn->kn_hook; mtx_lock(&efd->efd_lock); knlist_remove(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); } /*ARGSUSED*/ static int filt_eventfdread(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); ret = (efd->efd_count > 0); return (ret); } /*ARGSUSED*/ static int filt_eventfdwrite(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); ret = (UINT64_MAX - 1 > efd->efd_count); return (ret); } /*ARGSUSED*/ static int eventfd_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD && fp->f_type != DTYPE_LINUXTFD)) return (EINVAL); switch (cmd) { case FIONBIO: if ((*(int *)data)) atomic_set_int(&fp->f_flag, FNONBLOCK); else atomic_clear_int(&fp->f_flag, FNONBLOCK); case FIOASYNC: return (0); default: return (ENXIO); } } /*ARGSUSED*/ static int eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_UNKNOWN; return (0); } int linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) { struct filedesc *fdp; struct timerfd *tfd; struct file *fp; clockid_t clockid; int fflags, fd, error; if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) return (EINVAL); error = linux_to_native_clockid(&clockid, args->clockid); if (error != 0) return (error); if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) return (EINVAL); fflags = 0; if ((args->flags & LINUX_TFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd, fflags); if (error != 0) return (error); tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); tfd->tfd_clockid = clockid; mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); fflags = FREAD; if ((args->flags & LINUX_O_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops); fdrop(fp, td); td->td_retval[0] = fd; return (error); } static int timerfd_close(struct file *fp, struct thread *td) { struct timerfd *tfd; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); timespecclear(&tfd->tfd_time.it_value); timespecclear(&tfd->tfd_time.it_interval); mtx_lock(&tfd->tfd_lock); callout_drain(&tfd->tfd_callout); mtx_unlock(&tfd->tfd_lock); seldrain(&tfd->tfd_sel); knlist_destroy(&tfd->tfd_sel.si_note); fp->f_ops = &badfileops; mtx_destroy(&tfd->tfd_lock); free(tfd, M_EPOLL); return (0); } static int timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct timerfd *tfd; timerfd_t count; int error; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); if (uio->uio_resid < sizeof(timerfd_t)) return (EINVAL); error = 0; mtx_lock(&tfd->tfd_lock); retry: if (tfd->tfd_canceled) { tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); return (ECANCELED); } if (tfd->tfd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&tfd->tfd_lock); return (EAGAIN); } error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0); if (error == 0) goto retry; } if (error == 0) { count = tfd->tfd_count; tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); error = uiomove(&count, sizeof(timerfd_t), uio); } else mtx_unlock(&tfd->tfd_lock); return (error); } static int timerfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct timerfd *tfd; int revents = 0; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (POLLERR); mtx_lock(&tfd->tfd_lock); if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0) revents |= events & (POLLIN|POLLRDNORM); if (revents == 0) selrecord(td, &tfd->tfd_sel); mtx_unlock(&tfd->tfd_lock); return (revents); } /*ARGSUSED*/ static int timerfd_kqfilter(struct file *fp, struct knote *kn) { struct timerfd *tfd; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); if (kn->kn_filter == EVFILT_READ) kn->kn_fop = &timerfd_rfiltops; else return (EINVAL); kn->kn_hook = tfd; knlist_add(&tfd->tfd_sel.si_note, kn, 0); return (0); } static void filt_timerfddetach(struct knote *kn) { struct timerfd *tfd = kn->kn_hook; mtx_lock(&tfd->tfd_lock); knlist_remove(&tfd->tfd_sel.si_note, kn, 1); mtx_unlock(&tfd->tfd_lock); } /*ARGSUSED*/ static int filt_timerfdread(struct knote *kn, long hint) { struct timerfd *tfd = kn->kn_hook; return (tfd->tfd_count > 0); } /*ARGSUSED*/ static int timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_UNKNOWN; return (0); } static void linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts) { if (tfd->tfd_clockid == CLOCK_REALTIME) getnanotime(ts); else /* CLOCK_MONOTONIC */ getnanouptime(ts); } static void linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots) { struct timespec cts; linux_timerfd_clocktime(tfd, &cts); *ots = tfd->tfd_time; if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) { timespecsub(&ots->it_value, &cts); if (ots->it_value.tv_sec < 0 || (ots->it_value.tv_sec == 0 && ots->it_value.tv_nsec == 0)) { ots->it_value.tv_sec = 0; ots->it_value.tv_nsec = 1; } } } int linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args) { - cap_rights_t rights; struct l_itimerspec lots; struct itimerspec ots; struct timerfd *tfd; struct file *fp; int error; - error = fget(td, args->fd, cap_rights_init(&rights, CAP_READ), &fp); + error = fget(td, args->fd, &cap_read_rights, &fp); if (error != 0) return (error); tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { error = EINVAL; goto out; } mtx_lock(&tfd->tfd_lock); linux_timerfd_curval(tfd, &ots); mtx_unlock(&tfd->tfd_lock); error = native_to_linux_itimerspec(&lots, &ots); if (error == 0) error = copyout(&lots, args->old_value, sizeof(lots)); out: fdrop(fp, td); return (error); } int linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args) { struct l_itimerspec lots; struct itimerspec nts, ots; struct timespec cts, ts; - cap_rights_t rights; struct timerfd *tfd; struct timeval tv; struct file *fp; int error; if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0) return (EINVAL); error = copyin(args->new_value, &lots, sizeof(lots)); if (error != 0) return (error); error = linux_to_native_itimerspec(&nts, &lots); if (error != 0) return (error); - error = fget(td, args->fd, cap_rights_init(&rights, CAP_WRITE), &fp); + error = fget(td, args->fd, &cap_write_rights, &fp); if (error != 0) return (error); tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { error = EINVAL; goto out; } mtx_lock(&tfd->tfd_lock); if (!timespecisset(&nts.it_value)) timespecclear(&nts.it_interval); if (args->old_value != NULL) linux_timerfd_curval(tfd, &ots); tfd->tfd_time = nts; if (timespecisset(&nts.it_value)) { linux_timerfd_clocktime(tfd, &cts); ts = nts.it_value; if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) { timespecadd(&tfd->tfd_time.it_value, &cts); } else { timespecsub(&ts, &cts); } TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); tfd->tfd_canceled = false; } else { tfd->tfd_canceled = true; callout_stop(&tfd->tfd_callout); } mtx_unlock(&tfd->tfd_lock); if (args->old_value != NULL) { error = native_to_linux_itimerspec(&lots, &ots); if (error == 0) error = copyout(&lots, args->old_value, sizeof(lots)); } out: fdrop(fp, td); return (error); } static void linux_timerfd_expire(void *arg) { struct timespec cts, ts; struct timeval tv; struct timerfd *tfd; tfd = (struct timerfd *)arg; linux_timerfd_clocktime(tfd, &cts); if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) { if (timespecisset(&tfd->tfd_time.it_interval)) timespecadd(&tfd->tfd_time.it_value, &tfd->tfd_time.it_interval); else /* single shot timer */ timespecclear(&tfd->tfd_time.it_value); if (timespecisset(&tfd->tfd_time.it_value)) { ts = tfd->tfd_time.it_value; timespecsub(&ts, &cts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); } tfd->tfd_count++; KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); selwakeup(&tfd->tfd_sel); wakeup(&tfd->tfd_count); } else if (timespecisset(&tfd->tfd_time.it_value)) { ts = tfd->tfd_time.it_value; timespecsub(&ts, &cts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); } } Index: head/sys/compat/linux/linux_file.c =================================================================== --- head/sys/compat/linux/linux_file.c (revision 333424) +++ head/sys/compat/linux/linux_file.c (revision 333425) @@ -1,1662 +1,1657 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include static int linux_common_open(struct thread *, int, char *, int, int); static int linux_getdents_error(struct thread *, int, int); int linux_creat(struct thread *td, struct linux_creat_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(creat)) printf(ARGS(creat, "%s, %d"), path, args->mode); #endif error = kern_openat(td, AT_FDCWD, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); LFREEPATH(path); return (error); } static int linux_common_open(struct thread *td, int dirfd, char *path, int l_flags, int mode) { - cap_rights_t rights; struct proc *p = td->td_proc; struct file *fp; int fd; int bsd_flags, error; bsd_flags = 0; switch (l_flags & LINUX_O_ACCMODE) { case LINUX_O_WRONLY: bsd_flags |= O_WRONLY; break; case LINUX_O_RDWR: bsd_flags |= O_RDWR; break; default: bsd_flags |= O_RDONLY; } if (l_flags & LINUX_O_NDELAY) bsd_flags |= O_NONBLOCK; if (l_flags & LINUX_O_APPEND) bsd_flags |= O_APPEND; if (l_flags & LINUX_O_SYNC) bsd_flags |= O_FSYNC; if (l_flags & LINUX_O_NONBLOCK) bsd_flags |= O_NONBLOCK; if (l_flags & LINUX_FASYNC) bsd_flags |= O_ASYNC; if (l_flags & LINUX_O_CREAT) bsd_flags |= O_CREAT; if (l_flags & LINUX_O_TRUNC) bsd_flags |= O_TRUNC; if (l_flags & LINUX_O_EXCL) bsd_flags |= O_EXCL; if (l_flags & LINUX_O_NOCTTY) bsd_flags |= O_NOCTTY; if (l_flags & LINUX_O_DIRECT) bsd_flags |= O_DIRECT; if (l_flags & LINUX_O_NOFOLLOW) bsd_flags |= O_NOFOLLOW; if (l_flags & LINUX_O_DIRECTORY) bsd_flags |= O_DIRECTORY; /* XXX LINUX_O_NOATIME: unable to be easily implemented. */ error = kern_openat(td, dirfd, path, UIO_SYSSPACE, bsd_flags, mode); if (error != 0) goto done; if (bsd_flags & O_NOCTTY) goto done; /* * XXX In between kern_openat() and fget(), another process * having the same filedesc could use that fd without * checking below. */ fd = td->td_retval[0]; - if (fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp) == 0) { + if (fget(td, fd, &cap_ioctl_rights, &fp) == 0) { if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); goto done; } sx_slock(&proctree_lock); PROC_LOCK(p); if (SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); /* XXXPJD: Verify if TIOCSCTTY is allowed. */ (void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td->td_ucred, td); } else { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); } fdrop(fp, td); } done: #ifdef DEBUG if (ldebug(open)) printf(LMSG("open returns error %d"), error); #endif LFREEPATH(path); return (error); } int linux_openat(struct thread *td, struct linux_openat_args *args) { char *path; int dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; if (args->flags & LINUX_O_CREAT) LCONVPATH_AT(td, args->filename, &path, 1, dfd); else LCONVPATH_AT(td, args->filename, &path, 0, dfd); #ifdef DEBUG if (ldebug(openat)) printf(ARGS(openat, "%i, %s, 0x%x, 0x%x"), args->dfd, path, args->flags, args->mode); #endif return (linux_common_open(td, dfd, path, args->flags, args->mode)); } int linux_open(struct thread *td, struct linux_open_args *args) { char *path; if (args->flags & LINUX_O_CREAT) LCONVPATHCREAT(td, args->path, &path); else LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(open)) printf(ARGS(open, "%s, 0x%x, 0x%x"), path, args->flags, args->mode); #endif return (linux_common_open(td, AT_FDCWD, path, args->flags, args->mode)); } int linux_lseek(struct thread *td, struct linux_lseek_args *args) { #ifdef DEBUG if (ldebug(lseek)) printf(ARGS(lseek, "%d, %ld, %d"), args->fdes, (long)args->off, args->whence); #endif return (kern_lseek(td, args->fdes, args->off, args->whence)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_llseek(struct thread *td, struct linux_llseek_args *args) { int error; off_t off; #ifdef DEBUG if (ldebug(llseek)) printf(ARGS(llseek, "%d, %d:%d, %d"), args->fd, args->ohigh, args->olow, args->whence); #endif off = (args->olow) | (((off_t) args->ohigh) << 32); error = kern_lseek(td, args->fd, off, args->whence); if (error != 0) return (error); error = copyout(td->td_retval, args->res, sizeof(off_t)); if (error != 0) return (error); td->td_retval[0] = 0; return (0); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ /* * Note that linux_getdents(2) and linux_getdents64(2) have the same * arguments. They only differ in the definition of struct dirent they * operate on. * Note that linux_readdir(2) is a special case of linux_getdents(2) * where count is always equals 1, meaning that the buffer is one * dirent-structure in size and that the code can't handle more anyway. * Note that linux_readdir(2) can't be implemented by means of linux_getdents(2) * as in case when the *dent buffer size is equal to 1 linux_getdents(2) will * trash user stack. */ static int linux_getdents_error(struct thread *td, int fd, int err) { - cap_rights_t rights; struct vnode *vp; struct file *fp; int error; /* Linux return ENOTDIR in case when fd is not a directory. */ - error = getvnode(td, fd, cap_rights_init(&rights, CAP_READ), &fp); + error = getvnode(td, fd, &cap_read_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; if (vp->v_type != VDIR) { fdrop(fp, td); return (ENOTDIR); } fdrop(fp, td); return (err); } struct l_dirent { l_ulong d_ino; l_off_t d_off; l_ushort d_reclen; char d_name[LINUX_NAME_MAX + 1]; }; struct l_dirent64 { uint64_t d_ino; int64_t d_off; l_ushort d_reclen; u_char d_type; char d_name[LINUX_NAME_MAX + 1]; }; /* * Linux uses the last byte in the dirent buffer to store d_type, * at least glibc-2.7 requires it. That is why l_dirent is padded with 2 bytes. */ #define LINUX_RECLEN(namlen) \ roundup(offsetof(struct l_dirent, d_name) + (namlen) + 2, sizeof(l_ulong)) #define LINUX_RECLEN64(namlen) \ roundup(offsetof(struct l_dirent64, d_name) + (namlen) + 1, \ sizeof(uint64_t)) int linux_getdents(struct thread *td, struct linux_getdents_args *args) { struct dirent *bdp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* Linux-format */ int resid, linuxreclen; /* Linux-format */ caddr_t lbuf; /* Linux-format */ off_t base; struct l_dirent *linux_dirent; int buflen, error; size_t retval; #ifdef DEBUG if (ldebug(getdents)) printf(ARGS(getdents, "%d, *, %d"), args->fd, args->count); #endif buflen = min(args->count, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); error = kern_getdirentries(td, args->fd, buf, buflen, &base, NULL, UIO_SYSSPACE); if (error != 0) { error = linux_getdents_error(td, args->fd, error); goto out1; } lbuf = malloc(LINUX_RECLEN(LINUX_NAME_MAX), M_TEMP, M_WAITOK | M_ZERO); len = td->td_retval[0]; inp = buf; outp = (caddr_t)args->dent; resid = args->count; retval = 0; while (len > 0) { bdp = (struct dirent *) inp; reclen = bdp->d_reclen; linuxreclen = LINUX_RECLEN(bdp->d_namlen); /* * No more space in the user supplied dirent buffer. * Return EINVAL. */ if (resid < linuxreclen) { error = EINVAL; goto out; } linux_dirent = (struct l_dirent*)lbuf; linux_dirent->d_ino = bdp->d_fileno; linux_dirent->d_off = base + reclen; linux_dirent->d_reclen = linuxreclen; /* * Copy d_type to last byte of l_dirent buffer */ lbuf[linuxreclen - 1] = bdp->d_type; strlcpy(linux_dirent->d_name, bdp->d_name, linuxreclen - offsetof(struct l_dirent, d_name)-1); error = copyout(linux_dirent, outp, linuxreclen); if (error != 0) goto out; inp += reclen; base += reclen; len -= reclen; retval += linuxreclen; outp += linuxreclen; resid -= linuxreclen; } td->td_retval[0] = retval; out: free(lbuf, M_TEMP); out1: free(buf, M_TEMP); return (error); } int linux_getdents64(struct thread *td, struct linux_getdents64_args *args) { struct dirent *bdp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* Linux-format */ int resid, linuxreclen; /* Linux-format */ caddr_t lbuf; /* Linux-format */ off_t base; struct l_dirent64 *linux_dirent64; int buflen, error; size_t retval; #ifdef DEBUG if (ldebug(getdents64)) uprintf(ARGS(getdents64, "%d, *, %d"), args->fd, args->count); #endif buflen = min(args->count, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); error = kern_getdirentries(td, args->fd, buf, buflen, &base, NULL, UIO_SYSSPACE); if (error != 0) { error = linux_getdents_error(td, args->fd, error); goto out1; } lbuf = malloc(LINUX_RECLEN64(LINUX_NAME_MAX), M_TEMP, M_WAITOK | M_ZERO); len = td->td_retval[0]; inp = buf; outp = (caddr_t)args->dirent; resid = args->count; retval = 0; while (len > 0) { bdp = (struct dirent *) inp; reclen = bdp->d_reclen; linuxreclen = LINUX_RECLEN64(bdp->d_namlen); /* * No more space in the user supplied dirent buffer. * Return EINVAL. */ if (resid < linuxreclen) { error = EINVAL; goto out; } linux_dirent64 = (struct l_dirent64*)lbuf; linux_dirent64->d_ino = bdp->d_fileno; linux_dirent64->d_off = base + reclen; linux_dirent64->d_reclen = linuxreclen; linux_dirent64->d_type = bdp->d_type; strlcpy(linux_dirent64->d_name, bdp->d_name, linuxreclen - offsetof(struct l_dirent64, d_name)); error = copyout(linux_dirent64, outp, linuxreclen); if (error != 0) goto out; inp += reclen; base += reclen; len -= reclen; retval += linuxreclen; outp += linuxreclen; resid -= linuxreclen; } td->td_retval[0] = retval; out: free(lbuf, M_TEMP); out1: free(buf, M_TEMP); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_readdir(struct thread *td, struct linux_readdir_args *args) { struct dirent *bdp; caddr_t buf; /* BSD-format */ int linuxreclen; /* Linux-format */ caddr_t lbuf; /* Linux-format */ off_t base; struct l_dirent *linux_dirent; int buflen, error; #ifdef DEBUG if (ldebug(readdir)) printf(ARGS(readdir, "%d, *"), args->fd); #endif buflen = LINUX_RECLEN(LINUX_NAME_MAX); buf = malloc(buflen, M_TEMP, M_WAITOK); error = kern_getdirentries(td, args->fd, buf, buflen, &base, NULL, UIO_SYSSPACE); if (error != 0) { error = linux_getdents_error(td, args->fd, error); goto out; } if (td->td_retval[0] == 0) goto out; lbuf = malloc(LINUX_RECLEN(LINUX_NAME_MAX), M_TEMP, M_WAITOK | M_ZERO); bdp = (struct dirent *) buf; linuxreclen = LINUX_RECLEN(bdp->d_namlen); linux_dirent = (struct l_dirent*)lbuf; linux_dirent->d_ino = bdp->d_fileno; linux_dirent->d_off = linuxreclen; linux_dirent->d_reclen = bdp->d_namlen; strlcpy(linux_dirent->d_name, bdp->d_name, linuxreclen - offsetof(struct l_dirent, d_name)); error = copyout(linux_dirent, args->dent, linuxreclen); if (error == 0) td->td_retval[0] = linuxreclen; free(lbuf, M_TEMP); out: free(buf, M_TEMP); return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ /* * These exist mainly for hooks for doing /compat/linux translation. */ int linux_access(struct thread *td, struct linux_access_args *args) { char *path; int error; /* Linux convention. */ if (args->amode & ~(F_OK | X_OK | W_OK | R_OK)) return (EINVAL); LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(access)) printf(ARGS(access, "%s, %d"), path, args->amode); #endif error = kern_accessat(td, AT_FDCWD, path, UIO_SYSSPACE, 0, args->amode); LFREEPATH(path); return (error); } int linux_faccessat(struct thread *td, struct linux_faccessat_args *args) { char *path; int error, dfd; /* Linux convention. */ if (args->amode & ~(F_OK | X_OK | W_OK | R_OK)) return (EINVAL); dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->filename, &path, dfd); #ifdef DEBUG if (ldebug(access)) printf(ARGS(access, "%s, %d"), path, args->amode); #endif error = kern_accessat(td, dfd, path, UIO_SYSSPACE, 0, args->amode); LFREEPATH(path); return (error); } int linux_unlink(struct thread *td, struct linux_unlink_args *args) { char *path; int error; struct stat st; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(unlink)) printf(ARGS(unlink, "%s"), path); #endif error = kern_unlinkat(td, AT_FDCWD, path, UIO_SYSSPACE, 0); if (error == EPERM) { /* Introduce POSIX noncompliant behaviour of Linux */ if (kern_statat(td, 0, AT_FDCWD, path, UIO_SYSSPACE, &st, NULL) == 0) { if (S_ISDIR(st.st_mode)) error = EISDIR; } } LFREEPATH(path); return (error); } int linux_unlinkat(struct thread *td, struct linux_unlinkat_args *args) { char *path; int error, dfd; struct stat st; if (args->flag & ~LINUX_AT_REMOVEDIR) return (EINVAL); dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->pathname, &path, dfd); #ifdef DEBUG if (ldebug(unlinkat)) printf(ARGS(unlinkat, "%s"), path); #endif if (args->flag & LINUX_AT_REMOVEDIR) error = kern_rmdirat(td, dfd, path, UIO_SYSSPACE); else error = kern_unlinkat(td, dfd, path, UIO_SYSSPACE, 0); if (error == EPERM && !(args->flag & LINUX_AT_REMOVEDIR)) { /* Introduce POSIX noncompliant behaviour of Linux */ if (kern_statat(td, AT_SYMLINK_NOFOLLOW, dfd, path, UIO_SYSSPACE, &st, NULL) == 0 && S_ISDIR(st.st_mode)) error = EISDIR; } LFREEPATH(path); return (error); } int linux_chdir(struct thread *td, struct linux_chdir_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(chdir)) printf(ARGS(chdir, "%s"), path); #endif error = kern_chdir(td, path, UIO_SYSSPACE); LFREEPATH(path); return (error); } int linux_chmod(struct thread *td, struct linux_chmod_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(chmod)) printf(ARGS(chmod, "%s, %d"), path, args->mode); #endif error = kern_fchmodat(td, AT_FDCWD, path, UIO_SYSSPACE, args->mode, 0); LFREEPATH(path); return (error); } int linux_fchmodat(struct thread *td, struct linux_fchmodat_args *args) { char *path; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->filename, &path, dfd); #ifdef DEBUG if (ldebug(fchmodat)) printf(ARGS(fchmodat, "%s, %d"), path, args->mode); #endif error = kern_fchmodat(td, dfd, path, UIO_SYSSPACE, args->mode, 0); LFREEPATH(path); return (error); } int linux_mkdir(struct thread *td, struct linux_mkdir_args *args) { char *path; int error; LCONVPATHCREAT(td, args->path, &path); #ifdef DEBUG if (ldebug(mkdir)) printf(ARGS(mkdir, "%s, %d"), path, args->mode); #endif error = kern_mkdirat(td, AT_FDCWD, path, UIO_SYSSPACE, args->mode); LFREEPATH(path); return (error); } int linux_mkdirat(struct thread *td, struct linux_mkdirat_args *args) { char *path; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHCREAT_AT(td, args->pathname, &path, dfd); #ifdef DEBUG if (ldebug(mkdirat)) printf(ARGS(mkdirat, "%s, %d"), path, args->mode); #endif error = kern_mkdirat(td, dfd, path, UIO_SYSSPACE, args->mode); LFREEPATH(path); return (error); } int linux_rmdir(struct thread *td, struct linux_rmdir_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(rmdir)) printf(ARGS(rmdir, "%s"), path); #endif error = kern_rmdirat(td, AT_FDCWD, path, UIO_SYSSPACE); LFREEPATH(path); return (error); } int linux_rename(struct thread *td, struct linux_rename_args *args) { char *from, *to; int error; LCONVPATHEXIST(td, args->from, &from); /* Expand LCONVPATHCREATE so that `from' can be freed on errors */ error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1, AT_FDCWD); if (to == NULL) { LFREEPATH(from); return (error); } #ifdef DEBUG if (ldebug(rename)) printf(ARGS(rename, "%s, %s"), from, to); #endif error = kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, UIO_SYSSPACE); LFREEPATH(from); LFREEPATH(to); return (error); } int linux_renameat(struct thread *td, struct linux_renameat_args *args) { char *from, *to; int error, olddfd, newdfd; olddfd = (args->olddfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->olddfd; newdfd = (args->newdfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->newdfd; LCONVPATHEXIST_AT(td, args->oldname, &from, olddfd); /* Expand LCONVPATHCREATE so that `from' can be freed on errors */ error = linux_emul_convpath(td, args->newname, UIO_USERSPACE, &to, 1, newdfd); if (to == NULL) { LFREEPATH(from); return (error); } #ifdef DEBUG if (ldebug(renameat)) printf(ARGS(renameat, "%s, %s"), from, to); #endif error = kern_renameat(td, olddfd, from, newdfd, to, UIO_SYSSPACE); LFREEPATH(from); LFREEPATH(to); return (error); } int linux_symlink(struct thread *td, struct linux_symlink_args *args) { char *path, *to; int error; LCONVPATHEXIST(td, args->path, &path); /* Expand LCONVPATHCREATE so that `path' can be freed on errors */ error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1, AT_FDCWD); if (to == NULL) { LFREEPATH(path); return (error); } #ifdef DEBUG if (ldebug(symlink)) printf(ARGS(symlink, "%s, %s"), path, to); #endif error = kern_symlinkat(td, path, AT_FDCWD, to, UIO_SYSSPACE); LFREEPATH(path); LFREEPATH(to); return (error); } int linux_symlinkat(struct thread *td, struct linux_symlinkat_args *args) { char *path, *to; int error, dfd; dfd = (args->newdfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->newdfd; LCONVPATHEXIST(td, args->oldname, &path); /* Expand LCONVPATHCREATE so that `path' can be freed on errors */ error = linux_emul_convpath(td, args->newname, UIO_USERSPACE, &to, 1, dfd); if (to == NULL) { LFREEPATH(path); return (error); } #ifdef DEBUG if (ldebug(symlinkat)) printf(ARGS(symlinkat, "%s, %s"), path, to); #endif error = kern_symlinkat(td, path, dfd, to, UIO_SYSSPACE); LFREEPATH(path); LFREEPATH(to); return (error); } int linux_readlink(struct thread *td, struct linux_readlink_args *args) { char *name; int error; LCONVPATHEXIST(td, args->name, &name); #ifdef DEBUG if (ldebug(readlink)) printf(ARGS(readlink, "%s, %p, %d"), name, (void *)args->buf, args->count); #endif error = kern_readlinkat(td, AT_FDCWD, name, UIO_SYSSPACE, args->buf, UIO_USERSPACE, args->count); LFREEPATH(name); return (error); } int linux_readlinkat(struct thread *td, struct linux_readlinkat_args *args) { char *name; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->path, &name, dfd); #ifdef DEBUG if (ldebug(readlinkat)) printf(ARGS(readlinkat, "%s, %p, %d"), name, (void *)args->buf, args->bufsiz); #endif error = kern_readlinkat(td, dfd, name, UIO_SYSSPACE, args->buf, UIO_USERSPACE, args->bufsiz); LFREEPATH(name); return (error); } int linux_truncate(struct thread *td, struct linux_truncate_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(truncate)) printf(ARGS(truncate, "%s, %ld"), path, (long)args->length); #endif error = kern_truncate(td, path, UIO_SYSSPACE, args->length); LFREEPATH(path); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_truncate64(struct thread *td, struct linux_truncate64_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(truncate64)) printf(ARGS(truncate64, "%s, %jd"), path, args->length); #endif error = kern_truncate(td, path, UIO_SYSSPACE, args->length); LFREEPATH(path); return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_ftruncate(struct thread *td, struct linux_ftruncate_args *args) { return (kern_ftruncate(td, args->fd, args->length)); } int linux_link(struct thread *td, struct linux_link_args *args) { char *path, *to; int error; LCONVPATHEXIST(td, args->path, &path); /* Expand LCONVPATHCREATE so that `path' can be freed on errors */ error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1, AT_FDCWD); if (to == NULL) { LFREEPATH(path); return (error); } #ifdef DEBUG if (ldebug(link)) printf(ARGS(link, "%s, %s"), path, to); #endif error = kern_linkat(td, AT_FDCWD, AT_FDCWD, path, to, UIO_SYSSPACE, FOLLOW); LFREEPATH(path); LFREEPATH(to); return (error); } int linux_linkat(struct thread *td, struct linux_linkat_args *args) { char *path, *to; int error, olddfd, newdfd, follow; if (args->flag & ~LINUX_AT_SYMLINK_FOLLOW) return (EINVAL); olddfd = (args->olddfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->olddfd; newdfd = (args->newdfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->newdfd; LCONVPATHEXIST_AT(td, args->oldname, &path, olddfd); /* Expand LCONVPATHCREATE so that `path' can be freed on errors */ error = linux_emul_convpath(td, args->newname, UIO_USERSPACE, &to, 1, newdfd); if (to == NULL) { LFREEPATH(path); return (error); } #ifdef DEBUG if (ldebug(linkat)) printf(ARGS(linkat, "%i, %s, %i, %s, %i"), args->olddfd, path, args->newdfd, to, args->flag); #endif follow = (args->flag & LINUX_AT_SYMLINK_FOLLOW) == 0 ? NOFOLLOW : FOLLOW; error = kern_linkat(td, olddfd, newdfd, path, to, UIO_SYSSPACE, follow); LFREEPATH(path); LFREEPATH(to); return (error); } int linux_fdatasync(td, uap) struct thread *td; struct linux_fdatasync_args *uap; { return (kern_fsync(td, uap->fd, false)); } int linux_pread(struct thread *td, struct linux_pread_args *uap) { - cap_rights_t rights; struct vnode *vp; int error; error = kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset); if (error == 0) { /* This seems to violate POSIX but Linux does it. */ - error = fgetvp(td, uap->fd, - cap_rights_init(&rights, CAP_PREAD), &vp); + error = fgetvp(td, uap->fd, &cap_pread_rights, &vp); if (error != 0) return (error); if (vp->v_type == VDIR) { vrele(vp); return (EISDIR); } vrele(vp); } return (error); } int linux_pwrite(struct thread *td, struct linux_pwrite_args *uap) { return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } int linux_preadv(struct thread *td, struct linux_preadv_args *uap) { struct uio *auio; int error; off_t offset; /* * According http://man7.org/linux/man-pages/man2/preadv.2.html#NOTES * pos_l and pos_h, respectively, contain the * low order and high order 32 bits of offset. */ offset = (((off_t)uap->pos_h << (sizeof(offset) * 4)) << (sizeof(offset) * 4)) | uap->pos_l; if (offset < 0) return (EINVAL); #ifdef COMPAT_LINUX32 error = linux32_copyinuio(PTRIN(uap->vec), uap->vlen, &auio); #else error = copyinuio(uap->vec, uap->vlen, &auio); #endif if (error != 0) return (error); error = kern_preadv(td, uap->fd, auio, offset); free(auio, M_IOV); return (error); } int linux_pwritev(struct thread *td, struct linux_pwritev_args *uap) { struct uio *auio; int error; off_t offset; /* * According http://man7.org/linux/man-pages/man2/pwritev.2.html#NOTES * pos_l and pos_h, respectively, contain the * low order and high order 32 bits of offset. */ offset = (((off_t)uap->pos_h << (sizeof(offset) * 4)) << (sizeof(offset) * 4)) | uap->pos_l; if (offset < 0) return (EINVAL); #ifdef COMPAT_LINUX32 error = linux32_copyinuio(PTRIN(uap->vec), uap->vlen, &auio); #else error = copyinuio(uap->vec, uap->vlen, &auio); #endif if (error != 0) return (error); error = kern_pwritev(td, uap->fd, auio, offset); free(auio, M_IOV); return (error); } int linux_mount(struct thread *td, struct linux_mount_args *args) { char fstypename[MFSNAMELEN]; char *mntonname, *mntfromname; int error, fsflags; mntonname = malloc(MNAMELEN, M_TEMP, M_WAITOK); mntfromname = malloc(MNAMELEN, M_TEMP, M_WAITOK); error = copyinstr(args->filesystemtype, fstypename, MFSNAMELEN - 1, NULL); if (error != 0) goto out; error = copyinstr(args->specialfile, mntfromname, MNAMELEN - 1, NULL); if (error != 0) goto out; error = copyinstr(args->dir, mntonname, MNAMELEN - 1, NULL); if (error != 0) goto out; #ifdef DEBUG if (ldebug(mount)) printf(ARGS(mount, "%s, %s, %s"), fstypename, mntfromname, mntonname); #endif if (strcmp(fstypename, "ext2") == 0) { strcpy(fstypename, "ext2fs"); } else if (strcmp(fstypename, "proc") == 0) { strcpy(fstypename, "linprocfs"); } else if (strcmp(fstypename, "vfat") == 0) { strcpy(fstypename, "msdosfs"); } fsflags = 0; if ((args->rwflag & 0xffff0000) == 0xc0ed0000) { /* * Linux SYNC flag is not included; the closest equivalent * FreeBSD has is !ASYNC, which is our default. */ if (args->rwflag & LINUX_MS_RDONLY) fsflags |= MNT_RDONLY; if (args->rwflag & LINUX_MS_NOSUID) fsflags |= MNT_NOSUID; if (args->rwflag & LINUX_MS_NOEXEC) fsflags |= MNT_NOEXEC; if (args->rwflag & LINUX_MS_REMOUNT) fsflags |= MNT_UPDATE; } error = kernel_vmount(fsflags, "fstype", fstypename, "fspath", mntonname, "from", mntfromname, NULL); out: free(mntonname, M_TEMP); free(mntfromname, M_TEMP); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_oldumount(struct thread *td, struct linux_oldumount_args *args) { struct linux_umount_args args2; args2.path = args->path; args2.flags = 0; return (linux_umount(td, &args2)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_umount(struct thread *td, struct linux_umount_args *args) { struct unmount_args bsd; bsd.path = args->path; bsd.flags = args->flags; /* XXX correct? */ return (sys_unmount(td, &bsd)); } /* * fcntl family of syscalls */ struct l_flock { l_short l_type; l_short l_whence; l_off_t l_start; l_off_t l_len; l_pid_t l_pid; } #if defined(__amd64__) && defined(COMPAT_LINUX32) __packed #endif ; static void linux_to_bsd_flock(struct l_flock *linux_flock, struct flock *bsd_flock) { switch (linux_flock->l_type) { case LINUX_F_RDLCK: bsd_flock->l_type = F_RDLCK; break; case LINUX_F_WRLCK: bsd_flock->l_type = F_WRLCK; break; case LINUX_F_UNLCK: bsd_flock->l_type = F_UNLCK; break; default: bsd_flock->l_type = -1; break; } bsd_flock->l_whence = linux_flock->l_whence; bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; bsd_flock->l_sysid = 0; } static void bsd_to_linux_flock(struct flock *bsd_flock, struct l_flock *linux_flock) { switch (bsd_flock->l_type) { case F_RDLCK: linux_flock->l_type = LINUX_F_RDLCK; break; case F_WRLCK: linux_flock->l_type = LINUX_F_WRLCK; break; case F_UNLCK: linux_flock->l_type = LINUX_F_UNLCK; break; } linux_flock->l_whence = bsd_flock->l_whence; linux_flock->l_start = (l_off_t)bsd_flock->l_start; linux_flock->l_len = (l_off_t)bsd_flock->l_len; linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid; } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) struct l_flock64 { l_short l_type; l_short l_whence; l_loff_t l_start; l_loff_t l_len; l_pid_t l_pid; } #if defined(__amd64__) && defined(COMPAT_LINUX32) __packed #endif ; static void linux_to_bsd_flock64(struct l_flock64 *linux_flock, struct flock *bsd_flock) { switch (linux_flock->l_type) { case LINUX_F_RDLCK: bsd_flock->l_type = F_RDLCK; break; case LINUX_F_WRLCK: bsd_flock->l_type = F_WRLCK; break; case LINUX_F_UNLCK: bsd_flock->l_type = F_UNLCK; break; default: bsd_flock->l_type = -1; break; } bsd_flock->l_whence = linux_flock->l_whence; bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; bsd_flock->l_sysid = 0; } static void bsd_to_linux_flock64(struct flock *bsd_flock, struct l_flock64 *linux_flock) { switch (bsd_flock->l_type) { case F_RDLCK: linux_flock->l_type = LINUX_F_RDLCK; break; case F_WRLCK: linux_flock->l_type = LINUX_F_WRLCK; break; case F_UNLCK: linux_flock->l_type = LINUX_F_UNLCK; break; } linux_flock->l_whence = bsd_flock->l_whence; linux_flock->l_start = (l_loff_t)bsd_flock->l_start; linux_flock->l_len = (l_loff_t)bsd_flock->l_len; linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid; } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ static int fcntl_common(struct thread *td, struct linux_fcntl_args *args) { struct l_flock linux_flock; struct flock bsd_flock; - cap_rights_t rights; struct file *fp; long arg; int error, result; switch (args->cmd) { case LINUX_F_DUPFD: return (kern_fcntl(td, args->fd, F_DUPFD, args->arg)); case LINUX_F_GETFD: return (kern_fcntl(td, args->fd, F_GETFD, 0)); case LINUX_F_SETFD: return (kern_fcntl(td, args->fd, F_SETFD, args->arg)); case LINUX_F_GETFL: error = kern_fcntl(td, args->fd, F_GETFL, 0); result = td->td_retval[0]; td->td_retval[0] = 0; if (result & O_RDONLY) td->td_retval[0] |= LINUX_O_RDONLY; if (result & O_WRONLY) td->td_retval[0] |= LINUX_O_WRONLY; if (result & O_RDWR) td->td_retval[0] |= LINUX_O_RDWR; if (result & O_NDELAY) td->td_retval[0] |= LINUX_O_NONBLOCK; if (result & O_APPEND) td->td_retval[0] |= LINUX_O_APPEND; if (result & O_FSYNC) td->td_retval[0] |= LINUX_O_SYNC; if (result & O_ASYNC) td->td_retval[0] |= LINUX_FASYNC; #ifdef LINUX_O_NOFOLLOW if (result & O_NOFOLLOW) td->td_retval[0] |= LINUX_O_NOFOLLOW; #endif #ifdef LINUX_O_DIRECT if (result & O_DIRECT) td->td_retval[0] |= LINUX_O_DIRECT; #endif return (error); case LINUX_F_SETFL: arg = 0; if (args->arg & LINUX_O_NDELAY) arg |= O_NONBLOCK; if (args->arg & LINUX_O_APPEND) arg |= O_APPEND; if (args->arg & LINUX_O_SYNC) arg |= O_FSYNC; if (args->arg & LINUX_FASYNC) arg |= O_ASYNC; #ifdef LINUX_O_NOFOLLOW if (args->arg & LINUX_O_NOFOLLOW) arg |= O_NOFOLLOW; #endif #ifdef LINUX_O_DIRECT if (args->arg & LINUX_O_DIRECT) arg |= O_DIRECT; #endif return (kern_fcntl(td, args->fd, F_SETFL, arg)); case LINUX_F_GETLK: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, &bsd_flock); error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock); if (error) return (error); bsd_to_linux_flock(&bsd_flock, &linux_flock); return (copyout(&linux_flock, (void *)args->arg, sizeof(linux_flock))); case LINUX_F_SETLK: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLK, (intptr_t)&bsd_flock)); case LINUX_F_SETLKW: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLKW, (intptr_t)&bsd_flock)); case LINUX_F_GETOWN: return (kern_fcntl(td, args->fd, F_GETOWN, 0)); case LINUX_F_SETOWN: /* * XXX some Linux applications depend on F_SETOWN having no * significant effect for pipes (SIGIO is not delivered for * pipes under Linux-2.2.35 at least). */ error = fget(td, args->fd, - cap_rights_init(&rights, CAP_FCNTL), &fp); + &cap_fcntl_rights, &fp); if (error) return (error); if (fp->f_type == DTYPE_PIPE) { fdrop(fp, td); return (EINVAL); } fdrop(fp, td); return (kern_fcntl(td, args->fd, F_SETOWN, args->arg)); case LINUX_F_DUPFD_CLOEXEC: return (kern_fcntl(td, args->fd, F_DUPFD_CLOEXEC, args->arg)); } return (EINVAL); } int linux_fcntl(struct thread *td, struct linux_fcntl_args *args) { #ifdef DEBUG if (ldebug(fcntl)) printf(ARGS(fcntl, "%d, %08x, *"), args->fd, args->cmd); #endif return (fcntl_common(td, args)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_fcntl64(struct thread *td, struct linux_fcntl64_args *args) { struct l_flock64 linux_flock; struct flock bsd_flock; struct linux_fcntl_args fcntl_args; int error; #ifdef DEBUG if (ldebug(fcntl64)) printf(ARGS(fcntl64, "%d, %08x, *"), args->fd, args->cmd); #endif switch (args->cmd) { case LINUX_F_GETLK64: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, &bsd_flock); error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock); if (error) return (error); bsd_to_linux_flock64(&bsd_flock, &linux_flock); return (copyout(&linux_flock, (void *)args->arg, sizeof(linux_flock))); case LINUX_F_SETLK64: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLK, (intptr_t)&bsd_flock)); case LINUX_F_SETLKW64: error = copyin((void *)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, &bsd_flock); return (kern_fcntl(td, args->fd, F_SETLKW, (intptr_t)&bsd_flock)); } fcntl_args.fd = args->fd; fcntl_args.cmd = args->cmd; fcntl_args.arg = args->arg; return (fcntl_common(td, &fcntl_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_chown(struct thread *td, struct linux_chown_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(chown)) printf(ARGS(chown, "%s, %d, %d"), path, args->uid, args->gid); #endif error = kern_fchownat(td, AT_FDCWD, path, UIO_SYSSPACE, args->uid, args->gid, 0); LFREEPATH(path); return (error); } int linux_fchownat(struct thread *td, struct linux_fchownat_args *args) { char *path; int error, dfd, flag; if (args->flag & ~LINUX_AT_SYMLINK_NOFOLLOW) return (EINVAL); dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->filename, &path, dfd); #ifdef DEBUG if (ldebug(fchownat)) printf(ARGS(fchownat, "%s, %d, %d"), path, args->uid, args->gid); #endif flag = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) == 0 ? 0 : AT_SYMLINK_NOFOLLOW; error = kern_fchownat(td, dfd, path, UIO_SYSSPACE, args->uid, args->gid, flag); LFREEPATH(path); return (error); } int linux_lchown(struct thread *td, struct linux_lchown_args *args) { char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(lchown)) printf(ARGS(lchown, "%s, %d, %d"), path, args->uid, args->gid); #endif error = kern_fchownat(td, AT_FDCWD, path, UIO_SYSSPACE, args->uid, args->gid, AT_SYMLINK_NOFOLLOW); LFREEPATH(path); return (error); } static int convert_fadvice(int advice) { switch (advice) { case LINUX_POSIX_FADV_NORMAL: return (POSIX_FADV_NORMAL); case LINUX_POSIX_FADV_RANDOM: return (POSIX_FADV_RANDOM); case LINUX_POSIX_FADV_SEQUENTIAL: return (POSIX_FADV_SEQUENTIAL); case LINUX_POSIX_FADV_WILLNEED: return (POSIX_FADV_WILLNEED); case LINUX_POSIX_FADV_DONTNEED: return (POSIX_FADV_DONTNEED); case LINUX_POSIX_FADV_NOREUSE: return (POSIX_FADV_NOREUSE); default: return (-1); } } int linux_fadvise64(struct thread *td, struct linux_fadvise64_args *args) { int advice; advice = convert_fadvice(args->advice); if (advice == -1) return (EINVAL); return (kern_posix_fadvise(td, args->fd, args->offset, args->len, advice)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_fadvise64_64(struct thread *td, struct linux_fadvise64_64_args *args) { int advice; advice = convert_fadvice(args->advice); if (advice == -1) return (EINVAL); return (kern_posix_fadvise(td, args->fd, args->offset, args->len, advice)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_pipe(struct thread *td, struct linux_pipe_args *args) { int fildes[2]; int error; #ifdef DEBUG if (ldebug(pipe)) printf(ARGS(pipe, "*")); #endif error = kern_pipe(td, fildes, 0, NULL, NULL); if (error != 0) return (error); error = copyout(fildes, args->pipefds, sizeof(fildes)); if (error != 0) { (void)kern_close(td, fildes[0]); (void)kern_close(td, fildes[1]); } return (error); } int linux_pipe2(struct thread *td, struct linux_pipe2_args *args) { int fildes[2]; int error, flags; #ifdef DEBUG if (ldebug(pipe2)) printf(ARGS(pipe2, "*, %d"), args->flags); #endif if ((args->flags & ~(LINUX_O_NONBLOCK | LINUX_O_CLOEXEC)) != 0) return (EINVAL); flags = 0; if ((args->flags & LINUX_O_NONBLOCK) != 0) flags |= O_NONBLOCK; if ((args->flags & LINUX_O_CLOEXEC) != 0) flags |= O_CLOEXEC; error = kern_pipe(td, fildes, flags, NULL, NULL); if (error != 0) return (error); error = copyout(fildes, args->pipefds, sizeof(fildes)); if (error != 0) { (void)kern_close(td, fildes[0]); (void)kern_close(td, fildes[1]); } return (error); } int linux_dup3(struct thread *td, struct linux_dup3_args *args) { int cmd; intptr_t newfd; if (args->oldfd == args->newfd) return (EINVAL); if ((args->flags & ~LINUX_O_CLOEXEC) != 0) return (EINVAL); if (args->flags & LINUX_O_CLOEXEC) cmd = F_DUP2FD_CLOEXEC; else cmd = F_DUP2FD; newfd = args->newfd; return (kern_fcntl(td, args->oldfd, cmd, newfd)); } int linux_fallocate(struct thread *td, struct linux_fallocate_args *args) { /* * We emulate only posix_fallocate system call for which * mode should be 0. */ if (args->mode != 0) return (ENOSYS); return (kern_posix_fallocate(td, args->fd, args->offset, args->len)); } Index: head/sys/compat/linux/linux_ioctl.c =================================================================== --- head/sys/compat/linux/linux_ioctl.c (revision 333424) +++ head/sys/compat/linux/linux_ioctl.c (revision 333425) @@ -1,3812 +1,3800 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include #include #include #include #include #include CTASSERT(LINUX_IFNAMSIZ == IFNAMSIZ); static linux_ioctl_function_t linux_ioctl_cdrom; static linux_ioctl_function_t linux_ioctl_vfat; static linux_ioctl_function_t linux_ioctl_console; static linux_ioctl_function_t linux_ioctl_hdio; static linux_ioctl_function_t linux_ioctl_disk; static linux_ioctl_function_t linux_ioctl_socket; static linux_ioctl_function_t linux_ioctl_sound; static linux_ioctl_function_t linux_ioctl_termio; static linux_ioctl_function_t linux_ioctl_private; static linux_ioctl_function_t linux_ioctl_drm; static linux_ioctl_function_t linux_ioctl_sg; static linux_ioctl_function_t linux_ioctl_v4l; static linux_ioctl_function_t linux_ioctl_v4l2; static linux_ioctl_function_t linux_ioctl_special; static linux_ioctl_function_t linux_ioctl_fbsd_usb; static linux_ioctl_function_t linux_ioctl_evdev; static struct linux_ioctl_handler cdrom_handler = { linux_ioctl_cdrom, LINUX_IOCTL_CDROM_MIN, LINUX_IOCTL_CDROM_MAX }; static struct linux_ioctl_handler vfat_handler = { linux_ioctl_vfat, LINUX_IOCTL_VFAT_MIN, LINUX_IOCTL_VFAT_MAX }; static struct linux_ioctl_handler console_handler = { linux_ioctl_console, LINUX_IOCTL_CONSOLE_MIN, LINUX_IOCTL_CONSOLE_MAX }; static struct linux_ioctl_handler hdio_handler = { linux_ioctl_hdio, LINUX_IOCTL_HDIO_MIN, LINUX_IOCTL_HDIO_MAX }; static struct linux_ioctl_handler disk_handler = { linux_ioctl_disk, LINUX_IOCTL_DISK_MIN, LINUX_IOCTL_DISK_MAX }; static struct linux_ioctl_handler socket_handler = { linux_ioctl_socket, LINUX_IOCTL_SOCKET_MIN, LINUX_IOCTL_SOCKET_MAX }; static struct linux_ioctl_handler sound_handler = { linux_ioctl_sound, LINUX_IOCTL_SOUND_MIN, LINUX_IOCTL_SOUND_MAX }; static struct linux_ioctl_handler termio_handler = { linux_ioctl_termio, LINUX_IOCTL_TERMIO_MIN, LINUX_IOCTL_TERMIO_MAX }; static struct linux_ioctl_handler private_handler = { linux_ioctl_private, LINUX_IOCTL_PRIVATE_MIN, LINUX_IOCTL_PRIVATE_MAX }; static struct linux_ioctl_handler drm_handler = { linux_ioctl_drm, LINUX_IOCTL_DRM_MIN, LINUX_IOCTL_DRM_MAX }; static struct linux_ioctl_handler sg_handler = { linux_ioctl_sg, LINUX_IOCTL_SG_MIN, LINUX_IOCTL_SG_MAX }; static struct linux_ioctl_handler video_handler = { linux_ioctl_v4l, LINUX_IOCTL_VIDEO_MIN, LINUX_IOCTL_VIDEO_MAX }; static struct linux_ioctl_handler video2_handler = { linux_ioctl_v4l2, LINUX_IOCTL_VIDEO2_MIN, LINUX_IOCTL_VIDEO2_MAX }; static struct linux_ioctl_handler fbsd_usb = { linux_ioctl_fbsd_usb, FBSD_LUSB_MIN, FBSD_LUSB_MAX }; static struct linux_ioctl_handler evdev_handler = { linux_ioctl_evdev, LINUX_IOCTL_EVDEV_MIN, LINUX_IOCTL_EVDEV_MAX }; DATA_SET(linux_ioctl_handler_set, cdrom_handler); DATA_SET(linux_ioctl_handler_set, vfat_handler); DATA_SET(linux_ioctl_handler_set, console_handler); DATA_SET(linux_ioctl_handler_set, hdio_handler); DATA_SET(linux_ioctl_handler_set, disk_handler); DATA_SET(linux_ioctl_handler_set, socket_handler); DATA_SET(linux_ioctl_handler_set, sound_handler); DATA_SET(linux_ioctl_handler_set, termio_handler); DATA_SET(linux_ioctl_handler_set, private_handler); DATA_SET(linux_ioctl_handler_set, drm_handler); DATA_SET(linux_ioctl_handler_set, sg_handler); DATA_SET(linux_ioctl_handler_set, video_handler); DATA_SET(linux_ioctl_handler_set, video2_handler); DATA_SET(linux_ioctl_handler_set, fbsd_usb); DATA_SET(linux_ioctl_handler_set, evdev_handler); struct handler_element { TAILQ_ENTRY(handler_element) list; int (*func)(struct thread *, struct linux_ioctl_args *); int low, high, span; }; static TAILQ_HEAD(, handler_element) handlers = TAILQ_HEAD_INITIALIZER(handlers); static struct sx linux_ioctl_sx; SX_SYSINIT(linux_ioctl, &linux_ioctl_sx, "Linux ioctl handlers"); /* * hdio related ioctls for VMWare support */ struct linux_hd_geometry { u_int8_t heads; u_int8_t sectors; u_int16_t cylinders; u_int32_t start; }; struct linux_hd_big_geometry { u_int8_t heads; u_int8_t sectors; u_int32_t cylinders; u_int32_t start; }; static int linux_ioctl_hdio(struct thread *td, struct linux_ioctl_args *args) { - cap_rights_t rights; struct file *fp; int error; u_int sectorsize, fwcylinders, fwheads, fwsectors; off_t mediasize, bytespercyl; - error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp); + error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_HDIO_GET_GEO: case LINUX_HDIO_GET_GEO_BIG: error = fo_ioctl(fp, DIOCGMEDIASIZE, (caddr_t)&mediasize, td->td_ucred, td); if (!error) error = fo_ioctl(fp, DIOCGSECTORSIZE, (caddr_t)§orsize, td->td_ucred, td); if (!error) error = fo_ioctl(fp, DIOCGFWHEADS, (caddr_t)&fwheads, td->td_ucred, td); if (!error) error = fo_ioctl(fp, DIOCGFWSECTORS, (caddr_t)&fwsectors, td->td_ucred, td); /* * XXX: DIOCGFIRSTOFFSET is not yet implemented, so * so pretend that GEOM always says 0. This is NOT VALID * for slices or partitions, only the per-disk raw devices. */ fdrop(fp, td); if (error) return (error); /* * 1. Calculate the number of bytes in a cylinder, * given the firmware's notion of heads and sectors * per cylinder. * 2. Calculate the number of cylinders, given the total * size of the media. * All internal calculations should have 64-bit precision. */ bytespercyl = (off_t) sectorsize * fwheads * fwsectors; fwcylinders = mediasize / bytespercyl; #if defined(DEBUG) linux_msg(td, "HDIO_GET_GEO: mediasize %jd, c/h/s %d/%d/%d, " "bpc %jd", (intmax_t)mediasize, fwcylinders, fwheads, fwsectors, (intmax_t)bytespercyl); #endif if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO) { struct linux_hd_geometry hdg; hdg.cylinders = fwcylinders; hdg.heads = fwheads; hdg.sectors = fwsectors; hdg.start = 0; error = copyout(&hdg, (void *)args->arg, sizeof(hdg)); } else if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO_BIG) { struct linux_hd_big_geometry hdbg; memset(&hdbg, 0, sizeof(hdbg)); hdbg.cylinders = fwcylinders; hdbg.heads = fwheads; hdbg.sectors = fwsectors; hdbg.start = 0; error = copyout(&hdbg, (void *)args->arg, sizeof(hdbg)); } return (error); break; default: /* XXX */ linux_msg(td, "ioctl fd=%d, cmd=0x%x ('%c',%d) is not implemented", args->fd, (int)(args->cmd & 0xffff), (int)(args->cmd & 0xff00) >> 8, (int)(args->cmd & 0xff)); break; } fdrop(fp, td); return (ENOIOCTL); } static int linux_ioctl_disk(struct thread *td, struct linux_ioctl_args *args) { - cap_rights_t rights; struct file *fp; int error; u_int sectorsize; off_t mediasize; - error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp); + error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_BLKGETSIZE: error = fo_ioctl(fp, DIOCGSECTORSIZE, (caddr_t)§orsize, td->td_ucred, td); if (!error) error = fo_ioctl(fp, DIOCGMEDIASIZE, (caddr_t)&mediasize, td->td_ucred, td); fdrop(fp, td); if (error) return (error); sectorsize = mediasize / sectorsize; /* * XXX: How do we know we return the right size of integer ? */ return (copyout(§orsize, (void *)args->arg, sizeof(sectorsize))); break; case LINUX_BLKSSZGET: error = fo_ioctl(fp, DIOCGSECTORSIZE, (caddr_t)§orsize, td->td_ucred, td); fdrop(fp, td); if (error) return (error); return (copyout(§orsize, (void *)args->arg, sizeof(sectorsize))); break; } fdrop(fp, td); return (ENOIOCTL); } /* * termio related ioctls */ struct linux_termio { unsigned short c_iflag; unsigned short c_oflag; unsigned short c_cflag; unsigned short c_lflag; unsigned char c_line; unsigned char c_cc[LINUX_NCC]; }; struct linux_termios { unsigned int c_iflag; unsigned int c_oflag; unsigned int c_cflag; unsigned int c_lflag; unsigned char c_line; unsigned char c_cc[LINUX_NCCS]; }; struct linux_winsize { unsigned short ws_row, ws_col; unsigned short ws_xpixel, ws_ypixel; }; struct speedtab { int sp_speed; /* Speed. */ int sp_code; /* Code. */ }; static struct speedtab sptab[] = { { B0, LINUX_B0 }, { B50, LINUX_B50 }, { B75, LINUX_B75 }, { B110, LINUX_B110 }, { B134, LINUX_B134 }, { B150, LINUX_B150 }, { B200, LINUX_B200 }, { B300, LINUX_B300 }, { B600, LINUX_B600 }, { B1200, LINUX_B1200 }, { B1800, LINUX_B1800 }, { B2400, LINUX_B2400 }, { B4800, LINUX_B4800 }, { B9600, LINUX_B9600 }, { B19200, LINUX_B19200 }, { B38400, LINUX_B38400 }, { B57600, LINUX_B57600 }, { B115200, LINUX_B115200 }, {-1, -1 } }; struct linux_serial_struct { int type; int line; int port; int irq; int flags; int xmit_fifo_size; int custom_divisor; int baud_base; unsigned short close_delay; char reserved_char[2]; int hub6; unsigned short closing_wait; unsigned short closing_wait2; int reserved[4]; }; static int linux_to_bsd_speed(int code, struct speedtab *table) { for ( ; table->sp_code != -1; table++) if (table->sp_code == code) return (table->sp_speed); return (-1); } static int bsd_to_linux_speed(int speed, struct speedtab *table) { for ( ; table->sp_speed != -1; table++) if (table->sp_speed == speed) return (table->sp_code); return (-1); } static void bsd_to_linux_termios(struct termios *bios, struct linux_termios *lios) { int i; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: BSD termios structure (input):\n"); printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n", bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag, bios->c_ispeed, bios->c_ospeed); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif lios->c_iflag = 0; if (bios->c_iflag & IGNBRK) lios->c_iflag |= LINUX_IGNBRK; if (bios->c_iflag & BRKINT) lios->c_iflag |= LINUX_BRKINT; if (bios->c_iflag & IGNPAR) lios->c_iflag |= LINUX_IGNPAR; if (bios->c_iflag & PARMRK) lios->c_iflag |= LINUX_PARMRK; if (bios->c_iflag & INPCK) lios->c_iflag |= LINUX_INPCK; if (bios->c_iflag & ISTRIP) lios->c_iflag |= LINUX_ISTRIP; if (bios->c_iflag & INLCR) lios->c_iflag |= LINUX_INLCR; if (bios->c_iflag & IGNCR) lios->c_iflag |= LINUX_IGNCR; if (bios->c_iflag & ICRNL) lios->c_iflag |= LINUX_ICRNL; if (bios->c_iflag & IXON) lios->c_iflag |= LINUX_IXON; if (bios->c_iflag & IXANY) lios->c_iflag |= LINUX_IXANY; if (bios->c_iflag & IXOFF) lios->c_iflag |= LINUX_IXOFF; if (bios->c_iflag & IMAXBEL) lios->c_iflag |= LINUX_IMAXBEL; lios->c_oflag = 0; if (bios->c_oflag & OPOST) lios->c_oflag |= LINUX_OPOST; if (bios->c_oflag & ONLCR) lios->c_oflag |= LINUX_ONLCR; if (bios->c_oflag & TAB3) lios->c_oflag |= LINUX_XTABS; lios->c_cflag = bsd_to_linux_speed(bios->c_ispeed, sptab); lios->c_cflag |= (bios->c_cflag & CSIZE) >> 4; if (bios->c_cflag & CSTOPB) lios->c_cflag |= LINUX_CSTOPB; if (bios->c_cflag & CREAD) lios->c_cflag |= LINUX_CREAD; if (bios->c_cflag & PARENB) lios->c_cflag |= LINUX_PARENB; if (bios->c_cflag & PARODD) lios->c_cflag |= LINUX_PARODD; if (bios->c_cflag & HUPCL) lios->c_cflag |= LINUX_HUPCL; if (bios->c_cflag & CLOCAL) lios->c_cflag |= LINUX_CLOCAL; if (bios->c_cflag & CRTSCTS) lios->c_cflag |= LINUX_CRTSCTS; lios->c_lflag = 0; if (bios->c_lflag & ISIG) lios->c_lflag |= LINUX_ISIG; if (bios->c_lflag & ICANON) lios->c_lflag |= LINUX_ICANON; if (bios->c_lflag & ECHO) lios->c_lflag |= LINUX_ECHO; if (bios->c_lflag & ECHOE) lios->c_lflag |= LINUX_ECHOE; if (bios->c_lflag & ECHOK) lios->c_lflag |= LINUX_ECHOK; if (bios->c_lflag & ECHONL) lios->c_lflag |= LINUX_ECHONL; if (bios->c_lflag & NOFLSH) lios->c_lflag |= LINUX_NOFLSH; if (bios->c_lflag & TOSTOP) lios->c_lflag |= LINUX_TOSTOP; if (bios->c_lflag & ECHOCTL) lios->c_lflag |= LINUX_ECHOCTL; if (bios->c_lflag & ECHOPRT) lios->c_lflag |= LINUX_ECHOPRT; if (bios->c_lflag & ECHOKE) lios->c_lflag |= LINUX_ECHOKE; if (bios->c_lflag & FLUSHO) lios->c_lflag |= LINUX_FLUSHO; if (bios->c_lflag & PENDIN) lios->c_lflag |= LINUX_PENDIN; if (bios->c_lflag & IEXTEN) lios->c_lflag |= LINUX_IEXTEN; for (i=0; ic_cc[i] = LINUX_POSIX_VDISABLE; lios->c_cc[LINUX_VINTR] = bios->c_cc[VINTR]; lios->c_cc[LINUX_VQUIT] = bios->c_cc[VQUIT]; lios->c_cc[LINUX_VERASE] = bios->c_cc[VERASE]; lios->c_cc[LINUX_VKILL] = bios->c_cc[VKILL]; lios->c_cc[LINUX_VEOF] = bios->c_cc[VEOF]; lios->c_cc[LINUX_VEOL] = bios->c_cc[VEOL]; lios->c_cc[LINUX_VMIN] = bios->c_cc[VMIN]; lios->c_cc[LINUX_VTIME] = bios->c_cc[VTIME]; lios->c_cc[LINUX_VEOL2] = bios->c_cc[VEOL2]; lios->c_cc[LINUX_VSUSP] = bios->c_cc[VSUSP]; lios->c_cc[LINUX_VSTART] = bios->c_cc[VSTART]; lios->c_cc[LINUX_VSTOP] = bios->c_cc[VSTOP]; lios->c_cc[LINUX_VREPRINT] = bios->c_cc[VREPRINT]; lios->c_cc[LINUX_VDISCARD] = bios->c_cc[VDISCARD]; lios->c_cc[LINUX_VWERASE] = bios->c_cc[VWERASE]; lios->c_cc[LINUX_VLNEXT] = bios->c_cc[VLNEXT]; for (i=0; ic_cc[i] == _POSIX_VDISABLE) lios->c_cc[i] = LINUX_POSIX_VDISABLE; } lios->c_line = 0; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: LINUX termios structure (output):\n"); printf("i=%08x o=%08x c=%08x l=%08x line=%d\n", lios->c_iflag, lios->c_oflag, lios->c_cflag, lios->c_lflag, (int)lios->c_line); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif } static void linux_to_bsd_termios(struct linux_termios *lios, struct termios *bios) { int i; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: LINUX termios structure (input):\n"); printf("i=%08x o=%08x c=%08x l=%08x line=%d\n", lios->c_iflag, lios->c_oflag, lios->c_cflag, lios->c_lflag, (int)lios->c_line); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif bios->c_iflag = 0; if (lios->c_iflag & LINUX_IGNBRK) bios->c_iflag |= IGNBRK; if (lios->c_iflag & LINUX_BRKINT) bios->c_iflag |= BRKINT; if (lios->c_iflag & LINUX_IGNPAR) bios->c_iflag |= IGNPAR; if (lios->c_iflag & LINUX_PARMRK) bios->c_iflag |= PARMRK; if (lios->c_iflag & LINUX_INPCK) bios->c_iflag |= INPCK; if (lios->c_iflag & LINUX_ISTRIP) bios->c_iflag |= ISTRIP; if (lios->c_iflag & LINUX_INLCR) bios->c_iflag |= INLCR; if (lios->c_iflag & LINUX_IGNCR) bios->c_iflag |= IGNCR; if (lios->c_iflag & LINUX_ICRNL) bios->c_iflag |= ICRNL; if (lios->c_iflag & LINUX_IXON) bios->c_iflag |= IXON; if (lios->c_iflag & LINUX_IXANY) bios->c_iflag |= IXANY; if (lios->c_iflag & LINUX_IXOFF) bios->c_iflag |= IXOFF; if (lios->c_iflag & LINUX_IMAXBEL) bios->c_iflag |= IMAXBEL; bios->c_oflag = 0; if (lios->c_oflag & LINUX_OPOST) bios->c_oflag |= OPOST; if (lios->c_oflag & LINUX_ONLCR) bios->c_oflag |= ONLCR; if (lios->c_oflag & LINUX_XTABS) bios->c_oflag |= TAB3; bios->c_cflag = (lios->c_cflag & LINUX_CSIZE) << 4; if (lios->c_cflag & LINUX_CSTOPB) bios->c_cflag |= CSTOPB; if (lios->c_cflag & LINUX_CREAD) bios->c_cflag |= CREAD; if (lios->c_cflag & LINUX_PARENB) bios->c_cflag |= PARENB; if (lios->c_cflag & LINUX_PARODD) bios->c_cflag |= PARODD; if (lios->c_cflag & LINUX_HUPCL) bios->c_cflag |= HUPCL; if (lios->c_cflag & LINUX_CLOCAL) bios->c_cflag |= CLOCAL; if (lios->c_cflag & LINUX_CRTSCTS) bios->c_cflag |= CRTSCTS; bios->c_lflag = 0; if (lios->c_lflag & LINUX_ISIG) bios->c_lflag |= ISIG; if (lios->c_lflag & LINUX_ICANON) bios->c_lflag |= ICANON; if (lios->c_lflag & LINUX_ECHO) bios->c_lflag |= ECHO; if (lios->c_lflag & LINUX_ECHOE) bios->c_lflag |= ECHOE; if (lios->c_lflag & LINUX_ECHOK) bios->c_lflag |= ECHOK; if (lios->c_lflag & LINUX_ECHONL) bios->c_lflag |= ECHONL; if (lios->c_lflag & LINUX_NOFLSH) bios->c_lflag |= NOFLSH; if (lios->c_lflag & LINUX_TOSTOP) bios->c_lflag |= TOSTOP; if (lios->c_lflag & LINUX_ECHOCTL) bios->c_lflag |= ECHOCTL; if (lios->c_lflag & LINUX_ECHOPRT) bios->c_lflag |= ECHOPRT; if (lios->c_lflag & LINUX_ECHOKE) bios->c_lflag |= ECHOKE; if (lios->c_lflag & LINUX_FLUSHO) bios->c_lflag |= FLUSHO; if (lios->c_lflag & LINUX_PENDIN) bios->c_lflag |= PENDIN; if (lios->c_lflag & LINUX_IEXTEN) bios->c_lflag |= IEXTEN; for (i=0; ic_cc[i] = _POSIX_VDISABLE; bios->c_cc[VINTR] = lios->c_cc[LINUX_VINTR]; bios->c_cc[VQUIT] = lios->c_cc[LINUX_VQUIT]; bios->c_cc[VERASE] = lios->c_cc[LINUX_VERASE]; bios->c_cc[VKILL] = lios->c_cc[LINUX_VKILL]; bios->c_cc[VEOF] = lios->c_cc[LINUX_VEOF]; bios->c_cc[VEOL] = lios->c_cc[LINUX_VEOL]; bios->c_cc[VMIN] = lios->c_cc[LINUX_VMIN]; bios->c_cc[VTIME] = lios->c_cc[LINUX_VTIME]; bios->c_cc[VEOL2] = lios->c_cc[LINUX_VEOL2]; bios->c_cc[VSUSP] = lios->c_cc[LINUX_VSUSP]; bios->c_cc[VSTART] = lios->c_cc[LINUX_VSTART]; bios->c_cc[VSTOP] = lios->c_cc[LINUX_VSTOP]; bios->c_cc[VREPRINT] = lios->c_cc[LINUX_VREPRINT]; bios->c_cc[VDISCARD] = lios->c_cc[LINUX_VDISCARD]; bios->c_cc[VWERASE] = lios->c_cc[LINUX_VWERASE]; bios->c_cc[VLNEXT] = lios->c_cc[LINUX_VLNEXT]; for (i=0; ic_cc[i] == LINUX_POSIX_VDISABLE) bios->c_cc[i] = _POSIX_VDISABLE; } bios->c_ispeed = bios->c_ospeed = linux_to_bsd_speed(lios->c_cflag & LINUX_CBAUD, sptab); #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: BSD termios structure (output):\n"); printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n", bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag, bios->c_ispeed, bios->c_ospeed); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif } static void bsd_to_linux_termio(struct termios *bios, struct linux_termio *lio) { struct linux_termios lios; bsd_to_linux_termios(bios, &lios); lio->c_iflag = lios.c_iflag; lio->c_oflag = lios.c_oflag; lio->c_cflag = lios.c_cflag; lio->c_lflag = lios.c_lflag; lio->c_line = lios.c_line; memcpy(lio->c_cc, lios.c_cc, LINUX_NCC); } static void linux_to_bsd_termio(struct linux_termio *lio, struct termios *bios) { struct linux_termios lios; int i; lios.c_iflag = lio->c_iflag; lios.c_oflag = lio->c_oflag; lios.c_cflag = lio->c_cflag; lios.c_lflag = lio->c_lflag; for (i=LINUX_NCC; ic_cc, LINUX_NCC); linux_to_bsd_termios(&lios, bios); } static int linux_ioctl_termio(struct thread *td, struct linux_ioctl_args *args) { struct termios bios; struct linux_termios lios; struct linux_termio lio; - cap_rights_t rights; struct file *fp; int error; - error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp); + error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_TCGETS: error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred, td); if (error) break; bsd_to_linux_termios(&bios, &lios); error = copyout(&lios, (void *)args->arg, sizeof(lios)); break; case LINUX_TCSETS: error = copyin((void *)args->arg, &lios, sizeof(lios)); if (error) break; linux_to_bsd_termios(&lios, &bios); error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred, td)); break; case LINUX_TCSETSW: error = copyin((void *)args->arg, &lios, sizeof(lios)); if (error) break; linux_to_bsd_termios(&lios, &bios); error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred, td)); break; case LINUX_TCSETSF: error = copyin((void *)args->arg, &lios, sizeof(lios)); if (error) break; linux_to_bsd_termios(&lios, &bios); error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred, td)); break; case LINUX_TCGETA: error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred, td); if (error) break; bsd_to_linux_termio(&bios, &lio); error = (copyout(&lio, (void *)args->arg, sizeof(lio))); break; case LINUX_TCSETA: error = copyin((void *)args->arg, &lio, sizeof(lio)); if (error) break; linux_to_bsd_termio(&lio, &bios); error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred, td)); break; case LINUX_TCSETAW: error = copyin((void *)args->arg, &lio, sizeof(lio)); if (error) break; linux_to_bsd_termio(&lio, &bios); error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred, td)); break; case LINUX_TCSETAF: error = copyin((void *)args->arg, &lio, sizeof(lio)); if (error) break; linux_to_bsd_termio(&lio, &bios); error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred, td)); break; /* LINUX_TCSBRK */ case LINUX_TCXONC: { switch (args->arg) { case LINUX_TCOOFF: args->cmd = TIOCSTOP; break; case LINUX_TCOON: args->cmd = TIOCSTART; break; case LINUX_TCIOFF: case LINUX_TCION: { int c; struct write_args wr; error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred, td); if (error) break; fdrop(fp, td); c = (args->arg == LINUX_TCIOFF) ? VSTOP : VSTART; c = bios.c_cc[c]; if (c != _POSIX_VDISABLE) { wr.fd = args->fd; wr.buf = &c; wr.nbyte = sizeof(c); return (sys_write(td, &wr)); } else return (0); } default: fdrop(fp, td); return (EINVAL); } args->arg = 0; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; } case LINUX_TCFLSH: { int val; switch (args->arg) { case LINUX_TCIFLUSH: val = FREAD; break; case LINUX_TCOFLUSH: val = FWRITE; break; case LINUX_TCIOFLUSH: val = FREAD | FWRITE; break; default: fdrop(fp, td); return (EINVAL); } error = (fo_ioctl(fp,TIOCFLUSH,(caddr_t)&val,td->td_ucred,td)); break; } case LINUX_TIOCEXCL: args->cmd = TIOCEXCL; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCNXCL: args->cmd = TIOCNXCL; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCSCTTY: args->cmd = TIOCSCTTY; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCGPGRP: args->cmd = TIOCGPGRP; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCSPGRP: args->cmd = TIOCSPGRP; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* LINUX_TIOCOUTQ */ /* LINUX_TIOCSTI */ case LINUX_TIOCGWINSZ: args->cmd = TIOCGWINSZ; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCSWINSZ: args->cmd = TIOCSWINSZ; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCMGET: args->cmd = TIOCMGET; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCMBIS: args->cmd = TIOCMBIS; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCMBIC: args->cmd = TIOCMBIC; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCMSET: args->cmd = TIOCMSET; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* TIOCGSOFTCAR */ /* TIOCSSOFTCAR */ case LINUX_FIONREAD: /* LINUX_TIOCINQ */ args->cmd = FIONREAD; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* LINUX_TIOCLINUX */ case LINUX_TIOCCONS: args->cmd = TIOCCONS; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCGSERIAL: { struct linux_serial_struct lss; bzero(&lss, sizeof(lss)); lss.type = LINUX_PORT_16550A; lss.flags = 0; lss.close_delay = 0; error = copyout(&lss, (void *)args->arg, sizeof(lss)); break; } case LINUX_TIOCSSERIAL: { struct linux_serial_struct lss; error = copyin((void *)args->arg, &lss, sizeof(lss)); if (error) break; /* XXX - It really helps to have an implementation that * does nothing. NOT! */ error = 0; break; } case LINUX_TIOCPKT: args->cmd = TIOCPKT; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_FIONBIO: args->cmd = FIONBIO; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCNOTTY: args->cmd = TIOCNOTTY; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCSETD: { int line; switch (args->arg) { case LINUX_N_TTY: line = TTYDISC; break; case LINUX_N_SLIP: line = SLIPDISC; break; case LINUX_N_PPP: line = PPPDISC; break; default: fdrop(fp, td); return (EINVAL); } error = (fo_ioctl(fp, TIOCSETD, (caddr_t)&line, td->td_ucred, td)); break; } case LINUX_TIOCGETD: { int linux_line; int bsd_line = TTYDISC; error = fo_ioctl(fp, TIOCGETD, (caddr_t)&bsd_line, td->td_ucred, td); if (error) break; switch (bsd_line) { case TTYDISC: linux_line = LINUX_N_TTY; break; case SLIPDISC: linux_line = LINUX_N_SLIP; break; case PPPDISC: linux_line = LINUX_N_PPP; break; default: fdrop(fp, td); return (EINVAL); } error = (copyout(&linux_line, (void *)args->arg, sizeof(int))); break; } /* LINUX_TCSBRKP */ /* LINUX_TIOCTTYGSTRUCT */ case LINUX_FIONCLEX: args->cmd = FIONCLEX; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_FIOCLEX: args->cmd = FIOCLEX; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_FIOASYNC: args->cmd = FIOASYNC; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* LINUX_TIOCSERCONFIG */ /* LINUX_TIOCSERGWILD */ /* LINUX_TIOCSERSWILD */ /* LINUX_TIOCGLCKTRMIOS */ /* LINUX_TIOCSLCKTRMIOS */ case LINUX_TIOCSBRK: args->cmd = TIOCSBRK; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCCBRK: args->cmd = TIOCCBRK; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_TIOCGPTN: { int nb; error = fo_ioctl(fp, TIOCGPTN, (caddr_t)&nb, td->td_ucred, td); if (!error) error = copyout(&nb, (void *)args->arg, sizeof(int)); break; } case LINUX_TIOCSPTLCK: /* Our unlockpt() does nothing. */ error = 0; break; default: error = ENOIOCTL; break; } fdrop(fp, td); return (error); } /* * CDROM related ioctls */ struct linux_cdrom_msf { u_char cdmsf_min0; u_char cdmsf_sec0; u_char cdmsf_frame0; u_char cdmsf_min1; u_char cdmsf_sec1; u_char cdmsf_frame1; }; struct linux_cdrom_tochdr { u_char cdth_trk0; u_char cdth_trk1; }; union linux_cdrom_addr { struct { u_char minute; u_char second; u_char frame; } msf; int lba; }; struct linux_cdrom_tocentry { u_char cdte_track; u_char cdte_adr:4; u_char cdte_ctrl:4; u_char cdte_format; union linux_cdrom_addr cdte_addr; u_char cdte_datamode; }; struct linux_cdrom_subchnl { u_char cdsc_format; u_char cdsc_audiostatus; u_char cdsc_adr:4; u_char cdsc_ctrl:4; u_char cdsc_trk; u_char cdsc_ind; union linux_cdrom_addr cdsc_absaddr; union linux_cdrom_addr cdsc_reladdr; }; struct l_cdrom_read_audio { union linux_cdrom_addr addr; u_char addr_format; l_int nframes; u_char *buf; }; struct l_dvd_layer { u_char book_version:4; u_char book_type:4; u_char min_rate:4; u_char disc_size:4; u_char layer_type:4; u_char track_path:1; u_char nlayers:2; u_char track_density:4; u_char linear_density:4; u_char bca:1; u_int32_t start_sector; u_int32_t end_sector; u_int32_t end_sector_l0; }; struct l_dvd_physical { u_char type; u_char layer_num; struct l_dvd_layer layer[4]; }; struct l_dvd_copyright { u_char type; u_char layer_num; u_char cpst; u_char rmi; }; struct l_dvd_disckey { u_char type; l_uint agid:2; u_char value[2048]; }; struct l_dvd_bca { u_char type; l_int len; u_char value[188]; }; struct l_dvd_manufact { u_char type; u_char layer_num; l_int len; u_char value[2048]; }; typedef union { u_char type; struct l_dvd_physical physical; struct l_dvd_copyright copyright; struct l_dvd_disckey disckey; struct l_dvd_bca bca; struct l_dvd_manufact manufact; } l_dvd_struct; typedef u_char l_dvd_key[5]; typedef u_char l_dvd_challenge[10]; struct l_dvd_lu_send_agid { u_char type; l_uint agid:2; }; struct l_dvd_host_send_challenge { u_char type; l_uint agid:2; l_dvd_challenge chal; }; struct l_dvd_send_key { u_char type; l_uint agid:2; l_dvd_key key; }; struct l_dvd_lu_send_challenge { u_char type; l_uint agid:2; l_dvd_challenge chal; }; struct l_dvd_lu_send_title_key { u_char type; l_uint agid:2; l_dvd_key title_key; l_int lba; l_uint cpm:1; l_uint cp_sec:1; l_uint cgms:2; }; struct l_dvd_lu_send_asf { u_char type; l_uint agid:2; l_uint asf:1; }; struct l_dvd_host_send_rpcstate { u_char type; u_char pdrc; }; struct l_dvd_lu_send_rpcstate { u_char type:2; u_char vra:3; u_char ucca:3; u_char region_mask; u_char rpc_scheme; }; typedef union { u_char type; struct l_dvd_lu_send_agid lsa; struct l_dvd_host_send_challenge hsc; struct l_dvd_send_key lsk; struct l_dvd_lu_send_challenge lsc; struct l_dvd_send_key hsk; struct l_dvd_lu_send_title_key lstk; struct l_dvd_lu_send_asf lsasf; struct l_dvd_host_send_rpcstate hrpcs; struct l_dvd_lu_send_rpcstate lrpcs; } l_dvd_authinfo; static void bsd_to_linux_msf_lba(u_char af, union msf_lba *bp, union linux_cdrom_addr *lp) { if (af == CD_LBA_FORMAT) lp->lba = bp->lba; else { lp->msf.minute = bp->msf.minute; lp->msf.second = bp->msf.second; lp->msf.frame = bp->msf.frame; } } static void set_linux_cdrom_addr(union linux_cdrom_addr *addr, int format, int lba) { if (format == LINUX_CDROM_MSF) { addr->msf.frame = lba % 75; lba /= 75; lba += 2; addr->msf.second = lba % 60; addr->msf.minute = lba / 60; } else addr->lba = lba; } static int linux_to_bsd_dvd_struct(l_dvd_struct *lp, struct dvd_struct *bp) { bp->format = lp->type; switch (bp->format) { case DVD_STRUCT_PHYSICAL: if (bp->layer_num >= 4) return (EINVAL); bp->layer_num = lp->physical.layer_num; break; case DVD_STRUCT_COPYRIGHT: bp->layer_num = lp->copyright.layer_num; break; case DVD_STRUCT_DISCKEY: bp->agid = lp->disckey.agid; break; case DVD_STRUCT_BCA: case DVD_STRUCT_MANUFACT: break; default: return (EINVAL); } return (0); } static int bsd_to_linux_dvd_struct(struct dvd_struct *bp, l_dvd_struct *lp) { switch (bp->format) { case DVD_STRUCT_PHYSICAL: { struct dvd_layer *blp = (struct dvd_layer *)bp->data; struct l_dvd_layer *llp = &lp->physical.layer[bp->layer_num]; memset(llp, 0, sizeof(*llp)); llp->book_version = blp->book_version; llp->book_type = blp->book_type; llp->min_rate = blp->max_rate; llp->disc_size = blp->disc_size; llp->layer_type = blp->layer_type; llp->track_path = blp->track_path; llp->nlayers = blp->nlayers; llp->track_density = blp->track_density; llp->linear_density = blp->linear_density; llp->bca = blp->bca; llp->start_sector = blp->start_sector; llp->end_sector = blp->end_sector; llp->end_sector_l0 = blp->end_sector_l0; break; } case DVD_STRUCT_COPYRIGHT: lp->copyright.cpst = bp->cpst; lp->copyright.rmi = bp->rmi; break; case DVD_STRUCT_DISCKEY: memcpy(lp->disckey.value, bp->data, sizeof(lp->disckey.value)); break; case DVD_STRUCT_BCA: lp->bca.len = bp->length; memcpy(lp->bca.value, bp->data, sizeof(lp->bca.value)); break; case DVD_STRUCT_MANUFACT: lp->manufact.len = bp->length; memcpy(lp->manufact.value, bp->data, sizeof(lp->manufact.value)); /* lp->manufact.layer_num is unused in Linux (redhat 7.0). */ break; default: return (EINVAL); } return (0); } static int linux_to_bsd_dvd_authinfo(l_dvd_authinfo *lp, int *bcode, struct dvd_authinfo *bp) { switch (lp->type) { case LINUX_DVD_LU_SEND_AGID: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_AGID; bp->agid = lp->lsa.agid; break; case LINUX_DVD_HOST_SEND_CHALLENGE: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_CHALLENGE; bp->agid = lp->hsc.agid; memcpy(bp->keychal, lp->hsc.chal, 10); break; case LINUX_DVD_LU_SEND_KEY1: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_KEY1; bp->agid = lp->lsk.agid; break; case LINUX_DVD_LU_SEND_CHALLENGE: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_CHALLENGE; bp->agid = lp->lsc.agid; break; case LINUX_DVD_HOST_SEND_KEY2: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_KEY2; bp->agid = lp->hsk.agid; memcpy(bp->keychal, lp->hsk.key, 5); break; case LINUX_DVD_LU_SEND_TITLE_KEY: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_TITLE_KEY; bp->agid = lp->lstk.agid; bp->lba = lp->lstk.lba; break; case LINUX_DVD_LU_SEND_ASF: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_ASF; bp->agid = lp->lsasf.agid; break; case LINUX_DVD_INVALIDATE_AGID: *bcode = DVDIOCREPORTKEY; bp->format = DVD_INVALIDATE_AGID; bp->agid = lp->lsa.agid; break; case LINUX_DVD_LU_SEND_RPC_STATE: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_RPC; break; case LINUX_DVD_HOST_SEND_RPC_STATE: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_RPC; bp->region = lp->hrpcs.pdrc; break; default: return (EINVAL); } return (0); } static int bsd_to_linux_dvd_authinfo(struct dvd_authinfo *bp, l_dvd_authinfo *lp) { switch (lp->type) { case LINUX_DVD_LU_SEND_AGID: lp->lsa.agid = bp->agid; break; case LINUX_DVD_HOST_SEND_CHALLENGE: lp->type = LINUX_DVD_LU_SEND_KEY1; break; case LINUX_DVD_LU_SEND_KEY1: memcpy(lp->lsk.key, bp->keychal, sizeof(lp->lsk.key)); break; case LINUX_DVD_LU_SEND_CHALLENGE: memcpy(lp->lsc.chal, bp->keychal, sizeof(lp->lsc.chal)); break; case LINUX_DVD_HOST_SEND_KEY2: lp->type = LINUX_DVD_AUTH_ESTABLISHED; break; case LINUX_DVD_LU_SEND_TITLE_KEY: memcpy(lp->lstk.title_key, bp->keychal, sizeof(lp->lstk.title_key)); lp->lstk.cpm = bp->cpm; lp->lstk.cp_sec = bp->cp_sec; lp->lstk.cgms = bp->cgms; break; case LINUX_DVD_LU_SEND_ASF: lp->lsasf.asf = bp->asf; break; case LINUX_DVD_INVALIDATE_AGID: break; case LINUX_DVD_LU_SEND_RPC_STATE: lp->lrpcs.type = bp->reg_type; lp->lrpcs.vra = bp->vend_rsts; lp->lrpcs.ucca = bp->user_rsts; lp->lrpcs.region_mask = bp->region; lp->lrpcs.rpc_scheme = bp->rpc_scheme; break; case LINUX_DVD_HOST_SEND_RPC_STATE: break; default: return (EINVAL); } return (0); } static int linux_ioctl_cdrom(struct thread *td, struct linux_ioctl_args *args) { - cap_rights_t rights; struct file *fp; int error; - error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp); + error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_CDROMPAUSE: args->cmd = CDIOCPAUSE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMRESUME: args->cmd = CDIOCRESUME; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMPLAYMSF: args->cmd = CDIOCPLAYMSF; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMPLAYTRKIND: args->cmd = CDIOCPLAYTRACKS; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMREADTOCHDR: { struct ioc_toc_header th; struct linux_cdrom_tochdr lth; error = fo_ioctl(fp, CDIOREADTOCHEADER, (caddr_t)&th, td->td_ucred, td); if (!error) { lth.cdth_trk0 = th.starting_track; lth.cdth_trk1 = th.ending_track; copyout(<h, (void *)args->arg, sizeof(lth)); } break; } case LINUX_CDROMREADTOCENTRY: { struct linux_cdrom_tocentry lte; struct ioc_read_toc_single_entry irtse; error = copyin((void *)args->arg, <e, sizeof(lte)); if (error) break; irtse.address_format = lte.cdte_format; irtse.track = lte.cdte_track; error = fo_ioctl(fp, CDIOREADTOCENTRY, (caddr_t)&irtse, td->td_ucred, td); if (!error) { lte.cdte_ctrl = irtse.entry.control; lte.cdte_adr = irtse.entry.addr_type; bsd_to_linux_msf_lba(irtse.address_format, &irtse.entry.addr, <e.cdte_addr); error = copyout(<e, (void *)args->arg, sizeof(lte)); } break; } case LINUX_CDROMSTOP: args->cmd = CDIOCSTOP; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMSTART: args->cmd = CDIOCSTART; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_CDROMEJECT: args->cmd = CDIOCEJECT; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* LINUX_CDROMVOLCTRL */ case LINUX_CDROMSUBCHNL: { struct linux_cdrom_subchnl sc; struct ioc_read_subchannel bsdsc; struct cd_sub_channel_info bsdinfo; bsdsc.address_format = CD_LBA_FORMAT; bsdsc.data_format = CD_CURRENT_POSITION; bsdsc.track = 0; bsdsc.data_len = sizeof(bsdinfo); bsdsc.data = &bsdinfo; error = fo_ioctl(fp, CDIOCREADSUBCHANNEL_SYSSPACE, (caddr_t)&bsdsc, td->td_ucred, td); if (error) break; error = copyin((void *)args->arg, &sc, sizeof(sc)); if (error) break; sc.cdsc_audiostatus = bsdinfo.header.audio_status; sc.cdsc_adr = bsdinfo.what.position.addr_type; sc.cdsc_ctrl = bsdinfo.what.position.control; sc.cdsc_trk = bsdinfo.what.position.track_number; sc.cdsc_ind = bsdinfo.what.position.index_number; set_linux_cdrom_addr(&sc.cdsc_absaddr, sc.cdsc_format, bsdinfo.what.position.absaddr.lba); set_linux_cdrom_addr(&sc.cdsc_reladdr, sc.cdsc_format, bsdinfo.what.position.reladdr.lba); error = copyout(&sc, (void *)args->arg, sizeof(sc)); break; } /* LINUX_CDROMREADMODE2 */ /* LINUX_CDROMREADMODE1 */ /* LINUX_CDROMREADAUDIO */ /* LINUX_CDROMEJECT_SW */ /* LINUX_CDROMMULTISESSION */ /* LINUX_CDROM_GET_UPC */ case LINUX_CDROMRESET: args->cmd = CDIOCRESET; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; /* LINUX_CDROMVOLREAD */ /* LINUX_CDROMREADRAW */ /* LINUX_CDROMREADCOOKED */ /* LINUX_CDROMSEEK */ /* LINUX_CDROMPLAYBLK */ /* LINUX_CDROMREADALL */ /* LINUX_CDROMCLOSETRAY */ /* LINUX_CDROMLOADFROMSLOT */ /* LINUX_CDROMGETSPINDOWN */ /* LINUX_CDROMSETSPINDOWN */ /* LINUX_CDROM_SET_OPTIONS */ /* LINUX_CDROM_CLEAR_OPTIONS */ /* LINUX_CDROM_SELECT_SPEED */ /* LINUX_CDROM_SELECT_DISC */ /* LINUX_CDROM_MEDIA_CHANGED */ /* LINUX_CDROM_DRIVE_STATUS */ /* LINUX_CDROM_DISC_STATUS */ /* LINUX_CDROM_CHANGER_NSLOTS */ /* LINUX_CDROM_LOCKDOOR */ /* LINUX_CDROM_DEBUG */ /* LINUX_CDROM_GET_CAPABILITY */ /* LINUX_CDROMAUDIOBUFSIZ */ case LINUX_DVD_READ_STRUCT: { l_dvd_struct *lds; struct dvd_struct *bds; lds = malloc(sizeof(*lds), M_LINUX, M_WAITOK); bds = malloc(sizeof(*bds), M_LINUX, M_WAITOK); error = copyin((void *)args->arg, lds, sizeof(*lds)); if (error) goto out; error = linux_to_bsd_dvd_struct(lds, bds); if (error) goto out; error = fo_ioctl(fp, DVDIOCREADSTRUCTURE, (caddr_t)bds, td->td_ucred, td); if (error) goto out; error = bsd_to_linux_dvd_struct(bds, lds); if (error) goto out; error = copyout(lds, (void *)args->arg, sizeof(*lds)); out: free(bds, M_LINUX); free(lds, M_LINUX); break; } /* LINUX_DVD_WRITE_STRUCT */ case LINUX_DVD_AUTH: { l_dvd_authinfo lda; struct dvd_authinfo bda; int bcode; error = copyin((void *)args->arg, &lda, sizeof(lda)); if (error) break; error = linux_to_bsd_dvd_authinfo(&lda, &bcode, &bda); if (error) break; error = fo_ioctl(fp, bcode, (caddr_t)&bda, td->td_ucred, td); if (error) { if (lda.type == LINUX_DVD_HOST_SEND_KEY2) { lda.type = LINUX_DVD_AUTH_FAILURE; copyout(&lda, (void *)args->arg, sizeof(lda)); } break; } error = bsd_to_linux_dvd_authinfo(&bda, &lda); if (error) break; error = copyout(&lda, (void *)args->arg, sizeof(lda)); break; } case LINUX_SCSI_GET_BUS_NUMBER: { struct sg_scsi_id id; error = fo_ioctl(fp, SG_GET_SCSI_ID, (caddr_t)&id, td->td_ucred, td); if (error) break; error = copyout(&id.channel, (void *)args->arg, sizeof(int)); break; } case LINUX_SCSI_GET_IDLUN: { struct sg_scsi_id id; struct scsi_idlun idl; error = fo_ioctl(fp, SG_GET_SCSI_ID, (caddr_t)&id, td->td_ucred, td); if (error) break; idl.dev_id = (id.scsi_id & 0xff) + ((id.lun & 0xff) << 8) + ((id.channel & 0xff) << 16) + ((id.host_no & 0xff) << 24); idl.host_unique_id = id.host_no; error = copyout(&idl, (void *)args->arg, sizeof(idl)); break; } /* LINUX_CDROM_SEND_PACKET */ /* LINUX_CDROM_NEXT_WRITABLE */ /* LINUX_CDROM_LAST_WRITTEN */ default: error = ENOIOCTL; break; } fdrop(fp, td); return (error); } static int linux_ioctl_vfat(struct thread *td, struct linux_ioctl_args *args) { return (ENOTTY); } /* * Sound related ioctls */ struct linux_old_mixer_info { char id[16]; char name[32]; }; static u_int32_t dirbits[4] = { IOC_VOID, IOC_IN, IOC_OUT, IOC_INOUT }; #define SETDIR(c) (((c) & ~IOC_DIRMASK) | dirbits[args->cmd >> 30]) static int linux_ioctl_sound(struct thread *td, struct linux_ioctl_args *args) { switch (args->cmd & 0xffff) { case LINUX_SOUND_MIXER_WRITE_VOLUME: args->cmd = SETDIR(SOUND_MIXER_WRITE_VOLUME); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_BASS: args->cmd = SETDIR(SOUND_MIXER_WRITE_BASS); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_TREBLE: args->cmd = SETDIR(SOUND_MIXER_WRITE_TREBLE); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_SYNTH: args->cmd = SETDIR(SOUND_MIXER_WRITE_SYNTH); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_PCM: args->cmd = SETDIR(SOUND_MIXER_WRITE_PCM); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_SPEAKER: args->cmd = SETDIR(SOUND_MIXER_WRITE_SPEAKER); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_MIC: args->cmd = SETDIR(SOUND_MIXER_WRITE_MIC); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_CD: args->cmd = SETDIR(SOUND_MIXER_WRITE_CD); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_IMIX: args->cmd = SETDIR(SOUND_MIXER_WRITE_IMIX); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_ALTPCM: args->cmd = SETDIR(SOUND_MIXER_WRITE_ALTPCM); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_RECLEV: args->cmd = SETDIR(SOUND_MIXER_WRITE_RECLEV); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_IGAIN: args->cmd = SETDIR(SOUND_MIXER_WRITE_IGAIN); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_OGAIN: args->cmd = SETDIR(SOUND_MIXER_WRITE_OGAIN); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE1: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE1); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE2: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE2); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE3: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE3); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_INFO: { /* Key on encoded length */ switch ((args->cmd >> 16) & 0x1fff) { case 0x005c: { /* SOUND_MIXER_INFO */ args->cmd = SOUND_MIXER_INFO; return (sys_ioctl(td, (struct ioctl_args *)args)); } case 0x0030: { /* SOUND_OLD_MIXER_INFO */ struct linux_old_mixer_info info; bzero(&info, sizeof(info)); strncpy(info.id, "OSS", sizeof(info.id) - 1); strncpy(info.name, "FreeBSD OSS Mixer", sizeof(info.name) - 1); copyout(&info, (void *)args->arg, sizeof(info)); return (0); } default: return (ENOIOCTL); } break; } case LINUX_OSS_GETVERSION: { int version = linux_get_oss_version(td); return (copyout(&version, (void *)args->arg, sizeof(int))); } case LINUX_SOUND_MIXER_READ_STEREODEVS: args->cmd = SOUND_MIXER_READ_STEREODEVS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_READ_CAPS: args->cmd = SOUND_MIXER_READ_CAPS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_READ_RECMASK: args->cmd = SOUND_MIXER_READ_RECMASK; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_READ_DEVMASK: args->cmd = SOUND_MIXER_READ_DEVMASK; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_RECSRC: args->cmd = SETDIR(SOUND_MIXER_WRITE_RECSRC); return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_RESET: args->cmd = SNDCTL_DSP_RESET; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SYNC: args->cmd = SNDCTL_DSP_SYNC; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SPEED: args->cmd = SNDCTL_DSP_SPEED; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_STEREO: args->cmd = SNDCTL_DSP_STEREO; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETBLKSIZE: /* LINUX_SNDCTL_DSP_SETBLKSIZE */ args->cmd = SNDCTL_DSP_GETBLKSIZE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETFMT: args->cmd = SNDCTL_DSP_SETFMT; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_PCM_WRITE_CHANNELS: args->cmd = SOUND_PCM_WRITE_CHANNELS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_PCM_WRITE_FILTER: args->cmd = SOUND_PCM_WRITE_FILTER; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_POST: args->cmd = SNDCTL_DSP_POST; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SUBDIVIDE: args->cmd = SNDCTL_DSP_SUBDIVIDE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETFRAGMENT: args->cmd = SNDCTL_DSP_SETFRAGMENT; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETFMTS: args->cmd = SNDCTL_DSP_GETFMTS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETOSPACE: args->cmd = SNDCTL_DSP_GETOSPACE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETISPACE: args->cmd = SNDCTL_DSP_GETISPACE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_NONBLOCK: args->cmd = SNDCTL_DSP_NONBLOCK; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETCAPS: args->cmd = SNDCTL_DSP_GETCAPS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETTRIGGER: /* LINUX_SNDCTL_GETTRIGGER */ args->cmd = SNDCTL_DSP_SETTRIGGER; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETIPTR: args->cmd = SNDCTL_DSP_GETIPTR; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETOPTR: args->cmd = SNDCTL_DSP_GETOPTR; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETDUPLEX: args->cmd = SNDCTL_DSP_SETDUPLEX; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETODELAY: args->cmd = SNDCTL_DSP_GETODELAY; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_RESET: args->cmd = SNDCTL_SEQ_RESET; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_SYNC: args->cmd = SNDCTL_SEQ_SYNC; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SYNTH_INFO: args->cmd = SNDCTL_SYNTH_INFO; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_CTRLRATE: args->cmd = SNDCTL_SEQ_CTRLRATE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_GETOUTCOUNT: args->cmd = SNDCTL_SEQ_GETOUTCOUNT; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_GETINCOUNT: args->cmd = SNDCTL_SEQ_GETINCOUNT; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_PERCMODE: args->cmd = SNDCTL_SEQ_PERCMODE; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_FM_LOAD_INSTR: args->cmd = SNDCTL_FM_LOAD_INSTR; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_TESTMIDI: args->cmd = SNDCTL_SEQ_TESTMIDI; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_RESETSAMPLES: args->cmd = SNDCTL_SEQ_RESETSAMPLES; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_NRSYNTHS: args->cmd = SNDCTL_SEQ_NRSYNTHS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_NRMIDIS: args->cmd = SNDCTL_SEQ_NRMIDIS; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_MIDI_INFO: args->cmd = SNDCTL_MIDI_INFO; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_TRESHOLD: args->cmd = SNDCTL_SEQ_TRESHOLD; return (sys_ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SYNTH_MEMAVL: args->cmd = SNDCTL_SYNTH_MEMAVL; return (sys_ioctl(td, (struct ioctl_args *)args)); } return (ENOIOCTL); } /* * Console related ioctls */ static int linux_ioctl_console(struct thread *td, struct linux_ioctl_args *args) { - cap_rights_t rights; struct file *fp; int error; - error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp); + error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); switch (args->cmd & 0xffff) { case LINUX_KIOCSOUND: args->cmd = KIOCSOUND; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDMKTONE: args->cmd = KDMKTONE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDGETLED: args->cmd = KDGETLED; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDSETLED: args->cmd = KDSETLED; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDSETMODE: args->cmd = KDSETMODE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDGETMODE: args->cmd = KDGETMODE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDGKBMODE: args->cmd = KDGKBMODE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_KDSKBMODE: { int kbdmode; switch (args->arg) { case LINUX_KBD_RAW: kbdmode = K_RAW; break; case LINUX_KBD_XLATE: kbdmode = K_XLATE; break; case LINUX_KBD_MEDIUMRAW: kbdmode = K_RAW; break; default: fdrop(fp, td); return (EINVAL); } error = (fo_ioctl(fp, KDSKBMODE, (caddr_t)&kbdmode, td->td_ucred, td)); break; } case LINUX_VT_OPENQRY: args->cmd = VT_OPENQRY; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_VT_GETMODE: args->cmd = VT_GETMODE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_VT_SETMODE: { struct vt_mode mode; if ((error = copyin((void *)args->arg, &mode, sizeof(mode)))) break; if (LINUX_SIG_VALID(mode.relsig)) mode.relsig = linux_to_bsd_signal(mode.relsig); else mode.relsig = 0; if (LINUX_SIG_VALID(mode.acqsig)) mode.acqsig = linux_to_bsd_signal(mode.acqsig); else mode.acqsig = 0; /* XXX. Linux ignores frsig and set it to 0. */ mode.frsig = 0; if ((error = copyout(&mode, (void *)args->arg, sizeof(mode)))) break; args->cmd = VT_SETMODE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; } case LINUX_VT_GETSTATE: args->cmd = VT_GETACTIVE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_VT_RELDISP: args->cmd = VT_RELDISP; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_VT_ACTIVATE: args->cmd = VT_ACTIVATE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; case LINUX_VT_WAITACTIVE: args->cmd = VT_WAITACTIVE; error = (sys_ioctl(td, (struct ioctl_args *)args)); break; default: error = ENOIOCTL; break; } fdrop(fp, td); return (error); } /* * Criteria for interface name translation */ #define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER) /* * Translate a Linux interface name to a FreeBSD interface name, * and return the associated ifnet structure * bsdname and lxname need to be least IFNAMSIZ bytes long, but * can point to the same buffer. */ static struct ifnet * ifname_linux_to_bsd(struct thread *td, const char *lxname, char *bsdname) { struct ifnet *ifp; int len, unit; char *ep; int is_eth, index; for (len = 0; len < LINUX_IFNAMSIZ; ++len) if (!isalpha(lxname[len])) break; if (len == 0 || len == LINUX_IFNAMSIZ) return (NULL); unit = (int)strtoul(lxname + len, &ep, 10); if (ep == NULL || ep == lxname + len || ep >= lxname + LINUX_IFNAMSIZ) return (NULL); index = 0; is_eth = (len == 3 && !strncmp(lxname, "eth", len)) ? 1 : 0; CURVNET_SET(TD_TO_VNET(td)); IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* * Allow Linux programs to use FreeBSD names. Don't presume * we never have an interface named "eth", so don't make * the test optional based on is_eth. */ if (strncmp(ifp->if_xname, lxname, LINUX_IFNAMSIZ) == 0) break; if (is_eth && IFP_IS_ETH(ifp) && unit == index++) break; } IFNET_RUNLOCK(); CURVNET_RESTORE(); if (ifp != NULL) strlcpy(bsdname, ifp->if_xname, IFNAMSIZ); return (ifp); } /* * Implement the SIOCGIFNAME ioctl */ static int linux_ioctl_ifname(struct thread *td, struct l_ifreq *uifr) { struct l_ifreq ifr; struct ifnet *ifp; int error, ethno, index; error = copyin(uifr, &ifr, sizeof(ifr)); if (error != 0) return (error); CURVNET_SET(TD_TO_VNET(curthread)); IFNET_RLOCK(); index = 1; /* ifr.ifr_ifindex starts from 1 */ ethno = 0; error = ENODEV; TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifr.ifr_ifindex == index) { if (IFP_IS_ETH(ifp)) snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "eth%d", ethno); else strlcpy(ifr.ifr_name, ifp->if_xname, LINUX_IFNAMSIZ); error = 0; break; } if (IFP_IS_ETH(ifp)) ethno++; index++; } IFNET_RUNLOCK(); if (error == 0) error = copyout(&ifr, uifr, sizeof(ifr)); CURVNET_RESTORE(); return (error); } /* * Implement the SIOCGIFCONF ioctl */ static int linux_ifconf(struct thread *td, struct ifconf *uifc) { #ifdef COMPAT_LINUX32 struct l_ifconf ifc; #else struct ifconf ifc; #endif struct l_ifreq ifr; struct ifnet *ifp; struct ifaddr *ifa; struct sbuf *sb; int error, ethno, full = 0, valid_len, max_len; error = copyin(uifc, &ifc, sizeof(ifc)); if (error != 0) return (error); max_len = MAXPHYS - 1; CURVNET_SET(TD_TO_VNET(td)); /* handle the 'request buffer size' case */ if ((l_uintptr_t)ifc.ifc_buf == PTROUT(NULL)) { ifc.ifc_len = 0; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (sa->sa_family == AF_INET) ifc.ifc_len += sizeof(ifr); } } IFNET_RUNLOCK(); error = copyout(&ifc, uifc, sizeof(ifc)); CURVNET_RESTORE(); return (error); } if (ifc.ifc_len <= 0) { CURVNET_RESTORE(); return (EINVAL); } again: /* Keep track of eth interfaces */ ethno = 0; if (ifc.ifc_len <= max_len) { max_len = ifc.ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; /* Return all AF_INET addresses of all interfaces */ IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs = 0; bzero(&ifr, sizeof(ifr)); if (IFP_IS_ETH(ifp)) snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "eth%d", ethno++); else strlcpy(ifr.ifr_name, ifp->if_xname, LINUX_IFNAMSIZ); /* Walk the address list */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (sa->sa_family == AF_INET) { ifr.ifr_addr.sa_family = LINUX_AF_INET; memcpy(ifr.ifr_addr.sa_data, sa->sa_data, sizeof(ifr.ifr_addr.sa_data)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); addrs++; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } if (addrs == 0) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc.ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), PTRIN(ifc.ifc_buf), ifc.ifc_len); if (error == 0) error = copyout(&ifc, uifc, sizeof(ifc)); sbuf_delete(sb); CURVNET_RESTORE(); return (error); } static int linux_gifflags(struct thread *td, struct ifnet *ifp, struct l_ifreq *ifr) { l_short flags; flags = (ifp->if_flags | ifp->if_drv_flags) & 0xffff; /* these flags have no Linux equivalent */ flags &= ~(IFF_DRV_OACTIVE|IFF_SIMPLEX| IFF_LINK0|IFF_LINK1|IFF_LINK2); /* Linux' multicast flag is in a different bit */ if (flags & IFF_MULTICAST) { flags &= ~IFF_MULTICAST; flags |= 0x1000; } return (copyout(&flags, &ifr->ifr_flags, sizeof(flags))); } #define ARPHRD_ETHER 1 #define ARPHRD_LOOPBACK 772 static int linux_gifhwaddr(struct ifnet *ifp, struct l_ifreq *ifr) { struct ifaddr *ifa; struct sockaddr_dl *sdl; struct l_sockaddr lsa; if (ifp->if_type == IFT_LOOP) { bzero(&lsa, sizeof(lsa)); lsa.sa_family = ARPHRD_LOOPBACK; return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof(lsa))); } if (ifp->if_type != IFT_ETHER) return (ENOENT); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sdl = (struct sockaddr_dl*)ifa->ifa_addr; if (sdl != NULL && (sdl->sdl_family == AF_LINK) && (sdl->sdl_type == IFT_ETHER)) { bzero(&lsa, sizeof(lsa)); lsa.sa_family = ARPHRD_ETHER; bcopy(LLADDR(sdl), lsa.sa_data, LINUX_IFHWADDRLEN); return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof(lsa))); } } return (ENOENT); } /* * If we fault in bsd_to_linux_ifreq() then we will fault when we call * the native ioctl(). Thus, we don't really need to check the return * value of this function. */ static int bsd_to_linux_ifreq(struct ifreq *arg) { struct ifreq ifr; size_t ifr_len = sizeof(struct ifreq); int error; if ((error = copyin(arg, &ifr, ifr_len))) return (error); *(u_short *)&ifr.ifr_addr = ifr.ifr_addr.sa_family; error = copyout(&ifr, arg, ifr_len); return (error); } /* * Socket related ioctls */ static int linux_ioctl_socket(struct thread *td, struct linux_ioctl_args *args) { char lifname[LINUX_IFNAMSIZ], ifname[IFNAMSIZ]; - cap_rights_t rights; struct ifnet *ifp; struct file *fp; int error, type; ifp = NULL; error = 0; - error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp); + error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); type = fp->f_type; fdrop(fp, td); if (type != DTYPE_SOCKET) { /* not a socket - probably a tap / vmnet device */ switch (args->cmd) { case LINUX_SIOCGIFADDR: case LINUX_SIOCSIFADDR: case LINUX_SIOCGIFFLAGS: return (linux_ioctl_special(td, args)); default: return (ENOIOCTL); } } switch (args->cmd & 0xffff) { case LINUX_FIOGETOWN: case LINUX_FIOSETOWN: case LINUX_SIOCADDMULTI: case LINUX_SIOCATMARK: case LINUX_SIOCDELMULTI: case LINUX_SIOCGIFNAME: case LINUX_SIOCGIFCONF: case LINUX_SIOCGPGRP: case LINUX_SIOCSPGRP: case LINUX_SIOCGIFCOUNT: /* these ioctls don't take an interface name */ #ifdef DEBUG printf("%s(): ioctl %d\n", __func__, args->cmd & 0xffff); #endif break; case LINUX_SIOCGIFFLAGS: case LINUX_SIOCGIFADDR: case LINUX_SIOCSIFADDR: case LINUX_SIOCGIFDSTADDR: case LINUX_SIOCGIFBRDADDR: case LINUX_SIOCGIFNETMASK: case LINUX_SIOCSIFNETMASK: case LINUX_SIOCGIFMTU: case LINUX_SIOCSIFMTU: case LINUX_SIOCSIFNAME: case LINUX_SIOCGIFHWADDR: case LINUX_SIOCSIFHWADDR: case LINUX_SIOCDEVPRIVATE: case LINUX_SIOCDEVPRIVATE+1: case LINUX_SIOCGIFINDEX: /* copy in the interface name and translate it. */ error = copyin((void *)args->arg, lifname, LINUX_IFNAMSIZ); if (error != 0) return (error); #ifdef DEBUG printf("%s(): ioctl %d on %.*s\n", __func__, args->cmd & 0xffff, LINUX_IFNAMSIZ, lifname); #endif memset(ifname, 0, sizeof(ifname)); ifp = ifname_linux_to_bsd(td, lifname, ifname); if (ifp == NULL) return (EINVAL); /* * We need to copy it back out in case we pass the * request on to our native ioctl(), which will expect * the ifreq to be in user space and have the correct * interface name. */ error = copyout(ifname, (void *)args->arg, IFNAMSIZ); if (error != 0) return (error); #ifdef DEBUG printf("%s(): %s translated to %s\n", __func__, lifname, ifname); #endif break; default: return (ENOIOCTL); } switch (args->cmd & 0xffff) { case LINUX_FIOSETOWN: args->cmd = FIOSETOWN; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSPGRP: args->cmd = SIOCSPGRP; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_FIOGETOWN: args->cmd = FIOGETOWN; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGPGRP: args->cmd = SIOCGPGRP; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCATMARK: args->cmd = SIOCATMARK; error = sys_ioctl(td, (struct ioctl_args *)args); break; /* LINUX_SIOCGSTAMP */ case LINUX_SIOCGIFNAME: error = linux_ioctl_ifname(td, (struct l_ifreq *)args->arg); break; case LINUX_SIOCGIFCONF: error = linux_ifconf(td, (struct ifconf *)args->arg); break; case LINUX_SIOCGIFFLAGS: args->cmd = SIOCGIFFLAGS; error = linux_gifflags(td, ifp, (struct l_ifreq *)args->arg); break; case LINUX_SIOCGIFADDR: args->cmd = SIOCGIFADDR; error = sys_ioctl(td, (struct ioctl_args *)args); bsd_to_linux_ifreq((struct ifreq *)args->arg); break; case LINUX_SIOCSIFADDR: /* XXX probably doesn't work, included for completeness */ args->cmd = SIOCSIFADDR; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFDSTADDR: args->cmd = SIOCGIFDSTADDR; error = sys_ioctl(td, (struct ioctl_args *)args); bsd_to_linux_ifreq((struct ifreq *)args->arg); break; case LINUX_SIOCGIFBRDADDR: args->cmd = SIOCGIFBRDADDR; error = sys_ioctl(td, (struct ioctl_args *)args); bsd_to_linux_ifreq((struct ifreq *)args->arg); break; case LINUX_SIOCGIFNETMASK: args->cmd = SIOCGIFNETMASK; error = sys_ioctl(td, (struct ioctl_args *)args); bsd_to_linux_ifreq((struct ifreq *)args->arg); break; case LINUX_SIOCSIFNETMASK: error = ENOIOCTL; break; case LINUX_SIOCGIFMTU: args->cmd = SIOCGIFMTU; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFMTU: args->cmd = SIOCSIFMTU; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFNAME: error = ENOIOCTL; break; case LINUX_SIOCGIFHWADDR: error = linux_gifhwaddr(ifp, (struct l_ifreq *)args->arg); break; case LINUX_SIOCSIFHWADDR: error = ENOIOCTL; break; case LINUX_SIOCADDMULTI: args->cmd = SIOCADDMULTI; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCDELMULTI: args->cmd = SIOCDELMULTI; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFINDEX: args->cmd = SIOCGIFINDEX; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFCOUNT: error = 0; break; /* * XXX This is slightly bogus, but these ioctls are currently * XXX only used by the aironet (if_an) network driver. */ case LINUX_SIOCDEVPRIVATE: args->cmd = SIOCGPRIVATE_0; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCDEVPRIVATE+1: args->cmd = SIOCGPRIVATE_1; error = sys_ioctl(td, (struct ioctl_args *)args); break; } if (ifp != NULL) /* restore the original interface name */ copyout(lifname, (void *)args->arg, LINUX_IFNAMSIZ); #ifdef DEBUG printf("%s(): returning %d\n", __func__, error); #endif return (error); } /* * Device private ioctl handler */ static int linux_ioctl_private(struct thread *td, struct linux_ioctl_args *args) { - cap_rights_t rights; struct file *fp; int error, type; - error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp); + error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); type = fp->f_type; fdrop(fp, td); if (type == DTYPE_SOCKET) return (linux_ioctl_socket(td, args)); return (ENOIOCTL); } /* * DRM ioctl handler (sys/dev/drm) */ static int linux_ioctl_drm(struct thread *td, struct linux_ioctl_args *args) { args->cmd = SETDIR(args->cmd); return (sys_ioctl(td, (struct ioctl_args *)args)); } #ifdef COMPAT_LINUX32 #define CP(src,dst,fld) do { (dst).fld = (src).fld; } while (0) #define PTRIN_CP(src,dst,fld) \ do { (dst).fld = PTRIN((src).fld); } while (0) #define PTROUT_CP(src,dst,fld) \ do { (dst).fld = PTROUT((src).fld); } while (0) static int linux_ioctl_sg_io(struct thread *td, struct linux_ioctl_args *args) { struct sg_io_hdr io; struct sg_io_hdr32 io32; - cap_rights_t rights; struct file *fp; int error; - error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp); + error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) { printf("sg_linux_ioctl: fget returned %d\n", error); return (error); } if ((error = copyin((void *)args->arg, &io32, sizeof(io32))) != 0) goto out; CP(io32, io, interface_id); CP(io32, io, dxfer_direction); CP(io32, io, cmd_len); CP(io32, io, mx_sb_len); CP(io32, io, iovec_count); CP(io32, io, dxfer_len); PTRIN_CP(io32, io, dxferp); PTRIN_CP(io32, io, cmdp); PTRIN_CP(io32, io, sbp); CP(io32, io, timeout); CP(io32, io, flags); CP(io32, io, pack_id); PTRIN_CP(io32, io, usr_ptr); CP(io32, io, status); CP(io32, io, masked_status); CP(io32, io, msg_status); CP(io32, io, sb_len_wr); CP(io32, io, host_status); CP(io32, io, driver_status); CP(io32, io, resid); CP(io32, io, duration); CP(io32, io, info); if ((error = fo_ioctl(fp, SG_IO, (caddr_t)&io, td->td_ucred, td)) != 0) goto out; CP(io, io32, interface_id); CP(io, io32, dxfer_direction); CP(io, io32, cmd_len); CP(io, io32, mx_sb_len); CP(io, io32, iovec_count); CP(io, io32, dxfer_len); PTROUT_CP(io, io32, dxferp); PTROUT_CP(io, io32, cmdp); PTROUT_CP(io, io32, sbp); CP(io, io32, timeout); CP(io, io32, flags); CP(io, io32, pack_id); PTROUT_CP(io, io32, usr_ptr); CP(io, io32, status); CP(io, io32, masked_status); CP(io, io32, msg_status); CP(io, io32, sb_len_wr); CP(io, io32, host_status); CP(io, io32, driver_status); CP(io, io32, resid); CP(io, io32, duration); CP(io, io32, info); error = copyout(&io32, (void *)args->arg, sizeof(io32)); out: fdrop(fp, td); return (error); } #endif static int linux_ioctl_sg(struct thread *td, struct linux_ioctl_args *args) { switch (args->cmd) { case LINUX_SG_GET_VERSION_NUM: args->cmd = SG_GET_VERSION_NUM; break; case LINUX_SG_SET_TIMEOUT: args->cmd = SG_SET_TIMEOUT; break; case LINUX_SG_GET_TIMEOUT: args->cmd = SG_GET_TIMEOUT; break; case LINUX_SG_IO: args->cmd = SG_IO; #ifdef COMPAT_LINUX32 return (linux_ioctl_sg_io(td, args)); #endif break; case LINUX_SG_GET_RESERVED_SIZE: args->cmd = SG_GET_RESERVED_SIZE; break; case LINUX_SG_GET_SCSI_ID: args->cmd = SG_GET_SCSI_ID; break; case LINUX_SG_GET_SG_TABLESIZE: args->cmd = SG_GET_SG_TABLESIZE; break; default: return (ENODEV); } return (sys_ioctl(td, (struct ioctl_args *)args)); } /* * Video4Linux (V4L) ioctl handler */ static int linux_to_bsd_v4l_tuner(struct l_video_tuner *lvt, struct video_tuner *vt) { vt->tuner = lvt->tuner; strlcpy(vt->name, lvt->name, LINUX_VIDEO_TUNER_NAME_SIZE); vt->rangelow = lvt->rangelow; /* possible long size conversion */ vt->rangehigh = lvt->rangehigh; /* possible long size conversion */ vt->flags = lvt->flags; vt->mode = lvt->mode; vt->signal = lvt->signal; return (0); } static int bsd_to_linux_v4l_tuner(struct video_tuner *vt, struct l_video_tuner *lvt) { lvt->tuner = vt->tuner; strlcpy(lvt->name, vt->name, LINUX_VIDEO_TUNER_NAME_SIZE); lvt->rangelow = vt->rangelow; /* possible long size conversion */ lvt->rangehigh = vt->rangehigh; /* possible long size conversion */ lvt->flags = vt->flags; lvt->mode = vt->mode; lvt->signal = vt->signal; return (0); } #ifdef COMPAT_LINUX_V4L_CLIPLIST static int linux_to_bsd_v4l_clip(struct l_video_clip *lvc, struct video_clip *vc) { vc->x = lvc->x; vc->y = lvc->y; vc->width = lvc->width; vc->height = lvc->height; vc->next = PTRIN(lvc->next); /* possible pointer size conversion */ return (0); } #endif static int linux_to_bsd_v4l_window(struct l_video_window *lvw, struct video_window *vw) { vw->x = lvw->x; vw->y = lvw->y; vw->width = lvw->width; vw->height = lvw->height; vw->chromakey = lvw->chromakey; vw->flags = lvw->flags; vw->clips = PTRIN(lvw->clips); /* possible pointer size conversion */ vw->clipcount = lvw->clipcount; return (0); } static int bsd_to_linux_v4l_window(struct video_window *vw, struct l_video_window *lvw) { lvw->x = vw->x; lvw->y = vw->y; lvw->width = vw->width; lvw->height = vw->height; lvw->chromakey = vw->chromakey; lvw->flags = vw->flags; lvw->clips = PTROUT(vw->clips); /* possible pointer size conversion */ lvw->clipcount = vw->clipcount; return (0); } static int linux_to_bsd_v4l_buffer(struct l_video_buffer *lvb, struct video_buffer *vb) { vb->base = PTRIN(lvb->base); /* possible pointer size conversion */ vb->height = lvb->height; vb->width = lvb->width; vb->depth = lvb->depth; vb->bytesperline = lvb->bytesperline; return (0); } static int bsd_to_linux_v4l_buffer(struct video_buffer *vb, struct l_video_buffer *lvb) { lvb->base = PTROUT(vb->base); /* possible pointer size conversion */ lvb->height = vb->height; lvb->width = vb->width; lvb->depth = vb->depth; lvb->bytesperline = vb->bytesperline; return (0); } static int linux_to_bsd_v4l_code(struct l_video_code *lvc, struct video_code *vc) { strlcpy(vc->loadwhat, lvc->loadwhat, LINUX_VIDEO_CODE_LOADWHAT_SIZE); vc->datasize = lvc->datasize; vc->data = PTRIN(lvc->data); /* possible pointer size conversion */ return (0); } #ifdef COMPAT_LINUX_V4L_CLIPLIST static int linux_v4l_clip_copy(void *lvc, struct video_clip **ppvc) { int error; struct video_clip vclip; struct l_video_clip l_vclip; error = copyin(lvc, &l_vclip, sizeof(l_vclip)); if (error) return (error); linux_to_bsd_v4l_clip(&l_vclip, &vclip); /* XXX: If there can be no concurrency: s/M_NOWAIT/M_WAITOK/ */ if ((*ppvc = malloc(sizeof(**ppvc), M_LINUX, M_NOWAIT)) == NULL) return (ENOMEM); /* XXX: Linux has no ENOMEM here. */ memcpy(*ppvc, &vclip, sizeof(vclip)); (*ppvc)->next = NULL; return (0); } static int linux_v4l_cliplist_free(struct video_window *vw) { struct video_clip **ppvc; struct video_clip **ppvc_next; for (ppvc = &(vw->clips); *ppvc != NULL; ppvc = ppvc_next) { ppvc_next = &((*ppvc)->next); free(*ppvc, M_LINUX); } vw->clips = NULL; return (0); } static int linux_v4l_cliplist_copy(struct l_video_window *lvw, struct video_window *vw) { int error; int clipcount; void *plvc; struct video_clip **ppvc; /* * XXX: The cliplist is used to pass in a list of clipping * rectangles or, if clipcount == VIDEO_CLIP_BITMAP, a * clipping bitmap. Some Linux apps, however, appear to * leave cliplist and clips uninitialized. In any case, * the cliplist is not used by pwc(4), at the time of * writing, FreeBSD's only V4L driver. When a driver * that uses the cliplist is developed, this code may * need re-examiniation. */ error = 0; clipcount = vw->clipcount; if (clipcount == VIDEO_CLIP_BITMAP) { /* * In this case, the pointer (clips) is overloaded * to be a "void *" to a bitmap, therefore there * is no struct video_clip to copy now. */ } else if (clipcount > 0 && clipcount <= 16384) { /* * Clips points to list of clip rectangles, so * copy the list. * * XXX: Upper limit of 16384 was used here to try to * avoid cases when clipcount and clips pointer * are uninitialized and therefore have high random * values, as is the case in the Linux Skype * application. The value 16384 was chosen as that * is what is used in the Linux stradis(4) MPEG * decoder driver, the only place we found an * example of cliplist use. */ plvc = PTRIN(lvw->clips); vw->clips = NULL; ppvc = &(vw->clips); while (clipcount-- > 0) { if (plvc == NULL) { error = EFAULT; break; } else { error = linux_v4l_clip_copy(plvc, ppvc); if (error) { linux_v4l_cliplist_free(vw); break; } } ppvc = &((*ppvc)->next); plvc = PTRIN(((struct l_video_clip *) plvc)->next); } } else { /* * clipcount == 0 or negative (but not VIDEO_CLIP_BITMAP) * Force cliplist to null. */ vw->clipcount = 0; vw->clips = NULL; } return (error); } #endif static int linux_ioctl_v4l(struct thread *td, struct linux_ioctl_args *args) { - cap_rights_t rights; struct file *fp; int error; struct video_tuner vtun; struct video_window vwin; struct video_buffer vbuf; struct video_code vcode; struct l_video_tuner l_vtun; struct l_video_window l_vwin; struct l_video_buffer l_vbuf; struct l_video_code l_vcode; switch (args->cmd & 0xffff) { case LINUX_VIDIOCGCAP: args->cmd = VIDIOCGCAP; break; case LINUX_VIDIOCGCHAN: args->cmd = VIDIOCGCHAN; break; case LINUX_VIDIOCSCHAN: args->cmd = VIDIOCSCHAN; break; case LINUX_VIDIOCGTUNER: error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error != 0) return (error); error = copyin((void *) args->arg, &l_vtun, sizeof(l_vtun)); if (error) { fdrop(fp, td); return (error); } linux_to_bsd_v4l_tuner(&l_vtun, &vtun); error = fo_ioctl(fp, VIDIOCGTUNER, &vtun, td->td_ucred, td); if (!error) { bsd_to_linux_v4l_tuner(&vtun, &l_vtun); error = copyout(&l_vtun, (void *) args->arg, sizeof(l_vtun)); } fdrop(fp, td); return (error); case LINUX_VIDIOCSTUNER: error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error != 0) return (error); error = copyin((void *) args->arg, &l_vtun, sizeof(l_vtun)); if (error) { fdrop(fp, td); return (error); } linux_to_bsd_v4l_tuner(&l_vtun, &vtun); error = fo_ioctl(fp, VIDIOCSTUNER, &vtun, td->td_ucred, td); fdrop(fp, td); return (error); case LINUX_VIDIOCGPICT: args->cmd = VIDIOCGPICT; break; case LINUX_VIDIOCSPICT: args->cmd = VIDIOCSPICT; break; case LINUX_VIDIOCCAPTURE: args->cmd = VIDIOCCAPTURE; break; case LINUX_VIDIOCGWIN: error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error != 0) return (error); error = fo_ioctl(fp, VIDIOCGWIN, &vwin, td->td_ucred, td); if (!error) { bsd_to_linux_v4l_window(&vwin, &l_vwin); error = copyout(&l_vwin, (void *) args->arg, sizeof(l_vwin)); } fdrop(fp, td); return (error); case LINUX_VIDIOCSWIN: error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error != 0) return (error); error = copyin((void *) args->arg, &l_vwin, sizeof(l_vwin)); if (error) { fdrop(fp, td); return (error); } linux_to_bsd_v4l_window(&l_vwin, &vwin); #ifdef COMPAT_LINUX_V4L_CLIPLIST error = linux_v4l_cliplist_copy(&l_vwin, &vwin); if (error) { fdrop(fp, td); return (error); } #endif error = fo_ioctl(fp, VIDIOCSWIN, &vwin, td->td_ucred, td); fdrop(fp, td); #ifdef COMPAT_LINUX_V4L_CLIPLIST linux_v4l_cliplist_free(&vwin); #endif return (error); case LINUX_VIDIOCGFBUF: error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error != 0) return (error); error = fo_ioctl(fp, VIDIOCGFBUF, &vbuf, td->td_ucred, td); if (!error) { bsd_to_linux_v4l_buffer(&vbuf, &l_vbuf); error = copyout(&l_vbuf, (void *) args->arg, sizeof(l_vbuf)); } fdrop(fp, td); return (error); case LINUX_VIDIOCSFBUF: error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error != 0) return (error); error = copyin((void *) args->arg, &l_vbuf, sizeof(l_vbuf)); if (error) { fdrop(fp, td); return (error); } linux_to_bsd_v4l_buffer(&l_vbuf, &vbuf); error = fo_ioctl(fp, VIDIOCSFBUF, &vbuf, td->td_ucred, td); fdrop(fp, td); return (error); case LINUX_VIDIOCKEY: args->cmd = VIDIOCKEY; break; case LINUX_VIDIOCGFREQ: args->cmd = VIDIOCGFREQ; break; case LINUX_VIDIOCSFREQ: args->cmd = VIDIOCSFREQ; break; case LINUX_VIDIOCGAUDIO: args->cmd = VIDIOCGAUDIO; break; case LINUX_VIDIOCSAUDIO: args->cmd = VIDIOCSAUDIO; break; case LINUX_VIDIOCSYNC: args->cmd = VIDIOCSYNC; break; case LINUX_VIDIOCMCAPTURE: args->cmd = VIDIOCMCAPTURE; break; case LINUX_VIDIOCGMBUF: args->cmd = VIDIOCGMBUF; break; case LINUX_VIDIOCGUNIT: args->cmd = VIDIOCGUNIT; break; case LINUX_VIDIOCGCAPTURE: args->cmd = VIDIOCGCAPTURE; break; case LINUX_VIDIOCSCAPTURE: args->cmd = VIDIOCSCAPTURE; break; case LINUX_VIDIOCSPLAYMODE: args->cmd = VIDIOCSPLAYMODE; break; case LINUX_VIDIOCSWRITEMODE: args->cmd = VIDIOCSWRITEMODE; break; case LINUX_VIDIOCGPLAYINFO: args->cmd = VIDIOCGPLAYINFO; break; case LINUX_VIDIOCSMICROCODE: error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error != 0) return (error); error = copyin((void *) args->arg, &l_vcode, sizeof(l_vcode)); if (error) { fdrop(fp, td); return (error); } linux_to_bsd_v4l_code(&l_vcode, &vcode); error = fo_ioctl(fp, VIDIOCSMICROCODE, &vcode, td->td_ucred, td); fdrop(fp, td); return (error); case LINUX_VIDIOCGVBIFMT: args->cmd = VIDIOCGVBIFMT; break; case LINUX_VIDIOCSVBIFMT: args->cmd = VIDIOCSVBIFMT; break; default: return (ENOIOCTL); } error = sys_ioctl(td, (struct ioctl_args *)args); return (error); } /* * Special ioctl handler */ static int linux_ioctl_special(struct thread *td, struct linux_ioctl_args *args) { int error; switch (args->cmd) { case LINUX_SIOCGIFADDR: args->cmd = SIOCGIFADDR; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFADDR: args->cmd = SIOCSIFADDR; error = sys_ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFFLAGS: args->cmd = SIOCGIFFLAGS; error = sys_ioctl(td, (struct ioctl_args *)args); break; default: error = ENOIOCTL; } return (error); } static int linux_to_bsd_v4l2_standard(struct l_v4l2_standard *lvstd, struct v4l2_standard *vstd) { vstd->index = lvstd->index; vstd->id = lvstd->id; CTASSERT(sizeof(vstd->name) == sizeof(lvstd->name)); memcpy(vstd->name, lvstd->name, sizeof(vstd->name)); vstd->frameperiod = lvstd->frameperiod; vstd->framelines = lvstd->framelines; CTASSERT(sizeof(vstd->reserved) == sizeof(lvstd->reserved)); memcpy(vstd->reserved, lvstd->reserved, sizeof(vstd->reserved)); return (0); } static int bsd_to_linux_v4l2_standard(struct v4l2_standard *vstd, struct l_v4l2_standard *lvstd) { lvstd->index = vstd->index; lvstd->id = vstd->id; CTASSERT(sizeof(vstd->name) == sizeof(lvstd->name)); memcpy(lvstd->name, vstd->name, sizeof(lvstd->name)); lvstd->frameperiod = vstd->frameperiod; lvstd->framelines = vstd->framelines; CTASSERT(sizeof(vstd->reserved) == sizeof(lvstd->reserved)); memcpy(lvstd->reserved, vstd->reserved, sizeof(lvstd->reserved)); return (0); } static int linux_to_bsd_v4l2_buffer(struct l_v4l2_buffer *lvb, struct v4l2_buffer *vb) { vb->index = lvb->index; vb->type = lvb->type; vb->bytesused = lvb->bytesused; vb->flags = lvb->flags; vb->field = lvb->field; vb->timestamp.tv_sec = lvb->timestamp.tv_sec; vb->timestamp.tv_usec = lvb->timestamp.tv_usec; memcpy(&vb->timecode, &lvb->timecode, sizeof (lvb->timecode)); vb->sequence = lvb->sequence; vb->memory = lvb->memory; if (lvb->memory == V4L2_MEMORY_USERPTR) /* possible pointer size conversion */ vb->m.userptr = (unsigned long)PTRIN(lvb->m.userptr); else vb->m.offset = lvb->m.offset; vb->length = lvb->length; vb->input = lvb->input; vb->reserved = lvb->reserved; return (0); } static int bsd_to_linux_v4l2_buffer(struct v4l2_buffer *vb, struct l_v4l2_buffer *lvb) { lvb->index = vb->index; lvb->type = vb->type; lvb->bytesused = vb->bytesused; lvb->flags = vb->flags; lvb->field = vb->field; lvb->timestamp.tv_sec = vb->timestamp.tv_sec; lvb->timestamp.tv_usec = vb->timestamp.tv_usec; memcpy(&lvb->timecode, &vb->timecode, sizeof (vb->timecode)); lvb->sequence = vb->sequence; lvb->memory = vb->memory; if (vb->memory == V4L2_MEMORY_USERPTR) /* possible pointer size conversion */ lvb->m.userptr = PTROUT(vb->m.userptr); else lvb->m.offset = vb->m.offset; lvb->length = vb->length; lvb->input = vb->input; lvb->reserved = vb->reserved; return (0); } static int linux_to_bsd_v4l2_format(struct l_v4l2_format *lvf, struct v4l2_format *vf) { vf->type = lvf->type; if (lvf->type == V4L2_BUF_TYPE_VIDEO_OVERLAY #ifdef V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY || lvf->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY #endif ) /* * XXX TODO - needs 32 -> 64 bit conversion: * (unused by webcams?) */ return (EINVAL); memcpy(&vf->fmt, &lvf->fmt, sizeof(vf->fmt)); return (0); } static int bsd_to_linux_v4l2_format(struct v4l2_format *vf, struct l_v4l2_format *lvf) { lvf->type = vf->type; if (vf->type == V4L2_BUF_TYPE_VIDEO_OVERLAY #ifdef V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY || vf->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY #endif ) /* * XXX TODO - needs 32 -> 64 bit conversion: * (unused by webcams?) */ return (EINVAL); memcpy(&lvf->fmt, &vf->fmt, sizeof(vf->fmt)); return (0); } static int linux_ioctl_v4l2(struct thread *td, struct linux_ioctl_args *args) { - cap_rights_t rights; struct file *fp; int error; struct v4l2_format vformat; struct l_v4l2_format l_vformat; struct v4l2_standard vstd; struct l_v4l2_standard l_vstd; struct l_v4l2_buffer l_vbuf; struct v4l2_buffer vbuf; struct v4l2_input vinp; switch (args->cmd & 0xffff) { case LINUX_VIDIOC_RESERVED: case LINUX_VIDIOC_LOG_STATUS: if ((args->cmd & IOC_DIRMASK) != LINUX_IOC_VOID) return (ENOIOCTL); args->cmd = (args->cmd & 0xffff) | IOC_VOID; break; case LINUX_VIDIOC_OVERLAY: case LINUX_VIDIOC_STREAMON: case LINUX_VIDIOC_STREAMOFF: case LINUX_VIDIOC_S_STD: case LINUX_VIDIOC_S_TUNER: case LINUX_VIDIOC_S_AUDIO: case LINUX_VIDIOC_S_AUDOUT: case LINUX_VIDIOC_S_MODULATOR: case LINUX_VIDIOC_S_FREQUENCY: case LINUX_VIDIOC_S_CROP: case LINUX_VIDIOC_S_JPEGCOMP: case LINUX_VIDIOC_S_PRIORITY: case LINUX_VIDIOC_DBG_S_REGISTER: case LINUX_VIDIOC_S_HW_FREQ_SEEK: case LINUX_VIDIOC_SUBSCRIBE_EVENT: case LINUX_VIDIOC_UNSUBSCRIBE_EVENT: args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_IN; break; case LINUX_VIDIOC_QUERYCAP: case LINUX_VIDIOC_G_STD: case LINUX_VIDIOC_G_AUDIO: case LINUX_VIDIOC_G_INPUT: case LINUX_VIDIOC_G_OUTPUT: case LINUX_VIDIOC_G_AUDOUT: case LINUX_VIDIOC_G_JPEGCOMP: case LINUX_VIDIOC_QUERYSTD: case LINUX_VIDIOC_G_PRIORITY: case LINUX_VIDIOC_QUERY_DV_PRESET: args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_OUT; break; case LINUX_VIDIOC_ENUM_FMT: case LINUX_VIDIOC_REQBUFS: case LINUX_VIDIOC_G_PARM: case LINUX_VIDIOC_S_PARM: case LINUX_VIDIOC_G_CTRL: case LINUX_VIDIOC_S_CTRL: case LINUX_VIDIOC_G_TUNER: case LINUX_VIDIOC_QUERYCTRL: case LINUX_VIDIOC_QUERYMENU: case LINUX_VIDIOC_S_INPUT: case LINUX_VIDIOC_S_OUTPUT: case LINUX_VIDIOC_ENUMOUTPUT: case LINUX_VIDIOC_G_MODULATOR: case LINUX_VIDIOC_G_FREQUENCY: case LINUX_VIDIOC_CROPCAP: case LINUX_VIDIOC_G_CROP: case LINUX_VIDIOC_ENUMAUDIO: case LINUX_VIDIOC_ENUMAUDOUT: case LINUX_VIDIOC_G_SLICED_VBI_CAP: #ifdef VIDIOC_ENUM_FRAMESIZES case LINUX_VIDIOC_ENUM_FRAMESIZES: case LINUX_VIDIOC_ENUM_FRAMEINTERVALS: case LINUX_VIDIOC_ENCODER_CMD: case LINUX_VIDIOC_TRY_ENCODER_CMD: #endif case LINUX_VIDIOC_DBG_G_REGISTER: case LINUX_VIDIOC_DBG_G_CHIP_IDENT: case LINUX_VIDIOC_ENUM_DV_PRESETS: case LINUX_VIDIOC_S_DV_PRESET: case LINUX_VIDIOC_G_DV_PRESET: case LINUX_VIDIOC_S_DV_TIMINGS: case LINUX_VIDIOC_G_DV_TIMINGS: args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_INOUT; break; case LINUX_VIDIOC_G_FMT: case LINUX_VIDIOC_S_FMT: case LINUX_VIDIOC_TRY_FMT: error = copyin((void *)args->arg, &l_vformat, sizeof(l_vformat)); if (error) return (error); error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error) return (error); if (linux_to_bsd_v4l2_format(&l_vformat, &vformat) != 0) error = EINVAL; else if ((args->cmd & 0xffff) == LINUX_VIDIOC_G_FMT) error = fo_ioctl(fp, VIDIOC_G_FMT, &vformat, td->td_ucred, td); else if ((args->cmd & 0xffff) == LINUX_VIDIOC_S_FMT) error = fo_ioctl(fp, VIDIOC_S_FMT, &vformat, td->td_ucred, td); else error = fo_ioctl(fp, VIDIOC_TRY_FMT, &vformat, td->td_ucred, td); bsd_to_linux_v4l2_format(&vformat, &l_vformat); copyout(&l_vformat, (void *)args->arg, sizeof(l_vformat)); fdrop(fp, td); return (error); case LINUX_VIDIOC_ENUMSTD: error = copyin((void *)args->arg, &l_vstd, sizeof(l_vstd)); if (error) return (error); linux_to_bsd_v4l2_standard(&l_vstd, &vstd); error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error) return (error); error = fo_ioctl(fp, VIDIOC_ENUMSTD, (caddr_t)&vstd, td->td_ucred, td); if (error) { fdrop(fp, td); return (error); } bsd_to_linux_v4l2_standard(&vstd, &l_vstd); error = copyout(&l_vstd, (void *)args->arg, sizeof(l_vstd)); fdrop(fp, td); return (error); case LINUX_VIDIOC_ENUMINPUT: /* * The Linux struct l_v4l2_input differs only in size, * it has no padding at the end. */ error = copyin((void *)args->arg, &vinp, sizeof(struct l_v4l2_input)); if (error != 0) return (error); error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error != 0) return (error); error = fo_ioctl(fp, VIDIOC_ENUMINPUT, (caddr_t)&vinp, td->td_ucred, td); if (error) { fdrop(fp, td); return (error); } error = copyout(&vinp, (void *)args->arg, sizeof(struct l_v4l2_input)); fdrop(fp, td); return (error); case LINUX_VIDIOC_QUERYBUF: case LINUX_VIDIOC_QBUF: case LINUX_VIDIOC_DQBUF: error = copyin((void *)args->arg, &l_vbuf, sizeof(l_vbuf)); if (error) return (error); error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error) return (error); linux_to_bsd_v4l2_buffer(&l_vbuf, &vbuf); if ((args->cmd & 0xffff) == LINUX_VIDIOC_QUERYBUF) error = fo_ioctl(fp, VIDIOC_QUERYBUF, &vbuf, td->td_ucred, td); else if ((args->cmd & 0xffff) == LINUX_VIDIOC_QBUF) error = fo_ioctl(fp, VIDIOC_QBUF, &vbuf, td->td_ucred, td); else error = fo_ioctl(fp, VIDIOC_DQBUF, &vbuf, td->td_ucred, td); bsd_to_linux_v4l2_buffer(&vbuf, &l_vbuf); copyout(&l_vbuf, (void *)args->arg, sizeof(l_vbuf)); fdrop(fp, td); return (error); /* * XXX TODO - these need 32 -> 64 bit conversion: * (are any of them needed for webcams?) */ case LINUX_VIDIOC_G_FBUF: case LINUX_VIDIOC_S_FBUF: case LINUX_VIDIOC_G_EXT_CTRLS: case LINUX_VIDIOC_S_EXT_CTRLS: case LINUX_VIDIOC_TRY_EXT_CTRLS: case LINUX_VIDIOC_DQEVENT: default: return (ENOIOCTL); } error = sys_ioctl(td, (struct ioctl_args *)args); return (error); } /* * Support for emulators/linux-libusb. This port uses FBSD_LUSB* macros * instead of USB* ones. This lets us to provide correct values for cmd. * 0xffffffe0 -- 0xffffffff range seemed to be the least collision-prone. */ static int linux_ioctl_fbsd_usb(struct thread *td, struct linux_ioctl_args *args) { int error; error = 0; switch (args->cmd) { case FBSD_LUSB_DEVICEENUMERATE: args->cmd = USB_DEVICEENUMERATE; break; case FBSD_LUSB_DEV_QUIRK_ADD: args->cmd = USB_DEV_QUIRK_ADD; break; case FBSD_LUSB_DEV_QUIRK_GET: args->cmd = USB_DEV_QUIRK_GET; break; case FBSD_LUSB_DEV_QUIRK_REMOVE: args->cmd = USB_DEV_QUIRK_REMOVE; break; case FBSD_LUSB_DO_REQUEST: args->cmd = USB_DO_REQUEST; break; case FBSD_LUSB_FS_CLEAR_STALL_SYNC: args->cmd = USB_FS_CLEAR_STALL_SYNC; break; case FBSD_LUSB_FS_CLOSE: args->cmd = USB_FS_CLOSE; break; case FBSD_LUSB_FS_COMPLETE: args->cmd = USB_FS_COMPLETE; break; case FBSD_LUSB_FS_INIT: args->cmd = USB_FS_INIT; break; case FBSD_LUSB_FS_OPEN: args->cmd = USB_FS_OPEN; break; case FBSD_LUSB_FS_START: args->cmd = USB_FS_START; break; case FBSD_LUSB_FS_STOP: args->cmd = USB_FS_STOP; break; case FBSD_LUSB_FS_UNINIT: args->cmd = USB_FS_UNINIT; break; case FBSD_LUSB_GET_CONFIG: args->cmd = USB_GET_CONFIG; break; case FBSD_LUSB_GET_DEVICEINFO: args->cmd = USB_GET_DEVICEINFO; break; case FBSD_LUSB_GET_DEVICE_DESC: args->cmd = USB_GET_DEVICE_DESC; break; case FBSD_LUSB_GET_FULL_DESC: args->cmd = USB_GET_FULL_DESC; break; case FBSD_LUSB_GET_IFACE_DRIVER: args->cmd = USB_GET_IFACE_DRIVER; break; case FBSD_LUSB_GET_PLUGTIME: args->cmd = USB_GET_PLUGTIME; break; case FBSD_LUSB_GET_POWER_MODE: args->cmd = USB_GET_POWER_MODE; break; case FBSD_LUSB_GET_REPORT_DESC: args->cmd = USB_GET_REPORT_DESC; break; case FBSD_LUSB_GET_REPORT_ID: args->cmd = USB_GET_REPORT_ID; break; case FBSD_LUSB_GET_TEMPLATE: args->cmd = USB_GET_TEMPLATE; break; case FBSD_LUSB_IFACE_DRIVER_ACTIVE: args->cmd = USB_IFACE_DRIVER_ACTIVE; break; case FBSD_LUSB_IFACE_DRIVER_DETACH: args->cmd = USB_IFACE_DRIVER_DETACH; break; case FBSD_LUSB_QUIRK_NAME_GET: args->cmd = USB_QUIRK_NAME_GET; break; case FBSD_LUSB_READ_DIR: args->cmd = USB_READ_DIR; break; case FBSD_LUSB_SET_ALTINTERFACE: args->cmd = USB_SET_ALTINTERFACE; break; case FBSD_LUSB_SET_CONFIG: args->cmd = USB_SET_CONFIG; break; case FBSD_LUSB_SET_IMMED: args->cmd = USB_SET_IMMED; break; case FBSD_LUSB_SET_POWER_MODE: args->cmd = USB_SET_POWER_MODE; break; case FBSD_LUSB_SET_TEMPLATE: args->cmd = USB_SET_TEMPLATE; break; case FBSD_LUSB_FS_OPEN_STREAM: args->cmd = USB_FS_OPEN_STREAM; break; case FBSD_LUSB_GET_DEV_PORT_PATH: args->cmd = USB_GET_DEV_PORT_PATH; break; case FBSD_LUSB_GET_POWER_USAGE: args->cmd = USB_GET_POWER_USAGE; break; default: error = ENOIOCTL; } if (error != ENOIOCTL) error = sys_ioctl(td, (struct ioctl_args *)args); return (error); } /* * Some evdev ioctls must be translated. * - EVIOCGMTSLOTS is a IOC_READ ioctl on Linux although it has input data * (must be IOC_INOUT on FreeBSD). * - On Linux, EVIOCGRAB, EVIOCREVOKE and EVIOCRMFF are defined as _IOW with * an int argument. You don't pass an int pointer to the ioctl(), however, * but just the int directly. On FreeBSD, they are defined as _IOWINT for * this to work. */ static int linux_ioctl_evdev(struct thread *td, struct linux_ioctl_args *args) { - cap_rights_t rights; struct file *fp; clockid_t clock; int error; args->cmd = SETDIR(args->cmd); switch (args->cmd) { case (EVIOCGRAB & ~IOC_DIRMASK) | IOC_IN: args->cmd = EVIOCGRAB; break; case (EVIOCREVOKE & ~IOC_DIRMASK) | IOC_IN: args->cmd = EVIOCREVOKE; break; case (EVIOCRMFF & ~IOC_DIRMASK) | IOC_IN: args->cmd = EVIOCRMFF; break; case EVIOCSCLOCKID: { error = copyin(PTRIN(args->arg), &clock, sizeof(clock)); if (error != 0) return (error); if (clock & ~(LINUX_IOCTL_EVDEV_CLK)) return (EINVAL); error = linux_to_native_clockid(&clock, clock); if (error != 0) return (error); error = fget(td, args->fd, - cap_rights_init(&rights, CAP_IOCTL), &fp); + &cap_ioctl_rights, &fp); if (error != 0) return (error); error = fo_ioctl(fp, EVIOCSCLOCKID, &clock, td->td_ucred, td); fdrop(fp, td); return (error); } default: break; } if (IOCBASECMD(args->cmd) == ((EVIOCGMTSLOTS(0) & ~IOC_DIRMASK) | IOC_OUT)) args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_INOUT; return (sys_ioctl(td, (struct ioctl_args *)args)); } /* * main ioctl syscall function */ int linux_ioctl(struct thread *td, struct linux_ioctl_args *args) { - cap_rights_t rights; struct file *fp; struct handler_element *he; int error, cmd; #ifdef DEBUG if (ldebug(ioctl)) printf(ARGS(ioctl, "%d, %04lx, *"), args->fd, (unsigned long)args->cmd); #endif - error = fget(td, args->fd, cap_rights_init(&rights, CAP_IOCTL), &fp); + error = fget(td, args->fd, &cap_ioctl_rights, &fp); if (error != 0) return (error); if ((fp->f_flag & (FREAD|FWRITE)) == 0) { fdrop(fp, td); return (EBADF); } /* Iterate over the ioctl handlers */ cmd = args->cmd & 0xffff; sx_slock(&linux_ioctl_sx); mtx_lock(&Giant); TAILQ_FOREACH(he, &handlers, list) { if (cmd >= he->low && cmd <= he->high) { error = (*he->func)(td, args); if (error != ENOIOCTL) { mtx_unlock(&Giant); sx_sunlock(&linux_ioctl_sx); fdrop(fp, td); return (error); } } } mtx_unlock(&Giant); sx_sunlock(&linux_ioctl_sx); fdrop(fp, td); switch (args->cmd & 0xffff) { case LINUX_BTRFS_IOC_CLONE: return (ENOTSUP); default: linux_msg(td, "ioctl fd=%d, cmd=0x%x ('%c',%d) is not implemented", args->fd, (int)(args->cmd & 0xffff), (int)(args->cmd & 0xff00) >> 8, (int)(args->cmd & 0xff)); break; } return (EINVAL); } int linux_ioctl_register_handler(struct linux_ioctl_handler *h) { struct handler_element *he, *cur; if (h == NULL || h->func == NULL) return (EINVAL); /* * Reuse the element if the handler is already on the list, otherwise * create a new element. */ sx_xlock(&linux_ioctl_sx); TAILQ_FOREACH(he, &handlers, list) { if (he->func == h->func) break; } if (he == NULL) { he = malloc(sizeof(*he), M_LINUX, M_WAITOK); he->func = h->func; } else TAILQ_REMOVE(&handlers, he, list); /* Initialize range information. */ he->low = h->low; he->high = h->high; he->span = h->high - h->low + 1; /* Add the element to the list, sorted on span. */ TAILQ_FOREACH(cur, &handlers, list) { if (cur->span > he->span) { TAILQ_INSERT_BEFORE(cur, he, list); sx_xunlock(&linux_ioctl_sx); return (0); } } TAILQ_INSERT_TAIL(&handlers, he, list); sx_xunlock(&linux_ioctl_sx); return (0); } int linux_ioctl_unregister_handler(struct linux_ioctl_handler *h) { struct handler_element *he; if (h == NULL || h->func == NULL) return (EINVAL); sx_xlock(&linux_ioctl_sx); TAILQ_FOREACH(he, &handlers, list) { if (he->func == h->func) { TAILQ_REMOVE(&handlers, he, list); sx_xunlock(&linux_ioctl_sx); free(he, M_LINUX); return (0); } } sx_xunlock(&linux_ioctl_sx); return (EINVAL); } Index: head/sys/compat/linux/linux_mmap.c =================================================================== --- head/sys/compat/linux/linux_mmap.c (revision 333424) +++ head/sys/compat/linux/linux_mmap.c (revision 333425) @@ -1,251 +1,250 @@ /*- * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2002 Doug Rabson * Copyright (c) 2000 Marcel Moolenaar * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define STACK_SIZE (2 * 1024 * 1024) #define GUARD_SIZE (4 * PAGE_SIZE) #if defined(__amd64__) static void linux_fixup_prot(struct thread *td, int *prot); #endif int linux_mmap_common(struct thread *td, uintptr_t addr, size_t len, int prot, int flags, int fd, off_t pos) { struct proc *p = td->td_proc; struct vmspace *vms = td->td_proc->p_vmspace; int bsd_flags, error; struct file *fp; - cap_rights_t rights; LINUX_CTR6(mmap2, "0x%lx, %ld, %ld, 0x%08lx, %ld, 0x%lx", addr, len, prot, flags, fd, pos); error = 0; bsd_flags = 0; fp = NULL; /* * Linux mmap(2): * You must specify exactly one of MAP_SHARED and MAP_PRIVATE */ if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE))) return (EINVAL); if (flags & LINUX_MAP_SHARED) bsd_flags |= MAP_SHARED; if (flags & LINUX_MAP_PRIVATE) bsd_flags |= MAP_PRIVATE; if (flags & LINUX_MAP_FIXED) bsd_flags |= MAP_FIXED; if (flags & LINUX_MAP_ANON) { /* Enforce pos to be on page boundary, then ignore. */ if ((pos & PAGE_MASK) != 0) return (EINVAL); pos = 0; bsd_flags |= MAP_ANON; } else bsd_flags |= MAP_NOSYNC; if (flags & LINUX_MAP_GROWSDOWN) bsd_flags |= MAP_STACK; /* * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC * on Linux/i386 if the binary requires executable stack. * We do this only for IA32 emulation as on native i386 this is does not * make sense without PAE. * * XXX. Linux checks that the file system is not mounted with noexec. */ #if defined(__amd64__) linux_fixup_prot(td, &prot); #endif /* Linux does not check file descriptor when MAP_ANONYMOUS is set. */ fd = (bsd_flags & MAP_ANON) ? -1 : fd; if (fd != -1) { /* * Linux follows Solaris mmap(2) description: * The file descriptor fildes is opened with * read permission, regardless of the * protection options specified. */ - error = fget(td, fd, cap_rights_init(&rights, CAP_MMAP), &fp); + error = fget(td, fd, &cap_mmap_rights, &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_DEV) { fdrop(fp, td); return (EINVAL); } /* Linux mmap() just fails for O_WRONLY files */ if (!(fp->f_flag & FREAD)) { fdrop(fp, td); return (EACCES); } fdrop(fp, td); } if (flags & LINUX_MAP_GROWSDOWN) { /* * The Linux MAP_GROWSDOWN option does not limit auto * growth of the region. Linux mmap with this option * takes as addr the initial BOS, and as len, the initial * region size. It can then grow down from addr without * limit. However, Linux threads has an implicit internal * limit to stack size of STACK_SIZE. Its just not * enforced explicitly in Linux. But, here we impose * a limit of (STACK_SIZE - GUARD_SIZE) on the stack * region, since we can do this with our mmap. * * Our mmap with MAP_STACK takes addr as the maximum * downsize limit on BOS, and as len the max size of * the region. It then maps the top SGROWSIZ bytes, * and auto grows the region down, up to the limit * in addr. * * If we don't use the MAP_STACK option, the effect * of this code is to allocate a stack region of a * fixed size of (STACK_SIZE - GUARD_SIZE). */ if ((caddr_t)addr + len > vms->vm_maxsaddr) { /* * Some Linux apps will attempt to mmap * thread stacks near the top of their * address space. If their TOS is greater * than vm_maxsaddr, vm_map_growstack() * will confuse the thread stack with the * process stack and deliver a SEGV if they * attempt to grow the thread stack past their * current stacksize rlimit. To avoid this, * adjust vm_maxsaddr upwards to reflect * the current stacksize rlimit rather * than the maximum possible stacksize. * It would be better to adjust the * mmap'ed region, but some apps do not check * mmap's return value. */ PROC_LOCK(p); vms->vm_maxsaddr = (char *)p->p_sysent->sv_usrstack - lim_cur_proc(p, RLIMIT_STACK); PROC_UNLOCK(p); } /* * This gives us our maximum stack size and a new BOS. * If we're using VM_STACK, then mmap will just map * the top SGROWSIZ bytes, and let the stack grow down * to the limit at BOS. If we're not using VM_STACK * we map the full stack, since we don't have a way * to autogrow it. */ if (len <= STACK_SIZE - GUARD_SIZE) { addr = addr - (STACK_SIZE - GUARD_SIZE - len); len = STACK_SIZE - GUARD_SIZE; } } /* * FreeBSD is free to ignore the address hint if MAP_FIXED wasn't * passed. However, some Linux applications, like the ART runtime, * depend on the hint. If the MAP_FIXED wasn't passed, but the * address is not zero, try with MAP_FIXED and MAP_EXCL first, * and fall back to the normal behaviour if that fails. */ if (addr != 0 && (bsd_flags & MAP_FIXED) == 0 && (bsd_flags & MAP_EXCL) == 0) { error = kern_mmap(td, addr, len, prot, bsd_flags | MAP_FIXED | MAP_EXCL, fd, pos); if (error == 0) goto out; } error = kern_mmap(td, addr, len, prot, bsd_flags, fd, pos); out: LINUX_CTR2(mmap2, "return: %d (%p)", error, td->td_retval[0]); return (error); } int linux_mprotect_common(struct thread *td, uintptr_t addr, size_t len, int prot) { #if defined(__amd64__) linux_fixup_prot(td, &prot); #endif return (kern_mprotect(td, addr, len, prot)); } #if defined(__amd64__) static void linux_fixup_prot(struct thread *td, int *prot) { struct linux_pemuldata *pem; if (SV_PROC_FLAG(td->td_proc, SV_ILP32) && *prot & PROT_READ) { pem = pem_find(td->td_proc); if (pem->persona & LINUX_READ_IMPLIES_EXEC) *prot |= PROT_EXEC; } } #endif Index: head/sys/compat/linux/linux_socket.c =================================================================== --- head/sys/compat/linux/linux_socket.c (revision 333424) +++ head/sys/compat/linux/linux_socket.c (revision 333425) @@ -1,1767 +1,1764 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* XXX we use functions that might not exist. */ #include "opt_compat.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include static int linux_to_bsd_domain(int); static int linux_sendmsg_common(struct thread *, l_int, struct l_msghdr *, l_uint); static int linux_recvmsg_common(struct thread *, l_int, struct l_msghdr *, l_uint, struct msghdr *); static int linux_set_socket_flags(int, int *); /* * Reads a Linux sockaddr and does any necessary translation. * Linux sockaddrs don't have a length field, only a family. * Copy the osockaddr structure pointed to by osa to kernel, adjust * family and convert to sockaddr. */ static int linux_getsockaddr(struct sockaddr **sap, const struct osockaddr *osa, int salen) { struct sockaddr *sa; struct osockaddr *kosa; #ifdef INET6 struct sockaddr_in6 *sin6; int oldv6size; #endif char *name; int bdom, error, hdrlen, namelen; if (salen < 2 || salen > UCHAR_MAX || !osa) return (EINVAL); #ifdef INET6 oldv6size = 0; /* * Check for old (pre-RFC2553) sockaddr_in6. We may accept it * if it's a v4-mapped address, so reserve the proper space * for it. */ if (salen == sizeof(struct sockaddr_in6) - sizeof(uint32_t)) { salen += sizeof(uint32_t); oldv6size = 1; } #endif kosa = malloc(salen, M_SONAME, M_WAITOK); if ((error = copyin(osa, kosa, salen))) goto out; bdom = linux_to_bsd_domain(kosa->sa_family); if (bdom == -1) { error = EAFNOSUPPORT; goto out; } #ifdef INET6 /* * Older Linux IPv6 code uses obsolete RFC2133 struct sockaddr_in6, * which lacks the scope id compared with RFC2553 one. If we detect * the situation, reject the address and write a message to system log. * * Still accept addresses for which the scope id is not used. */ if (oldv6size) { if (bdom == AF_INET6) { sin6 = (struct sockaddr_in6 *)kosa; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) || (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_V4COMPAT(&sin6->sin6_addr) && !IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) && !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))) { sin6->sin6_scope_id = 0; } else { log(LOG_DEBUG, "obsolete pre-RFC2553 sockaddr_in6 rejected\n"); error = EINVAL; goto out; } } else salen -= sizeof(uint32_t); } #endif if (bdom == AF_INET) { if (salen < sizeof(struct sockaddr_in)) { error = EINVAL; goto out; } salen = sizeof(struct sockaddr_in); } if (bdom == AF_LOCAL && salen > sizeof(struct sockaddr_un)) { hdrlen = offsetof(struct sockaddr_un, sun_path); name = ((struct sockaddr_un *)kosa)->sun_path; if (*name == '\0') { /* * Linux abstract namespace starts with a NULL byte. * XXX We do not support abstract namespace yet. */ namelen = strnlen(name + 1, salen - hdrlen - 1) + 1; } else namelen = strnlen(name, salen - hdrlen); salen = hdrlen + namelen; if (salen > sizeof(struct sockaddr_un)) { error = ENAMETOOLONG; goto out; } } sa = (struct sockaddr *)kosa; sa->sa_family = bdom; sa->sa_len = salen; *sap = sa; return (0); out: free(kosa, M_SONAME); return (error); } static int linux_to_bsd_domain(int domain) { switch (domain) { case LINUX_AF_UNSPEC: return (AF_UNSPEC); case LINUX_AF_UNIX: return (AF_LOCAL); case LINUX_AF_INET: return (AF_INET); case LINUX_AF_INET6: return (AF_INET6); case LINUX_AF_AX25: return (AF_CCITT); case LINUX_AF_IPX: return (AF_IPX); case LINUX_AF_APPLETALK: return (AF_APPLETALK); } return (-1); } static int bsd_to_linux_domain(int domain) { switch (domain) { case AF_UNSPEC: return (LINUX_AF_UNSPEC); case AF_LOCAL: return (LINUX_AF_UNIX); case AF_INET: return (LINUX_AF_INET); case AF_INET6: return (LINUX_AF_INET6); case AF_CCITT: return (LINUX_AF_AX25); case AF_IPX: return (LINUX_AF_IPX); case AF_APPLETALK: return (LINUX_AF_APPLETALK); } return (-1); } static int linux_to_bsd_sockopt_level(int level) { switch (level) { case LINUX_SOL_SOCKET: return (SOL_SOCKET); } return (level); } static int bsd_to_linux_sockopt_level(int level) { switch (level) { case SOL_SOCKET: return (LINUX_SOL_SOCKET); } return (level); } static int linux_to_bsd_ip_sockopt(int opt) { switch (opt) { case LINUX_IP_TOS: return (IP_TOS); case LINUX_IP_TTL: return (IP_TTL); case LINUX_IP_OPTIONS: return (IP_OPTIONS); case LINUX_IP_MULTICAST_IF: return (IP_MULTICAST_IF); case LINUX_IP_MULTICAST_TTL: return (IP_MULTICAST_TTL); case LINUX_IP_MULTICAST_LOOP: return (IP_MULTICAST_LOOP); case LINUX_IP_ADD_MEMBERSHIP: return (IP_ADD_MEMBERSHIP); case LINUX_IP_DROP_MEMBERSHIP: return (IP_DROP_MEMBERSHIP); case LINUX_IP_HDRINCL: return (IP_HDRINCL); } return (-1); } static int linux_to_bsd_ip6_sockopt(int opt) { switch (opt) { case LINUX_IPV6_NEXTHOP: return (IPV6_NEXTHOP); case LINUX_IPV6_UNICAST_HOPS: return (IPV6_UNICAST_HOPS); case LINUX_IPV6_MULTICAST_IF: return (IPV6_MULTICAST_IF); case LINUX_IPV6_MULTICAST_HOPS: return (IPV6_MULTICAST_HOPS); case LINUX_IPV6_MULTICAST_LOOP: return (IPV6_MULTICAST_LOOP); case LINUX_IPV6_ADD_MEMBERSHIP: return (IPV6_JOIN_GROUP); case LINUX_IPV6_DROP_MEMBERSHIP: return (IPV6_LEAVE_GROUP); case LINUX_IPV6_V6ONLY: return (IPV6_V6ONLY); case LINUX_IPV6_DONTFRAG: return (IPV6_DONTFRAG); #if 0 case LINUX_IPV6_CHECKSUM: return (IPV6_CHECKSUM); case LINUX_IPV6_RECVPKTINFO: return (IPV6_RECVPKTINFO); case LINUX_IPV6_PKTINFO: return (IPV6_PKTINFO); case LINUX_IPV6_RECVHOPLIMIT: return (IPV6_RECVHOPLIMIT); case LINUX_IPV6_HOPLIMIT: return (IPV6_HOPLIMIT); case LINUX_IPV6_RECVHOPOPTS: return (IPV6_RECVHOPOPTS); case LINUX_IPV6_HOPOPTS: return (IPV6_HOPOPTS); case LINUX_IPV6_RTHDRDSTOPTS: return (IPV6_RTHDRDSTOPTS); case LINUX_IPV6_RECVRTHDR: return (IPV6_RECVRTHDR); case LINUX_IPV6_RTHDR: return (IPV6_RTHDR); case LINUX_IPV6_RECVDSTOPTS: return (IPV6_RECVDSTOPTS); case LINUX_IPV6_DSTOPTS: return (IPV6_DSTOPTS); case LINUX_IPV6_RECVPATHMTU: return (IPV6_RECVPATHMTU); case LINUX_IPV6_PATHMTU: return (IPV6_PATHMTU); #endif } return (-1); } static int linux_to_bsd_so_sockopt(int opt) { switch (opt) { case LINUX_SO_DEBUG: return (SO_DEBUG); case LINUX_SO_REUSEADDR: return (SO_REUSEADDR); case LINUX_SO_TYPE: return (SO_TYPE); case LINUX_SO_ERROR: return (SO_ERROR); case LINUX_SO_DONTROUTE: return (SO_DONTROUTE); case LINUX_SO_BROADCAST: return (SO_BROADCAST); case LINUX_SO_SNDBUF: return (SO_SNDBUF); case LINUX_SO_RCVBUF: return (SO_RCVBUF); case LINUX_SO_KEEPALIVE: return (SO_KEEPALIVE); case LINUX_SO_OOBINLINE: return (SO_OOBINLINE); case LINUX_SO_LINGER: return (SO_LINGER); case LINUX_SO_PEERCRED: return (LOCAL_PEERCRED); case LINUX_SO_RCVLOWAT: return (SO_RCVLOWAT); case LINUX_SO_SNDLOWAT: return (SO_SNDLOWAT); case LINUX_SO_RCVTIMEO: return (SO_RCVTIMEO); case LINUX_SO_SNDTIMEO: return (SO_SNDTIMEO); case LINUX_SO_TIMESTAMP: return (SO_TIMESTAMP); case LINUX_SO_ACCEPTCONN: return (SO_ACCEPTCONN); } return (-1); } static int linux_to_bsd_tcp_sockopt(int opt) { switch (opt) { case LINUX_TCP_NODELAY: return (TCP_NODELAY); case LINUX_TCP_MAXSEG: return (TCP_MAXSEG); case LINUX_TCP_KEEPIDLE: return (TCP_KEEPIDLE); case LINUX_TCP_KEEPINTVL: return (TCP_KEEPINTVL); case LINUX_TCP_KEEPCNT: return (TCP_KEEPCNT); case LINUX_TCP_MD5SIG: return (TCP_MD5SIG); } return (-1); } static int linux_to_bsd_msg_flags(int flags) { int ret_flags = 0; if (flags & LINUX_MSG_OOB) ret_flags |= MSG_OOB; if (flags & LINUX_MSG_PEEK) ret_flags |= MSG_PEEK; if (flags & LINUX_MSG_DONTROUTE) ret_flags |= MSG_DONTROUTE; if (flags & LINUX_MSG_CTRUNC) ret_flags |= MSG_CTRUNC; if (flags & LINUX_MSG_TRUNC) ret_flags |= MSG_TRUNC; if (flags & LINUX_MSG_DONTWAIT) ret_flags |= MSG_DONTWAIT; if (flags & LINUX_MSG_EOR) ret_flags |= MSG_EOR; if (flags & LINUX_MSG_WAITALL) ret_flags |= MSG_WAITALL; if (flags & LINUX_MSG_NOSIGNAL) ret_flags |= MSG_NOSIGNAL; #if 0 /* not handled */ if (flags & LINUX_MSG_PROXY) ; if (flags & LINUX_MSG_FIN) ; if (flags & LINUX_MSG_SYN) ; if (flags & LINUX_MSG_CONFIRM) ; if (flags & LINUX_MSG_RST) ; if (flags & LINUX_MSG_ERRQUEUE) ; #endif return (ret_flags); } /* * If bsd_to_linux_sockaddr() or linux_to_bsd_sockaddr() faults, then the * native syscall will fault. Thus, we don't really need to check the * return values for these functions. */ static int bsd_to_linux_sockaddr(struct sockaddr *arg) { struct sockaddr sa; size_t sa_len = sizeof(struct sockaddr); int error, bdom; if ((error = copyin(arg, &sa, sa_len))) return (error); bdom = bsd_to_linux_domain(sa.sa_family); if (bdom == -1) return (EAFNOSUPPORT); *(u_short *)&sa = bdom; return (copyout(&sa, arg, sa_len)); } static int linux_to_bsd_sockaddr(struct sockaddr *arg, int len) { struct sockaddr sa; size_t sa_len = sizeof(struct sockaddr); int error, bdom; if ((error = copyin(arg, &sa, sa_len))) return (error); bdom = linux_to_bsd_domain(*(sa_family_t *)&sa); if (bdom == -1) return (EAFNOSUPPORT); sa.sa_family = bdom; sa.sa_len = len; return (copyout(&sa, arg, sa_len)); } static int linux_sa_put(struct osockaddr *osa) { struct osockaddr sa; int error, bdom; /* * Only read/write the osockaddr family part, the rest is * not changed. */ error = copyin(osa, &sa, sizeof(sa.sa_family)); if (error != 0) return (error); bdom = bsd_to_linux_domain(sa.sa_family); if (bdom == -1) return (EINVAL); sa.sa_family = bdom; return (copyout(&sa, osa, sizeof(sa.sa_family))); } static int linux_to_bsd_cmsg_type(int cmsg_type) { switch (cmsg_type) { case LINUX_SCM_RIGHTS: return (SCM_RIGHTS); case LINUX_SCM_CREDENTIALS: return (SCM_CREDS); } return (-1); } static int bsd_to_linux_cmsg_type(int cmsg_type) { switch (cmsg_type) { case SCM_RIGHTS: return (LINUX_SCM_RIGHTS); case SCM_CREDS: return (LINUX_SCM_CREDENTIALS); case SCM_TIMESTAMP: return (LINUX_SCM_TIMESTAMP); } return (-1); } static int linux_to_bsd_msghdr(struct msghdr *bhdr, const struct l_msghdr *lhdr) { if (lhdr->msg_controllen > INT_MAX) return (ENOBUFS); bhdr->msg_name = PTRIN(lhdr->msg_name); bhdr->msg_namelen = lhdr->msg_namelen; bhdr->msg_iov = PTRIN(lhdr->msg_iov); bhdr->msg_iovlen = lhdr->msg_iovlen; bhdr->msg_control = PTRIN(lhdr->msg_control); /* * msg_controllen is skipped since BSD and LINUX control messages * are potentially different sizes (e.g. the cred structure used * by SCM_CREDS is different between the two operating system). * * The caller can set it (if necessary) after converting all the * control messages. */ bhdr->msg_flags = linux_to_bsd_msg_flags(lhdr->msg_flags); return (0); } static int bsd_to_linux_msghdr(const struct msghdr *bhdr, struct l_msghdr *lhdr) { lhdr->msg_name = PTROUT(bhdr->msg_name); lhdr->msg_namelen = bhdr->msg_namelen; lhdr->msg_iov = PTROUT(bhdr->msg_iov); lhdr->msg_iovlen = bhdr->msg_iovlen; lhdr->msg_control = PTROUT(bhdr->msg_control); /* * msg_controllen is skipped since BSD and LINUX control messages * are potentially different sizes (e.g. the cred structure used * by SCM_CREDS is different between the two operating system). * * The caller can set it (if necessary) after converting all the * control messages. */ /* msg_flags skipped */ return (0); } static int linux_set_socket_flags(int lflags, int *flags) { if (lflags & ~(LINUX_SOCK_CLOEXEC | LINUX_SOCK_NONBLOCK)) return (EINVAL); if (lflags & LINUX_SOCK_NONBLOCK) *flags |= SOCK_NONBLOCK; if (lflags & LINUX_SOCK_CLOEXEC) *flags |= SOCK_CLOEXEC; return (0); } static int linux_sendit(struct thread *td, int s, struct msghdr *mp, int flags, struct mbuf *control, enum uio_seg segflg) { struct sockaddr *to; int error; if (mp->msg_name != NULL) { error = linux_getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) return (error); mp->msg_name = to; } else to = NULL; error = kern_sendit(td, s, mp, linux_to_bsd_msg_flags(flags), control, segflg); if (to) free(to, M_SONAME); return (error); } /* Return 0 if IP_HDRINCL is set for the given socket. */ static int linux_check_hdrincl(struct thread *td, int s) { int error, optval; socklen_t size_val; size_val = sizeof(optval); error = kern_getsockopt(td, s, IPPROTO_IP, IP_HDRINCL, &optval, UIO_SYSSPACE, &size_val); if (error != 0) return (error); return (optval == 0); } /* * Updated sendto() when IP_HDRINCL is set: * tweak endian-dependent fields in the IP packet. */ static int linux_sendto_hdrincl(struct thread *td, struct linux_sendto_args *linux_args) { /* * linux_ip_copysize defines how many bytes we should copy * from the beginning of the IP packet before we customize it for BSD. * It should include all the fields we modify (ip_len and ip_off). */ #define linux_ip_copysize 8 struct ip *packet; struct msghdr msg; struct iovec aiov[1]; int error; /* Check that the packet isn't too big or too small. */ if (linux_args->len < linux_ip_copysize || linux_args->len > IP_MAXPACKET) return (EINVAL); packet = (struct ip *)malloc(linux_args->len, M_LINUX, M_WAITOK); /* Make kernel copy of the packet to be sent */ if ((error = copyin(PTRIN(linux_args->msg), packet, linux_args->len))) goto goout; /* Convert fields from Linux to BSD raw IP socket format */ packet->ip_len = linux_args->len; packet->ip_off = ntohs(packet->ip_off); /* Prepare the msghdr and iovec structures describing the new packet */ msg.msg_name = PTRIN(linux_args->to); msg.msg_namelen = linux_args->tolen; msg.msg_iov = aiov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_flags = 0; aiov[0].iov_base = (char *)packet; aiov[0].iov_len = linux_args->len; error = linux_sendit(td, linux_args->s, &msg, linux_args->flags, NULL, UIO_SYSSPACE); goout: free(packet, M_LINUX); return (error); } int linux_socket(struct thread *td, struct linux_socket_args *args) { int domain, retval_socket, type; type = args->type & LINUX_SOCK_TYPE_MASK; if (type < 0 || type > LINUX_SOCK_MAX) return (EINVAL); retval_socket = linux_set_socket_flags(args->type & ~LINUX_SOCK_TYPE_MASK, &type); if (retval_socket != 0) return (retval_socket); domain = linux_to_bsd_domain(args->domain); if (domain == -1) return (EAFNOSUPPORT); retval_socket = kern_socket(td, domain, type, args->protocol); if (retval_socket) return (retval_socket); if (type == SOCK_RAW && (args->protocol == IPPROTO_RAW || args->protocol == 0) && domain == PF_INET) { /* It's a raw IP socket: set the IP_HDRINCL option. */ int hdrincl; hdrincl = 1; /* We ignore any error returned by kern_setsockopt() */ kern_setsockopt(td, td->td_retval[0], IPPROTO_IP, IP_HDRINCL, &hdrincl, UIO_SYSSPACE, sizeof(hdrincl)); } #ifdef INET6 /* * Linux AF_INET6 socket has IPV6_V6ONLY setsockopt set to 0 by default * and some apps depend on this. So, set V6ONLY to 0 for Linux apps. * For simplicity we do this unconditionally of the net.inet6.ip6.v6only * sysctl value. */ if (domain == PF_INET6) { int v6only; v6only = 0; /* We ignore any error returned by setsockopt() */ kern_setsockopt(td, td->td_retval[0], IPPROTO_IPV6, IPV6_V6ONLY, &v6only, UIO_SYSSPACE, sizeof(v6only)); } #endif return (retval_socket); } int linux_bind(struct thread *td, struct linux_bind_args *args) { struct sockaddr *sa; int error; error = linux_getsockaddr(&sa, PTRIN(args->name), args->namelen); if (error != 0) return (error); error = kern_bindat(td, AT_FDCWD, args->s, sa); free(sa, M_SONAME); if (error == EADDRNOTAVAIL && args->namelen != sizeof(struct sockaddr_in)) return (EINVAL); return (error); } int linux_connect(struct thread *td, struct linux_connect_args *args) { - cap_rights_t rights; struct socket *so; struct sockaddr *sa; struct file *fp; u_int fflag; int error; error = linux_getsockaddr(&sa, (struct osockaddr *)PTRIN(args->name), args->namelen); if (error != 0) return (error); error = kern_connectat(td, AT_FDCWD, args->s, sa); free(sa, M_SONAME); if (error != EISCONN) return (error); /* * Linux doesn't return EISCONN the first time it occurs, * when on a non-blocking socket. Instead it returns the * error getsockopt(SOL_SOCKET, SO_ERROR) would return on BSD. */ - error = getsock_cap(td, args->s, cap_rights_init(&rights, CAP_CONNECT), + error = getsock_cap(td, args->s, &cap_connect_rights, &fp, &fflag, NULL); if (error != 0) return (error); error = EISCONN; so = fp->f_data; if (fflag & FNONBLOCK) { SOCK_LOCK(so); if (so->so_emuldata == 0) error = so->so_error; so->so_emuldata = (void *)1; SOCK_UNLOCK(so); } fdrop(fp, td); return (error); } int linux_listen(struct thread *td, struct linux_listen_args *args) { return (kern_listen(td, args->s, args->backlog)); } static int linux_accept_common(struct thread *td, int s, l_uintptr_t addr, l_uintptr_t namelen, int flags) { struct accept4_args /* { int s; struct sockaddr * __restrict name; socklen_t * __restrict anamelen; int flags; } */ bsd_args; - cap_rights_t rights; struct socket *so; struct file *fp; int error, error1; bsd_args.s = s; bsd_args.name = (struct sockaddr * __restrict)PTRIN(addr); bsd_args.anamelen = PTRIN(namelen); bsd_args.flags = 0; error = linux_set_socket_flags(flags, &bsd_args.flags); if (error != 0) return (error); error = sys_accept4(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.name); if (error != 0) { if (error == EFAULT && namelen != sizeof(struct sockaddr_in)) return (EINVAL); if (error == EINVAL) { - error1 = getsock_cap(td, s, - cap_rights_init(&rights, CAP_ACCEPT), &fp, NULL, NULL); + error1 = getsock_cap(td, s, &cap_accept_rights, &fp, NULL, NULL); if (error1 != 0) return (error1); so = fp->f_data; if (so->so_type == SOCK_DGRAM) { fdrop(fp, td); return (EOPNOTSUPP); } fdrop(fp, td); } return (error); } if (addr) error = linux_sa_put(PTRIN(addr)); if (error != 0) { (void)kern_close(td, td->td_retval[0]); td->td_retval[0] = 0; } return (error); } int linux_accept(struct thread *td, struct linux_accept_args *args) { return (linux_accept_common(td, args->s, args->addr, args->namelen, 0)); } int linux_accept4(struct thread *td, struct linux_accept4_args *args) { return (linux_accept_common(td, args->s, args->addr, args->namelen, args->flags)); } int linux_getsockname(struct thread *td, struct linux_getsockname_args *args) { struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ bsd_args; int error; bsd_args.fdes = args->s; bsd_args.asa = (struct sockaddr * __restrict)PTRIN(args->addr); bsd_args.alen = PTRIN(args->namelen); error = sys_getsockname(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa); if (error != 0) return (error); return (linux_sa_put(PTRIN(args->addr))); } int linux_getpeername(struct thread *td, struct linux_getpeername_args *args) { struct getpeername_args /* { int fdes; caddr_t asa; int *alen; } */ bsd_args; int error; bsd_args.fdes = args->s; bsd_args.asa = (struct sockaddr *)PTRIN(args->addr); bsd_args.alen = (socklen_t *)PTRIN(args->namelen); error = sys_getpeername(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa); if (error != 0) return (error); return (linux_sa_put(PTRIN(args->addr))); } int linux_socketpair(struct thread *td, struct linux_socketpair_args *args) { struct socketpair_args /* { int domain; int type; int protocol; int *rsv; } */ bsd_args; int error; bsd_args.domain = linux_to_bsd_domain(args->domain); if (bsd_args.domain != PF_LOCAL) return (EAFNOSUPPORT); bsd_args.type = args->type & LINUX_SOCK_TYPE_MASK; if (bsd_args.type < 0 || bsd_args.type > LINUX_SOCK_MAX) return (EINVAL); error = linux_set_socket_flags(args->type & ~LINUX_SOCK_TYPE_MASK, &bsd_args.type); if (error != 0) return (error); if (args->protocol != 0 && args->protocol != PF_UNIX) /* * Use of PF_UNIX as protocol argument is not right, * but Linux does it. * Do not map PF_UNIX as its Linux value is identical * to FreeBSD one. */ return (EPROTONOSUPPORT); else bsd_args.protocol = 0; bsd_args.rsv = (int *)PTRIN(args->rsv); return (sys_socketpair(td, &bsd_args)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) struct linux_send_args { register_t s; register_t msg; register_t len; register_t flags; }; static int linux_send(struct thread *td, struct linux_send_args *args) { struct sendto_args /* { int s; caddr_t buf; int len; int flags; caddr_t to; int tolen; } */ bsd_args; bsd_args.s = args->s; bsd_args.buf = (caddr_t)PTRIN(args->msg); bsd_args.len = args->len; bsd_args.flags = args->flags; bsd_args.to = NULL; bsd_args.tolen = 0; return (sys_sendto(td, &bsd_args)); } struct linux_recv_args { register_t s; register_t msg; register_t len; register_t flags; }; static int linux_recv(struct thread *td, struct linux_recv_args *args) { struct recvfrom_args /* { int s; caddr_t buf; int len; int flags; struct sockaddr *from; socklen_t fromlenaddr; } */ bsd_args; bsd_args.s = args->s; bsd_args.buf = (caddr_t)PTRIN(args->msg); bsd_args.len = args->len; bsd_args.flags = linux_to_bsd_msg_flags(args->flags); bsd_args.from = NULL; bsd_args.fromlenaddr = 0; return (sys_recvfrom(td, &bsd_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_sendto(struct thread *td, struct linux_sendto_args *args) { struct msghdr msg; struct iovec aiov; if (linux_check_hdrincl(td, args->s) == 0) /* IP_HDRINCL set, tweak the packet before sending */ return (linux_sendto_hdrincl(td, args)); msg.msg_name = PTRIN(args->to); msg.msg_namelen = args->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_flags = 0; aiov.iov_base = PTRIN(args->msg); aiov.iov_len = args->len; return (linux_sendit(td, args->s, &msg, args->flags, NULL, UIO_USERSPACE)); } int linux_recvfrom(struct thread *td, struct linux_recvfrom_args *args) { struct msghdr msg; struct iovec aiov; int error, fromlen; if (PTRIN(args->fromlen) != NULL) { error = copyin(PTRIN(args->fromlen), &fromlen, sizeof(fromlen)); if (error != 0) return (error); if (fromlen < 0) return (EINVAL); msg.msg_namelen = fromlen; } else msg.msg_namelen = 0; msg.msg_name = (struct sockaddr * __restrict)PTRIN(args->from); msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = PTRIN(args->buf); aiov.iov_len = args->len; msg.msg_control = 0; msg.msg_flags = linux_to_bsd_msg_flags(args->flags); error = kern_recvit(td, args->s, &msg, UIO_USERSPACE, NULL); if (error != 0) return (error); if (PTRIN(args->from) != NULL) { error = bsd_to_linux_sockaddr((struct sockaddr *) PTRIN(args->from)); if (error != 0) return (error); error = linux_sa_put((struct osockaddr *) PTRIN(args->from)); } if (PTRIN(args->fromlen) != NULL) error = copyout(&msg.msg_namelen, PTRIN(args->fromlen), sizeof(msg.msg_namelen)); return (error); } static int linux_sendmsg_common(struct thread *td, l_int s, struct l_msghdr *msghdr, l_uint flags) { struct cmsghdr *cmsg; struct cmsgcred cmcred; struct mbuf *control; struct msghdr msg; struct l_cmsghdr linux_cmsg; struct l_cmsghdr *ptr_cmsg; struct l_msghdr linux_msg; struct iovec *iov; socklen_t datalen; struct sockaddr *sa; sa_family_t sa_family; void *data; int error; error = copyin(msghdr, &linux_msg, sizeof(linux_msg)); if (error != 0) return (error); /* * Some Linux applications (ping) define a non-NULL control data * pointer, but a msg_controllen of 0, which is not allowed in the * FreeBSD system call interface. NULL the msg_control pointer in * order to handle this case. This should be checked, but allows the * Linux ping to work. */ if (PTRIN(linux_msg.msg_control) != NULL && linux_msg.msg_controllen == 0) linux_msg.msg_control = PTROUT(NULL); error = linux_to_bsd_msghdr(&msg, &linux_msg); if (error != 0) return (error); #ifdef COMPAT_LINUX32 error = linux32_copyiniov(PTRIN(msg.msg_iov), msg.msg_iovlen, &iov, EMSGSIZE); #else error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); #endif if (error != 0) return (error); control = NULL; cmsg = NULL; if ((ptr_cmsg = LINUX_CMSG_FIRSTHDR(&linux_msg)) != NULL) { error = kern_getsockname(td, s, &sa, &datalen); if (error != 0) goto bad; sa_family = sa->sa_family; free(sa, M_SONAME); error = ENOBUFS; cmsg = malloc(CMSG_HDRSZ, M_LINUX, M_WAITOK|M_ZERO); control = m_get(M_WAITOK, MT_CONTROL); do { error = copyin(ptr_cmsg, &linux_cmsg, sizeof(struct l_cmsghdr)); if (error != 0) goto bad; error = EINVAL; if (linux_cmsg.cmsg_len < sizeof(struct l_cmsghdr)) goto bad; /* * Now we support only SCM_RIGHTS and SCM_CRED, * so return EINVAL in any other cmsg_type */ cmsg->cmsg_type = linux_to_bsd_cmsg_type(linux_cmsg.cmsg_type); cmsg->cmsg_level = linux_to_bsd_sockopt_level(linux_cmsg.cmsg_level); if (cmsg->cmsg_type == -1 || cmsg->cmsg_level != SOL_SOCKET) goto bad; /* * Some applications (e.g. pulseaudio) attempt to * send ancillary data even if the underlying protocol * doesn't support it which is not allowed in the * FreeBSD system call interface. */ if (sa_family != AF_UNIX) continue; data = LINUX_CMSG_DATA(ptr_cmsg); datalen = linux_cmsg.cmsg_len - L_CMSG_HDRSZ; switch (cmsg->cmsg_type) { case SCM_RIGHTS: break; case SCM_CREDS: data = &cmcred; datalen = sizeof(cmcred); /* * The lower levels will fill in the structure */ bzero(data, datalen); break; } cmsg->cmsg_len = CMSG_LEN(datalen); error = ENOBUFS; if (!m_append(control, CMSG_HDRSZ, (c_caddr_t)cmsg)) goto bad; if (!m_append(control, datalen, (c_caddr_t)data)) goto bad; } while ((ptr_cmsg = LINUX_CMSG_NXTHDR(&linux_msg, ptr_cmsg))); if (m_length(control, NULL) == 0) { m_freem(control); control = NULL; } } msg.msg_iov = iov; msg.msg_flags = 0; error = linux_sendit(td, s, &msg, flags, control, UIO_USERSPACE); control = NULL; bad: m_freem(control); free(iov, M_IOV); if (cmsg) free(cmsg, M_LINUX); return (error); } int linux_sendmsg(struct thread *td, struct linux_sendmsg_args *args) { return (linux_sendmsg_common(td, args->s, PTRIN(args->msg), args->flags)); } int linux_sendmmsg(struct thread *td, struct linux_sendmmsg_args *args) { struct l_mmsghdr *msg; l_uint retval; int error, datagrams; if (args->vlen > UIO_MAXIOV) args->vlen = UIO_MAXIOV; msg = PTRIN(args->msg); datagrams = 0; while (datagrams < args->vlen) { error = linux_sendmsg_common(td, args->s, &msg->msg_hdr, args->flags); if (error != 0) break; retval = td->td_retval[0]; error = copyout(&retval, &msg->msg_len, sizeof(msg->msg_len)); if (error != 0) break; ++msg; ++datagrams; } if (error == 0) td->td_retval[0] = datagrams; return (error); } static int linux_recvmsg_common(struct thread *td, l_int s, struct l_msghdr *msghdr, l_uint flags, struct msghdr *msg) { struct cmsghdr *cm; struct cmsgcred *cmcred; struct l_cmsghdr *linux_cmsg = NULL; struct l_ucred linux_ucred; socklen_t datalen, outlen; struct l_msghdr linux_msg; struct iovec *iov, *uiov; struct mbuf *control = NULL; struct mbuf **controlp; struct timeval *ftmvl; l_timeval ltmvl; caddr_t outbuf; void *data; int error, i, fd, fds, *fdp; error = copyin(msghdr, &linux_msg, sizeof(linux_msg)); if (error != 0) return (error); error = linux_to_bsd_msghdr(msg, &linux_msg); if (error != 0) return (error); #ifdef COMPAT_LINUX32 error = linux32_copyiniov(PTRIN(msg->msg_iov), msg->msg_iovlen, &iov, EMSGSIZE); #else error = copyiniov(msg->msg_iov, msg->msg_iovlen, &iov, EMSGSIZE); #endif if (error != 0) return (error); if (msg->msg_name) { error = linux_to_bsd_sockaddr((struct sockaddr *)msg->msg_name, msg->msg_namelen); if (error != 0) goto bad; } uiov = msg->msg_iov; msg->msg_iov = iov; controlp = (msg->msg_control != NULL) ? &control : NULL; error = kern_recvit(td, s, msg, UIO_USERSPACE, controlp); msg->msg_iov = uiov; if (error != 0) goto bad; error = bsd_to_linux_msghdr(msg, &linux_msg); if (error != 0) goto bad; if (linux_msg.msg_name) { error = bsd_to_linux_sockaddr((struct sockaddr *) PTRIN(linux_msg.msg_name)); if (error != 0) goto bad; } if (linux_msg.msg_name && linux_msg.msg_namelen > 2) { error = linux_sa_put(PTRIN(linux_msg.msg_name)); if (error != 0) goto bad; } outbuf = PTRIN(linux_msg.msg_control); outlen = 0; if (control) { linux_cmsg = malloc(L_CMSG_HDRSZ, M_LINUX, M_WAITOK | M_ZERO); msg->msg_control = mtod(control, struct cmsghdr *); msg->msg_controllen = control->m_len; cm = CMSG_FIRSTHDR(msg); while (cm != NULL) { linux_cmsg->cmsg_type = bsd_to_linux_cmsg_type(cm->cmsg_type); linux_cmsg->cmsg_level = bsd_to_linux_sockopt_level(cm->cmsg_level); if (linux_cmsg->cmsg_type == -1 || cm->cmsg_level != SOL_SOCKET) { error = EINVAL; goto bad; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; switch (cm->cmsg_type) { case SCM_RIGHTS: if (flags & LINUX_MSG_CMSG_CLOEXEC) { fds = datalen / sizeof(int); fdp = data; for (i = 0; i < fds; i++) { fd = *fdp++; (void)kern_fcntl(td, fd, F_SETFD, FD_CLOEXEC); } } break; case SCM_CREDS: /* * Currently LOCAL_CREDS is never in * effect for Linux so no need to worry * about sockcred */ if (datalen != sizeof(*cmcred)) { error = EMSGSIZE; goto bad; } cmcred = (struct cmsgcred *)data; bzero(&linux_ucred, sizeof(linux_ucred)); linux_ucred.pid = cmcred->cmcred_pid; linux_ucred.uid = cmcred->cmcred_uid; linux_ucred.gid = cmcred->cmcred_gid; data = &linux_ucred; datalen = sizeof(linux_ucred); break; case SCM_TIMESTAMP: if (datalen != sizeof(struct timeval)) { error = EMSGSIZE; goto bad; } ftmvl = (struct timeval *)data; ltmvl.tv_sec = ftmvl->tv_sec; ltmvl.tv_usec = ftmvl->tv_usec; data = <mvl; datalen = sizeof(ltmvl); break; } if (outlen + LINUX_CMSG_LEN(datalen) > linux_msg.msg_controllen) { if (outlen == 0) { error = EMSGSIZE; goto bad; } else { linux_msg.msg_flags |= LINUX_MSG_CTRUNC; goto out; } } linux_cmsg->cmsg_len = LINUX_CMSG_LEN(datalen); error = copyout(linux_cmsg, outbuf, L_CMSG_HDRSZ); if (error != 0) goto bad; outbuf += L_CMSG_HDRSZ; error = copyout(data, outbuf, datalen); if (error != 0) goto bad; outbuf += LINUX_CMSG_ALIGN(datalen); outlen += LINUX_CMSG_LEN(datalen); cm = CMSG_NXTHDR(msg, cm); } } out: linux_msg.msg_controllen = outlen; error = copyout(&linux_msg, msghdr, sizeof(linux_msg)); bad: free(iov, M_IOV); m_freem(control); free(linux_cmsg, M_LINUX); return (error); } int linux_recvmsg(struct thread *td, struct linux_recvmsg_args *args) { struct msghdr bsd_msg; return (linux_recvmsg_common(td, args->s, PTRIN(args->msg), args->flags, &bsd_msg)); } int linux_recvmmsg(struct thread *td, struct linux_recvmmsg_args *args) { struct l_mmsghdr *msg; struct msghdr bsd_msg; struct l_timespec lts; struct timespec ts, tts; l_uint retval; int error, datagrams; if (args->timeout) { error = copyin(args->timeout, <s, sizeof(struct l_timespec)); if (error != 0) return (error); error = linux_to_native_timespec(&ts, <s); if (error != 0) return (error); getnanotime(&tts); timespecadd(&tts, &ts); } msg = PTRIN(args->msg); datagrams = 0; while (datagrams < args->vlen) { error = linux_recvmsg_common(td, args->s, &msg->msg_hdr, args->flags & ~LINUX_MSG_WAITFORONE, &bsd_msg); if (error != 0) break; retval = td->td_retval[0]; error = copyout(&retval, &msg->msg_len, sizeof(msg->msg_len)); if (error != 0) break; ++msg; ++datagrams; /* * MSG_WAITFORONE turns on MSG_DONTWAIT after one packet. */ if (args->flags & LINUX_MSG_WAITFORONE) args->flags |= LINUX_MSG_DONTWAIT; /* * See BUGS section of recvmmsg(2). */ if (args->timeout) { getnanotime(&ts); timespecsub(&ts, &tts); if (!timespecisset(&ts) || ts.tv_sec > 0) break; } /* Out of band data, return right away. */ if (bsd_msg.msg_flags & MSG_OOB) break; } if (error == 0) td->td_retval[0] = datagrams; return (error); } int linux_shutdown(struct thread *td, struct linux_shutdown_args *args) { return (kern_shutdown(td, args->s, args->how)); } int linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args) { struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ bsd_args; l_timeval linux_tv; struct timeval tv; int error, name; bsd_args.s = args->s; bsd_args.level = linux_to_bsd_sockopt_level(args->level); switch (bsd_args.level) { case SOL_SOCKET: name = linux_to_bsd_so_sockopt(args->optname); switch (name) { case SO_RCVTIMEO: /* FALLTHROUGH */ case SO_SNDTIMEO: error = copyin(PTRIN(args->optval), &linux_tv, sizeof(linux_tv)); if (error != 0) return (error); tv.tv_sec = linux_tv.tv_sec; tv.tv_usec = linux_tv.tv_usec; return (kern_setsockopt(td, args->s, bsd_args.level, name, &tv, UIO_SYSSPACE, sizeof(tv))); /* NOTREACHED */ default: break; } break; case IPPROTO_IP: name = linux_to_bsd_ip_sockopt(args->optname); break; case IPPROTO_IPV6: name = linux_to_bsd_ip6_sockopt(args->optname); break; case IPPROTO_TCP: name = linux_to_bsd_tcp_sockopt(args->optname); break; default: name = -1; break; } if (name == -1) return (ENOPROTOOPT); bsd_args.name = name; bsd_args.val = PTRIN(args->optval); bsd_args.valsize = args->optlen; if (name == IPV6_NEXTHOP) { linux_to_bsd_sockaddr((struct sockaddr *)bsd_args.val, bsd_args.valsize); error = sys_setsockopt(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val); } else error = sys_setsockopt(td, &bsd_args); return (error); } int linux_getsockopt(struct thread *td, struct linux_getsockopt_args *args) { struct getsockopt_args /* { int s; int level; int name; caddr_t val; int *avalsize; } */ bsd_args; l_timeval linux_tv; struct timeval tv; socklen_t tv_len, xulen, len; struct xucred xu; struct l_ucred lxu; int error, name, newval; bsd_args.s = args->s; bsd_args.level = linux_to_bsd_sockopt_level(args->level); switch (bsd_args.level) { case SOL_SOCKET: name = linux_to_bsd_so_sockopt(args->optname); switch (name) { case SO_RCVTIMEO: /* FALLTHROUGH */ case SO_SNDTIMEO: tv_len = sizeof(tv); error = kern_getsockopt(td, args->s, bsd_args.level, name, &tv, UIO_SYSSPACE, &tv_len); if (error != 0) return (error); linux_tv.tv_sec = tv.tv_sec; linux_tv.tv_usec = tv.tv_usec; return (copyout(&linux_tv, PTRIN(args->optval), sizeof(linux_tv))); /* NOTREACHED */ case LOCAL_PEERCRED: if (args->optlen < sizeof(lxu)) return (EINVAL); xulen = sizeof(xu); error = kern_getsockopt(td, args->s, bsd_args.level, name, &xu, UIO_SYSSPACE, &xulen); if (error != 0) return (error); /* * XXX Use 0 for pid as the FreeBSD does not cache peer pid. */ lxu.pid = 0; lxu.uid = xu.cr_uid; lxu.gid = xu.cr_gid; return (copyout(&lxu, PTRIN(args->optval), sizeof(lxu))); /* NOTREACHED */ case SO_ERROR: len = sizeof(newval); error = kern_getsockopt(td, args->s, bsd_args.level, name, &newval, UIO_SYSSPACE, &len); if (error != 0) return (error); newval = -SV_ABI_ERRNO(td->td_proc, newval); return (copyout(&newval, PTRIN(args->optval), len)); /* NOTREACHED */ default: break; } break; case IPPROTO_IP: name = linux_to_bsd_ip_sockopt(args->optname); break; case IPPROTO_IPV6: name = linux_to_bsd_ip6_sockopt(args->optname); break; case IPPROTO_TCP: name = linux_to_bsd_tcp_sockopt(args->optname); break; default: name = -1; break; } if (name == -1) return (EINVAL); bsd_args.name = name; bsd_args.val = PTRIN(args->optval); bsd_args.avalsize = PTRIN(args->optlen); if (name == IPV6_NEXTHOP) { error = sys_getsockopt(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val); } else error = sys_getsockopt(td, &bsd_args); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) /* Argument list sizes for linux_socketcall */ static const unsigned char lxs_args_cnt[] = { 0 /* unused*/, 3 /* socket */, 3 /* bind */, 3 /* connect */, 2 /* listen */, 3 /* accept */, 3 /* getsockname */, 3 /* getpeername */, 4 /* socketpair */, 4 /* send */, 4 /* recv */, 6 /* sendto */, 6 /* recvfrom */, 2 /* shutdown */, 5 /* setsockopt */, 5 /* getsockopt */, 3 /* sendmsg */, 3 /* recvmsg */, 4 /* accept4 */, 5 /* recvmmsg */, 4 /* sendmmsg */ }; #define LINUX_ARGS_CNT (nitems(lxs_args_cnt) - 1) #define LINUX_ARG_SIZE(x) (lxs_args_cnt[x] * sizeof(l_ulong)) int linux_socketcall(struct thread *td, struct linux_socketcall_args *args) { l_ulong a[6]; #if defined(__amd64__) && defined(COMPAT_LINUX32) register_t l_args[6]; #endif void *arg; int error; if (args->what < LINUX_SOCKET || args->what > LINUX_ARGS_CNT) return (EINVAL); error = copyin(PTRIN(args->args), a, LINUX_ARG_SIZE(args->what)); if (error != 0) return (error); #if defined(__amd64__) && defined(COMPAT_LINUX32) for (int i = 0; i < lxs_args_cnt[args->what]; ++i) l_args[i] = a[i]; arg = l_args; #else arg = a; #endif switch (args->what) { case LINUX_SOCKET: return (linux_socket(td, arg)); case LINUX_BIND: return (linux_bind(td, arg)); case LINUX_CONNECT: return (linux_connect(td, arg)); case LINUX_LISTEN: return (linux_listen(td, arg)); case LINUX_ACCEPT: return (linux_accept(td, arg)); case LINUX_GETSOCKNAME: return (linux_getsockname(td, arg)); case LINUX_GETPEERNAME: return (linux_getpeername(td, arg)); case LINUX_SOCKETPAIR: return (linux_socketpair(td, arg)); case LINUX_SEND: return (linux_send(td, arg)); case LINUX_RECV: return (linux_recv(td, arg)); case LINUX_SENDTO: return (linux_sendto(td, arg)); case LINUX_RECVFROM: return (linux_recvfrom(td, arg)); case LINUX_SHUTDOWN: return (linux_shutdown(td, arg)); case LINUX_SETSOCKOPT: return (linux_setsockopt(td, arg)); case LINUX_GETSOCKOPT: return (linux_getsockopt(td, arg)); case LINUX_SENDMSG: return (linux_sendmsg(td, arg)); case LINUX_RECVMSG: return (linux_recvmsg(td, arg)); case LINUX_ACCEPT4: return (linux_accept4(td, arg)); case LINUX_RECVMMSG: return (linux_recvmmsg(td, arg)); case LINUX_SENDMMSG: return (linux_sendmmsg(td, arg)); } uprintf("LINUX: 'socket' typ=%d not implemented\n", args->what); return (ENOSYS); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ Index: head/sys/compat/linux/linux_stats.c =================================================================== --- head/sys/compat/linux/linux_stats.c (revision 333424) +++ head/sys/compat/linux/linux_stats.c (revision 333425) @@ -1,718 +1,716 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include static void translate_vnhook_major_minor(struct vnode *vp, struct stat *sb) { int major, minor; if (vp->v_type == VCHR && vp->v_rdev != NULL && linux_driver_get_major_minor(devtoname(vp->v_rdev), &major, &minor) == 0) { sb->st_rdev = (major << 8 | minor); } } static int linux_kern_statat(struct thread *td, int flag, int fd, char *path, enum uio_seg pathseg, struct stat *sbp) { return (kern_statat(td, flag, fd, path, pathseg, sbp, translate_vnhook_major_minor)); } static int linux_kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { return (linux_kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp)); } static int linux_kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp) { return (linux_kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg, sbp)); } static void translate_fd_major_minor(struct thread *td, int fd, struct stat *buf) { struct file *fp; struct vnode *vp; - cap_rights_t rights; int major, minor; /* * No capability rights required here. */ if ((!S_ISCHR(buf->st_mode) && !S_ISBLK(buf->st_mode)) || - fget(td, fd, cap_rights_init(&rights), &fp) != 0) + fget(td, fd, &cap_no_rights, &fp) != 0) return; vp = fp->f_vnode; if (vp != NULL && vp->v_rdev != NULL && linux_driver_get_major_minor(devtoname(vp->v_rdev), &major, &minor) == 0) { buf->st_rdev = (major << 8 | minor); } else if (fp->f_type == DTYPE_PTS) { struct tty *tp = fp->f_data; /* Convert the numbers for the slave device. */ if (linux_driver_get_major_minor(devtoname(tp->t_dev), &major, &minor) == 0) { buf->st_rdev = (major << 8 | minor); } } fdrop(fp, td); } static int newstat_copyout(struct stat *buf, void *ubuf) { struct l_newstat tbuf; bzero(&tbuf, sizeof(tbuf)); tbuf.st_dev = minor(buf->st_dev) | (major(buf->st_dev) << 8); tbuf.st_ino = buf->st_ino; tbuf.st_mode = buf->st_mode; tbuf.st_nlink = buf->st_nlink; tbuf.st_uid = buf->st_uid; tbuf.st_gid = buf->st_gid; tbuf.st_rdev = buf->st_rdev; tbuf.st_size = buf->st_size; tbuf.st_atim.tv_sec = buf->st_atim.tv_sec; tbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; tbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; tbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; tbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; tbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; tbuf.st_blksize = buf->st_blksize; tbuf.st_blocks = buf->st_blocks; return (copyout(&tbuf, ubuf, sizeof(tbuf))); } int linux_newstat(struct thread *td, struct linux_newstat_args *args) { struct stat buf; char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(newstat)) printf(ARGS(newstat, "%s, *"), path); #endif error = linux_kern_stat(td, path, UIO_SYSSPACE, &buf); LFREEPATH(path); if (error) return (error); return (newstat_copyout(&buf, args->buf)); } int linux_newlstat(struct thread *td, struct linux_newlstat_args *args) { struct stat sb; char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(newlstat)) printf(ARGS(newlstat, "%s, *"), path); #endif error = linux_kern_lstat(td, path, UIO_SYSSPACE, &sb); LFREEPATH(path); if (error) return (error); return (newstat_copyout(&sb, args->buf)); } int linux_newfstat(struct thread *td, struct linux_newfstat_args *args) { struct stat buf; int error; #ifdef DEBUG if (ldebug(newfstat)) printf(ARGS(newfstat, "%d, *"), args->fd); #endif error = kern_fstat(td, args->fd, &buf); translate_fd_major_minor(td, args->fd, &buf); if (!error) error = newstat_copyout(&buf, args->buf); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static int stat_copyout(struct stat *buf, void *ubuf) { struct l_stat lbuf; bzero(&lbuf, sizeof(lbuf)); lbuf.st_dev = buf->st_dev; lbuf.st_ino = buf->st_ino; lbuf.st_mode = buf->st_mode; lbuf.st_nlink = buf->st_nlink; lbuf.st_uid = buf->st_uid; lbuf.st_gid = buf->st_gid; lbuf.st_rdev = buf->st_rdev; if (buf->st_size < (quad_t)1 << 32) lbuf.st_size = buf->st_size; else lbuf.st_size = -2; lbuf.st_atim.tv_sec = buf->st_atim.tv_sec; lbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; lbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; lbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; lbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; lbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; lbuf.st_blksize = buf->st_blksize; lbuf.st_blocks = buf->st_blocks; lbuf.st_flags = buf->st_flags; lbuf.st_gen = buf->st_gen; return (copyout(&lbuf, ubuf, sizeof(lbuf))); } int linux_stat(struct thread *td, struct linux_stat_args *args) { struct stat buf; char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(stat)) printf(ARGS(stat, "%s, *"), path); #endif error = linux_kern_stat(td, path, UIO_SYSSPACE, &buf); if (error) { LFREEPATH(path); return (error); } LFREEPATH(path); return (stat_copyout(&buf, args->up)); } int linux_lstat(struct thread *td, struct linux_lstat_args *args) { struct stat buf; char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(lstat)) printf(ARGS(lstat, "%s, *"), path); #endif error = linux_kern_lstat(td, path, UIO_SYSSPACE, &buf); if (error) { LFREEPATH(path); return (error); } LFREEPATH(path); return (stat_copyout(&buf, args->up)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ struct l_statfs { l_long f_type; l_long f_bsize; l_long f_blocks; l_long f_bfree; l_long f_bavail; l_long f_files; l_long f_ffree; l_fsid_t f_fsid; l_long f_namelen; l_long f_frsize; l_long f_flags; l_long f_spare[4]; }; #define LINUX_CODA_SUPER_MAGIC 0x73757245L #define LINUX_EXT2_SUPER_MAGIC 0xEF53L #define LINUX_HPFS_SUPER_MAGIC 0xf995e849L #define LINUX_ISOFS_SUPER_MAGIC 0x9660L #define LINUX_MSDOS_SUPER_MAGIC 0x4d44L #define LINUX_NCP_SUPER_MAGIC 0x564cL #define LINUX_NFS_SUPER_MAGIC 0x6969L #define LINUX_NTFS_SUPER_MAGIC 0x5346544EL #define LINUX_PROC_SUPER_MAGIC 0x9fa0L #define LINUX_UFS_SUPER_MAGIC 0x00011954L /* XXX - UFS_MAGIC in Linux */ #define LINUX_ZFS_SUPER_MAGIC 0x2FC12FC1 #define LINUX_DEVFS_SUPER_MAGIC 0x1373L #define LINUX_SHMFS_MAGIC 0x01021994 static long bsd_to_linux_ftype(const char *fstypename) { int i; static struct {const char *bsd_name; long linux_type;} b2l_tbl[] = { {"ufs", LINUX_UFS_SUPER_MAGIC}, {"zfs", LINUX_ZFS_SUPER_MAGIC}, {"cd9660", LINUX_ISOFS_SUPER_MAGIC}, {"nfs", LINUX_NFS_SUPER_MAGIC}, {"ext2fs", LINUX_EXT2_SUPER_MAGIC}, {"procfs", LINUX_PROC_SUPER_MAGIC}, {"msdosfs", LINUX_MSDOS_SUPER_MAGIC}, {"ntfs", LINUX_NTFS_SUPER_MAGIC}, {"nwfs", LINUX_NCP_SUPER_MAGIC}, {"hpfs", LINUX_HPFS_SUPER_MAGIC}, {"coda", LINUX_CODA_SUPER_MAGIC}, {"devfs", LINUX_DEVFS_SUPER_MAGIC}, {"tmpfs", LINUX_SHMFS_MAGIC}, {NULL, 0L}}; for (i = 0; b2l_tbl[i].bsd_name != NULL; i++) if (strcmp(b2l_tbl[i].bsd_name, fstypename) == 0) return (b2l_tbl[i].linux_type); return (0L); } static int bsd_to_linux_statfs(struct statfs *bsd_statfs, struct l_statfs *linux_statfs) { #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) uint64_t tmp; #define LINUX_HIBITS 0xffffffff00000000ULL tmp = bsd_statfs->f_blocks | bsd_statfs->f_bfree | bsd_statfs->f_files | bsd_statfs->f_bsize; if ((bsd_statfs->f_bavail != -1 && (bsd_statfs->f_bavail & LINUX_HIBITS)) || (bsd_statfs->f_ffree != -1 && (bsd_statfs->f_ffree & LINUX_HIBITS)) || (tmp & LINUX_HIBITS)) return (EOVERFLOW); #undef LINUX_HIBITS #endif linux_statfs->f_type = bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs->f_bsize = bsd_statfs->f_bsize; linux_statfs->f_blocks = bsd_statfs->f_blocks; linux_statfs->f_bfree = bsd_statfs->f_bfree; linux_statfs->f_bavail = bsd_statfs->f_bavail; linux_statfs->f_ffree = bsd_statfs->f_ffree; linux_statfs->f_files = bsd_statfs->f_files; linux_statfs->f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs->f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs->f_namelen = MAXNAMLEN; linux_statfs->f_frsize = bsd_statfs->f_bsize; linux_statfs->f_flags = 0; memset(linux_statfs->f_spare, 0, sizeof(linux_statfs->f_spare)); return (0); } int linux_statfs(struct thread *td, struct linux_statfs_args *args) { struct l_statfs linux_statfs; struct statfs *bsd_statfs; char *path; int error; LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(statfs)) printf(ARGS(statfs, "%s, *"), path); #endif bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, path, UIO_SYSSPACE, bsd_statfs); LFREEPATH(path); if (error == 0) error = bsd_to_linux_statfs(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static void bsd_to_linux_statfs64(struct statfs *bsd_statfs, struct l_statfs64 *linux_statfs) { linux_statfs->f_type = bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs->f_bsize = bsd_statfs->f_bsize; linux_statfs->f_blocks = bsd_statfs->f_blocks; linux_statfs->f_bfree = bsd_statfs->f_bfree; linux_statfs->f_bavail = bsd_statfs->f_bavail; linux_statfs->f_ffree = bsd_statfs->f_ffree; linux_statfs->f_files = bsd_statfs->f_files; linux_statfs->f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs->f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs->f_namelen = MAXNAMLEN; linux_statfs->f_frsize = bsd_statfs->f_bsize; linux_statfs->f_flags = 0; memset(linux_statfs->f_spare, 0, sizeof(linux_statfs->f_spare)); } int linux_statfs64(struct thread *td, struct linux_statfs64_args *args) { struct l_statfs64 linux_statfs; struct statfs *bsd_statfs; char *path; int error; if (args->bufsize != sizeof(struct l_statfs64)) return (EINVAL); LCONVPATHEXIST(td, args->path, &path); #ifdef DEBUG if (ldebug(statfs64)) printf(ARGS(statfs64, "%s, *"), path); #endif bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, path, UIO_SYSSPACE, bsd_statfs); LFREEPATH(path); if (error == 0) bsd_to_linux_statfs64(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } int linux_fstatfs64(struct thread *td, struct linux_fstatfs64_args *args) { struct l_statfs64 linux_statfs; struct statfs *bsd_statfs; int error; #ifdef DEBUG if (ldebug(fstatfs64)) printf(ARGS(fstatfs64, "%d, *"), args->fd); #endif if (args->bufsize != sizeof(struct l_statfs64)) return (EINVAL); bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, args->fd, bsd_statfs); if (error == 0) bsd_to_linux_statfs64(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_fstatfs(struct thread *td, struct linux_fstatfs_args *args) { struct l_statfs linux_statfs; struct statfs *bsd_statfs; int error; #ifdef DEBUG if (ldebug(fstatfs)) printf(ARGS(fstatfs, "%d, *"), args->fd); #endif bsd_statfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, args->fd, bsd_statfs); if (error == 0) error = bsd_to_linux_statfs(bsd_statfs, &linux_statfs); free(bsd_statfs, M_STATFS); if (error != 0) return (error); return (copyout(&linux_statfs, args->buf, sizeof(linux_statfs))); } struct l_ustat { l_daddr_t f_tfree; l_ino_t f_tinode; char f_fname[6]; char f_fpack[6]; }; int linux_ustat(struct thread *td, struct linux_ustat_args *args) { #ifdef DEBUG if (ldebug(ustat)) printf(ARGS(ustat, "%ju, *"), (uintmax_t)args->dev); #endif return (EOPNOTSUPP); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) static int stat64_copyout(struct stat *buf, void *ubuf) { struct l_stat64 lbuf; bzero(&lbuf, sizeof(lbuf)); lbuf.st_dev = minor(buf->st_dev) | (major(buf->st_dev) << 8); lbuf.st_ino = buf->st_ino; lbuf.st_mode = buf->st_mode; lbuf.st_nlink = buf->st_nlink; lbuf.st_uid = buf->st_uid; lbuf.st_gid = buf->st_gid; lbuf.st_rdev = buf->st_rdev; lbuf.st_size = buf->st_size; lbuf.st_atim.tv_sec = buf->st_atim.tv_sec; lbuf.st_atim.tv_nsec = buf->st_atim.tv_nsec; lbuf.st_mtim.tv_sec = buf->st_mtim.tv_sec; lbuf.st_mtim.tv_nsec = buf->st_mtim.tv_nsec; lbuf.st_ctim.tv_sec = buf->st_ctim.tv_sec; lbuf.st_ctim.tv_nsec = buf->st_ctim.tv_nsec; lbuf.st_blksize = buf->st_blksize; lbuf.st_blocks = buf->st_blocks; /* * The __st_ino field makes all the difference. In the Linux kernel * it is conditionally compiled based on STAT64_HAS_BROKEN_ST_INO, * but without the assignment to __st_ino the runtime linker refuses * to mmap(2) any shared libraries. I guess it's broken alright :-) */ lbuf.__st_ino = buf->st_ino; return (copyout(&lbuf, ubuf, sizeof(lbuf))); } int linux_stat64(struct thread *td, struct linux_stat64_args *args) { struct stat buf; char *filename; int error; LCONVPATHEXIST(td, args->filename, &filename); #ifdef DEBUG if (ldebug(stat64)) printf(ARGS(stat64, "%s, *"), filename); #endif error = linux_kern_stat(td, filename, UIO_SYSSPACE, &buf); LFREEPATH(filename); if (error) return (error); return (stat64_copyout(&buf, args->statbuf)); } int linux_lstat64(struct thread *td, struct linux_lstat64_args *args) { struct stat sb; char *filename; int error; LCONVPATHEXIST(td, args->filename, &filename); #ifdef DEBUG if (ldebug(lstat64)) printf(ARGS(lstat64, "%s, *"), args->filename); #endif error = linux_kern_lstat(td, filename, UIO_SYSSPACE, &sb); LFREEPATH(filename); if (error) return (error); return (stat64_copyout(&sb, args->statbuf)); } int linux_fstat64(struct thread *td, struct linux_fstat64_args *args) { struct stat buf; int error; #ifdef DEBUG if (ldebug(fstat64)) printf(ARGS(fstat64, "%d, *"), args->fd); #endif error = kern_fstat(td, args->fd, &buf); translate_fd_major_minor(td, args->fd, &buf); if (!error) error = stat64_copyout(&buf, args->statbuf); return (error); } int linux_fstatat64(struct thread *td, struct linux_fstatat64_args *args) { char *path; int error, dfd, flag; struct stat buf; if (args->flag & ~LINUX_AT_SYMLINK_NOFOLLOW) return (EINVAL); flag = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->pathname, &path, dfd); #ifdef DEBUG if (ldebug(fstatat64)) printf(ARGS(fstatat64, "%i, %s, %i"), args->dfd, path, args->flag); #endif error = linux_kern_statat(td, flag, dfd, path, UIO_SYSSPACE, &buf); if (!error) error = stat64_copyout(&buf, args->statbuf); LFREEPATH(path); return (error); } #else /* __amd64__ && !COMPAT_LINUX32 */ int linux_newfstatat(struct thread *td, struct linux_newfstatat_args *args) { char *path; int error, dfd, flag; struct stat buf; if (args->flag & ~LINUX_AT_SYMLINK_NOFOLLOW) return (EINVAL); flag = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) ? AT_SYMLINK_NOFOLLOW : 0; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->pathname, &path, dfd); #ifdef DEBUG if (ldebug(newfstatat)) printf(ARGS(newfstatat, "%i, %s, %i"), args->dfd, path, args->flag); #endif error = linux_kern_statat(td, flag, dfd, path, UIO_SYSSPACE, &buf); if (error == 0) error = newstat_copyout(&buf, args->statbuf); LFREEPATH(path); return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_syncfs(struct thread *td, struct linux_syncfs_args *args) { - cap_rights_t rights; struct mount *mp; struct vnode *vp; int error, save; - error = fgetvp(td, args->fd, cap_rights_init(&rights, CAP_FSYNC), &vp); + error = fgetvp(td, args->fd, &cap_fsync_rights, &vp); if (error != 0) /* * Linux syncfs() returns only EBADF, however fgetvp() * can return EINVAL in case of file descriptor does * not represent a vnode. XXX. */ return (error); mp = vp->v_mount; mtx_lock(&mountlist_mtx); error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error != 0) { /* See comment above. */ mtx_unlock(&mountlist_mtx); goto out; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { save = curthread_pflags_set(TDP_SYNCIO); vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT); curthread_pflags_restore(save); vn_finished_write(mp); } vfs_unbusy(mp); out: vrele(vp); return (error); } Index: head/sys/compat/linuxkpi/common/include/linux/file.h =================================================================== --- head/sys/compat/linuxkpi/common/include/linux/file.h (revision 333424) +++ head/sys/compat/linuxkpi/common/include/linux/file.h (revision 333425) @@ -1,193 +1,190 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013-2017 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_FILE_H_ #define _LINUX_FILE_H_ #include #include #include #include #include #include #include #include struct linux_file; #undef file extern struct fileops linuxfileops; static inline struct linux_file * linux_fget(unsigned int fd) { - cap_rights_t rights; struct file *file; /* lookup file pointer by file descriptor index */ if (fget_unlocked(curthread->td_proc->p_fd, fd, - cap_rights_init(&rights), &file, NULL) != 0) + &cap_no_rights, &file, NULL) != 0) return (NULL); /* check if file handle really belongs to us */ if (file->f_data == NULL || file->f_ops != &linuxfileops) { fdrop(file, curthread); return (NULL); } return ((struct linux_file *)file->f_data); } extern void linux_file_free(struct linux_file *filp); static inline void fput(struct linux_file *filp) { if (refcount_release(filp->_file == NULL ? &filp->f_count : &filp->_file->f_count)) { linux_file_free(filp); } } static inline unsigned int file_count(struct linux_file *filp) { return (filp->_file == NULL ? filp->f_count : filp->_file->f_count); } static inline void put_unused_fd(unsigned int fd) { - cap_rights_t rights; struct file *file; if (fget_unlocked(curthread->td_proc->p_fd, fd, - cap_rights_init(&rights), &file, NULL) != 0) { + &cap_no_rights, &file, NULL) != 0) { return; } /* * NOTE: We should only get here when the "fd" has not been * installed, so no need to free the associated Linux file * structure. */ fdclose(curthread, file, fd); /* drop extra reference */ fdrop(file, curthread); } static inline void fd_install(unsigned int fd, struct linux_file *filp) { - cap_rights_t rights; struct file *file; if (fget_unlocked(curthread->td_proc->p_fd, fd, - cap_rights_init(&rights), &file, NULL) != 0) { + &cap_no_rights, &file, NULL) != 0) { filp->_file = NULL; } else { filp->_file = file; finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); /* transfer reference count from "filp" to "file" */ while (refcount_release(&filp->f_count) == 0) refcount_acquire(&file->f_count); } /* drop the extra reference */ fput(filp); } static inline int get_unused_fd(void) { struct file *file; int error; int fd; error = falloc(curthread, &file, &fd, 0); if (error) return -error; /* drop the extra reference */ fdrop(file, curthread); return fd; } static inline int get_unused_fd_flags(int flags) { struct file *file; int error; int fd; error = falloc(curthread, &file, &fd, flags); if (error) return -error; /* drop the extra reference */ fdrop(file, curthread); return fd; } extern struct linux_file *linux_file_alloc(void); static inline struct linux_file * alloc_file(int mode, const struct file_operations *fops) { struct linux_file *filp; filp = linux_file_alloc(); filp->f_op = fops; filp->f_mode = mode; return (filp); } struct fd { struct linux_file *linux_file; }; static inline void fdput(struct fd fd) { fput(fd.linux_file); } static inline struct fd fdget(unsigned int fd) { struct linux_file *f = linux_fget(fd); return (struct fd){f}; } #define file linux_file #define fget(...) linux_fget(__VA_ARGS__) #endif /* _LINUX_FILE_H_ */ Index: head/sys/dev/filemon/filemon.c =================================================================== --- head/sys/dev/filemon/filemon.c (revision 333424) +++ head/sys/dev/filemon/filemon.c (revision 333425) @@ -1,512 +1,511 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011, David E. O'Brien. * Copyright (c) 2009-2011, Juniper Networks, Inc. * Copyright (c) 2015-2016, EMC Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY JUNIPER NETWORKS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL JUNIPER NETWORKS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "filemon.h" #if defined(COMPAT_FREEBSD32) #include #include #include #endif static d_close_t filemon_close; static d_ioctl_t filemon_ioctl; static d_open_t filemon_open; static struct cdevsw filemon_cdevsw = { .d_version = D_VERSION, .d_close = filemon_close, .d_ioctl = filemon_ioctl, .d_open = filemon_open, .d_name = "filemon", }; MALLOC_DECLARE(M_FILEMON); MALLOC_DEFINE(M_FILEMON, "filemon", "File access monitor"); /* * The filemon->lock protects several things currently: * - fname1/fname2/msgbufr are pre-allocated and used per syscall * for logging and copyins rather than stack variables. * - Serializing the filemon's log output. * - Preventing inheritance or removal of the filemon into proc.p_filemon. */ struct filemon { struct sx lock; /* Lock for this filemon. */ struct file *fp; /* Output file pointer. */ struct ucred *cred; /* Credential of tracer. */ char fname1[MAXPATHLEN]; /* Temporary filename buffer. */ char fname2[MAXPATHLEN]; /* Temporary filename buffer. */ char msgbufr[1024]; /* Output message buffer. */ int error; /* Log write error, returned on close(2). */ u_int refcnt; /* Pointer reference count. */ u_int proccnt; /* Process count. */ }; static struct cdev *filemon_dev; static void filemon_output(struct filemon *filemon, char *msg, size_t len); static __inline struct filemon * filemon_acquire(struct filemon *filemon) { if (filemon != NULL) refcount_acquire(&filemon->refcnt); return (filemon); } /* * Release a reference and free on the last one. */ static void filemon_release(struct filemon *filemon) { if (refcount_release(&filemon->refcnt) == 0) return; /* * There are valid cases of releasing while locked, such as in * filemon_untrack_processes, but none which are done where there * is not at least 1 reference remaining. */ sx_assert(&filemon->lock, SA_UNLOCKED); if (filemon->cred != NULL) crfree(filemon->cred); sx_destroy(&filemon->lock); free(filemon, M_FILEMON); } /* * Acquire the proc's p_filemon reference and lock the filemon. * The proc's p_filemon may not match this filemon on return. */ static struct filemon * filemon_proc_get(struct proc *p) { struct filemon *filemon; if (p->p_filemon == NULL) return (NULL); PROC_LOCK(p); filemon = filemon_acquire(p->p_filemon); PROC_UNLOCK(p); if (filemon == NULL) return (NULL); /* * The p->p_filemon may have changed by now. That case is handled * by the exit and fork hooks and filemon_attach_proc specially. */ sx_xlock(&filemon->lock); return (filemon); } /* Remove and release the filemon on the given process. */ static void filemon_proc_drop(struct proc *p) { struct filemon *filemon; KASSERT(p->p_filemon != NULL, ("%s: proc %p NULL p_filemon", __func__, p)); sx_assert(&p->p_filemon->lock, SA_XLOCKED); PROC_LOCK(p); filemon = p->p_filemon; p->p_filemon = NULL; --filemon->proccnt; PROC_UNLOCK(p); /* * This should not be the last reference yet. filemon_release() * cannot be called with filemon locked, which the caller expects * will stay locked. */ KASSERT(filemon->refcnt > 1, ("%s: proc %p dropping filemon %p " "with last reference", __func__, p, filemon)); filemon_release(filemon); } /* Unlock and release the filemon. */ static __inline void filemon_drop(struct filemon *filemon) { sx_xunlock(&filemon->lock); filemon_release(filemon); } #include "filemon_wrapper.c" static void filemon_write_header(struct filemon *filemon) { int len; struct timeval now; getmicrotime(&now); len = snprintf(filemon->msgbufr, sizeof(filemon->msgbufr), "# filemon version %d\n# Target pid %d\n# Start %ju.%06ju\nV %d\n", FILEMON_VERSION, curproc->p_pid, (uintmax_t)now.tv_sec, (uintmax_t)now.tv_usec, FILEMON_VERSION); filemon_output(filemon, filemon->msgbufr, len); } /* * Invalidate the passed filemon in all processes. */ static void filemon_untrack_processes(struct filemon *filemon) { struct proc *p; sx_assert(&filemon->lock, SA_XLOCKED); /* Avoid allproc loop if there is no need. */ if (filemon->proccnt == 0) return; /* * Processes in this list won't go away while here since * filemon_event_process_exit() will lock on filemon->lock * which we hold. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { /* * No PROC_LOCK is needed to compare here since it is * guaranteed to not change since we have its filemon * locked. Everything that changes this p_filemon will * be locked on it. */ if (p->p_filemon == filemon) filemon_proc_drop(p); } sx_sunlock(&allproc_lock); /* * It's possible some references were acquired but will be * dropped shortly as they are restricted from being * inherited. There is at least the reference in cdevpriv remaining. */ KASSERT(filemon->refcnt > 0, ("%s: filemon %p should have " "references still.", __func__, filemon)); KASSERT(filemon->proccnt == 0, ("%s: filemon %p should not have " "attached procs still.", __func__, filemon)); } /* * Close out the log. */ static void filemon_close_log(struct filemon *filemon) { struct file *fp; struct timeval now; size_t len; sx_assert(&filemon->lock, SA_XLOCKED); if (filemon->fp == NULL) return; getmicrotime(&now); len = snprintf(filemon->msgbufr, sizeof(filemon->msgbufr), "# Stop %ju.%06ju\n# Bye bye\n", (uintmax_t)now.tv_sec, (uintmax_t)now.tv_usec); filemon_output(filemon, filemon->msgbufr, len); fp = filemon->fp; filemon->fp = NULL; sx_xunlock(&filemon->lock); fdrop(fp, curthread); sx_xlock(&filemon->lock); } /* * The devfs file is being closed. Untrace all processes. It is possible * filemon_close/close(2) was not called. */ static void filemon_dtr(void *data) { struct filemon *filemon = data; if (filemon == NULL) return; sx_xlock(&filemon->lock); /* * Detach the filemon. It cannot be inherited after this. */ filemon_untrack_processes(filemon); filemon_close_log(filemon); filemon_drop(filemon); } /* Attach the filemon to the process. */ static int filemon_attach_proc(struct filemon *filemon, struct proc *p) { struct filemon *filemon2; sx_assert(&filemon->lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((p->p_flag & P_WEXIT) == 0, ("%s: filemon %p attaching to exiting process %p", __func__, filemon, p)); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s: filemon %p attaching to execing process %p", __func__, filemon, p)); if (p->p_filemon == filemon) return (0); /* * Don't allow truncating other process traces. It is * not really intended to trace procs other than curproc * anyhow. */ if (p->p_filemon != NULL && p != curproc) return (EBUSY); /* * Historic behavior of filemon has been to let a child initiate * tracing on itself and cease existing tracing. Bmake * .META + .MAKE relies on this. It is only relevant for attaching to * curproc. */ while (p->p_filemon != NULL) { PROC_UNLOCK(p); sx_xunlock(&filemon->lock); while ((filemon2 = filemon_proc_get(p)) != NULL) { /* It may have changed. */ if (p->p_filemon == filemon2) filemon_proc_drop(p); filemon_drop(filemon2); } sx_xlock(&filemon->lock); PROC_LOCK(p); /* * It may have been attached to, though unlikely. * Try again if needed. */ } KASSERT(p->p_filemon == NULL, ("%s: proc %p didn't detach filemon %p", __func__, p, p->p_filemon)); p->p_filemon = filemon_acquire(filemon); ++filemon->proccnt; return (0); } static int filemon_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag __unused, struct thread *td) { int error = 0; struct filemon *filemon; struct proc *p; - cap_rights_t rights; if ((error = devfs_get_cdevpriv((void **) &filemon)) != 0) return (error); sx_xlock(&filemon->lock); switch (cmd) { /* Set the output file descriptor. */ case FILEMON_SET_FD: if (filemon->fp != NULL) { error = EEXIST; break; } error = fget_write(td, *(int *)data, - cap_rights_init(&rights, CAP_PWRITE), + &cap_pwrite_rights, &filemon->fp); if (error == 0) /* Write the file header. */ filemon_write_header(filemon); break; /* Set the monitored process ID. */ case FILEMON_SET_PID: /* Invalidate any existing processes already set. */ filemon_untrack_processes(filemon); error = pget(*((pid_t *)data), PGET_CANDEBUG | PGET_NOTWEXIT | PGET_NOTINEXEC, &p); if (error == 0) { KASSERT(p->p_filemon != filemon, ("%s: proc %p didn't untrack filemon %p", __func__, p, filemon)); error = filemon_attach_proc(filemon, p); PROC_UNLOCK(p); } break; default: error = EINVAL; break; } sx_xunlock(&filemon->lock); return (error); } static int filemon_open(struct cdev *dev, int oflags __unused, int devtype __unused, struct thread *td) { int error; struct filemon *filemon; filemon = malloc(sizeof(*filemon), M_FILEMON, M_WAITOK | M_ZERO); sx_init(&filemon->lock, "filemon"); refcount_init(&filemon->refcnt, 1); filemon->cred = crhold(td->td_ucred); error = devfs_set_cdevpriv(filemon, filemon_dtr); if (error != 0) filemon_release(filemon); return (error); } /* Called on close of last devfs file handle, before filemon_dtr(). */ static int filemon_close(struct cdev *dev __unused, int flag __unused, int fmt __unused, struct thread *td __unused) { struct filemon *filemon; int error; if ((error = devfs_get_cdevpriv((void **) &filemon)) != 0) return (error); sx_xlock(&filemon->lock); filemon_close_log(filemon); error = filemon->error; sx_xunlock(&filemon->lock); /* * Processes are still being traced but won't log anything * now. After this call returns filemon_dtr() is called which * will detach processes. */ return (error); } static void filemon_load(void *dummy __unused) { /* Install the syscall wrappers. */ filemon_wrapper_install(); filemon_dev = make_dev(&filemon_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "filemon"); } static int filemon_unload(void) { destroy_dev(filemon_dev); filemon_wrapper_deinstall(); return (0); } static int filemon_modevent(module_t mod __unused, int type, void *data) { int error = 0; switch (type) { case MOD_LOAD: filemon_load(data); break; case MOD_UNLOAD: error = filemon_unload(); break; case MOD_QUIESCE: /* * The wrapper implementation is unsafe for reliable unload. * Require forcing an unload. */ error = EBUSY; break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } DEV_MODULE(filemon, filemon_modevent, NULL); MODULE_VERSION(filemon, 1); Index: head/sys/dev/hwpmc/hwpmc_logging.c =================================================================== --- head/sys/dev/hwpmc/hwpmc_logging.c (revision 333424) +++ head/sys/dev/hwpmc/hwpmc_logging.c (revision 333425) @@ -1,1131 +1,1129 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005-2007 Joseph Koshy * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Logging code for hwpmc(4) */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Sysctl tunables */ SYSCTL_DECL(_kern_hwpmc); /* * kern.hwpmc.logbuffersize -- size of the per-cpu owner buffers. */ static int pmclog_buffer_size = PMC_LOG_BUFFER_SIZE; #if (__FreeBSD_version < 1100000) TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "logbuffersize", &pmclog_buffer_size); #endif SYSCTL_INT(_kern_hwpmc, OID_AUTO, logbuffersize, CTLFLAG_RDTUN, &pmclog_buffer_size, 0, "size of log buffers in kilobytes"); /* * kern.hwpmc.nbuffer -- number of global log buffers */ static int pmc_nlogbuffers = PMC_NLOGBUFFERS; #if (__FreeBSD_version < 1100000) TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "nbuffers", &pmc_nlogbuffers); #endif SYSCTL_INT(_kern_hwpmc, OID_AUTO, nbuffers, CTLFLAG_RDTUN, &pmc_nlogbuffers, 0, "number of global log buffers"); /* * Global log buffer list and associated spin lock. */ TAILQ_HEAD(, pmclog_buffer) pmc_bufferlist = TAILQ_HEAD_INITIALIZER(pmc_bufferlist); static struct mtx pmc_bufferlist_mtx; /* spin lock */ static struct mtx pmc_kthread_mtx; /* sleep lock */ #define PMCLOG_INIT_BUFFER_DESCRIPTOR(D) do { \ const int __roundup = roundup(sizeof(*D), \ sizeof(uint32_t)); \ (D)->plb_fence = ((char *) (D)) + \ 1024*pmclog_buffer_size; \ (D)->plb_base = (D)->plb_ptr = ((char *) (D)) + \ __roundup; \ } while (0) /* * Log file record constructors. */ #define _PMCLOG_TO_HEADER(T,L) \ ((PMCLOG_HEADER_MAGIC << 24) | \ (PMCLOG_TYPE_ ## T << 16) | \ ((L) & 0xFFFF)) /* reserve LEN bytes of space and initialize the entry header */ #define _PMCLOG_RESERVE(PO,TYPE,LEN,ACTION) do { \ uint32_t *_le; \ int _len = roundup((LEN), sizeof(uint32_t)); \ if ((_le = pmclog_reserve((PO), _len)) == NULL) { \ ACTION; \ } \ *_le = _PMCLOG_TO_HEADER(TYPE,_len); \ _le += 3 /* skip over timestamp */ #define PMCLOG_RESERVE(P,T,L) _PMCLOG_RESERVE(P,T,L,return) #define PMCLOG_RESERVE_WITH_ERROR(P,T,L) _PMCLOG_RESERVE(P,T,L, \ error=ENOMEM;goto error) #define PMCLOG_EMIT32(V) do { *_le++ = (V); } while (0) #define PMCLOG_EMIT64(V) do { \ *_le++ = (uint32_t) ((V) & 0xFFFFFFFF); \ *_le++ = (uint32_t) (((V) >> 32) & 0xFFFFFFFF); \ } while (0) /* Emit a string. Caution: does NOT update _le, so needs to be last */ #define PMCLOG_EMITSTRING(S,L) do { bcopy((S), _le, (L)); } while (0) #define PMCLOG_EMITNULLSTRING(L) do { bzero(_le, (L)); } while (0) #define PMCLOG_DESPATCH(PO) \ pmclog_release((PO)); \ } while (0) /* * Assertions about the log file format. */ CTASSERT(sizeof(struct pmclog_callchain) == 6*4 + PMC_CALLCHAIN_DEPTH_MAX*sizeof(uintfptr_t)); CTASSERT(sizeof(struct pmclog_closelog) == 3*4); CTASSERT(sizeof(struct pmclog_dropnotify) == 3*4); CTASSERT(sizeof(struct pmclog_map_in) == PATH_MAX + 4*4 + sizeof(uintfptr_t)); CTASSERT(offsetof(struct pmclog_map_in,pl_pathname) == 4*4 + sizeof(uintfptr_t)); CTASSERT(sizeof(struct pmclog_map_out) == 4*4 + 2*sizeof(uintfptr_t)); CTASSERT(sizeof(struct pmclog_pcsample) == 6*4 + sizeof(uintfptr_t)); CTASSERT(sizeof(struct pmclog_pmcallocate) == 6*4); CTASSERT(sizeof(struct pmclog_pmcattach) == 5*4 + PATH_MAX); CTASSERT(offsetof(struct pmclog_pmcattach,pl_pathname) == 5*4); CTASSERT(sizeof(struct pmclog_pmcdetach) == 5*4); CTASSERT(sizeof(struct pmclog_proccsw) == 5*4 + 8); CTASSERT(sizeof(struct pmclog_procexec) == 5*4 + PATH_MAX + sizeof(uintfptr_t)); CTASSERT(offsetof(struct pmclog_procexec,pl_pathname) == 5*4 + sizeof(uintfptr_t)); CTASSERT(sizeof(struct pmclog_procexit) == 5*4 + 8); CTASSERT(sizeof(struct pmclog_procfork) == 5*4); CTASSERT(sizeof(struct pmclog_sysexit) == 4*4); CTASSERT(sizeof(struct pmclog_userdata) == 4*4); /* * Log buffer structure */ struct pmclog_buffer { TAILQ_ENTRY(pmclog_buffer) plb_next; char *plb_base; char *plb_ptr; char *plb_fence; }; /* * Prototypes */ static int pmclog_get_buffer(struct pmc_owner *po); static void pmclog_loop(void *arg); static void pmclog_release(struct pmc_owner *po); static uint32_t *pmclog_reserve(struct pmc_owner *po, int length); static void pmclog_schedule_io(struct pmc_owner *po); static void pmclog_stop_kthread(struct pmc_owner *po); /* * Helper functions */ /* * Get a log buffer */ static int pmclog_get_buffer(struct pmc_owner *po) { struct pmclog_buffer *plb; mtx_assert(&po->po_mtx, MA_OWNED); KASSERT(po->po_curbuf == NULL, ("[pmclog,%d] po=%p current buffer still valid", __LINE__, po)); mtx_lock_spin(&pmc_bufferlist_mtx); if ((plb = TAILQ_FIRST(&pmc_bufferlist)) != NULL) TAILQ_REMOVE(&pmc_bufferlist, plb, plb_next); mtx_unlock_spin(&pmc_bufferlist_mtx); PMCDBG2(LOG,GTB,1, "po=%p plb=%p", po, plb); #ifdef HWPMC_DEBUG if (plb) KASSERT(plb->plb_ptr == plb->plb_base && plb->plb_base < plb->plb_fence, ("[pmclog,%d] po=%p buffer invariants: ptr=%p " "base=%p fence=%p", __LINE__, po, plb->plb_ptr, plb->plb_base, plb->plb_fence)); #endif po->po_curbuf = plb; /* update stats */ atomic_add_int(&pmc_stats.pm_buffer_requests, 1); if (plb == NULL) atomic_add_int(&pmc_stats.pm_buffer_requests_failed, 1); return (plb ? 0 : ENOMEM); } struct pmclog_proc_init_args { struct proc *kthr; struct pmc_owner *po; bool exit; bool acted; }; int pmclog_proc_create(struct thread *td, void **handlep) { struct pmclog_proc_init_args *ia; int error; ia = malloc(sizeof(*ia), M_TEMP, M_WAITOK | M_ZERO); error = kproc_create(pmclog_loop, ia, &ia->kthr, RFHIGHPID, 0, "hwpmc: proc(%d)", td->td_proc->p_pid); if (error == 0) *handlep = ia; return (error); } void pmclog_proc_ignite(void *handle, struct pmc_owner *po) { struct pmclog_proc_init_args *ia; ia = handle; mtx_lock(&pmc_kthread_mtx); MPASS(!ia->acted); MPASS(ia->po == NULL); MPASS(!ia->exit); MPASS(ia->kthr != NULL); if (po == NULL) { ia->exit = true; } else { ia->po = po; KASSERT(po->po_kthread == NULL, ("[pmclog,%d] po=%p kthread (%p) already present", __LINE__, po, po->po_kthread)); po->po_kthread = ia->kthr; } wakeup(ia); while (!ia->acted) msleep(ia, &pmc_kthread_mtx, PWAIT, "pmclogw", 0); mtx_unlock(&pmc_kthread_mtx); free(ia, M_TEMP); } /* * Log handler loop. * * This function is executed by each pmc owner's helper thread. */ static void pmclog_loop(void *arg) { struct pmclog_proc_init_args *ia; struct pmc_owner *po; struct pmclog_buffer *lb; struct proc *p; struct ucred *ownercred; struct ucred *mycred; struct thread *td; sigset_t unb; struct uio auio; struct iovec aiov; size_t nbytes; int error; td = curthread; SIGEMPTYSET(unb); SIGADDSET(unb, SIGHUP); (void)kern_sigprocmask(td, SIG_UNBLOCK, &unb, NULL, 0); ia = arg; MPASS(ia->kthr == curproc); MPASS(!ia->acted); mtx_lock(&pmc_kthread_mtx); while (ia->po == NULL && !ia->exit) msleep(ia, &pmc_kthread_mtx, PWAIT, "pmclogi", 0); if (ia->exit) { ia->acted = true; wakeup(ia); mtx_unlock(&pmc_kthread_mtx); kproc_exit(0); } MPASS(ia->po != NULL); po = ia->po; ia->acted = true; wakeup(ia); mtx_unlock(&pmc_kthread_mtx); ia = NULL; p = po->po_owner; mycred = td->td_ucred; PROC_LOCK(p); ownercred = crhold(p->p_ucred); PROC_UNLOCK(p); PMCDBG2(LOG,INI,1, "po=%p kt=%p", po, po->po_kthread); KASSERT(po->po_kthread == curthread->td_proc, ("[pmclog,%d] proc mismatch po=%p po/kt=%p curproc=%p", __LINE__, po, po->po_kthread, curthread->td_proc)); lb = NULL; /* * Loop waiting for I/O requests to be added to the owner * struct's queue. The loop is exited when the log file * is deconfigured. */ mtx_lock(&pmc_kthread_mtx); for (;;) { /* check if we've been asked to exit */ if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) break; if (lb == NULL) { /* look for a fresh buffer to write */ mtx_lock_spin(&po->po_mtx); if ((lb = TAILQ_FIRST(&po->po_logbuffers)) == NULL) { mtx_unlock_spin(&po->po_mtx); /* No more buffers and shutdown required. */ if (po->po_flags & PMC_PO_SHUTDOWN) break; (void) msleep(po, &pmc_kthread_mtx, PWAIT, "pmcloop", 0); continue; } TAILQ_REMOVE(&po->po_logbuffers, lb, plb_next); mtx_unlock_spin(&po->po_mtx); } mtx_unlock(&pmc_kthread_mtx); /* process the request */ PMCDBG3(LOG,WRI,2, "po=%p base=%p ptr=%p", po, lb->plb_base, lb->plb_ptr); /* change our thread's credentials before issuing the I/O */ aiov.iov_base = lb->plb_base; aiov.iov_len = nbytes = lb->plb_ptr - lb->plb_base; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = -1; auio.uio_resid = nbytes; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; /* switch thread credentials -- see kern_ktrace.c */ td->td_ucred = ownercred; error = fo_write(po->po_file, &auio, ownercred, 0, td); td->td_ucred = mycred; if (error) { /* XXX some errors are recoverable */ /* send a SIGIO to the owner and exit */ PROC_LOCK(p); kern_psignal(p, SIGIO); PROC_UNLOCK(p); mtx_lock(&pmc_kthread_mtx); po->po_error = error; /* save for flush log */ PMCDBG2(LOG,WRI,2, "po=%p error=%d", po, error); break; } mtx_lock(&pmc_kthread_mtx); /* put the used buffer back into the global pool */ PMCLOG_INIT_BUFFER_DESCRIPTOR(lb); mtx_lock_spin(&pmc_bufferlist_mtx); TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next); mtx_unlock_spin(&pmc_bufferlist_mtx); lb = NULL; } wakeup_one(po->po_kthread); po->po_kthread = NULL; mtx_unlock(&pmc_kthread_mtx); /* return the current I/O buffer to the global pool */ if (lb) { PMCLOG_INIT_BUFFER_DESCRIPTOR(lb); mtx_lock_spin(&pmc_bufferlist_mtx); TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next); mtx_unlock_spin(&pmc_bufferlist_mtx); } /* * Exit this thread, signalling the waiter */ crfree(ownercred); kproc_exit(0); } /* * Release and log entry and schedule an I/O if needed. */ static void pmclog_release(struct pmc_owner *po) { KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base, ("[pmclog,%d] buffer invariants po=%p ptr=%p base=%p", __LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base)); KASSERT(po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence, ("[pmclog,%d] buffer invariants po=%p ptr=%p fenc=%p", __LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_fence)); /* schedule an I/O if we've filled a buffer */ if (po->po_curbuf->plb_ptr >= po->po_curbuf->plb_fence) pmclog_schedule_io(po); mtx_unlock_spin(&po->po_mtx); PMCDBG1(LOG,REL,1, "po=%p", po); } /* * Attempt to reserve 'length' bytes of space in an owner's log * buffer. The function returns a pointer to 'length' bytes of space * if there was enough space or returns NULL if no space was * available. Non-null returns do so with the po mutex locked. The * caller must invoke pmclog_release() on the pmc owner structure * when done. */ static uint32_t * pmclog_reserve(struct pmc_owner *po, int length) { uintptr_t newptr, oldptr; uint32_t *lh; struct timespec ts; PMCDBG2(LOG,ALL,1, "po=%p len=%d", po, length); KASSERT(length % sizeof(uint32_t) == 0, ("[pmclog,%d] length not a multiple of word size", __LINE__)); mtx_lock_spin(&po->po_mtx); /* No more data when shutdown in progress. */ if (po->po_flags & PMC_PO_SHUTDOWN) { mtx_unlock_spin(&po->po_mtx); return (NULL); } if (po->po_curbuf == NULL) if (pmclog_get_buffer(po) != 0) { mtx_unlock_spin(&po->po_mtx); return (NULL); } KASSERT(po->po_curbuf != NULL, ("[pmclog,%d] po=%p no current buffer", __LINE__, po)); KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base && po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence, ("[pmclog,%d] po=%p buffer invariants: ptr=%p base=%p fence=%p", __LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base, po->po_curbuf->plb_fence)); oldptr = (uintptr_t) po->po_curbuf->plb_ptr; newptr = oldptr + length; KASSERT(oldptr != (uintptr_t) NULL, ("[pmclog,%d] po=%p Null log buffer pointer", __LINE__, po)); /* * If we have space in the current buffer, return a pointer to * available space with the PO structure locked. */ if (newptr <= (uintptr_t) po->po_curbuf->plb_fence) { po->po_curbuf->plb_ptr = (char *) newptr; goto done; } /* * Otherwise, schedule the current buffer for output and get a * fresh buffer. */ pmclog_schedule_io(po); if (pmclog_get_buffer(po) != 0) { mtx_unlock_spin(&po->po_mtx); return (NULL); } KASSERT(po->po_curbuf != NULL, ("[pmclog,%d] po=%p no current buffer", __LINE__, po)); KASSERT(po->po_curbuf->plb_ptr != NULL, ("[pmclog,%d] null return from pmc_get_log_buffer", __LINE__)); KASSERT(po->po_curbuf->plb_ptr == po->po_curbuf->plb_base && po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence, ("[pmclog,%d] po=%p buffer invariants: ptr=%p base=%p fence=%p", __LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base, po->po_curbuf->plb_fence)); oldptr = (uintptr_t) po->po_curbuf->plb_ptr; done: lh = (uint32_t *) oldptr; lh++; /* skip header */ getnanotime(&ts); /* fill in the timestamp */ *lh++ = ts.tv_sec & 0xFFFFFFFF; *lh++ = ts.tv_nsec & 0xFFFFFFF; return ((uint32_t *) oldptr); } /* * Schedule an I/O. * * Transfer the current buffer to the helper kthread. */ static void pmclog_schedule_io(struct pmc_owner *po) { KASSERT(po->po_curbuf != NULL, ("[pmclog,%d] schedule_io with null buffer po=%p", __LINE__, po)); KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base, ("[pmclog,%d] buffer invariants po=%p ptr=%p base=%p", __LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base)); KASSERT(po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence, ("[pmclog,%d] buffer invariants po=%p ptr=%p fenc=%p", __LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_fence)); PMCDBG1(LOG,SIO, 1, "po=%p", po); mtx_assert(&po->po_mtx, MA_OWNED); /* * Add the current buffer to the tail of the buffer list and * wakeup the helper. */ TAILQ_INSERT_TAIL(&po->po_logbuffers, po->po_curbuf, plb_next); po->po_curbuf = NULL; wakeup_one(po); } /* * Stop the helper kthread. */ static void pmclog_stop_kthread(struct pmc_owner *po) { mtx_lock(&pmc_kthread_mtx); po->po_flags &= ~PMC_PO_OWNS_LOGFILE; if (po->po_kthread != NULL) { PROC_LOCK(po->po_kthread); kern_psignal(po->po_kthread, SIGHUP); PROC_UNLOCK(po->po_kthread); } wakeup_one(po); while (po->po_kthread) msleep(po->po_kthread, &pmc_kthread_mtx, PPAUSE, "pmckstp", 0); mtx_unlock(&pmc_kthread_mtx); } /* * Public functions */ /* * Configure a log file for pmc owner 'po'. * * Parameter 'logfd' is a file handle referencing an open file in the * owner process. This file needs to have been opened for writing. */ int pmclog_configure_log(struct pmc_mdep *md, struct pmc_owner *po, int logfd) { struct proc *p; - cap_rights_t rights; int error; sx_assert(&pmc_sx, SA_XLOCKED); PMCDBG2(LOG,CFG,1, "config po=%p logfd=%d", po, logfd); p = po->po_owner; /* return EBUSY if a log file was already present */ if (po->po_flags & PMC_PO_OWNS_LOGFILE) return (EBUSY); KASSERT(po->po_file == NULL, ("[pmclog,%d] po=%p file (%p) already present", __LINE__, po, po->po_file)); /* get a reference to the file state */ - error = fget_write(curthread, logfd, - cap_rights_init(&rights, CAP_WRITE), &po->po_file); + error = fget_write(curthread, logfd, &cap_write_rights, &po->po_file); if (error) goto error; /* mark process as owning a log file */ po->po_flags |= PMC_PO_OWNS_LOGFILE; /* mark process as using HWPMCs */ PROC_LOCK(p); p->p_flag |= P_HWPMC; PROC_UNLOCK(p); /* create a log initialization entry */ PMCLOG_RESERVE_WITH_ERROR(po, INITIALIZE, sizeof(struct pmclog_initialize)); PMCLOG_EMIT32(PMC_VERSION); PMCLOG_EMIT32(md->pmd_cputype); PMCLOG_DESPATCH(po); return (0); error: KASSERT(po->po_kthread == NULL, ("[pmclog,%d] po=%p kthread not " "stopped", __LINE__, po)); if (po->po_file) (void) fdrop(po->po_file, curthread); po->po_file = NULL; /* clear file and error state */ po->po_error = 0; po->po_flags &= ~PMC_PO_OWNS_LOGFILE; return (error); } /* * De-configure a log file. This will throw away any buffers queued * for this owner process. */ int pmclog_deconfigure_log(struct pmc_owner *po) { int error; struct pmclog_buffer *lb; PMCDBG1(LOG,CFG,1, "de-config po=%p", po); if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) return (EINVAL); KASSERT(po->po_sscount == 0, ("[pmclog,%d] po=%p still owning SS PMCs", __LINE__, po)); KASSERT(po->po_file != NULL, ("[pmclog,%d] po=%p no log file", __LINE__, po)); /* stop the kthread, this will reset the 'OWNS_LOGFILE' flag */ pmclog_stop_kthread(po); KASSERT(po->po_kthread == NULL, ("[pmclog,%d] po=%p kthread not stopped", __LINE__, po)); /* return all queued log buffers to the global pool */ while ((lb = TAILQ_FIRST(&po->po_logbuffers)) != NULL) { TAILQ_REMOVE(&po->po_logbuffers, lb, plb_next); PMCLOG_INIT_BUFFER_DESCRIPTOR(lb); mtx_lock_spin(&pmc_bufferlist_mtx); TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next); mtx_unlock_spin(&pmc_bufferlist_mtx); } /* return the 'current' buffer to the global pool */ if ((lb = po->po_curbuf) != NULL) { PMCLOG_INIT_BUFFER_DESCRIPTOR(lb); mtx_lock_spin(&pmc_bufferlist_mtx); TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next); mtx_unlock_spin(&pmc_bufferlist_mtx); } /* drop a reference to the fd */ if (po->po_file != NULL) { error = fdrop(po->po_file, curthread); po->po_file = NULL; } else error = 0; po->po_error = 0; return (error); } /* * Flush a process' log buffer. */ int pmclog_flush(struct pmc_owner *po) { int error; struct pmclog_buffer *lb; PMCDBG1(LOG,FLS,1, "po=%p", po); /* * If there is a pending error recorded by the logger thread, * return that. */ if (po->po_error) return (po->po_error); error = 0; /* * Check that we do have an active log file. */ mtx_lock(&pmc_kthread_mtx); if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) { error = EINVAL; goto error; } /* * Schedule the current buffer if any and not empty. */ mtx_lock_spin(&po->po_mtx); lb = po->po_curbuf; if (lb && lb->plb_ptr != lb->plb_base) { pmclog_schedule_io(po); } else error = ENOBUFS; mtx_unlock_spin(&po->po_mtx); error: mtx_unlock(&pmc_kthread_mtx); return (error); } int pmclog_close(struct pmc_owner *po) { PMCDBG1(LOG,CLO,1, "po=%p", po); pmclog_process_closelog(po); mtx_lock(&pmc_kthread_mtx); /* * Schedule the current buffer. */ mtx_lock_spin(&po->po_mtx); if (po->po_curbuf) pmclog_schedule_io(po); else wakeup_one(po); mtx_unlock_spin(&po->po_mtx); /* * Initiate shutdown: no new data queued, * thread will close file on last block. */ po->po_flags |= PMC_PO_SHUTDOWN; mtx_unlock(&pmc_kthread_mtx); return (0); } void pmclog_process_callchain(struct pmc *pm, struct pmc_sample *ps) { int n, recordlen; uint32_t flags; struct pmc_owner *po; PMCDBG3(LOG,SAM,1,"pm=%p pid=%d n=%d", pm, ps->ps_pid, ps->ps_nsamples); recordlen = offsetof(struct pmclog_callchain, pl_pc) + ps->ps_nsamples * sizeof(uintfptr_t); po = pm->pm_owner; flags = PMC_CALLCHAIN_TO_CPUFLAGS(ps->ps_cpu,ps->ps_flags); PMCLOG_RESERVE(po, CALLCHAIN, recordlen); PMCLOG_EMIT32(ps->ps_pid); PMCLOG_EMIT32(pm->pm_id); PMCLOG_EMIT32(flags); for (n = 0; n < ps->ps_nsamples; n++) PMCLOG_EMITADDR(ps->ps_pc[n]); PMCLOG_DESPATCH(po); } void pmclog_process_closelog(struct pmc_owner *po) { PMCLOG_RESERVE(po,CLOSELOG,sizeof(struct pmclog_closelog)); PMCLOG_DESPATCH(po); } void pmclog_process_dropnotify(struct pmc_owner *po) { PMCLOG_RESERVE(po,DROPNOTIFY,sizeof(struct pmclog_dropnotify)); PMCLOG_DESPATCH(po); } void pmclog_process_map_in(struct pmc_owner *po, pid_t pid, uintfptr_t start, const char *path) { int pathlen, recordlen; KASSERT(path != NULL, ("[pmclog,%d] map-in, null path", __LINE__)); pathlen = strlen(path) + 1; /* #bytes for path name */ recordlen = offsetof(struct pmclog_map_in, pl_pathname) + pathlen; PMCLOG_RESERVE(po, MAP_IN, recordlen); PMCLOG_EMIT32(pid); PMCLOG_EMITADDR(start); PMCLOG_EMITSTRING(path,pathlen); PMCLOG_DESPATCH(po); } void pmclog_process_map_out(struct pmc_owner *po, pid_t pid, uintfptr_t start, uintfptr_t end) { KASSERT(start <= end, ("[pmclog,%d] start > end", __LINE__)); PMCLOG_RESERVE(po, MAP_OUT, sizeof(struct pmclog_map_out)); PMCLOG_EMIT32(pid); PMCLOG_EMITADDR(start); PMCLOG_EMITADDR(end); PMCLOG_DESPATCH(po); } void pmclog_process_pmcallocate(struct pmc *pm) { struct pmc_owner *po; struct pmc_soft *ps; po = pm->pm_owner; PMCDBG1(LOG,ALL,1, "pm=%p", pm); if (PMC_TO_CLASS(pm) == PMC_CLASS_SOFT) { PMCLOG_RESERVE(po, PMCALLOCATEDYN, sizeof(struct pmclog_pmcallocatedyn)); PMCLOG_EMIT32(pm->pm_id); PMCLOG_EMIT32(pm->pm_event); PMCLOG_EMIT32(pm->pm_flags); ps = pmc_soft_ev_acquire(pm->pm_event); if (ps != NULL) PMCLOG_EMITSTRING(ps->ps_ev.pm_ev_name,PMC_NAME_MAX); else PMCLOG_EMITNULLSTRING(PMC_NAME_MAX); pmc_soft_ev_release(ps); PMCLOG_DESPATCH(po); } else { PMCLOG_RESERVE(po, PMCALLOCATE, sizeof(struct pmclog_pmcallocate)); PMCLOG_EMIT32(pm->pm_id); PMCLOG_EMIT32(pm->pm_event); PMCLOG_EMIT32(pm->pm_flags); PMCLOG_DESPATCH(po); } } void pmclog_process_pmcattach(struct pmc *pm, pid_t pid, char *path) { int pathlen, recordlen; struct pmc_owner *po; PMCDBG2(LOG,ATT,1,"pm=%p pid=%d", pm, pid); po = pm->pm_owner; pathlen = strlen(path) + 1; /* #bytes for the string */ recordlen = offsetof(struct pmclog_pmcattach, pl_pathname) + pathlen; PMCLOG_RESERVE(po, PMCATTACH, recordlen); PMCLOG_EMIT32(pm->pm_id); PMCLOG_EMIT32(pid); PMCLOG_EMITSTRING(path, pathlen); PMCLOG_DESPATCH(po); } void pmclog_process_pmcdetach(struct pmc *pm, pid_t pid) { struct pmc_owner *po; PMCDBG2(LOG,ATT,1,"!pm=%p pid=%d", pm, pid); po = pm->pm_owner; PMCLOG_RESERVE(po, PMCDETACH, sizeof(struct pmclog_pmcdetach)); PMCLOG_EMIT32(pm->pm_id); PMCLOG_EMIT32(pid); PMCLOG_DESPATCH(po); } /* * Log a context switch event to the log file. */ void pmclog_process_proccsw(struct pmc *pm, struct pmc_process *pp, pmc_value_t v) { struct pmc_owner *po; KASSERT(pm->pm_flags & PMC_F_LOG_PROCCSW, ("[pmclog,%d] log-process-csw called gratuitously", __LINE__)); PMCDBG3(LOG,SWO,1,"pm=%p pid=%d v=%jx", pm, pp->pp_proc->p_pid, v); po = pm->pm_owner; PMCLOG_RESERVE(po, PROCCSW, sizeof(struct pmclog_proccsw)); PMCLOG_EMIT32(pm->pm_id); PMCLOG_EMIT64(v); PMCLOG_EMIT32(pp->pp_proc->p_pid); PMCLOG_DESPATCH(po); } void pmclog_process_procexec(struct pmc_owner *po, pmc_id_t pmid, pid_t pid, uintfptr_t startaddr, char *path) { int pathlen, recordlen; PMCDBG3(LOG,EXC,1,"po=%p pid=%d path=\"%s\"", po, pid, path); pathlen = strlen(path) + 1; /* #bytes for the path */ recordlen = offsetof(struct pmclog_procexec, pl_pathname) + pathlen; PMCLOG_RESERVE(po, PROCEXEC, recordlen); PMCLOG_EMIT32(pid); PMCLOG_EMITADDR(startaddr); PMCLOG_EMIT32(pmid); PMCLOG_EMITSTRING(path,pathlen); PMCLOG_DESPATCH(po); } /* * Log a process exit event (and accumulated pmc value) to the log file. */ void pmclog_process_procexit(struct pmc *pm, struct pmc_process *pp) { int ri; struct pmc_owner *po; ri = PMC_TO_ROWINDEX(pm); PMCDBG3(LOG,EXT,1,"pm=%p pid=%d v=%jx", pm, pp->pp_proc->p_pid, pp->pp_pmcs[ri].pp_pmcval); po = pm->pm_owner; PMCLOG_RESERVE(po, PROCEXIT, sizeof(struct pmclog_procexit)); PMCLOG_EMIT32(pm->pm_id); PMCLOG_EMIT64(pp->pp_pmcs[ri].pp_pmcval); PMCLOG_EMIT32(pp->pp_proc->p_pid); PMCLOG_DESPATCH(po); } /* * Log a fork event. */ void pmclog_process_procfork(struct pmc_owner *po, pid_t oldpid, pid_t newpid) { PMCLOG_RESERVE(po, PROCFORK, sizeof(struct pmclog_procfork)); PMCLOG_EMIT32(oldpid); PMCLOG_EMIT32(newpid); PMCLOG_DESPATCH(po); } /* * Log a process exit event of the form suitable for system-wide PMCs. */ void pmclog_process_sysexit(struct pmc_owner *po, pid_t pid) { PMCLOG_RESERVE(po, SYSEXIT, sizeof(struct pmclog_sysexit)); PMCLOG_EMIT32(pid); PMCLOG_DESPATCH(po); } /* * Write a user log entry. */ int pmclog_process_userlog(struct pmc_owner *po, struct pmc_op_writelog *wl) { int error; PMCDBG2(LOG,WRI,1, "writelog po=%p ud=0x%x", po, wl->pm_userdata); error = 0; PMCLOG_RESERVE_WITH_ERROR(po, USERDATA, sizeof(struct pmclog_userdata)); PMCLOG_EMIT32(wl->pm_userdata); PMCLOG_DESPATCH(po); error: return (error); } /* * Initialization. * * Create a pool of log buffers and initialize mutexes. */ void pmclog_initialize() { int n; struct pmclog_buffer *plb; if (pmclog_buffer_size <= 0) { (void) printf("hwpmc: tunable logbuffersize=%d must be " "greater than zero.\n", pmclog_buffer_size); pmclog_buffer_size = PMC_LOG_BUFFER_SIZE; } if (pmc_nlogbuffers <= 0) { (void) printf("hwpmc: tunable nlogbuffers=%d must be greater " "than zero.\n", pmc_nlogbuffers); pmc_nlogbuffers = PMC_NLOGBUFFERS; } /* create global pool of log buffers */ for (n = 0; n < pmc_nlogbuffers; n++) { plb = malloc(1024 * pmclog_buffer_size, M_PMC, M_WAITOK|M_ZERO); PMCLOG_INIT_BUFFER_DESCRIPTOR(plb); TAILQ_INSERT_HEAD(&pmc_bufferlist, plb, plb_next); } mtx_init(&pmc_bufferlist_mtx, "pmc-buffer-list", "pmc-leaf", MTX_SPIN); mtx_init(&pmc_kthread_mtx, "pmc-kthread", "pmc-sleep", MTX_DEF); } /* * Shutdown logging. * * Destroy mutexes and release memory back the to free pool. */ void pmclog_shutdown() { struct pmclog_buffer *plb; mtx_destroy(&pmc_kthread_mtx); mtx_destroy(&pmc_bufferlist_mtx); while ((plb = TAILQ_FIRST(&pmc_bufferlist)) != NULL) { TAILQ_REMOVE(&pmc_bufferlist, plb, plb_next); free(plb, M_PMC); } } Index: head/sys/fs/fdescfs/fdesc_vnops.c =================================================================== --- head/sys/fs/fdescfs/fdesc_vnops.c (revision 333424) +++ head/sys/fs/fdescfs/fdesc_vnops.c (revision 333425) @@ -1,659 +1,657 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fdesc_vnops.c 8.9 (Berkeley) 1/21/94 * * $FreeBSD$ */ /* * /dev/fd Filesystem */ #include #include #include #include #include #include #include /* boottime */ #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #define NFDCACHE 4 #define FD_NHASH(ix) \ (&fdhashtbl[(ix) & fdhash]) static LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl; static u_long fdhash; struct mtx fdesc_hashmtx; static vop_getattr_t fdesc_getattr; static vop_lookup_t fdesc_lookup; static vop_open_t fdesc_open; static vop_pathconf_t fdesc_pathconf; static vop_readdir_t fdesc_readdir; static vop_readlink_t fdesc_readlink; static vop_reclaim_t fdesc_reclaim; static vop_setattr_t fdesc_setattr; static struct vop_vector fdesc_vnodeops = { .vop_default = &default_vnodeops, .vop_access = VOP_NULL, .vop_getattr = fdesc_getattr, .vop_lookup = fdesc_lookup, .vop_open = fdesc_open, .vop_pathconf = fdesc_pathconf, .vop_readdir = fdesc_readdir, .vop_readlink = fdesc_readlink, .vop_reclaim = fdesc_reclaim, .vop_setattr = fdesc_setattr, }; static void fdesc_insmntque_dtr(struct vnode *, void *); static void fdesc_remove_entry(struct fdescnode *); /* * Initialise cache headers */ int fdesc_init(struct vfsconf *vfsp) { mtx_init(&fdesc_hashmtx, "fdescfs_hash", NULL, MTX_DEF); fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash); return (0); } /* * Uninit ready for unload. */ int fdesc_uninit(struct vfsconf *vfsp) { hashdestroy(fdhashtbl, M_CACHE, fdhash); mtx_destroy(&fdesc_hashmtx); return (0); } /* * If allocating vnode fails, call this. */ static void fdesc_insmntque_dtr(struct vnode *vp, void *arg) { vgone(vp); vput(vp); } /* * Remove an entry from the hash if it exists. */ static void fdesc_remove_entry(struct fdescnode *fd) { struct fdhashhead *fc; struct fdescnode *fd2; fc = FD_NHASH(fd->fd_ix); mtx_lock(&fdesc_hashmtx); LIST_FOREACH(fd2, fc, fd_hash) { if (fd == fd2) { LIST_REMOVE(fd, fd_hash); break; } } mtx_unlock(&fdesc_hashmtx); } int fdesc_allocvp(fdntype ftype, unsigned fd_fd, int ix, struct mount *mp, struct vnode **vpp) { struct fdescmount *fmp; struct fdhashhead *fc; struct fdescnode *fd, *fd2; struct vnode *vp, *vp2; struct thread *td; int error; td = curthread; fc = FD_NHASH(ix); loop: mtx_lock(&fdesc_hashmtx); /* * If a forced unmount is progressing, we need to drop it. The flags are * protected by the hashmtx. */ fmp = mp->mnt_data; if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) { mtx_unlock(&fdesc_hashmtx); return (-1); } LIST_FOREACH(fd, fc, fd_hash) { if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) { /* Get reference to vnode in case it's being free'd */ vp = fd->fd_vnode; VI_LOCK(vp); mtx_unlock(&fdesc_hashmtx); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) goto loop; *vpp = vp; return (0); } } mtx_unlock(&fdesc_hashmtx); fd = malloc(sizeof(struct fdescnode), M_TEMP, M_WAITOK); error = getnewvnode("fdescfs", mp, &fdesc_vnodeops, &vp); if (error) { free(fd, M_TEMP); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_data = fd; fd->fd_vnode = vp; fd->fd_type = ftype; fd->fd_fd = fd_fd; fd->fd_ix = ix; if (ftype == Fdesc && fmp->flags & FMNT_LINRDLNKF) vp->v_vflag |= VV_READLINK; error = insmntque1(vp, mp, fdesc_insmntque_dtr, NULL); if (error != 0) { *vpp = NULLVP; return (error); } /* Make sure that someone didn't beat us when inserting the vnode. */ mtx_lock(&fdesc_hashmtx); /* * If a forced unmount is progressing, we need to drop it. The flags are * protected by the hashmtx. */ fmp = mp->mnt_data; if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) { mtx_unlock(&fdesc_hashmtx); vgone(vp); vput(vp); *vpp = NULLVP; return (-1); } LIST_FOREACH(fd2, fc, fd_hash) { if (fd2->fd_ix == ix && fd2->fd_vnode->v_mount == mp) { /* Get reference to vnode in case it's being free'd */ vp2 = fd2->fd_vnode; VI_LOCK(vp2); mtx_unlock(&fdesc_hashmtx); error = vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, td); /* Someone beat us, dec use count and wait for reclaim */ vgone(vp); vput(vp); /* If we didn't get it, return no vnode. */ if (error) vp2 = NULLVP; *vpp = vp2; return (error); } } /* If we came here, we can insert it safely. */ LIST_INSERT_HEAD(fc, fd, fd_hash); mtx_unlock(&fdesc_hashmtx); *vpp = vp; return (0); } struct fdesc_get_ino_args { fdntype ftype; unsigned fd_fd; int ix; struct file *fp; struct thread *td; }; static int fdesc_get_ino_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { struct fdesc_get_ino_args *a; int error; a = arg; error = fdesc_allocvp(a->ftype, a->fd_fd, a->ix, mp, rvp); fdrop(a->fp, a->td); return (error); } /* * vp is the current namei directory * ndp is the name to locate in that directory... */ static int fdesc_lookup(struct vop_lookup_args *ap) { struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; char *pname = cnp->cn_nameptr; struct thread *td = cnp->cn_thread; struct file *fp; struct fdesc_get_ino_args arg; - cap_rights_t rights; int nlen = cnp->cn_namelen; u_int fd, fd1; int error; struct vnode *fvp; if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad; } if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; VREF(dvp); return (0); } if (VTOFDESC(dvp)->fd_type != Froot) { error = ENOTDIR; goto bad; } fd = 0; /* the only time a leading 0 is acceptable is if it's "0" */ if (*pname == '0' && nlen != 1) { error = ENOENT; goto bad; } while (nlen--) { if (*pname < '0' || *pname > '9') { error = ENOENT; goto bad; } fd1 = 10 * fd + *pname++ - '0'; if (fd1 < fd) { error = ENOENT; goto bad; } fd = fd1; } /* * No rights to check since 'fp' isn't actually used. */ - if ((error = fget(td, fd, cap_rights_init(&rights), &fp)) != 0) + if ((error = fget(td, fd, &cap_no_rights, &fp)) != 0) goto bad; /* Check if we're looking up ourselves. */ if (VTOFDESC(dvp)->fd_ix == FD_DESC + fd) { /* * In case we're holding the last reference to the file, the dvp * will be re-acquired. */ vhold(dvp); VOP_UNLOCK(dvp, 0); fdrop(fp, td); /* Re-aquire the lock afterwards. */ vn_lock(dvp, LK_RETRY | LK_EXCLUSIVE); vdrop(dvp); fvp = dvp; if ((dvp->v_iflag & VI_DOOMED) != 0) error = ENOENT; } else { /* * Unlock our root node (dvp) when doing this, since we might * deadlock since the vnode might be locked by another thread * and the root vnode lock will be obtained afterwards (in case * we're looking up the fd of the root vnode), which will be the * opposite lock order. Vhold the root vnode first so we don't * lose it. */ arg.ftype = Fdesc; arg.fd_fd = fd; arg.ix = FD_DESC + fd; arg.fp = fp; arg.td = td; error = vn_vget_ino_gen(dvp, fdesc_get_ino_alloc, &arg, LK_EXCLUSIVE, &fvp); } if (error) goto bad; *vpp = fvp; return (0); bad: *vpp = NULL; return (error); } static int fdesc_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; if (VTOFDESC(vp)->fd_type == Froot) return (0); /* * XXX Kludge: set td->td_proc->p_dupfd to contain the value of the file * descriptor being sought for duplication. The error return ensures * that the vnode for this device will be released by vn_open. Open * will detect this special error and take the actions in dupfdopen. * Other callers of vn_open or VOP_OPEN will simply report the * error. */ ap->a_td->td_dupfd = VTOFDESC(vp)->fd_fd; /* XXX */ return (ENODEV); } static int fdesc_pathconf(struct vop_pathconf_args *ap) { struct vnode *vp = ap->a_vp; int error; switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: if (VTOFDESC(vp)->fd_type == Froot) *ap->a_retval = 2; else *ap->a_retval = 1; return (0); default: if (VTOFDESC(vp)->fd_type == Froot) return (vop_stdpathconf(ap)); vref(vp); VOP_UNLOCK(vp, 0); error = kern_fpathconf(curthread, VTOFDESC(vp)->fd_fd, ap->a_name, ap->a_retval); vn_lock(vp, LK_SHARED | LK_RETRY); vunref(vp); return (error); } } static int fdesc_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct timeval boottime; getboottime(&boottime); vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; vap->va_fileid = VTOFDESC(vp)->fd_ix; vap->va_uid = 0; vap->va_gid = 0; vap->va_blocksize = DEV_BSIZE; vap->va_atime.tv_sec = boottime.tv_sec; vap->va_atime.tv_nsec = 0; vap->va_mtime = vap->va_atime; vap->va_ctime = vap->va_mtime; vap->va_gen = 0; vap->va_flags = 0; vap->va_bytes = 0; vap->va_filerev = 0; switch (VTOFDESC(vp)->fd_type) { case Froot: vap->va_type = VDIR; vap->va_nlink = 2; vap->va_size = DEV_BSIZE; vap->va_rdev = NODEV; break; case Fdesc: vap->va_type = (vp->v_vflag & VV_READLINK) == 0 ? VCHR : VLNK; vap->va_nlink = 1; vap->va_size = 0; vap->va_rdev = makedev(0, vap->va_fileid); break; default: panic("fdesc_getattr"); break; } vp->v_type = vap->va_type; return (0); } static int fdesc_setattr(struct vop_setattr_args *ap) { struct vattr *vap = ap->a_vap; struct vnode *vp; struct mount *mp; struct file *fp; struct thread *td = curthread; cap_rights_t rights; unsigned fd; int error; /* * Can't mess with the root vnode */ if (VTOFDESC(ap->a_vp)->fd_type == Froot) return (EACCES); fd = VTOFDESC(ap->a_vp)->fd_fd; /* * Allow setattr where there is an underlying vnode. */ error = getvnode(td, fd, cap_rights_init(&rights, CAP_EXTATTR_SET), &fp); if (error) { /* * getvnode() returns EINVAL if the file descriptor is not * backed by a vnode. Silently drop all changes except * chflags(2) in this case. */ if (error == EINVAL) { if (vap->va_flags != VNOVAL) error = EOPNOTSUPP; else error = 0; } return (error); } vp = fp->f_vnode; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); } fdrop(fp, td); return (error); } #define UIO_MX _GENERIC_DIRLEN(10) /* number of symbols in INT_MAX printout */ static int fdesc_readdir(struct vop_readdir_args *ap) { struct fdescmount *fmp; struct uio *uio = ap->a_uio; struct filedesc *fdp; struct dirent d; struct dirent *dp = &d; int error, i, off, fcnt; if (VTOFDESC(ap->a_vp)->fd_type != Froot) panic("fdesc_readdir: not dir"); fmp = VFSTOFDESC(ap->a_vp->v_mount); if (ap->a_ncookies != NULL) *ap->a_ncookies = 0; off = (int)uio->uio_offset; if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 || uio->uio_resid < UIO_MX) return (EINVAL); i = (u_int)off / UIO_MX; fdp = uio->uio_td->td_proc->p_fd; error = 0; fcnt = i - 2; /* The first two nodes are `.' and `..' */ FILEDESC_SLOCK(fdp); while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) { bzero((caddr_t)dp, UIO_MX); switch (i) { case 0: /* `.' */ case 1: /* `..' */ dp->d_fileno = i + FD_ROOT; dp->d_namlen = i + 1; dp->d_reclen = UIO_MX; bcopy("..", dp->d_name, dp->d_namlen); dp->d_name[i + 1] = '\0'; dp->d_type = DT_DIR; break; default: if (fdp->fd_ofiles[fcnt].fde_file == NULL) break; dp->d_namlen = sprintf(dp->d_name, "%d", fcnt); dp->d_reclen = UIO_MX; dp->d_type = (fmp->flags & FMNT_LINRDLNKF) == 0 ? DT_CHR : DT_LNK; dp->d_fileno = i + FD_DESC; break; } if (dp->d_namlen != 0) { /* * And ship to userland */ FILEDESC_SUNLOCK(fdp); error = uiomove(dp, UIO_MX, uio); if (error) goto done; FILEDESC_SLOCK(fdp); } i++; fcnt++; } FILEDESC_SUNLOCK(fdp); done: uio->uio_offset = i * UIO_MX; return (error); } static int fdesc_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; struct fdescnode *fd; vp = ap->a_vp; fd = VTOFDESC(vp); fdesc_remove_entry(fd); free(vp->v_data, M_TEMP); vp->v_data = NULL; return (0); } static int fdesc_readlink(struct vop_readlink_args *va) { struct vnode *vp, *vn; - cap_rights_t rights; struct thread *td; struct uio *uio; struct file *fp; char *freepath, *fullpath; size_t pathlen; int lockflags, fd_fd; int error; freepath = NULL; vn = va->a_vp; if (VTOFDESC(vn)->fd_type != Fdesc) panic("fdesc_readlink: not fdescfs link"); fd_fd = ((struct fdescnode *)vn->v_data)->fd_fd; lockflags = VOP_ISLOCKED(vn); VOP_UNLOCK(vn, 0); td = curthread; - error = fget_cap(td, fd_fd, cap_rights_init(&rights), &fp, NULL); + error = fget_cap(td, fd_fd, &cap_no_rights, &fp, NULL); if (error != 0) goto out; switch (fp->f_type) { case DTYPE_VNODE: vp = fp->f_vnode; error = vn_fullpath(td, vp, &fullpath, &freepath); break; default: fullpath = "anon_inode:[unknown]"; break; } if (error == 0) { uio = va->a_uio; pathlen = strlen(fullpath); error = uiomove(fullpath, pathlen, uio); } if (freepath != NULL) free(freepath, M_TEMP); fdrop(fp, td); out: vn_lock(vn, lockflags | LK_RETRY); return (error); } Index: head/sys/fs/fuse/fuse_vfsops.c =================================================================== --- head/sys/fs/fuse/fuse_vfsops.c (revision 333424) +++ head/sys/fs/fuse/fuse_vfsops.c (revision 333425) @@ -1,536 +1,535 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_param.h" #include "fuse_node.h" #include "fuse_ipc.h" #include "fuse_internal.h" #include #include #define FUSE_DEBUG_MODULE VFSOPS #include "fuse_debug.h" /* This will do for privilege types for now */ #ifndef PRIV_VFS_FUSE_ALLOWOTHER #define PRIV_VFS_FUSE_ALLOWOTHER PRIV_VFS_MOUNT_NONUSER #endif #ifndef PRIV_VFS_FUSE_MOUNT_NONUSER #define PRIV_VFS_FUSE_MOUNT_NONUSER PRIV_VFS_MOUNT_NONUSER #endif #ifndef PRIV_VFS_FUSE_SYNC_UNMOUNT #define PRIV_VFS_FUSE_SYNC_UNMOUNT PRIV_VFS_MOUNT_NONUSER #endif static vfs_mount_t fuse_vfsop_mount; static vfs_unmount_t fuse_vfsop_unmount; static vfs_root_t fuse_vfsop_root; static vfs_statfs_t fuse_vfsop_statfs; struct vfsops fuse_vfsops = { .vfs_mount = fuse_vfsop_mount, .vfs_unmount = fuse_vfsop_unmount, .vfs_root = fuse_vfsop_root, .vfs_statfs = fuse_vfsop_statfs, }; SYSCTL_INT(_vfs_fuse, OID_AUTO, init_backgrounded, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, 1, "indicate async handshake"); static int fuse_enforce_dev_perms = 0; SYSCTL_INT(_vfs_fuse, OID_AUTO, enforce_dev_perms, CTLFLAG_RW, &fuse_enforce_dev_perms, 0, "enforce fuse device permissions for secondary mounts"); static unsigned sync_unmount = 1; SYSCTL_UINT(_vfs_fuse, OID_AUTO, sync_unmount, CTLFLAG_RW, &sync_unmount, 0, "specify when to use synchronous unmount"); MALLOC_DEFINE(M_FUSEVFS, "fuse_filesystem", "buffer for fuse vfs layer"); static int fuse_getdevice(const char *fspec, struct thread *td, struct cdev **fdevp) { struct nameidata nd, *ndp = &nd; struct vnode *devvp; struct cdev *fdev; int err; /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, fspec, td); if ((err = namei(ndp)) != 0) return err; NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (devvp->v_type != VCHR) { vrele(devvp); return ENXIO; } fdev = devvp->v_rdev; dev_ref(fdev); if (fuse_enforce_dev_perms) { /* * Check if mounter can open the fuse device. * * This has significance only if we are doing a secondary mount * which doesn't involve actually opening fuse devices, but we * still want to enforce the permissions of the device (in * order to keep control over the circle of fuse users). * * (In case of primary mounts, we are either the superuser so * we can do anything anyway, or we can mount only if the * device is already opened by us, ie. we are permitted to open * the device.) */ #if 0 #ifdef MAC err = mac_check_vnode_open(td->td_ucred, devvp, VREAD | VWRITE); if (!err) #endif #endif /* 0 */ err = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (err) { vrele(devvp); dev_rel(fdev); return err; } } /* * according to coda code, no extra lock is needed -- * although in sys/vnode.h this field is marked "v" */ vrele(devvp); if (!fdev->si_devsw || strcmp("fuse", fdev->si_devsw->d_name)) { dev_rel(fdev); return ENXIO; } *fdevp = fdev; return 0; } #define FUSE_FLAGOPT(fnam, fval) do { \ vfs_flagopt(opts, #fnam, &mntopts, fval); \ vfs_flagopt(opts, "__" #fnam, &__mntopts, fval); \ } while (0) static int fuse_vfsop_mount(struct mount *mp) { int err; uint64_t mntopts, __mntopts; int max_read_set; uint32_t max_read; int daemon_timeout; int fd; size_t len; struct cdev *fdev; struct fuse_data *data; struct thread *td; struct file *fp, *fptmp; char *fspec, *subtype; struct vfsoptlist *opts; - cap_rights_t rights; subtype = NULL; max_read_set = 0; max_read = ~0; err = 0; mntopts = 0; __mntopts = 0; td = curthread; fuse_trace_printf_vfsop(); if (mp->mnt_flag & MNT_UPDATE) return EOPNOTSUPP; MNT_ILOCK(mp); mp->mnt_flag |= MNT_SYNCHRONOUS; mp->mnt_data = NULL; MNT_IUNLOCK(mp); /* Get the new options passed to mount */ opts = mp->mnt_optnew; if (!opts) return EINVAL; /* `fspath' contains the mount point (eg. /mnt/fuse/sshfs); REQUIRED */ if (!vfs_getopts(opts, "fspath", &err)) return err; /* `from' contains the device name (eg. /dev/fuse0); REQUIRED */ fspec = vfs_getopts(opts, "from", &err); if (!fspec) return err; /* `fd' contains the filedescriptor for this session; REQUIRED */ if (vfs_scanopt(opts, "fd", "%d", &fd) != 1) return EINVAL; err = fuse_getdevice(fspec, td, &fdev); if (err != 0) return err; /* * With the help of underscored options the mount program * can inform us from the flags it sets by default */ FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY); FUSE_FLAGOPT(push_symlinks_in, FSESS_PUSH_SYMLINKS_IN); FUSE_FLAGOPT(default_permissions, FSESS_DEFAULT_PERMISSIONS); FUSE_FLAGOPT(no_attrcache, FSESS_NO_ATTRCACHE); FUSE_FLAGOPT(no_readahed, FSESS_NO_READAHEAD); FUSE_FLAGOPT(no_datacache, FSESS_NO_DATACACHE); FUSE_FLAGOPT(no_namecache, FSESS_NO_NAMECACHE); FUSE_FLAGOPT(no_mmap, FSESS_NO_MMAP); FUSE_FLAGOPT(brokenio, FSESS_BROKENIO); if (vfs_scanopt(opts, "max_read=", "%u", &max_read) == 1) max_read_set = 1; if (vfs_scanopt(opts, "timeout=", "%u", &daemon_timeout) == 1) { if (daemon_timeout < FUSE_MIN_DAEMON_TIMEOUT) daemon_timeout = FUSE_MIN_DAEMON_TIMEOUT; else if (daemon_timeout > FUSE_MAX_DAEMON_TIMEOUT) daemon_timeout = FUSE_MAX_DAEMON_TIMEOUT; } else { daemon_timeout = FUSE_DEFAULT_DAEMON_TIMEOUT; } subtype = vfs_getopts(opts, "subtype=", &err); FS_DEBUG2G("mntopts 0x%jx\n", (uintmax_t)mntopts); - err = fget(td, fd, cap_rights_init(&rights, CAP_READ), &fp); + err = fget(td, fd, &cap_read_rights, &fp); if (err != 0) { FS_DEBUG("invalid or not opened device: data=%p\n", data); goto out; } fptmp = td->td_fpop; td->td_fpop = fp; err = devfs_get_cdevpriv((void **)&data); td->td_fpop = fptmp; fdrop(fp, td); FUSE_LOCK(); if (err != 0 || data == NULL || data->mp != NULL) { FS_DEBUG("invalid or not opened device: data=%p data.mp=%p\n", data, data != NULL ? data->mp : NULL); err = ENXIO; FUSE_UNLOCK(); goto out; } if (fdata_get_dead(data)) { FS_DEBUG("device is dead during mount: data=%p\n", data); err = ENOTCONN; FUSE_UNLOCK(); goto out; } /* Sanity + permission checks */ if (!data->daemoncred) panic("fuse daemon found, but identity unknown"); if (mntopts & FSESS_DAEMON_CAN_SPY) err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER); if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid) /* are we allowed to do the first mount? */ err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER); if (err) { FUSE_UNLOCK(); goto out; } data->ref++; data->mp = mp; data->dataflags |= mntopts; data->max_read = max_read; data->daemon_timeout = daemon_timeout; FUSE_UNLOCK(); vfs_getnewfsid(mp); MNT_ILOCK(mp); mp->mnt_data = data; mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_USES_BCACHE; MNT_IUNLOCK(mp); /* We need this here as this slot is used by getnewvnode() */ mp->mnt_stat.f_iosize = PAGE_SIZE; if (subtype) { strlcat(mp->mnt_stat.f_fstypename, ".", MFSNAMELEN); strlcat(mp->mnt_stat.f_fstypename, subtype, MFSNAMELEN); } copystr(fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &len); bzero(mp->mnt_stat.f_mntfromname + len, MNAMELEN - len); FS_DEBUG2G("mp %p: %s\n", mp, mp->mnt_stat.f_mntfromname); /* Now handshaking with daemon */ fuse_internal_send_init(data, td); out: if (err) { FUSE_LOCK(); if (data->mp == mp) { /* * Destroy device only if we acquired reference to * it */ FS_DEBUG("mount failed, destroy device: data=%p mp=%p" " err=%d\n", data, mp, err); data->mp = NULL; fdata_trydestroy(data); } FUSE_UNLOCK(); dev_rel(fdev); } return err; } static int fuse_vfsop_unmount(struct mount *mp, int mntflags) { int err = 0; int flags = 0; struct cdev *fdev; struct fuse_data *data; struct fuse_dispatcher fdi; struct thread *td = curthread; fuse_trace_printf_vfsop(); if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } data = fuse_get_mpdata(mp); if (!data) { panic("no private data for mount point?"); } /* There is 1 extra root vnode reference (mp->mnt_data). */ FUSE_LOCK(); if (data->vroot != NULL) { struct vnode *vroot = data->vroot; data->vroot = NULL; FUSE_UNLOCK(); vrele(vroot); } else FUSE_UNLOCK(); err = vflush(mp, 0, flags, td); if (err) { debug_printf("vflush failed"); return err; } if (fdata_get_dead(data)) { goto alreadydead; } fdisp_init(&fdi, 0); fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL); err = fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); fdata_set_dead(data); alreadydead: FUSE_LOCK(); data->mp = NULL; fdev = data->fdev; fdata_trydestroy(data); FUSE_UNLOCK(); MNT_ILOCK(mp); mp->mnt_data = NULL; mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); dev_rel(fdev); return 0; } static int fuse_vfsop_root(struct mount *mp, int lkflags, struct vnode **vpp) { struct fuse_data *data = fuse_get_mpdata(mp); int err = 0; if (data->vroot != NULL) { err = vget(data->vroot, lkflags, curthread); if (err == 0) *vpp = data->vroot; } else { err = fuse_vnode_get(mp, FUSE_ROOT_ID, NULL, vpp, NULL, VDIR); if (err == 0) { FUSE_LOCK(); MPASS(data->vroot == NULL || data->vroot == *vpp); if (data->vroot == NULL) { FS_DEBUG("new root vnode\n"); data->vroot = *vpp; FUSE_UNLOCK(); vref(*vpp); } else if (data->vroot != *vpp) { FS_DEBUG("root vnode race\n"); FUSE_UNLOCK(); VOP_UNLOCK(*vpp, 0); vrele(*vpp); vrecycle(*vpp); *vpp = data->vroot; } else FUSE_UNLOCK(); } } return err; } static int fuse_vfsop_statfs(struct mount *mp, struct statfs *sbp) { struct fuse_dispatcher fdi; int err = 0; struct fuse_statfs_out *fsfo; struct fuse_data *data; FS_DEBUG2G("mp %p: %s\n", mp, mp->mnt_stat.f_mntfromname); data = fuse_get_mpdata(mp); if (!(data->dataflags & FSESS_INITED)) goto fake; fdisp_init(&fdi, 0); fdisp_make(&fdi, FUSE_STATFS, mp, FUSE_ROOT_ID, NULL, NULL); err = fdisp_wait_answ(&fdi); if (err) { fdisp_destroy(&fdi); if (err == ENOTCONN) { /* * We want to seem a legitimate fs even if the daemon * is stiff dead... (so that, eg., we can still do path * based unmounting after the daemon dies). */ goto fake; } return err; } fsfo = fdi.answ; sbp->f_blocks = fsfo->st.blocks; sbp->f_bfree = fsfo->st.bfree; sbp->f_bavail = fsfo->st.bavail; sbp->f_files = fsfo->st.files; sbp->f_ffree = fsfo->st.ffree; /* cast from uint64_t to int64_t */ sbp->f_namemax = fsfo->st.namelen; sbp->f_bsize = fsfo->st.frsize; /* cast from uint32_t to uint64_t */ FS_DEBUG("fuse_statfs_out -- blocks: %llu, bfree: %llu, bavail: %llu, " "fil es: %llu, ffree: %llu, bsize: %i, namelen: %i\n", (unsigned long long)fsfo->st.blocks, (unsigned long long)fsfo->st.bfree, (unsigned long long)fsfo->st.bavail, (unsigned long long)fsfo->st.files, (unsigned long long)fsfo->st.ffree, fsfo->st.bsize, fsfo->st.namelen); fdisp_destroy(&fdi); return 0; fake: sbp->f_blocks = 0; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 0; sbp->f_ffree = 0; sbp->f_namemax = 0; sbp->f_bsize = FUSE_DEFAULT_BLOCKSIZE; return 0; } Index: head/sys/kern/kern_descrip.c =================================================================== --- head/sys/kern/kern_descrip.c (revision 333424) +++ head/sys/kern/kern_descrip.c (revision 333425) @@ -1,4260 +1,4249 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); MALLOC_DECLARE(M_FADVISE); static __read_mostly uma_zone_t file_zone; static __read_mostly uma_zone_t filedesc0_zone; static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, int holdleaders); static int fd_first_free(struct filedesc *fdp, int low, int size); static int fd_last_used(struct filedesc *fdp, int size); static void fdgrowtable(struct filedesc *fdp, int nfd); static void fdgrowtable_exp(struct filedesc *fdp, int nfd); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); static int getmaxfd(struct thread *td); static u_long *filecaps_copy_prep(const struct filecaps *src); static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls); static u_long *filecaps_free_prep(struct filecaps *fcaps); static void filecaps_free_finish(u_long *ioctls); /* * Each process has: * * - An array of open file descriptors (fd_ofiles) * - An array of file flags (fd_ofileflags) * - A bitmap recording which descriptors are in use (fd_map) * * A process starts out with NDFILE descriptors. The value of NDFILE has * been selected based the historical limit of 20 open files, and an * assumption that the majority of processes, especially short-lived * processes like shells, will never need more. * * If this initial allocation is exhausted, a larger descriptor table and * map are allocated dynamically, and the pointers in the process's struct * filedesc are updated to point to those. This is repeated every time * the process runs out of file descriptors (provided it hasn't hit its * resource limit). * * Since threads may hold references to individual descriptor table * entries, the tables are never freed. Instead, they are placed on a * linked list and freed only when the struct filedesc is released. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) /* * SLIST entry used to keep track of ofiles which must be reclaimed when * the process exits. */ struct freetable { struct fdescenttbl *ft_table; SLIST_ENTRY(freetable) ft_next; }; /* * Initial allocation: a filedesc structure + the head of SLIST used to * keep track of old ofiles + enough space for NDFILE descriptors. */ struct fdescenttbl0 { int fdt_nfiles; struct filedescent fdt_ofiles[NDFILE]; }; struct filedesc0 { struct filedesc fd_fd; SLIST_HEAD(, freetable) fd_free; struct fdescenttbl0 fd_dfiles; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ volatile int __exclusive_cache_line openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* * If low >= size, just return low. Otherwise find the first zero bit in the * given bitmap, starting at low and not exceeding size - 1. Return size if * not found. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the highest non-zero bit in the given bitmap, starting at 0 and * not exceeding size - 1. Return -1 if not found. */ static int fd_last_used(struct filedesc *fdp, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, minoff; off = NDSLOT(size); if (size % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES)); if ((mask &= map[off]) != 0) return (off * NDENTRIES + flsl(mask) - 1); --off; } for (minoff = NDSLOT(0); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (-1); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused_init(struct filedesc *fdp, int fd) { KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); } static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); fdused_init(fdp, fd); if (fd > fdp->fd_lastfile) fdp->fd_lastfile = fd; if (fd == fdp->fd_freefile) fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles); } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("fd=%d is still in use", fd)); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; if (fd == fdp->fd_lastfile) fdp->fd_lastfile = fd_last_used(fdp, fd); } /* * Free a file descriptor. * * Avoid some work if fdp is about to be destroyed. */ static inline void fdefree_last(struct filedescent *fde) { filecaps_free(&fde->fde_caps); } static inline void fdfree(struct filedesc *fdp, int fd) { struct filedescent *fde; fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seq_write_begin(&fde->fde_seq); #endif fde->fde_file = NULL; #ifdef CAPABILITIES seq_write_end(&fde->fde_seq); #endif fdefree_last(fde); fdunused(fdp, fd); } void pwd_ensure_dirs(void) { struct filedesc *fdp; fdp = curproc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_cdir == NULL) { fdp->fd_cdir = rootvnode; vrefact(rootvnode); } if (fdp->fd_rdir == NULL) { fdp->fd_rdir = rootvnode; vrefact(rootvnode); } FILEDESC_XUNLOCK(fdp); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) { #ifdef RACCT uint64_t lim; #endif td->td_retval[0] = min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc); #ifdef RACCT PROC_LOCK(td->td_proc); lim = racct_get_limit(td->td_proc, RACCT_NOFILE); PROC_UNLOCK(td->td_proc); if (lim < td->td_retval[0]) td->td_retval[0] = lim; #endif return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int sys_dup2(struct thread *td, struct dup2_args *uap) { return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int sys_dup(struct thread *td, struct dup_args *uap) { return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int sys_fcntl(struct thread *td, struct fcntl_args *uap) { return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); } int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) { struct flock fl; struct __oflock ofl; intptr_t arg1; int error, newcmd; error = 0; newcmd = cmd; switch (cmd) { case F_OGETLK: case F_OSETLK: case F_OSETLKW: /* * Convert old flock structure to new. */ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); fl.l_start = ofl.l_start; fl.l_len = ofl.l_len; fl.l_pid = ofl.l_pid; fl.l_type = ofl.l_type; fl.l_whence = ofl.l_whence; fl.l_sysid = 0; switch (cmd) { case F_OGETLK: newcmd = F_GETLK; break; case F_OSETLK: newcmd = F_SETLK; break; case F_OSETLKW: newcmd = F_SETLKW; break; } arg1 = (intptr_t)&fl; break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_SETLK_REMOTE: error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); arg1 = (intptr_t)&fl; break; default: arg1 = arg; break; } if (error) return (error); error = kern_fcntl(td, fd, newcmd, arg1); if (error) return (error); if (cmd == F_OGETLK) { ofl.l_start = fl.l_start; ofl.l_len = fl.l_len; ofl.l_pid = fl.l_pid; ofl.l_type = fl.l_type; ofl.l_whence = fl.l_whence; error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); } else if (cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); } return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp, *fp2; struct filedescent *fde; struct proc *p; struct vnode *vp; - cap_rights_t rights; int error, flg, tmp; uint64_t bsize; off_t foffset; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; AUDIT_ARG_FD(cmd); AUDIT_ARG_CMD(cmd); switch (cmd) { case F_DUPFD: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp); break; case F_DUPFD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_DUP2FD: tmp = arg; error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); break; case F_DUP2FD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_GETFD: error = EBADF; FILEDESC_SLOCK(fdp); fde = fdeget_locked(fdp, fd); if (fde != NULL) { td->td_retval[0] = (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; error = 0; } FILEDESC_SUNLOCK(fdp); break; case F_SETFD: error = EBADF; FILEDESC_XLOCK(fdp); fde = fdeget_locked(fdp, fd); if (fde != NULL) { fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | (arg & FD_CLOEXEC ? UF_EXCLOSE : 0); error = 0; } FILEDESC_XUNLOCK(fdp); break; case F_GETFL: - error = fget_fcntl(td, fd, - cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp); + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp); if (error != 0) break; td->td_retval[0] = OFLAGS(fp->f_flag); fdrop(fp, td); break; case F_SETFL: - error = fget_fcntl(td, fd, - cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp); + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp); if (error != 0) break; do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error != 0) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_GETOWN: - error = fget_fcntl(td, fd, - cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp); + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp); if (error != 0) break; error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: - error = fget_fcntl(td, fd, - cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp); + error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp); if (error != 0) break; tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLK_REMOTE: error = priv_check(td, PRIV_NFS_LOCKD); if (error) return (error); flg = F_REMOTE; goto do_setlk; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: do_setlk: - cap_rights_init(&rights, CAP_FLOCK); - error = fget_unlocked(fdp, fd, &rights, &fp, NULL); + error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if (foffset < 0 || (flp->l_start > 0 && foffset > OFF_MAX - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, flg); break; case F_UNLCKSYS: /* * Temporary api for testing remote lock * infrastructure. */ if (flg != F_REMOTE) { error = EINVAL; break; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } if (error != 0 || flp->l_type == F_UNLCK || flp->l_type == F_UNLCKSYS) { fdrop(fp, td); break; } /* * Check for a race with close. * * The vnode is now advisory locked (or unlocked, but this case * is not really important) as the caller requested. * We had to drop the filedesc lock, so we need to recheck if * the descriptor is still valid, because if it was closed * in the meantime we need to remove advisory lock from the * vnode - close on any descriptor leading to an advisory * locked vnode, removes that lock. * We will return 0 on purpose in that case, as the result of * successful advisory lock might have been externally visible * already. This is fine - effectively we pretend to the caller * that the closing thread was a bit slower and that the * advisory lock succeeded before the close. */ - error = fget_unlocked(fdp, fd, &rights, &fp2, NULL); + error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2, NULL); if (error != 0) { fdrop(fp, td); break; } if (fp != fp2) { flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } fdrop(fp, td); fdrop(fp2, td); break; case F_GETLK: - error = fget_unlocked(fdp, fd, - cap_rights_init(&rights, CAP_FLOCK), &fp, NULL); + error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { error = EINVAL; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if ((flp->l_start > 0 && foffset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && foffset < OFF_MIN - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; case F_RDAHEAD: arg = arg ? 128 * 1024: 0; /* FALLTHROUGH */ case F_READAHEAD: - error = fget_unlocked(fdp, fd, - cap_rights_init(&rights), &fp, NULL); + error = fget_unlocked(fdp, fd, &cap_no_rights, &fp, NULL); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; /* * Exclusive lock synchronizes against f_seqcount reads and * writes in sequential_heuristic(). */ error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { fdrop(fp, td); break; } if (arg >= 0) { bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; fp->f_seqcount = (arg + bsize - 1) / bsize; atomic_set_int(&fp->f_flag, FRDAHEAD); } else { atomic_clear_int(&fp->f_flag, FRDAHEAD); } VOP_UNLOCK(vp, 0); fdrop(fp, td); break; default: error = EINVAL; break; } return (error); } static int getmaxfd(struct thread *td) { return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); } /* * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). */ int kern_dup(struct thread *td, u_int mode, int flags, int old, int new) { struct filedesc *fdp; struct filedescent *oldfde, *newfde; struct proc *p; struct file *delfp; u_long *oioctls, *nioctls; int error, maxfd; p = td->td_proc; fdp = p->p_fd; MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); MPASS(mode < FDDUP_LASTMODE); AUDIT_ARG_FD(old); /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */ /* * Verify we have a valid descriptor to dup from and possibly to * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should * return EINVAL when the new descriptor is out of bounds. */ if (old < 0) return (EBADF); if (new < 0) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); maxfd = getmaxfd(td); if (new >= maxfd) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); error = EBADF; FILEDESC_XLOCK(fdp); if (fget_locked(fdp, old) == NULL) goto unlock; if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) { td->td_retval[0] = new; if (flags & FDDUP_FLAG_CLOEXEC) fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; error = 0; goto unlock; } /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. */ switch (mode) { case FDDUP_NORMAL: case FDDUP_FCNTL: if ((error = fdalloc(td, new, &new)) != 0) goto unlock; break; case FDDUP_MUSTREPLACE: /* Target file descriptor must exist. */ if (fget_locked(fdp, new) == NULL) goto unlock; break; case FDDUP_FIXED: if (new >= fdp->fd_nfiles) { /* * The resource limits are here instead of e.g. * fdalloc(), because the file descriptor table may be * shared between processes, so we can't really use * racct_add()/racct_sub(). Instead of counting the * number of actually allocated descriptors, just put * the limit on the size of the file descriptor table. */ #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, new + 1); PROC_UNLOCK(p); if (error != 0) { error = EMFILE; goto unlock; } } #endif fdgrowtable_exp(fdp, new + 1); } if (!fdisused(fdp, new)) fdused(fdp, new); break; default: KASSERT(0, ("%s unsupported mode %d", __func__, mode)); } KASSERT(old != new, ("new fd is same as old")); oldfde = &fdp->fd_ofiles[old]; fhold(oldfde->fde_file); newfde = &fdp->fd_ofiles[new]; delfp = newfde->fde_file; oioctls = filecaps_free_prep(&newfde->fde_caps); nioctls = filecaps_copy_prep(&oldfde->fde_caps); /* * Duplicate the source descriptor. */ #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, nioctls); if ((flags & FDDUP_FLAG_CLOEXEC) != 0) newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; else newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif filecaps_free_finish(oioctls); td->td_retval[0] = new; error = 0; if (delfp != NULL) { (void) closefp(fdp, new, delfp, td, 1); FILEDESC_UNLOCK_ASSERT(fdp); } else { unlock: FILEDESC_XUNLOCK(fdp); } return (error); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; if (*sigiop == NULL) return; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } *(sigio->sio_myref) = NULL; if ((sigio)->sio_pgid < 0) { struct pgrp *pg = (sigio)->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { struct proc *p = (sigio)->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); } /* * Free a list of sigio structures. * We only need to lock the SIGIO_LOCK because we have made ourselves * inaccessible to callers of fsetown and therefore do not need to lock * the proc or pgrp struct for the list manipulation. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio; sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; /* * Every entry of the list should belong * to a single proc or pgrp. */ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK_ASSERT(pg, MA_NOTOWNED); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK_ASSERT(p, MA_NOTOWNED); } SIGIO_LOCK(); while ((sigio = SLIST_FIRST(sigiolst)) != NULL) { *(sigio->sio_myref) = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } SIGIO_UNLOCK(); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); SIGIO_LOCK(); } SIGIO_UNLOCK(); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } ret = 0; /* Allocate and fill in the new sigio out of locks. */ sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; sx_slock(&proctree_lock); if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) { ret = ESRCH; goto fail; } /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ PROC_UNLOCK(proc); if (proc->p_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; goto fail; } PGRP_UNLOCK(pgrp); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) { ret = EPERM; goto fail; } proc = NULL; } funsetown(sigiop); if (pgid > 0) { PROC_LOCK(proc); /* * Since funsetownlst() is called without the proctree * locked, we need to check for P_WEXIT. * XXX: is ESRCH correct? */ if ((proc->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(proc); ret = ESRCH; goto fail; } SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; PROC_UNLOCK(proc); } else { PGRP_LOCK(pgrp); SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); SIGIO_LOCK(); *sigiop = sigio; SIGIO_UNLOCK(); return (0); fail: sx_sunlock(&proctree_lock); crfree(sigio->sio_ucred); free(sigio, M_SIGIO); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(sigiop) struct sigio **sigiop; { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } /* * Function drops the filedesc lock on return. */ static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, int holdleaders) { int error; FILEDESC_XLOCK_ASSERT(fdp); if (holdleaders) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; } else { holdleaders = 0; } } /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleteing a knote for the new fd. */ knote_fdclose(td, fd); /* * We need to notify mqueue if the object is of type mqueue. */ if (fp->f_type == DTYPE_MQUEUE) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); error = closef(fp, td); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int sys_close(struct thread *td, struct close_args *uap) { return (kern_close(td, uap->fd)); } int kern_close(struct thread *td, int fd) { struct filedesc *fdp; struct file *fp; fdp = td->td_proc->p_fd; AUDIT_SYSCLOSE(td, fd); FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdfree(fdp, fd); /* closefp() drops the FILEDESC lock for us. */ return (closefp(fdp, fd, fp, td, 1)); } /* * Close open file descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct closefrom_args { int lowfd; }; #endif /* ARGSUSED */ int sys_closefrom(struct thread *td, struct closefrom_args *uap) { struct filedesc *fdp; int fd; fdp = td->td_proc->p_fd; AUDIT_ARG_FD(uap->lowfd); /* * Treat negative starting file descriptor values identical to * closefrom(0) which closes all files. */ if (uap->lowfd < 0) uap->lowfd = 0; FILEDESC_SLOCK(fdp); for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) { if (fdp->fd_ofiles[fd].fde_file != NULL) { FILEDESC_SUNLOCK(fdp); (void)kern_close(td, fd); FILEDESC_SLOCK(fdp); } } FILEDESC_SUNLOCK(fdp); return (0); } #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) int freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_fstat(td, uap->fd, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int sys_fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; - cap_rights_t rights; int error; AUDIT_ARG_FD(fd); - error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp); + error = fget(td, fd, &cap_fstat_rights, &fp); if (error != 0) return (error); AUDIT_ARG_FILE(td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred, td); fdrop(fp, td); #ifdef __STAT_TIME_T_EXT if (error == 0) { sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; } #endif #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT)) ktrstat(sbp); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { freebsd11_cvtnstat(&ub, &nub); error = copyout(&nub, uap->sb, sizeof(nub)); } return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int sys_fpathconf(struct thread *td, struct fpathconf_args *uap) { long value; int error; error = kern_fpathconf(td, uap->fd, uap->name, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_fpathconf(struct thread *td, int fd, int name, long *valuep) { struct file *fp; struct vnode *vp; - cap_rights_t rights; int error; - error = fget(td, fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp); + error = fget(td, fd, &cap_fpathconf_rights, &fp); if (error != 0) return (error); if (name == _PC_ASYNC_IO) { *valuep = _POSIX_ASYNCHRONOUS_IO; goto out; } vp = fp->f_vnode; if (vp != NULL) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_PATHCONF(vp, name, valuep); VOP_UNLOCK(vp, 0); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (name != _PC_PIPE_BUF) { error = EINVAL; } else { *valuep = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Initialize filecaps structure. */ void filecaps_init(struct filecaps *fcaps) { bzero(fcaps, sizeof(*fcaps)); fcaps->fc_nioctls = -1; } /* * Copy filecaps structure allocating memory for ioctls array if needed. * * The last parameter indicates whether the fdtable is locked. If it is not and * ioctls are encountered, copying fails and the caller must lock the table. * * Note that if the table was not locked, the caller has to check the relevant * sequence counter to determine whether the operation was successful. */ bool filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked) { size_t size; if (src->fc_ioctls != NULL && !locked) return (false); memcpy(dst, src, sizeof(*src)); if (src->fc_ioctls == NULL) return (true); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); memcpy(dst->fc_ioctls, src->fc_ioctls, size); return (true); } static u_long * filecaps_copy_prep(const struct filecaps *src) { u_long *ioctls; size_t size; if (src->fc_ioctls == NULL) return (NULL); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; ioctls = malloc(size, M_FILECAPS, M_WAITOK); return (ioctls); } static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls) { size_t size; *dst = *src; if (src->fc_ioctls == NULL) { MPASS(ioctls == NULL); return; } size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = ioctls; bcopy(src->fc_ioctls, dst->fc_ioctls, size); } /* * Move filecaps structure to the new place and clear the old place. */ void filecaps_move(struct filecaps *src, struct filecaps *dst) { *dst = *src; bzero(src, sizeof(*src)); } /* * Fill the given filecaps structure with full rights. */ static void filecaps_fill(struct filecaps *fcaps) { CAP_ALL(&fcaps->fc_rights); fcaps->fc_ioctls = NULL; fcaps->fc_nioctls = -1; fcaps->fc_fcntls = CAP_FCNTL_ALL; } /* * Free memory allocated within filecaps structure. */ void filecaps_free(struct filecaps *fcaps) { free(fcaps->fc_ioctls, M_FILECAPS); bzero(fcaps, sizeof(*fcaps)); } static u_long * filecaps_free_prep(struct filecaps *fcaps) { u_long *ioctls; ioctls = fcaps->fc_ioctls; bzero(fcaps, sizeof(*fcaps)); return (ioctls); } static void filecaps_free_finish(u_long *ioctls) { free(ioctls, M_FILECAPS); } /* * Validate the given filecaps structure. */ static void filecaps_validate(const struct filecaps *fcaps, const char *func) { KASSERT(cap_rights_is_valid(&fcaps->fc_rights), ("%s: invalid rights", func)); KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, ("%s: invalid fcntls", func)); KASSERT(fcaps->fc_fcntls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), ("%s: fcntls without CAP_FCNTL", func)); KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 : (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), ("%s: invalid ioctls", func)); KASSERT(fcaps->fc_nioctls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), ("%s: ioctls without CAP_IOCTL", func)); } static void fdgrowtable_exp(struct filedesc *fdp, int nfd) { int nfd1; FILEDESC_XLOCK_ASSERT(fdp); nfd1 = fdp->fd_nfiles * 2; if (nfd1 < nfd) nfd1 = nfd; fdgrowtable(fdp, nfd1); } /* * Grow the file table to accommodate (at least) nfd descriptors. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct filedesc0 *fdp0; struct freetable *ft; struct fdescenttbl *ntable; struct fdescenttbl *otable; int nnfiles, onfiles; NDSLOTTYPE *nmap, *omap; /* * If lastfile is -1 this struct filedesc was just allocated and we are * growing it to accommodate for the one we are going to copy from. There * is no need to have a lock on this one as it's not visible to anyone. */ if (fdp->fd_lastfile != -1) FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* save old values */ onfiles = fdp->fd_nfiles; otable = fdp->fd_files; omap = fdp->fd_map; /* compute the size of the new table */ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* * Allocate a new table. We need enough space for the number of * entries, file entries themselves and the struct freetable we will use * when we decommission the table and place it on the freelist. * We place the struct freetable in the middle so we don't have * to worry about padding. */ ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + nnfiles * sizeof(ntable->fdt_ofiles[0]) + sizeof(struct freetable), M_FILEDESC, M_ZERO | M_WAITOK); /* copy the old data */ ntable->fdt_nfiles = nnfiles; memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, onfiles * sizeof(ntable->fdt_ofiles[0])); /* * Allocate a new map only if the old is not large enough. It will * grow at a slower rate than the table as it can map more * entries than the table can hold. */ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); /* copy over the old data and update the pointer */ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); fdp->fd_map = nmap; } /* * Make sure that ntable is correctly initialized before we replace * fd_files poiner. Otherwise fget_unlocked() may see inconsistent * data. */ atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); /* * Do not free the old file table, as some threads may still * reference entries within it. Instead, place it on a freelist * which will be processed when the struct filedesc is released. * * Note that if onfiles == NDFILE, we're dealing with the original * static allocation contained within (struct filedesc0 *)fdp, * which must not be freed. */ if (onfiles > NDFILE) { ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; fdp0 = (struct filedesc0 *)fdp; ft->ft_table = otable; SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); } /* * The map does not have the same possibility of threads still * holding references to it. So always free it as long as it * does not reference the original static allocation. */ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(omap, M_FILEDESC); } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd, maxfd, allocfd; #ifdef RACCT int error; #endif FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; maxfd = getmaxfd(td); /* * Search the bitmap for a free descriptor starting at minfd. * If none is found, grow the file table. */ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (fd >= maxfd) return (EMFILE); if (fd >= fdp->fd_nfiles) { allocfd = min(fd * 2, maxfd); #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_set(p, RACCT_NOFILE, allocfd); PROC_UNLOCK(p); if (error != 0) return (EMFILE); } #endif /* * fd is already equal to first free descriptor >= minfd, so * we only need to grow the table and we are done. */ fdgrowtable_exp(fdp, allocfd); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), ("invalid descriptor %d", fd)); KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("file descriptor isn't free")); fdused(fdp, fd); *result = fd; return (0); } /* * Allocate n file descriptors for the process. */ int fdallocn(struct thread *td, int minfd, int *fds, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int i; FILEDESC_XLOCK_ASSERT(fdp); for (i = 0; i < n; i++) if (fdalloc(td, 0, &fds[i]) != 0) break; if (i < n) { for (i--; i >= 0; i--) fdunused(fdp, fds[i]); return (EMFILE); } return (0); } /* * Create a new open file structure and allocate a file descriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { struct file *fp; int error, fd; error = falloc_noinstall(td, &fp); if (error) return (error); /* no reference held on error */ error = finstall(td, fp, &fd, flags, fcaps); if (error) { fdrop(fp, td); /* one reference (fp only) */ return (error); } if (resultfp != NULL) *resultfp = fp; /* copy out result */ else fdrop(fp, td); /* release local reference */ if (resultfd != NULL) *resultfd = fd; return (0); } /* * Create a new open file structure without allocating a file descriptor. */ int falloc_noinstall(struct thread *td, struct file **resultfp) { struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); int openfiles_new; static struct timeval lastfail; static int curfail; KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; if ((openfiles_new >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles_new >= maxfiles) { atomic_subtract_int(&openfiles, 1); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, (%s) " "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); } return (ENFILE); } fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO); refcount_init(&fp->f_count, 1); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; *resultfp = fp; return (0); } /* * Install a file in a file descriptor table. */ void _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, struct filecaps *fcaps) { struct filedescent *fde; MPASS(fp != NULL); if (fcaps != NULL) filecaps_validate(fcaps, __func__); FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seq_write_begin(&fde->fde_seq); #endif fde->fde_file = fp; fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else filecaps_fill(&fde->fde_caps); #ifdef CAPABILITIES seq_write_end(&fde->fde_seq); #endif } int finstall(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { struct filedesc *fdp = td->td_proc->p_fd; int error; MPASS(fd != NULL); FILEDESC_XLOCK(fdp); if ((error = fdalloc(td, 0, fd))) { FILEDESC_XUNLOCK(fdp); return (error); } fhold(fp); _finstall(fdp, fp, *fd, flags, fcaps); FILEDESC_XUNLOCK(fdp); return (0); } /* * Build a new filedesc structure from another. * Copy the current, root, and jail root vnode references. * * If fdp is not NULL, return with it shared locked. */ struct filedesc * fdinit(struct filedesc *fdp, bool prepfiles) { struct filedesc0 *newfdp0; struct filedesc *newfdp; newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); newfdp = &newfdp0->fd_fd; /* Create the file descriptor table. */ FILEDESC_LOCK_INIT(newfdp); refcount_init(&newfdp->fd_refcnt, 1); refcount_init(&newfdp->fd_holdcnt, 1); newfdp->fd_cmask = CMASK; newfdp->fd_map = newfdp0->fd_dmap; newfdp->fd_lastfile = -1; newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; newfdp->fd_files->fdt_nfiles = NDFILE; if (fdp == NULL) return (newfdp); if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles) fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_SLOCK(fdp); newfdp->fd_cdir = fdp->fd_cdir; if (newfdp->fd_cdir) vrefact(newfdp->fd_cdir); newfdp->fd_rdir = fdp->fd_rdir; if (newfdp->fd_rdir) vrefact(newfdp->fd_rdir); newfdp->fd_jdir = fdp->fd_jdir; if (newfdp->fd_jdir) vrefact(newfdp->fd_jdir); if (!prepfiles) { FILEDESC_SUNLOCK(fdp); } else { while (fdp->fd_lastfile >= newfdp->fd_nfiles) { FILEDESC_SUNLOCK(fdp); fdgrowtable(newfdp, fdp->fd_lastfile + 1); FILEDESC_SLOCK(fdp); } } return (newfdp); } static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = p->p_fd; if (fdp != NULL) refcount_acquire(&fdp->fd_holdcnt); return (fdp); } static void fddrop(struct filedesc *fdp) { if (fdp->fd_holdcnt > 1) { if (refcount_release(&fdp->fd_holdcnt) == 0) return; } FILEDESC_LOCK_DESTROY(fdp); uma_zfree(filedesc0_zone, fdp); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { refcount_acquire(&fdp->fd_refcnt); return (fdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct thread *td) { struct filedesc *tmp; struct proc *p = td->td_proc; if (p->p_fd->fd_refcnt == 1) return; tmp = fdcopy(p->p_fd); fdescfree(td); p->p_fd = tmp; } void fdinstall_remapped(struct thread *td, struct filedesc *fdp) { fdescfree(td); td->td_proc->p_fd = fdp; } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int i; MPASS(fdp != NULL); newfdp = fdinit(fdp, true); /* copy all passable descriptors (i.e. not kqueue) */ newfdp->fd_freefile = -1; for (i = 0; i <= fdp->fd_lastfile; ++i) { ofde = &fdp->fd_ofiles[i]; if (ofde->fde_file == NULL || (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) { if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; continue; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fhold(nfde->fde_file); fdused_init(newfdp, i); newfdp->fd_lastfile = i; } if (newfdp->fd_freefile == -1) newfdp->fd_freefile = i; newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_SUNLOCK(fdp); return (newfdp); } /* * Copies a filedesc structure, while remapping all file descriptors * stored inside using a translation table. * * File descriptors are copied over to the new file descriptor table, * regardless of whether the close-on-exec flag is set. */ int fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds, struct filedesc **ret) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int error, i; MPASS(fdp != NULL); newfdp = fdinit(fdp, true); if (nfds > fdp->fd_lastfile + 1) { /* New table cannot be larger than the old one. */ error = E2BIG; goto bad; } /* Copy all passable descriptors (i.e. not kqueue). */ newfdp->fd_freefile = nfds; for (i = 0; i < nfds; ++i) { if (fds[i] < 0 || fds[i] > fdp->fd_lastfile) { /* File descriptor out of bounds. */ error = EBADF; goto bad; } ofde = &fdp->fd_ofiles[fds[i]]; if (ofde->fde_file == NULL) { /* Unused file descriptor. */ error = EBADF; goto bad; } if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) { /* File descriptor cannot be passed. */ error = EINVAL; goto bad; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fhold(nfde->fde_file); fdused_init(newfdp, i); newfdp->fd_lastfile = i; } newfdp->fd_cmask = fdp->fd_cmask; FILEDESC_SUNLOCK(fdp); *ret = newfdp; return (0); bad: FILEDESC_SUNLOCK(fdp); fdescfree_remapped(newfdp); return (error); } /* * Clear POSIX style locks. This is only used when fdp looses a reference (i.e. * one of processes using it exits) and the table used to be shared. */ static void fdclearlocks(struct thread *td) { struct filedesc *fdp; struct filedesc_to_leader *fdtol; struct flock lf; struct file *fp; struct proc *p; struct vnode *vp; int i; p = td->td_proc; fdp = p->p_fd; fdtol = p->p_fdtol; MPASS(fdtol != NULL); FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { for (i = 0; i <= fdp->fd_lastfile; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp == NULL || fp->f_type != DTYPE_VNODE) continue; fhold(fp); FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdrop(fp, td); } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or kern_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). */ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; p->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) free(fdtol, M_FILEDESC_TO_LEADER); } /* * Release a filedesc structure. */ static void fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose) { struct filedesc0 *fdp0; struct freetable *ft, *tft; struct filedescent *fde; struct file *fp; int i; for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL) { fdefree_last(fde); if (needclose) (void) closef(fp, td); else fdrop(fp, td); } } if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); if (fdp->fd_nfiles > NDFILE) free(fdp->fd_files, M_FILEDESC); fdp0 = (struct filedesc0 *)fdp; SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) free(ft->ft_table, M_FILEDESC); fddrop(fdp); } void fdescfree(struct thread *td) { struct proc *p; struct filedesc *fdp; struct vnode *cdir, *jdir, *rdir; p = td->td_proc; fdp = p->p_fd; MPASS(fdp != NULL); #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_set(p, RACCT_NOFILE, 0); PROC_UNLOCK(p); } #endif if (p->p_fdtol != NULL) fdclearlocks(td); PROC_LOCK(p); p->p_fd = NULL; PROC_UNLOCK(p); if (refcount_release(&fdp->fd_refcnt) == 0) return; FILEDESC_XLOCK(fdp); cdir = fdp->fd_cdir; fdp->fd_cdir = NULL; rdir = fdp->fd_rdir; fdp->fd_rdir = NULL; jdir = fdp->fd_jdir; fdp->fd_jdir = NULL; FILEDESC_XUNLOCK(fdp); if (cdir != NULL) vrele(cdir); if (rdir != NULL) vrele(rdir); if (jdir != NULL) vrele(jdir); fdescfree_fds(td, fdp, 1); } void fdescfree_remapped(struct filedesc *fdp) { if (fdp->fd_cdir != NULL) vrele(fdp->fd_cdir); if (fdp->fd_rdir != NULL) vrele(fdp->fd_rdir); if (fdp->fd_jdir != NULL) vrele(fdp->fd_jdir); fdescfree_fds(curthread, fdp, 0); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static bool is_unsafe(struct file *fp) { struct vnode *vp; if (fp->f_type != DTYPE_VNODE) return (false); vp = fp->f_vnode; return ((vp->v_vflag & VV_PROCDEP) != 0); } /* * Make this setguid thing safe, if at all possible. */ void fdsetugidsafety(struct thread *td) { struct filedesc *fdp; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); for (i = 0; i <= 2; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { FILEDESC_XLOCK(fdp); knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); } } } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct thread *td, struct file *fp, int idx) { struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx].fde_file == fp) { fdfree(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; struct filedescent *fde; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE))) { FILEDESC_XLOCK(fdp); fdfree(fdp, i); (void) closefp(fdp, i, fp, td, 0); FILEDESC_UNLOCK_ASSERT(fdp); } } } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t save; int i, error, devnull; fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); devnull = -1; for (i = 0; i <= 2; i++) { if (fdp->fd_ofiles[i].fde_file != NULL) continue; save = td->td_retval[0]; if (devnull != -1) { error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); } else { error = kern_openat(td, AT_FDCWD, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); if (error == 0) { devnull = td->td_retval[0]; KASSERT(devnull == i, ("we didn't get our fd")); } } td->td_retval[0] = save; if (error != 0) return (error); } return (0); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. * * XXXRW: Giant is not required for the caller, but often will be held; this * makes it moderately likely the Giant will be recursed in the VFS case. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE && td != NULL) { vp = fp->f_vnode; if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } } return (fdrop(fp, td)); } /* * Initialize the file pointer with the specified properties. * * The ops are set with release semantics to be certain that the flags, type, * and data are visible when ops is. This is to prevent ops methods from being * called with bad data. */ void finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) { fp->f_data = data; fp->f_flag = flag; fp->f_type = type; atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } int fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedescent *fde; int error; FILEDESC_LOCK_ASSERT(fdp); fde = fdeget_locked(fdp, fd); if (fde == NULL) { error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights_fde(fde), needrightsp); if (error != 0) goto out; #endif if (havecapsp != NULL) filecaps_copy(&fde->fde_caps, havecapsp, true); *fpp = fde->fde_file; error = 0; out: return (error); } int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedesc *fdp = td->td_proc->p_fd; int error; #ifndef CAPABILITIES error = fget_unlocked(fdp, fd, needrightsp, fpp, NULL); if (error == 0 && havecapsp != NULL) filecaps_fill(havecapsp); #else struct file *fp; seq_t seq; for (;;) { error = fget_unlocked(fdp, fd, needrightsp, &fp, &seq); if (error != 0) return (error); if (havecapsp != NULL) { if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecapsp, false)) { fdrop(fp, td); goto get_locked; } } if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } *fpp = fp; return (0); get_locked: FILEDESC_SLOCK(fdp); error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp); if (error == 0) fhold(*fpp); FILEDESC_SUNLOCK(fdp); #endif return (error); } int fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, seq_t *seqp) { #ifdef CAPABILITIES struct filedescent *fde; #endif struct fdescenttbl *fdt; struct file *fp; u_int count; #ifdef CAPABILITIES seq_t seq; cap_rights_t haverights; int error; #endif fdt = fdp->fd_files; if ((u_int)fd >= fdt->fdt_nfiles) return (EBADF); /* * Fetch the descriptor locklessly. We avoid fdrop() races by * never raising a refcount above 0. To accomplish this we have * to use a cmpset loop rather than an atomic_add. The descriptor * must be re-verified once we acquire a reference to be certain * that the identity is still correct and we did not lose a race * due to preemption. */ for (;;) { #ifdef CAPABILITIES seq = seq_read(fd_seq(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = *cap_rights_fde(fde); fp = fde->fde_file; if (!seq_consistent(fd_seq(fdt, fd), seq)) continue; #else fp = fdt->fdt_ofiles[fd].fde_file; #endif if (fp == NULL) return (EBADF); #ifdef CAPABILITIES error = cap_check(&haverights, needrightsp); if (error != 0) return (error); #endif count = fp->f_count; retry: if (count == 0) { /* * Force a reload. Other thread could reallocate the * table before this fd was closed, so it possible that * there is a stale fp pointer in cached version. */ fdt = *(struct fdescenttbl * volatile *)&(fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ if (atomic_fcmpset_acq_int(&fp->f_count, &count, count + 1) == 0) goto retry; fdt = fdp->fd_files; #ifdef CAPABILITIES if (seq_consistent_nomb(fd_seq(fdt, fd), seq)) #else if (fp == fdt->fdt_ofiles[fd].fde_file) #endif break; fdrop(fp, curthread); } *fpp = fp; if (seqp != NULL) { #ifdef CAPABILITIES *seqp = seq; #endif } return (0); } /* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * * File's rights will be checked against the capability rights mask. * * If an error occurred the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, cap_rights_t *needrightsp, seq_t *seqp) { struct filedesc *fdp; struct file *fp; int error; *fpp = NULL; fdp = td->td_proc->p_fd; error = fget_unlocked(fdp, fd, needrightsp, &fp, seqp); if (error != 0) return (error); if (fp->f_ops == &badfileops) { fdrop(fp, td); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. */ error = 0; switch (flags) { case FREAD: case FWRITE: if ((fp->f_flag & flags) == 0) error = EBADF; break; case FEXEC: if ((fp->f_flag & (FREAD | FEXEC)) == 0 || ((fp->f_flag & FWRITE) != 0)) error = EBADF; break; case 0: break; default: KASSERT(0, ("wrong flags")); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, 0, rightsp, NULL)); } int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp, struct file **fpp) { int error; #ifndef CAPABILITIES error = _fget(td, fd, fpp, 0, rightsp, NULL); if (maxprotp != NULL) *maxprotp = VM_PROT_ALL; #else struct filedesc *fdp = td->td_proc->p_fd; seq_t seq; MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); for (;;) { error = _fget(td, fd, fpp, 0, rightsp, &seq); if (error != 0) return (error); /* * If requested, convert capability rights to access flags. */ if (maxprotp != NULL) *maxprotp = cap_rights_to_vmprot(cap_rights(fdp, fd)); if (!fd_modified(fdp, fd, seq)) break; fdrop(*fpp, td); } #endif return (error); } int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FREAD, rightsp, NULL)); } int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FWRITE, rightsp, NULL)); } int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp) { struct filedesc *fdp = td->td_proc->p_fd; #ifndef CAPABILITIES return (fget_unlocked(fdp, fd, rightsp, fpp, NULL)); #else int error; seq_t seq; MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); for (;;) { error = fget_unlocked(fdp, fd, rightsp, fpp, &seq); if (error != 0) return (error); error = cap_fcntl_check(fdp, fd, needfcntl); if (!fd_modified(fdp, fd, seq)) break; fdrop(*fpp, td); } if (error != 0) { fdrop(*fpp, td); *fpp = NULL; } return (error); #endif } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? */ static __inline int _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, struct vnode **vpp) { struct file *fp; int error; *vpp = NULL; error = _fget(td, fd, &fp, flags, needrightsp, NULL); if (error != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vrefact(*vpp); } fdrop(fp, td); return (error); } int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, 0, rightsp, vpp)); } int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp) { struct filedesc *fdp; struct filecaps caps; struct file *fp; int error; fdp = td->td_proc->p_fd; error = fget_cap_locked(fdp, fd, needrightsp, &fp, &caps); if (error != 0) return (error); if (fp->f_ops == &badfileops) { error = EBADF; goto out; } if (fp->f_vnode == NULL) { error = EINVAL; goto out; } *havecaps = caps; *vpp = fp->f_vnode; vrefact(*vpp); return (0); out: filecaps_free(&caps); return (error); } int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FREAD, rightsp, vpp)); } int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); } #endif /* * Handle the last reference to a file being closed. */ int _fdrop(struct file *fp, struct thread *td) { int error; if (fp->f_count != 0) panic("fdrop: count %d", fp->f_count); error = fo_close(fp, td); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); free(fp->f_advice, M_FADVISE); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int sys_flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; - cap_rights_t rights; int error; - error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp); + error = fget(td, uap->fd, &cap_flock_rights, &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); return (EOPNOTSUPP); } vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: fdrop(fp, td); return (error); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp) { struct filedescent *newfde, *oldfde; struct file *fp; u_long *ioctls; int error, indx; KASSERT(openerror == ENODEV || openerror == ENXIO, ("unexpected error %d in %s", openerror, __func__)); /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if ((fp = fget_locked(fdp, dfd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = fdalloc(td, 0, &indx); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. */ switch (openerror) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EACCES); } fhold(fp); newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; ioctls = filecaps_copy_prep(&oldfde->fde_caps); #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, ioctls); #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif break; case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seq_write_begin(&newfde->fde_seq); #endif memcpy(newfde, oldfde, fde_change_size); oldfde->fde_file = NULL; fdunused(fdp, dfd); #ifdef CAPABILITIES seq_write_end(&newfde->fde_seq); #endif break; } FILEDESC_XUNLOCK(fdp); *indxp = indx; return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, "Allow a process to chroot(2) if it has a directory open"); /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(struct filedesc *fdp) { struct vnode *vp; struct file *fp; int fd; FILEDESC_LOCK_ASSERT(fdp); for (fd = 0; fd <= fdp->fd_lastfile; fd++) { fp = fget_locked(fdp, fd); if (fp == NULL) continue; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } /* * Common routine for kern_chroot() and jail_attach(). The caller is * responsible for invoking priv_check() and mac_vnode_check_chroot() to * authorize this operation. */ int pwd_chroot(struct thread *td, struct vnode *vp) { struct filedesc *fdp; struct vnode *oldvp; int error; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } } oldvp = fdp->fd_rdir; vrefact(vp); fdp->fd_rdir = vp; if (fdp->fd_jdir == NULL) { vrefact(vp); fdp->fd_jdir = vp; } FILEDESC_XUNLOCK(fdp); vrele(oldvp); return (0); } void pwd_chdir(struct thread *td, struct vnode *vp) { struct filedesc *fdp; struct vnode *oldvp; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); VNASSERT(vp->v_usecount > 0, vp, ("chdir to a vnode with zero usecount")); oldvp = fdp->fd_cdir; fdp->fd_cdir = vp; FILEDESC_XUNLOCK(fdp); vrele(oldvp); } /* * Scan all active processes and prisons to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct filedesc *fdp; struct prison *pr; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; nrele = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_XLOCK(fdp); if (fdp->fd_cdir == olddp) { vrefact(newdp); fdp->fd_cdir = newdp; nrele++; } if (fdp->fd_rdir == olddp) { vrefact(newdp); fdp->fd_rdir = newdp; nrele++; } if (fdp->fd_jdir == olddp) { vrefact(newdp); fdp->fd_jdir = newdp; nrele++; } FILEDESC_XUNLOCK(fdp); fddrop(fdp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrefact(newdp); rootvnode = newdp; nrele++; } mtx_lock(&prison0.pr_mtx); if (prison0.pr_root == olddp) { vrefact(newdp); prison0.pr_root = newdp; nrele++; } mtx_unlock(&prison0.pr_mtx); sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { mtx_lock(&pr->pr_mtx); if (pr->pr_root == olddp) { vrefact(newdp); pr->pr_root = newdp; nrele++; } mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); while (nrele--) vrele(olddp); } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; fdtol = malloc(sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } static int sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) { struct filedesc *fdp; int i, count, slots; if (*(int *)arg1 != 0) return (EINVAL); fdp = curproc->p_fd; count = 0; FILEDESC_SLOCK(fdp); slots = NDSLOTS(fdp->fd_lastfile + 1); for (i = 0; i < slots; i++) count += bitcountl(fdp->fd_map[i]); FILEDESC_SUNLOCK(fdp); return (SYSCTL_OUT(req, &count, sizeof(count))); } static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, "Number of open file descriptors"); /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; /* overestimates sparse tables. */ if (fdp->fd_lastfile > 0) n += fdp->fd_lastfile; fddrop(fdp); } sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; xf.xf_fd = n; xf.xf_file = fp; xf.xf_data = fp->f_data; xf.xf_vnode = fp->f_vnode; xf.xf_type = fp->f_type; xf.xf_count = fp->f_count; xf.xf_msgcount = 0; xf.xf_offset = foffset_get(fp); xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static int xlate_fflags(int fflags) { static const struct { int fflag; int kf_fflag; } fflags_table[] = { { FAPPEND, KF_FLAG_APPEND }, { FASYNC, KF_FLAG_ASYNC }, { FFSYNC, KF_FLAG_FSYNC }, { FHASLOCK, KF_FLAG_HASLOCK }, { FNONBLOCK, KF_FLAG_NONBLOCK }, { FREAD, KF_FLAG_READ }, { FWRITE, KF_FLAG_WRITE }, { O_CREAT, KF_FLAG_CREAT }, { O_DIRECT, KF_FLAG_DIRECT }, { O_EXCL, KF_FLAG_EXCL }, { O_EXEC, KF_FLAG_EXEC }, { O_EXLOCK, KF_FLAG_EXLOCK }, { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, { O_SHLOCK, KF_FLAG_SHLOCK }, { O_TRUNC, KF_FLAG_TRUNC } }; unsigned int i; int kflags; kflags = 0; for (i = 0; i < nitems(fflags_table); i++) if (fflags & fflags_table[i].fflag) kflags |= fflags_table[i].kf_fflag; return (kflags); } /* Trim unused data from kf_path by truncating the structure size. */ static void pack_kinfo(struct kinfo_file *kif) { kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + strlen(kif->kf_path) + 1; kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); } static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp, int flags) { int error; bzero(kif, sizeof(*kif)); /* Set a default type to allow for empty fill_kinfo() methods. */ kif->kf_type = KF_TYPE_UNKNOWN; kif->kf_flags = xlate_fflags(fp->f_flag); if (rightsp != NULL) kif->kf_cap_rights = *rightsp; else cap_rights_init(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = fp->f_count; kif->kf_offset = foffset_get(fp); /* * This may drop the filedesc lock, so the 'fp' cannot be * accessed after this call. */ error = fo_fill_kinfo(fp, kif, fdp); if (error == 0) kif->kf_status |= KF_ATTR_VALID; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); } static void export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, struct kinfo_file *kif, int flags) { int error; bzero(kif, sizeof(*kif)); kif->kf_type = KF_TYPE_VNODE; error = vn_fill_kinfo_vnode(vp, kif); if (error == 0) kif->kf_status |= KF_ATTR_VALID; kif->kf_flags = xlate_fflags(fflags); cap_rights_init(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = -1; kif->kf_offset = -1; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); vrele(vp); } struct export_fd_buf { struct filedesc *fdp; struct sbuf *sb; ssize_t remainder; struct kinfo_file kif; int flags; }; static int export_kinfo_to_sb(struct export_fd_buf *efbuf) { struct kinfo_file *kif; kif = &efbuf->kif; if (efbuf->remainder != -1) { if (efbuf->remainder < kif->kf_structsize) { /* Terminate export. */ efbuf->remainder = 0; return (0); } efbuf->remainder -= kif->kf_structsize; } return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM); } static int export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, efbuf->flags); FILEDESC_SUNLOCK(efbuf->fdp); error = export_kinfo_to_sb(efbuf); FILEDESC_SLOCK(efbuf->fdp); return (error); } static int export_vnode_to_sb(struct vnode *vp, int fd, int fflags, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (0); if (efbuf->fdp != NULL) FILEDESC_SUNLOCK(efbuf->fdp); export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); error = export_kinfo_to_sb(efbuf); if (efbuf->fdp != NULL) FILEDESC_SLOCK(efbuf->fdp); return (error); } /* * Store a process file descriptor information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags) { struct file *fp; struct filedesc *fdp; struct export_fd_buf *efbuf; struct vnode *cttyvp, *textvp, *tracevp; int error, i; cap_rights_t rights; PROC_LOCK_ASSERT(p, MA_OWNED); /* ktrace vnode */ tracevp = p->p_tracevp; if (tracevp != NULL) vrefact(tracevp); /* text vnode */ textvp = p->p_textvp; if (textvp != NULL) vrefact(textvp); /* Controlling tty. */ cttyvp = NULL; if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { cttyvp = p->p_pgrp->pg_session->s_ttyvp; if (cttyvp != NULL) vrefact(cttyvp); } fdp = fdhold(p); PROC_UNLOCK(p); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->sb = sb; efbuf->remainder = maxlen; efbuf->flags = flags; if (tracevp != NULL) export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, efbuf); if (textvp != NULL) export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); if (cttyvp != NULL) export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, efbuf); error = 0; if (fdp == NULL) goto fail; efbuf->fdp = fdp; FILEDESC_SLOCK(fdp); /* working directory */ if (fdp->fd_cdir != NULL) { vrefact(fdp->fd_cdir); export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } /* root directory */ if (fdp->fd_rdir != NULL) { vrefact(fdp->fd_rdir); export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); } /* jail directory */ if (fdp->fd_jdir != NULL) { vrefact(fdp->fd_jdir); export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); } for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; #ifdef CAPABILITIES rights = *cap_rights(fdp, i); #else /* !CAPABILITIES */ - cap_rights_init(&rights); + rights = cap_no_rights; #endif /* * Create sysctl entry. It is OK to drop the filedesc * lock inside of export_file_to_sb() as we will * re-validate and re-evaluate its properties when the * loop continues. */ error = export_file_to_sb(fp, i, &rights, efbuf); if (error != 0 || efbuf->remainder == 0) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); fail: free(efbuf, M_TEMP); return (error); } #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_filedesc_out(p, &sb, maxlen, KERN_FILEDESC_PACK_KINFO); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } #ifdef COMPAT_FREEBSD7 #ifdef KINFO_OFILE_SIZE CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); #endif static void kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) { okif->kf_structsize = sizeof(*okif); okif->kf_type = kif->kf_type; okif->kf_fd = kif->kf_fd; okif->kf_ref_count = kif->kf_ref_count; okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | KF_FLAG_DIRECT | KF_FLAG_HASLOCK); okif->kf_offset = kif->kf_offset; if (kif->kf_type == KF_TYPE_VNODE) okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type; else okif->kf_vnode_type = KF_VTYPE_VNON; strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); if (kif->kf_type == KF_TYPE_SOCKET) { okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0; okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0; okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0; okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local; okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer; } else { okif->kf_sa_local.ss_family = AF_UNSPEC; okif->kf_sa_peer.ss_family = AF_UNSPEC; } } static int export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req) { int error; vrefact(vp); FILEDESC_SUNLOCK(fdp); export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); return (error); } /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) { struct kinfo_ofile *okif; struct kinfo_file *kif; struct filedesc *fdp; int error, i, *name; struct file *fp; struct proc *p; name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) return (error); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) return (ENOENT); kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); FILEDESC_SLOCK(fdp); if (fdp->fd_cdir != NULL) export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif, okif, fdp, req); if (fdp->fd_rdir != NULL) export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif, okif, fdp, req); if (fdp->fd_jdir != NULL) export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif, okif, fdp, req); for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) { if ((fp = fdp->fd_ofiles[i].fde_file) == NULL) continue; export_file_to_kinfo(fp, i, NULL, kif, fdp, KERN_FILEDESC_PACK_KINFO); FILEDESC_SUNLOCK(fdp); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); if (error) break; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); free(kif, M_TEMP); free(okif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); #endif /* COMPAT_FREEBSD7 */ int vntype_to_kinfo(int vtype) { struct { int vtype; int kf_vtype; } vtypes_table[] = { { VBAD, KF_VTYPE_VBAD }, { VBLK, KF_VTYPE_VBLK }, { VCHR, KF_VTYPE_VCHR }, { VDIR, KF_VTYPE_VDIR }, { VFIFO, KF_VTYPE_VFIFO }, { VLNK, KF_VTYPE_VLNK }, { VNON, KF_VTYPE_VNON }, { VREG, KF_VTYPE_VREG }, { VSOCK, KF_VTYPE_VSOCK } }; unsigned int i; /* * Perform vtype translation. */ for (i = 0; i < nitems(vtypes_table); i++) if (vtypes_table[i].vtype == vtype) return (vtypes_table[i].kf_vtype); return (KF_VTYPE_UNKNOWN); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, "Process filedesc entries"); /* * Store a process current working directory information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) { struct filedesc *fdp; struct export_fd_buf *efbuf; int error; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) return (EINVAL); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = fdp; efbuf->sb = sb; efbuf->remainder = maxlen; FILEDESC_SLOCK(fdp); if (fdp->fd_cdir == NULL) error = EINVAL; else { vrefact(fdp->fd_cdir); error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } FILEDESC_SUNLOCK(fdp); fddrop(fdp); free(efbuf, M_TEMP); return (error); } /* * Get per-process current working directory. */ static int sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; int error, error2, *name; name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_cwd_out(p, &sb, maxlen); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_cwd, "Process current working directory"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnode"); case DTYPE_SOCKET: return ("socket"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kqueue"); case DTYPE_CRYPTO: return ("crypto"); case DTYPE_MQUEUE: return ("mqueue"); case DTYPE_SHM: return ("shm"); case DTYPE_SEM: return ("ksem"); case DTYPE_PTS: return ("pts"); case DTYPE_DEV: return ("dev"); case DTYPE_PROCDESC: return ("proc"); case DTYPE_LINUXEFD: return ("levent"); case DTYPE_LINUXTFD: return ("ltimer"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. */ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n <= fdp->fd_lastfile; n++) { if (fp == fdp->fd_ofiles[n].fde_file) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4)) struct proc *p; if (header) db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n", XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag", "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH, fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data, fp->f_flag, 0, fp->f_count, 0, XPTRWIDTH, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); #undef XPTRWIDTH } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND(files, db_show_files) { struct filedesc *fdp; struct file *fp; struct proc *p; int header; int n; header = 1; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; if ((fdp = p->p_fd) == NULL) continue; for (n = 0; n <= fdp->fd_lastfile; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; db_print_file(fp, header); header = 0; } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (0); } static int badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EBADF); } static int badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return (0); } struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = badfo_fill_kinfo, }; int invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } int invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } int invfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (poll_no_poll(events)); } int invfo_kqfilter(struct file *fp, struct knote *kn) { return (EINVAL); } int invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EINVAL); } /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); Index: head/sys/kern/kern_event.c =================================================================== --- head/sys/kern/kern_event.c (revision 333424) +++ head/sys/kern/kern_event.c (revision 333425) @@ -1,2640 +1,2638 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * Copyright (c) 2009 Apple, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_kqueue.h" #ifdef COMPAT_FREEBSD11 #define _WANT_FREEBSD11_KEVENT #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. This possibly * should be made into a per proc lock. */ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue_ctx); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok); static int kqueue_acquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static void kqueue_destroy(struct kqueue *kq); static void kqueue_drain(struct kqueue *kq, struct thread *td); static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); struct g_kevent_args; static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name); static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static fo_fill_kinfo_t kqueue_fill_kinfo; static struct fileops kqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = kqueue_fill_kinfo, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_drop_detached(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(int waitok); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); static void filt_userdetach(struct knote *kn); static int filt_user(struct knote *kn, long hint); static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type); static struct filterops file_filtops = { .f_isfd = 1, .f_attach = filt_fileattach, }; static struct filterops kqread_filtops = { .f_isfd = 1, .f_detach = filt_kqdetach, .f_event = filt_kqueue, }; /* XXX - move to kern_proc.c? */ static struct filterops proc_filtops = { .f_isfd = 0, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; static struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = filt_timer, }; static struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, }; static uma_zone_t knote_zone; static unsigned int kq_ncallouts = 0; static unsigned int kq_calloutmax = 4 * 1024; SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not influx ? */ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while(0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) static struct knlist * kn_list_lock(struct knote *kn) { struct knlist *knl; knl = kn->kn_knlist; if (knl != NULL) knl->kl_lock(knl->kl_lockarg); return (knl); } static void kn_list_unlock(struct knlist *knl) { bool do_free; if (knl == NULL) return; do_free = knl->kl_autodestroy && knlist_empty(knl); knl->kl_unlock(knl->kl_lockarg); if (do_free) { knlist_destroy(knl); free(knl, M_KQUEUE); } } static bool kn_in_flux(struct knote *kn) { return (kn->kn_influx > 0); } static void kn_enter_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx < INT_MAX); kn->kn_influx++; } static bool kn_leave_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx > 0); kn->kn_influx--; return (kn->kn_influx == 0); } #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ knl->kl_assert_locked((knl)->kl_lockarg); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ knl->kl_assert_unlocked((knl)->kl_lockarg); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while(0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #ifndef KN_HASHSIZE #define KN_HASHSIZE 64 /* XXX should be tunable */ #endif #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { .f_isfd = 0, .f_attach = filt_nullattach, }; /* XXX - make SYSINIT to add these, and move into respective modules. */ extern struct filterops sig_filtops; extern struct filterops fs_filtops; /* * Table for for all system-defined filters. */ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { struct filterops *for_fop; int for_nolock; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { { &file_filtops, 1 }, /* EVFILT_READ */ { &file_filtops, 1 }, /* EVFILT_WRITE */ { &null_filtops }, /* EVFILT_AIO */ { &file_filtops, 1 }, /* EVFILT_VNODE */ { &proc_filtops, 1 }, /* EVFILT_PROC */ { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ { &timer_filtops, 1 }, /* EVFILT_TIMER */ { &file_filtops, 1 }, /* EVFILT_PROCDESC */ { &fs_filtops, 1 }, /* EVFILT_FS */ { &null_filtops }, /* EVFILT_LIO */ { &user_filtops, 1 }, /* EVFILT_USER */ { &null_filtops }, /* EVFILT_SENDFILE */ { &file_filtops, 1 }, /* EVFILT_EMPTY */ }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. */ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? */ static int filt_procattach(struct knote *kn) { struct proc *p; int error; bool exiting, immediate; exiting = immediate = false; if (kn->kn_sfflags & NOTE_EXIT) p = pfind_any(kn->kn_id); else p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if (p->p_flag & P_WEXIT) exiting = true; if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * Internal flag indicating registration done by kernel for the * purposes of getting a NOTE_CHILD notification. */ if (kn->kn_flags & EV_FLAG2) { kn->kn_flags &= ~EV_FLAG2; kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); immediate = true; /* Force immediate activation of child note. */ } /* * Internal flag indicating registration done by kernel (for other than * NOTE_CHILD). */ if (kn->kn_flags & EV_FLAG1) { kn->kn_flags &= ~EV_FLAG1; } knlist_add(p->p_klist, kn, 1); /* * Immediately activate any child notes or, in the case of a zombie * target process, exit notes. The latter is necessary to handle the * case where the target process, e.g. a child, dies before the kevent * is registered. */ if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { knlist_remove(kn->kn_knlist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? */ static int filt_proc(struct knote *kn, long hint) { struct proc *p; u_int event; p = kn->kn_ptr.p_proc; if (p == NULL) /* already activated, from attach filter */ return (0); /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; kn->kn_ptr.p_proc = NULL; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } /* * Called when the process forked. It mostly does the same as the * knote(), activating all knotes registered to be activated when the * process forked. Additionally, for each knote attached to the * parent, check whether user wants to track the new process. If so * attach a new knote to it, and immediately report an event with the * child's pid. */ void knote_fork(struct knlist *list, int pid) { struct kqueue *kq; struct knote *kn; struct kevent kev; int error; if (list == NULL) return; list->kl_lock(list->kl_lockarg); SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { KQ_UNLOCK(kq); continue; } /* * The same as knote(), activate the event. */ if ((kn->kn_sfflags & NOTE_TRACK) == 0) { kn->kn_status |= KN_HASKQLOCK; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_HASKQLOCK; KQ_UNLOCK(kq); continue; } /* * The NOTE_TRACK case. In addition to the activation * of the event, we need to register new events to * track the child. Drop the locks in preparation for * the call to kqueue_register(). */ kn_enter_flux(kn); KQ_UNLOCK(kq); list->kl_unlock(list->kl_lockarg); /* * Activate existing knote and register tracking knotes with * new process. * * First register a knote to get just the child notice. This * must be a separate note from a potential NOTE_EXIT * notification since both NOTE_CHILD and NOTE_EXIT are defined * to use the data field (in conflicting ways). */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, 0); if (error) kn->kn_fflags |= NOTE_TRACKERR; /* * Then register another knote to track other potential events * from the new process. */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, 0); if (error) kn->kn_fflags |= NOTE_TRACKERR; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 0); KQ_LOCK(kq); kn_leave_flux(kn); KQ_UNLOCK_FLUX(kq); list->kl_lock(list->kl_lockarg); } list->kl_unlock(list->kl_lockarg); } /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. */ #define NOTE_TIMER_PRECMASK \ (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) static sbintime_t timer2sbintime(intptr_t data, int flags) { int64_t secs; /* * Macros for converting to the fractional second portion of an * sbintime_t using 64bit multiplication to improve precision. */ #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) switch (flags & NOTE_TIMER_PRECMASK) { case NOTE_SECONDS: #ifdef __LP64__ if (data > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return ((sbintime_t)data << 32); case NOTE_MSECONDS: /* FALLTHROUGH */ case 0: if (data >= 1000) { secs = data / 1000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | MS_TO_SBT(data % 1000)); } return (MS_TO_SBT(data)); case NOTE_USECONDS: if (data >= 1000000) { secs = data / 1000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000)); } return (US_TO_SBT(data)); case NOTE_NSECONDS: if (data >= 1000000000) { secs = data / 1000000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000000)); } return (NS_TO_SBT(data)); default: break; } return (-1); } struct kq_timer_cb_data { struct callout c; sbintime_t next; /* next timer event fires at */ sbintime_t to; /* precalculated timer period, 0 for abs */ }; static void filt_timerexpire(void *knx) { struct knote *kn; struct kq_timer_cb_data *kc; kn = knx; kn->kn_data++; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ if ((kn->kn_flags & EV_ONESHOT) != 0) return; kc = kn->kn_ptr.p_v; if (kc->to == 0) return; kc->next += kc->to; callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); } /* * data contains amount of time to sleep */ static int filt_timerattach(struct knote *kn) { struct kq_timer_cb_data *kc; struct bintime bt; sbintime_t to, sbt; unsigned int ncallouts; if (kn->kn_sdata < 0) return (EINVAL); if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) kn->kn_sdata = 1; /* Only precision unit are supported in flags so far */ if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) return (EINVAL); to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { getboottimebin(&bt); sbt = bttosbt(bt); to -= sbt; } if (to < 0) return (EINVAL); do { ncallouts = kq_ncallouts; if (ncallouts >= kq_calloutmax) return (ENOMEM); } while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1)); if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); callout_init(&kc->c, 1); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { kc->next = to; kc->to = 0; } else { kc->next = to + sbinuptime(); kc->to = to; } callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); return (0); } static void filt_timerdetach(struct knote *kn) { struct kq_timer_cb_data *kc; unsigned int old; kc = kn->kn_ptr.p_v; callout_drain(&kc->c); free(kc, M_KQUEUE); old = atomic_fetchadd_int(&kq_ncallouts, -1); KASSERT(old > 0, ("Number of callouts cannot become negative")); kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } static int filt_userattach(struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) kn->kn_hookid = 1; else kn->kn_hookid = 0; return (0); } static void filt_userdetach(__unused struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ } static int filt_user(struct knote *kn, __unused long hint) { return (kn->kn_hookid); } static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) { u_int ffctrl; switch (type) { case EVENT_REGISTER: if (kev->fflags & NOTE_TRIGGER) kn->kn_hookid = 1; ffctrl = kev->fflags & NOTE_FFCTRLMASK; kev->fflags &= NOTE_FFLAGSMASK; switch (ffctrl) { case NOTE_FFNOP: break; case NOTE_FFAND: kn->kn_sfflags &= kev->fflags; break; case NOTE_FFOR: kn->kn_sfflags |= kev->fflags; break; case NOTE_FFCOPY: kn->kn_sfflags = kev->fflags; break; default: /* XXX Return error? */ break; } kn->kn_sdata = kev->data; if (kev->flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; case EVENT_PROCESS: *kev = kn->kn_kevent; kev->fflags = kn->kn_sfflags; kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_usertouch() - invalid type (%ld)", type); break; } } int sys_kqueue(struct thread *td, struct kqueue_args *uap) { return (kern_kqueue(td, 0, NULL)); } static void kqueue_init(struct kqueue *kq) { mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&kq->kq_head); knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); } int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; struct ucred *cred; int fd, error; fdp = td->td_proc->p_fd; cred = td->td_ucred; if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) return (ENOMEM); error = falloc_caps(td, &fp, &fd, flags, fcaps); if (error != 0) { chgkqcnt(cred->cr_ruidinfo, -1, 0); return (error); } /* An extra reference on `fp' has been held for us by falloc(). */ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); kqueue_init(kq); kq->kq_fdp = fdp; kq->kq_cred = crhold(cred); FILEDESC_XLOCK(fdp); TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; return (0); } struct g_kevent_args { int fd; void *changelist; int nchanges; void *eventlist; int nevents; const struct timespec *timeout; }; int sys_kevent(struct thread *td, struct kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent_copyout, .k_copyin = kevent_copyin, .kevent_size = sizeof(struct kevent), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent")); } static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name) { struct timespec ts, *tsp; #ifdef KTRACE struct kevent *eventlist = uap->eventlist; #endif int error; if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist, uap->nchanges, k_ops->kevent_size); #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, k_ops, tsp); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, eventlist, td->td_retval[0], k_ops->kevent_size); #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. */ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } #ifdef COMPAT_FREEBSD11 static int kevent11_copyout(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { kev11.ident = kevp->ident; kev11.filter = kevp->filter; kev11.flags = kevp->flags; kev11.fflags = kevp->fflags; kev11.data = kevp->data; kev11.udata = kevp->udata; error = copyout(&kev11, uap->eventlist, sizeof(kev11)); if (error != 0) break; uap->eventlist++; kevp++; } return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent11_copyin(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { error = copyin(uap->changelist, &kev11, sizeof(kev11)); if (error != 0) break; kevp->ident = kev11.ident; kevp->filter = kev11.filter; kevp->flags = kev11.flags; kevp->fflags = kev11.fflags; kevp->data = (uintptr_t)kev11.data; kevp->udata = kev11.udata; bzero(&kevp->ext, sizeof(kevp->ext)); uap->changelist++; kevp++; } return (error); } int freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent11_copyout, .k_copyin = kevent11_copyin, .kevent_size = sizeof(struct kevent_freebsd11), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11")); } #endif int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { cap_rights_t rights; struct file *fp; int error; cap_rights_init(&rights); if (nchanges > 0) cap_rights_set(&rights, CAP_KQUEUE_CHANGE); if (nevents > 0) cap_rights_set(&rights, CAP_KQUEUE_EVENT); error = fget(td, fd, &rights, &fp); if (error != 0) return (error); error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); fdrop(fp, td); return (error); } static int kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; int i, n, nerrors, error; nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) return (error); changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, 1); if (error || (kevp->flags & EV_RECEIPT)) { if (nevents == 0) return (error); kevp->flags = EV_ERROR; kevp->data = error; (void)k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; return (0); } return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); } int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kqueue *kq; int error; error = kqueue_acquire(fp, &kq); if (error != 0) return (error); error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); kqueue_release(kq, 0); return (error); } /* * Performs a kevent() call on a temporarily created kqueue. This can be * used to perform one-shot polling, similar to poll() and select(). */ int kern_kevent_anonymous(struct thread *td, int nevents, struct kevent_copyops *k_ops) { struct kqueue kq = {}; int error; kqueue_init(&kq); kq.kq_refcnt = 1; error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); kqueue_drain(&kq, td); kqueue_destroy(&kq); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (error); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; if (sysfilt_ops[~filt].for_nolock) return sysfilt_ops[~filt].for_fop; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; if (sysfilt_ops[~filt].for_nolock) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_acquire) must be held. waitok will * influence if memory allocation should wait. Make sure it is 0 if you * hold any mutexes. */ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok) { struct filterops *fops; struct file *fp; struct knote *kn, *tkn; struct knlist *knl; - cap_rights_t rights; int error, filt, event; int haskqglobal, filedesc_unlock; if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) return (EINVAL); fp = NULL; kn = NULL; knl = NULL; error = 0; haskqglobal = 0; filedesc_unlock = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; if (kev->flags & EV_ADD) { /* * Prevent waiting with locks. Non-sleepable * allocation failures are handled in the loop, only * if the spare knote appears to be actually required. */ tkn = knote_alloc(waitok); } else { tkn = NULL; } findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); if (kev->ident > INT_MAX) error = EBADF; else - error = fget(td, kev->ident, - cap_rights_init(&rights, CAP_EVENT), &fp); + error = fget(td, kev->ident, &cap_event_rights, &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, 0) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, waitok); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * If we add some intelligence about what we are doing, * we should be able to support events on ourselves. * We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } /* * Pre-lock the filedesc before the global * lock mutex, see the comment in * kqueue_close(). */ FILEDESC_XLOCK(td->td_proc->p_fd); filedesc_unlock = 1; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) kqueue_expand(kq, fops, kev->ident, waitok); KQ_LOCK(kq); /* * If possible, find an existing knote to use for this kevent. */ if (kev->filter == EVFILT_PROC && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { /* This is an internal creation of a process tracking * note. Don't attempt to coalesce this with an * existing note. */ ; } else if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stabilize. */ if (kn != NULL && kn_in_flux(kn)) { KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) { FILEDESC_XUNLOCK(td->td_proc->p_fd); filedesc_unlock = 0; } kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); if (fp != NULL) { fdrop(fp, td); fp = NULL; } goto findkn; } /* * kn now contains the matching knote, or NULL if no match */ if (kn == NULL) { if (kev->flags & EV_ADD) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); kn->kn_status = KN_DETACHED; kn_enter_flux(kn); error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop_detached(kn, td); goto done; } knl = kn_list_lock(kn); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ KQ_UNLOCK(kq); error = ENOENT; goto done; } } if (kev->flags & EV_DELETE) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); goto done; } if (kev->flags & EV_FORCEONESHOT) { kn->kn_flags |= EV_ONESHOT; KNOTE_ACTIVATE(kn, 1); } /* * The user may change some filter values after the initial EV_ADD, * but doing so will not reset any filter which has already been * triggered. */ kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); knl = kn_list_lock(kn); kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) { fops->f_touch(kn, kev, EVENT_REGISTER); } else { kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; } /* * We can get here with kn->kn_knlist == NULL. This can happen when * the initial attach event decides that the event is "completed" * already. i.e. filt_procattach is called on a zombie process. It * will call filt_proc which will remove it from the list, and NULL * kn_knlist. */ done_ev_add: if ((kev->flags & EV_ENABLE) != 0) kn->kn_status &= ~KN_DISABLED; else if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; if ((kn->kn_status & KN_DISABLED) == 0) event = kn->kn_fop->f_event(kn, 0); else event = 0; KQ_LOCK(kq); if (event) kn->kn_status |= KN_ACTIVE; if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == KN_ACTIVE) knote_enqueue(kn); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) FILEDESC_XUNLOCK(td->td_proc->p_fd); if (fp != NULL) fdrop(fp, td); knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_acquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) return (EBADF); *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); return (EBADF); } kq->kq_refcnt++; KQ_UNLOCK(kq); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } static void kqueue_schedtask(struct kqueue *kq) { KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. * * Not calling hashinit w/ waitok (proper malloc flag) should be safe. * If kqueue_register is called from a non-fd context, there usually/should * be no locks held. */ static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int waitok) { struct klist *list, *tmp_knhash, *to_free; u_long tmp_knhashmask; int size; int fd; int mflag = waitok ? M_WAITOK : M_NOWAIT; KQ_NOTOWNED(kq); to_free = NULL; if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; list = malloc(size * sizeof(*list), M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knlistsize > fd) { to_free = list; list = NULL; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof(*list)); to_free = kq->kq_knlist; kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof(*list), (size - kq->kq_knlistsize) * sizeof(*list)); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask); if (tmp_knhash == NULL) return ENOMEM; KQ_LOCK(kq); if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { to_free = tmp_knhash; } KQ_UNLOCK(kq); } } free(to_free, M_KQUEUE); KQ_NOTOWNED(kq); return 0; } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. * We treat KN_MARKER knotes as if they are in flux. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct knote *kn, *marker; struct knlist *knl; sbintime_t asbt, rsbt; int count, error, haskqglobal, influx, nkev, touch; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; rsbt = 0; if (tsp != NULL) { if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000) { error = EINVAL; goto done_nl; } if (timespecisset(tsp)) { if (tsp->tv_sec <= INT32_MAX) { rsbt = tstosbt(*tsp); if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = 0; rsbt >>= tc_precexp; } else asbt = 0; } else asbt = -1; } else asbt = 0; marker = knote_alloc(1); marker->kn_status = KN_MARKER; KQ_LOCK(kq); retry: kevp = keva; if (kq->kq_count == 0) { if (asbt == -1) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", asbt, rsbt, C_ABSOLUTE); } if (error == 0) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); influx = 0; while (count) { KQ_OWNED(kq); kn = TAILQ_FIRST(&kq->kq_head); if ((kn->kn_status == KN_MARKER && kn != marker) || kn_in_flux(kn)) { if (influx) { influx = 0; KQ_FLUX_WAKEUP(kq); } kq->kq_state |= KQ_FLUXWAIT; error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); continue; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if (kn == marker) { KQ_FLUX_WAKEUP(kq); if (count == maxevents) goto retry; goto done; } KASSERT(!kn_in_flux(kn), ("knote %p is unexpectedly in flux", kn)); if ((kn->kn_flags & EV_DROP) == EV_DROP) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked it as in flux. */ knote_drop(kn, td); KQ_LOCK(kq); continue; } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked the knote as being in flux. */ *kevp = kn->kn_kevent; knote_drop(kn, td); KQ_LOCK(kq); kn = NULL; } else { kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); knl = kn_list_lock(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_SCAN); kn_leave_flux(kn); kq->kq_count--; kn_list_unlock(knl); influx = 1; continue; } touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL); if (touch) kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); else *kevp = kn->kn_kevent; KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { /* * Manually clear knotes who weren't * 'touch'ed. */ if (touch == 0 && kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } if (kn->kn_flags & EV_DISPATCH) kn->kn_status |= KN_DISABLED; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); influx = 1; } /* we are returning a copy to the user */ kevp++; nkev++; count--; if (nkev == KQ_NEVENTS) { influx = 0; KQ_UNLOCK_FLUX(kq); error = k_ops->k_copyout(k_ops->arg, keva, nkev); nkev = 0; kevp = keva; KQ_LOCK(kq); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: KQ_OWNED(kq); KQ_UNLOCK_FLUX(kq); knote_free(marker); done_nl: KQ_NOTOWNED(kq); if (nkev != 0) error = k_ops->k_copyout(k_ops->arg, keva, nkev); td->td_retval[0] = maxevents - count; return (error); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { /* * Enabling sigio causes two major problems: * 1) infinite recursion: * Synopsys: kevent is being used to track signals and have FIOASYNC * set. On receipt of a signal this will cause a kqueue to recurse * into itself over and over. Sending the sigio causes the kqueue * to become ready, which in turn posts sigio again, forever. * Solution: this can be solved by setting a flag in the kqueue that * we have a SIGIO in progress. * 2) locking problems: * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts * us above the proc and pgrp locks. * Solution: Post a signal using an async mechanism, being sure to * record a generation count in the delivery so that we do not deliver * a signal to the wrong process. * * Note, these two mechanisms are somewhat mutually exclusive! */ #if 0 struct kqueue *kq; kq = fp->f_data; switch (cmd) { case FIOASYNC: if (*(int *)data) { kq->kq_state |= KQ_ASYNC; } else { kq->kq_state &= ~KQ_ASYNC; } return (0); case FIOSETOWN: return (fsetown(*(int *)data, &kq->kq_sigio)); case FIOGETOWN: *(int *)data = fgetown(&kq->kq_sigio); return (0); } #endif return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct kqueue *kq; int revents = 0; int error; if ((error = kqueue_acquire(fp, &kq))) return POLLERR; KQ_LOCK(kq); if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); if (SEL_WAITING(&kq->kq_sel)) kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); KQ_UNLOCK(kq); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { bzero((void *)st, sizeof *st); /* * We no longer return kq_count because the unlocked value is useless. * If you spent all this time getting the count, why not spend your * syscall better by calling kevent? * * XXX - This is needed for libc_r. */ st->st_mode = S_IFIFO; return (0); } static void kqueue_drain(struct kqueue *kq, struct thread *td) { struct knote *kn; int i; KQ_LOCK(kq); KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, ("kqueue already closing")); kq->kq_state |= KQ_CLOSING; if (kq->kq_refcnt > 1) msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); KASSERT(knlist_empty(&kq->kq_sel.si_note), ("kqueue's knlist not empty")); for (i = 0; i < kq->kq_knlistsize; i++) { while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } if (kq->kq_knhashmask != 0) { for (i = 0; i <= kq->kq_knhashmask; i++) { while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo2", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } } if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { kq->kq_state |= KQ_TASKDRAIN; msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); } static void kqueue_destroy(struct kqueue *kq) { KASSERT(kq->kq_fdp == NULL, ("kqueue still attached to a file descriptor")); seldrain(&kq->kq_sel); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); if (kq->kq_knhash != NULL) free(kq->kq_knhash, M_KQUEUE); if (kq->kq_knlist != NULL) free(kq->kq_knlist, M_KQUEUE); funsetown(&kq->kq_sigio); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; struct filedesc *fdp; int error; int filedesc_unlock; if ((error = kqueue_acquire(fp, &kq))) return error; kqueue_drain(kq, td); /* * We could be called due to the knote_drop() doing fdrop(), * called from kqueue_register(). In this case the global * lock is owned, and filedesc sx is locked before, to not * take the sleepable lock after non-sleepable. */ fdp = kq->kq_fdp; kq->kq_fdp = NULL; if (!sx_xlocked(FILEDESC_LOCK(fdp))) { FILEDESC_XLOCK(fdp); filedesc_unlock = 1; } else filedesc_unlock = 0; TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); if (filedesc_unlock) FILEDESC_XUNLOCK(fdp); kqueue_destroy(kq); chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); crfree(kq->kq_cred); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static int kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_KQUEUE; return (0); } static void kqueue_wakeup(struct kqueue *kq) { KQ_OWNED(kq); if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { pgsigio(&kq->kq_sigio, SIGIO, 0); } } /* * Walk down a list of knotes, activating them if their event has triggered. * * There is a possibility to optimize in the case of one kq watching another. * Instead of scheduling a task to wake it up, you could pass enough state * down the chain to make up the parent kqueue. Make this code functional * first. */ void knote(struct knlist *list, long hint, int lockflags) { struct kqueue *kq; struct knote *kn, *tkn; int error; if (list == NULL) return; KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_lock(list->kl_lockarg); /* * If we unlock the list lock (and enter influx), we can * eliminate the kqueue scheduling, but this will introduce * four lock/unlock's for each knote to test. Also, marker * would be needed to keep iteration position, since filters * or other threads could remove events. */ SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { /* * Do not process the influx notes, except for * the influx coming from the kq unlock in the * kqueue_scan(). In the later case, we do * not interfere with the scan, since the code * fragment in kqueue_scan() locks the knlist, * and cannot proceed until we finished. */ KQ_UNLOCK(kq); } else if ((lockflags & KNF_NOKQLOCK) != 0) { kn_enter_flux(kn); KQ_UNLOCK(kq); error = kn->kn_fop->f_event(kn, hint); KQ_LOCK(kq); kn_leave_flux(kn); if (error) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK_FLUX(kq); } else { kn->kn_status |= KN_HASKQLOCK; if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn, 1); kn->kn_status &= ~KN_HASKQLOCK; KQ_UNLOCK(kq); } } if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_unlock(list->kl_lockarg); } /* * add a knote to a knlist */ void knlist_add(struct knlist *knl, struct knote *kn, int islocked) { KNL_ASSERT_LOCK(knl, islocked); KQ_NOTOWNED(kn->kn_kq); KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p was not detached", kn)); if (!islocked) knl->kl_lock(knl->kl_lockarg); SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); if (!islocked) knl->kl_unlock(knl->kl_lockarg); KQ_LOCK(kn->kn_kq); kn->kn_knlist = knl; kn->kn_status &= ~KN_DETACHED; KQ_UNLOCK(kn->kn_kq); } static void knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) { KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked")); KNL_ASSERT_LOCK(knl, knlislocked); mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED); KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) == 0, ("knote %p was already detached", kn)); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) kn_list_unlock(knl); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove knote from the specified knlist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return (SLIST_EMPTY(&knl->kl_list)); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static void knlist_mtx_assert_locked(void *arg) { mtx_assert((struct mtx *)arg, MA_OWNED); } static void knlist_mtx_assert_unlocked(void *arg) { mtx_assert((struct mtx *)arg, MA_NOTOWNED); } static void knlist_rw_rlock(void *arg) { rw_rlock((struct rwlock *)arg); } static void knlist_rw_runlock(void *arg) { rw_runlock((struct rwlock *)arg); } static void knlist_rw_assert_locked(void *arg) { rw_assert((struct rwlock *)arg, RA_LOCKED); } static void knlist_rw_assert_unlocked(void *arg) { rw_assert((struct rwlock *)arg, RA_UNLOCKED); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_assert_locked == NULL) knl->kl_assert_locked = knlist_mtx_assert_locked; else knl->kl_assert_locked = kl_assert_locked; if (kl_assert_unlocked == NULL) knl->kl_assert_unlocked = knlist_mtx_assert_unlocked; else knl->kl_assert_unlocked = kl_assert_unlocked; knl->kl_autodestroy = 0; SLIST_INIT(&knl->kl_list); } void knlist_init_mtx(struct knlist *knl, struct mtx *lock) { knlist_init(knl, lock, NULL, NULL, NULL, NULL); } struct knlist * knlist_alloc(struct mtx *lock) { struct knlist *knl; knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK); knlist_init_mtx(knl, lock); return (knl); } void knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock) { knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock, knlist_rw_assert_locked, knlist_rw_assert_unlocked); } void knlist_destroy(struct knlist *knl) { KASSERT(KNLIST_EMPTY(knl), ("destroying knlist %p with knotes on it", knl)); } void knlist_detach(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); knl->kl_autodestroy = 1; if (knlist_empty(knl)) { knlist_destroy(knl); free(knl, M_KQUEUE); } } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". */ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl)); if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reacquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop_detached(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= EV_EOF | EV_ONESHOT; KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still in flux knotes remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn_in_flux(kn), ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * Remove all knotes referencing a specified fd must be called with FILEDESC * lock. This prevents a race where a new fd comes along and occupies the * entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_XLOCK_ASSERT(fdp); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. */ TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn_in_flux(kn)) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn_enter_flux(kn); KQ_UNLOCK(kq); influx = 1; knote_drop(kn, td); KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn)); KQ_OWNED(kq); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return (ENOMEM); list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return (ENOMEM); list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return (0); } static void knote_drop(struct knote *kn, struct thread *td) { if ((kn->kn_status & KN_DETACHED) == 0) kn->kn_fop->f_detach(kn); knote_drop_detached(kn, td); } static void knote_drop_detached(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p still attached", kn)); KQ_NOTOWNED(kq); KQ_LOCK(kq); KASSERT(kn->kn_influx == 1, ("knote_drop called on %p with influx %d", kn, kn->kn_influx)); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); static struct knote * knote_alloc(int waitok) { return (uma_zalloc(knote_zone, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO)); } static void knote_free(struct knote *kn) { uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok) { struct kqueue *kq; struct file *fp; cap_rights_t rights; int error; error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp); if (error != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; error = kqueue_register(kq, kev, td, waitok); kqueue_release(kq, 0); noacquire: fdrop(fp, td); return (error); } Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 333424) +++ head/sys/kern/kern_exec.c (revision 333425) @@ -1,1733 +1,1731 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exec, "char *"); SDT_PROBE_DEFINE1(proc, , , exec__failure, "int"); SDT_PROBE_DEFINE1(proc, , , exec__success, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); int coredump_pack_fileinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN, &coredump_pack_fileinfo, 0, "Enable file path packing in 'procstat -f' coredump notes"); int coredump_pack_vmmapinfo = 1; SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN, &coredump_pack_vmmapinfo, 0, "Enable file path packing in 'procstat -v' coredump notes"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int disallow_high_osrel; SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW, &disallow_high_osrel, 0, "Disallow execution of binaries built for higher version of the world"); static int map_at_zero = 0; SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); EVENTHANDLER_LIST_DECLARE(process_exec); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(struct thread *td, struct execve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; } #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } post_execve(td, error, oldvmspace); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(struct thread *td, struct __mac_execve_args *uap) { #ifdef MAC struct image_args args; struct vmspace *oldvmspace; int error; error = pre_execve(td, &oldvmspace); if (error != 0) return (error); error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); post_execve(td, error, oldvmspace); return (error); #else return (ENOSYS); #endif } int pre_execve(struct thread *td, struct vmspace **oldvmspace) { struct proc *p; int error; KASSERT(td == curthread, ("non-current thread %p", td)); error = 0; p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); if (thread_single(p, SINGLE_BOUNDARY) != 0) error = ERESTART; PROC_UNLOCK(p); } KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0, ("nested execve")); *oldvmspace = p->p_vmspace; return (error); } void post_execve(struct thread *td, int error, struct vmspace *oldvmspace) { struct proc *p; KASSERT(td == curthread, ("non-current thread %p", td)); p = td->td_proc; if ((p->p_flag & P_HADTHREADS) != 0) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == EJUSTRETURN) thread_single(p, SINGLE_EXIT); else thread_single_end(p, SINGLE_BOUNDARY); PROC_UNLOCK(p); } if ((td->td_pflags & TDP_EXECVMSPC) != 0) { KASSERT(p->p_vmspace != oldvmspace, ("oldvmspace still used")); vmspace_free(oldvmspace); td->td_pflags &= ~TDP_EXECVMSPC; } } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { AUDIT_ARG_ARGV(args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG_ENVV(args->begin_envv, args->envc, args->endp - args->begin_envv); return (do_execve(td, args, mac_p)); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p) { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *oldcred; struct uidinfo *euip = NULL; register_t *stack_base; int error, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts = NULL, *newsigacts = NULL; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *oldtextvp = NULL, *newtextvp; - cap_rights_t rights; int credential_changing; int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ bzero(imgp, sizeof(*imgp)); imgp->proc = p; imgp->attr = &attr; imgp->args = args; oldcred = p->p_ucred; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif /* * Translate the file name. namei() returns a vnode pointer * in ni_vp among other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE1(proc, , , exec, args->fname); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; newtextvp = nd.ni_vp; imgp->vp = newtextvp; } else { AUDIT_ARG_FD(args->fd); /* * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ - error = fgetvp_exec(td, args->fd, - cap_rights_init(&rights, CAP_FEXECVE), &newtextvp); + error = fgetvp_exec(td, args->fd, &cap_fexecve_rights, &newtextvp); if (error) goto exec_fail; vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = VOP_IS_TEXT(imgp->vp); VOP_SET_TEXT(imgp->vp); error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; /* * Implement image setuid/setgid. * * Determine new credentials before attempting image activators * so that it can be used by process_exec handlers to determine * credential/setid changes. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in capability mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif /* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */ if (credential_changing) imgp->proc->p_pdeathsig = 0; if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { imgp->credential_setid = true; VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); if (attr.va_mode & S_ISUID) { euip = uifind(attr.va_uid); change_euid(imgp->newcred, euip); } vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (attr.va_mode & S_ISGID) change_egid(imgp->newcred, attr.va_gid); /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } else { /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { VOP_UNLOCK(imgp->vp, 0); imgp->newcred = crdup(oldcred); vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); change_svuid(imgp->newcred, imgp->newcred->cr_uid); change_svgid(imgp->newcred, imgp->newcred->cr_gid); } } /* The new credentials are installed into the process later. */ /* * Do the best to calculate the full path to the image file. */ if (args->fname != NULL && args->fname[0] == '/') imgp->execpath = args->fname; else { VOP_UNLOCK(imgp->vp, 0); if (vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0) imgp->execpath = args->fname; vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); } /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) VOP_UNSET_TEXT(imgp->vp); error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ VOP_UNSET_TEXT(imgp->vp); /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(newtextvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(newtextvp); vm_object_deallocate(imgp->object); imgp->object = NULL; imgp->credential_setid = false; if (imgp->newcred != NULL) { crfree(imgp->newcred); imgp->newcred = NULL; } imgp->execpath = NULL; free(imgp->freepath, M_TEMP); imgp->freepath = NULL; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp, 0); if (disallow_high_osrel && P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) { error = ENOEXEC; uprintf("Osrel %d for image %s too high\n", p->p_osrel, imgp->execpath != NULL ? imgp->execpath : ""); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); goto exec_fail_dealloc; } /* ABI enforces the use of Capsicum. Switch into capabilities mode. */ if (SV_PROC_FLAG(p, SV_CAPSICUM)) sys_cap_enter(td, NULL); /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); if (args->fdp != NULL) { /* Install a brand new file descriptor table. */ fdinstall_remapped(td, args->fdp); args->fdp = NULL; } else { /* * Keep on using the existing file descriptor table. For * security and other reasons, the file descriptor table * cannot be shared after an exec. */ fdunshare(td); /* close files on exec */ fdcloseexec(td); } /* * Malloc things before we need locks. */ i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); } vn_lock(imgp->vp, LK_SHARED | LK_RETRY); PROC_LOCK(p); if (oldsigacts) p->p_sigacts = newsigacts; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0) p->p_flag2 &= ~P2_NOTRACE; if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } /* * Implement image setuid/setgid installation. */ if (imgp->credential_setid) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (p->p_tracecred != NULL && priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * Both fdsetugidsafety() and fdcheckstd() may call functions * taking sleepable locks, so temporarily drop our locks. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); fdsetugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto exec_fail_dealloc; PROC_LOCK(p); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, imgp->newcred, imgp->vp, interpvplabel, imgp); } #endif } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; } /* * Set the new credentials. */ if (imgp->newcred != NULL) { proc_set_cred(p, imgp->newcred); crfree(oldcred); oldcred = NULL; } /* * Store the vp for use in procfs. This vnode was referenced by namei * or fgetvp_exec. */ oldtextvp = p->p_textvp; p->p_textvp = newtextvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; PROC_UNLOCK(p); #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { VOP_UNLOCK(imgp->vp, 0); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base); else exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base); vfs_mark_atime(imgp->vp, td->td_ucred); SDT_PROBE1(proc, , , exec__success, args->fname); exec_fail_dealloc: if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); if (error != 0) vput(imgp->vp); else VOP_UNLOCK(imgp->vp, 0); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { if (p->p_ptevents & PTRACE_EXEC) { PROC_LOCK(p); if (p->p_ptevents & PTRACE_EXEC) td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); } /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); } else { exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE1(proc, , , exec__failure, error); } if (imgp->newcred != NULL && oldcred != NULL) crfree(imgp->newcred); #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif exec_free_args(args); /* * Handle deferred decrement of ref counts. */ if (oldtextvp != NULL) vrele(oldtextvp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); if (euip != NULL) uifree(euip); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, 0, SIGABRT); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif /* * We don't want cpu_set_syscall_retval() to overwrite any of * the register values put in place by exec_setregs(). * Implementations of cpu_set_syscall_retval() will leave * registers unmodified when returning EJUSTRETURN. */ return (error == 0 ? EJUSTRETURN : error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i, after, initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_WLOCK(object); #if VM_NRESERVLEVEL > 0 vm_object_color(object, 0); #endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (ma[0]->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(ma[0]); if (!vm_pager_has_page(object, 0, NULL, &after)) { vm_page_lock(ma[0]); vm_page_free(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); return (EIO); } initial_pagein = min(after, VM_INITIAL_PAGEIN); KASSERT(initial_pagein <= object->size, ("%s: initial_pagein %d object->size %ju", __func__, initial_pagein, (uintmax_t )object->size)); for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid) break; if (!vm_page_tryxbusy(ma[i])) break; } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, NULL, NULL); if (rv != VM_PAGER_OK) { for (i = 0; i < initial_pagein; i++) { vm_page_lock(ma[i]); vm_page_free(ma[i]); vm_page_unlock(ma[i]); } VM_OBJECT_WUNLOCK(object); return (EIO); } vm_page_xunbusy(ma[0]); for (i = 1; i < initial_pagein; i++) vm_page_readahead_finish(ma[i]); } vm_page_lock(ma[0]); vm_page_hold(ma[0]); vm_page_activate(ma[0]); vm_page_unlock(ma[0]); VM_OBJECT_WUNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(struct image_params *imgp) { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } } /* * Destroy old address space, and allocate a new stack. * The new stack is only sgrowsiz large because it is grown * automatically on a page fault. */ int exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv) { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; struct rlimit rlim_stack; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); /* An exec terminates mlockall(MCL_FUTURE). */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_READ | VM_PROT_EXECUTE, MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE); if (error != KERN_SUCCESS) { vm_object_deallocate(obj); return (vm_mmap_to_errno(error)); } } /* Allocate a new stack */ if (imgp->stack_sz != 0) { ssiz = trunc_page(imgp->stack_sz); PROC_LOCK(p); lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack); PROC_UNLOCK(p); if (ssiz > rlim_stack.rlim_max) ssiz = rlim_stack.rlim_max; if (ssiz > rlim_stack.rlim_cur) { rlim_stack.rlim_cur = ssiz; kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack); } } else if (sv->sv_maxssiz != NULL) { ssiz = *sv->sv_maxssiz; } else { ssiz = maxssiz; } stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error != KERN_SUCCESS) return (vm_mmap_to_errno(error)); /* * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they * are still used to enforce the stack rlimit on the process stack. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)stack_addr; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { u_long argp, envp; int error; size_t length; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ if (fname != NULL) { args->fname = args->buf; error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; } else length = 0; args->begin_argv = args->buf + length; args->endp = args->begin_argv; args->stringspace = ARG_MAX; /* * extract arguments first */ for (;;) { error = fueword(argv++, &argp); if (error == -1) { error = EFAULT; goto err_exit; } if (argp == 0) break; error = copyinstr((void *)(uintptr_t)argp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { for (;;) { error = fueword(envv++, &envp); if (error == -1) { error = EFAULT; goto err_exit; } if (envp == 0) break; error = copyinstr((void *)(uintptr_t)envp, args->endp, args->stringspace, &length); if (error != 0) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } int exec_copyin_data_fds(struct thread *td, struct image_args *args, const void *data, size_t datalen, const int *fds, size_t fdslen) { struct filedesc *ofdp; const char *p; int *kfds; int error; memset(args, '\0', sizeof(*args)); ofdp = td->td_proc->p_fd; if (datalen >= ARG_MAX || fdslen > ofdp->fd_lastfile + 1) return (E2BIG); error = exec_alloc_args(args); if (error != 0) return (error); args->begin_argv = args->buf; args->stringspace = ARG_MAX; if (datalen > 0) { /* * Argument buffer has been provided. Copy it into the * kernel as a single string and add a terminating null * byte. */ error = copyin(data, args->begin_argv, datalen); if (error != 0) goto err_exit; args->begin_argv[datalen] = '\0'; args->endp = args->begin_argv + datalen + 1; args->stringspace -= datalen + 1; /* * Traditional argument counting. Count the number of * null bytes. */ for (p = args->begin_argv; p < args->endp; ++p) if (*p == '\0') ++args->argc; } else { /* No argument buffer provided. */ args->endp = args->begin_argv; } /* There are no environment variables. */ args->begin_envv = args->endp; /* Create new file descriptor table. */ kfds = malloc(fdslen * sizeof(int), M_TEMP, M_WAITOK); error = copyin(fds, kfds, fdslen * sizeof(int)); if (error != 0) { free(kfds, M_TEMP); goto err_exit; } error = fdcopy_remapped(ofdp, kfds, fdslen, &args->fdp); free(kfds, M_TEMP); if (error != 0) goto err_exit; return (0); err_exit: exec_free_args(args); return (error); } struct exec_args_kva { vm_offset_t addr; u_int gen; SLIST_ENTRY(exec_args_kva) next; }; static DPCPU_DEFINE(struct exec_args_kva *, exec_args_kva); static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist; static struct mtx exec_args_kva_mtx; static u_int exec_args_gen; static void exec_prealloc_args_kva(void *arg __unused) { struct exec_args_kva *argkva; u_int i; SLIST_INIT(&exec_args_kva_freelist); mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF); for (i = 0; i < exec_map_entries; i++) { argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK); argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size); argkva->gen = exec_args_gen; SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); } } SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL); static vm_offset_t exec_alloc_args_kva(void **cookie) { struct exec_args_kva *argkva; argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_PTR(exec_args_kva)); if (argkva == NULL) { mtx_lock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL) (void)mtx_sleep(&exec_args_kva_freelist, &exec_args_kva_mtx, 0, "execkva", 0); SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next); mtx_unlock(&exec_args_kva_mtx); } *(struct exec_args_kva **)cookie = argkva; return (argkva->addr); } static void exec_release_args_kva(struct exec_args_kva *argkva, u_int gen) { vm_offset_t base; base = argkva->addr; if (argkva->gen != gen) { vm_map_madvise(exec_map, base, base + exec_map_entry_size, MADV_FREE); argkva->gen = gen; } if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva), (uintptr_t)NULL, (uintptr_t)argkva)) { mtx_lock(&exec_args_kva_mtx); SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next); wakeup_one(&exec_args_kva_freelist); mtx_unlock(&exec_args_kva_mtx); } } static void exec_free_args_kva(void *cookie) { exec_release_args_kva(cookie, exec_args_gen); } static void exec_args_kva_lowmem(void *arg __unused) { SLIST_HEAD(, exec_args_kva) head; struct exec_args_kva *argkva; u_int gen; int i; gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1; /* * Force an madvise of each KVA range. Any currently allocated ranges * will have MADV_FREE applied once they are freed. */ SLIST_INIT(&head); mtx_lock(&exec_args_kva_mtx); SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva); mtx_unlock(&exec_args_kva_mtx); while ((argkva = SLIST_FIRST(&head)) != NULL) { SLIST_REMOVE_HEAD(&head, next); exec_release_args_kva(argkva, gen); } CPU_FOREACH(i) { argkva = (void *)atomic_readandclear_ptr( (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva)); if (argkva != NULL) exec_release_args_kva(argkva, gen); } } EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL, EVENTHANDLER_PRI_ANY); /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)exec_alloc_args_kva(&args->bufkva); return (0); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { exec_free_args_kva(args->bufkva); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } if (args->fdp != NULL) fdescfree_remapped(args->fdp); } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(struct image_params *imgp) { int argc, envc; char **vectp; char *stringp; uintptr_t destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; size_t execpath_len; int szsigcode, szps; char canary[sizeof(long) * 8]; szps = sizeof(pagesizes[0]) * MAXPAGESIZES; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_sigcode_base == 0) { if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); } destp = (uintptr_t)arginfo; /* * install sigcode */ if (szsigcode != 0) { destp -= szsigcode; destp = rounddown2(destp, sizeof(void *)); copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode); } /* * Copy the image path for the rtld. */ if (execpath_len != 0) { destp -= execpath_len; imgp->execpathp = destp; copyout(imgp->execpath, (void *)destp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); destp -= sizeof(canary); imgp->canary = destp; copyout(canary, (void *)destp, sizeof(canary)); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ destp -= szps; destp = rounddown2(destp, sizeof(void *)); imgp->pagesizes = destp; copyout(pagesizes, (void *)destp, szps); imgp->pagesizeslen = szps; destp -= ARG_MAX - imgp->args->stringspace; destp = rounddown2(destp, sizeof(void *)); vectp = (char **)destp; if (imgp->auxargs) { /* * Allocate room on the stack for the ELF auxargs * array. It has up to AT_COUNT entries. */ vectp -= howmany(AT_COUNT * sizeof(Elf_Auxinfo), sizeof(*vectp)); } /* * Allocate room for the argv[] and env vectors including the * terminating NULL pointers. */ vectp -= imgp->args->argc + 1 + imgp->args->envc + 1; /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(struct image_params *imgp) { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error, writecount; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) return (error); if (writecount != 0) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = 1; return (error); } /* * Exec handler registration */ int exec_register(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; u_int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(const struct execsw *execsw_arg) { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_sendfile.c =================================================================== --- head/sys/kern/kern_sendfile.c (revision 333424) +++ head/sys/kern/kern_sendfile.c (revision 333425) @@ -1,1055 +1,1051 @@ /*- * Copyright (c) 2013-2015 Gleb Smirnoff * Copyright (c) 1998, David Greenman. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define EXT_FLAG_SYNC EXT_FLAG_VENDOR1 #define EXT_FLAG_NOCACHE EXT_FLAG_VENDOR2 /* * Structure describing a single sendfile(2) I/O, which may consist of * several underlying pager I/Os. * * The syscall context allocates the structure and initializes 'nios' * to 1. As sendfile_swapin() runs through pages and starts asynchronous * paging operations, it increments 'nios'. * * Every I/O completion calls sendfile_iodone(), which decrements the 'nios', * and the syscall also calls sendfile_iodone() after allocating all mbufs, * linking them and sending to socket. Whoever reaches zero 'nios' is * responsible to * call pru_ready on the socket, to notify it of readyness * of the data. */ struct sf_io { volatile u_int nios; u_int error; int npages; struct socket *so; struct mbuf *m; vm_page_t pa[]; }; /* * Structure used to track requests with SF_SYNC flag. */ struct sendfile_sync { struct mtx mtx; struct cv cv; unsigned count; }; counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)]; static void sfstat_init(const void *unused) { COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t), M_WAITOK); } SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL); static int sfstat_sysctl(SYSCTL_HANDLER_ARGS) { struct sfstat s; COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t)); if (req->newptr) COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t)); return (SYSCTL_OUT(req, &s, sizeof(s))); } SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW, NULL, 0, sfstat_sysctl, "I", "sendfile statistics"); /* * Detach mapped page and release resources back to the system. Called * by mbuf(9) code when last reference to a page is freed. */ static void sendfile_free_page(vm_page_t pg, bool nocache) { bool freed; vm_page_lock(pg); /* * In either case check for the object going away on us. This can * happen since we don't hold a reference to it. If so, we're * responsible for freeing the page. In 'noncache' case try to free * the page, but only if it is cheap to. */ if (vm_page_unwire_noq(pg)) { vm_object_t obj; if ((obj = pg->object) == NULL) vm_page_free(pg); else { freed = false; if (nocache && !vm_page_xbusied(pg) && VM_OBJECT_TRYWLOCK(obj)) { /* Only free unmapped pages. */ if (obj->ref_count == 0 || !pmap_page_is_mapped(pg)) /* * The busy test before the object is * locked cannot be relied upon. */ freed = vm_page_try_to_free(pg); VM_OBJECT_WUNLOCK(obj); } if (!freed) { /* * If we were asked to not cache the page, place * it near the head of the inactive queue so * that it is reclaimed sooner. Otherwise, * maintain LRU. */ if (nocache) vm_page_deactivate_noreuse(pg); else if (vm_page_active(pg)) vm_page_reference(pg); else vm_page_deactivate(pg); } } } vm_page_unlock(pg); } static void sendfile_free_mext(struct mbuf *m) { struct sf_buf *sf; vm_page_t pg; bool nocache; KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_SFBUF, ("%s: m %p !M_EXT or !EXT_SFBUF", __func__, m)); sf = m->m_ext.ext_arg1; pg = sf_buf_page(sf); nocache = m->m_ext.ext_flags & EXT_FLAG_NOCACHE; sf_buf_free(sf); sendfile_free_page(pg, nocache); if (m->m_ext.ext_flags & EXT_FLAG_SYNC) { struct sendfile_sync *sfs = m->m_ext.ext_arg2; mtx_lock(&sfs->mtx); KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); if (--sfs->count == 0) cv_signal(&sfs->cv); mtx_unlock(&sfs->mtx); } } /* * Helper function to calculate how much data to put into page i of n. * Only first and last pages are special. */ static inline off_t xfsize(int i, int n, off_t off, off_t len) { if (i == 0) return (omin(PAGE_SIZE - (off & PAGE_MASK), len)); if (i == n - 1 && ((off + len) & PAGE_MASK) > 0) return ((off + len) & PAGE_MASK); return (PAGE_SIZE); } /* * Helper function to get offset within object for i page. */ static inline vm_ooffset_t vmoff(int i, off_t off) { if (i == 0) return ((vm_ooffset_t)off); return (trunc_page(off + i * PAGE_SIZE)); } /* * Helper function used when allocation of a page or sf_buf failed. * Pretend as if we don't have enough space, subtract xfsize() of * all pages that failed. */ static inline void fixspace(int old, int new, off_t off, int *space) { KASSERT(old > new, ("%s: old %d new %d", __func__, old, new)); /* Subtract last one. */ *space -= xfsize(old - 1, old, off, *space); old--; if (new == old) /* There was only one page. */ return; /* Subtract first one. */ if (new == 0) { *space -= xfsize(0, old, off, *space); new++; } /* Rest of pages are full sized. */ *space -= (old - new) * PAGE_SIZE; KASSERT(*space >= 0, ("%s: space went backwards", __func__)); } /* * I/O completion callback. */ static void sendfile_iodone(void *arg, vm_page_t *pg, int count, int error) { struct sf_io *sfio = arg; struct socket *so = sfio->so; for (int i = 0; i < count; i++) if (pg[i] != bogus_page) vm_page_xunbusy(pg[i]); if (error) sfio->error = error; if (!refcount_release(&sfio->nios)) return; CURVNET_SET(so->so_vnet); if (sfio->error) { struct mbuf *m; /* * I/O operation failed. The state of data in the socket * is now inconsistent, and all what we can do is to tear * it down. Protocol abort method would tear down protocol * state, free all ready mbufs and detach not ready ones. * We will free the mbufs corresponding to this I/O manually. * * The socket would be marked with EIO and made available * for read, so that application receives EIO on next * syscall and eventually closes the socket. */ so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; m = sfio->m; for (int i = 0; i < sfio->npages; i++) m = m_free(m); } else (void )(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m, sfio->npages); SOCK_LOCK(so); sorele(so); CURVNET_RESTORE(); free(sfio, M_TEMP); } /* * Iterate through pages vector and request paging for non-valid pages. */ static int sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len, int npages, int rhpages, int flags) { vm_page_t *pa = sfio->pa; int grabbed, nios; nios = 0; flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0; /* * First grab all the pages and wire them. Note that we grab * only required pages. Readahead pages are dealt with later. */ VM_OBJECT_WLOCK(obj); grabbed = vm_page_grab_pages(obj, OFF_TO_IDX(off), VM_ALLOC_NORMAL | VM_ALLOC_WIRED | flags, pa, npages); if (grabbed < npages) { for (int i = grabbed; i < npages; i++) pa[i] = NULL; npages = grabbed; rhpages = 0; } for (int i = 0; i < npages;) { int j, a, count, rv; /* Skip valid pages. */ if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK, xfsize(i, npages, off, len))) { vm_page_xunbusy(pa[i]); SFSTAT_INC(sf_pages_valid); i++; continue; } /* * Next page is invalid. Check if it belongs to pager. It * may not be there, which is a regular situation for shmem * pager. For vnode pager this happens only in case of * a sparse file. * * Important feature of vm_pager_has_page() is the hint * stored in 'a', about how many pages we can pagein after * this page in a single I/O. */ if (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), NULL, &a)) { pmap_zero_page(pa[i]); pa[i]->valid = VM_PAGE_BITS_ALL; MPASS(pa[i]->dirty == 0); vm_page_xunbusy(pa[i]); i++; continue; } /* * We want to pagein as many pages as possible, limited only * by the 'a' hint and actual request. */ count = min(a + 1, npages - i); /* * We should not pagein into a valid page, thus we first trim * any valid pages off the end of request, and substitute * to bogus_page those, that are in the middle. */ for (j = i + count - 1; j > i; j--) { if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK, xfsize(j, npages, off, len))) { count--; rhpages = 0; } else break; } for (j = i + 1; j < i + count - 1; j++) if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK, xfsize(j, npages, off, len))) { vm_page_xunbusy(pa[j]); SFSTAT_INC(sf_pages_valid); SFSTAT_INC(sf_pages_bogus); pa[j] = bogus_page; } refcount_acquire(&sfio->nios); rv = vm_pager_get_pages_async(obj, pa + i, count, NULL, i + count == npages ? &rhpages : NULL, &sendfile_iodone, sfio); KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p", __func__, obj, pa[i])); SFSTAT_INC(sf_iocnt); SFSTAT_ADD(sf_pages_read, count); if (i + count == npages) SFSTAT_ADD(sf_rhpages_read, rhpages); /* * Restore the valid page pointers. They are already * unbusied, but still wired. */ for (j = i; j < i + count; j++) if (pa[j] == bogus_page) { pa[j] = vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off))); KASSERT(pa[j], ("%s: page %p[%d] disappeared", __func__, pa, j)); } i += count; nios++; } VM_OBJECT_WUNLOCK(obj); if (nios == 0 && npages != 0) SFSTAT_INC(sf_noiocnt); return (nios); } static int sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res, struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size, int *bsize) { struct vattr va; vm_object_t obj; struct vnode *vp; struct shmfd *shmfd; int error; vp = *vp_res = NULL; obj = NULL; shmfd = *shmfd_res = NULL; *bsize = 0; /* * The file descriptor must be a regular file and have a * backing VM object. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_type != VREG) { error = EINVAL; goto out; } *bsize = vp->v_mount->mnt_stat.f_iosize; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0) goto out; *obj_size = va.va_size; obj = vp->v_object; if (obj == NULL) { error = EINVAL; goto out; } } else if (fp->f_type == DTYPE_SHM) { error = 0; shmfd = fp->f_data; obj = shmfd->shm_object; *obj_size = shmfd->shm_size; } else { error = EINVAL; goto out; } VM_OBJECT_WLOCK(obj); if ((obj->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(obj); error = EBADF; goto out; } /* * Temporarily increase the backing VM object's reference * count so that a forced reclamation of its vnode does not * immediately destroy it. */ vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); *obj_res = obj; *vp_res = vp; *shmfd_res = shmfd; out: if (vp != NULL) VOP_UNLOCK(vp, 0); return (error); } static int sendfile_getsock(struct thread *td, int s, struct file **sock_fp, struct socket **so) { - cap_rights_t rights; int error; *sock_fp = NULL; *so = NULL; /* * The socket must be a stream socket and connected. */ - error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND), + error = getsock_cap(td, s, &cap_send_rights, sock_fp, NULL, NULL); if (error != 0) return (error); *so = (*sock_fp)->f_data; if ((*so)->so_type != SOCK_STREAM) return (EINVAL); return (0); } int vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { struct file *sock_fp; struct vnode *vp; struct vm_object *obj; struct socket *so; struct mbuf *m, *mh, *mhtail; struct sf_buf *sf; struct shmfd *shmfd; struct sendfile_sync *sfs; struct vattr va; off_t off, sbytes, rem, obj_size; int error, softerr, bsize, hdrlen; obj = NULL; so = NULL; m = mh = NULL; sfs = NULL; hdrlen = sbytes = 0; softerr = 0; error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); if (error != 0) return (error); error = sendfile_getsock(td, sockfd, &sock_fp, &so); if (error != 0) goto out; #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); if (error != 0) goto out; #endif SFSTAT_INC(sf_syscalls); SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags)); if (flags & SF_SYNC) { sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO); mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF); cv_init(&sfs->cv, "sendfile"); } rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset; /* * Protect against multiple writers to the socket. * * XXXRW: Historically this has assumed non-interruptibility, so now * we implement that, but possibly shouldn't. */ (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR); /* * Loop through the pages of the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. * This is done in two loops. The inner loop turns as many pages * as it can, up to available socket buffer space, without blocking * into mbufs to have it bulk delivered into the socket send buffer. * The outer loop checks the state and available space of the socket * and takes care of the overall progress. */ for (off = offset; rem > 0; ) { struct sf_io *sfio; vm_page_t *pa; struct mbuf *mtail; int nios, space, npages, rhpages; mtail = NULL; /* * Check the socket state for ongoing connection, * no errors and space in socket buffer. * If space is low allow for the remainder of the * file to be processed if it fits the socket buffer. * Otherwise block in waiting for sufficient space * to proceed, or if the socket is nonblocking, return * to userland with EAGAIN while reporting how far * we've come. * We wait until the socket buffer has significant free * space to do bulk sends. This makes good use of file * system read ahead and allows packet segmentation * offloading hardware to take over lots of work. If * we were not careful here we would send off only one * sfbuf at a time. */ SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; retry_space: if (so->so_snd.sb_state & SBS_CANTSENDMORE) { error = EPIPE; SOCKBUF_UNLOCK(&so->so_snd); goto done; } else if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto done; } if ((so->so_state & SS_ISCONNECTED) == 0) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto done; } space = sbspace(&so->so_snd); if (space < rem && (space <= 0 || space < so->so_snd.sb_lowat)) { if (so->so_state & SS_NBIO) { SOCKBUF_UNLOCK(&so->so_snd); error = EAGAIN; goto done; } /* * sbwait drops the lock while sleeping. * When we loop back to retry_space the * state may have changed and we retest * for it. */ error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error != 0) { SOCKBUF_UNLOCK(&so->so_snd); goto done; } goto retry_space; } SOCKBUF_UNLOCK(&so->so_snd); /* * At the beginning of the first loop check if any headers * are specified and copy them into mbufs. Reduce space in * the socket buffer by the size of the header mbuf chain. * Clear hdr_uio here and hdrlen at the end of the first loop. */ if (hdr_uio != NULL && hdr_uio->uio_resid > 0) { hdr_uio->uio_td = td; hdr_uio->uio_rw = UIO_WRITE; mh = m_uiotombuf(hdr_uio, M_WAITOK, space, 0, 0); hdrlen = m_length(mh, &mhtail); space -= hdrlen; /* * If header consumed all the socket buffer space, * don't waste CPU cycles and jump to the end. */ if (space == 0) { sfio = NULL; nios = 0; goto prepend_header; } hdr_uio = NULL; } if (vp != NULL) { error = vn_lock(vp, LK_SHARED); if (error != 0) goto done; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0 || off >= va.va_size) { VOP_UNLOCK(vp, 0); goto done; } if (va.va_size != obj_size) { obj_size = va.va_size; rem = nbytes ? omin(nbytes + offset, obj_size) : obj_size; rem -= off; } } if (space > rem) space = rem; npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE); /* * Calculate maximum allowed number of pages for readahead * at this iteration. If SF_USER_READAHEAD was set, we don't * do any heuristics and use exactly the value supplied by * application. Otherwise, we allow readahead up to "rem". * If application wants more, let it be, but there is no * reason to go above MAXPHYS. Also check against "obj_size", * since vm_pager_has_page() can hint beyond EOF. */ if (flags & SF_USER_READAHEAD) { rhpages = SF_READAHEAD(flags); } else { rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) - npages; rhpages += SF_READAHEAD(flags); } rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages); rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) - npages, rhpages); sfio = malloc(sizeof(struct sf_io) + npages * sizeof(vm_page_t), M_TEMP, M_WAITOK); refcount_init(&sfio->nios, 1); sfio->so = so; sfio->error = 0; nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages, flags); /* * Loop and construct maximum sized mbuf chain to be bulk * dumped into socket buffer. */ pa = sfio->pa; for (int i = 0; i < npages; i++) { struct mbuf *m0; /* * If a page wasn't grabbed successfully, then * trim the array. Can happen only with SF_NODISKIO. */ if (pa[i] == NULL) { SFSTAT_INC(sf_busy); fixspace(npages, i, off, &space); npages = i; softerr = EBUSY; break; } /* * Get a sendfile buf. When allocating the * first buffer for mbuf chain, we usually * wait as long as necessary, but this wait * can be interrupted. For consequent * buffers, do not sleep, since several * threads might exhaust the buffers and then * deadlock. */ sf = sf_buf_alloc(pa[i], m != NULL ? SFB_NOWAIT : SFB_CATCH); if (sf == NULL) { SFSTAT_INC(sf_allocfail); for (int j = i; j < npages; j++) { vm_page_lock(pa[j]); vm_page_unwire(pa[j], PQ_INACTIVE); vm_page_unlock(pa[j]); } if (m == NULL) softerr = ENOBUFS; fixspace(npages, i, off, &space); npages = i; break; } m0 = m_get(M_WAITOK, MT_DATA); m0->m_ext.ext_buf = (char *)sf_buf_kva(sf); m0->m_ext.ext_size = PAGE_SIZE; m0->m_ext.ext_arg1 = sf; m0->m_ext.ext_type = EXT_SFBUF; m0->m_ext.ext_flags = EXT_FLAG_EMBREF; m0->m_ext.ext_free = sendfile_free_mext; /* * SF_NOCACHE sets the page as being freed upon send. * However, we ignore it for the last page in 'space', * if the page is truncated, and we got more data to * send (rem > space), or if we have readahead * configured (rhpages > 0). */ if ((flags & SF_NOCACHE) && (i != npages - 1 || !((off + space) & PAGE_MASK) || !(rem > space || rhpages > 0))) m0->m_ext.ext_flags |= EXT_FLAG_NOCACHE; if (sfs != NULL) { m0->m_ext.ext_flags |= EXT_FLAG_SYNC; m0->m_ext.ext_arg2 = sfs; mtx_lock(&sfs->mtx); sfs->count++; mtx_unlock(&sfs->mtx); } m0->m_ext.ext_count = 1; m0->m_flags |= (M_EXT | M_RDONLY); if (nios) m0->m_flags |= M_NOTREADY; m0->m_data = (char *)sf_buf_kva(sf) + (vmoff(i, off) & PAGE_MASK); m0->m_len = xfsize(i, npages, off, space); if (i == 0) sfio->m = m0; /* Append to mbuf chain. */ if (mtail != NULL) mtail->m_next = m0; else m = m0; mtail = m0; } if (vp != NULL) VOP_UNLOCK(vp, 0); /* Keep track of bytes processed. */ off += space; rem -= space; /* Prepend header, if any. */ if (hdrlen) { prepend_header: mhtail->m_next = m; m = mh; mh = NULL; } if (m == NULL) { KASSERT(softerr, ("%s: m NULL, no error", __func__)); error = softerr; free(sfio, M_TEMP); goto done; } /* Add the buffer chain to the socket buffer. */ KASSERT(m_length(m, NULL) == space + hdrlen, ("%s: mlen %u space %d hdrlen %d", __func__, m_length(m, NULL), space, hdrlen)); CURVNET_SET(so->so_vnet); if (nios == 0) { /* * If sendfile_swapin() didn't initiate any I/Os, * which happens if all data is cached in VM, then * we can send data right now without the * PRUS_NOTREADY flag. */ free(sfio, M_TEMP); error = (*so->so_proto->pr_usrreqs->pru_send) (so, 0, m, NULL, NULL, td); } else { sfio->npages = npages; soref(so); error = (*so->so_proto->pr_usrreqs->pru_send) (so, PRUS_NOTREADY, m, NULL, NULL, td); sendfile_iodone(sfio, NULL, 0, 0); } CURVNET_RESTORE(); m = NULL; /* pru_send always consumes */ if (error) goto done; sbytes += space + hdrlen; if (hdrlen) hdrlen = 0; if (softerr) { error = softerr; goto done; } } /* * Send trailers. Wimp out and use writev(2). */ if (trl_uio != NULL) { sbunlock(&so->so_snd); error = kern_writev(td, sockfd, trl_uio); if (error == 0) sbytes += td->td_retval[0]; goto out; } done: sbunlock(&so->so_snd); out: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. */ if (error == 0) { td->td_retval[0] = 0; } if (sent != NULL) { (*sent) = sbytes; } if (obj != NULL) vm_object_deallocate(obj); if (so) fdrop(sock_fp, td); if (m) m_freem(m); if (mh) m_freem(mh); if (sfs != NULL) { mtx_lock(&sfs->mtx); if (sfs->count != 0) cv_wait(&sfs->cv, &sfs->mtx); KASSERT(sfs->count == 0, ("sendfile sync still busy")); cv_destroy(&sfs->cv); mtx_destroy(&sfs->mtx); free(sfs, M_TEMP); } if (error == ERESTART) error = EINTR; return (error); } static int sendfile(struct thread *td, struct sendfile_args *uap, int compat) { struct sf_hdtr hdtr; struct uio *hdr_uio, *trl_uio; struct file *fp; - cap_rights_t rights; off_t sbytes; int error; /* * File offset must be positive. If it goes beyond EOF * we send only the header/trailer and no payload data. */ if (uap->offset < 0) return (EINVAL); sbytes = 0; hdr_uio = trl_uio = NULL; if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error != 0) goto out; if (hdtr.headers != NULL) { error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio); if (error != 0) goto out; #ifdef COMPAT_FREEBSD4 /* * In FreeBSD < 5.0 the nbytes to send also included * the header. If compat is specified subtract the * header size from nbytes. */ if (compat) { if (uap->nbytes > hdr_uio->uio_resid) uap->nbytes -= hdr_uio->uio_resid; else uap->nbytes = 0; } #endif } if (hdtr.trailers != NULL) { error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio); if (error != 0) goto out; } } AUDIT_ARG_FD(uap->fd); /* * sendfile(2) can start at any offset within a file so we require * CAP_READ+CAP_SEEK = CAP_PREAD. */ - if ((error = fget_read(td, uap->fd, - cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) { + if ((error = fget_read(td, uap->fd, &cap_pread_rights, &fp)) != 0) goto out; - } error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset, uap->nbytes, &sbytes, uap->flags, td); fdrop(fp, td); if (uap->sbytes != NULL) copyout(&sbytes, uap->sbytes, sizeof(off_t)); out: free(hdr_uio, M_IOV); free(trl_uio, M_IOV); return (error); } /* * sendfile(2) * * int sendfile(int fd, int s, off_t offset, size_t nbytes, * struct sf_hdtr *hdtr, off_t *sbytes, int flags) * * Send a file specified by 'fd' and starting at 'offset' to a socket * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes == * 0. Optionally add a header and/or trailer to the socket output. If * specified, write the total number of bytes sent into *sbytes. */ int sys_sendfile(struct thread *td, struct sendfile_args *uap) { return (sendfile(td, uap, 0)); } #ifdef COMPAT_FREEBSD4 int freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap) { struct sendfile_args args; args.fd = uap->fd; args.s = uap->s; args.offset = uap->offset; args.nbytes = uap->nbytes; args.hdtr = uap->hdtr; args.sbytes = uap->sbytes; args.flags = uap->flags; return (sendfile(td, &args, 1)); } #endif /* COMPAT_FREEBSD4 */ Index: head/sys/kern/kern_sig.c =================================================================== --- head/sys/kern/kern_sig.c (revision 333424) +++ head/sys/kern/kern_sig.c (revision 333425) @@ -1,3752 +1,3750 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , signal__send, "struct thread *", "struct proc *", "int"); SDT_PROBE_DEFINE2(proc, , , signal__clear, "int", "ksiginfo_t *"); SDT_PROBE_DEFINE3(proc, , , signal__discard, "struct thread *", "struct proc *", "int"); static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static int sig_suspend_threads(struct thread *, struct proc *, int); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, int prop); static void sigqueue_start(void); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { .f_isfd = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); static int kern_lognosys = 0; SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0, "Log invalid syscalls"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) static int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); static int capmode_coredump; SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, &capmode_coredump, 0, "Allow processes in capability mode to dump core"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); static int coredump_devctl = 0; SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, 0, "Generate a devctl notification when processes coredump"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SIGPROP_KILL 0x01 /* terminates process by default */ #define SIGPROP_CORE 0x02 /* ditto and coredumps */ #define SIGPROP_STOP 0x04 /* suspend process */ #define SIGPROP_TTYSTOP 0x08 /* ditto, from tty */ #define SIGPROP_IGNORE 0x10 /* ignore by default */ #define SIGPROP_CONT 0x20 /* continue if suspended */ #define SIGPROP_CANTMASK 0x40 /* non-maskable, catchable */ static int sigproptbl[NSIG] = { [SIGHUP] = SIGPROP_KILL, [SIGINT] = SIGPROP_KILL, [SIGQUIT] = SIGPROP_KILL | SIGPROP_CORE, [SIGILL] = SIGPROP_KILL | SIGPROP_CORE, [SIGTRAP] = SIGPROP_KILL | SIGPROP_CORE, [SIGABRT] = SIGPROP_KILL | SIGPROP_CORE, [SIGEMT] = SIGPROP_KILL | SIGPROP_CORE, [SIGFPE] = SIGPROP_KILL | SIGPROP_CORE, [SIGKILL] = SIGPROP_KILL, [SIGBUS] = SIGPROP_KILL | SIGPROP_CORE, [SIGSEGV] = SIGPROP_KILL | SIGPROP_CORE, [SIGSYS] = SIGPROP_KILL | SIGPROP_CORE, [SIGPIPE] = SIGPROP_KILL, [SIGALRM] = SIGPROP_KILL, [SIGTERM] = SIGPROP_KILL, [SIGURG] = SIGPROP_IGNORE, [SIGSTOP] = SIGPROP_STOP, [SIGTSTP] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGCONT] = SIGPROP_IGNORE | SIGPROP_CONT, [SIGCHLD] = SIGPROP_IGNORE, [SIGTTIN] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGTTOU] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGIO] = SIGPROP_IGNORE, [SIGXCPU] = SIGPROP_KILL, [SIGXFSZ] = SIGPROP_KILL, [SIGVTALRM] = SIGPROP_KILL, [SIGPROF] = SIGPROP_KILL, [SIGWINCH] = SIGPROP_IGNORE, [SIGINFO] = SIGPROP_IGNORE, [SIGUSR1] = SIGPROP_KILL, [SIGUSR2] = SIGPROP_KILL, }; static void reschedule_signals(struct proc *p, sigset_t block, int flags); static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); } ksiginfo_t * ksiginfo_alloc(int wait) { int flags; flags = M_ZERO; if (! wait) flags |= M_NOWAIT; if (ksiginfo_zone != NULL) return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags)); return (NULL); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline int ksiginfo_tryfree(ksiginfo_t *ksi) { if (!(ksi->ksi_flags & KSI_EXT)) { uma_zfree(ksiginfo_zone, ksi); return (1); } return (0); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); SIGEMPTYSET(list->sq_ptrace); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. * Return: * 0 - signal not found * others - signal number */ static int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_ptrace, signo)) { count++; SIGDELSET(sq->sq_ptrace, signo); si->ksi_flags |= KSI_PTRACE; } if (SIGISMEMBER(sq->sq_kill, signo)) { count++; if (count == 1) SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo) && !SIGISMEMBER(sq->sq_ptrace, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } static int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); /* * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path * for these signals. */ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(0)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if (ret != 0) { if ((si->ksi_flags & KSI_PTRACE) != 0) { SIGADDSET(sq->sq_ptrace, signo); ret = 0; goto out_set_bit; } else if ((si->ksi_flags & KSI_TRAP) != 0 || (si->ksi_flags & KSI_SIGQ) == 0) { SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } return (ret); } out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); SIGEMPTYSET(sq->sq_ptrace); } static void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set) { sigset_t tmp; struct proc *p1, *p2; ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_ptrace; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_ptrace, tmp); SIGSETNAND(src->sq_ptrace, tmp); tmp = src->sq_signals; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); } #if 0 static void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } #endif static void sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_ptrace, *set); SIGSETNAND(sq->sq_signals, *set); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ static void sigqueue_delete_set_proc(struct proc *p, const sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } static void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to thread td, the current * thread, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { struct proc *p; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (SIGPENDING(td)) { thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; thread_unlock(td); } } int sigonstack(size_t sp) { struct thread *td = curthread; return ((td->td_pflags & TDP_ALTSTACK) ? #if defined(COMPAT_43) ((td->td_sigstk.ss_size == 0) ? (td->td_sigstk.ss_flags & SS_ONSTACK) : ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)) #else ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size) #endif : 0); } static __inline int sigprop(int sig) { if (sig > 0 && sig < nitems(sigproptbl)) return (sigproptbl[sig]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } static bool sigact_flag_test(const struct sigaction *act, int flag) { /* * SA_SIGINFO is reset when signal disposition is set to * ignore or default. Other flags are kept according to user * settings. */ return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO || ((__sighandler_t *)act->sa_sigaction != SIG_IGN && (__sighandler_t *)act->sa_sigaction != SIG_DFL))); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags) { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); if (act != NULL && act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK | SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER | SA_NOCLDWAIT | SA_SIGINFO)) != 0) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { memset(oact, 0, sizeof(*oact)); oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (sigact_flag_test(act, SA_SIGINFO)) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!sigact_flag_test(act, SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (sigact_flag_test(act, SA_ONSTACK)) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (sigact_flag_test(act, SA_RESETHAND)) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (sigact_flag_test(act, SA_NODEFER)) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SIGPROP_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sys_sigaction(struct thread *td, struct sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(struct thread *td, struct osigaction_args *uap) { struct osigaction sa; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(struct thread *td, struct osigreturn_args *uap) { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(struct proc *p) { int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SIGPROP_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset specified signal to the default disposition. */ static void sigdflt(struct sigacts *ps, int sig) { mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { sigset_t osigignore; struct sigacts *ps; int sig; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); sigdflt(ps, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0) sigqueue_delete_proc(p, sig); } /* * As CloudABI processes cannot modify signal handlers, fully * reset all signals to their default behavior. Do ignore * SIGPIPE, as it would otherwise be impossible to recover from * writes to broken pipes and sockets. */ if (SV_PROC_ABI(p) == SV_ABI_CLOUDABI) { osigignore = ps->ps_sigignore; while (SIGNOTEMPTY(osigignore)) { sig = sig_ffs(&osigignore); SIGDELSET(osigignore, sig); if (sig != SIGPIPE) sigdflt(ps, sig); } SIGADDSET(ps->ps_sigignore, SIGPIPE); } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td = curthread; MPASS(td->td_proc == p); td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset, int flags) { sigset_t new_block, oset1; struct proc *p; int error; p = td->td_proc; if ((flags & SIGPROCMASK_PROC_LOCKED) != 0) PROC_LOCK_ASSERT(p, MA_OWNED); else PROC_LOCK(p); mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; SIGSETOR(td->td_sigmask, *set); new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); goto out; case SIG_SETMASK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; if (flags & SIGPROCMASK_OLD) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); signotify(td); break; default: error = EINVAL; goto out; } /* * The new_block set contains signals that were not previously * blocked, but are blocked now. * * In case we block any signal that was not previously blocked * for td, and process has the signal pending, try to schedule * signal delivery to some thread that does not block the * signal, possibly waking it up. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, flags); } out: if (!(flags & SIGPROCMASK_PROC_LOCKED)) PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap) { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(struct thread *td, struct osigprocmask_args *uap) { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sys_sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT) error = ERESTART; if (error == ERESTART) return (error); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } static void proc_td_siginfo_capture(struct thread *td, siginfo_t *si) { struct thread *thr; FOREACH_THREAD_IN_PROC(td->td_proc, thr) { if (thr == td) thr->td_si = *si; else thr->td_si.si_signo = 0; } } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t saved_mask, new_block; struct proc *p; int error, sig, timo, timevalid = 0; struct timespec rts, ets, ts; struct timeval tv; p = td->td_proc; error = 0; ets.tv_sec = 0; ets.tv_nsec = 0; if (timeout != NULL) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; getnanouptime(&rts); ets = rts; timespecadd(&ets, timeout); } } ksiginfo_init(ksi); /* Some signals can not be waited for. */ SIG_CANTMASK(waitset); ps = p->p_sigacts; PROC_LOCK(p); saved_mask = td->td_sigmask; SIGSETNAND(td->td_sigmask, waitset); for (;;) { mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); KASSERT(sig >= 0, ("sig %d", sig)); if (sig != 0 && SIGISMEMBER(waitset, sig)) { if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 || sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) { error = 0; break; } } if (error != 0) break; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout != NULL) { if (!timevalid) { error = EINVAL; break; } getnanouptime(&rts); if (timespeccmp(&rts, &ets, >=)) { error = EAGAIN; break; } ts = ets; timespecsub(&ts, &rts); TIMESPEC_TO_TIMEVAL(&tv, &ts); timo = tvtohz(&tv); } else { timo = 0; } error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo); if (timeout != NULL) { if (error == ERESTART) { /* Timeout can not be restarted. */ error = EINTR; } else if (error == EAGAIN) { /* We will calculate timeout by ourself. */ error = 0; } } } new_block = saved_mask; SIGSETNAND(new_block, td->td_sigmask); td->td_sigmask = saved_mask; /* * Fewer signals can be delivered to us, reschedule signal * notification. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, 0); if (error == 0) { SDT_PROBE2(proc, , , signal__clear, sig, ksi); if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code); } #endif if (sig == SIGKILL) { proc_td_siginfo_capture(td, &ksi->ksi_info); sigexit(td, sig); } } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sys_sigpending(struct thread *td, struct sigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(struct thread *td, struct osigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(struct thread *td, struct osigvec_args *uap) { struct sigvec vec; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(struct thread *td, struct osigblock_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(struct thread *td, struct osigsetmask_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap) { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; int has_sig, sig; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask, SIGPROCMASK_PROC_LOCKED); td->td_pflags |= TDP_OLDMASK; /* * Process signals now. Otherwise, we can get spurious wakeup * due to signal entered process queue, but delivered to other * thread. But sigsuspend should return only on signal * delivery. */ (p->p_sysent->sv_set_syscall_retval)(td, EINTR); for (has_sig = 0; !has_sig;) { while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; thread_suspend_check(0); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); has_sig += postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); } PROC_UNLOCK(p); td->td_errno = EINTR; td->td_pflags |= TDP_NERRNO; return (EJUSTRETURN); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(struct thread *td, struct osigsuspend_args *uap) { sigset_t mask; OSIG2SIG(uap->mask, mask); return (kern_sigsuspend(td, mask)); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(struct thread *td, struct osigstack_args *uap) { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap) { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi) { struct proc *p; struct pgrp *pgrp; int err; int ret; ret = ESRCH; if (all) { /* * broadcast */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc || p->p_state == PRS_NEW) { continue; } PROC_LOCK(p); err = p_cansignal(td, p, sig); if (err == 0) { if (sig) pksignal(p, sig, ksi); ret = err; } else if (ret == ESRCH) ret = err; PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } err = p_cansignal(td, p, sig); if (err == 0) { if (sig) pksignal(p, sig, ksi); ret = err; } else if (ret == ESRCH) ret = err; PROC_UNLOCK(p); } PGRP_UNLOCK(pgrp); } return (ret); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int sys_kill(struct thread *td, struct kill_args *uap) { ksiginfo_t ksi; struct proc *p; int error; /* * A process in capability mode can send signals only to himself. * The main rationale behind this is that abort(3) is implemented as * kill(getpid(), SIGABRT). */ if (IN_CAPABILITY_MODE(td) && uap->pid != td->td_proc->p_pid) return (ECAPMODE); AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->pid > 0) { /* kill single process */ if ((p = pfind_any(uap->pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) pksignal(p, uap->signum, &ksi); PROC_UNLOCK(p); return (error); } switch (uap->pid) { case -1: /* broadcast signal */ return (killpg1(td, uap->signum, 0, 1, &ksi)); case 0: /* signal own process group */ return (killpg1(td, uap->signum, 0, 0, &ksi)); default: /* negative explicit process group */ return (killpg1(td, uap->signum, -uap->pid, 0, &ksi)); } /* NOTREACHED */ } int sys_pdkill(struct thread *td, struct pdkill_args *uap) { struct proc *p; - cap_rights_t rights; int error; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_FD(uap->fd); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); - error = procdesc_find(td, uap->fd, - cap_rights_init(&rights, CAP_PDKILL), &p); + error = procdesc_find(td, uap->fd, &cap_pdkill_rights, &p); if (error) return (error); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) kern_psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(struct thread *td, struct okillpg_args *uap) { ksiginfo_t ksi; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; return (killpg1(td, uap->signum, uap->pgid, 0, &ksi)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sys_sigqueue(struct thread *td, struct sigqueue_args *uap) { union sigval sv; sv.sival_ptr = uap->value; return (kern_sigqueue(td, uap->pid, uap->signum, &sv)); } int kern_sigqueue(struct thread *td, pid_t pid, int signum, union sigval *value) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (pid <= 0) return (EINVAL); if ((p = pfind_any(pid)) == NULL) return (ESRCH); error = p_cansignal(td, p, signum); if (error == 0 && signum != 0) { ksiginfo_init(&ksi); ksi.ksi_flags = KSI_SIGQ; ksi.ksi_signo = signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value = *value; error = pksignal(p, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. */ void gsignal(int pgid, int sig, ksiginfo_t *ksi) { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0, ksi); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi) { struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && (checkctty == 0 || p->p_flag & P_CONTROLT)) pksignal(p, sig, ksi); PROC_UNLOCK(p); } } } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. Should be called after * mach-specific routine, because sysent->sv_sendsig() needs correct * ps_siginfo and signal mask. */ static void postsig_done(int sig, struct thread *td, struct sigacts *ps) { sigset_t mask; mtx_assert(&ps->ps_mtx, MA_OWNED); td->td_ru.ru_nsignals++; mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(mask, sig); kern_sigprocmask(td, SIG_BLOCK, &mask, NULL, SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED); if (SIGISMEMBER(ps->ps_sigreset, sig)) sigdflt(ps, sig); } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; int sig; int code; p = td->td_proc; sig = ksi->ksi_signo; code = ksi->ksi_code; KASSERT(_SIG_VALID(sig), ("invalid signal")); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(td->td_sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ tdsendsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, int prop) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if current thread can handle the signal without * switching context to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; break; } } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void kern_psignal(struct proc *p, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(p, NULL, sig, &ksi); } int pksignal(struct proc *p, int sig, ksiginfo_t *ksi) { return (tdsendsignal(p, NULL, sig, ksi)); } /* Utility function for finding a thread to send signal event to. */ int sigev_findtd(struct proc *p ,struct sigevent *sigev, struct thread **ttd) { struct thread *td; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = tdfind(sigev->sigev_notify_thread_id, p->p_pid); if (td == NULL) return (ESRCH); *ttd = td; } else { *ttd = NULL; PROC_LOCK(p); } return (0); } void tdsignal(struct thread *td, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(td->td_proc, td, sig, &ksi); } void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi) { (void) tdsendsignal(td->td_proc, td, sig, ksi); } int tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; int wakeup_swapper; MPASS(td == NULL || p == td->td_proc); PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) panic("%s(): invalid signal %d", __func__, sig); KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__)); /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { td = sigtd(p, sig, prop); sigqueue = &p->p_sigqueue; } else sigqueue = &td->td_sigqueue; SDT_PROBE3(proc, , , signal__send, td, p, sig); /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { SDT_PROBE3(proc, , , signal__discard, td, p, sig); mtx_unlock(&ps->ps_mtx); if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; mtx_unlock(&ps->ps_mtx); if (prop & SIGPROP_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SIGPROP_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SIGPROP_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); /* SIGKILL: Remove procfs STOPEVENTs. */ if (sig == SIGKILL) { /* from procfs_ioctl.c: PIOCBIC */ p->p_stops = 0; /* from procfs_ioctl.c: PIOCCONT */ p->p_step = 0; wakeup(&p->p_step); } /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { KASSERT(!(p->p_flag & P_WEXIT), ("signal to stopped but exiting process")); if (sig == SIGKILL) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SIGPROP_CONT) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; PROC_SLOCK(p); if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xsig = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ PROC_SUNLOCK(p); goto runfast; } /* * The signal is not ignored or caught. */ thread_unsuspend(p); PROC_SUNLOCK(p); goto out; } if (prop & SIGPROP_STOP) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ wakeup_swapper = 0; PROC_SLOCK(p); thread_lock(td); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) wakeup_swapper = sleepq_abort(td, intrval); thread_unlock(td); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { tdsigwakeup(td, sig, action, intrval); goto out; } MPASS(action == SIG_DFL); if (prop & SIGPROP_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); wakeup_swapper = sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xsig); } else PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); goto out; } } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: tdsigwakeup(td, sig, action, intrval); PROC_SLOCK(p); thread_unsuspend(p); PROC_SUNLOCK(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; int prop; int wakeup_swapper; wakeup_swapper = 0; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); PROC_SLOCK(p); thread_lock(td); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. Be careful to avoid bumping the * priority of the idle thread, since we still allow to signal * kernel processes. */ if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) goto out; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SIGPROP_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); return; } /* * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) goto out; /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); wakeup_swapper = sleepq_abort(td, intrval); } else { /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif } out: PROC_SUNLOCK(p); thread_unlock(td); if (wakeup_swapper) kick_proc0(); } static int sig_suspend_threads(struct thread *td, struct proc *p, int sending) { struct thread *td2; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); MPASS(sending || td == curthread); wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_SBDRY) { /* * Once a thread is asleep with * TDF_SBDRY and without TDF_SERESTART * or TDF_SEINTR set, it should never * become suspended due to this check. */ KASSERT(!TD_IS_SUSPENDED(td2), ("thread with deferred stops suspended")); if (TD_SBDRY_INTR(td2)) wakeup_swapper |= sleepq_abort(td2, TD_SBDRY_ERRNO(td2)); } else if (!TD_IS_SUSPENDED(td2)) { thread_suspend_one(td2); } } else if (!TD_IS_SUSPENDED(td2)) { if (sending || td != td2) td2->td_flags |= TDF_ASTPENDING; #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } return (wakeup_swapper); } /* * Stop the process for an event deemed interesting to the debugger. If si is * non-NULL, this is a signal exchange; the new signal requested by the * debugger will be returned for handling. If si is NULL, this is some other * type of interesting event. The debugger may request a signal be delivered in * that case as well, however it will be deferred until it can be handled. */ int ptracestop(struct thread *td, int sig, ksiginfo_t *si) { struct proc *p = td->td_proc; struct thread *td2; ksiginfo_t ksi; int prop; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); td->td_xsig = sig; if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) { td->td_dbgflags |= TDB_XSIG; CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d", td->td_tid, p->p_pid, td->td_dbgflags, sig); PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) { if (P_KILLED(p)) { /* * Ensure that, if we've been PT_KILLed, the * exit status reflects that. Another thread * may also be in ptracestop(), having just * received the SIGKILL, but this thread was * unsuspended first. */ td->td_dbgflags &= ~TDB_XSIG; td->td_xsig = SIGKILL; p->p_ptevents = 0; break; } if (p->p_flag & P_SINGLE_EXIT && !(td->td_dbgflags & TDB_EXIT)) { /* * Ignore ptrace stops except for thread exit * events when the process exits. */ td->td_dbgflags &= ~TDB_XSIG; PROC_SUNLOCK(p); return (0); } /* * Make wait(2) work. Ensure that right after the * attach, the thread which was decided to become the * leader of attach gets reported to the waiter. * Otherwise, just avoid overwriting another thread's * assignment to p_xthread. If another thread has * already set p_xthread, the current thread will get * a chance to report itself upon the next iteration. */ if ((td->td_dbgflags & TDB_FSTP) != 0 || ((p->p_flag2 & P2_PTRACE_FSTP) == 0 && p->p_xthread == NULL)) { p->p_xsig = sig; p->p_xthread = td; td->td_dbgflags &= ~TDB_FSTP; p->p_flag2 &= ~P2_PTRACE_FSTP; p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE; sig_suspend_threads(td, p, 0); } if ((td->td_dbgflags & TDB_STOPATFORK) != 0) { td->td_dbgflags &= ~TDB_STOPATFORK; cv_broadcast(&p->p_dbgwait); } stopme: thread_suspend_switch(td, p); if (p->p_xthread == td) p->p_xthread = NULL; if (!(p->p_flag & P_TRACED)) break; if (td->td_dbgflags & TDB_SUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); } if (si != NULL && sig == td->td_xsig) { /* Parent wants us to take the original signal unchanged. */ si->ksi_flags |= KSI_HEAD; if (sigqueue_add(&td->td_sigqueue, sig, si) != 0) si->ksi_signo = 0; } else if (td->td_xsig != 0) { /* * If parent wants us to take a new signal, then it will leave * it in td->td_xsig; otherwise we just look for signals again. */ ksiginfo_init(&ksi); ksi.ksi_signo = td->td_xsig; ksi.ksi_flags |= KSI_PTRACE; prop = sigprop(td->td_xsig); td2 = sigtd(p, td->td_xsig, prop); tdsendsignal(p, td2, td->td_xsig, &ksi); if (td != td2) return (0); } return (td->td_xsig); } static void reschedule_signals(struct proc *p, sigset_t block, int flags) { struct sigacts *ps; struct thread *td; int sig; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (SIGISEMPTY(p->p_siglist)) return; SIGSETAND(block, p->p_siglist); while ((sig = sig_ffs(&block)) != 0) { SIGDELSET(block, sig); td = sigtd(p, sig, 0); signotify(td); if (!(flags & SIGPROCMASK_PS_LOCKED)) mtx_lock(&ps->ps_mtx); if (p->p_flag & P_TRACED || (SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig))) tdsigwakeup(td, sig, SIG_CATCH, (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART)); if (!(flags & SIGPROCMASK_PS_LOCKED)) mtx_unlock(&ps->ps_mtx); } } void tdsigcleanup(struct thread *td) { struct proc *p; sigset_t unblocked; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_flush(&td->td_sigqueue); if (p->p_numthreads == 1) return; /* * Since we cannot handle signals, notify signal post code * about this by filling the sigmask. * * Also, if needed, wake up thread(s) that do not block the * same signals as the exiting thread, since the thread might * have been selected for delivery and woken up. */ SIGFILLSET(unblocked); SIGSETNAND(unblocked, td->td_sigmask); SIGFILLSET(td->td_sigmask); reschedule_signals(p, unblocked, 0); } static int sigdeferstop_curr_flags(int cflags) { MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 || (cflags & TDF_SBDRY) != 0); return (cflags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)); } /* * Defer the delivery of SIGSTOP for the current thread, according to * the requested mode. Returns previous flags, which must be restored * by sigallowstop(). * * TDF_SBDRY, TDF_SEINTR, and TDF_SERESTART flags are only set and * cleared by the current thread, which allow the lock-less read-only * accesses below. */ int sigdeferstop_impl(int mode) { struct thread *td; int cflags, nflags; td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); switch (mode) { case SIGDEFERSTOP_NOP: nflags = cflags; break; case SIGDEFERSTOP_OFF: nflags = 0; break; case SIGDEFERSTOP_SILENT: nflags = (cflags | TDF_SBDRY) & ~(TDF_SEINTR | TDF_SERESTART); break; case SIGDEFERSTOP_EINTR: nflags = (cflags | TDF_SBDRY | TDF_SEINTR) & ~TDF_SERESTART; break; case SIGDEFERSTOP_ERESTART: nflags = (cflags | TDF_SBDRY | TDF_SERESTART) & ~TDF_SEINTR; break; default: panic("sigdeferstop: invalid mode %x", mode); break; } if (cflags == nflags) return (SIGDEFERSTOP_VAL_NCHG); thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | nflags; thread_unlock(td); return (cflags); } /* * Restores the STOP handling mode, typically permitting the delivery * of SIGSTOP for the current thread. This does not immediately * suspend if a stop was posted. Instead, the thread will suspend * either via ast() or a subsequent interruptible sleep. */ void sigallowstop_impl(int prev) { struct thread *td; int cflags; KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop")); KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("sigallowstop: incorrect previous mode %x", prev)); td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); if (cflags != prev) { thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | prev; thread_unlock(td); } } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(struct thread *td) { struct proc *p; struct sigacts *ps; struct sigqueue *queue; sigset_t sigpending; ksiginfo_t ksi; int prop, sig, traced; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); sigpending = td->td_sigqueue.sq_signals; SIGSETOR(sigpending, p->p_sigqueue.sq_signals); SIGSETNAND(sigpending, td->td_sigmask); if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED && (p->p_flag2 & P2_PTRACE_FSTP) != 0 && SIGISMEMBER(sigpending, SIGSTOP)) { /* * If debugger just attached, always consume * SIGSTOP from ptrace(PT_ATTACH) first, to * execute the debugger attach ritual in * order. */ sig = SIGSTOP; td->td_dbgflags |= TDB_FSTP; } else { sig = sig_ffs(&sigpending); } if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) { sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); continue; } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) { /* * If traced, always stop. * Remove old signal from queue before the stop. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ queue = &td->td_sigqueue; ksiginfo_init(&ksi); if (sigqueue_get(queue, sig, &ksi) == 0) { queue = &p->p_sigqueue; sigqueue_get(queue, sig, &ksi); } td->td_si = ksi.ksi_info; mtx_unlock(&ps->ps_mtx); sig = ptracestop(td, sig, &ksi); mtx_lock(&ps->ps_mtx); /* * Keep looking if the debugger discarded or * replaced the signal. */ if (sig == 0) continue; /* * If the signal became masked, re-queue it. */ if (SIGISMEMBER(td->td_sigmask, sig)) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(&p->p_sigqueue, sig, &ksi); continue; } /* * If the traced bit got turned off, requeue * the signal and go back up to the top to * rescan signals. This ensures that p_sig* * and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(queue, sig, &ksi); continue; } } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process with * default action, stop here, then clear the signal. * Traced or exiting processes should ignore stops. * Additionally, a member of an orphaned process group * should ignore tty stops. */ if (prop & SIGPROP_STOP) { if (p->p_flag & (P_TRACED | P_WEXIT | P_SINGLE_EXIT) || (p->p_pgrp->pg_jobc == 0 && prop & SIGPROP_TTYSTOP)) break; /* == ignore */ if (TD_SBDRY_INTR(td)) { KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); return (-1); } mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p, 0); thread_suspend_switch(td, p); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); goto next; } else if (prop & SIGPROP_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SIGPROP_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */ sigqueue_delete(&p->p_sigqueue, sig); next:; } /* NOTREACHED */ } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ int postsig(int sig) { struct thread *td; struct proc *p; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; KASSERT(sig != 0, ("postsig")); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 && sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0) return (0); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code); #endif if ((p->p_stops & S_SIG) != 0) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); proc_td_siginfo_capture(td, &ksi.ksi_info); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN, ("postsig action %p", action)); KASSERT(!SIGISMEMBER(td->td_sigmask, sig), ("postsig action: blocked sig %d", sig)); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; if (p->p_sig == sig) { p->p_code = 0; p->p_sig = 0; } (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } return (1); } /* * Kill the current process for stated reason. */ void killproc(struct proc *p, char *why) { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why); p->p_flag |= P_WKILLED; kern_psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(struct thread *td, int sig) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SIGPROP_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n", p->p_pid, p->p_comm, td->td_ucred ? td->td_ucred->cr_uid : -1, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else PROC_UNLOCK(p); exit1(td, 0, sig); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } pksignal(p->p_pptr, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int sig) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, sig); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xsig); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason, status; if (WCOREDUMP(p->p_xsig)) { reason = CLD_DUMPED; status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { reason = CLD_KILLED; status = WTERMSIG(p->p_xsig); } else { reason = CLD_EXITED; status = p->p_xexit; } /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). */ sigparent(p, reason, status); } /* * We only have 1 character for the core count in the format * string, so the range will be 0-9 */ #define MAX_NUM_CORE_FILES 10 #ifndef NUM_CORE_FILES #define NUM_CORE_FILES 5 #endif CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES); static int num_cores = NUM_CORE_FILES; static int sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) { int error; int new_val; new_val = num_cores; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val > MAX_NUM_CORE_FILES) new_val = MAX_NUM_CORE_FILES; if (new_val < 0) new_val = 0; num_cores = new_val; return (0); } SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof(int), sysctl_debug_num_cores_check, "I", ""); #define GZIP_SUFFIX ".gz" #define ZSTD_SUFFIX ".zst" int compress_user_cores = 0; static int sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) { int error, val; val = compress_user_cores; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && !compressor_avail(val)) return (EINVAL); compress_user_cores = val; return (error); } SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_compress_user_cores, "I", "Enable compression of user corefiles (" __XSTRING(COMPRESS_GZIP) " = gzip, " __XSTRING(COMPRESS_ZSTD) " = zstd)"); int compress_user_cores_level = 6; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, &compress_user_cores_level, 0, "Corefile compression level"); /* * Protect the access to corefilename[] by allproc_lock. */ #define corefilename_lock allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); static int sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) { int error; sx_xlock(&corefilename_lock); error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), req); sx_xunlock(&corefilename_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", "Process corefile name format string"); /* * corefile_open(comm, uid, pid, td, compress, vpp, namep) * Expand the name described in corefilename, using name, uid, and pid * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). */ static int corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, int compress, struct vnode **vpp, char **namep) { struct nameidata nd; struct sbuf sb; const char *format; char *hostname, *name; int indexpos, i, error, cmode, flags, oflags; hostname = NULL; format = corefilename; name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexpos = -1; (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); sx_slock(&corefilename_lock); for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': sbuf_putc(&sb, '%'); break; case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, M_TEMP, M_WAITOK); } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); sbuf_printf(&sb, "%s", hostname); break; case 'I': /* autoincrementing index */ sbuf_printf(&sb, "0"); indexpos = sbuf_len(&sb) - 1; break; case 'N': /* process name */ sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); break; case 'U': /* user id */ sbuf_printf(&sb, "%u", uid); break; default: log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); break; } break; default: sbuf_putc(&sb, format[i]); break; } } sx_sunlock(&corefilename_lock); free(hostname, M_TEMP); if (compress == COMPRESS_GZIP) sbuf_printf(&sb, GZIP_SUFFIX); else if (compress == COMPRESS_ZSTD) sbuf_printf(&sb, ZSTD_SUFFIX); if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); free(name, M_TEMP); return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); /* * If the core format has a %I in it, then we need to check * for existing corefiles before returning a name. * To do this we iterate over 0..num_cores to find a * non-existing core file name to use. */ if (indexpos != -1) { for (i = 0; i < num_cores; i++) { flags = O_CREAT | O_EXCL | FWRITE | O_NOFOLLOW; name[indexpos] = '0' + i; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error) { if (error == EEXIST) continue; log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " "on initial open test, error = %d\n", pid, comm, uid, name, error); } goto out; } } flags = O_CREAT | FWRITE | O_NOFOLLOW; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); out: if (error) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); *vpp = nd.ni_vp; *namep = name; return (0); } static int coredump_sanitise_path(const char *path) { size_t i; /* * Only send a subset of ASCII to devd(8) because it * might pass these strings to sh -c. */ for (i = 0; path[i]; i++) if (!(isalpha(path[i]) || isdigit(path[i])) && path[i] != '/' && path[i] != '.' && path[i] != '-') return (0); return (1); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vnode *vp; struct flock lf; struct vattr vattr; int error, error1, locked; char *name; /* name of corefile */ void *rl_cookie; off_t limit; char *data = NULL; char *fullpath, *freepath = NULL; size_t len; static const char comm_name[] = "comm="; static const char core_name[] = "core="; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); _STOPEVENT(p, S_CORE, 0); if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || (p->p_flag2 & P2_NOTRACE) != 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(td, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress_user_cores, &vp, &name); if (error != 0) return (error); /* * Don't dump to non-regular files or files with links. * Do not dump into system files. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0) { VOP_UNLOCK(vp, 0); error = EFAULT; goto out; } VOP_UNLOCK(vp, 0); /* Postpone other writers, including core dumps of other processes. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp, 0); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); if (p->p_sysent->sv_coredump != NULL) { error = p->p_sysent->sv_coredump(td, vp, limit, 0); } else { error = ENOSYS; } if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } vn_rangelock_unlock(vp, rl_cookie); /* * Notify the userland helper that a process triggered a core dump. * This allows the helper to run an automated debugging session. */ if (error != 0 || coredump_devctl == 0) goto out; len = MAXPATHLEN * 2 + sizeof(comm_name) - 1 + sizeof(' ') + sizeof(core_name) - 1; data = malloc(len, M_TEMP, M_WAITOK); if (vn_fullpath_global(td, p->p_textvp, &fullpath, &freepath) != 0) goto out; if (!coredump_sanitise_path(fullpath)) goto out; snprintf(data, len, "%s%s ", comm_name, fullpath); free(freepath, M_TEMP); freepath = NULL; if (vn_fullpath_global(td, vp, &fullpath, &freepath) != 0) goto out; if (!coredump_sanitise_path(fullpath)) goto out; strlcat(data, core_name, len); strlcat(data, fullpath, len); devctl_notify("kernel", "signal", "coredump", data); out: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(freepath, M_TEMP); free(data, M_TEMP); free(name, M_TEMP); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(struct thread *td, struct nosys_args *args) { struct proc *p; p = td->td_proc; PROC_LOCK(p); tdsignal(td, SIGSYS); PROC_UNLOCK(p); if (kern_lognosys == 1 || kern_lognosys == 3) { uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } if (kern_lognosys == 2 || kern_lognosys == 3) { printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(struct sigio **sigiop, int sig, int checkctty) { ksiginfo_t ksi; struct sigio *sigio; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) kern_psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) kern_psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); refcount_init(&ps->ps_refcnt, 1); mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { if (refcount_release(&ps->ps_refcnt) == 0) return; mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } struct sigacts * sigacts_hold(struct sigacts *ps) { refcount_acquire(&ps->ps_refcnt); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { return (ps->ps_refcnt > 1); } Index: head/sys/kern/subr_capability.c =================================================================== --- head/sys/kern/subr_capability.c (revision 333424) +++ head/sys/kern/subr_capability.c (revision 333425) @@ -1,309 +1,351 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 FreeBSD Foundation * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Note that this file is compiled into the kernel and into libc. */ #include #include #ifdef _KERNEL #include - +#include #include #else /* !_KERNEL */ #include #include #include #include #include #endif #ifdef _KERNEL #define assert(exp) KASSERT((exp), ("%s:%u", __func__, __LINE__)) + +CAP_RIGHTS_DEFINE1(cap_accept_rights, CAP_ACCEPT); +CAP_RIGHTS_DEFINE1(cap_bind_rights, CAP_BIND); +CAP_RIGHTS_DEFINE1(cap_connect_rights, CAP_CONNECT); +CAP_RIGHTS_DEFINE1(cap_event_rights, CAP_EVENT); +CAP_RIGHTS_DEFINE1(cap_fchdir_rights, CAP_FCHDIR); +CAP_RIGHTS_DEFINE1(cap_fcntl_rights, CAP_FCNTL); +CAP_RIGHTS_DEFINE1(cap_fexecve_rights, CAP_FEXECVE); +CAP_RIGHTS_DEFINE1(cap_flock_rights, CAP_FLOCK); +CAP_RIGHTS_DEFINE1(cap_fpathconf_rights, CAP_FPATHCONF); +CAP_RIGHTS_DEFINE1(cap_fstat_rights, CAP_FSTAT); +CAP_RIGHTS_DEFINE1(cap_fsync_rights, CAP_FSYNC); +CAP_RIGHTS_DEFINE1(cap_ftruncate_rights, CAP_FTRUNCATE); +CAP_RIGHTS_DEFINE1(cap_getpeername_rights, CAP_GETPEERNAME); +CAP_RIGHTS_DEFINE1(cap_getsockname_rights, CAP_GETSOCKNAME); +CAP_RIGHTS_DEFINE1(cap_getsockopt_rights, CAP_GETSOCKOPT); +CAP_RIGHTS_DEFINE1(cap_ioctl_rights, CAP_IOCTL); +CAP_RIGHTS_DEFINE1(cap_listen_rights, CAP_LISTEN); +CAP_RIGHTS_DEFINE1(cap_mmap_rights, CAP_MMAP); +CAP_RIGHTS_DEFINE1(cap_pdgetpid_rights, CAP_PDGETPID); +CAP_RIGHTS_DEFINE1(cap_pdkill_rights, CAP_PDKILL); +CAP_RIGHTS_DEFINE1(cap_pread_rights, CAP_PREAD); +CAP_RIGHTS_DEFINE1(cap_pwrite_rights, CAP_PWRITE); +CAP_RIGHTS_DEFINE1(cap_read_rights, CAP_READ); +CAP_RIGHTS_DEFINE1(cap_recv_rights, CAP_RECV); +CAP_RIGHTS_DEFINE1(cap_send_rights, CAP_SEND); +CAP_RIGHTS_DEFINE1(cap_setsockopt_rights, CAP_SETSOCKOPT); +CAP_RIGHTS_DEFINE1(cap_shutdown_rights, CAP_SHUTDOWN); +CAP_RIGHTS_DEFINE1(cap_write_rights, CAP_WRITE); + +__read_mostly cap_rights_t cap_no_rights; +CAP_RIGHTS_SYSINIT0(cap_no_rights, cap_no_rights); #endif #define CAPARSIZE_MIN (CAP_RIGHTS_VERSION_00 + 2) #define CAPARSIZE_MAX (CAP_RIGHTS_VERSION + 2) static __inline int right_to_index(uint64_t right) { static const int bit2idx[] = { -1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; int idx; idx = CAPIDXBIT(right); assert(idx >= 0 && idx < sizeof(bit2idx) / sizeof(bit2idx[0])); return (bit2idx[idx]); } static void cap_rights_vset(cap_rights_t *rights, va_list ap) { uint64_t right; int i, n; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); n = CAPARSIZE(rights); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (;;) { right = (uint64_t)va_arg(ap, unsigned long long); if (right == 0) break; assert(CAPRVER(right) == 0); i = right_to_index(right); assert(i >= 0); assert(i < n); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); rights->cr_rights[i] |= right; assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); } } static void cap_rights_vclear(cap_rights_t *rights, va_list ap) { uint64_t right; int i, n; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); n = CAPARSIZE(rights); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (;;) { right = (uint64_t)va_arg(ap, unsigned long long); if (right == 0) break; assert(CAPRVER(right) == 0); i = right_to_index(right); assert(i >= 0); assert(i < n); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); rights->cr_rights[i] &= ~(right & 0x01FFFFFFFFFFFFFFULL); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); } } static bool cap_rights_is_vset(const cap_rights_t *rights, va_list ap) { uint64_t right; int i, n; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); n = CAPARSIZE(rights); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (;;) { right = (uint64_t)va_arg(ap, unsigned long long); if (right == 0) break; assert(CAPRVER(right) == 0); i = right_to_index(right); assert(i >= 0); assert(i < n); assert(CAPIDXBIT(rights->cr_rights[i]) == CAPIDXBIT(right)); if ((rights->cr_rights[i] & right) != right) return (false); } return (true); +} + +void +__cap_rights_sysinit(void *arg) +{ + struct cap_rights_init_args *cria = arg; + cap_rights_t *rights = cria->cria_rights; + + __cap_rights_init(CAP_RIGHTS_VERSION, rights, cria->cria_value1, + cria->cria_value2, cria->cria_value3, cria->cria_value4, 0ULL); } cap_rights_t * __cap_rights_init(int version, cap_rights_t *rights, ...) { unsigned int n; va_list ap; assert(version == CAP_RIGHTS_VERSION_00); n = version + 2; assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); CAP_NONE(rights); va_start(ap, rights); cap_rights_vset(rights, ap); va_end(ap); return (rights); } cap_rights_t * __cap_rights_set(cap_rights_t *rights, ...) { va_list ap; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); va_start(ap, rights); cap_rights_vset(rights, ap); va_end(ap); return (rights); } cap_rights_t * __cap_rights_clear(cap_rights_t *rights, ...) { va_list ap; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); va_start(ap, rights); cap_rights_vclear(rights, ap); va_end(ap); return (rights); } bool __cap_rights_is_set(const cap_rights_t *rights, ...) { va_list ap; bool ret; assert(CAPVER(rights) == CAP_RIGHTS_VERSION_00); va_start(ap, rights); ret = cap_rights_is_vset(rights, ap); va_end(ap); return (ret); } bool cap_rights_is_valid(const cap_rights_t *rights) { cap_rights_t allrights; int i, j; if (CAPVER(rights) != CAP_RIGHTS_VERSION_00) return (false); if (CAPARSIZE(rights) < CAPARSIZE_MIN || CAPARSIZE(rights) > CAPARSIZE_MAX) { return (false); } CAP_ALL(&allrights); if (!cap_rights_contains(&allrights, rights)) return (false); for (i = 0; i < CAPARSIZE(rights); i++) { j = right_to_index(rights->cr_rights[i]); if (i != j) return (false); if (i > 0) { if (CAPRVER(rights->cr_rights[i]) != 0) return (false); } } return (true); } cap_rights_t * cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src) { unsigned int i, n; assert(CAPVER(dst) == CAP_RIGHTS_VERSION_00); assert(CAPVER(src) == CAP_RIGHTS_VERSION_00); assert(CAPVER(dst) == CAPVER(src)); assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); n = CAPARSIZE(dst); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (i = 0; i < n; i++) dst->cr_rights[i] |= src->cr_rights[i]; assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); return (dst); } cap_rights_t * cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src) { unsigned int i, n; assert(CAPVER(dst) == CAP_RIGHTS_VERSION_00); assert(CAPVER(src) == CAP_RIGHTS_VERSION_00); assert(CAPVER(dst) == CAPVER(src)); assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); n = CAPARSIZE(dst); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (i = 0; i < n; i++) { dst->cr_rights[i] &= ~(src->cr_rights[i] & 0x01FFFFFFFFFFFFFFULL); } assert(cap_rights_is_valid(src)); assert(cap_rights_is_valid(dst)); return (dst); } bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little) { unsigned int i, n; assert(CAPVER(big) == CAP_RIGHTS_VERSION_00); assert(CAPVER(little) == CAP_RIGHTS_VERSION_00); assert(CAPVER(big) == CAPVER(little)); n = CAPARSIZE(big); assert(n >= CAPARSIZE_MIN && n <= CAPARSIZE_MAX); for (i = 0; i < n; i++) { if ((big->cr_rights[i] & little->cr_rights[i]) != little->cr_rights[i]) { return (false); } } return (true); } Index: head/sys/kern/sys_generic.c =================================================================== --- head/sys/kern/sys_generic.c (revision 333424) +++ head/sys/kern/sys_generic.c (revision 333425) @@ -1,1933 +1,1914 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include /* * The following macro defines how many bytes will be allocated from * the stack instead of memory allocated when passing the IOCTL data * structures from userspace and to the kernel. Some IOCTLs having * small data structures are used very frequently and this small * buffer on the stack gives a significant speedup improvement for * those requests. The value of this define should be greater or equal * to 64 bytes and should also be power of two. The data structure is * currently hard-aligned to a 8-byte boundary on the stack. This * should currently be sufficient for all supported platforms. */ #define SYS_IOCTL_SMALL_SIZE 128 /* bytes */ #define SYS_IOCTL_SMALL_ALIGN 8 /* bytes */ #ifdef __LP64__ static int iosize_max_clamp = 0; SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW, &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX"); static int devfs_iosize_max_clamp = 1; SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW, &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices"); #endif /* * Assert that the return value of read(2) and write(2) syscalls fits * into a register. If not, an architecture will need to provide the * usermode wrappers to reconstruct the result. */ CTASSERT(sizeof(register_t) >= sizeof(size_t)); static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollout(struct thread *, struct pollfd *, struct pollfd *, u_int); static int pollscan(struct thread *, struct pollfd *, u_int); static int pollrescan(struct thread *); static int selscan(struct thread *, fd_mask **, fd_mask **, int); static int selrescan(struct thread *, fd_mask **, fd_mask **); static void selfdalloc(struct thread *, void *); static void selfdfree(struct seltd *, struct selfd *); static int dofileread(struct thread *, int, struct file *, struct uio *, off_t, int); static int dofilewrite(struct thread *, int, struct file *, struct uio *, off_t, int); static void doselwakeup(struct selinfo *, int); static void seltdinit(struct thread *); static int seltdwait(struct thread *, sbintime_t, sbintime_t); static void seltdclear(struct thread *); /* * One seltd per-thread allocated on demand as needed. * * t - protected by st_mtx * k - Only accessed by curthread or read-only */ struct seltd { STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */ struct selfd *st_free1; /* (k) free fd for read set. */ struct selfd *st_free2; /* (k) free fd for write set. */ struct mtx st_mtx; /* Protects struct seltd */ struct cv st_wait; /* (t) Wait channel. */ int st_flags; /* (t) SELTD_ flags. */ }; #define SELTD_PENDING 0x0001 /* We have pending events. */ #define SELTD_RESCAN 0x0002 /* Doing a rescan. */ /* * One selfd allocated per-thread per-file-descriptor. * f - protected by sf_mtx */ struct selfd { STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */ TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */ struct selinfo *sf_si; /* (f) selinfo when linked. */ struct mtx *sf_mtx; /* Pointer to selinfo mtx. */ struct seltd *sf_td; /* (k) owning seltd. */ void *sf_cookie; /* (k) fd or pollfd. */ u_int sf_refs; }; static uma_zone_t selfd_zone; static struct mtx_pool *mtxpool_select; #ifdef __LP64__ size_t devfs_iosize_max(void) { return (devfs_iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ? INT_MAX : SSIZE_MAX); } size_t iosize_max(void) { return (iosize_max_clamp || SV_CURPROC_FLAG(SV_ILP32) ? INT_MAX : SSIZE_MAX); } #endif #ifndef _SYS_SYSPROTO_H_ struct read_args { int fd; void *buf; size_t nbyte; }; #endif int sys_read(td, uap) struct thread *td; struct read_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_readv(td, uap->fd, &auio); return (error); } /* * Positioned read system call */ #ifndef _SYS_SYSPROTO_H_ struct pread_args { int fd; void *buf; size_t nbyte; int pad; off_t offset; }; #endif int sys_pread(struct thread *td, struct pread_args *uap) { return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } int kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, off_t offset) { struct uio auio; struct iovec aiov; int error; if (nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_preadv(td, fd, &auio, offset); return (error); } #if defined(COMPAT_FREEBSD6) int freebsd6_pread(struct thread *td, struct freebsd6_pread_args *uap) { return (kern_pread(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } #endif /* * Scatter read system call. */ #ifndef _SYS_SYSPROTO_H_ struct readv_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int sys_readv(struct thread *td, struct readv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_readv(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_readv(struct thread *td, int fd, struct uio *auio) { struct file *fp; - cap_rights_t rights; int error; - error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp); + error = fget_read(td, fd, &cap_read_rights, &fp); if (error) return (error); error = dofileread(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Scatter positioned read system call. */ #ifndef _SYS_SYSPROTO_H_ struct preadv_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int sys_preadv(struct thread *td, struct preadv_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_preadv(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_preadv(td, fd, auio, offset) struct thread *td; int fd; struct uio *auio; off_t offset; { struct file *fp; - cap_rights_t rights; int error; - error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp); + error = fget_read(td, fd, &cap_pread_rights, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) error = EINVAL; else error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for readv and preadv that reads data in * from a file using the passed in uio, offset, and flags. */ static int dofileread(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif AUDIT_ARG_FD(fd); /* Finish zero length reads right here */ if (auio->uio_resid == 0) { td->td_retval[0] = 0; return (0); } auio->uio_rw = UIO_READ; auio->uio_offset = offset; auio->uio_td = td; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_READ, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } #ifndef _SYS_SYSPROTO_H_ struct write_args { int fd; const void *buf; size_t nbyte; }; #endif int sys_write(td, uap) struct thread *td; struct write_args *uap; { struct uio auio; struct iovec aiov; int error; if (uap->nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)uap->buf; aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = uap->nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_writev(td, uap->fd, &auio); return (error); } /* * Positioned write system call. */ #ifndef _SYS_SYSPROTO_H_ struct pwrite_args { int fd; const void *buf; size_t nbyte; int pad; off_t offset; }; #endif int sys_pwrite(struct thread *td, struct pwrite_args *uap) { return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } int kern_pwrite(struct thread *td, int fd, const void *buf, size_t nbyte, off_t offset) { struct uio auio; struct iovec aiov; int error; if (nbyte > IOSIZE_MAX) return (EINVAL); aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = nbyte; auio.uio_segflg = UIO_USERSPACE; error = kern_pwritev(td, fd, &auio, offset); return (error); } #if defined(COMPAT_FREEBSD6) int freebsd6_pwrite(struct thread *td, struct freebsd6_pwrite_args *uap) { return (kern_pwrite(td, uap->fd, uap->buf, uap->nbyte, uap->offset)); } #endif /* * Gather write system call. */ #ifndef _SYS_SYSPROTO_H_ struct writev_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif int sys_writev(struct thread *td, struct writev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_writev(td, uap->fd, auio); free(auio, M_IOV); return (error); } int kern_writev(struct thread *td, int fd, struct uio *auio) { struct file *fp; - cap_rights_t rights; int error; - error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp); + error = fget_write(td, fd, &cap_write_rights, &fp); if (error) return (error); error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0); fdrop(fp, td); return (error); } /* * Gather positioned write system call. */ #ifndef _SYS_SYSPROTO_H_ struct pwritev_args { int fd; struct iovec *iovp; u_int iovcnt; off_t offset; }; #endif int sys_pwritev(struct thread *td, struct pwritev_args *uap) { struct uio *auio; int error; error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_pwritev(td, uap->fd, auio, uap->offset); free(auio, M_IOV); return (error); } int kern_pwritev(td, fd, auio, offset) struct thread *td; struct uio *auio; int fd; off_t offset; { struct file *fp; - cap_rights_t rights; int error; - error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp); + error = fget_write(td, fd, &cap_pwrite_rights, &fp); if (error) return (error); if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) error = ESPIPE; else if (offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) error = EINVAL; else error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET); fdrop(fp, td); return (error); } /* * Common code for writev and pwritev that writes data to * a file using the passed in uio, offset, and flags. */ static int dofilewrite(td, fd, fp, auio, offset, flags) struct thread *td; int fd; struct file *fp; struct uio *auio; off_t offset; int flags; { ssize_t cnt; int error; #ifdef KTRACE struct uio *ktruio = NULL; #endif AUDIT_ARG_FD(fd); auio->uio_rw = UIO_WRITE; auio->uio_td = td; auio->uio_offset = offset; #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(auio); #endif cnt = auio->uio_resid; if (fp->f_type == DTYPE_VNODE && (fp->f_vnread_flags & FDEVFS_VNODE) == 0) bwillwrite(); if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) { if (auio->uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Socket layer is responsible for issuing SIGPIPE. */ if (fp->f_type != DTYPE_SOCKET && error == EPIPE) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } cnt -= auio->uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = cnt; ktrgenio(fd, UIO_WRITE, ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } /* * Truncate a file given a file descriptor. * * Can't use fget_write() here, since must return EINVAL and not EBADF if the * descriptor isn't writable. */ int kern_ftruncate(td, fd, length) struct thread *td; int fd; off_t length; { struct file *fp; - cap_rights_t rights; int error; AUDIT_ARG_FD(fd); if (length < 0) return (EINVAL); - error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp); + error = fget(td, fd, &cap_ftruncate_rights, &fp); if (error) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if (!(fp->f_flag & FWRITE)) { fdrop(fp, td); return (EINVAL); } error = fo_truncate(fp, length, td->td_ucred, td); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif int sys_ftruncate(td, uap) struct thread *td; struct ftruncate_args *uap; { return (kern_ftruncate(td, uap->fd, uap->length)); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif int oftruncate(td, uap) struct thread *td; struct oftruncate_args *uap; { return (kern_ftruncate(td, uap->fd, uap->length)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct ioctl_args { int fd; u_long com; caddr_t data; }; #endif /* ARGSUSED */ int sys_ioctl(struct thread *td, struct ioctl_args *uap) { u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN); u_long com; int arg, error; u_int size; caddr_t data; if (uap->com > 0xffffffff) { printf( "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n", td->td_proc->p_pid, td->td_name, uap->com); uap->com &= 0xffffffff; } com = uap->com; /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. */ size = IOCPARM_LEN(com); if ((size > IOCPARM_MAX) || ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) || #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43) ((com & IOC_OUT) && size == 0) || #else ((com & (IOC_IN | IOC_OUT)) && size == 0) || #endif ((com & IOC_VOID) && size > 0 && size != sizeof(int))) return (ENOTTY); if (size > 0) { if (com & IOC_VOID) { /* Integer argument. */ arg = (intptr_t)uap->data; data = (void *)&arg; size = 0; } else { if (size > SYS_IOCTL_SMALL_SIZE) data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK); else data = smalldata; } } else data = (void *)&uap->data; if (com & IOC_IN) { error = copyin(uap->data, data, (u_int)size); if (error != 0) goto out; } else if (com & IOC_OUT) { /* * Zero the buffer so the user always * gets back something deterministic. */ bzero(data, size); } error = kern_ioctl(td, uap->fd, com, data); if (error == 0 && (com & IOC_OUT)) error = copyout(data, uap->data, (u_int)size); out: if (size > SYS_IOCTL_SMALL_SIZE) free(data, M_IOCTLOPS); return (error); } int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data) { struct file *fp; struct filedesc *fdp; -#ifndef CAPABILITIES - cap_rights_t rights; -#endif int error, tmp, locked; AUDIT_ARG_FD(fd); AUDIT_ARG_CMD(com); fdp = td->td_proc->p_fd; switch (com) { case FIONCLEX: case FIOCLEX: FILEDESC_XLOCK(fdp); locked = LA_XLOCKED; break; default: #ifdef CAPABILITIES FILEDESC_SLOCK(fdp); locked = LA_SLOCKED; #else locked = LA_UNLOCKED; #endif break; } #ifdef CAPABILITIES if ((fp = fget_locked(fdp, fd)) == NULL) { error = EBADF; goto out; } if ((error = cap_ioctl_check(fdp, fd, com)) != 0) { fp = NULL; /* fhold() was not called yet */ goto out; } fhold(fp); if (locked == LA_SLOCKED) { FILEDESC_SUNLOCK(fdp); locked = LA_UNLOCKED; } #else - error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp); + error = fget(td, fd, &cap_ioctl_rights, &fp); if (error != 0) { fp = NULL; goto out; } #endif if ((fp->f_flag & (FREAD | FWRITE)) == 0) { error = EBADF; goto out; } switch (com) { case FIONCLEX: fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE; goto out; case FIOCLEX: fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE; goto out; case FIONBIO: if ((tmp = *(int *)data)) atomic_set_int(&fp->f_flag, FNONBLOCK); else atomic_clear_int(&fp->f_flag, FNONBLOCK); data = (void *)&tmp; break; case FIOASYNC: if ((tmp = *(int *)data)) atomic_set_int(&fp->f_flag, FASYNC); else atomic_clear_int(&fp->f_flag, FASYNC); data = (void *)&tmp; break; } error = fo_ioctl(fp, com, data, td->td_ucred, td); out: switch (locked) { case LA_XLOCKED: FILEDESC_XUNLOCK(fdp); break; #ifdef CAPABILITIES case LA_SLOCKED: FILEDESC_SUNLOCK(fdp); break; #endif default: FILEDESC_UNLOCK_ASSERT(fdp); break; } if (fp != NULL) fdrop(fp, td); return (error); } int poll_no_poll(int events) { /* * Return true for read/write. If the user asked for something * special, return POLLNVAL, so that clients have a way of * determining reliably whether or not the extended * functionality is present without hard-coding knowledge * of specific filesystem implementations. */ if (events & ~POLLSTANDARD) return (POLLNVAL); return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } int sys_pselect(struct thread *td, struct pselect_args *uap) { struct timespec ts; struct timeval tv, *tvp; sigset_t set, *uset; int error; if (uap->ts != NULL) { error = copyin(uap->ts, &ts, sizeof(ts)); if (error != 0) return (error); TIMESPEC_TO_TIMEVAL(&tv, &ts); tvp = &tv; } else tvp = NULL; if (uap->sm != NULL) { error = copyin(uap->sm, &set, sizeof(set)); if (error != 0) return (error); uset = &set; } else uset = NULL; return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, uset, NFDBITS)); } int kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tvp, sigset_t *uset, int abi_nfdbits) { int error; if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &td->td_oldsigmask, 0); if (error != 0) return (error); td->td_pflags |= TDP_OLDMASK; /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. */ thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits); return (error); } #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; fd_set *in, *ou, *ex; struct timeval *tv; }; #endif int sys_select(struct thread *td, struct select_args *uap) { struct timeval tv, *tvp; int error; if (uap->tv != NULL) { error = copyin(uap->tv, &tv, sizeof(tv)); if (error) return (error); tvp = &tv; } else tvp = NULL; return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp, NFDBITS)); } /* * In the unlikely case when user specified n greater then the last * open file descriptor, check that no bits are set after the last * valid fd. We must return EBADF if any is set. * * There are applications that rely on the behaviour. * * nd is fd_lastfile + 1. */ static int select_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits) { char *addr, *oaddr; int b, i, res; uint8_t bits; if (nd >= ndu || fd_in == NULL) return (0); oaddr = NULL; bits = 0; /* silence gcc */ for (i = nd; i < ndu; i++) { b = i / NBBY; #if BYTE_ORDER == LITTLE_ENDIAN addr = (char *)fd_in + b; #else addr = (char *)fd_in; if (abi_nfdbits == NFDBITS) { addr += rounddown(b, sizeof(fd_mask)) + sizeof(fd_mask) - 1 - b % sizeof(fd_mask); } else { addr += rounddown(b, sizeof(uint32_t)) + sizeof(uint32_t) - 1 - b % sizeof(uint32_t); } #endif if (addr != oaddr) { res = fubyte(addr); if (res == -1) return (EFAULT); oaddr = addr; bits = res; } if ((bits & (1 << (i % NBBY))) != 0) return (EBADF); } return (0); } int kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou, fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits) { struct filedesc *fdp; /* * The magic 2048 here is chosen to be just enough for FD_SETSIZE * infds with the new FD_SETSIZE of 1024, and more than enough for * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE * of 256. */ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp; struct timeval rtv; sbintime_t asbt, precision, rsbt; u_int nbufbytes, ncpbytes, ncpubytes, nfdbits; int error, lf, ndu; if (nd < 0) return (EINVAL); fdp = td->td_proc->p_fd; ndu = nd; lf = fdp->fd_lastfile; if (nd > lf + 1) nd = lf + 1; error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits); if (error != 0) return (error); error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits); if (error != 0) return (error); error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits); if (error != 0) return (error); /* * Allocate just enough bits for the non-null fd_sets. Use the * preallocated auto buffer if possible. */ nfdbits = roundup(nd, NFDBITS); ncpbytes = nfdbits / NBBY; ncpubytes = roundup(nd, abi_nfdbits) / NBBY; nbufbytes = 0; if (fd_in != NULL) nbufbytes += 2 * ncpbytes; if (fd_ou != NULL) nbufbytes += 2 * ncpbytes; if (fd_ex != NULL) nbufbytes += 2 * ncpbytes; if (nbufbytes <= sizeof s_selbits) selbits = &s_selbits[0]; else selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); /* * Assign pointers into the bit buffers and fetch the input bits. * Put the output buffers together so that they can be bzeroed * together. */ sbp = selbits; #define getbits(name, x) \ do { \ if (name == NULL) { \ ibits[x] = NULL; \ obits[x] = NULL; \ } else { \ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ obits[x] = sbp; \ sbp += ncpbytes / sizeof *sbp; \ error = copyin(name, ibits[x], ncpubytes); \ if (error != 0) \ goto done; \ bzero((char *)ibits[x] + ncpubytes, \ ncpbytes - ncpubytes); \ } \ } while (0) getbits(fd_in, 0); getbits(fd_ou, 1); getbits(fd_ex, 2); #undef getbits #if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__) /* * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS, * we are running under 32-bit emulation. This should be more * generic. */ #define swizzle_fdset(bits) \ if (abi_nfdbits != NFDBITS && bits != NULL) { \ int i; \ for (i = 0; i < ncpbytes / sizeof *sbp; i++) \ bits[i] = (bits[i] >> 32) | (bits[i] << 32); \ } #else #define swizzle_fdset(bits) #endif /* Make sure the bit order makes it through an ABI transition */ swizzle_fdset(ibits[0]); swizzle_fdset(ibits[1]); swizzle_fdset(ibits[2]); if (nbufbytes != 0) bzero(selbits, nbufbytes / 2); precision = 0; if (tvp != NULL) { rtv = *tvp; if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || rtv.tv_usec >= 1000000) { error = EINVAL; goto done; } if (!timevalisset(&rtv)) asbt = 0; else if (rtv.tv_sec <= INT32_MAX) { rsbt = tvtosbt(rtv); precision = rsbt; precision >>= tc_precexp; if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = -1; } else asbt = -1; } else asbt = -1; seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = selscan(td, ibits, obits, nd); if (error || td->td_retval[0] != 0) break; error = seltdwait(td, asbt, precision); if (error) break; error = selrescan(td, ibits, obits); if (error || td->td_retval[0] != 0) break; } seltdclear(td); done: /* select is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; /* swizzle bit order back, if necessary */ swizzle_fdset(obits[0]); swizzle_fdset(obits[1]); swizzle_fdset(obits[2]); #undef swizzle_fdset #define putbits(name, x) \ if (name && (error2 = copyout(obits[x], name, ncpubytes))) \ error = error2; if (error == 0) { int error2; putbits(fd_in, 0); putbits(fd_ou, 1); putbits(fd_ex, 2); #undef putbits } if (selbits != &s_selbits[0]) free(selbits, M_SELECT); return (error); } /* * Convert a select bit set to poll flags. * * The backend always returns POLLHUP/POLLERR if appropriate and we * return this as a set bit in any set. */ static int select_flags[3] = { POLLRDNORM | POLLHUP | POLLERR, POLLWRNORM | POLLHUP | POLLERR, POLLRDBAND | POLLERR }; /* * Compute the fo_poll flags required for a fd given by the index and * bit position in the fd_mask array. */ static __inline int selflags(fd_mask **ibits, int idx, fd_mask bit) { int flags; int msk; flags = 0; for (msk = 0; msk < 3; msk++) { if (ibits[msk] == NULL) continue; if ((ibits[msk][idx] & bit) == 0) continue; flags |= select_flags[msk]; } return (flags); } /* * Set the appropriate output bits given a mask of fired events and the * input bits originally requested. */ static __inline int selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events) { int msk; int n; n = 0; for (msk = 0; msk < 3; msk++) { if ((events & select_flags[msk]) == 0) continue; if (ibits[msk] == NULL) continue; if ((ibits[msk][idx] & bit) == 0) continue; /* * XXX Check for a duplicate set. This can occur because a * socket calls selrecord() twice for each poll() call * resulting in two selfds per real fd. selrescan() will * call selsetbits twice as a result. */ if ((obits[msk][idx] & bit) != 0) continue; obits[msk][idx] |= bit; n++; } return (n); } static __inline int getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp) { - cap_rights_t rights; - cap_rights_init(&rights, CAP_EVENT); - - return (fget_unlocked(fdp, fd, &rights, fpp, NULL)); + return (fget_unlocked(fdp, fd, &cap_event_rights, fpp, NULL)); } /* * Traverse the list of fds attached to this thread's seltd and check for * completion. */ static int selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits) { struct filedesc *fdp; struct selinfo *si; struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct file *fp; fd_mask bit; int fd, ev, n, idx; int error; fdp = td->td_proc->p_fd; stp = td->td_sel; n = 0; STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (int)(uintptr_t)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; error = getselfd_cap(fdp, fd, &fp); if (error) return (error); idx = fd / NFDBITS; bit = (fd_mask)1 << (fd % NFDBITS); ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td); fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } stp->st_flags = 0; td->td_retval[0] = n; return (0); } /* * Perform the initial filedescriptor scan and register ourselves with * each selinfo. */ static int selscan(td, ibits, obits, nfd) struct thread *td; fd_mask **ibits, **obits; int nfd; { struct filedesc *fdp; struct file *fp; fd_mask bit; int ev, flags, end, fd; int n, idx; int error; fdp = td->td_proc->p_fd; n = 0; for (idx = 0, fd = 0; fd < nfd; idx++) { end = imin(fd + NFDBITS, nfd); for (bit = 1; fd < end; bit <<= 1, fd++) { /* Compute the list of events we're interested in. */ flags = selflags(ibits, idx, bit); if (flags == 0) continue; error = getselfd_cap(fdp, fd, &fp); if (error) return (error); selfdalloc(td, (void *)(uintptr_t)fd); ev = fo_poll(fp, flags, td->td_ucred, td); fdrop(fp, td); if (ev != 0) n += selsetbits(ibits, obits, idx, bit, ev); } } td->td_retval[0] = n; return (0); } int sys_poll(struct thread *td, struct poll_args *uap) { struct timespec ts, *tsp; if (uap->timeout != INFTIM) { if (uap->timeout < 0) return (EINVAL); ts.tv_sec = uap->timeout / 1000; ts.tv_nsec = (uap->timeout % 1000) * 1000000; tsp = &ts; } else tsp = NULL; return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL)); } int kern_poll(struct thread *td, struct pollfd *fds, u_int nfds, struct timespec *tsp, sigset_t *uset) { struct pollfd *bits; struct pollfd smallbits[32]; sbintime_t sbt, precision, tmp; time_t over; struct timespec ts; int error; size_t ni; precision = 0; if (tsp != NULL) { if (tsp->tv_sec < 0) return (EINVAL); if (tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000) return (EINVAL); if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) sbt = 0; else { ts = *tsp; if (ts.tv_sec > INT32_MAX / 2) { over = ts.tv_sec - INT32_MAX / 2; ts.tv_sec -= over; } else over = 0; tmp = tstosbt(ts); precision = tmp; precision >>= tc_precexp; if (TIMESEL(&sbt, tmp)) sbt += tc_tick_sbt; sbt += tmp; } } else sbt = -1; if (nfds > maxfilesperproc && nfds > FD_SETSIZE) return (EINVAL); ni = nfds * sizeof(struct pollfd); if (ni > sizeof(smallbits)) bits = malloc(ni, M_TEMP, M_WAITOK); else bits = smallbits; error = copyin(fds, bits, ni); if (error) goto done; if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &td->td_oldsigmask, 0); if (error) goto done; td->td_pflags |= TDP_OLDMASK; /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. */ thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } seltdinit(td); /* Iterate until the timeout expires or descriptors become ready. */ for (;;) { error = pollscan(td, bits, nfds); if (error || td->td_retval[0] != 0) break; error = seltdwait(td, sbt, precision); if (error) break; error = pollrescan(td); if (error || td->td_retval[0] != 0) break; } seltdclear(td); done: /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; if (error == 0) { error = pollout(td, bits, fds, nfds); if (error) goto out; } out: if (ni > sizeof(smallbits)) free(bits, M_TEMP); return (error); } int sys_ppoll(struct thread *td, struct ppoll_args *uap) { struct timespec ts, *tsp; sigset_t set, *ssp; int error; if (uap->ts != NULL) { error = copyin(uap->ts, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; if (uap->set != NULL) { error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); ssp = &set; } else ssp = NULL; /* * fds is still a pointer to user space. kern_poll() will * take care of copyin that array to the kernel space. */ return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp)); } static int pollrescan(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; struct selinfo *si; struct filedesc *fdp; struct file *fp; struct pollfd *fd; -#ifdef CAPABILITIES - cap_rights_t rights; -#endif int n; n = 0; fdp = td->td_proc->p_fd; stp = td->td_sel; FILEDESC_SLOCK(fdp); STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) { fd = (struct pollfd *)sfp->sf_cookie; si = sfp->sf_si; selfdfree(stp, sfp); /* If the selinfo wasn't cleared the event didn't fire. */ if (si != NULL) continue; fp = fdp->fd_ofiles[fd->fd].fde_file; #ifdef CAPABILITIES if (fp == NULL || - cap_check(cap_rights(fdp, fd->fd), - cap_rights_init(&rights, CAP_EVENT)) != 0) + cap_check(cap_rights(fdp, fd->fd), &cap_event_rights) != 0) #else if (fp == NULL) #endif { fd->revents = POLLNVAL; n++; continue; } /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ fd->revents = fo_poll(fp, fd->events, td->td_ucred, td); if (fd->revents != 0) n++; } FILEDESC_SUNLOCK(fdp); stp->st_flags = 0; td->td_retval[0] = n; return (0); } static int pollout(td, fds, ufds, nfd) struct thread *td; struct pollfd *fds; struct pollfd *ufds; u_int nfd; { int error = 0; u_int i = 0; u_int n = 0; for (i = 0; i < nfd; i++) { error = copyout(&fds->revents, &ufds->revents, sizeof(ufds->revents)); if (error) return (error); if (fds->revents != 0) n++; fds++; ufds++; } td->td_retval[0] = n; return (0); } static int pollscan(td, fds, nfd) struct thread *td; struct pollfd *fds; u_int nfd; { struct filedesc *fdp = td->td_proc->p_fd; struct file *fp; -#ifdef CAPABILITIES - cap_rights_t rights; -#endif int i, n = 0; FILEDESC_SLOCK(fdp); for (i = 0; i < nfd; i++, fds++) { if (fds->fd > fdp->fd_lastfile) { fds->revents = POLLNVAL; n++; } else if (fds->fd < 0) { fds->revents = 0; } else { fp = fdp->fd_ofiles[fds->fd].fde_file; #ifdef CAPABILITIES if (fp == NULL || - cap_check(cap_rights(fdp, fds->fd), - cap_rights_init(&rights, CAP_EVENT)) != 0) + cap_check(cap_rights(fdp, fds->fd), &cap_event_rights) != 0) #else if (fp == NULL) #endif { fds->revents = POLLNVAL; n++; } else { /* * Note: backend also returns POLLHUP and * POLLERR if appropriate. */ selfdalloc(td, fds); fds->revents = fo_poll(fp, fds->events, td->td_ucred, td); /* * POSIX requires POLLOUT to be never * set simultaneously with POLLHUP. */ if ((fds->revents & POLLHUP) != 0) fds->revents &= ~POLLOUT; if (fds->revents != 0) n++; } } } FILEDESC_SUNLOCK(fdp); td->td_retval[0] = n; return (0); } /* * XXX This was created specifically to support netncp and netsmb. This * allows the caller to specify a socket to wait for events on. It returns * 0 if any events matched and an error otherwise. There is no way to * determine which events fired. */ int selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td) { struct timeval rtv; sbintime_t asbt, precision, rsbt; int error; precision = 0; /* stupid gcc! */ if (tvp != NULL) { rtv = *tvp; if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || rtv.tv_usec >= 1000000) return (EINVAL); if (!timevalisset(&rtv)) asbt = 0; else if (rtv.tv_sec <= INT32_MAX) { rsbt = tvtosbt(rtv); precision = rsbt; precision >>= tc_precexp; if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = -1; } else asbt = -1; } else asbt = -1; seltdinit(td); /* * Iterate until the timeout expires or the socket becomes ready. */ for (;;) { selfdalloc(td, NULL); error = sopoll(so, events, NULL, td); /* error here is actually the ready events. */ if (error) return (0); error = seltdwait(td, asbt, precision); if (error) break; } seltdclear(td); /* XXX Duplicates ncp/smb behavior. */ if (error == ERESTART) error = 0; return (error); } /* * Preallocate two selfds associated with 'cookie'. Some fo_poll routines * have two select sets, one for read and another for write. */ static void selfdalloc(struct thread *td, void *cookie) { struct seltd *stp; stp = td->td_sel; if (stp->st_free1 == NULL) stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); stp->st_free1->sf_td = stp; stp->st_free1->sf_cookie = cookie; if (stp->st_free2 == NULL) stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO); stp->st_free2->sf_td = stp; stp->st_free2->sf_cookie = cookie; } static void selfdfree(struct seltd *stp, struct selfd *sfp) { STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link); if (sfp->sf_si != NULL) { mtx_lock(sfp->sf_mtx); if (sfp->sf_si != NULL) { TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads); refcount_release(&sfp->sf_refs); } mtx_unlock(sfp->sf_mtx); } if (refcount_release(&sfp->sf_refs)) uma_zfree(selfd_zone, sfp); } /* Drain the waiters tied to all the selfd belonging the specified selinfo. */ void seldrain(sip) struct selinfo *sip; { /* * This feature is already provided by doselwakeup(), thus it is * enough to go for it. * Eventually, the context, should take care to avoid races * between thread calling select()/poll() and file descriptor * detaching, but, again, the races are just the same as * selwakeup(). */ doselwakeup(sip, -1); } /* * Record a select request. */ void selrecord(selector, sip) struct thread *selector; struct selinfo *sip; { struct selfd *sfp; struct seltd *stp; struct mtx *mtxp; stp = selector->td_sel; /* * Don't record when doing a rescan. */ if (stp->st_flags & SELTD_RESCAN) return; /* * Grab one of the preallocated descriptors. */ sfp = NULL; if ((sfp = stp->st_free1) != NULL) stp->st_free1 = NULL; else if ((sfp = stp->st_free2) != NULL) stp->st_free2 = NULL; else panic("selrecord: No free selfd on selq"); mtxp = sip->si_mtx; if (mtxp == NULL) mtxp = mtx_pool_find(mtxpool_select, sip); /* * Initialize the sfp and queue it in the thread. */ sfp->sf_si = sip; sfp->sf_mtx = mtxp; refcount_init(&sfp->sf_refs, 2); STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link); /* * Now that we've locked the sip, check for initialization. */ mtx_lock(mtxp); if (sip->si_mtx == NULL) { sip->si_mtx = mtxp; TAILQ_INIT(&sip->si_tdlist); } /* * Add this thread to the list of selfds listening on this selinfo. */ TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads); mtx_unlock(sip->si_mtx); } /* Wake up a selecting thread. */ void selwakeup(sip) struct selinfo *sip; { doselwakeup(sip, -1); } /* Wake up a selecting thread, and set its priority. */ void selwakeuppri(sip, pri) struct selinfo *sip; int pri; { doselwakeup(sip, pri); } /* * Do a wakeup when a selectable event occurs. */ static void doselwakeup(sip, pri) struct selinfo *sip; int pri; { struct selfd *sfp; struct selfd *sfn; struct seltd *stp; /* If it's not initialized there can't be any waiters. */ if (sip->si_mtx == NULL) return; /* * Locking the selinfo locks all selfds associated with it. */ mtx_lock(sip->si_mtx); TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) { /* * Once we remove this sfp from the list and clear the * sf_si seltdclear will know to ignore this si. */ TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads); sfp->sf_si = NULL; stp = sfp->sf_td; mtx_lock(&stp->st_mtx); stp->st_flags |= SELTD_PENDING; cv_broadcastpri(&stp->st_wait, pri); mtx_unlock(&stp->st_mtx); if (refcount_release(&sfp->sf_refs)) uma_zfree(selfd_zone, sfp); } mtx_unlock(sip->si_mtx); } static void seltdinit(struct thread *td) { struct seltd *stp; if ((stp = td->td_sel) != NULL) goto out; td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO); mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF); cv_init(&stp->st_wait, "select"); out: stp->st_flags = 0; STAILQ_INIT(&stp->st_selq); } static int seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision) { struct seltd *stp; int error; stp = td->td_sel; /* * An event of interest may occur while we do not hold the seltd * locked so check the pending flag before we sleep. */ mtx_lock(&stp->st_mtx); /* * Any further calls to selrecord will be a rescan. */ stp->st_flags |= SELTD_RESCAN; if (stp->st_flags & SELTD_PENDING) { mtx_unlock(&stp->st_mtx); return (0); } if (sbt == 0) error = EWOULDBLOCK; else if (sbt != -1) error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx, sbt, precision, C_ABSOLUTE); else error = cv_wait_sig(&stp->st_wait, &stp->st_mtx); mtx_unlock(&stp->st_mtx); return (error); } void seltdfini(struct thread *td) { struct seltd *stp; stp = td->td_sel; if (stp == NULL) return; if (stp->st_free1) uma_zfree(selfd_zone, stp->st_free1); if (stp->st_free2) uma_zfree(selfd_zone, stp->st_free2); td->td_sel = NULL; cv_destroy(&stp->st_wait); mtx_destroy(&stp->st_mtx); free(stp, M_SELECT); } /* * Remove the references to the thread from all of the objects we were * polling. */ static void seltdclear(struct thread *td) { struct seltd *stp; struct selfd *sfp; struct selfd *sfn; stp = td->td_sel; STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) selfdfree(stp, sfp); stp->st_flags = 0; } static void selectinit(void *); SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL); static void selectinit(void *dummy __unused) { selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF); } /* * Set up a syscall return value that follows the convention specified for * posix_* functions. */ int kern_posix_error(struct thread *td, int error) { if (error <= 0) return (error); td->td_errno = error; td->td_pflags |= TDP_NERRNO; td->td_retval[0] = error; return (0); } Index: head/sys/kern/sys_procdesc.c =================================================================== --- head/sys/kern/sys_procdesc.c (revision 333424) +++ head/sys/kern/sys_procdesc.c (revision 333425) @@ -1,572 +1,570 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009, 2016 Robert N. M. Watson * All rights reserved. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * FreeBSD process descriptor facility. * * Some processes are represented by a file descriptor, which will be used in * preference to signaling and pids for the purposes of process management, * and is, in effect, a form of capability. When a process descriptor is * used with a process, it ceases to be visible to certain traditional UNIX * process facilities, such as waitpid(2). * * Some semantics: * * - At most one process descriptor will exist for any process, although * references to that descriptor may be held from many processes (or even * be in flight between processes over a local domain socket). * - Last close on the process descriptor will terminate the process using * SIGKILL and reparent it to init so that there's a process to reap it * when it's done exiting. * - If the process exits before the descriptor is closed, it will not * generate SIGCHLD on termination, or be picked up by waitpid(). * - The pdkill(2) system call may be used to deliver a signal to the process * using its process descriptor. * - The pdwait4(2) system call may be used to block (or not) on a process * descriptor to collect termination information. * * Open questions: * * - How to handle ptrace(2)? * - Will we want to add a pidtoprocdesc(2) system call to allow process * descriptors to be created for processes without pdfork(2)? */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(process_descriptors, "Process Descriptors"); static uma_zone_t procdesc_zone; static fo_poll_t procdesc_poll; static fo_kqfilter_t procdesc_kqfilter; static fo_stat_t procdesc_stat; static fo_close_t procdesc_close; static fo_fill_kinfo_t procdesc_fill_kinfo; static struct fileops procdesc_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = procdesc_poll, .fo_kqfilter = procdesc_kqfilter, .fo_stat = procdesc_stat, .fo_close = procdesc_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = procdesc_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; /* * Initialize with VFS so that process descriptors are available along with * other file descriptor types. As long as it runs before init(8) starts, * there shouldn't be a problem. */ static void procdesc_init(void *dummy __unused) { procdesc_zone = uma_zcreate("procdesc", sizeof(struct procdesc), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); if (procdesc_zone == NULL) panic("procdesc_init: procdesc_zone not initialized"); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, procdesc_init, NULL); /* * Return a locked process given a process descriptor, or ESRCH if it has * died. */ int procdesc_find(struct thread *td, int fd, cap_rights_t *rightsp, struct proc **p) { struct procdesc *pd; struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { *p = pd->pd_proc; PROC_LOCK(*p); } else error = ESRCH; sx_sunlock(&proctree_lock); out: fdrop(fp, td); return (error); } /* * Function to be used by procstat(1) sysctls when returning procdesc * information. */ pid_t procdesc_pid(struct file *fp_procdesc) { struct procdesc *pd; KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC, ("procdesc_pid: !procdesc")); pd = fp_procdesc->f_data; return (pd->pd_pid); } /* * Retrieve the PID associated with a process descriptor. */ int kern_pdgetpid(struct thread *td, int fd, cap_rights_t *rightsp, pid_t *pidp) { struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } *pidp = procdesc_pid(fp); out: fdrop(fp, td); return (error); } /* * System call to return the pid of a process given its process descriptor. */ int sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap) { - cap_rights_t rights; pid_t pid; int error; AUDIT_ARG_FD(uap->fd); - error = kern_pdgetpid(td, uap->fd, - cap_rights_init(&rights, CAP_PDGETPID), &pid); + error = kern_pdgetpid(td, uap->fd, &cap_pdgetpid_rights, &pid); if (error == 0) error = copyout(&pid, uap->pidp, sizeof(pid)); return (error); } /* * When a new process is forked by pdfork(), a file descriptor is allocated * by the fork code first, then the process is forked, and then we get a * chance to set up the process descriptor. Failure is not permitted at this * point, so procdesc_new() must succeed. */ void procdesc_new(struct proc *p, int flags) { struct procdesc *pd; pd = uma_zalloc(procdesc_zone, M_WAITOK | M_ZERO); pd->pd_proc = p; pd->pd_pid = p->p_pid; p->p_procdesc = pd; pd->pd_flags = 0; if (flags & PD_DAEMON) pd->pd_flags |= PDF_DAEMON; PROCDESC_LOCK_INIT(pd); knlist_init_mtx(&pd->pd_selinfo.si_note, &pd->pd_lock); /* * Process descriptors start out with two references: one from their * struct file, and the other from their struct proc. */ refcount_init(&pd->pd_refcount, 2); } /* * Create a new process decriptor for the process that refers to it. */ int procdesc_falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { int fflags; fflags = 0; if (flags & PD_CLOEXEC) fflags = O_CLOEXEC; return (falloc_caps(td, resultfp, resultfd, fflags, fcaps)); } /* * Initialize a file with a process descriptor. */ void procdesc_finit(struct procdesc *pdp, struct file *fp) { finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops); } static void procdesc_free(struct procdesc *pd) { /* * When the last reference is released, we assert that the descriptor * has been closed, but not that the process has exited, as we will * detach the descriptor before the process dies if the descript is * closed, as we can't wait synchronously. */ if (refcount_release(&pd->pd_refcount)) { KASSERT(pd->pd_proc == NULL, ("procdesc_free: pd_proc != NULL")); KASSERT((pd->pd_flags & PDF_CLOSED), ("procdesc_free: !PDF_CLOSED")); knlist_destroy(&pd->pd_selinfo.si_note); PROCDESC_LOCK_DESTROY(pd); uma_zfree(procdesc_zone, pd); } } /* * procdesc_exit() - notify a process descriptor that its process is exiting. * We use the proctree_lock to ensure that process exit either happens * strictly before or strictly after a concurrent call to procdesc_close(). */ int procdesc_exit(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL")); pd = p->p_procdesc; PROCDESC_LOCK(pd); KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc, ("procdesc_exit: closed && parent not init")); pd->pd_flags |= PDF_EXITED; pd->pd_xstat = KW_EXITCODE(p->p_xexit, p->p_xsig); /* * If the process descriptor has been closed, then we have nothing * to do; return 1 so that init will get SIGCHLD and do the reaping. * Clean up the procdesc now rather than letting it happen during * that reap. */ if (pd->pd_flags & PDF_CLOSED) { PROCDESC_UNLOCK(pd); pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); return (1); } if (pd->pd_flags & PDF_SELECTED) { pd->pd_flags &= ~PDF_SELECTED; selwakeup(&pd->pd_selinfo); } KNOTE_LOCKED(&pd->pd_selinfo.si_note, NOTE_EXIT); PROCDESC_UNLOCK(pd); return (0); } /* * When a process descriptor is reaped, perhaps as a result of close() or * pdwait4(), release the process's reference on the process descriptor. */ void procdesc_reap(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL")); pd = p->p_procdesc; pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); } /* * procdesc_close() - last close on a process descriptor. If the process is * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let * init(8) clean up the mess; if not, we have to clean up the zombie ourselves. */ static int procdesc_close(struct file *fp, struct thread *td) { struct procdesc *pd; struct proc *p; KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc")); pd = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; sx_xlock(&proctree_lock); PROCDESC_LOCK(pd); pd->pd_flags |= PDF_CLOSED; PROCDESC_UNLOCK(pd); p = pd->pd_proc; if (p == NULL) { /* * This is the case where process' exit status was already * collected and procdesc_reap() was already called. */ sx_xunlock(&proctree_lock); } else { PROC_LOCK(p); AUDIT_ARG_PROCESS(p); if (p->p_state == PRS_ZOMBIE) { /* * If the process is already dead and just awaiting * reaping, do that now. This will release the * process's reference to the process descriptor when it * calls back into procdesc_reap(). */ proc_reap(curthread, p, NULL, 0); } else { /* * If the process is not yet dead, we need to kill it, * but we can't wait around synchronously for it to go * away, as that path leads to madness (and deadlocks). * First, detach the process from its descriptor so that * its exit status will be reported normally. */ pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); /* * Next, reparent it to init(8) so that there's someone * to pick up the pieces; finally, terminate with * prejudice. */ p->p_sigparent = SIGCHLD; proc_reparent(p, initproc); if ((pd->pd_flags & PDF_DAEMON) == 0) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); sx_xunlock(&proctree_lock); } } /* * Release the file descriptor's reference on the process descriptor. */ procdesc_free(pd); return (0); } static int procdesc_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct procdesc *pd; int revents; revents = 0; pd = fp->f_data; PROCDESC_LOCK(pd); if (pd->pd_flags & PDF_EXITED) revents |= POLLHUP; if (revents == 0) { selrecord(td, &pd->pd_selinfo); pd->pd_flags |= PDF_SELECTED; } PROCDESC_UNLOCK(pd); return (revents); } static void procdesc_kqops_detach(struct knote *kn) { struct procdesc *pd; pd = kn->kn_fp->f_data; knlist_remove(&pd->pd_selinfo.si_note, kn, 0); } static int procdesc_kqops_event(struct knote *kn, long hint) { struct procdesc *pd; u_int event; pd = kn->kn_fp->f_data; if (hint == 0) { /* * Initial test after registration. Generate a NOTE_EXIT in * case the process already terminated before registration. */ event = pd->pd_flags & PDF_EXITED ? NOTE_EXIT : 0; } else { /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; } /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = pd->pd_xstat; if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } static struct filterops procdesc_kqops = { .f_isfd = 1, .f_detach = procdesc_kqops_detach, .f_event = procdesc_kqops_event, }; static int procdesc_kqfilter(struct file *fp, struct knote *kn) { struct procdesc *pd; pd = fp->f_data; switch (kn->kn_filter) { case EVFILT_PROCDESC: kn->kn_fop = &procdesc_kqops; kn->kn_flags |= EV_CLEAR; knlist_add(&pd->pd_selinfo.si_note, kn, 0); return (0); default: return (EINVAL); } } static int procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct procdesc *pd; struct timeval pstart, boottime; /* * XXXRW: Perhaps we should cache some more information from the * process so that we can return it reliably here even after it has * died. For example, caching its credential data. */ bzero(sb, sizeof(*sb)); pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { PROC_LOCK(pd->pd_proc); AUDIT_ARG_PROCESS(pd->pd_proc); /* Set birth and [acm] times to process start time. */ pstart = pd->pd_proc->p_stats->p_start; getboottime(&boottime); timevaladd(&pstart, &boottime); TIMEVAL_TO_TIMESPEC(&pstart, &sb->st_birthtim); sb->st_atim = sb->st_birthtim; sb->st_ctim = sb->st_birthtim; sb->st_mtim = sb->st_birthtim; if (pd->pd_proc->p_state != PRS_ZOMBIE) sb->st_mode = S_IFREG | S_IRWXU; else sb->st_mode = S_IFREG; sb->st_uid = pd->pd_proc->p_ucred->cr_ruid; sb->st_gid = pd->pd_proc->p_ucred->cr_rgid; PROC_UNLOCK(pd->pd_proc); } else sb->st_mode = S_IFREG; sx_sunlock(&proctree_lock); return (0); } static int procdesc_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct procdesc *pdp; kif->kf_type = KF_TYPE_PROCDESC; pdp = fp->f_data; kif->kf_un.kf_proc.kf_pid = pdp->pd_pid; return (0); } Index: head/sys/kern/uipc_mqueue.c =================================================================== --- head/sys/kern/uipc_mqueue.c (revision 333424) +++ head/sys/kern/uipc_mqueue.c (revision 333425) @@ -1,2940 +1,2933 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 David Xu * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support"); /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; const void *mn_pr_root; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; uint32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; static unsigned mqfs_osd_jail_slot; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); static int mqfs_prison_remove(void *obj, void *data); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, }; struct filterops mq_wfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_fetchadd_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_pr_root = cred->cr_prison->pr_root; node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = mqfs_prison_remove, }; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); mqfs_osd_jail_slot = osd_jail_register(NULL, methods); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); osd_jail_deregister(mqfs_osd_jail_slot); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked. */ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred) { struct mqfs_node *pn; const void *pr_root; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); pr_root = cred->cr_prison->pr_root; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { /* Only match names within the same prison root directory */ if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) && strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; int nameiop, flags, error, namelen; char *pname; struct thread *td; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp, 0); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen, cnp->cn_cred); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? */ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; struct file *a_fp; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred, NULL); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqfs_node *pn; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); pn = VTON(vp); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; const void *pr_root; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; pr_root = ap->a_cred->cr_prison->pr_root; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); /* * Only show names within the same prison root directory * (or not associated with a prison, e.g. "." and ".."). */ if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root) continue; if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_name[i] = 0; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. */ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * See if this prison root is obsolete, and clean up associated queues if it is. */ static int mqfs_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; const struct prison *tpr; struct mqfs_node *pn, *tpn; int found; found = 0; TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_root == pr->pr_root && tpr != pr && tpr->pr_ref > 0) found = 1; } if (!found) { /* * No jails are rooted in this directory anymore, * so no queues should be either. */ sx_xlock(&mqfs_data.mi_lock); LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, mn_sibling, tpn) { if (pn->mn_pr_root == pr->pr_root) (void)do_unlink(pn, curthread->td_ucred); } sx_xunlock(&mqfs_data.mi_lock); } return (0); } /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); seldrain(&mq->mq_rsel); seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { ts2 = *abs_timeout; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send realtime a signal to process which registered itself * successfully by mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct thread *td; struct proc *p; int error; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; error = sigev_findtd(p, &nt->nt_sigev, &td); if (error) { mq->mq_notifier = NULL; return; } if (!KSI_ONQ(&nt->nt_ksi)) { ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev); tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi); } PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { ts2 = *abs_timeout; getnanotime(&ts); timespecsub(&ts2, &ts); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } static int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char path[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct filedesc *fdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); fdp = td->td_proc->p_fd; cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred, NULL); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(td, fp, fd); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int sys_kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. */ int sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, rightsp, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { - cap_rights_t rights; - return _getmq(td, fd, cap_rights_init(&rights, CAP_EVENT), fget, + return _getmq(td, fd, &cap_event_rights, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { - cap_rights_t rights; - return _getmq(td, fd, cap_rights_init(&rights, CAP_READ), fget_read, + return _getmq(td, fd, &cap_read_rights, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { - cap_rights_t rights; - return _getmq(td, fd, cap_rights_init(&rights, CAP_WRITE), fget_write, + return _getmq(td, fd, &cap_write_rights, fget_write, fpp, ppn, pmq); } static int kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr, struct mq_attr *oattr) { struct mqueue *mq; struct file *fp; u_int oflag, flag; int error; AUDIT_ARG_FD(mqd); if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0) return (EINVAL); error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); oattr->mq_maxmsg = mq->mq_maxmsg; oattr->mq_msgsize = mq->mq_msgsize; oattr->mq_curmsgs = mq->mq_curmsgs; if (attr != NULL) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr->mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr->mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); return (error); } int sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mq_attr attr, oattr; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error != 0) return (error); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { bzero(oattr.__reserved, sizeof(oattr.__reserved)); error = copyout(&oattr, uap->oattr, sizeof(oattr)); } return (error); } int sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } static int kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev) { -#ifdef CAPABILITIES - cap_rights_t rights; -#endif struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp, *fp2; struct mqueue_notifier *nt, *newnt = NULL; int error; AUDIT_ARG_FD(mqd); if (sigev != NULL) { if (sigev->sigev_notify != SIGEV_SIGNAL && sigev->sigev_notify != SIGEV_THREAD_ID && sigev->sigev_notify != SIGEV_NONE) return (EINVAL); if ((sigev->sigev_notify == SIGEV_SIGNAL || sigev->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(sigev->sigev_signo)) return (EINVAL); } p = td->td_proc; fdp = td->td_proc->p_fd; error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); fp2 = fget_locked(fdp, mqd); if (fp2 == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } #ifdef CAPABILITIES - error = cap_check(cap_rights(fdp, mqd), - cap_rights_init(&rights, CAP_EVENT)); + error = cap_check(cap_rights(fdp, mqd), &cap_event_rights); if (error) { FILEDESC_SUNLOCK(fdp); goto out; } #endif if (fp2 != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = mqd; notifier_insert(p, nt); } nt->nt_sigev = *sigev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. */ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } int sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev, *evp; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct filedesc *fdp; struct mqueue *mq; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_locked(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); sx_xlock(&mqfs_data.mi_lock); st->st_atim = pn->mn_atime; st->st_mtim = pn->mn_mtime; st->st_ctim = pn->mn_ctime; st->st_birthtim = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; pn->mn_mode = mode & ACCESSPERMS; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); if (uid == (uid_t)-1) uid = pn->mn_uid; if (gid == (gid_t)-1) gid = pn->mn_gid; if (((uid != pn->mn_uid && uid != active_cred->cr_uid) || (gid != pn->mn_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; pn->mn_uid = uid; pn->mn_gid = gid; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static int mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_MQUEUE; return (0); } static struct fileops mqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close, .fo_chmod = mqf_chmod, .fo_chown = mqf_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = mqf_fill_kinfo, }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; static struct vfsconf mqueuefs_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "mqueuefs", .vfc_vfsops = &mqfs_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_SYNTHETIC }; static struct syscall_helper_data mq_syscalls[] = { SYSCALL_INIT_HELPER(kmq_open), SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED), SYSCALL_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include static void mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } static void mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } int freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap) { struct mq_attr attr; struct mq_attr32 attr32; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error) return (error); mq_attr_from32(&attr32, &attr); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } int freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap) { struct mq_attr attr, oattr; struct mq_attr32 attr32, oattr32; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error != 0) return (error); mq_attr_from32(&attr32, &attr); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { mq_attr_to32(&oattr, &oattr32); bzero(oattr32.__reserved, sizeof(oattr32.__reserved)); error = copyout(&oattr32, uap->oattr, sizeof(oattr32)); } return (error); } int freebsd32_kmq_timedsend(struct thread *td, struct freebsd32_kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int freebsd32_kmq_timedreceive(struct thread *td, struct freebsd32_kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap) { struct sigevent ev, *evp; struct sigevent32 ev32; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev32, sizeof(ev32)); if (error != 0) return (error); error = convert_sigevent32(&ev32, &ev); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static struct syscall_helper_data mq32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_kmq_open), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink), SYSCALL_INIT_LAST }; #endif static int mqinit(void) { int error; error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int mqunload(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(mq32_syscalls); #endif syscall_helper_unregister(mq_syscalls); return (0); } static int mq_modload(struct module *module, int cmd, void *arg) { int error = 0; error = vfs_modevent(module, cmd, arg); if (error != 0) return (error); switch (cmd) { case MOD_LOAD: error = mqinit(); if (error != 0) mqunload(); break; case MOD_UNLOAD: error = mqunload(); break; default: break; } return (error); } static moduledata_t mqueuefs_mod = { "mqueuefs", mq_modload, &mqueuefs_vfsconf }; DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(mqueuefs, 1); Index: head/sys/kern/uipc_sem.c =================================================================== --- head/sys/kern/uipc_sem.c (revision 333424) +++ head/sys/kern/uipc_sem.c (revision 333425) @@ -1,1112 +1,1110 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002 Alfred Perlstein * Copyright (c) 2003-2005 SPARTA, Inc. * Copyright (c) 2005, 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support"); /* * TODO * * - Resource limits? * - Replace global sem_lock with mtx_pool locks? * - Add a MAC check_create() hook for creating new named semaphores. */ #ifndef SEM_MAX #define SEM_MAX 30 #endif #ifdef SEM_DEBUG #define DP(x) printf x #else #define DP(x) #endif struct ksem_mapping { char *km_path; Fnv32_t km_fnv; struct ksem *km_ksem; LIST_ENTRY(ksem_mapping) km_link; }; static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor"); static LIST_HEAD(, ksem_mapping) *ksem_dictionary; static struct sx ksem_dict_lock; static struct mtx ksem_count_lock; static struct mtx sem_lock; static u_long ksem_hash; static int ksem_dead; #define KSEM_HASH(fnv) (&ksem_dictionary[(fnv) & ksem_hash]) static int nsems = 0; SYSCTL_DECL(_p1003_1b); SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0, "Number of active kernel POSIX semaphores"); static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime); static int ksem_access(struct ksem *ks, struct ucred *ucred); static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value); static int ksem_create(struct thread *td, const char *path, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32); static void ksem_drop(struct ksem *ks); static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp); static struct ksem *ksem_hold(struct ksem *ks); static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks); static struct ksem *ksem_lookup(char *path, Fnv32_t fnv); static void ksem_module_destroy(void); static int ksem_module_init(void); static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static int sem_modload(struct module *module, int cmd, void *arg); static fo_stat_t ksem_stat; static fo_close_t ksem_closef; static fo_chmod_t ksem_chmod; static fo_chown_t ksem_chown; static fo_fill_kinfo_t ksem_fill_kinfo; /* File descriptor operations. */ static struct fileops ksem_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = ksem_stat, .fo_close = ksem_closef, .fo_chmod = ksem_chmod, .fo_chown = ksem_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ksem_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; FEATURE(posix_sem, "POSIX semaphores"); static int ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct ksem *ks; #ifdef MAC int error; #endif ks = fp->f_data; #ifdef MAC error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a semaphore * file descriptor. */ bzero(sb, sizeof(*sb)); mtx_lock(&sem_lock); sb->st_atim = ks->ks_atime; sb->st_ctim = ks->ks_ctime; sb->st_mtim = ks->ks_mtime; sb->st_birthtim = ks->ks_birthtime; sb->st_uid = ks->ks_uid; sb->st_gid = ks->ks_gid; sb->st_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); return (0); } static int ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setmode(active_cred, ks, mode); if (error != 0) goto out; #endif error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; ks->ks_mode = mode & ACCESSPERMS; out: mtx_unlock(&sem_lock); return (error); } static int ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setowner(active_cred, ks, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = ks->ks_uid; if (gid == (gid_t)-1) gid = ks->ks_gid; if (((uid != ks->ks_uid && uid != active_cred->cr_uid) || (gid != ks->ks_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; ks->ks_uid = uid; ks->ks_gid = gid; out: mtx_unlock(&sem_lock); return (error); } static int ksem_closef(struct file *fp, struct thread *td) { struct ksem *ks; ks = fp->f_data; fp->f_data = NULL; ksem_drop(ks); return (0); } static int ksem_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { const char *path, *pr_path; struct ksem *ks; size_t pr_pathlen; kif->kf_type = KF_TYPE_SEM; ks = fp->f_data; mtx_lock(&sem_lock); kif->kf_un.kf_sem.kf_sem_value = ks->ks_value; kif->kf_un.kf_sem.kf_sem_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); if (ks->ks_path != NULL) { sx_slock(&ksem_dict_lock); if (ks->ks_path != NULL) { path = ks->ks_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); if (strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/') path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } sx_sunlock(&ksem_dict_lock); } return (0); } /* * ksem object management including creation and reference counting * routines. */ static struct ksem * ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value) { struct ksem *ks; mtx_lock(&ksem_count_lock); if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) { mtx_unlock(&ksem_count_lock); return (NULL); } nsems++; mtx_unlock(&ksem_count_lock); ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO); ks->ks_uid = ucred->cr_uid; ks->ks_gid = ucred->cr_gid; ks->ks_mode = mode; ks->ks_value = value; cv_init(&ks->ks_cv, "ksem"); vfs_timestamp(&ks->ks_birthtime); ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime; refcount_init(&ks->ks_ref, 1); #ifdef MAC mac_posixsem_init(ks); mac_posixsem_create(ucred, ks); #endif return (ks); } static struct ksem * ksem_hold(struct ksem *ks) { refcount_acquire(&ks->ks_ref); return (ks); } static void ksem_drop(struct ksem *ks) { if (refcount_release(&ks->ks_ref)) { #ifdef MAC mac_posixsem_destroy(ks); #endif cv_destroy(&ks->ks_cv); free(ks, M_KSEM); mtx_lock(&ksem_count_lock); nsems--; mtx_unlock(&ksem_count_lock); } } /* * Determine if the credentials have sufficient permissions for read * and write access. */ static int ksem_access(struct ksem *ks, struct ucred *ucred) { int error; error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VREAD | VWRITE, ucred, NULL); if (error) error = priv_check_cred(ucred, PRIV_SEM_WRITE, 0); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to semaphore objects. We use the FNV hash on the path to * store the mappings in a hash table. */ static struct ksem * ksem_lookup(char *path, Fnv32_t fnv) { struct ksem_mapping *map; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) return (map->km_ksem); } return (NULL); } static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks) { struct ksem_mapping *map; map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK); map->km_path = path; map->km_fnv = fnv; map->km_ksem = ksem_hold(ks); ks->ks_path = path; LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link); } static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct ksem_mapping *map; int error; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) { #ifdef MAC error = mac_posixsem_check_unlink(ucred, map->km_ksem); if (error) return (error); #endif error = ksem_access(map->km_ksem, ucred); if (error) return (error); map->km_ksem->ks_path = NULL; LIST_REMOVE(map, km_link); ksem_drop(map->km_ksem); free(map->km_path, M_KSEM); free(map, M_KSEM); return (0); } } return (ENOENT); } static int ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd, int compat32) { semid_t semid; #ifdef COMPAT_FREEBSD32 int32_t semid32; #endif void *ptr; size_t ptrs; #ifdef COMPAT_FREEBSD32 if (compat32) { semid32 = fd; ptr = &semid32; ptrs = sizeof(semid32); } else { #endif semid = fd; ptr = &semid; ptrs = sizeof(semid); compat32 = 0; /* silence gcc */ #ifdef COMPAT_FREEBSD32 } #endif return (copyout(ptr, semidp, ptrs)); } /* Other helper routines. */ static int ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32) { struct filedesc *fdp; struct ksem *ks; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error, fd; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); AUDIT_ARG_VALUE(value); if (value > SEM_VALUE_MAX) return (EINVAL); fdp = td->td_proc->p_fd; mode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) { if (name == NULL) error = ENOSPC; return (error); } /* * Go ahead and copyout the file descriptor now. This is a bit * premature, but it is a lot easier to handle errors as opposed * to later when we've possibly created a new semaphore, etc. */ error = ksem_create_copyout_semid(td, semidp, fd, compat32); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } if (name == NULL) { /* Create an anonymous semaphore. */ ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENOSPC; else ks->ks_flags |= KS_ANONYMOUS; } else { path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); /* Require paths to start with a '/' character. */ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_KSEM); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); ks = ksem_lookup(path, fnv); if (ks == NULL) { /* Object does not exist, create it if requested. */ if (flags & O_CREAT) { ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENFILE; else { ksem_insert(path, fnv, ks); path = NULL; } } else error = ENOENT; } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixsem_check_open(td->td_ucred, ks); if (error == 0) #endif error = ksem_access(ks, td->td_ucred); } if (error == 0) ksem_hold(ks); #ifdef INVARIANTS else ks = NULL; #endif } sx_xunlock(&ksem_dict_lock); if (path) free(path, M_KSEM); } if (error) { KASSERT(ks == NULL, ("ksem_create error with a ksem")); fdclose(td, fp, fd); fdrop(fp, td); return (error); } KASSERT(ks != NULL, ("ksem_create w/o a ksem")); finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops); fdrop(fp, td); return (0); } static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp) { struct ksem *ks; struct file *fp; int error; error = fget(td, id, rightsp, &fp); if (error) return (EINVAL); if (fp->f_type != DTYPE_SEM) { fdrop(fp, td); return (EINVAL); } ks = fp->f_data; if (ks->ks_flags & KS_DEAD) { fdrop(fp, td); return (EINVAL); } *fpp = fp; return (0); } /* System calls. */ #ifndef _SYS_SYSPROTO_H_ struct ksem_init_args { unsigned int value; semid_t *idp; }; #endif int sys_ksem_init(struct thread *td, struct ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_open_args { char *name; int oflag; mode_t mode; unsigned int value; semid_t *idp; }; #endif int sys_ksem_open(struct thread *td, struct ksem_open_args *uap) { DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid)); if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_unlink_args { char *name; }; #endif int sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); error = ksem_remove(path, fnv, td->td_ucred); sx_xunlock(&ksem_dict_lock); free(path, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_close_args { semid_t id; }; #endif int sys_ksem_close(struct thread *td, struct ksem_close_args *uap) { - cap_rights_t rights; struct ksem *ks; struct file *fp; int error; /* No capability rights required to close a semaphore. */ AUDIT_ARG_FD(uap->id); - error = ksem_get(td, uap->id, cap_rights_init(&rights), &fp); + error = ksem_get(td, uap->id, &cap_no_rights, &fp); if (error) return (error); ks = fp->f_data; if (ks->ks_flags & KS_ANONYMOUS) { fdrop(fp, td); return (EINVAL); } error = kern_close(td, uap->id); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_post_args { semid_t id; }; #endif int sys_ksem_post(struct thread *td, struct ksem_post_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights, CAP_SEM_POST), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks); if (error) goto err; #endif if (ks->ks_value == SEM_VALUE_MAX) { error = EOVERFLOW; goto err; } ++ks->ks_value; if (ks->ks_waiters > 0) cv_signal(&ks->ks_cv); error = 0; vfs_timestamp(&ks->ks_ctime); err: mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_wait_args { semid_t id; }; #endif int sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap) { return (kern_sem_wait(td, uap->id, 0, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_timedwait_args { semid_t id; const struct timespec *abstime; }; #endif int sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap) { struct timespec abstime; struct timespec *ts; int error; /* * We allow a null timespec (wait forever). */ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime, sizeof(abstime)); if (error != 0) return (error); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_trywait_args { semid_t id; }; #endif int sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap) { return (kern_sem_wait(td, uap->id, 1, NULL)); } static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime) { struct timespec ts1, ts2; struct timeval tv; cap_rights_t rights; struct file *fp; struct ksem *ks; int error; DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid)); AUDIT_ARG_FD(id); error = ksem_get(td, id, cap_rights_init(&rights, CAP_SEM_WAIT), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); DP((">>> kern_sem_wait critical section entered! pid=%d\n", (int)td->td_proc->p_pid)); #ifdef MAC error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks); if (error) { DP(("kern_sem_wait mac failed\n")); goto err; } #endif DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag)); vfs_timestamp(&ks->ks_atime); while (ks->ks_value == 0) { ks->ks_waiters++; if (tryflag != 0) error = EAGAIN; else if (abstime == NULL) error = cv_wait_sig(&ks->ks_cv, &sem_lock); else { for (;;) { ts1 = *abstime; getnanotime(&ts2); timespecsub(&ts1, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts1); if (tv.tv_sec < 0) { error = ETIMEDOUT; break; } error = cv_timedwait_sig(&ks->ks_cv, &sem_lock, tvtohz(&tv)); if (error != EWOULDBLOCK) break; } } ks->ks_waiters--; if (error) goto err; } ks->ks_value--; DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value)); error = 0; err: mtx_unlock(&sem_lock); fdrop(fp, td); DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n", (int)td->td_proc->p_pid, error)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_getvalue_args { semid_t id; int *val; }; #endif int sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error, val; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights, CAP_SEM_GETVALUE), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks); if (error) { mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #endif val = ks->ks_value; vfs_timestamp(&ks->ks_atime); mtx_unlock(&sem_lock); fdrop(fp, td); error = copyout(&val, uap->val, sizeof(val)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_destroy_args { semid_t id; }; #endif int sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap) { - cap_rights_t rights; struct file *fp; struct ksem *ks; int error; /* No capability rights required to close a semaphore. */ AUDIT_ARG_FD(uap->id); - error = ksem_get(td, uap->id, cap_rights_init(&rights), &fp); + error = ksem_get(td, uap->id, &cap_no_rights, &fp); if (error) return (error); ks = fp->f_data; if (!(ks->ks_flags & KS_ANONYMOUS)) { fdrop(fp, td); return (EINVAL); } mtx_lock(&sem_lock); if (ks->ks_waiters != 0) { mtx_unlock(&sem_lock); error = EBUSY; goto err; } ks->ks_flags |= KS_DEAD; mtx_unlock(&sem_lock); error = kern_close(td, uap->id); err: fdrop(fp, td); return (error); } static struct syscall_helper_data ksem_syscalls[] = { SYSCALL_INIT_HELPER(ksem_init), SYSCALL_INIT_HELPER(ksem_open), SYSCALL_INIT_HELPER(ksem_unlink), SYSCALL_INIT_HELPER(ksem_close), SYSCALL_INIT_HELPER(ksem_post), SYSCALL_INIT_HELPER(ksem_wait), SYSCALL_INIT_HELPER(ksem_timedwait), SYSCALL_INIT_HELPER(ksem_trywait), SYSCALL_INIT_HELPER(ksem_getvalue), SYSCALL_INIT_HELPER(ksem_destroy), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include int freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 1)); } int freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap) { if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 1)); } int freebsd32_ksem_timedwait(struct thread *td, struct freebsd32_ksem_timedwait_args *uap) { struct timespec32 abstime32; struct timespec *ts, abstime; int error; /* * We allow a null timespec (wait forever). */ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime32, sizeof(abstime32)); if (error != 0) return (error); CP(abstime32, abstime, tv_sec); CP(abstime32, abstime, tv_nsec); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } static struct syscall_helper_data ksem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_ksem_init), SYSCALL32_INIT_HELPER(freebsd32_ksem_open), SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink), SYSCALL32_INIT_HELPER_COMPAT(ksem_close), SYSCALL32_INIT_HELPER_COMPAT(ksem_post), SYSCALL32_INIT_HELPER_COMPAT(ksem_wait), SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait), SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait), SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue), SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy), SYSCALL_INIT_LAST }; #endif static int ksem_module_init(void) { int error; mtx_init(&sem_lock, "sem", NULL, MTX_DEF); mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF); sx_init(&ksem_dict_lock, "ksem dictionary"); ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L); p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX); p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX); error = syscall_helper_register(ksem_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(ksem32_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #endif return (0); } static void ksem_module_destroy(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(ksem32_syscalls); #endif syscall_helper_unregister(ksem_syscalls); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0); hashdestroy(ksem_dictionary, M_KSEM, ksem_hash); sx_destroy(&ksem_dict_lock); mtx_destroy(&ksem_count_lock); mtx_destroy(&sem_lock); p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX); p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX); } static int sem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = ksem_module_init(); if (error) ksem_module_destroy(); break; case MOD_UNLOAD: mtx_lock(&ksem_count_lock); if (nsems != 0) { error = EOPNOTSUPP; mtx_unlock(&ksem_count_lock); break; } ksem_dead = 1; mtx_unlock(&ksem_count_lock); ksem_module_destroy(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sem_mod = { "sem", &sem_modload, NULL }; DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sem, 1); Index: head/sys/kern/uipc_syscalls.c =================================================================== --- head/sys/kern/uipc_syscalls.c (revision 333424) +++ head/sys/kern/uipc_syscalls.c (revision 333425) @@ -1,1574 +1,1564 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); static int sockargs(struct mbuf **, char *, socklen_t, int); /* * Convert a user file descriptor to a kernel file entry and check if required * capability rights are present. * If required copy of current set of capability rights is returned. * A reference on the file entry is held upon returning. */ int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp, u_int *fflagp, struct filecaps *havecapsp) { struct file *fp; int error; error = fget_cap(td, fd, rightsp, &fp, havecapsp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); if (havecapsp != NULL) filecaps_free(havecapsp); return (ENOTSOCK); } if (fflagp != NULL) *fflagp = fp->f_flag; *fpp = fp; return (0); } /* * System call interface to the socket abstraction. */ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int sys_socket(struct thread *td, struct socket_args *uap) { return (kern_socket(td, uap->domain, uap->type, uap->protocol)); } int kern_socket(struct thread *td, int domain, int type, int protocol) { struct socket *so; struct file *fp; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = falloc(td, &fp, &fd, oflag); if (error != 0) return (error); /* An extra reference on `fp' has been held for us by falloc(). */ error = socreate(domain, &so, type, protocol, td->td_ucred, td); if (error != 0) { fdclose(td, fp, fd); } else { finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); if ((fflag & FNONBLOCK) != 0) (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } int sys_bind(struct thread *td, struct bind_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; - cap_rights_t rights; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD)) return (ECAPMODE); #endif AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); - error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND), + error = getsock_cap(td, fd, &cap_bind_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_bind(td->td_ucred, so, sa); if (error == 0) { #endif if (dirfd == AT_FDCWD) error = sobind(so, sa, td); else error = sobindat(dirfd, so, sa, td); #ifdef MAC } #endif fdrop(fp, td); return (error); } int sys_bindat(struct thread *td, struct bindat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int sys_listen(struct thread *td, struct listen_args *uap) { return (kern_listen(td, uap->s, uap->backlog)); } int kern_listen(struct thread *td, int s, int backlog) { struct socket *so; struct file *fp; - cap_rights_t rights; int error; AUDIT_ARG_FD(s); - error = getsock_cap(td, s, cap_rights_init(&rights, CAP_LISTEN), + error = getsock_cap(td, s, &cap_listen_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif error = solisten(so, backlog, td); fdrop(fp, td); } return (error); } /* * accept1() */ static int accept1(td, s, uname, anamelen, flags) struct thread *td; int s; struct sockaddr *uname; socklen_t *anamelen; int flags; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uname == NULL) return (kern_accept4(td, s, NULL, NULL, flags, NULL)); error = copyin(anamelen, &namelen, sizeof (namelen)); if (error != 0) return (error); error = kern_accept4(td, s, &name, &namelen, flags, &fp); if (error != 0) return (error); if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK if (flags & ACCEPT4_COMPAT) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uname, namelen); } if (error == 0) error = copyout(&namelen, anamelen, sizeof(namelen)); if (error != 0) fdclose(td, fp, td->td_retval[0]); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); } int kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp) { struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; struct socket *head, *so; struct filecaps fcaps; - cap_rights_t rights; u_int fflag; pid_t pgid; int error, fd, tmp; if (name != NULL) *name = NULL; AUDIT_ARG_FD(s); - error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT), + error = getsock_cap(td, s, &cap_accept_rights, &headfp, &fflag, &fcaps); if (error != 0) return (error); head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(td->td_ucred, head); if (error != 0) goto done; #endif error = falloc_caps(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; SOCK_LOCK(head); if (!SOLISTENING(head)) { SOCK_UNLOCK(head); error = EINVAL; goto noconnection; } error = solisten_dequeue(head, &so, flags); if (error != 0) goto noconnection; /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* Connection has been removed from the listen queue. */ KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { fflag &= ~(FNONBLOCK | FASYNC); if (flags & SOCK_NONBLOCK) fflag |= FNONBLOCK; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); error = soaccept(so, &sa); if (error != 0) goto noconnection; if (sa == NULL) { if (name) *namelen = 0; goto done; } AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif *name = sa; sa = NULL; } noconnection: free(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (nfp == NULL) filecaps_free(&fcaps); if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int sys_accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); } int sys_accept4(td, uap) struct thread *td; struct accept4_args *uap; { if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); } #ifdef COMPAT_OLDSOCK int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT | ACCEPT4_COMPAT)); } #endif /* COMPAT_OLDSOCK */ int sys_connect(struct thread *td, struct connect_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; - cap_rights_t rights; int error, interrupted = 0; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD)) return (ECAPMODE); #endif AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); - error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT), + error = getsock_cap(td, fd, &cap_connect_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_connect(td->td_ucred, so, sa); if (error != 0) goto bad; #endif if (dirfd == AT_FDCWD) error = soconnect(so, sa, td); else error = soconnectat(dirfd, so, sa, td); if (error != 0) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } int sys_connectat(struct thread *td, struct connectat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv) { struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = socreate(domain, &so1, type, protocol, td->td_ucred, td); if (error != 0) return (error); error = socreate(domain, &so2, type, protocol, td->td_ucred, td); if (error != 0) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd, oflag); if (error != 0) goto free2; rsv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd, oflag); if (error != 0) goto free3; fp2->f_data = so2; /* so2 already has ref count */ rsv[1] = fd; error = soconnect2(so1, so2); if (error != 0) goto free4; if (type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error != 0) goto free4; } finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, &socketops); if ((fflag & FNONBLOCK) != 0) { (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); } fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(td, fp2, rsv[1]); fdrop(fp2, td); free3: fdclose(td, fp1, rsv[0]); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } int sys_socketpair(struct thread *td, struct socketpair_args *uap) { int error, sv[2]; error = kern_socketpair(td, uap->domain, uap->type, uap->protocol, sv); if (error != 0) return (error); error = copyout(sv, uap->rsv, 2 * sizeof(int)); if (error != 0) { (void)kern_close(td, sv[0]); (void)kern_close(td, sv[1]); } return (error); } static int sendit(struct thread *td, int s, struct msghdr *mp, int flags) { struct mbuf *control; struct sockaddr *to; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) return (ECAPMODE); #endif if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: free(to, M_SONAME); return (error); } int kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags, struct mbuf *control, enum uio_seg segflg) { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int i, error; AUDIT_ARG_FD(s); cap_rights_init(&rights, CAP_SEND); if (mp->msg_name != NULL) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); cap_rights_set(&rights, CAP_CONNECT); } error = getsock_cap(td, s, &rights, &fp, NULL, NULL); if (error != 0) { m_freem(control); return (error); } so = (struct socket *)fp->f_data; #ifdef KTRACE if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(mp->msg_name); #endif #ifdef MAC if (mp->msg_name != NULL) { error = mac_socket_check_connect(td->td_ucred, so, mp->msg_name); if (error != 0) { m_freem(control); goto bad; } } error = mac_socket_check_send(td->td_ucred, so); if (error != 0) { m_freem(control); goto bad; } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; m_freem(control); goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sys_sendto(struct thread *td, struct sendto_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = uap->to; msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; return (sendit(td, uap->s, &msg, uap->flags)); } #ifdef COMPAT_OLDSOCK int osend(struct thread *td, struct osend_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; return (sendit(td, uap->s, &msg, uap->flags)); } int osendmsg(struct thread *td, struct osendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sys_sendmsg(struct thread *td, struct sendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, struct mbuf **controlp) { struct uio auio; struct iovec *iov; struct mbuf *m, *control = NULL; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = NULL; - cap_rights_t rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, i; if (controlp != NULL) *controlp = NULL; AUDIT_ARG_FD(s); - error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV), + error = getsock_cap(td, s, &cap_recv_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, NULL, (mp->msg_control || controlp) ? &control : NULL, &mp->msg_flags); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } if (fromsa != NULL) AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == NULL) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error != 0) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. */ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout(mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fdrop(fp, td); #ifdef KTRACE if (fromsa && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif free(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); return (error); } static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp) { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error != 0) return (error); if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int sys_recvfrom(struct thread *td, struct recvfrom_args *uap) { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error != 0) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return (error); } #ifdef COMPAT_OLDSOCK int orecvfrom(struct thread *td, struct recvfrom_args *uap) { uap->flags |= MSG_COMPAT; return (sys_recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK int orecv(struct thread *td, struct orecv_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; return (recvit(td, uap->s, &msg, NULL)); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(struct thread *td, struct orecvmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int sys_recvmsg(struct thread *td, struct recvmsg_args *uap) { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } int sys_shutdown(struct thread *td, struct shutdown_args *uap) { return (kern_shutdown(td, uap->s, uap->how)); } int kern_shutdown(struct thread *td, int s, int how) { struct socket *so; struct file *fp; - cap_rights_t rights; int error; AUDIT_ARG_FD(s); - error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SHUTDOWN), + error = getsock_cap(td, s, &cap_shutdown_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, how); /* * Previous versions did not return ENOTCONN, but 0 in * case the socket was not connected. Some important * programs like syslogd up to r279016, 2015-02-19, * still depend on this behavior. */ if (error == ENOTCONN && td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) error = 0; fdrop(fp, td); } return (error); } int sys_setsockopt(struct thread *td, struct setsockopt_args *uap) { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t valsize) { struct socket *so; struct file *fp; struct sockopt sopt; - cap_rights_t rights; int error; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } AUDIT_ARG_FD(s); - error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT), + error = getsock_cap(td, s, &cap_setsockopt_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return(error); } int sys_getsockopt(struct thread *td, struct getsockopt_args *uap) { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error != 0) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or userspace. optlen is always a kernel pointer. */ int kern_getsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t *valsize) { struct socket *so; struct file *fp; struct sockopt sopt; - cap_rights_t rights; int error; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } AUDIT_ARG_FD(s); - error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT), + error = getsock_cap(td, s, &cap_getsockopt_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name. */ static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error != 0) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; - cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); - error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME), + error = getsock_cap(td, fd, &cap_getsockname_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: fdrop(fp, td); if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int sys_getsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error != 0) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; - cap_rights_t rights; socklen_t len; int error; AUDIT_ARG_FD(fd); - error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME), + error = getsock_cap(td, fd, &cap_getpeername_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int sys_getpeername(struct thread *td, struct getpeername_args *uap) { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(struct thread *td, struct ogetpeername_args *uap) { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ static int sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) { struct sockaddr *sa; struct mbuf *m; int error; if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && buflen <= 112) buflen = MLEN; /* unix domain compat. hack */ else #endif if (buflen > MCLBYTES) return (EINVAL); } m = m_get2(buflen, M_WAITOK, type, 0); m->m_len = buflen; error = copyin(buf, mtod(m, void *), buflen); if (error != 0) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len) { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); sa = malloc(len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error != 0) { free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } Index: head/sys/kern/vfs_aio.c =================================================================== --- head/sys/kern/vfs_aio.c (revision 333424) +++ head/sys/kern/vfs_aio.c (revision 333425) @@ -1,2989 +1,2985 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. (XXX will be removed soon.) */ static u_long jobrefid; /* * Counter for aio_fsync. */ static uint64_t jobseqno; #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */ #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif FEATURE(aio, "Asynchronous I/O"); SYSCTL_DECL(_p1003_1b); static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); static MALLOC_DEFINE(M_AIOS, "aios", "aio_suspend aio control block list"); static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management"); static int enable_aio_unsafe = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0, "Permit asynchronous IO on all file types, not just known-safe types"); static unsigned int unsafe_warningcnt = 1; SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW, &unsafe_warningcnt, 0, "Warnings that will be triggered upon failed IO requests on unsafe files"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel processes to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel processes for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. */ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel processes for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); static int num_unmapped_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio, 0, "Number of aio requests presently handled by unmapped I/O buffers"); /* Number of async I/O processes in the process of being started */ /* XXX This should be local to aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process"); /* * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with * vfs.aio.aio_listio_max. */ SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max, CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc, 0, "Maximum aio requests for a single lio_listio call"); #ifdef COMPAT_FREEBSD6 typedef struct oaiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ volatile void *aio_buf; /* I/O buffer in process space */ size_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private _aiocb_private; } oaiocb_t; #endif /* * Below is a key of locks used to protect each member of struct kaiocb * aioliojob and kaioinfo and any backends. * * * - need not protected * a - locked by kaioinfo lock * b - locked by backend lock, the backend lock can be null in some cases, * for example, BIO belongs to this type, in this case, proc lock is * reused. * c - locked by aio_job_mtx, the lock for the generic file I/O backend. */ /* * If the routine that services an AIO request blocks while running in an * AIO kernel process it can starve other I/O requests. BIO requests * queued via aio_qphysio() complete in GEOM and do not use AIO kernel * processes at all. Socket I/O requests use a separate pool of * kprocs and also force non-blocking I/O. Other file I/O requests * use the generic fo_read/fo_write operations which can block. The * fsync and mlock operations can also block while executing. Ideally * none of these requests would block while executing. * * Note that the service routines cannot toggle O_NONBLOCK in the file * structure directly while handling a request due to races with * userland threads. */ /* jobflags */ #define KAIOCB_QUEUEING 0x01 #define KAIOCB_CANCELLED 0x02 #define KAIOCB_CANCELLING 0x04 #define KAIOCB_CHECKSYNC 0x08 #define KAIOCB_CLEARED 0x10 #define KAIOCB_FINISHED 0x20 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ struct aioproc { int aioprocflags; /* (c) AIO proc flags */ TAILQ_ENTRY(aioproc) list; /* (c) list of processes */ struct proc *aioproc; /* (*) the AIO proc */ }; /* * data-structure for lio signal management */ struct aioliojob { int lioj_flags; /* (a) listio flags */ int lioj_count; /* (a) listio flags */ int lioj_finished_count; /* (a) listio flags */ struct sigevent lioj_signal; /* (a) signal on all I/O done */ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ struct knlist klist; /* (a) list of knotes */ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ /* * per process aio data structure */ struct kaioinfo { struct mtx kaio_mtx; /* the lock to protect this struct */ int kaio_flags; /* (a) per process kaio flags */ int kaio_active_count; /* (c) number of currently used AIOs */ int kaio_count; /* (a) size of AIO queue */ int kaio_buffer_count; /* (a) number of physio buffers */ TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */ TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */ TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */ TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */ struct task kaio_task; /* (*) task to kick aio processes */ struct task kaio_sync_task; /* (*) task to schedule fsync jobs */ }; #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) #define AIO_MTX(ki) (&(ki)->kaio_mtx) #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */ /* * Operations used to interact with userland aio control blocks. * Different ABIs provide their own operations. */ struct aiocb_ops { int (*copyin)(struct aiocb *ujob, struct aiocb *kjob); long (*fetch_status)(struct aiocb *ujob); long (*fetch_error)(struct aiocb *ujob); int (*store_status)(struct aiocb *ujob, long status); int (*store_error)(struct aiocb *ujob, long error); int (*store_kernelinfo)(struct aiocb *ujob, long jobref); int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); }; static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */ static struct sema aio_newproc_sem; static struct mtx aio_job_mtx; static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; void aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct kaiocb *job); static void aio_process_rw(struct kaiocb *job); static void aio_process_sync(struct kaiocb *job); static void aio_process_mlock(struct kaiocb *job); static void aio_schedule_fsync(void *context, int pending); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lio, int type, struct aiocb_ops *ops); static int aio_queue_file(struct file *fp, struct kaiocb *job); static void aio_physwakeup(struct bio *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qphysio(struct proc *p, struct kaiocb *job); static void aio_daemon(void *param); static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job); static bool aio_clear_cancel_function_locked(struct kaiocb *job); static int aio_kick(struct proc *userp); static void aio_kick_nowait(struct proc *userp); static void aio_kick_helper(void *context, int pending); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); static int filt_lioattach(struct knote *kn); static void filt_liodetach(struct knote *kn); static int filt_lio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiop async io process data * aiocb async io jobs * aiolio list io jobs */ static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiolio_zone; /* kqueue filters for aio */ static struct filterops aio_filtops = { .f_isfd = 0, .f_attach = filt_aioattach, .f_detach = filt_aiodetach, .f_event = filt_aio, }; static struct filterops lio_filtops = { .f_isfd = 0, .f_attach = filt_lioattach, .f_detach = filt_liodetach, .f_event = filt_lio }; static eventhandler_tag exit_tag, exec_tag; TASKQUEUE_DEFINE_THREAD(aiod_kick); /* * Main operations function for use as a kernel module. */ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static int aio_onceonly(void) { exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); TAILQ_INIT(&aio_freeproc); sema_init(&aio_newproc_sem, 0, "aio_new_proc"); mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); TAILQ_INIT(&aio_jobs); aiod_unr = new_unrhdr(1, INT_MAX, NULL); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiop_zone = uma_zcreate("AIOP", sizeof(struct aioproc), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); ki->kaio_flags = 0; ki->kaio_active_count = 0; ki->kaio_count = 0; ki->kaio_buffer_count = 0; TAILQ_INIT(&ki->kaio_all); TAILQ_INIT(&ki->kaio_done); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_syncqueue); TAILQ_INIT(&ki->kaio_syncready); TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki); PROC_LOCK(p); if (p->p_aioinfo == NULL) { p->p_aioinfo = ki; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); } while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) aio_newproc(NULL); } static int aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi) { struct thread *td; int error; error = sigev_findtd(p, sigev, &td); if (error) return (error); if (!KSI_ONQ(ksi)) { ksiginfo_set_sigev(ksi, sigev); ksi->ksi_code = SI_ASYNCIO; ksi->ksi_flags |= KSI_EXT | KSI_INS; tdsendsignal(p, td, ksi->ksi_signo, ksi); } PROC_UNLOCK(p); return (error); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. */ static int aio_free_entry(struct kaiocb *job) { struct kaioinfo *ki; struct aioliojob *lj; struct proc *p; p = job->userproc; MPASS(curproc == p); ki = p->p_aioinfo; MPASS(ki != NULL); AIO_LOCK_ASSERT(ki, MA_OWNED); MPASS(job->jobflags & KAIOCB_FINISHED); atomic_subtract_int(&num_queue_count, 1); ki->kaio_count--; MPASS(ki->kaio_count >= 0); TAILQ_REMOVE(&ki->kaio_done, job, plist); TAILQ_REMOVE(&ki->kaio_all, job, allist); lj = job->lio; if (lj) { lj->lioj_count--; lj->lioj_finished_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); /* lio is going away, we need to destroy any knotes */ knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } } /* job is going away, we need to destroy any knotes */ knlist_delete(&job->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&job->ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); /* * The thread argument here is used to find the owning process * and is also passed to fo_close() which may pass it to various * places such as devsw close() routines. Because of that, we * need a thread pointer from the process owning the job that is * persistent and won't disappear out from under us or move to * another process. * * Currently, all the callers of this function call it to remove * a kaiocb from the current process' job list either via a * syscall or due to the current process calling exit() or * execve(). Thus, we know that p == curproc. We also know that * curthread can't exit since we are curthread. * * Therefore, we use curthread as the thread to pass to * knlist_delete(). This does mean that it is possible for the * thread pointer at close time to differ from the thread pointer * at open time, but this is already true of file descriptors in * a multithreaded process. */ if (job->fd_file) fdrop(job->fd_file, curthread); crfree(job->cred); uma_zfree(aiocb_zone, job); AIO_LOCK(ki); return (0); } static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { aio_proc_rundown(arg, p); } static int aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job) { aio_cancel_fn_t *func; int cancelled; AIO_LOCK_ASSERT(ki, MA_OWNED); if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED)) return (0); MPASS((job->jobflags & KAIOCB_CANCELLING) == 0); job->jobflags |= KAIOCB_CANCELLED; func = job->cancel_fn; /* * If there is no cancel routine, just leave the job marked as * cancelled. The job should be in active use by a caller who * should complete it normally or when it fails to install a * cancel routine. */ if (func == NULL) return (0); /* * Set the CANCELLING flag so that aio_complete() will defer * completions of this job. This prevents the job from being * freed out from under the cancel callback. After the * callback any deferred completion (whether from the callback * or any other source) will be completed. */ job->jobflags |= KAIOCB_CANCELLING; AIO_UNLOCK(ki); func(job); AIO_LOCK(ki); job->jobflags &= ~KAIOCB_CANCELLING; if (job->jobflags & KAIOCB_FINISHED) { cancelled = job->uaiocb._aiocb_private.error == ECANCELED; TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(p, job); } else { /* * The cancel callback might have scheduled an * operation to cancel this request, but it is * only counted as cancelled if the request is * cancelled when the callback returns. */ cancelled = 0; } return (cancelled); } /* * Rundown the jobs for a given process. */ static void aio_proc_rundown(void *arg, struct proc *p) { struct kaioinfo *ki; struct aioliojob *lj; struct kaiocb *job, *jobn; KASSERT(curthread->td_proc == p, ("%s: called on non-curproc", __func__)); ki = p->p_aioinfo; if (ki == NULL) return; AIO_LOCK(ki); ki->kaio_flags |= KAIO_RUNDOWN; restart: /* * Try to cancel all pending requests. This code simulates * aio_cancel on all pending I/O requests. */ TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { aio_cancel_job(p, ki, job); } /* Wait for all running I/O to be finished */ if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) { ki->kaio_flags |= KAIO_WAKEUP; msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); goto restart; } /* Free all completed I/O requests. */ while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL) aio_free_entry(job); while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } else { panic("LIO job not cleaned up: C:%d, FC:%d\n", lj->lioj_count, lj->lioj_finished_count); } } AIO_UNLOCK(ki); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon). */ static struct kaiocb * aio_selectjob(struct aioproc *aiop) { struct kaiocb *job; struct kaioinfo *ki; struct proc *userp; mtx_assert(&aio_job_mtx, MA_OWNED); restart: TAILQ_FOREACH(job, &aio_jobs, list) { userp = job->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < max_aio_per_proc) { TAILQ_REMOVE(&aio_jobs, job, list); if (!aio_clear_cancel_function(job)) goto restart; /* Account for currently active jobs. */ ki->kaio_active_count++; break; } } return (job); } /* * Move all data to a permanent storage device. This code * simulates the fsync syscall. */ static int aio_fsync_vnode(struct thread *td, struct vnode *vp) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); drop: return (error); } /* * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that * does the I/O request for the non-physio version of the operations. The * normal vn operations are used, and this code should work in all instances * for every type of file, including pipes, sockets, fifos, and regular files. * * XXX I don't think it works well for socket, pipe, and fifo. */ static void aio_process_rw(struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct aiocb *cb; struct file *fp; struct uio auio; struct iovec aiov; ssize_t cnt; long msgsnd_st, msgsnd_end; long msgrcv_st, msgrcv_end; long oublock_st, oublock_end; long inblock_st, inblock_end; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || job->uaiocb.aio_lio_opcode == LIO_WRITE, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); td = curthread; td_savedcred = td->td_ucred; td->td_ucred = job->cred; cb = &job->uaiocb; fp = job->fd_file; aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; aiov.iov_len = cb->aio_nbytes; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = cb->aio_offset; auio.uio_resid = cb->aio_nbytes; cnt = cb->aio_nbytes; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; msgrcv_st = td->td_ru.ru_msgrcv; msgsnd_st = td->td_ru.ru_msgsnd; inblock_st = td->td_ru.ru_inblock; oublock_st = td->td_ru.ru_oublock; /* * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (cb->aio_lio_opcode == LIO_READ) { auio.uio_rw = UIO_READ; if (auio.uio_resid == 0) error = 0; else error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); auio.uio_rw = UIO_WRITE; error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); } msgrcv_end = td->td_ru.ru_msgrcv; msgsnd_end = td->td_ru.ru_msgsnd; inblock_end = td->td_ru.ru_inblock; oublock_end = td->td_ru.ru_oublock; job->msgrcv = msgrcv_end - msgrcv_st; job->msgsnd = msgsnd_end - msgsnd_st; job->inblock = inblock_end - inblock_st; job->outblock = oublock_end - oublock_st; if ((error) && (auio.uio_resid != cnt)) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } cnt -= auio.uio_resid; td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, cnt, 0); } static void aio_process_sync(struct kaiocb *job) { struct thread *td = curthread; struct ucred *td_savedcred = td->td_ucred; struct file *fp = job->fd_file; int error = 0; KASSERT(job->uaiocb.aio_lio_opcode == LIO_SYNC, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); td->td_ucred = job->cred; if (fp->f_vnode != NULL) error = aio_fsync_vnode(td, fp->f_vnode); td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, 0, 0); } static void aio_process_mlock(struct kaiocb *job) { struct aiocb *cb = &job->uaiocb; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); error = kern_mlock(job->userproc, job->cred, __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes); aio_complete(job, error != 0 ? -1 : 0, error); } static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job) { struct aioliojob *lj; struct kaioinfo *ki; struct kaiocb *sjob, *sjobn; int lj_done; bool schedule_fsync; ki = userp->p_aioinfo; AIO_LOCK_ASSERT(ki, MA_OWNED); lj = job->lio; lj_done = 0; if (lj) { lj->lioj_finished_count++; if (lj->lioj_count == lj->lioj_finished_count) lj_done = 1; } TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist); MPASS(job->jobflags & KAIOCB_FINISHED); if (ki->kaio_flags & KAIO_RUNDOWN) goto notification_done; if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi); KNOTE_LOCKED(&job->klist, 1); if (lj_done) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } notification_done: if (job->jobflags & KAIOCB_CHECKSYNC) { schedule_fsync = false; TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) { if (job->fd_file != sjob->fd_file || job->seqno >= sjob->seqno) continue; if (--sjob->pending > 0) continue; TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list); if (!aio_clear_cancel_function_locked(sjob)) continue; TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list); schedule_fsync = true; } if (schedule_fsync) taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_sync_task); } if (ki->kaio_flags & KAIO_WAKEUP) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(&userp->p_aioinfo); } } static void aio_schedule_fsync(void *context, int pending) { struct kaioinfo *ki; struct kaiocb *job; ki = context; AIO_LOCK(ki); while (!TAILQ_EMPTY(&ki->kaio_syncready)) { job = TAILQ_FIRST(&ki->kaio_syncready); TAILQ_REMOVE(&ki->kaio_syncready, job, list); AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); AIO_LOCK(ki); } AIO_UNLOCK(ki); } bool aio_cancel_cleared(struct kaiocb *job) { /* * The caller should hold the same queue lock held when * aio_clear_cancel_function() was called and set this flag * ensuring this check sees an up-to-date value. However, * there is no way to assert that. */ return ((job->jobflags & KAIOCB_CLEARED) != 0); } static bool aio_clear_cancel_function_locked(struct kaiocb *job) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); MPASS(job->cancel_fn != NULL); if (job->jobflags & KAIOCB_CANCELLING) { job->jobflags |= KAIOCB_CLEARED; return (false); } job->cancel_fn = NULL; return (true); } bool aio_clear_cancel_function(struct kaiocb *job) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_clear_cancel_function_locked(job); AIO_UNLOCK(ki); return (ret); } static bool aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); if (job->jobflags & KAIOCB_CANCELLED) return (false); job->cancel_fn = func; return (true); } bool aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_set_cancel_function_locked(job, func); AIO_UNLOCK(ki); return (ret); } void aio_complete(struct kaiocb *job, long status, int error) { struct kaioinfo *ki; struct proc *userp; job->uaiocb._aiocb_private.error = error; job->uaiocb._aiocb_private.status = status; userp = job->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); KASSERT(!(job->jobflags & KAIOCB_FINISHED), ("duplicate aio_complete")); job->jobflags |= KAIOCB_FINISHED; if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) { TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(userp, job); } AIO_UNLOCK(ki); } void aio_cancel(struct kaiocb *job) { aio_complete(job, -1, ECANCELED); } void aio_switch_vmspace(struct kaiocb *job) { vmspace_switch_aio(job->userproc->p_vmspace); } /* * The AIO daemon, most of the actual work is done in aio_process_*, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *_id) { struct kaiocb *job; struct aioproc *aiop; struct kaioinfo *ki; struct proc *p; struct vmspace *myvm; struct thread *td = curthread; int id = (intptr_t)_id; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = td->td_proc; myvm = vmspace_acquire_ref(p); KASSERT(p->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = uma_zalloc(aiop_zone, M_WAITOK); aiop->aioproc = p; aiop->aioprocflags = 0; /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ sema_post(&aio_newproc_sem); mtx_lock(&aio_job_mtx); for (;;) { /* * Take daemon off of free queue */ if (aiop->aioprocflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; } /* * Check for jobs. */ while ((job = aio_selectjob(aiop)) != NULL) { mtx_unlock(&aio_job_mtx); ki = job->userproc->p_aioinfo; job->handle_fn(job); mtx_lock(&aio_job_mtx); /* Decrement the active job count. */ ki->kaio_active_count--; } /* * Disconnect from user address space. */ if (p->p_vmspace != myvm) { mtx_unlock(&aio_job_mtx); vmspace_switch_aio(myvm); mtx_lock(&aio_job_mtx); /* * We have to restart to avoid race, we only sleep if * no job can be selected. */ continue; } mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aioprocflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. */ if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy", aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) && (aiop->aioprocflags & AIOP_FREE) && num_aio_procs > target_aio_procs) break; } TAILQ_REMOVE(&aio_freeproc, aiop, list); num_aio_procs--; mtx_unlock(&aio_job_mtx); uma_zfree(aiop_zone, aiop); free_unr(aiod_unr, id); vmspace_free(myvm); KASSERT(p->p_vmspace == myvm, ("AIOD: bad vmspace for exiting daemon")); KASSERT(myvm->vm_refcnt > 1, ("AIOD: bad vm refcnt for exiting daemon: %d", myvm->vm_refcnt)); kproc_exit(0); } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(int *start) { int error; struct proc *p; int id; id = alloc_unr(aiod_unr); error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, RFNOWAIT, 0, "aiod%d", id); if (error == 0) { /* * Wait until daemon is started. */ sema_wait(&aio_newproc_sem); mtx_lock(&aio_job_mtx); num_aio_procs++; if (start != NULL) (*start)--; mtx_unlock(&aio_job_mtx); } else { free_unr(aiod_unr, id); } return (error); } /* * Try the high-performance, low-overhead physio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qphysio(struct proc *p, struct kaiocb *job) { struct aiocb *cb; struct file *fp; struct bio *bp; struct buf *pbuf; struct vnode *vp; struct cdevsw *csw; struct cdev *dev; struct kaioinfo *ki; int error, ref, poff; vm_prot_t prot; cb = &job->uaiocb; fp = job->fd_file; if (!(cb->aio_lio_opcode == LIO_WRITE || cb->aio_lio_opcode == LIO_READ)) return (-1); if (fp == NULL || fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; if (vp->v_type != VCHR) return (-1); if (vp->v_bufobj.bo_bsize == 0) return (-1); if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) return (-1); ref = 0; csw = devvn_refthread(vp, &dev, &ref); if (csw == NULL) return (ENXIO); if ((csw->d_flags & D_DISK) == 0) { error = -1; goto unref; } if (cb->aio_nbytes > dev->si_iosize_max) { error = -1; goto unref; } ki = p->p_aioinfo; poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { if (cb->aio_nbytes > MAXPHYS) { error = -1; goto unref; } pbuf = NULL; } else { if (cb->aio_nbytes > MAXPHYS - poff) { error = -1; goto unref; } if (ki->kaio_buffer_count >= max_buf_aio) { error = EAGAIN; goto unref; } job->pbuf = pbuf = (struct buf *)getpbuf(NULL); BUF_KERNPROC(pbuf); AIO_LOCK(ki); ki->kaio_buffer_count++; AIO_UNLOCK(ki); } job->bp = bp = g_alloc_bio(); bp->bio_length = cb->aio_nbytes; bp->bio_bcount = cb->aio_nbytes; bp->bio_done = aio_physwakeup; bp->bio_data = (void *)(uintptr_t)cb->aio_buf; bp->bio_offset = cb->aio_offset; bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; bp->bio_dev = dev; bp->bio_caller1 = (void *)job; prot = VM_PROT_READ; if (cb->aio_lio_opcode == LIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)bp->bio_data, bp->bio_length, prot, job->pages, nitems(job->pages)); if (job->npages < 0) { error = EFAULT; goto doerror; } if (pbuf != NULL) { pmap_qenter((vm_offset_t)pbuf->b_data, job->pages, job->npages); bp->bio_data = pbuf->b_data + poff; atomic_add_int(&num_buf_aio, 1); } else { bp->bio_ma = job->pages; bp->bio_ma_n = job->npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; atomic_add_int(&num_unmapped_aio, 1); } /* Perform transfer. */ csw->d_strategy(bp); dev_relthread(dev, ref); return (0); doerror: if (pbuf != NULL) { AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); relpbuf(pbuf, NULL); job->pbuf = NULL; } g_destroy_bio(bp); job->bp = NULL; unref: dev_relthread(dev, ref); return (error); } #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ nsig->sigev_notify = osig->sigev_notify; switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; break; default: return (EINVAL); } return (0); } static int aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb *ojob; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, kjob, sizeof(struct oaiocb)); if (error) return (error); ojob = (struct oaiocb *)kjob; return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent)); } #endif static int aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob) { return (copyin(ujob, kjob, sizeof(struct aiocb))); } static long aiocb_fetch_status(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.status)); } static long aiocb_fetch_error(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.error)); } static int aiocb_store_status(struct aiocb *ujob, long status) { return (suword(&ujob->_aiocb_private.status, status)); } static int aiocb_store_error(struct aiocb *ujob, long error) { return (suword(&ujob->_aiocb_private.error, error)); } static int aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) { return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); } static int aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword(ujobp, (long)ujob)); } static struct aiocb_ops aiocb_ops = { .copyin = aiocb_copyin, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb_ops_osigevent = { .copyin = aiocb_copyin_old_sigevent, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #endif /* * Queue a new AIO request. Choosing either the threaded or direct physio VCHR * technique is done in this code. */ int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int type, struct aiocb_ops *ops) { struct proc *p = td->td_proc; - cap_rights_t rights; struct file *fp; struct kaiocb *job; struct kaioinfo *ki; struct kevent kev; int opcode; int error; int fd, kqfd; int jid; u_short evflags; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; ops->store_status(ujob, -1); ops->store_error(ujob, 0); ops->store_kernelinfo(ujob, -1); if (num_queue_count >= max_queue_count || ki->kaio_count >= max_aio_queue_per_proc) { ops->store_error(ujob, EAGAIN); return (EAGAIN); } job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); knlist_init_mtx(&job->klist, AIO_MTX(ki)); error = ops->copyin(ujob, &job->uaiocb); if (error) { ops->store_error(ujob, error); uma_zfree(aiocb_zone, job); return (error); } if (job->uaiocb.aio_nbytes > IOSIZE_MAX) { uma_zfree(aiocb_zone, job); return (EINVAL); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { ops->store_error(ujob, EINVAL); uma_zfree(aiocb_zone, job); return (EINVAL); } if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) { uma_zfree(aiocb_zone, job); return (EINVAL); } ksiginfo_init(&job->ksi); /* Save userspace address of the job info. */ job->ujob = ujob; /* Get the opcode. */ if (type != LIO_NOP) job->uaiocb.aio_lio_opcode = type; opcode = job->uaiocb.aio_lio_opcode; /* * Validate the opcode and fetch the file object for the specified * file descriptor. * * XXXRW: Moved the opcode validation up here so that we don't * retrieve a file descriptor without knowing what the capabiltity * should be. */ fd = job->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: - error = fget_write(td, fd, - cap_rights_init(&rights, CAP_PWRITE), &fp); + error = fget_write(td, fd, &cap_pwrite_rights, &fp); break; case LIO_READ: - error = fget_read(td, fd, - cap_rights_init(&rights, CAP_PREAD), &fp); + error = fget_read(td, fd, &cap_pread_rights, &fp); break; case LIO_SYNC: - error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp); + error = fget(td, fd, &cap_fsync_rights, &fp); break; case LIO_MLOCK: fp = NULL; break; case LIO_NOP: - error = fget(td, fd, cap_rights_init(&rights), &fp); + error = fget(td, fd, &cap_no_rights, &fp); break; default: error = EINVAL; } if (error) { uma_zfree(aiocb_zone, job); ops->store_error(ujob, error); return (error); } if (opcode == LIO_SYNC && fp->f_vnode == NULL) { error = EINVAL; goto aqueue_fail; } if ((opcode == LIO_READ || opcode == LIO_WRITE) && job->uaiocb.aio_offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) { error = EINVAL; goto aqueue_fail; } job->fd_file = fp; mtx_lock(&aio_job_mtx); jid = jobrefid++; job->seqno = jobseqno++; mtx_unlock(&aio_job_mtx); error = ops->store_kernelinfo(ujob, jid); if (error) { error = EINVAL; goto aqueue_fail; } job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; if (opcode == LIO_NOP) { fdrop(fp, td); uma_zfree(aiocb_zone, job); return (0); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags; if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { error = EINVAL; goto aqueue_fail; } kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue; kev.ident = (uintptr_t)job->ujob; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; kev.data = (intptr_t)job; kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr; error = kqfd_register(kqfd, &kev, td, 1); if (error) goto aqueue_fail; no_kqueue: ops->store_error(ujob, EINPROGRESS); job->uaiocb._aiocb_private.error = EINPROGRESS; job->userproc = p; job->cred = crhold(td->td_ucred); job->jobflags = KAIOCB_QUEUEING; job->lio = lj; if (opcode == LIO_MLOCK) { aio_schedule(job, aio_process_mlock); error = 0; } else if (fp->f_ops->fo_aio_queue == NULL) error = aio_queue_file(fp, job); else error = fo_aio_queue(fp, job); if (error) goto aqueue_fail; AIO_LOCK(ki); job->jobflags &= ~KAIOCB_QUEUEING; TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist); ki->kaio_count++; if (lj) lj->lioj_count++; atomic_add_int(&num_queue_count, 1); if (job->jobflags & KAIOCB_FINISHED) { /* * The queue callback completed the request synchronously. * The bulk of the completion is deferred in that case * until this point. */ aio_bio_done_notify(p, job); } else TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist); AIO_UNLOCK(ki); return (0); aqueue_fail: knlist_delete(&job->klist, curthread, 0); if (fp) fdrop(fp, td); uma_zfree(aiocb_zone, job); ops->store_error(ujob, error); return (error); } static void aio_cancel_daemon_job(struct kaiocb *job) { mtx_lock(&aio_job_mtx); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&aio_jobs, job, list); mtx_unlock(&aio_job_mtx); aio_cancel(job); } void aio_schedule(struct kaiocb *job, aio_handle_fn_t *func) { mtx_lock(&aio_job_mtx); if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) { mtx_unlock(&aio_job_mtx); aio_cancel(job); return; } job->handle_fn = func; TAILQ_INSERT_TAIL(&aio_jobs, job, list); aio_kick_nowait(job->userproc); mtx_unlock(&aio_job_mtx); } static void aio_cancel_sync(struct kaiocb *job) { struct kaioinfo *ki; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); aio_cancel(job); } int aio_queue_file(struct file *fp, struct kaiocb *job) { struct kaioinfo *ki; struct kaiocb *job2; struct vnode *vp; struct mount *mp; int error; bool safe; ki = job->userproc->p_aioinfo; error = aio_qphysio(job->userproc, job); if (error >= 0) return (error); safe = false; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VREG || vp->v_type == VDIR) { mp = fp->f_vnode->v_mount; if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0) safe = true; } } if (!(safe || enable_aio_unsafe)) { counted_warning(&unsafe_warningcnt, "is attempting to use unsafe AIO requests"); return (EOPNOTSUPP); } switch (job->uaiocb.aio_lio_opcode) { case LIO_READ: case LIO_WRITE: aio_schedule(job, aio_process_rw); error = 0; break; case LIO_SYNC: AIO_LOCK(ki); TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) { if (job2->fd_file == job->fd_file && job2->uaiocb.aio_lio_opcode != LIO_SYNC && job2->seqno < job->seqno) { job2->jobflags |= KAIOCB_CHECKSYNC; job->pending++; } } if (job->pending != 0) { if (!aio_set_cancel_function_locked(job, aio_cancel_sync)) { AIO_UNLOCK(ki); aio_cancel(job); return (0); } TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); return (0); } AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); error = 0; break; default: error = EINVAL; } return (error); } static void aio_kick_nowait(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; mtx_assert(&aio_job_mtx, MA_OWNED); if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task); } } static int aio_kick(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; int error, ret = 0; mtx_assert(&aio_job_mtx, MA_OWNED); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { num_aio_resv_start++; mtx_unlock(&aio_job_mtx); error = aio_newproc(&num_aio_resv_start); mtx_lock(&aio_job_mtx); if (error) { num_aio_resv_start--; goto retryproc; } } else { ret = -1; } return (ret); } static void aio_kick_helper(void *context, int pending) { struct proc *userp = context; mtx_lock(&aio_job_mtx); while (--pending >= 0) { if (aio_kick(userp)) break; } mtx_unlock(&aio_job_mtx); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ static int kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; long status, error; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_done, plist) { if (job->ujob == ujob) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_error(ujob, error); ops->store_status(ujob, status); } else { error = EINVAL; AIO_UNLOCK(ki); } return (error); } int sys_aio_return(struct thread *td, struct aio_return_args *uap) { return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ static int kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, struct timespec *ts) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *firstjob, *job; int error, i, timo; timo = 0; if (ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); if (njoblist == 0) return (0); AIO_LOCK(ki); for (;;) { firstjob = NULL; error = 0; TAILQ_FOREACH(job, &ki->kaio_all, allist) { for (i = 0; i < njoblist; i++) { if (job->ujob == ujoblist[i]) { if (firstjob == NULL) firstjob = job; if (job->jobflags & KAIOCB_FINISHED) goto RETURN; } } } /* All tasks were finished. */ if (firstjob == NULL) break; ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", timo); if (error == ERESTART) error = EINTR; if (error) break; } RETURN: AIO_UNLOCK(ki); return (error); } int sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct timespec ts, *tsp; struct aiocb **ujoblist; int error; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); if (error == 0) error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); free(ujoblist, M_AIOS); return (error); } /* * aio_cancel cancels any non-physio aio operations not currently in * progress. */ int sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct kaiocb *job, *jobn; struct file *fp; - cap_rights_t rights; int error; int cancelled = 0; int notcancelled = 0; struct vnode *vp; /* Lookup file object. */ - error = fget(td, uap->fd, cap_rights_init(&rights), &fp); + error = fget(td, uap->fd, &cap_no_rights, &fp); if (error) return (error); ki = p->p_aioinfo; if (ki == NULL) goto done; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vn_isdisk(vp, &error)) { fdrop(fp, td); td->td_retval[0] = AIO_NOTCANCELED; return (0); } } AIO_LOCK(ki); TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { if ((uap->fd == job->uaiocb.aio_fildes) && ((uap->aiocbp == NULL) || (uap->aiocbp == job->ujob))) { if (aio_cancel_job(p, ki, job)) { cancelled++; } else { notcancelled++; } if (uap->aiocbp != NULL) break; } } AIO_UNLOCK(ki); done: fdrop(fp, td); if (uap->aiocbp != NULL) { if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } } if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* * aio_error is implemented in the kernel level for compatibility purposes * only. For a user mode async implementation, it would be best to do it in * a userland subroutine. */ static int kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; int status; ki = p->p_aioinfo; if (ki == NULL) { td->td_retval[0] = EINVAL; return (0); } AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_all, allist) { if (job->ujob == ujob) { if (job->jobflags & KAIOCB_FINISHED) td->td_retval[0] = job->uaiocb._aiocb_private.error; else td->td_retval[0] = EINPROGRESS; AIO_UNLOCK(ki); return (0); } } AIO_UNLOCK(ki); /* * Hack for failure of aio_aqueue. */ status = ops->fetch_status(ujob); if (status == -1) { td->td_retval[0] = ops->fetch_error(ujob); return (0); } td->td_retval[0] = EINVAL; return (0); } int sys_aio_error(struct thread *td, struct aio_error_args *uap) { return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); } /* syscall - asynchronous read from a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb_ops_osigevent)); } #endif int sys_aio_read(struct thread *td, struct aio_read_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); } /* syscall - asynchronous write to a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops_osigevent)); } #endif int sys_aio_write(struct thread *td, struct aio_write_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); } int sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); } static int kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, struct aiocb **acb_list, int nent, struct sigevent *sig, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct aiocb *job; struct kaioinfo *ki; struct aioliojob *lj; struct kevent kev; int error; int nagain, nerror; int i; if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) return (EINVAL); if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; lj = uma_zalloc(aiolio_zone, M_WAITOK); lj->lioj_flags = 0; lj->lioj_count = 0; lj->lioj_finished_count = 0; knlist_init_mtx(&lj->klist, AIO_MTX(ki)); ksiginfo_init(&lj->lioj_ksi); /* * Setup signal. */ if (sig && (mode == LIO_NOWAIT)) { bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { /* Assume only new style KEVENT */ kev.filter = EVFILT_LIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.ident = (uintptr_t)uacb_list; /* something unique */ kev.data = (intptr_t)lj; /* pass user defined sigval data */ kev.udata = lj->lioj_signal.sigev_value.sival_ptr; error = kqfd_register( lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1); if (error) { uma_zfree(aiolio_zone, lj); return (error); } } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { ; } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return EINVAL; } lj->lioj_flags |= LIOJ_SIGNAL; } else { uma_zfree(aiolio_zone, lj); return EINVAL; } } AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Add extra aiocb count to avoid the lio to be freed * by other threads doing aio_waitcomplete or aio_return, * and prevent event from being sent until we have queued * all tasks. */ lj->lioj_count = 1; AIO_UNLOCK(ki); /* * Get pointers to the list of I/O requests. */ nagain = 0; nerror = 0; for (i = 0; i < nent; i++) { job = acb_list[i]; if (job != NULL) { error = aio_aqueue(td, job, lj, LIO_NOP, ops); if (error == EAGAIN) nagain++; else if (error != 0) nerror++; } } error = 0; AIO_LOCK(ki); if (mode == LIO_WAIT) { while (lj->lioj_count - 1 != lj->lioj_finished_count) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", 0); if (error == ERESTART) error = EINTR; if (error) break; } } else { if (lj->lioj_count - 1 == lj->lioj_finished_count) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } lj->lioj_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); uma_zfree(aiolio_zone, lj); } else AIO_UNLOCK(ki); if (nerror) return (EIO); else if (nagain) return (EAGAIN); else return (error); } /* syscall - list directed I/O (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent osig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif /* syscall - list directed I/O (REALTIME) */ int sys_lio_listio(struct thread *td, struct lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig, sizeof(sig)); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, nent, sigp, &aiocb_ops); free(acb_list, M_LIO); return (error); } static void aio_physwakeup(struct bio *bp) { struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; struct proc *userp; struct kaioinfo *ki; size_t nbytes; int error, nblks; /* Release mapping into kernel space. */ userp = job->userproc; ki = userp->p_aioinfo; if (job->pbuf) { pmap_qremove((vm_offset_t)job->pbuf->b_data, job->npages); relpbuf(job->pbuf, NULL); job->pbuf = NULL; atomic_subtract_int(&num_buf_aio, 1); AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); } else atomic_subtract_int(&num_unmapped_aio, 1); vm_page_unhold_pages(job->pages, job->npages); bp = job->bp; job->bp = NULL; nbytes = job->uaiocb.aio_nbytes - bp->bio_resid; error = 0; if (bp->bio_flags & BIO_ERROR) error = bp->bio_error; nblks = btodb(nbytes); if (job->uaiocb.aio_lio_opcode == LIO_WRITE) job->outblock += nblks; else job->inblock += nblks; if (error) aio_complete(job, -1, error); else aio_complete(job, nbytes, 0); g_destroy_bio(bp); } /* syscall - wait for the next completion of an aio request */ static int kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, struct timespec *ts, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *job; struct aiocb *ujob; long error, status; int timo; ops->store_aiocb(ujobp, NULL); if (ts == NULL) { timo = 0; } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { timo = -1; } else { if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; error = 0; job = NULL; AIO_LOCK(ki); while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) { if (timo == -1) { error = EWOULDBLOCK; break; } ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiowc", timo); if (timo && error == ERESTART) error = EINTR; if (error) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); ujob = job->ujob; status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_aiocb(ujobp, ujob); ops->store_error(ujob, error); ops->store_status(ujob, status); } else AIO_UNLOCK(ki); return (error); } int sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); } static int kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob, struct aiocb_ops *ops) { if (op != O_SYNC) /* XXX lack of O_DSYNC */ return (EINVAL); return (aio_aqueue(td, ujob, NULL, LIO_SYNC, ops)); } int sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct kaiocb *job; job = (struct kaiocb *)(uintptr_t)kn->kn_sdata; /* * The job pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_aio = job; kn->kn_flags &= ~EV_FLAG1; knlist_add(&job->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_aio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct kaiocb *job = kn->kn_ptr.p_aio; kn->kn_data = job->uaiocb._aiocb_private.error; if (!(job->jobflags & KAIOCB_FINISHED)) return (0); kn->kn_flags |= EV_EOF; return (1); } /* kqueue attach function */ static int filt_lioattach(struct knote *kn) { struct aioliojob *lj; lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata; /* * The aioliojob pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_lio = lj; kn->kn_flags &= ~EV_FLAG1; knlist_add(&lj->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_liodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_lio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_lio(struct knote *kn, long hint) { struct aioliojob * lj = kn->kn_ptr.p_lio; return (lj->lioj_flags & LIOJ_KEVENT_POSTED); } #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include #include struct __aiocb_private32 { int32_t status; int32_t error; uint32_t kernelinfo; }; #ifdef COMPAT_FREEBSD6 typedef struct oaiocb32 { int aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent32 aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; } oaiocb32_t; #endif typedef struct aiocb32 { int32_t aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ int __spare__[2]; uint32_t __spare2__; int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; struct sigevent32 aio_sigevent; /* Signal to deliver */ } aiocb32_t; #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. */ CP(*osig, *nsig, sigev_notify); switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); break; default: return (EINVAL); } return (0); } static int aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob) { struct oaiocb32 job32; int error; bzero(kjob, sizeof(struct aiocb)); error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_old_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } #endif static int aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob) { struct aiocb32 job32; int error; error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kjob, aio_fildes); CP(job32, *kjob, aio_offset); PTRIN_CP(job32, *kjob, aio_buf); CP(job32, *kjob, aio_nbytes); CP(job32, *kjob, aio_lio_opcode); CP(job32, *kjob, aio_reqprio); CP(job32, *kjob, _aiocb_private.status); CP(job32, *kjob, _aiocb_private.error); PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo); return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent)); } static long aiocb32_fetch_status(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.status)); } static long aiocb32_fetch_error(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.error)); } static int aiocb32_store_status(struct aiocb *ujob, long status) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.status, status)); } static int aiocb32_store_error(struct aiocb *ujob, long error) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.error, error)); } static int aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); } static int aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword32(ujobp, (long)ujob)); } static struct aiocb_ops aiocb32_ops = { .copyin = aiocb32_copyin, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb32_ops_osigevent = { .copyin = aiocb32_copyin_old_sigevent, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #endif int freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) { return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } int freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; struct aiocb **ujoblist; uint32_t *ujoblist32; int error, i; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIOS, M_WAITOK); ujoblist32 = (uint32_t *)ujoblist; error = copyin(uap->aiocbp, ujoblist32, uap->nent * sizeof(ujoblist32[0])); if (error == 0) { for (i = uap->nent - 1; i >= 0; i--) ujoblist[i] = PTRIN(ujoblist32[i]); error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); } free(ujoblist, M_AIOS); return (error); } int freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) { return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_read(struct thread *td, struct freebsd6_freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_write(struct thread *td, struct freebsd6_freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops)); } int freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, &aiocb32_ops)); } int freebsd32_aio_waitcomplete(struct thread *td, struct freebsd32_aio_waitcomplete_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts32, sizeof(ts32)); if (error) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, &aiocb32_ops)); } int freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_lio_listio(struct thread *td, struct freebsd6_freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent32 osig; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent32(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif int freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct sigevent32 sig32; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig32, sizeof(sig32)); if (error) return (error); error = convert_sigevent32(&sig32, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops); free(acb_list, M_LIO); return (error); } #endif Index: head/sys/kern/vfs_syscalls.c =================================================================== --- head/sys/kern/vfs_syscalls.c (revision 333424) +++ head/sys/kern/vfs_syscalls.c (revision 333425) @@ -1,4620 +1,4615 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information"); SDT_PROVIDER_DEFINE(vfs); SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int"); SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int"); static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag); static int setfflags(struct thread *td, struct vnode *, u_long); static int getutimes(const struct timeval *, enum uio_seg, struct timespec *); static int getutimens(const struct timespec *, enum uio_seg, struct timespec *, int *); static int setutimes(struct thread *td, struct vnode *, const struct timespec *, int, int); static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td); /* * Sync each mounted filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif /* ARGSUSED */ int sys_sync(struct thread *td, struct sync_args *uap) { struct mount *mp, *nmp; int save; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { save = curthread_pflags_set(TDP_SYNCIO); vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT); curthread_pflags_restore(save); vn_finished_write(mp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); return (0); } /* * Change filesystem quotas. */ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif int sys_quotactl(struct thread *td, struct quotactl_args *uap) { struct mount *mp; struct nameidata nd; int error; AUDIT_ARG_CMD(uap->cmd); AUDIT_ARG_UID(uap->uid); if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS)) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); mp = nd.ni_vp->v_mount; vfs_ref(mp); vput(nd.ni_vp); error = vfs_busy(mp, 0); vfs_rel(mp); if (error != 0) return (error); error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg); /* * Since quota on operation typically needs to open quota * file, the Q_QUOTAON handler needs to unbusy the mount point * before calling into namei. Otherwise, unmount might be * started between two vfs_busy() invocations (first is our, * second is from mount point cross-walk code in lookup()), * causing deadlock. * * Require that Q_QUOTAON handles the vfs_busy() reference on * its own, always returning with ubusied mount point. */ if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON) vfs_unbusy(mp); return (error); } /* * Used by statfs conversion routines to scale the block size up if * necessary so that all of the block counts are <= 'max_size'. Note * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero * value of 'n'. */ void statfs_scale_blocks(struct statfs *sf, long max_size) { uint64_t count; int shift; KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__)); /* * Attempt to scale the block counts to give a more accurate * overview to userland of the ratio of free space to used * space. To do this, find the largest block count and compute * a divisor that lets it fit into a signed integer <= max_size. */ if (sf->f_bavail < 0) count = -sf->f_bavail; else count = sf->f_bavail; count = MAX(sf->f_blocks, MAX(sf->f_bfree, count)); if (count <= max_size) return; count >>= flsl(max_size); shift = 0; while (count > 0) { shift++; count >>=1; } sf->f_bsize <<= shift; sf->f_blocks >>= shift; sf->f_bfree >>= shift; sf->f_bavail >>= shift; } static int kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf) { struct statfs *sp; int error; if (mp == NULL) return (EBADF); error = vfs_busy(mp, 0); vfs_rel(mp); if (error != 0) return (error); #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp); if (error != 0) goto out; *buf = *sp; if (priv_check(td, PRIV_VFS_GENERATION)) { buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, buf); } out: vfs_unbusy(mp); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif int sys_statfs(struct thread *td, struct statfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_statfs(struct thread *td, char *path, enum uio_seg pathseg, struct statfs *buf) { struct mount *mp; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, td); error = namei(&nd); if (error != 0) return (error); mp = nd.ni_vp->v_mount; vfs_ref(mp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); return (kern_do_statfs(td, mp, buf)); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif int sys_fstatfs(struct thread *td, struct fstatfs_args *uap) { struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(struct statfs)); free(sfp, M_STATFS); return (error); } int kern_fstatfs(struct thread *td, int fd, struct statfs *buf) { struct file *fp; struct mount *mp; struct vnode *vp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); error = getvnode(td, fd, cap_rights_init(&rights, CAP_FSTATFS), &fp); if (error != 0) return (error); vp = fp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); #ifdef AUDIT AUDIT_ARG_VNODE1(vp); #endif mp = vp->v_mount; if (mp != NULL) vfs_ref(mp); VOP_UNLOCK(vp, 0); fdrop(fp, td); return (kern_do_statfs(td, mp, buf)); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int mode; }; #endif int sys_getfsstat(struct thread *td, struct getfsstat_args *uap) { size_t count; int error; if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX) return (EINVAL); error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count, UIO_USERSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; return (error); } /* * If (bufsize > 0 && bufseg == UIO_SYSSPACE) * The caller is responsible for freeing memory which will be allocated * in '*buf'. */ int kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize, size_t *countp, enum uio_seg bufseg, int mode) { struct mount *mp, *nmp; struct statfs *sfsp, *sp, *sptmp, *tofree; size_t count, maxcount; int error; switch (mode) { case MNT_WAIT: case MNT_NOWAIT: break; default: if (bufseg == UIO_SYSSPACE) *buf = NULL; return (EINVAL); } restart: maxcount = bufsize / sizeof(struct statfs); if (bufsize == 0) { sfsp = NULL; tofree = NULL; } else if (bufseg == UIO_USERSPACE) { sfsp = *buf; tofree = NULL; } else /* if (bufseg == UIO_SYSSPACE) */ { count = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { count++; } mtx_unlock(&mountlist_mtx); if (maxcount > count) maxcount = count; tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_STATFS, M_WAITOK); } count = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (prison_canseemount(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #ifdef MAC if (mac_mount_check_stat(td->td_ucred, mp) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } #endif if (mode == MNT_WAIT) { if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) { /* * If vfs_busy() failed, and MBF_NOWAIT * wasn't passed, then the mp is gone. * Furthermore, because of MBF_MNTLSTLOCK, * the mountlist_mtx was dropped. We have * no other choice than to start over. */ mtx_unlock(&mountlist_mtx); free(tofree, M_STATFS); goto restart; } } else { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } } if (sfsp != NULL && count < maxcount) { sp = &mp->mnt_stat; /* * Set these in case the underlying filesystem * fails to do so. */ sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; /* * If MNT_NOWAIT is specified, do not refresh * the fsstat cache. */ if (mode != MNT_NOWAIT) { error = VFS_STATFS(mp, sp); if (error != 0) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); continue; } } if (priv_check(td, PRIV_VFS_GENERATION)) { sptmp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); *sptmp = *sp; sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0; prison_enforce_statfs(td->td_ucred, mp, sptmp); sp = sptmp; } else sptmp = NULL; if (bufseg == UIO_SYSSPACE) { bcopy(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); } else /* if (bufseg == UIO_USERSPACE) */ { error = copyout(sp, sfsp, sizeof(*sp)); free(sptmp, M_STATFS); if (error != 0) { vfs_unbusy(mp); return (error); } } sfsp++; } count++; mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); if (sfsp != NULL && count > maxcount) *countp = maxcount; else *countp = count; return (0); } #ifdef COMPAT_FREEBSD4 /* * Get old format filesystem statistics. */ static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *); #ifndef _SYS_SYSPROTO_H_ struct freebsd4_statfs_args { char *path; struct ostatfs *buf; }; #endif int freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fstatfs_args { int fd; struct ostatfs *buf; }; #endif int freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_getfsstat_args { struct ostatfs *buf; long bufsize; int mode; }; #endif int freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap) { struct statfs *buf, *sp; struct ostatfs osb; size_t count, size; int error; if (uap->bufsize < 0) return (EINVAL); count = uap->bufsize / sizeof(struct ostatfs); if (count > SIZE_MAX / sizeof(struct statfs)) return (EINVAL); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; if (size != 0) { sp = buf; while (count != 0 && error == 0) { freebsd4_cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd4_fhstatfs_args { struct fhandle *u_fhp; struct ostatfs *buf; }; #endif int freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap) { struct ostatfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd4_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp) { statfs_scale_blocks(nsp, LONG_MAX); bzero(osp, sizeof(*osp)); osp->f_bsize = nsp->f_bsize; osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX); osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = MIN(nsp->f_files, LONG_MAX); osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX); osp->f_owner = nsp->f_owner; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX); osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX); osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX); osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX); strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, OMFSNAMELEN)); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, OMNAMELEN)); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, OMNAMELEN)); osp->f_fsid = nsp->f_fsid; } #endif /* COMPAT_FREEBSD4 */ #if defined(COMPAT_FREEBSD11) /* * Get old format filesystem statistics. */ static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *); int freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get filesystem statistics. */ int freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; int error; sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fstatfs(td, uap->fd, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Get statistics on all filesystems. */ int freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap) { struct freebsd11_statfs osb; struct statfs *buf, *sp; size_t count, size; int error; count = uap->bufsize / sizeof(struct ostatfs); size = count * sizeof(struct statfs); error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE, uap->mode); if (error == 0) td->td_retval[0] = count; if (size > 0) { sp = buf; while (count > 0 && error == 0) { freebsd11_cvtstatfs(sp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); sp++; uap->buf++; count--; } free(buf, M_STATFS); } return (error); } /* * Implement fstatfs() for (NFS) file handles. */ int freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap) { struct freebsd11_statfs osb; struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) { freebsd11_cvtstatfs(sfp, &osb); error = copyout(&osb, uap->buf, sizeof(osb)); } free(sfp, M_STATFS); return (error); } /* * Convert a new format statfs structure to an old format statfs structure. */ static void freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp) { bzero(osp, sizeof(*osp)); osp->f_version = FREEBSD11_STATFS_VERSION; osp->f_type = nsp->f_type; osp->f_flags = nsp->f_flags; osp->f_bsize = nsp->f_bsize; osp->f_iosize = nsp->f_iosize; osp->f_blocks = nsp->f_blocks; osp->f_bfree = nsp->f_bfree; osp->f_bavail = nsp->f_bavail; osp->f_files = nsp->f_files; osp->f_ffree = nsp->f_ffree; osp->f_syncwrites = nsp->f_syncwrites; osp->f_asyncwrites = nsp->f_asyncwrites; osp->f_syncreads = nsp->f_syncreads; osp->f_asyncreads = nsp->f_asyncreads; osp->f_namemax = nsp->f_namemax; osp->f_owner = nsp->f_owner; osp->f_fsid = nsp->f_fsid; strlcpy(osp->f_fstypename, nsp->f_fstypename, MIN(MFSNAMELEN, sizeof(osp->f_fstypename))); strlcpy(osp->f_mntonname, nsp->f_mntonname, MIN(MNAMELEN, sizeof(osp->f_mntonname))); strlcpy(osp->f_mntfromname, nsp->f_mntfromname, MIN(MNAMELEN, sizeof(osp->f_mntfromname))); } #endif /* COMPAT_FREEBSD11 */ /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif int sys_fchdir(struct thread *td, struct fchdir_args *uap) { struct vnode *vp, *tdp; struct mount *mp; struct file *fp; - cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); - error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_FCHDIR), + error = getvnode(td, uap->fd, &cap_fchdir_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; vrefact(vp); fdrop(fp, td); vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = change_dir(vp, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0)) continue; error = VFS_ROOT(mp, LK_SHARED, &tdp); vfs_unbusy(mp); if (error != 0) break; vput(vp); vp = tdp; } if (error != 0) { vput(vp); return (error); } VOP_UNLOCK(vp, 0); pwd_chdir(td, vp); return (0); } /* * Change current working directory (``.''). */ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif int sys_chdir(struct thread *td, struct chdir_args *uap) { return (kern_chdir(td, uap->path, UIO_USERSPACE)); } int kern_chdir(struct thread *td, char *path, enum uio_seg pathseg) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); if ((error = change_dir(nd.ni_vp, td)) != 0) { vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } VOP_UNLOCK(nd.ni_vp, 0); NDFREE(&nd, NDF_ONLY_PNBUF); pwd_chdir(td, nd.ni_vp); return (0); } /* * Change notion of root (``/'') directory. */ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif int sys_chroot(struct thread *td, struct chroot_args *uap) { struct nameidata nd; int error; error = priv_check(td, PRIV_VFS_CHROOT); if (error != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error != 0) goto error; error = change_dir(nd.ni_vp, td); if (error != 0) goto e_vunlock; #ifdef MAC error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp); if (error != 0) goto e_vunlock; #endif VOP_UNLOCK(nd.ni_vp, 0); error = pwd_chroot(td, nd.ni_vp); vrele(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); e_vunlock: vput(nd.ni_vp); error: NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Common routine for chroot and chdir. Callers must provide a locked vnode * instance. */ int change_dir(struct vnode *vp, struct thread *td) { #ifdef MAC int error; #endif ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked"); if (vp->v_type != VDIR) return (ENOTDIR); #ifdef MAC error = mac_vnode_check_chdir(td->td_ucred, vp); if (error != 0) return (error); #endif return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td)); } static __inline void flags_to_rights(int flags, cap_rights_t *rightsp) { if (flags & O_EXEC) { cap_rights_set(rightsp, CAP_FEXECVE); } else { switch ((flags & O_ACCMODE)) { case O_RDONLY: cap_rights_set(rightsp, CAP_READ); break; case O_RDWR: cap_rights_set(rightsp, CAP_READ); /* FALLTHROUGH */ case O_WRONLY: cap_rights_set(rightsp, CAP_WRITE); if (!(flags & (O_APPEND | O_TRUNC))) cap_rights_set(rightsp, CAP_SEEK); break; } } if (flags & O_CREAT) cap_rights_set(rightsp, CAP_CREATE); if (flags & O_TRUNC) cap_rights_set(rightsp, CAP_FTRUNCATE); if (flags & (O_SYNC | O_FSYNC)) cap_rights_set(rightsp, CAP_FSYNC); if (flags & (O_EXLOCK | O_SHLOCK)) cap_rights_set(rightsp, CAP_FLOCK); } /* * Check permissions, allocate an open file structure, and call the device * open routine if any. */ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int sys_open(struct thread *td, struct open_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct openat_args { int fd; char *path; int flag; int mode; }; #endif int sys_openat(struct thread *td, struct openat_args *uap) { AUDIT_ARG_FD(uap->fd); return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->mode)); } int kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int flags, int mode) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct nameidata nd; cap_rights_t rights; int cmode, error, indx; indx = -1; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); cap_rights_init(&rights, CAP_LOOKUP); flags_to_rights(flags, &rights); /* * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags * may be specified. */ if (flags & O_EXEC) { if (flags & O_ACCMODE) return (EINVAL); } else if ((flags & O_ACCMODE) == O_ACCMODE) { return (EINVAL); } else { flags = FFLAGS(flags); } /* * Allocate a file structure. The descriptor to reference it * is allocated and set by finstall() below. */ error = falloc_noinstall(td, &fp); if (error != 0) return (error); /* * An extra reference on `fp' has been held for us by * falloc_noinstall(). */ /* Set the flags early so the finit in devfs can pick them up. */ fp->f_flag = flags & FMASK; cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT; NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, &rights, td); td->td_dupfd = -1; /* XXX check for fdopen */ error = vn_open(&nd, &flags, cmode, fp); if (error != 0) { /* * If the vn_open replaced the method vector, something * wonderous happened deep below and we just pass it up * pretending we know what we do. */ if (error == ENXIO && fp->f_ops != &badfileops) goto success; /* * Handle special fdopen() case. bleh. * * Don't do this for relative (capability) lookups; we don't * understand exactly what would happen, and we don't think * that it ever should. */ if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) == 0 && (error == ENODEV || error == ENXIO) && td->td_dupfd >= 0) { error = dupfdopen(td, fdp, td->td_dupfd, flags, error, &indx); if (error == 0) goto success; } goto bad; } td->td_dupfd = 0; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * Store the vnode, for any f_type. Typically, the vnode use * count is decremented by direct call to vn_closefile() for * files that switched type in the cdevsw fdopen() method. */ fp->f_vnode = vp; /* * If the file wasn't claimed by devfs bind it to the normal * vnode operations here. */ if (fp->f_ops == &badfileops) { KASSERT(vp->v_type != VFIFO, ("Unexpected fifo.")); fp->f_seqcount = 1; finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp, &vnops); } VOP_UNLOCK(vp, 0); if (flags & O_TRUNC) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } success: /* * If we haven't already installed the FD (for dupfdopen), do so now. */ if (indx == -1) { struct filecaps *fcaps; #ifdef CAPABILITIES if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) != 0) fcaps = &nd.ni_filecaps; else #endif fcaps = NULL; error = finstall(td, fp, &indx, flags, fcaps); /* On success finstall() consumes fcaps. */ if (error != 0) { filecaps_free(&nd.ni_filecaps); goto bad; } } else { filecaps_free(&nd.ni_filecaps); } /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: KASSERT(indx == -1, ("indx=%d, should be -1", indx)); fdrop(fp, td); return (error); } #ifdef COMPAT_43 /* * Create a file. */ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(struct thread *td, struct ocreat_args *uap) { return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE, O_WRONLY | O_CREAT | O_TRUNC, uap->mode)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknodat_args { int fd; char *path; mode_t mode; dev_t dev; }; #endif int sys_mknodat(struct thread *td, struct mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #if defined(COMPAT_FREEBSD11) int freebsd11_mknod(struct thread *td, struct freebsd11_mknod_args *uap) { return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } int freebsd11_mknodat(struct thread *td, struct freebsd11_mknodat_args *uap) { return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode, uap->dev)); } #endif /* COMPAT_FREEBSD11 */ int kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int mode, dev_t dev) { struct vnode *vp; struct mount *mp; struct vattr vattr; struct nameidata nd; cap_rights_t rights; int error, whiteout = 0; AUDIT_ARG_MODE(mode); AUDIT_ARG_DEV(dev); switch (mode & S_IFMT) { case S_IFCHR: case S_IFBLK: error = priv_check(td, PRIV_VFS_MKNOD_DEV); if (error == 0 && dev == VNOVAL) error = EINVAL; break; case S_IFWHT: error = priv_check(td, PRIV_VFS_MKNOD_WHT); break; case S_IFIFO: if (dev == 0) return (kern_mkfifoat(td, fd, path, pathseg, mode)); /* FALLTHROUGH */ default: error = EINVAL; break; } if (error != 0) return (error); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); return (EEXIST); } else { VATTR_NULL(&vattr); vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; vattr.va_rdev = dev; whiteout = 0; switch (mode & S_IFMT) { case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: panic("kern_mknod: invalid mode"); } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC if (error == 0 && !whiteout) error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); #endif if (error == 0) { if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif int sys_mkfifo(struct thread *td, struct mkfifo_args *uap) { return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkfifoat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap) { return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int mode) { struct mount *mp; struct vattr vattr; struct nameidata nd; cap_rights_t rights; int error; AUDIT_ARG_MODE(mode); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT), td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); #ifdef MAC out: #endif vput(nd.ni_dvp); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } /* * Make a hard file link. */ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif int sys_link(struct thread *td, struct link_args *uap) { return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link, UIO_USERSPACE, FOLLOW)); } #ifndef _SYS_SYSPROTO_H_ struct linkat_args { int fd1; char *path1; int fd2; char *path2; int flag; }; #endif int sys_linkat(struct thread *td, struct linkat_args *uap) { int flag; flag = uap->flag; if (flag & ~AT_SYMLINK_FOLLOW) return (EINVAL); return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2, UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW)); } int hardlink_check_uid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW, &hardlink_check_uid, 0, "Unprivileged processes cannot create hard links to files owned by other " "users"); static int hardlink_check_gid = 0; SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW, &hardlink_check_gid, 0, "Unprivileged processes cannot create hard links to files owned by other " "groups"); static int can_hardlink(struct vnode *vp, struct ucred *cred) { struct vattr va; int error; if (!hardlink_check_uid && !hardlink_check_gid) return (0); error = VOP_GETATTR(vp, &va, cred); if (error != 0) return (error); if (hardlink_check_uid && cred->cr_uid != va.va_uid) { error = priv_check_cred(cred, PRIV_VFS_LINK, 0); if (error != 0) return (error); } if (hardlink_check_gid && !groupmember(va.va_gid, cred)) { error = priv_check_cred(cred, PRIV_VFS_LINK, 0); if (error != 0) return (error); } return (0); } int kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2, enum uio_seg segflg, int follow) { struct vnode *vp; struct mount *mp; struct nameidata nd; cap_rights_t rights; int error; again: bwillwrite(); NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, cap_rights_init(&rights, CAP_LINKAT_SOURCE), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT_TARGET), td); if ((error = namei(&nd)) == 0) { if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); vrele(vp); return (EEXIST); } else if (nd.ni_dvp->v_mount != vp->v_mount) { /* * Cross-device link. No need to recheck * vp->v_type, since it cannot change, except * to VBAD. */ NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vrele(vp); return (EXDEV); } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) { error = can_hardlink(vp, td->td_ucred); #ifdef MAC if (error == 0) error = mac_vnode_check_link(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); #endif if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); return (error); } error = vn_start_write(vp, &mp, V_NOWAIT); if (error != 0) { vput(vp); vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); goto again; } error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); VOP_UNLOCK(vp, 0); vput(nd.ni_dvp); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); } else { vput(nd.ni_dvp); NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); goto again; } } vrele(vp); return (error); } /* * Make a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif int sys_symlink(struct thread *td, struct symlink_args *uap) { return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct symlinkat_args { char *path; int fd; char *path2; }; #endif int sys_symlinkat(struct thread *td, struct symlinkat_args *uap) { return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2, UIO_USERSPACE)); } int kern_symlinkat(struct thread *td, char *path1, int fd, char *path2, enum uio_seg segflg) { struct mount *mp; struct vattr vattr; char *syspath; struct nameidata nd; int error; cap_rights_t rights; if (segflg == UIO_SYSSPACE) { syspath = path1; } else { syspath = uma_zalloc(namei_zone, M_WAITOK); if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0) goto out; } AUDIT_ARG_TEXT(syspath); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT), td); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(nd.ni_vp); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) goto out; goto restart; } VATTR_NULL(&vattr); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; #ifdef MAC vattr.va_type = VLNK; error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out2; #endif error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath); if (error == 0) vput(nd.ni_vp); #ifdef MAC out2: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, syspath); return (error); } /* * Delete a whiteout from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct undelete_args { char *path; }; #endif int sys_undelete(struct thread *td, struct undelete_args *uap) { struct mount *mp; struct nameidata nd; int error; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1, UIO_USERSPACE, uap->path, td); error = namei(&nd); if (error != 0) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif int sys_unlink(struct thread *td, struct unlink_args *uap) { return (kern_unlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0)); } #ifndef _SYS_SYSPROTO_H_ struct unlinkat_args { int fd; char *path; int flag; }; #endif int sys_unlinkat(struct thread *td, struct unlinkat_args *uap) { int flag = uap->flag; int fd = uap->fd; char *path = uap->path; if (flag & ~AT_REMOVEDIR) return (EINVAL); if (flag & AT_REMOVEDIR) return (kern_rmdirat(td, fd, path, UIO_USERSPACE)); else return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0)); } int kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg, ino_t oldinum) { struct mount *mp; struct vnode *vp; struct nameidata nd; struct stat sb; cap_rights_t rights; int error; restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td); if ((error = namei(&nd)) != 0) return (error == EINVAL ? EPERM : error); vp = nd.ni_vp; if (vp->v_type == VDIR && oldinum == 0) { error = EPERM; /* POSIX */ } else if (oldinum != 0 && ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) && sb.st_ino != oldinum) { error = EIDRM; /* Identifier removed */ } else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? */ if (vp->v_vflag & VV_ROOT) error = EBUSY; } if (error == 0) { if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); #ifdef MAC out: #endif vn_finished_write(mp); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int sys_lseek(struct thread *td, struct lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } int kern_lseek(struct thread *td, int fd, off_t offset, int whence) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, cap_rights_init(&rights, CAP_SEEK), &fp); if (error != 0) return (error); error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ? fo_seek(fp, offset, whence, td) : ESPIPE; fdrop(fp, td); return (error); } #if defined(COMPAT_43) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(struct thread *td, struct olseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Version with the 'pad' argument */ int freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap) { return (kern_lseek(td, uap->fd, uap->offset, uap->whence)); } #endif /* * Check access permissions using passed credentials. */ static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td) { accmode_t accmode; int error; /* Flags == 0 means only check for existence. */ if (user_flags == 0) return (0); accmode = 0; if (user_flags & R_OK) accmode |= VREAD; if (user_flags & W_OK) accmode |= VWRITE; if (user_flags & X_OK) accmode |= VEXEC; #ifdef MAC error = mac_vnode_check_access(cred, vp, accmode); if (error != 0) return (error); #endif if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, accmode, cred, td); return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int amode; }; #endif int sys_access(struct thread *td, struct access_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0, uap->amode)); } #ifndef _SYS_SYSPROTO_H_ struct faccessat_args { int dirfd; char *path; int amode; int flag; } #endif int sys_faccessat(struct thread *td, struct faccessat_args *uap) { return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag, uap->amode)); } int kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int flag, int amode) { struct ucred *cred, *usecred; struct vnode *vp; struct nameidata nd; cap_rights_t rights; int error; if (flag & ~AT_EACCESS) return (EINVAL); if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0) return (EINVAL); /* * Create and modify a temporary credential instead of one that * is potentially shared (if we need one). */ cred = td->td_ucred; if ((flag & AT_EACCESS) == 0 && ((cred->cr_uid != cred->cr_ruid || cred->cr_rgid != cred->cr_groups[0]))) { usecred = crdup(cred); usecred->cr_uid = cred->cr_ruid; usecred->cr_groups[0] = cred->cr_rgid; td->td_ucred = usecred; } else usecred = cred; AUDIT_ARG_VALUE(amode); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT), td); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; error = vn_access(vp, amode, usecred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); out: if (usecred != cred) { td->td_ucred = cred; crfree(usecred); } return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int amode; }; #endif int sys_eaccess(struct thread *td, struct eaccess_args *uap) { return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE, AT_EACCESS, uap->amode)); } #if defined(COMPAT_43) /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif int ostat(struct thread *td, struct ostat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif int olstat(struct thread *td, struct olstat_args *uap) { struct stat sb; struct ostat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); cvtstat(&sb, &osb); return (copyout(&osb, uap->ub, sizeof (osb))); } /* * Convert from an old to a new stat structure. */ void cvtstat(struct stat *st, struct ostat *ost) { bzero(ost, sizeof(*ost)); ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 */ #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int ino64_trunc_error; SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW, &ino64_trunc_error, 0, "Error on truncation of inode number, device id or link count"); int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost) { ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; if (ost->st_ino != st->st_ino) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_ino = UINT32_MAX; break; } } ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; if (ost->st_nlink != st->st_nlink) { switch (ino64_trunc_error) { default: case 0: break; case 1: return (EOVERFLOW); case 2: ost->st_nlink = UINT16_MAX; break; } } ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; ost->st_atim = st->st_atim; ost->st_mtim = st->st_mtim; ost->st_ctim = st->st_ctim; ost->st_size = st->st_size; ost->st_blocks = st->st_blocks; ost->st_blksize = st->st_blksize; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; ost->st_lspare = 0; ost->st_birthtim = st->st_birthtim; bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim), sizeof(*ost) - offsetof(struct freebsd11_stat, st_birthtim) - sizeof(ost->st_birthtim)); return (0); } int freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->ub, sizeof(osb)); return (error); } int freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap) { struct fhandle fh; struct stat sb; struct freebsd11_stat osb; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } int freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->buf, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Get file status */ #ifndef _SYS_SYSPROTO_H_ struct fstatat_args { int fd; char *path; struct stat *buf; int flag; } #endif int sys_fstatat(struct thread *td, struct fstatat_args *uap) { struct stat sb; int error; error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &sb, NULL); if (error == 0) error = copyout(&sb, uap->buf, sizeof (sb)); return (error); } int kern_statat(struct thread *td, int flag, int fd, char *path, enum uio_seg pathseg, struct stat *sbp, void (*hook)(struct vnode *vp, struct stat *sbp)) { struct nameidata nd; struct stat sb; - cap_rights_t rights; int error; if (flag & ~AT_SYMLINK_NOFOLLOW) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, - cap_rights_init(&rights, CAP_FSTAT), td); + &cap_fstat_rights, td); if ((error = namei(&nd)) != 0) return (error); error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td); if (error == 0) { SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode); if (S_ISREG(sb.st_mode)) SDT_PROBE2(vfs, , stat, reg, path, pathseg); if (__predict_false(hook != NULL)) hook(nd.ni_vp, &sb); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); if (error != 0) return (error); #ifdef __STAT_TIME_T_EXT sb.st_atim_ext = 0; sb.st_mtim_ext = 0; sb.st_ctim_ext = 0; sb.st_btim_ext = 0; #endif *sbp = sb; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat(&sb); #endif return (0); } #if defined(COMPAT_FREEBSD11) /* * Implementation of the NetBSD [l]stat() functions. */ void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb) { bzero(nsb, sizeof(*nsb)); nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atim = sb->st_atim; nsb->st_mtim = sb->st_mtim; nsb->st_ctim = sb->st_ctim; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_birthtim = sb->st_birthtim; } #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); freebsd11_cvtnstat(&sb, &nsb); return (copyout(&nsb, uap->ub, sizeof (nsb))); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nlstat_args { char *path; struct nstat *ub; }; #endif int freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap) { struct stat sb; struct nstat nsb; int error; error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path, UIO_USERSPACE, &sb, NULL); if (error != 0) return (error); freebsd11_cvtnstat(&sb, &nsb); return (copyout(&nsb, uap->ub, sizeof (nsb))); } #endif /* COMPAT_FREEBSD11 */ /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif int sys_pathconf(struct thread *td, struct pathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } #ifndef _SYS_SYSPROTO_H_ struct lpathconf_args { char *path; int name; }; #endif int sys_lpathconf(struct thread *td, struct lpathconf_args *uap) { long value; int error; error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name, u_long flags, long *valuep) { struct nameidata nd; int error; NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = VOP_PATHCONF(nd.ni_vp, name, valuep); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. */ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; size_t count; }; #endif int sys_readlink(struct thread *td, struct readlink_args *uap) { return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->count)); } #ifndef _SYS_SYSPROTO_H_ struct readlinkat_args { int fd; char *path; char *buf; size_t bufsize; }; #endif int sys_readlinkat(struct thread *td, struct readlinkat_args *uap) { return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE, uap->buf, UIO_USERSPACE, uap->bufsize)); } int kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count) { struct vnode *vp; struct iovec aiov; struct uio auio; struct nameidata nd; int error; if (count > IOSIZE_MAX) return (EINVAL); NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; #ifdef MAC error = mac_vnode_check_readlink(td->td_ucred, vp); if (error != 0) { vput(vp); return (error); } #endif if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0) error = EINVAL; else { aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; auio.uio_resid = count; error = VOP_READLINK(vp, &auio, td->td_ucred); td->td_retval[0] = count - auio.uio_resid; } vput(vp); return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(struct thread *td, struct vnode *vp, u_long flags) { struct mount *mp; struct vattr vattr; int error; /* We can't support the value matching VNOVAL. */ if (flags == VNOVAL) return (EOPNOTSUPP); /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = priv_check(td, PRIV_VFS_CHFLAGS_DEV); if (error != 0) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VATTR_NULL(&vattr); vattr.va_flags = flags; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef MAC error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { const char *path; u_long flags; }; #endif int sys_chflags(struct thread *td, struct chflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, 0)); } #ifndef _SYS_SYSPROTO_H_ struct chflagsat_args { int fd; const char *path; u_long flags; int atflag; } #endif int sys_chflagsat(struct thread *td, struct chflagsat_args *uap) { int fd = uap->fd; const char *path = uap->path; u_long flags = uap->flags; int atflag = uap->atflag; if (atflag & ~AT_SYMLINK_NOFOLLOW) return (EINVAL); return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag)); } /* * Same as chflags() but doesn't follow symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchflags_args { const char *path; u_long flags; }; #endif int sys_lchflags(struct thread *td, struct lchflags_args *uap) { return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->flags, AT_SYMLINK_NOFOLLOW)); } static int kern_chflagsat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, u_long flags, int atflag) { struct nameidata nd; cap_rights_t rights; int error, follow; AUDIT_ARG_FFLAGS(flags); follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FCHFLAGS), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, flags); vrele(nd.ni_vp); return (error); } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; u_long flags; }; #endif int sys_fchflags(struct thread *td, struct fchflags_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_FFLAGS(uap->flags); error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_FCHFLAGS), &fp); if (error != 0) return (error); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode, 0); #endif error = setfflags(td, fp->f_vnode, uap->flags); fdrop(fp, td); return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; #ifdef MAC error = mac_vnode_check_setmode(cred, vp, vattr.va_mode); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } /* * Change mode of a file given path name. */ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif int sys_chmod(struct thread *td, struct chmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchmodat_args { int dirfd; char *path; mode_t mode; int flag; } #endif int sys_fchmodat(struct thread *td, struct fchmodat_args *uap) { int flag = uap->flag; int fd = uap->fd; char *path = uap->path; mode_t mode = uap->mode; if (flag & ~AT_SYMLINK_NOFOLLOW) return (EINVAL); return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag)); } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif int sys_lchmod(struct thread *td, struct lchmod_args *uap) { return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode, AT_SYMLINK_NOFOLLOW)); } int kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg, mode_t mode, int flag) { struct nameidata nd; cap_rights_t rights; int error, follow; AUDIT_ARG_MODE(mode); follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FCHMOD), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, td->td_ucred, nd.ni_vp, mode); vrele(nd.ni_vp); return (error); } /* * Change mode of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif int sys_fchmod(struct thread *td, struct fchmod_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_MODE(uap->mode); error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp); if (error != 0) return (error); error = fo_chmod(fp, uap->mode, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid) { struct mount *mp; struct vattr vattr; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; #ifdef MAC error = mac_vnode_check_setowner(cred, vp, vattr.va_uid, vattr.va_gid); if (error == 0) #endif error = VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif int sys_chown(struct thread *td, struct chown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, 0)); } #ifndef _SYS_SYSPROTO_H_ struct fchownat_args { int fd; const char * path; uid_t uid; gid_t gid; int flag; }; #endif int sys_fchownat(struct thread *td, struct fchownat_args *uap) { int flag; flag = uap->flag; if (flag & ~AT_SYMLINK_NOFOLLOW) return (EINVAL); return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid, uap->gid, uap->flag)); } int kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg, int uid, int gid, int flag) { struct nameidata nd; cap_rights_t rights; int error, follow; AUDIT_ARG_OWNER(uid, gid); follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW; NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FCHOWN), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. */ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif int sys_lchown(struct thread *td, struct lchown_args *uap) { return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW)); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif int sys_fchown(struct thread *td, struct fchown_args *uap) { struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->fd); AUDIT_ARG_OWNER(uap->uid, uap->gid); error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp); if (error != 0) return (error); error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td); fdrop(fp, td); return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg, struct timespec *tsp) { struct timeval tv[2]; const struct timeval *tvp; int error; if (usrtvp == NULL) { vfs_timestamp(&tsp[0]); tsp[1] = tsp[0]; } else { if (tvpseg == UIO_SYSSPACE) { tvp = usrtvp; } else { if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0) return (error); tvp = tv; } if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 || tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000) return (EINVAL); TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]); } return (0); } /* * Common implementation code for futimens(), utimensat(). */ #define UTIMENS_NULL 0x1 #define UTIMENS_EXIT 0x2 static int getutimens(const struct timespec *usrtsp, enum uio_seg tspseg, struct timespec *tsp, int *retflags) { struct timespec tsnow; int error; vfs_timestamp(&tsnow); *retflags = 0; if (usrtsp == NULL) { tsp[0] = tsnow; tsp[1] = tsnow; *retflags |= UTIMENS_NULL; return (0); } if (tspseg == UIO_SYSSPACE) { tsp[0] = usrtsp[0]; tsp[1] = usrtsp[1]; } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0) return (error); if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT) *retflags |= UTIMENS_EXIT; if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW) *retflags |= UTIMENS_NULL; if (tsp[0].tv_nsec == UTIME_OMIT) tsp[0].tv_sec = VNOVAL; else if (tsp[0].tv_nsec == UTIME_NOW) tsp[0] = tsnow; else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L) return (EINVAL); if (tsp[1].tv_nsec == UTIME_OMIT) tsp[1].tv_sec = VNOVAL; else if (tsp[1].tv_nsec == UTIME_NOW) tsp[1] = tsnow; else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L) return (EINVAL); return (0); } /* * Common implementation code for utimes(), lutimes(), futimes(), futimens(), * and utimensat(). */ static int setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts, int numtimes, int nullflag) { struct mount *mp; struct vattr vattr; int error, setbirthtime; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); setbirthtime = 0; if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) && timespeccmp(&ts[1], &vattr.va_birthtime, < )) setbirthtime = 1; VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (setbirthtime) vattr.va_birthtime = ts[1]; if (numtimes > 2) vattr.va_birthtime = ts[2]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; #ifdef MAC error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime, vattr.va_mtime); #endif if (error == 0) error = VOP_SETATTR(vp, &vattr, td->td_ucred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif int sys_utimes(struct thread *td, struct utimes_args *uap) { return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct futimesat_args { int fd; const char * path; const struct timeval * times; }; #endif int sys_futimesat(struct thread *td, struct futimesat_args *uap) { return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE)); } int kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct nameidata nd; struct timespec ts[2]; cap_rights_t rights; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FUTIMES), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif int sys_lutimes(struct thread *td, struct lutimes_args *uap) { return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr, UIO_USERSPACE)); } int kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct nameidata nd; int error; if ((error = getutimes(tptr, tptrseg, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif int sys_futimes(struct thread *td, struct futimes_args *uap) { return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE)); } int kern_futimes(struct thread *td, int fd, struct timeval *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(fd); error = getutimes(tptr, tptrseg, ts); if (error != 0) return (error); error = getvnode(td, fd, cap_rights_init(&rights, CAP_FUTIMES), &fp); if (error != 0) return (error); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode, 0); #endif error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL); fdrop(fp, td); return (error); } int sys_futimens(struct thread *td, struct futimens_args *uap) { return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE)); } int kern_futimens(struct thread *td, int fd, struct timespec *tptr, enum uio_seg tptrseg) { struct timespec ts[2]; struct file *fp; cap_rights_t rights; int error, flags; AUDIT_ARG_FD(fd); error = getutimens(tptr, tptrseg, ts, &flags); if (error != 0) return (error); if (flags & UTIMENS_EXIT) return (0); error = getvnode(td, fd, cap_rights_init(&rights, CAP_FUTIMES), &fp); if (error != 0) return (error); #ifdef AUDIT vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(fp->f_vnode); VOP_UNLOCK(fp->f_vnode, 0); #endif error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL); fdrop(fp, td); return (error); } int sys_utimensat(struct thread *td, struct utimensat_args *uap) { return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE, uap->times, UIO_USERSPACE, uap->flag)); } int kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg, int flag) { struct nameidata nd; struct timespec ts[2]; cap_rights_t rights; int error, flags; if (flag & ~AT_SYMLINK_NOFOLLOW) return (EINVAL); if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0) return (error); NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW) | AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FUTIMES), td); if ((error = namei(&nd)) != 0) return (error); /* * We are allowed to call namei() regardless of 2xUTIME_OMIT. * POSIX states: * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected." * "Search permission is denied by a component of the path prefix." */ NDFREE(&nd, NDF_ONLY_PNBUF); if ((flags & UTIMENS_EXIT) == 0) error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL); vrele(nd.ni_vp); return (error); } /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif int sys_truncate(struct thread *td, struct truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length) { struct mount *mp; struct vnode *vp; void *rl_cookie; struct vattr vattr; struct nameidata nd; int error; if (length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vn_rangelock_unlock(vp, rl_cookie); vrele(vp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) error = EISDIR; #ifdef MAC else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) { } #endif else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = length; error = VOP_SETATTR(vp, &vattr, td->td_ucred); } VOP_UNLOCK(vp, 0); vn_finished_write(mp); vn_rangelock_unlock(vp, rl_cookie); vrele(vp); return (error); } #if defined(COMPAT_43) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif int otruncate(struct thread *td, struct otruncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD6) /* Versions with the pad argument */ int freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap) { return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length)); } int freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap) { return (kern_ftruncate(td, uap->fd, uap->length)); } #endif int kern_fsync(struct thread *td, int fd, bool fullsync) { struct vnode *vp; struct mount *mp; struct file *fp; cap_rights_t rights; int error, lock_flags; AUDIT_ARG_FD(fd); error = getvnode(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp); if (error != 0) return (error); vp = fp->f_vnode; #if 0 if (!fullsync) /* XXXKIB: compete outstanding aio writes */; #endif error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto drop; if (MNT_SHARED_WRITES(mp) || ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) { lock_flags = LK_SHARED; } else { lock_flags = LK_EXCLUSIVE; } vn_lock(vp, lock_flags | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td); VOP_UNLOCK(vp, 0); vn_finished_write(mp); drop: fdrop(fp, td); return (error); } /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif int sys_fsync(struct thread *td, struct fsync_args *uap) { return (kern_fsync(td, uap->fd, true)); } int sys_fdatasync(struct thread *td, struct fdatasync_args *uap) { return (kern_fsync(td, uap->fd, false)); } /* * Rename files. Source and destination must either both be directories, or * both not be directories. If target is a directory, it must be empty. */ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif int sys_rename(struct thread *td, struct rename_args *uap) { return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD, uap->to, UIO_USERSPACE)); } #ifndef _SYS_SYSPROTO_H_ struct renameat_args { int oldfd; char *old; int newfd; char *new; }; #endif int sys_renameat(struct thread *td, struct renameat_args *uap) { return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new, UIO_USERSPACE)); } int kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new, enum uio_seg pathseg) { struct mount *mp = NULL; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; cap_rights_t rights; int error; again: bwillwrite(); #ifdef MAC NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT_SOURCE), td); #else NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1, pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT_SOURCE), td); #endif if ((error = namei(&fromnd)) != 0) return (error); #ifdef MAC error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd); VOP_UNLOCK(fromnd.ni_dvp, 0); if (fromnd.ni_dvp != fromnd.ni_vp) VOP_UNLOCK(fromnd.ni_vp, 0); #endif fvp = fromnd.ni_vp; NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNODE2, pathseg, new, newfd, cap_rights_init(&rights, CAP_RENAMEAT_TARGET), td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; error = vn_start_write(fvp, &mp, V_NOWAIT); if (error != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); vrele(tond.ni_startdir); if (fromnd.ni_startdir != NULL) vrele(fromnd.ni_startdir); error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH); if (error != 0) return (error); goto again; } if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef CAPABILITIES if (newfd != AT_FDCWD) { /* * If the target already exists we require CAP_UNLINKAT * from 'newfd'. */ error = cap_check(&tond.ni_filecaps.fc_rights, cap_rights_init(&rights, CAP_UNLINKAT)); if (error != 0) goto out; } #endif } if (fvp == tdvp) { error = EINVAL; goto out; } /* * If the source is the same as the destination (that is, if they * are links to the same vnode), then there is nothing to do. */ if (fvp == tvp) error = -1; #ifdef MAC else error = mac_vnode_check_rename_to(td->td_ucred, tdvp, tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd); #endif out: if (error == 0) { error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tvp != NULL) vput(tvp); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == -1) return (0); return (error); } /* * Make a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif int sys_mkdir(struct thread *td, struct mkdir_args *uap) { return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->mode)); } #ifndef _SYS_SYSPROTO_H_ struct mkdirat_args { int fd; char *path; mode_t mode; }; #endif int sys_mkdirat(struct thread *td, struct mkdirat_args *uap) { return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode)); } int kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg, int mode) { struct mount *mp; struct vnode *vp; struct vattr vattr; struct nameidata nd; cap_rights_t rights; int error; AUDIT_ARG_MODE(mode); restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 | NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT), td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); /* * XXX namei called with LOCKPARENT but not LOCKLEAF has * the strange behaviour of leaving the vnode unlocked * if the target is the same vnode as the parent. */ if (vp == nd.ni_dvp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vrele(vp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; #ifdef MAC error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd, &vattr); if (error != 0) goto out; #endif error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); #ifdef MAC out: #endif NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (error == 0) vput(nd.ni_vp); vn_finished_write(mp); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif int sys_rmdir(struct thread *td, struct rmdir_args *uap) { return (kern_rmdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE)); } int kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg) { struct mount *mp; struct vnode *vp; struct nameidata nd; cap_rights_t rights; int error; restart: bwillwrite(); NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. */ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_vflag & VV_ROOT) { error = EBUSY; goto out; } #ifdef MAC error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp, &nd.ni_cnd); if (error != 0) goto out; #endif if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11) int freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count, long *basep, void (*func)(struct freebsd11_dirent *)) { struct freebsd11_dirent dstdp; struct dirent *dp, *edp; char *dirbuf; off_t base; ssize_t resid, ucount; int error; /* XXX arbitrary sanity limit on `count'. */ count = min(count, 64 * 1024); dirbuf = malloc(count, M_TEMP, M_WAITOK); error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid, UIO_SYSSPACE); if (error != 0) goto done; if (basep != NULL) *basep = base; ucount = 0; for (dp = (struct dirent *)dirbuf, edp = (struct dirent *)&dirbuf[count - resid]; ucount < count && dp < edp; ) { if (dp->d_reclen == 0) break; MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0)); if (dp->d_namlen >= sizeof(dstdp.d_name)) continue; dstdp.d_type = dp->d_type; dstdp.d_namlen = dp->d_namlen; dstdp.d_fileno = dp->d_fileno; /* truncate */ if (dstdp.d_fileno != dp->d_fileno) { switch (ino64_trunc_error) { default: case 0: break; case 1: error = EOVERFLOW; goto done; case 2: dstdp.d_fileno = UINT32_MAX; break; } } dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) + ((dp->d_namlen + 1 + 3) &~ 3); bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); bzero(dstdp.d_name + dstdp.d_namlen, dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) - dstdp.d_namlen); MPASS(dstdp.d_reclen <= dp->d_reclen); MPASS(ucount + dstdp.d_reclen <= count); if (func != NULL) func(&dstdp); error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen); if (error != 0) break; dp = (struct dirent *)((char *)dp + dp->d_reclen); ucount += dstdp.d_reclen; } done: free(dirbuf, M_TEMP); if (error == 0) td->td_retval[0] = ucount; return (error); } #endif /* COMPAT */ #ifdef COMPAT_43 static void ogetdirentries_cvt(struct freebsd11_dirent *dp) { #if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; #else /* * The dp->d_type is the high byte of the expected dp->d_namlen, * so must be zero'ed. */ dp->d_type = 0; #endif } /* * Read a block of directory entries in a filesystem independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(struct thread *td, struct ogetdirentries_args *uap) { long loff; int error; error = kern_ogetdirentries(td, uap, &loff); if (error == 0) error = copyout(&loff, uap->basep, sizeof(long)); return (error); } int kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap, long *ploff) { long base; int error; /* XXX arbitrary sanity limit on `count'. */ if (uap->count > 64 * 1024) return (EINVAL); error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, ogetdirentries_cvt); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) #ifndef _SYS_SYSPROTO_H_ struct freebsd11_getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int freebsd11_getdirentries(struct thread *td, struct freebsd11_getdirentries_args *uap) { long base; int error; error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL); if (error == 0 && uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(long)); return (error); } int freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap) { struct freebsd11_getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return (freebsd11_getdirentries(td, &ap)); } #endif /* COMPAT_FREEBSD11 */ /* * Read a block of directory entries in a filesystem independent format. */ int sys_getdirentries(struct thread *td, struct getdirentries_args *uap) { off_t base; int error; error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base, NULL, UIO_USERSPACE); if (error != 0) return (error); if (uap->basep != NULL) error = copyout(&base, uap->basep, sizeof(off_t)); return (error); } int kern_getdirentries(struct thread *td, int fd, char *buf, size_t count, off_t *basep, ssize_t *residp, enum uio_seg bufseg) { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; - cap_rights_t rights; off_t loff; int error, eofflag; off_t foffset; AUDIT_ARG_FD(fd); if (count > IOSIZE_MAX) return (EINVAL); auio.uio_resid = count; - error = getvnode(td, fd, cap_rights_init(&rights, CAP_READ), &fp); + error = getvnode(td, fd, &cap_read_rights, &fp); if (error != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; foffset = foffset_lock(fp, 0); unionread: if (vp->v_type != VDIR) { error = EINVAL; goto fail; } aiov.iov_base = buf; aiov.iov_len = count; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = bufseg; auio.uio_td = td; vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); loff = auio.uio_offset = foffset; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); foffset = auio.uio_offset; if (error != 0) { VOP_UNLOCK(vp, 0); goto fail; } if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_vnode = vp; fp->f_data = vp; foffset = 0; vput(tvp); goto unionread; } VOP_UNLOCK(vp, 0); *basep = loff; if (residp != NULL) *residp = auio.uio_resid; td->td_retval[0] = count - auio.uio_resid; fail: foffset_unlock(fp, foffset, 0); fdrop(fp, td); return (error); } /* * Set the mode mask for creation of filesystem nodes. */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int sys_umask(struct thread *td, struct umask_args *uap) { struct filedesc *fdp; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = uap->newmask & ALLPERMS; FILEDESC_XUNLOCK(fdp); return (0); } /* * Void all references to file by ripping underlying filesystem away from * vnode. */ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif int sys_revoke(struct thread *td, struct revoke_args *uap) { struct vnode *vp; struct vattr vattr; struct nameidata nd; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR || vp->v_rdev == NULL) { error = EINVAL; goto out; } #ifdef MAC error = mac_vnode_check_revoke(td->td_ucred, vp); if (error != 0) goto out; #endif error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) goto out; if (td->td_ucred->cr_uid != vattr.va_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto out; } if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); out: vput(vp); return (error); } /* * Convert a user file descriptor to a kernel file entry and check that, if it * is a capability, the correct rights are present. A reference on the file * entry is held upon returning. */ int getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { struct file *fp; int error; error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL); if (error != 0) return (error); /* * The file could be not of the vnode type, or it may be not * yet fully initialized, in which case the f_vnode pointer * may be set, but f_ops is still badfileops. E.g., * devfs_open() transiently create such situation to * facilitate csw d_fdopen(). * * Dupfdopen() handling in kern_openat() installs the * half-baked file into the process descriptor table, allowing * other thread to dereference it. Guard against the race by * checking f_ops. */ if (fp->f_vnode == NULL || fp->f_ops == &badfileops) { fdrop(fp, td); return (EINVAL); } *fpp = fp; return (0); } /* * Get an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct lgetfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_lgetfh(struct thread *td, struct lgetfh_args *uap) { struct nameidata nd; fhandle_t fh; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error == 0) error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int sys_getfh(struct thread *td, struct getfh_args *uap) { struct nameidata nd; fhandle_t fh; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_GETFH); if (error != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VOP_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error == 0) error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into an * open descriptor. * * warning: do not remove the priv_check() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int sys_fhopen(struct thread *td, struct fhopen_args *uap) { struct mount *mp; struct vnode *vp; struct fhandle fhp; struct file *fp; int fmode, error; int indx; error = priv_check(td, PRIV_VFS_FHOPEN); if (error != 0) return (error); indx = -1; fmode = FFLAGS(uap->flags); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(uap->u_fhp, &fhp, sizeof(fhp)); if (error != 0) return(error); /* find the mount point */ mp = vfs_busyfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = falloc_noinstall(td, &fp); if (error != 0) { vput(vp); return (error); } /* * An extra reference on `fp' has been held for us by * falloc_noinstall(). */ #ifdef INVARIANTS td->td_dupfd = -1; #endif error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp); if (error != 0) { KASSERT(fp->f_ops == &badfileops, ("VOP_OPEN in fhopen() set f_ops")); KASSERT(td->td_dupfd < 0, ("fhopen() encountered fdopen()")); vput(vp); goto bad; } #ifdef INVARIANTS td->td_dupfd = 0; #endif fp->f_vnode = vp; fp->f_seqcount = 1; finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp, &vnops); VOP_UNLOCK(vp, 0); if ((fmode & O_TRUNC) != 0) { error = fo_truncate(fp, 0, td->td_ucred, td); if (error != 0) goto bad; } error = finstall(td, fp, &indx, fmode, NULL); bad: fdrop(fp, td); td->td_retval[0] = indx; return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int sys_fhstat(struct thread *td, struct fhstat_args *uap) { struct stat sb; struct fhandle fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fh)); if (error != 0) return (error); error = kern_fhstat(td, fh, &sb); if (error == 0) error = copyout(&sb, uap->sb, sizeof(sb)); return (error); } int kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb) { struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTAT); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); vfs_unbusy(mp); if (error != 0) return (error); error = vn_stat(vp, sb, td->td_ucred, NOCRED, td); vput(vp); return (error); } /* * Implement fstatfs() for (NFS) file handles. */ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap) { struct statfs *sfp; fhandle_t fh; int error; error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t)); if (error != 0) return (error); sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = kern_fhstatfs(td, fh, sfp); if (error == 0) error = copyout(sfp, uap->buf, sizeof(*sfp)); free(sfp, M_STATFS); return (error); } int kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf) { struct statfs *sp; struct mount *mp; struct vnode *vp; int error; error = priv_check(td, PRIV_VFS_FHSTATFS); if (error != 0) return (error); if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL) return (ESTALE); error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp); if (error != 0) { vfs_unbusy(mp); return (error); } vput(vp); error = prison_canseemount(td->td_ucred, mp); if (error != 0) goto out; #ifdef MAC error = mac_mount_check_stat(td->td_ucred, mp); if (error != 0) goto out; #endif /* * Set these in case the underlying filesystem fails to do so. */ sp = &mp->mnt_stat; sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = VFS_STATFS(mp, sp); if (error == 0) *buf = *sp; out: vfs_unbusy(mp); return (error); } int kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len) { struct file *fp; struct mount *mp; struct vnode *vp; - cap_rights_t rights; off_t olen, ooffset; int error; #ifdef AUDIT int audited_vnode1 = 0; #endif AUDIT_ARG_FD(fd); if (offset < 0 || len <= 0) return (EINVAL); /* Check for wrap. */ if (offset > OFF_MAX - len) return (EFBIG); AUDIT_ARG_FD(fd); - error = fget(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp); + error = fget(td, fd, &cap_pwrite_rights, &fp); if (error != 0) return (error); AUDIT_ARG_FILE(td->td_proc, fp); if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { error = ESPIPE; goto out; } if ((fp->f_flag & FWRITE) == 0) { error = EBADF; goto out; } if (fp->f_type != DTYPE_VNODE) { error = ENODEV; goto out; } vp = fp->f_vnode; if (vp->v_type != VREG) { error = ENODEV; goto out; } /* Allocating blocks may take a long time, so iterate. */ for (;;) { olen = len; ooffset = offset; bwillwrite(); mp = NULL; error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) break; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { vn_finished_write(mp); break; } #ifdef AUDIT if (!audited_vnode1) { AUDIT_ARG_VNODE1(vp); audited_vnode1 = 1; } #endif #ifdef MAC error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif error = VOP_ALLOCATE(vp, &offset, &len); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (olen + ooffset != offset + len) { panic("offset + len changed from %jx/%jx to %jx/%jx", ooffset, olen, offset, len); } if (error != 0 || len == 0) break; KASSERT(olen > len, ("Iteration did not make progress?")); maybe_yield(); } out: fdrop(fp, td); return (error); } int sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap) { int error; error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len); return (kern_posix_error(td, error)); } /* * Unlike madvise(2), we do not make a best effort to remember every * possible caching hint. Instead, we remember the last setting with * the exception that we will allow POSIX_FADV_NORMAL to adjust the * region of any current setting. */ int kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len, int advice) { struct fadvise_info *fa, *new; struct file *fp; struct vnode *vp; - cap_rights_t rights; off_t end; int error; if (offset < 0 || len < 0 || offset > OFF_MAX - len) return (EINVAL); AUDIT_ARG_VALUE(advice); switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK); break; case POSIX_FADV_NORMAL: case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: new = NULL; break; default: return (EINVAL); } /* XXX: CAP_POSIX_FADVISE? */ AUDIT_ARG_FD(fd); - error = fget(td, fd, cap_rights_init(&rights), &fp); + error = fget(td, fd, &cap_no_rights, &fp); if (error != 0) goto out; AUDIT_ARG_FILE(td->td_proc, fp); if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { error = ESPIPE; goto out; } if (fp->f_type != DTYPE_VNODE) { error = ENODEV; goto out; } vp = fp->f_vnode; if (vp->v_type != VREG) { error = ENODEV; goto out; } if (len == 0) end = OFF_MAX; else end = offset + len - 1; switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_RANDOM: case POSIX_FADV_NOREUSE: /* * Try to merge any existing non-standard region with * this new region if possible, otherwise create a new * non-standard region for this request. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL && fa->fa_advice == advice && ((fa->fa_start <= end && fa->fa_end >= offset) || (end != OFF_MAX && fa->fa_start == end + 1) || (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) { if (offset < fa->fa_start) fa->fa_start = offset; if (end > fa->fa_end) fa->fa_end = end; } else { new->fa_advice = advice; new->fa_start = offset; new->fa_end = end; fp->f_advice = new; new = fa; } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_NORMAL: /* * If a the "normal" region overlaps with an existing * non-standard region, trim or remove the * non-standard region. */ mtx_pool_lock(mtxpool_sleep, fp); fa = fp->f_advice; if (fa != NULL) { if (offset <= fa->fa_start && end >= fa->fa_end) { new = fa; fp->f_advice = NULL; } else if (offset <= fa->fa_start && end >= fa->fa_start) fa->fa_start = end + 1; else if (offset <= fa->fa_end && end >= fa->fa_end) fa->fa_end = offset - 1; else if (offset >= fa->fa_start && end <= fa->fa_end) { /* * If the "normal" region is a middle * portion of the existing * non-standard region, just remove * the whole thing rather than picking * one side or the other to * preserve. */ new = fa; fp->f_advice = NULL; } } mtx_pool_unlock(mtxpool_sleep, fp); break; case POSIX_FADV_WILLNEED: case POSIX_FADV_DONTNEED: error = VOP_ADVISE(vp, offset, end, advice); break; } out: if (fp != NULL) fdrop(fp, td); free(new, M_FADVISE); return (error); } int sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap) { int error; error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len, uap->advice); return (kern_posix_error(td, error)); } Index: head/sys/netsmb/smb_dev.c =================================================================== --- head/sys/netsmb/smb_dev.c (revision 333424) +++ head/sys/netsmb/smb_dev.c (revision 333425) @@ -1,416 +1,415 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct cdev *nsmb_dev; static d_open_t nsmb_dev_open; static d_ioctl_t nsmb_dev_ioctl; MODULE_DEPEND(netsmb, libiconv, 1, 1, 2); MODULE_VERSION(netsmb, NSMB_VERSION); static int smb_version = NSMB_VERSION; struct sx smb_lock; SYSCTL_DECL(_net_smb); SYSCTL_INT(_net_smb, OID_AUTO, version, CTLFLAG_RD, &smb_version, 0, ""); static MALLOC_DEFINE(M_NSMBDEV, "NETSMBDEV", "NET/SMB device"); static struct cdevsw nsmb_cdevsw = { .d_version = D_VERSION, .d_open = nsmb_dev_open, .d_ioctl = nsmb_dev_ioctl, .d_name = NSMB_NAME }; static int nsmb_dev_init(void) { nsmb_dev = make_dev(&nsmb_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0600, "nsmb"); if (nsmb_dev == NULL) return (ENOMEM); return (0); } static void nsmb_dev_destroy(void) { MPASS(nsmb_dev != NULL); destroy_dev(nsmb_dev); nsmb_dev = NULL; } static struct smb_dev * smbdev_alloc(struct cdev *dev) { struct smb_dev *sdp; sdp = malloc(sizeof(struct smb_dev), M_NSMBDEV, M_WAITOK | M_ZERO); sdp->dev = dev; sdp->sd_level = -1; sdp->sd_flags |= NSMBFL_OPEN; sdp->refcount = 1; return (sdp); } void sdp_dtor(void *arg) { struct smb_dev *dev; dev = (struct smb_dev *)arg; SMB_LOCK(); sdp_trydestroy(dev); SMB_UNLOCK(); } static int nsmb_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct smb_dev *sdp; int error; sdp = smbdev_alloc(dev); error = devfs_set_cdevpriv(sdp, sdp_dtor); if (error) { free(sdp, M_NSMBDEV); return (error); } return (0); } void sdp_trydestroy(struct smb_dev *sdp) { struct smb_vc *vcp; struct smb_share *ssp; struct smb_cred *scred; SMB_LOCKASSERT(); if (!sdp) panic("No smb_dev upon device close"); MPASS(sdp->refcount > 0); sdp->refcount--; if (sdp->refcount) return; scred = malloc(sizeof(struct smb_cred), M_NSMBDEV, M_WAITOK); smb_makescred(scred, curthread, NULL); ssp = sdp->sd_share; if (ssp != NULL) { smb_share_lock(ssp); smb_share_rele(ssp, scred); } vcp = sdp->sd_vc; if (vcp != NULL) { smb_vc_lock(vcp); smb_vc_rele(vcp, scred); } free(scred, M_NSMBDEV); free(sdp, M_NSMBDEV); return; } static int nsmb_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct smb_dev *sdp; struct smb_vc *vcp; struct smb_share *ssp; struct smb_cred *scred; int error = 0; error = devfs_get_cdevpriv((void **)&sdp); if (error) return (error); scred = malloc(sizeof(struct smb_cred), M_NSMBDEV, M_WAITOK); SMB_LOCK(); smb_makescred(scred, td, NULL); switch (cmd) { case SMBIOC_OPENSESSION: if (sdp->sd_vc) { error = EISCONN; goto out; } error = smb_usr_opensession((struct smbioc_ossn*)data, scred, &vcp); if (error) break; sdp->sd_vc = vcp; smb_vc_unlock(vcp); sdp->sd_level = SMBL_VC; break; case SMBIOC_OPENSHARE: if (sdp->sd_share) { error = EISCONN; goto out; } if (sdp->sd_vc == NULL) { error = ENOTCONN; goto out; } error = smb_usr_openshare(sdp->sd_vc, (struct smbioc_oshare*)data, scred, &ssp); if (error) break; sdp->sd_share = ssp; smb_share_unlock(ssp); sdp->sd_level = SMBL_SHARE; break; case SMBIOC_REQUEST: if (sdp->sd_share == NULL) { error = ENOTCONN; goto out; } error = smb_usr_simplerequest(sdp->sd_share, (struct smbioc_rq*)data, scred); break; case SMBIOC_T2RQ: if (sdp->sd_share == NULL) { error = ENOTCONN; goto out; } error = smb_usr_t2request(sdp->sd_share, (struct smbioc_t2rq*)data, scred); break; case SMBIOC_SETFLAGS: { struct smbioc_flags *fl = (struct smbioc_flags*)data; int on; if (fl->ioc_level == SMBL_VC) { if (fl->ioc_mask & SMBV_PERMANENT) { on = fl->ioc_flags & SMBV_PERMANENT; if ((vcp = sdp->sd_vc) == NULL) { error = ENOTCONN; goto out; } error = smb_vc_get(vcp, scred); if (error) break; if (on && (vcp->obj.co_flags & SMBV_PERMANENT) == 0) { vcp->obj.co_flags |= SMBV_PERMANENT; smb_vc_ref(vcp); } else if (!on && (vcp->obj.co_flags & SMBV_PERMANENT)) { vcp->obj.co_flags &= ~SMBV_PERMANENT; smb_vc_rele(vcp, scred); } smb_vc_put(vcp, scred); } else error = EINVAL; } else if (fl->ioc_level == SMBL_SHARE) { if (fl->ioc_mask & SMBS_PERMANENT) { on = fl->ioc_flags & SMBS_PERMANENT; if ((ssp = sdp->sd_share) == NULL) { error = ENOTCONN; goto out; } error = smb_share_get(ssp, scred); if (error) break; if (on && (ssp->obj.co_flags & SMBS_PERMANENT) == 0) { ssp->obj.co_flags |= SMBS_PERMANENT; smb_share_ref(ssp); } else if (!on && (ssp->obj.co_flags & SMBS_PERMANENT)) { ssp->obj.co_flags &= ~SMBS_PERMANENT; smb_share_rele(ssp, scred); } smb_share_put(ssp, scred); } else error = EINVAL; break; } else error = EINVAL; break; } case SMBIOC_LOOKUP: if (sdp->sd_vc || sdp->sd_share) { error = EISCONN; goto out; } vcp = NULL; ssp = NULL; error = smb_usr_lookup((struct smbioc_lookup*)data, scred, &vcp, &ssp); if (error) break; if (vcp) { sdp->sd_vc = vcp; smb_vc_unlock(vcp); sdp->sd_level = SMBL_VC; } if (ssp) { sdp->sd_share = ssp; smb_share_unlock(ssp); sdp->sd_level = SMBL_SHARE; } break; case SMBIOC_READ: case SMBIOC_WRITE: { struct smbioc_rw *rwrq = (struct smbioc_rw*)data; struct uio auio; struct iovec iov; if ((ssp = sdp->sd_share) == NULL) { error = ENOTCONN; goto out; } iov.iov_base = rwrq->ioc_base; iov.iov_len = rwrq->ioc_cnt; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_offset = rwrq->ioc_offset; auio.uio_resid = rwrq->ioc_cnt; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = (cmd == SMBIOC_READ) ? UIO_READ : UIO_WRITE; auio.uio_td = td; if (cmd == SMBIOC_READ) error = smb_read(ssp, rwrq->ioc_fh, &auio, scred); else error = smb_write(ssp, rwrq->ioc_fh, &auio, scred); rwrq->ioc_cnt -= auio.uio_resid; break; } default: error = ENODEV; } out: free(scred, M_NSMBDEV); SMB_UNLOCK(); return error; } static int nsmb_dev_load(module_t mod, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = smb_sm_init(); if (error) break; error = smb_iod_init(); if (error) { smb_sm_done(); break; } error = nsmb_dev_init(); if (error) break; sx_init(&smb_lock, "samba device lock"); break; case MOD_UNLOAD: smb_iod_done(); error = smb_sm_done(); if (error) break; nsmb_dev_destroy(); sx_destroy(&smb_lock); break; default: error = EINVAL; break; } return error; } DEV_MODULE (dev_netsmb, nsmb_dev_load, 0); int smb_dev2share(int fd, int mode, struct smb_cred *scred, struct smb_share **sspp, struct smb_dev **ssdp) { - cap_rights_t rights; struct file *fp, *fptmp; struct smb_dev *sdp; struct smb_share *ssp; struct thread *td; int error; td = curthread; - error = fget(td, fd, cap_rights_init(&rights, CAP_READ), &fp); + error = fget(td, fd, &cap_read_rights, &fp); if (error) return (error); fptmp = td->td_fpop; td->td_fpop = fp; error = devfs_get_cdevpriv((void **)&sdp); td->td_fpop = fptmp; fdrop(fp, td); if (error || sdp == NULL) return (error); SMB_LOCK(); *ssdp = sdp; ssp = sdp->sd_share; if (ssp == NULL) { SMB_UNLOCK(); return (ENOTCONN); } error = smb_share_get(ssp, scred); if (error == 0) { sdp->refcount++; *sspp = ssp; } SMB_UNLOCK(); return error; } Index: head/sys/sys/capsicum.h =================================================================== --- head/sys/sys/capsicum.h (revision 333424) +++ head/sys/sys/capsicum.h (revision 333425) @@ -1,430 +1,517 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008-2010, 2015 Robert N. M. Watson * Copyright (c) 2012 FreeBSD Foundation * All rights reserved. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Portions of this software were developed by Pawel Jakub Dawidek under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Definitions for FreeBSD capabilities facility. */ #ifndef _SYS_CAPSICUM_H_ #define _SYS_CAPSICUM_H_ #include #include #include #include #include #ifndef _KERNEL #include #endif #define CAPRIGHT(idx, bit) ((1ULL << (57 + (idx))) | (bit)) /* * Possible rights on capabilities. * * Notes: * Some system calls don't require a capability in order to perform an * operation on an fd. These include: close, dup, dup2. * * sendfile is authorized using CAP_READ on the file and CAP_WRITE on the * socket. * * mmap() and aio*() system calls will need special attention as they may * involve reads or writes depending a great deal on context. */ /* INDEX 0 */ /* * General file I/O. */ /* Allows for openat(O_RDONLY), read(2), readv(2). */ #define CAP_READ CAPRIGHT(0, 0x0000000000000001ULL) /* Allows for openat(O_WRONLY | O_APPEND), write(2), writev(2). */ #define CAP_WRITE CAPRIGHT(0, 0x0000000000000002ULL) /* Allows for lseek(fd, 0, SEEK_CUR). */ #define CAP_SEEK_TELL CAPRIGHT(0, 0x0000000000000004ULL) /* Allows for lseek(2). */ #define CAP_SEEK (CAP_SEEK_TELL | 0x0000000000000008ULL) /* Allows for aio_read(2), pread(2), preadv(2). */ #define CAP_PREAD (CAP_SEEK | CAP_READ) /* * Allows for aio_write(2), openat(O_WRONLY) (without O_APPEND), pwrite(2), * pwritev(2). */ #define CAP_PWRITE (CAP_SEEK | CAP_WRITE) /* Allows for mmap(PROT_NONE). */ #define CAP_MMAP CAPRIGHT(0, 0x0000000000000010ULL) /* Allows for mmap(PROT_READ). */ #define CAP_MMAP_R (CAP_MMAP | CAP_SEEK | CAP_READ) /* Allows for mmap(PROT_WRITE). */ #define CAP_MMAP_W (CAP_MMAP | CAP_SEEK | CAP_WRITE) /* Allows for mmap(PROT_EXEC). */ #define CAP_MMAP_X (CAP_MMAP | CAP_SEEK | 0x0000000000000020ULL) /* Allows for mmap(PROT_READ | PROT_WRITE). */ #define CAP_MMAP_RW (CAP_MMAP_R | CAP_MMAP_W) /* Allows for mmap(PROT_READ | PROT_EXEC). */ #define CAP_MMAP_RX (CAP_MMAP_R | CAP_MMAP_X) /* Allows for mmap(PROT_WRITE | PROT_EXEC). */ #define CAP_MMAP_WX (CAP_MMAP_W | CAP_MMAP_X) /* Allows for mmap(PROT_READ | PROT_WRITE | PROT_EXEC). */ #define CAP_MMAP_RWX (CAP_MMAP_R | CAP_MMAP_W | CAP_MMAP_X) /* Allows for openat(O_CREAT). */ #define CAP_CREATE CAPRIGHT(0, 0x0000000000000040ULL) /* Allows for openat(O_EXEC) and fexecve(2) in turn. */ #define CAP_FEXECVE CAPRIGHT(0, 0x0000000000000080ULL) /* Allows for openat(O_SYNC), openat(O_FSYNC), fsync(2), aio_fsync(2). */ #define CAP_FSYNC CAPRIGHT(0, 0x0000000000000100ULL) /* Allows for openat(O_TRUNC), ftruncate(2). */ #define CAP_FTRUNCATE CAPRIGHT(0, 0x0000000000000200ULL) /* Lookups - used to constrain *at() calls. */ #define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL) /* VFS methods. */ /* Allows for fchdir(2). */ #define CAP_FCHDIR CAPRIGHT(0, 0x0000000000000800ULL) /* Allows for fchflags(2). */ #define CAP_FCHFLAGS CAPRIGHT(0, 0x0000000000001000ULL) /* Allows for fchflags(2) and chflagsat(2). */ #define CAP_CHFLAGSAT (CAP_FCHFLAGS | CAP_LOOKUP) /* Allows for fchmod(2). */ #define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL) /* Allows for fchmod(2) and fchmodat(2). */ #define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP) /* Allows for fchown(2). */ #define CAP_FCHOWN CAPRIGHT(0, 0x0000000000004000ULL) /* Allows for fchown(2) and fchownat(2). */ #define CAP_FCHOWNAT (CAP_FCHOWN | CAP_LOOKUP) /* Allows for fcntl(2). */ #define CAP_FCNTL CAPRIGHT(0, 0x0000000000008000ULL) /* * Allows for flock(2), openat(O_SHLOCK), openat(O_EXLOCK), * fcntl(F_SETLK_REMOTE), fcntl(F_SETLKW), fcntl(F_SETLK), fcntl(F_GETLK). */ #define CAP_FLOCK CAPRIGHT(0, 0x0000000000010000ULL) /* Allows for fpathconf(2). */ #define CAP_FPATHCONF CAPRIGHT(0, 0x0000000000020000ULL) /* Allows for UFS background-fsck operations. */ #define CAP_FSCK CAPRIGHT(0, 0x0000000000040000ULL) /* Allows for fstat(2). */ #define CAP_FSTAT CAPRIGHT(0, 0x0000000000080000ULL) /* Allows for fstat(2), fstatat(2) and faccessat(2). */ #define CAP_FSTATAT (CAP_FSTAT | CAP_LOOKUP) /* Allows for fstatfs(2). */ #define CAP_FSTATFS CAPRIGHT(0, 0x0000000000100000ULL) /* Allows for futimens(2) and futimes(2). */ #define CAP_FUTIMES CAPRIGHT(0, 0x0000000000200000ULL) /* Allows for futimens(2), futimes(2), futimesat(2) and utimensat(2). */ #define CAP_FUTIMESAT (CAP_FUTIMES | CAP_LOOKUP) /* Allows for linkat(2) (target directory descriptor). */ #define CAP_LINKAT_TARGET (CAP_LOOKUP | 0x0000000000400000ULL) /* Allows for mkdirat(2). */ #define CAP_MKDIRAT (CAP_LOOKUP | 0x0000000000800000ULL) /* Allows for mkfifoat(2). */ #define CAP_MKFIFOAT (CAP_LOOKUP | 0x0000000001000000ULL) /* Allows for mknodat(2). */ #define CAP_MKNODAT (CAP_LOOKUP | 0x0000000002000000ULL) /* Allows for renameat(2) (source directory descriptor). */ #define CAP_RENAMEAT_SOURCE (CAP_LOOKUP | 0x0000000004000000ULL) /* Allows for symlinkat(2). */ #define CAP_SYMLINKAT (CAP_LOOKUP | 0x0000000008000000ULL) /* * Allows for unlinkat(2) and renameat(2) if destination object exists and * will be removed. */ #define CAP_UNLINKAT (CAP_LOOKUP | 0x0000000010000000ULL) /* Socket operations. */ /* Allows for accept(2) and accept4(2). */ #define CAP_ACCEPT CAPRIGHT(0, 0x0000000020000000ULL) /* Allows for bind(2). */ #define CAP_BIND CAPRIGHT(0, 0x0000000040000000ULL) /* Allows for connect(2). */ #define CAP_CONNECT CAPRIGHT(0, 0x0000000080000000ULL) /* Allows for getpeername(2). */ #define CAP_GETPEERNAME CAPRIGHT(0, 0x0000000100000000ULL) /* Allows for getsockname(2). */ #define CAP_GETSOCKNAME CAPRIGHT(0, 0x0000000200000000ULL) /* Allows for getsockopt(2). */ #define CAP_GETSOCKOPT CAPRIGHT(0, 0x0000000400000000ULL) /* Allows for listen(2). */ #define CAP_LISTEN CAPRIGHT(0, 0x0000000800000000ULL) /* Allows for sctp_peeloff(2). */ #define CAP_PEELOFF CAPRIGHT(0, 0x0000001000000000ULL) #define CAP_RECV CAP_READ #define CAP_SEND CAP_WRITE /* Allows for setsockopt(2). */ #define CAP_SETSOCKOPT CAPRIGHT(0, 0x0000002000000000ULL) /* Allows for shutdown(2). */ #define CAP_SHUTDOWN CAPRIGHT(0, 0x0000004000000000ULL) /* Allows for bindat(2) on a directory descriptor. */ #define CAP_BINDAT (CAP_LOOKUP | 0x0000008000000000ULL) /* Allows for connectat(2) on a directory descriptor. */ #define CAP_CONNECTAT (CAP_LOOKUP | 0x0000010000000000ULL) /* Allows for linkat(2) (source directory descriptor). */ #define CAP_LINKAT_SOURCE (CAP_LOOKUP | 0x0000020000000000ULL) /* Allows for renameat(2) (target directory descriptor). */ #define CAP_RENAMEAT_TARGET (CAP_LOOKUP | 0x0000040000000000ULL) #define CAP_SOCK_CLIENT \ (CAP_CONNECT | CAP_GETPEERNAME | CAP_GETSOCKNAME | CAP_GETSOCKOPT | \ CAP_PEELOFF | CAP_RECV | CAP_SEND | CAP_SETSOCKOPT | CAP_SHUTDOWN) #define CAP_SOCK_SERVER \ (CAP_ACCEPT | CAP_BIND | CAP_GETPEERNAME | CAP_GETSOCKNAME | \ CAP_GETSOCKOPT | CAP_LISTEN | CAP_PEELOFF | CAP_RECV | CAP_SEND | \ CAP_SETSOCKOPT | CAP_SHUTDOWN) /* All used bits for index 0. */ #define CAP_ALL0 CAPRIGHT(0, 0x000007FFFFFFFFFFULL) /* Available bits for index 0. */ #define CAP_UNUSED0_44 CAPRIGHT(0, 0x0000080000000000ULL) /* ... */ #define CAP_UNUSED0_57 CAPRIGHT(0, 0x0100000000000000ULL) /* INDEX 1 */ /* Mandatory Access Control. */ /* Allows for mac_get_fd(3). */ #define CAP_MAC_GET CAPRIGHT(1, 0x0000000000000001ULL) /* Allows for mac_set_fd(3). */ #define CAP_MAC_SET CAPRIGHT(1, 0x0000000000000002ULL) /* Methods on semaphores. */ #define CAP_SEM_GETVALUE CAPRIGHT(1, 0x0000000000000004ULL) #define CAP_SEM_POST CAPRIGHT(1, 0x0000000000000008ULL) #define CAP_SEM_WAIT CAPRIGHT(1, 0x0000000000000010ULL) /* Allows select(2) and poll(2) on descriptor. */ #define CAP_EVENT CAPRIGHT(1, 0x0000000000000020ULL) /* Allows for kevent(2) on kqueue descriptor with eventlist != NULL. */ #define CAP_KQUEUE_EVENT CAPRIGHT(1, 0x0000000000000040ULL) /* Strange and powerful rights that should not be given lightly. */ /* Allows for ioctl(2). */ #define CAP_IOCTL CAPRIGHT(1, 0x0000000000000080ULL) #define CAP_TTYHOOK CAPRIGHT(1, 0x0000000000000100ULL) /* Process management via process descriptors. */ /* Allows for pdgetpid(2). */ #define CAP_PDGETPID CAPRIGHT(1, 0x0000000000000200ULL) /* Allows for pdwait4(2). */ #define CAP_PDWAIT CAPRIGHT(1, 0x0000000000000400ULL) /* Allows for pdkill(2). */ #define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL) /* Extended attributes. */ /* Allows for extattr_delete_fd(2). */ #define CAP_EXTATTR_DELETE CAPRIGHT(1, 0x0000000000001000ULL) /* Allows for extattr_get_fd(2). */ #define CAP_EXTATTR_GET CAPRIGHT(1, 0x0000000000002000ULL) /* Allows for extattr_list_fd(2). */ #define CAP_EXTATTR_LIST CAPRIGHT(1, 0x0000000000004000ULL) /* Allows for extattr_set_fd(2). */ #define CAP_EXTATTR_SET CAPRIGHT(1, 0x0000000000008000ULL) /* Access Control Lists. */ /* Allows for acl_valid_fd_np(3). */ #define CAP_ACL_CHECK CAPRIGHT(1, 0x0000000000010000ULL) /* Allows for acl_delete_fd_np(3). */ #define CAP_ACL_DELETE CAPRIGHT(1, 0x0000000000020000ULL) /* Allows for acl_get_fd(3) and acl_get_fd_np(3). */ #define CAP_ACL_GET CAPRIGHT(1, 0x0000000000040000ULL) /* Allows for acl_set_fd(3) and acl_set_fd_np(3). */ #define CAP_ACL_SET CAPRIGHT(1, 0x0000000000080000ULL) /* Allows for kevent(2) on kqueue descriptor with changelist != NULL. */ #define CAP_KQUEUE_CHANGE CAPRIGHT(1, 0x0000000000100000ULL) #define CAP_KQUEUE (CAP_KQUEUE_EVENT | CAP_KQUEUE_CHANGE) /* All used bits for index 1. */ #define CAP_ALL1 CAPRIGHT(1, 0x00000000001FFFFFULL) /* Available bits for index 1. */ #define CAP_UNUSED1_22 CAPRIGHT(1, 0x0000000000200000ULL) /* ... */ #define CAP_UNUSED1_57 CAPRIGHT(1, 0x0100000000000000ULL) /* Backward compatibility. */ #define CAP_POLL_EVENT CAP_EVENT #define CAP_ALL(rights) do { \ (rights)->cr_rights[0] = \ ((uint64_t)CAP_RIGHTS_VERSION << 62) | CAP_ALL0; \ (rights)->cr_rights[1] = CAP_ALL1; \ } while (0) #define CAP_NONE(rights) do { \ (rights)->cr_rights[0] = \ ((uint64_t)CAP_RIGHTS_VERSION << 62) | CAPRIGHT(0, 0ULL); \ (rights)->cr_rights[1] = CAPRIGHT(1, 0ULL); \ } while (0) #define CAPRVER(right) ((int)((right) >> 62)) #define CAPVER(rights) CAPRVER((rights)->cr_rights[0]) #define CAPARSIZE(rights) (CAPVER(rights) + 2) #define CAPIDXBIT(right) ((int)(((right) >> 57) & 0x1F)) /* * Allowed fcntl(2) commands. */ #define CAP_FCNTL_GETFL (1 << F_GETFL) #define CAP_FCNTL_SETFL (1 << F_SETFL) #define CAP_FCNTL_GETOWN (1 << F_GETOWN) #define CAP_FCNTL_SETOWN (1 << F_SETOWN) #define CAP_FCNTL_ALL (CAP_FCNTL_GETFL | CAP_FCNTL_SETFL | \ CAP_FCNTL_GETOWN | CAP_FCNTL_SETOWN) #define CAP_IOCTLS_ALL SSIZE_MAX __BEGIN_DECLS #define cap_rights_init(...) \ __cap_rights_init(CAP_RIGHTS_VERSION, __VA_ARGS__, 0ULL) cap_rights_t *__cap_rights_init(int version, cap_rights_t *rights, ...); #define cap_rights_set(...) \ __cap_rights_set(__VA_ARGS__, 0ULL) cap_rights_t *__cap_rights_set(cap_rights_t *rights, ...); #define cap_rights_clear(...) \ __cap_rights_clear(__VA_ARGS__, 0ULL) cap_rights_t *__cap_rights_clear(cap_rights_t *rights, ...); #define cap_rights_is_set(...) \ __cap_rights_is_set(__VA_ARGS__, 0ULL) bool __cap_rights_is_set(const cap_rights_t *rights, ...); bool cap_rights_is_valid(const cap_rights_t *rights); cap_rights_t *cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src); cap_rights_t *cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src); bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little); +void __cap_rights_sysinit(void *arg); __END_DECLS +struct cap_rights_init_args { + cap_rights_t *cria_rights; + uint64_t cria_value1; + uint64_t cria_value2; + uint64_t cria_value3; + uint64_t cria_value4; + uint64_t cria_value5; +}; +#define CAP_RIGHTS_SYSINIT0(name, rights) \ + static struct cap_rights_init_args name##_args = { \ + &(rights) \ + }; \ + SYSINIT(name##_cap_rights_sysinit, SI_SUB_COPYRIGHT+1, SI_ORDER_ANY, \ + __cap_rights_sysinit, &name##_args); + +#define CAP_RIGHTS_SYSINIT1(name, rights, value1) \ + static struct cap_rights_init_args name##_args = { \ + &(rights), \ + (value1) \ + }; \ + SYSINIT(name##_cap_rights_sysinit, SI_SUB_COPYRIGHT+1, SI_ORDER_ANY, \ + __cap_rights_sysinit, &name##_args); + +#define CAP_RIGHTS_SYSINIT2(name, rights, value1, value2) \ + static struct cap_rights_init_args name##_args = { \ + &(rights), \ + (value1), \ + (value2) \ + }; \ + SYSINIT(name##_cap_rights_sysinit, SI_SUB_COPYRIGHT, SI_ORDER_ANY, \ + __cap_rights_sysinit, &name##_args); + +#define CAP_RIGHTS_SYSINIT3(name, rights, value1, value2, value3) \ + static struct cap_rights_init_args name##_args = { \ + &(rights), \ + (value1), \ + (value2), \ + (value3) \ + }; \ + SYSINIT(name##_cap_rights_sysinit, SI_SUB_COPYRIGHT, SI_ORDER_ANY, \ + __cap_rights_sysinit, &name##_args); + +#define CAP_RIGHTS_SYSINIT4(name, rights, value1, value2, value3, value4) \ + static struct cap_rights_init_args name##_args = { \ + &(rights), \ + (value1), \ + (value2), \ + (value3), \ + (value4) \ + }; \ + SYSINIT(name##_cap_rights_sysinit, SI_SUB_COPYRIGHT, SI_ORDER_ANY, \ + __cap_rights_sysinit, &name##_args); + +#define CAP_RIGHTS_DEFINE1(name, value) \ + __read_mostly cap_rights_t name; \ + CAP_RIGHTS_SYSINIT1(name, name, value); + #ifdef _KERNEL #include +extern cap_rights_t cap_accept_rights; +extern cap_rights_t cap_bind_rights; +extern cap_rights_t cap_connect_rights; +extern cap_rights_t cap_event_rights; +extern cap_rights_t cap_fchdir_rights; +extern cap_rights_t cap_fcntl_rights; +extern cap_rights_t cap_fexecve_rights; +extern cap_rights_t cap_flock_rights; +extern cap_rights_t cap_fpathconf_rights; +extern cap_rights_t cap_fstat_rights; +extern cap_rights_t cap_ftruncate_rights; +extern cap_rights_t cap_getpeername_rights; +extern cap_rights_t cap_getsockopt_rights; +extern cap_rights_t cap_getsockname_rights; +extern cap_rights_t cap_ioctl_rights; +extern cap_rights_t cap_listen_rights; +extern cap_rights_t cap_mmap_rights; +extern cap_rights_t cap_no_rights; +extern cap_rights_t cap_fsync_rights; +extern cap_rights_t cap_pdgetpid_rights; +extern cap_rights_t cap_pdkill_rights; +extern cap_rights_t cap_pread_rights; +extern cap_rights_t cap_pwrite_rights; +extern cap_rights_t cap_read_rights; +extern cap_rights_t cap_recv_rights; +extern cap_rights_t cap_send_rights; +extern cap_rights_t cap_setsockopt_rights; +extern cap_rights_t cap_shutdown_rights; +extern cap_rights_t cap_write_rights; #define IN_CAPABILITY_MODE(td) (((td)->td_ucred->cr_flags & CRED_FLAG_CAPMODE) != 0) struct filedesc; struct filedescent; /* * Test whether a capability grants the requested rights. */ int cap_check(const cap_rights_t *havep, const cap_rights_t *needp); /* * Convert capability rights into VM access flags. */ u_char cap_rights_to_vmprot(cap_rights_t *havep); /* * For the purposes of procstat(1) and similar tools, allow kern_descrip.c to * extract the rights from a capability. */ cap_rights_t *cap_rights_fde(struct filedescent *fde); cap_rights_t *cap_rights(struct filedesc *fdp, int fd); int cap_ioctl_check(struct filedesc *fdp, int fd, u_long cmd); int cap_fcntl_check_fde(struct filedescent *fde, int cmd); int cap_fcntl_check(struct filedesc *fdp, int fd, int cmd); extern bool trap_enotcap; #else /* !_KERNEL */ __BEGIN_DECLS /* * cap_enter(): Cause the process to enter capability mode, which will * prevent it from directly accessing global namespaces. System calls will * be limited to process-local, process-inherited, or file descriptor * operations. If already in capability mode, a no-op. */ int cap_enter(void); /* * Are we sandboxed (in capability mode)? * This is a libc wrapper around the cap_getmode(2) system call. */ bool cap_sandboxed(void); /* * cap_getmode(): Are we in capability mode? */ int cap_getmode(u_int *modep); /* * Limits capability rights for the given descriptor (CAP_*). */ int cap_rights_limit(int fd, const cap_rights_t *rights); /* * Returns capability rights for the given descriptor. */ #define cap_rights_get(fd, rights) \ __cap_rights_get(CAP_RIGHTS_VERSION, (fd), (rights)) int __cap_rights_get(int version, int fd, cap_rights_t *rights); /* * Limits allowed ioctls for the given descriptor. */ int cap_ioctls_limit(int fd, const cap_ioctl_t *cmds, size_t ncmds); /* * Returns array of allowed ioctls for the given descriptor. * If all ioctls are allowed, the cmds array is not populated and * the function returns CAP_IOCTLS_ALL. */ ssize_t cap_ioctls_get(int fd, cap_ioctl_t *cmds, size_t maxcmds); /* * Limits allowed fcntls for the given descriptor (CAP_FCNTL_*). */ int cap_fcntls_limit(int fd, uint32_t fcntlrights); /* * Returns bitmask of allowed fcntls for the given descriptor. */ int cap_fcntls_get(int fd, uint32_t *fcntlrightsp); __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_CAPSICUM_H_ */