Index: head/sys/compat/freebsd32/freebsd32_capability.c =================================================================== --- head/sys/compat/freebsd32/freebsd32_capability.c (revision 350420) +++ head/sys/compat/freebsd32/freebsd32_capability.c (revision 350421) @@ -1,156 +1,157 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013 The FreeBSD Foundation * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include #include #include +#include #include #include #include #include #include #include #ifdef CAPABILITIES MALLOC_DECLARE(M_FILECAPS); int freebsd32_cap_ioctls_limit(struct thread *td, struct freebsd32_cap_ioctls_limit_args *uap) { u_long *cmds; uint32_t *cmds32; size_t ncmds; u_int i; int error; ncmds = uap->ncmds; if (ncmds > 256) /* XXX: Is 256 sane? */ return (EINVAL); if (ncmds == 0) { cmds = NULL; } else { cmds32 = malloc(sizeof(cmds32[0]) * ncmds, M_FILECAPS, M_WAITOK); error = copyin(uap->cmds, cmds32, sizeof(cmds32[0]) * ncmds); if (error != 0) { free(cmds32, M_FILECAPS); return (error); } cmds = malloc(sizeof(cmds[0]) * ncmds, M_FILECAPS, M_WAITOK); for (i = 0; i < ncmds; i++) cmds[i] = cmds32[i]; free(cmds32, M_FILECAPS); } return (kern_cap_ioctls_limit(td, uap->fd, cmds, ncmds)); } int freebsd32_cap_ioctls_get(struct thread *td, struct freebsd32_cap_ioctls_get_args *uap) { struct filedesc *fdp; struct filedescent *fdep; uint32_t *cmds32; u_long *cmds; size_t maxcmds; int error, fd; u_int i; fd = uap->fd; cmds32 = uap->cmds; maxcmds = uap->maxcmds; AUDIT_ARG_FD(fd); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); if (fget_locked(fdp, fd) == NULL) { error = EBADF; goto out; } /* * If all ioctls are allowed (fde_nioctls == -1 && fde_ioctls == NULL) * the only sane thing we can do is to not populate the given array and * return CAP_IOCTLS_ALL (actually, INT_MAX). */ fdep = &fdp->fd_ofiles[fd]; cmds = fdep->fde_ioctls; if (cmds32 != NULL && cmds != NULL) { for (i = 0; i < MIN(fdep->fde_nioctls, maxcmds); i++) { error = suword32(&cmds32[i], cmds[i]); if (error != 0) goto out; } } if (fdep->fde_nioctls == -1) td->td_retval[0] = INT_MAX; else td->td_retval[0] = fdep->fde_nioctls; error = 0; out: FILEDESC_SUNLOCK(fdp); return (error); } #else /* !CAPABILITIES */ int freebsd32_cap_ioctls_limit(struct thread *td, struct freebsd32_cap_ioctls_limit_args *uap) { return (ENOSYS); } int freebsd32_cap_ioctls_get(struct thread *td, struct freebsd32_cap_ioctls_get_args *uap) { return (ENOSYS); } #endif /* CAPABILITIES */ Index: head/sys/dev/bhnd/nvram/bhnd_nvram_data_tlv.c =================================================================== --- head/sys/dev/bhnd/nvram/bhnd_nvram_data_tlv.c (revision 350420) +++ head/sys/dev/bhnd/nvram/bhnd_nvram_data_tlv.c (revision 350421) @@ -1,885 +1,886 @@ /*- * Copyright (c) 2016 Landon Fuller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #ifdef _KERNEL #include #include +#include #include #include #else /* !_KERNEL */ #include #include #include #include #include #include #endif /* _KERNEL */ #include "bhnd_nvram_private.h" #include "bhnd_nvram_datavar.h" #include "bhnd_nvram_data_tlvreg.h" /* * CFE TLV NVRAM data class. * * The CFE-defined TLV NVRAM format is used on the WGT634U. */ struct bhnd_nvram_tlv { struct bhnd_nvram_data nv; /**< common instance state */ struct bhnd_nvram_io *data; /**< backing buffer */ size_t count; /**< variable count */ }; BHND_NVRAM_DATA_CLASS_DEFN(tlv, "WGT634U", BHND_NVRAM_DATA_CAP_DEVPATHS, sizeof(struct bhnd_nvram_tlv)) /** Minimal TLV_ENV record header */ struct bhnd_nvram_tlv_env_hdr { uint8_t tag; uint8_t size; } __packed; /** Minimal TLV_ENV record */ struct bhnd_nvram_tlv_env { struct bhnd_nvram_tlv_env_hdr hdr; uint8_t flags; char envp[]; } __packed; /* Return the length in bytes of an TLV_ENV's envp data */ #define NVRAM_TLV_ENVP_DATA_LEN(_env) \ (((_env)->hdr.size < sizeof((_env)->flags)) ? 0 : \ ((_env)->hdr.size - sizeof((_env)->flags))) /* Maximum supported length of the envp data field, in bytes */ #define NVRAM_TLV_ENVP_DATA_MAX_LEN \ (UINT8_MAX - sizeof(uint8_t) /* flags */) static int bhnd_nvram_tlv_parse_size( struct bhnd_nvram_io *io, size_t *size); static int bhnd_nvram_tlv_next_record( struct bhnd_nvram_io *io, size_t *next, size_t *offset, uint8_t *tag); static struct bhnd_nvram_tlv_env *bhnd_nvram_tlv_next_env( struct bhnd_nvram_tlv *tlv, size_t *next, void **cookiep); static struct bhnd_nvram_tlv_env *bhnd_nvram_tlv_get_env( struct bhnd_nvram_tlv *tlv, void *cookiep); static void *bhnd_nvram_tlv_to_cookie( struct bhnd_nvram_tlv *tlv, size_t io_offset); static size_t bhnd_nvram_tlv_to_offset( struct bhnd_nvram_tlv *tlv, void *cookiep); static int bhnd_nvram_tlv_probe(struct bhnd_nvram_io *io) { struct bhnd_nvram_tlv_env ident; size_t nbytes; int error; nbytes = bhnd_nvram_io_getsize(io); /* Handle what might be an empty TLV image */ if (nbytes < sizeof(ident)) { uint8_t tag; /* Fetch just the first tag */ error = bhnd_nvram_io_read(io, 0x0, &tag, sizeof(tag)); if (error) return (error); /* This *could* be an empty TLV image, but all we're * testing for here is a single 0x0 byte followed by EOF */ if (tag == NVRAM_TLV_TYPE_END) return (BHND_NVRAM_DATA_PROBE_MAYBE); return (ENXIO); } /* Otherwise, look at the initial header for a valid TLV ENV tag, * plus one byte of the entry data */ error = bhnd_nvram_io_read(io, 0x0, &ident, sizeof(ident) + sizeof(ident.envp[0])); if (error) return (error); /* First entry should be a variable record (which we statically * assert as being defined to use a single byte size field) */ if (ident.hdr.tag != NVRAM_TLV_TYPE_ENV) return (ENXIO); _Static_assert(NVRAM_TLV_TYPE_ENV & NVRAM_TLV_TF_U8_LEN, "TYPE_ENV is not a U8-sized field"); /* The entry must be at least 3 characters ('x=\0') in length */ if (ident.hdr.size < 3) return (ENXIO); /* The first character should be a valid key char (alpha) */ if (!bhnd_nv_isalpha(ident.envp[0])) return (ENXIO); return (BHND_NVRAM_DATA_PROBE_DEFAULT); } static int bhnd_nvram_tlv_getvar_direct(struct bhnd_nvram_io *io, const char *name, void *buf, size_t *len, bhnd_nvram_type type) { struct bhnd_nvram_tlv_env env; char data[NVRAM_TLV_ENVP_DATA_MAX_LEN]; size_t data_len; const char *key, *value; size_t keylen, vlen; size_t namelen; size_t next, off; uint8_t tag; int error; namelen = strlen(name); /* Iterate over the input looking for the requested variable */ next = 0; while (!(error = bhnd_nvram_tlv_next_record(io, &next, &off, &tag))) { switch (tag) { case NVRAM_TLV_TYPE_END: /* Not found */ return (ENOENT); case NVRAM_TLV_TYPE_ENV: /* Read the record header */ error = bhnd_nvram_io_read(io, off, &env, sizeof(env)); if (error) { BHND_NV_LOG("error reading TLV_ENV record " "header: %d\n", error); return (error); } /* Read the record data */ data_len = NVRAM_TLV_ENVP_DATA_LEN(&env); error = bhnd_nvram_io_read(io, off + sizeof(env), data, data_len); if (error) { BHND_NV_LOG("error reading TLV_ENV record " "data: %d\n", error); return (error); } /* Parse the key=value string */ error = bhnd_nvram_parse_env(data, data_len, '=', &key, &keylen, &value, &vlen); if (error) { BHND_NV_LOG("error parsing TLV_ENV data: %d\n", error); return (error); } /* Match against requested variable name */ if (keylen == namelen && strncmp(key, name, namelen) == 0) { return (bhnd_nvram_value_coerce(value, vlen, BHND_NVRAM_TYPE_STRING, buf, len, type)); } break; default: /* Skip unknown tags */ break; } } /* Hit I/O error */ return (error); } static int bhnd_nvram_tlv_serialize(bhnd_nvram_data_class *cls, bhnd_nvram_plist *props, bhnd_nvram_plist *options, void *outp, size_t *olen) { bhnd_nvram_prop *prop; size_t limit, nbytes; int error; /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; nbytes = 0; /* Write all properties */ prop = NULL; while ((prop = bhnd_nvram_plist_next(props, prop)) != NULL) { struct bhnd_nvram_tlv_env env; const char *name; uint8_t *p; size_t name_len, value_len; size_t rec_size; env.hdr.tag = NVRAM_TLV_TYPE_ENV; env.hdr.size = sizeof(env.flags); env.flags = 0x0; /* Fetch name value and add to record length */ name = bhnd_nvram_prop_name(prop); name_len = strlen(name) + 1 /* '=' */; if (UINT8_MAX - env.hdr.size < name_len) { BHND_NV_LOG("%s name exceeds maximum TLV record " "length\n", name); return (EFTYPE); /* would overflow TLV size */ } env.hdr.size += name_len; /* Add string value to record length */ error = bhnd_nvram_prop_encode(prop, NULL, &value_len, BHND_NVRAM_TYPE_STRING); if (error) { BHND_NV_LOG("error serializing %s to required type " "%s: %d\n", name, bhnd_nvram_type_name(BHND_NVRAM_TYPE_STRING), error); return (error); } if (UINT8_MAX - env.hdr.size < value_len) { BHND_NV_LOG("%s value exceeds maximum TLV record " "length\n", name); return (EFTYPE); /* would overflow TLV size */ } env.hdr.size += value_len; /* Calculate total record size */ rec_size = sizeof(env.hdr) + env.hdr.size; if (SIZE_MAX - nbytes < rec_size) return (EFTYPE); /* would overflow size_t */ /* Calculate our output pointer */ if (nbytes > limit || limit - nbytes < rec_size) { /* buffer is full; cannot write */ p = NULL; } else { p = (uint8_t *)outp + nbytes; } /* Write to output */ if (p != NULL) { memcpy(p, &env, sizeof(env)); p += sizeof(env); memcpy(p, name, name_len - 1); p[name_len - 1] = '='; p += name_len; error = bhnd_nvram_prop_encode(prop, p, &value_len, BHND_NVRAM_TYPE_STRING); if (error) { BHND_NV_LOG("error serializing %s to required " "type %s: %d\n", name, bhnd_nvram_type_name( BHND_NVRAM_TYPE_STRING), error); return (error); } } nbytes += rec_size; } /* Write terminating END record */ if (limit > nbytes) *((uint8_t *)outp + nbytes) = NVRAM_TLV_TYPE_END; if (nbytes == SIZE_MAX) return (EFTYPE); /* would overflow size_t */ nbytes++; /* Provide required length */ *olen = nbytes; if (limit < *olen) { if (outp == NULL) return (0); return (ENOMEM); } return (0); } /** * Initialize @p tlv with the provided NVRAM TLV data mapped by @p src. * * @param tlv A newly allocated data instance. */ static int bhnd_nvram_tlv_init(struct bhnd_nvram_tlv *tlv, struct bhnd_nvram_io *src) { struct bhnd_nvram_tlv_env *env; size_t size; size_t next; int error; BHND_NV_ASSERT(tlv->data == NULL, ("tlv data already initialized")); /* Determine the actual size of the TLV source data */ if ((error = bhnd_nvram_tlv_parse_size(src, &size))) return (error); /* Copy to our own internal buffer */ if ((tlv->data = bhnd_nvram_iobuf_copy_range(src, 0x0, size)) == NULL) return (ENOMEM); /* Initialize our backing buffer */ tlv->count = 0; next = 0; while ((env = bhnd_nvram_tlv_next_env(tlv, &next, NULL)) != NULL) { size_t env_len; size_t name_len; /* TLV_ENV data must not be empty */ env_len = NVRAM_TLV_ENVP_DATA_LEN(env); if (env_len == 0) { BHND_NV_LOG("cannot parse zero-length TLV_ENV record " "data\n"); return (EINVAL); } /* Parse the key=value string, and then replace the '=' * delimiter with '\0' to allow us to provide direct * name pointers from our backing buffer */ error = bhnd_nvram_parse_env(env->envp, env_len, '=', NULL, &name_len, NULL, NULL); if (error) { BHND_NV_LOG("error parsing TLV_ENV data: %d\n", error); return (error); } /* Replace '=' with '\0' */ *(env->envp + name_len) = '\0'; /* Add to variable count */ tlv->count++; }; return (0); } static int bhnd_nvram_tlv_new(struct bhnd_nvram_data *nv, struct bhnd_nvram_io *io) { struct bhnd_nvram_tlv *tlv; int error; /* Allocate and initialize the TLV data instance */ tlv = (struct bhnd_nvram_tlv *)nv; /* Parse the TLV input data and initialize our backing * data representation */ if ((error = bhnd_nvram_tlv_init(tlv, io))) { bhnd_nvram_tlv_free(nv); return (error); } return (0); } static void bhnd_nvram_tlv_free(struct bhnd_nvram_data *nv) { struct bhnd_nvram_tlv *tlv = (struct bhnd_nvram_tlv *)nv; if (tlv->data != NULL) bhnd_nvram_io_free(tlv->data); } size_t bhnd_nvram_tlv_count(struct bhnd_nvram_data *nv) { struct bhnd_nvram_tlv *tlv = (struct bhnd_nvram_tlv *)nv; return (tlv->count); } static bhnd_nvram_plist * bhnd_nvram_tlv_options(struct bhnd_nvram_data *nv) { return (NULL); } static uint32_t bhnd_nvram_tlv_caps(struct bhnd_nvram_data *nv) { return (BHND_NVRAM_DATA_CAP_READ_PTR|BHND_NVRAM_DATA_CAP_DEVPATHS); } static const char * bhnd_nvram_tlv_next(struct bhnd_nvram_data *nv, void **cookiep) { struct bhnd_nvram_tlv *tlv; struct bhnd_nvram_tlv_env *env; size_t io_offset; tlv = (struct bhnd_nvram_tlv *)nv; /* Find next readable TLV record */ if (*cookiep == NULL) { /* Start search at offset 0x0 */ io_offset = 0x0; env = bhnd_nvram_tlv_next_env(tlv, &io_offset, cookiep); } else { /* Seek past the previous env record */ io_offset = bhnd_nvram_tlv_to_offset(tlv, *cookiep); env = bhnd_nvram_tlv_next_env(tlv, &io_offset, NULL); if (env == NULL) BHND_NV_PANIC("invalid cookiep; record missing"); /* Advance to next env record, update the caller's cookiep */ env = bhnd_nvram_tlv_next_env(tlv, &io_offset, cookiep); } /* Check for EOF */ if (env == NULL) return (NULL); /* Return the NUL terminated name */ return (env->envp); } static void * bhnd_nvram_tlv_find(struct bhnd_nvram_data *nv, const char *name) { return (bhnd_nvram_data_generic_find(nv, name)); } static int bhnd_nvram_tlv_getvar_order(struct bhnd_nvram_data *nv, void *cookiep1, void *cookiep2) { if (cookiep1 < cookiep2) return (-1); if (cookiep1 > cookiep2) return (1); return (0); } static int bhnd_nvram_tlv_getvar(struct bhnd_nvram_data *nv, void *cookiep, void *buf, size_t *len, bhnd_nvram_type type) { return (bhnd_nvram_data_generic_rp_getvar(nv, cookiep, buf, len, type)); } static int bhnd_nvram_tlv_copy_val(struct bhnd_nvram_data *nv, void *cookiep, bhnd_nvram_val **value) { return (bhnd_nvram_data_generic_rp_copy_val(nv, cookiep, value)); } static const void * bhnd_nvram_tlv_getvar_ptr(struct bhnd_nvram_data *nv, void *cookiep, size_t *len, bhnd_nvram_type *type) { struct bhnd_nvram_tlv *tlv; struct bhnd_nvram_tlv_env *env; const char *val; int error; tlv = (struct bhnd_nvram_tlv *)nv; /* Fetch pointer to the TLV_ENV record */ if ((env = bhnd_nvram_tlv_get_env(tlv, cookiep)) == NULL) BHND_NV_PANIC("invalid cookiep: %p", cookiep); /* Parse value pointer and length from key\0value data */ error = bhnd_nvram_parse_env(env->envp, NVRAM_TLV_ENVP_DATA_LEN(env), '\0', NULL, NULL, &val, len); if (error) BHND_NV_PANIC("unexpected error parsing '%s'", env->envp); /* Type is always CSTR */ *type = BHND_NVRAM_TYPE_STRING; return (val); } static const char * bhnd_nvram_tlv_getvar_name(struct bhnd_nvram_data *nv, void *cookiep) { struct bhnd_nvram_tlv *tlv; const struct bhnd_nvram_tlv_env *env; tlv = (struct bhnd_nvram_tlv *)nv; /* Fetch pointer to the TLV_ENV record */ if ((env = bhnd_nvram_tlv_get_env(tlv, cookiep)) == NULL) BHND_NV_PANIC("invalid cookiep: %p", cookiep); /* Return name pointer */ return (&env->envp[0]); } static int bhnd_nvram_tlv_filter_setvar(struct bhnd_nvram_data *nv, const char *name, bhnd_nvram_val *value, bhnd_nvram_val **result) { bhnd_nvram_val *str; const char *inp; bhnd_nvram_type itype; size_t ilen; size_t name_len, tlv_nremain; int error; tlv_nremain = NVRAM_TLV_ENVP_DATA_MAX_LEN; /* Name (trimmed of any path prefix) must be valid */ if (!bhnd_nvram_validate_name(bhnd_nvram_trim_path_name(name))) return (EINVAL); /* 'name=' must fit within the maximum TLV_ENV record length */ name_len = strlen(name) + 1; /* '=' */ if (tlv_nremain < name_len) { BHND_NV_LOG("'%s=' exceeds maximum TLV_ENV record length\n", name); return (EINVAL); } tlv_nremain -= name_len; /* Convert value to a (bcm-formatted) string */ error = bhnd_nvram_val_convert_new(&str, &bhnd_nvram_val_bcm_string_fmt, value, BHND_NVRAM_VAL_DYNAMIC); if (error) return (error); /* The string value must fit within remaining TLV_ENV record length */ inp = bhnd_nvram_val_bytes(str, &ilen, &itype); if (tlv_nremain < ilen) { BHND_NV_LOG("'%.*s\\0' exceeds maximum TLV_ENV record length\n", BHND_NV_PRINT_WIDTH(ilen), inp); bhnd_nvram_val_release(str); return (EINVAL); } tlv_nremain -= name_len; /* Success. Transfer result ownership to the caller. */ *result = str; return (0); } static int bhnd_nvram_tlv_filter_unsetvar(struct bhnd_nvram_data *nv, const char *name) { /* We permit deletion of any variable */ return (0); } /** * Iterate over the records starting at @p next, returning the parsed * record's @p tag, @p size, and @p offset. * * @param io The I/O context to parse. * @param[in,out] next The next offset to be parsed, or 0x0 * to begin parsing. Upon successful * return, will be set to the offset of the * next record (or EOF, if * NVRAM_TLV_TYPE_END was parsed). * @param[out] offset The record's value offset. * @param[out] tag The record's tag. * * @retval 0 success * @retval EINVAL if parsing @p io as TLV fails. * @retval non-zero if reading @p io otherwise fails, a regular unix error * code will be returned. */ static int bhnd_nvram_tlv_next_record(struct bhnd_nvram_io *io, size_t *next, size_t *offset, uint8_t *tag) { size_t io_offset, io_size; uint16_t parsed_len; uint8_t len_hdr[2]; int error; io_offset = *next; io_size = bhnd_nvram_io_getsize(io); /* Save the record offset */ if (offset != NULL) *offset = io_offset; /* Fetch initial tag */ error = bhnd_nvram_io_read(io, io_offset, tag, sizeof(*tag)); if (error) return (error); io_offset++; /* EOF */ if (*tag == NVRAM_TLV_TYPE_END) { *next = io_offset; return (0); } /* Read length field */ if (*tag & NVRAM_TLV_TF_U8_LEN) { error = bhnd_nvram_io_read(io, io_offset, &len_hdr, sizeof(len_hdr[0])); if (error) { BHND_NV_LOG("error reading TLV record size: %d\n", error); return (error); } parsed_len = len_hdr[0]; io_offset++; } else { error = bhnd_nvram_io_read(io, io_offset, &len_hdr, sizeof(len_hdr)); if (error) { BHND_NV_LOG("error reading 16-bit TLV record " "size: %d\n", error); return (error); } parsed_len = (len_hdr[0] << 8) | len_hdr[1]; io_offset += 2; } /* Advance to next record */ if (parsed_len > io_size || io_size - parsed_len < io_offset) { /* Hit early EOF */ BHND_NV_LOG("TLV record length %hu truncated by input " "size of %zu\n", parsed_len, io_size); return (EINVAL); } *next = io_offset + parsed_len; /* Valid record found */ return (0); } /** * Parse the TLV data in @p io to determine the total size of the TLV * data mapped by @p io (which may be less than the size of @p io). */ static int bhnd_nvram_tlv_parse_size(struct bhnd_nvram_io *io, size_t *size) { size_t next; uint8_t tag; int error; /* We have to perform a minimal parse to determine the actual length */ next = 0x0; *size = 0x0; /* Iterate over the input until we hit END tag or the read fails */ do { error = bhnd_nvram_tlv_next_record(io, &next, NULL, &tag); if (error) return (error); } while (tag != NVRAM_TLV_TYPE_END); /* Offset should now point to EOF */ BHND_NV_ASSERT(next <= bhnd_nvram_io_getsize(io), ("parse returned invalid EOF offset")); *size = next; return (0); } /** * Iterate over the records in @p tlv, returning a pointer to the next * NVRAM_TLV_TYPE_ENV record, or NULL if EOF is reached. * * @param tlv The TLV instance. * @param[in,out] next The next offset to be parsed, or 0x0 * to begin parsing. Upon successful * return, will be set to the offset of the * next record. */ static struct bhnd_nvram_tlv_env * bhnd_nvram_tlv_next_env(struct bhnd_nvram_tlv *tlv, size_t *next, void **cookiep) { uint8_t tag; int error; /* Find the next TLV_ENV record, starting at @p next */ do { void *c; size_t offset; /* Fetch the next TLV record */ error = bhnd_nvram_tlv_next_record(tlv->data, next, &offset, &tag); if (error) { BHND_NV_LOG("unexpected error in next_record(): %d\n", error); return (NULL); } /* Only interested in ENV records */ if (tag != NVRAM_TLV_TYPE_ENV) continue; /* Map and return TLV_ENV record pointer */ c = bhnd_nvram_tlv_to_cookie(tlv, offset); /* Provide the cookiep value for the returned record */ if (cookiep != NULL) *cookiep = c; return (bhnd_nvram_tlv_get_env(tlv, c)); } while (tag != NVRAM_TLV_TYPE_END); /* No remaining ENV records */ return (NULL); } /** * Return a pointer to the TLV_ENV record for @p cookiep, or NULL * if none vailable. */ static struct bhnd_nvram_tlv_env * bhnd_nvram_tlv_get_env(struct bhnd_nvram_tlv *tlv, void *cookiep) { struct bhnd_nvram_tlv_env *env; void *ptr; size_t navail; size_t io_offset, io_size; int error; io_size = bhnd_nvram_io_getsize(tlv->data); io_offset = bhnd_nvram_tlv_to_offset(tlv, cookiep); /* At EOF? */ if (io_offset == io_size) return (NULL); /* Fetch non-const pointer to the record entry */ error = bhnd_nvram_io_write_ptr(tlv->data, io_offset, &ptr, sizeof(env->hdr), &navail); if (error) { /* Should never occur with a valid cookiep */ BHND_NV_LOG("error mapping record for cookiep: %d\n", error); return (NULL); } /* Validate the record pointer */ env = ptr; if (env->hdr.tag != NVRAM_TLV_TYPE_ENV) { /* Should never occur with a valid cookiep */ BHND_NV_LOG("non-ENV record mapped for %p\n", cookiep); return (NULL); } /* Is the required variable name data is mapped? */ if (navail < sizeof(struct bhnd_nvram_tlv_env_hdr) + env->hdr.size || env->hdr.size == sizeof(env->flags)) { /* Should never occur with a valid cookiep */ BHND_NV_LOG("TLV_ENV variable data not mapped for %p\n", cookiep); return (NULL); } return (env); } /** * Return a cookiep for the given I/O offset. */ static void * bhnd_nvram_tlv_to_cookie(struct bhnd_nvram_tlv *tlv, size_t io_offset) { const void *ptr; int error; BHND_NV_ASSERT(io_offset < bhnd_nvram_io_getsize(tlv->data), ("io_offset %zu out-of-range", io_offset)); BHND_NV_ASSERT(io_offset < UINTPTR_MAX, ("io_offset %#zx exceeds UINTPTR_MAX", io_offset)); error = bhnd_nvram_io_read_ptr(tlv->data, 0x0, &ptr, io_offset, NULL); if (error) BHND_NV_PANIC("error mapping offset %zu: %d", io_offset, error); ptr = (const uint8_t *)ptr + io_offset; return (__DECONST(void *, ptr)); } /* Convert a cookiep back to an I/O offset */ static size_t bhnd_nvram_tlv_to_offset(struct bhnd_nvram_tlv *tlv, void *cookiep) { const void *ptr; intptr_t offset; size_t io_size; int error; BHND_NV_ASSERT(cookiep != NULL, ("null cookiep")); io_size = bhnd_nvram_io_getsize(tlv->data); error = bhnd_nvram_io_read_ptr(tlv->data, 0x0, &ptr, io_size, NULL); if (error) BHND_NV_PANIC("error mapping offset %zu: %d", io_size, error); offset = (const uint8_t *)cookiep - (const uint8_t *)ptr; BHND_NV_ASSERT(offset >= 0, ("invalid cookiep")); BHND_NV_ASSERT((uintptr_t)offset < SIZE_MAX, ("cookiep > SIZE_MAX)")); BHND_NV_ASSERT((uintptr_t)offset <= io_size, ("cookiep > io_size)")); return ((size_t)offset); } Index: head/sys/dev/bhnd/nvram/bhnd_nvram_store.c =================================================================== --- head/sys/dev/bhnd/nvram/bhnd_nvram_store.c (revision 350420) +++ head/sys/dev/bhnd/nvram/bhnd_nvram_store.c (revision 350421) @@ -1,1268 +1,1269 @@ /*- * Copyright (c) 2015-2016 Landon Fuller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #include #include +#include #include #ifdef _KERNEL #include #include #include #else /* !_KERNEL */ #include #include #include #include #include #include #include #include #endif /* _KERNEL */ #include "bhnd_nvram_private.h" #include "bhnd_nvram_datavar.h" #include "bhnd_nvram_storevar.h" /* * BHND NVRAM Store * * Manages in-memory and persistent representations of NVRAM data. */ static int bhnd_nvstore_parse_data( struct bhnd_nvram_store *sc); static int bhnd_nvstore_parse_path_entries( struct bhnd_nvram_store *sc); static int bhnd_nvram_store_export_child( struct bhnd_nvram_store *sc, bhnd_nvstore_path *top, bhnd_nvstore_path *child, bhnd_nvram_plist *plist, uint32_t flags); static int bhnd_nvstore_export_merge( struct bhnd_nvram_store *sc, bhnd_nvstore_path *path, bhnd_nvram_plist *merged, uint32_t flags); static int bhnd_nvstore_export_devpath_alias( struct bhnd_nvram_store *sc, bhnd_nvstore_path *path, const char *devpath, bhnd_nvram_plist *plist, u_long *alias_val); /** * Allocate and initialize a new NVRAM data store instance. * * The caller is responsible for deallocating the instance via * bhnd_nvram_store_free(). * * @param[out] store On success, a pointer to the newly allocated NVRAM data * instance. * @param data The NVRAM data to be managed by the returned NVRAM data store * instance. * * @retval 0 success * @retval non-zero if an error occurs during allocation or initialization, a * regular unix error code will be returned. */ int bhnd_nvram_store_new(struct bhnd_nvram_store **store, struct bhnd_nvram_data *data) { struct bhnd_nvram_store *sc; int error; /* Allocate new instance */ sc = bhnd_nv_calloc(1, sizeof(*sc)); if (sc == NULL) return (ENOMEM); BHND_NVSTORE_LOCK_INIT(sc); BHND_NVSTORE_LOCK(sc); /* Initialize path hash table */ sc->num_paths = 0; for (size_t i = 0; i < nitems(sc->paths); i++) LIST_INIT(&sc->paths[i]); /* Initialize alias hash table */ sc->num_aliases = 0; for (size_t i = 0; i < nitems(sc->aliases); i++) LIST_INIT(&sc->aliases[i]); /* Retain the NVRAM data */ sc->data = bhnd_nvram_data_retain(data); sc->data_caps = bhnd_nvram_data_caps(data); sc->data_opts = bhnd_nvram_data_options(data); if (sc->data_opts != NULL) { bhnd_nvram_plist_retain(sc->data_opts); } else { sc->data_opts = bhnd_nvram_plist_new(); if (sc->data_opts == NULL) { error = ENOMEM; goto cleanup; } } /* Register required root path */ error = bhnd_nvstore_register_path(sc, BHND_NVSTORE_ROOT_PATH, BHND_NVSTORE_ROOT_PATH_LEN); if (error) goto cleanup; sc->root_path = bhnd_nvstore_get_path(sc, BHND_NVSTORE_ROOT_PATH, BHND_NVSTORE_ROOT_PATH_LEN); BHND_NV_ASSERT(sc->root_path, ("missing root path")); /* Parse all variables vended by our backing NVRAM data instance, * generating all path entries, alias entries, and variable indexes */ if ((error = bhnd_nvstore_parse_data(sc))) goto cleanup; *store = sc; BHND_NVSTORE_UNLOCK(sc); return (0); cleanup: BHND_NVSTORE_UNLOCK(sc); bhnd_nvram_store_free(sc); return (error); } /** * Allocate and initialize a new NVRAM data store instance, parsing the * NVRAM data from @p io. * * The caller is responsible for deallocating the instance via * bhnd_nvram_store_free(). * * The NVRAM data mapped by @p io will be copied, and @p io may be safely * deallocated after bhnd_nvram_store_new() returns. * * @param[out] store On success, a pointer to the newly allocated NVRAM data * instance. * @param io An I/O context mapping the NVRAM data to be copied and parsed. * @param cls The NVRAM data class to be used when parsing @p io, or NULL * to perform runtime identification of the appropriate data class. * * @retval 0 success * @retval non-zero if an error occurs during allocation or initialization, a * regular unix error code will be returned. */ int bhnd_nvram_store_parse_new(struct bhnd_nvram_store **store, struct bhnd_nvram_io *io, bhnd_nvram_data_class *cls) { struct bhnd_nvram_data *data; int error; /* Try to parse the data */ if ((error = bhnd_nvram_data_new(cls, &data, io))) return (error); /* Try to create our new store instance */ error = bhnd_nvram_store_new(store, data); bhnd_nvram_data_release(data); return (error); } /** * Free an NVRAM store instance, releasing all associated resources. * * @param sc A store instance previously allocated via * bhnd_nvram_store_new(). */ void bhnd_nvram_store_free(struct bhnd_nvram_store *sc) { /* Clean up alias hash table */ for (size_t i = 0; i < nitems(sc->aliases); i++) { bhnd_nvstore_alias *alias, *anext; LIST_FOREACH_SAFE(alias, &sc->aliases[i], na_link, anext) bhnd_nv_free(alias); } /* Clean up path hash table */ for (size_t i = 0; i < nitems(sc->paths); i++) { bhnd_nvstore_path *path, *pnext; LIST_FOREACH_SAFE(path, &sc->paths[i], np_link, pnext) bhnd_nvstore_path_free(path); } if (sc->data != NULL) bhnd_nvram_data_release(sc->data); if (sc->data_opts != NULL) bhnd_nvram_plist_release(sc->data_opts); BHND_NVSTORE_LOCK_DESTROY(sc); bhnd_nv_free(sc); } /** * Parse all variables vended by our backing NVRAM data instance, * generating all path entries, alias entries, and variable indexes. * * @param sc The NVRAM store instance to be initialized with * paths, aliases, and data parsed from its backing * data. * * @retval 0 success * @retval non-zero if an error occurs during parsing, a regular unix error * code will be returned. */ static int bhnd_nvstore_parse_data(struct bhnd_nvram_store *sc) { const char *name; void *cookiep; int error; /* Parse and register all device paths and path aliases. This enables * resolution of _forward_ references to device paths aliases when * scanning variable entries below */ if ((error = bhnd_nvstore_parse_path_entries(sc))) return (error); /* Calculate the per-path variable counts, and report dangling alias * references as an error. */ cookiep = NULL; while ((name = bhnd_nvram_data_next(sc->data, &cookiep))) { bhnd_nvstore_path *path; bhnd_nvstore_name_info info; /* Parse the name info */ error = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_INTERNAL, sc->data_caps, &info); if (error) return (error); switch (info.type) { case BHND_NVSTORE_VAR: /* Fetch referenced path */ path = bhnd_nvstore_var_get_path(sc, &info); if (path == NULL) { BHND_NV_LOG("variable '%s' has dangling " "path reference\n", name); return (EFTYPE); } /* Increment path variable count */ if (path->num_vars == SIZE_MAX) { BHND_NV_LOG("more than SIZE_MAX variables in " "path %s\n", path->path_str); return (EFTYPE); } path->num_vars++; break; case BHND_NVSTORE_ALIAS_DECL: /* Skip -- path alias already parsed and recorded */ break; } } /* If the backing NVRAM data instance vends only a single root ("/") * path, we may be able to skip generating an index for the root * path */ if (sc->num_paths == 1) { bhnd_nvstore_path *path; /* If the backing instance provides its own name-based lookup * indexing, we can skip generating a duplicate here */ if (sc->data_caps & BHND_NVRAM_DATA_CAP_INDEXED) return (0); /* If the sole root path contains fewer variables than the * minimum indexing threshhold, we do not need to generate an * index */ path = bhnd_nvstore_get_root_path(sc); if (path->num_vars < BHND_NV_IDX_VAR_THRESHOLD) return (0); } /* Allocate per-path index instances */ for (size_t i = 0; i < nitems(sc->paths); i++) { bhnd_nvstore_path *path; LIST_FOREACH(path, &sc->paths[i], np_link) { path->index = bhnd_nvstore_index_new(path->num_vars); if (path->index == NULL) return (ENOMEM); } } /* Populate per-path indexes */ cookiep = NULL; while ((name = bhnd_nvram_data_next(sc->data, &cookiep))) { bhnd_nvstore_name_info info; bhnd_nvstore_path *path; /* Parse the name info */ error = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_INTERNAL, sc->data_caps, &info); if (error) return (error); switch (info.type) { case BHND_NVSTORE_VAR: /* Fetch referenced path */ path = bhnd_nvstore_var_get_path(sc, &info); BHND_NV_ASSERT(path != NULL, ("dangling path reference")); /* Append to index */ error = bhnd_nvstore_index_append(sc, path->index, cookiep); if (error) return (error); break; case BHND_NVSTORE_ALIAS_DECL: /* Skip */ break; } } /* Prepare indexes for querying */ for (size_t i = 0; i < nitems(sc->paths); i++) { bhnd_nvstore_path *path; LIST_FOREACH(path, &sc->paths[i], np_link) { error = bhnd_nvstore_index_prepare(sc, path->index); if (error) return (error); } } return (0); } /** * Parse and register path and path alias entries for all declarations found in * the NVRAM data backing @p nvram. * * @param sc The NVRAM store instance. * * @retval 0 success * @retval non-zero If parsing fails, a regular unix error code will be * returned. */ static int bhnd_nvstore_parse_path_entries(struct bhnd_nvram_store *sc) { const char *name; void *cookiep; int error; BHND_NVSTORE_LOCK_ASSERT(sc, MA_OWNED); /* Skip path registration if the data source does not support device * paths. */ if (!(sc->data_caps & BHND_NVRAM_DATA_CAP_DEVPATHS)) { BHND_NV_ASSERT(sc->root_path != NULL, ("missing root path")); return (0); } /* Otherwise, parse and register all paths and path aliases */ cookiep = NULL; while ((name = bhnd_nvram_data_next(sc->data, &cookiep))) { bhnd_nvstore_name_info info; /* Parse the name info */ error = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_INTERNAL, sc->data_caps, &info); if (error) return (error); /* Register the path */ error = bhnd_nvstore_var_register_path(sc, &info, cookiep); if (error) { BHND_NV_LOG("failed to register path for %s: %d\n", name, error); return (error); } } return (0); } /** * Merge exported per-path variables (uncommitted, committed, or both) into * the empty @p merged property list. * * @param sc The NVRAM store instance. * @param path The NVRAM path to be exported. * @param merged The property list to populate with the merged results. * @param flags Export flags. See BHND_NVSTORE_EXPORT_*. * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval non-zero If merging the variables defined in @p path otherwise * fails, a regular unix error code will be returned. */ static int bhnd_nvstore_export_merge(struct bhnd_nvram_store *sc, bhnd_nvstore_path *path, bhnd_nvram_plist *merged, uint32_t flags) { void *cookiep, *idxp; int error; /* Populate merged list with all pending variables */ if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_UNCOMMITTED)) { bhnd_nvram_prop *prop; prop = NULL; while ((prop = bhnd_nvram_plist_next(path->pending, prop))) { /* Skip variables marked for deletion */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_DELETED)) { if (bhnd_nvram_prop_is_null(prop)) continue; } /* Append to merged list */ error = bhnd_nvram_plist_append(merged, prop); if (error) return (error); } } /* Skip merging committed variables? */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMMITTED)) return (0); /* Merge in the committed NVRAM variables */ idxp = NULL; while ((cookiep = bhnd_nvstore_path_data_next(sc, path, &idxp))) { const char *name; bhnd_nvram_val *val; /* Fetch the variable name */ name = bhnd_nvram_data_getvar_name(sc->data, cookiep); /* Trim device path prefix */ if (sc->data_caps & BHND_NVRAM_DATA_CAP_DEVPATHS) name = bhnd_nvram_trim_path_name(name); /* Skip if already defined in pending updates */ if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_UNCOMMITTED)) { if (bhnd_nvram_plist_contains(path->pending, name)) continue; } /* Skip if higher precedence value was already defined. This * may occur if the underlying data store contains duplicate * keys; iteration will always return the definition with * the highest precedence first */ if (bhnd_nvram_plist_contains(merged, name)) continue; /* Fetch the variable's value representation */ if ((error = bhnd_nvram_data_copy_val(sc->data, cookiep, &val))) return (error); /* Add to path variable list */ error = bhnd_nvram_plist_append_val(merged, name, val); bhnd_nvram_val_release(val); if (error) return (error); } return (0); } /** * Find a free alias value for @p path, and append the devpathXX alias * declaration to @p plist. * * @param sc The NVRAM store instance. * @param path The NVRAM path for which a devpath alias * variable should be produced. * @param devpath The devpathXX path value for @p path. * @param plist The property list to which @p path's devpath * variable will be appended. * @param[out] alias_val On success, will be set to the alias value * allocated for @p path. * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval non-zero If merging the variables defined in @p path otherwise * fails, a regular unix error code will be returned. */ static int bhnd_nvstore_export_devpath_alias(struct bhnd_nvram_store *sc, bhnd_nvstore_path *path, const char *devpath, bhnd_nvram_plist *plist, u_long *alias_val) { bhnd_nvstore_alias *alias; char *pathvar; int error; *alias_val = 0; /* Prefer alias value already reserved for this path. */ alias = bhnd_nvstore_find_alias(sc, path->path_str); if (alias != NULL) { *alias_val = alias->alias; /* Allocate devpathXX variable name */ bhnd_nv_asprintf(&pathvar, "devpath%lu", *alias_val); if (pathvar == NULL) return (ENOMEM); /* Append alias variable to property list */ error = bhnd_nvram_plist_append_string(plist, pathvar, devpath); BHND_NV_ASSERT(error != EEXIST, ("reserved alias %lu:%s in use", * alias_val, path->path_str)); bhnd_nv_free(pathvar); return (error); } /* Find the next free devpathXX alias entry */ while (1) { /* Skip existing reserved alias values */ while (bhnd_nvstore_get_alias(sc, *alias_val) != NULL) { if (*alias_val == ULONG_MAX) return (ENOMEM); (*alias_val)++; } /* Allocate devpathXX variable name */ bhnd_nv_asprintf(&pathvar, "devpath%lu", *alias_val); if (pathvar == NULL) return (ENOMEM); /* If not in-use, we can terminate the search */ if (!bhnd_nvram_plist_contains(plist, pathvar)) break; /* Keep searching */ bhnd_nv_free(pathvar); if (*alias_val == ULONG_MAX) return (ENOMEM); (*alias_val)++; } /* Append alias variable to property list */ error = bhnd_nvram_plist_append_string(plist, pathvar, devpath); bhnd_nv_free(pathvar); return (error); } /** * Export a single @p child path's properties, appending the result to @p plist. * * @param sc The NVRAM store instance. * @param top The root NVRAM path being exported. * @param child The NVRAM path to be exported. * @param plist The property list to which @p child's exported * properties should be appended. * @param flags Export flags. See BHND_NVSTORE_EXPORT_*. * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval non-zero If merging the variables defined in @p path otherwise * fails, a regular unix error code will be returned. */ static int bhnd_nvram_store_export_child(struct bhnd_nvram_store *sc, bhnd_nvstore_path *top, bhnd_nvstore_path *child, bhnd_nvram_plist *plist, uint32_t flags) { bhnd_nvram_plist *path_vars; bhnd_nvram_prop *prop; const char *relpath; char *prefix, *namebuf; size_t prefix_len, relpath_len; size_t namebuf_size, num_props; bool emit_compact_devpath; int error; BHND_NVSTORE_LOCK_ASSERT(sc, MA_OWNED); prefix = NULL; num_props = 0; path_vars = NULL; namebuf = NULL; /* Determine the path relative to the top-level path */ relpath = bhnd_nvstore_parse_relpath(top->path_str, child->path_str); if (relpath == NULL) { /* Skip -- not a child of the root path */ return (0); } relpath_len = strlen(relpath); /* Skip sub-path if export of children was not requested, */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_CHILDREN) && relpath_len > 0) return (0); /* Collect all variables to be included in the export */ if ((path_vars = bhnd_nvram_plist_new()) == NULL) return (ENOMEM); if ((error = bhnd_nvstore_export_merge(sc, child, path_vars, flags))) { bhnd_nvram_plist_release(path_vars); return (error); } /* Skip if no children are to be exported */ if (bhnd_nvram_plist_count(path_vars) == 0) { bhnd_nvram_plist_release(path_vars); return (0); } /* Determine appropriate device path encoding */ emit_compact_devpath = false; if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMPACT_DEVPATHS)) { /* Re-encode as compact (if non-empty path) */ if (relpath_len > 0) emit_compact_devpath = true; } else if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_EXPAND_DEVPATHS)) { /* Re-encode with fully expanded device path */ emit_compact_devpath = false; } else if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_PRESERVE_DEVPATHS)) { /* Preserve existing encoding of this path */ if (bhnd_nvstore_find_alias(sc, child->path_str) != NULL) emit_compact_devpath = true; } else { BHND_NV_LOG("invalid device path flag: %#" PRIx32, flags); error = EINVAL; goto finished; } /* Allocate variable device path prefix to use for all property names, * and if using compact encoding, emit the devpathXX= variable */ prefix = NULL; prefix_len = 0; if (emit_compact_devpath) { u_long alias_val; int len; /* Reserve an alias value and append the devpathXX= variable to * the property list */ error = bhnd_nvstore_export_devpath_alias(sc, child, relpath, plist, &alias_val); if (error) goto finished; /* Allocate variable name prefix */ len = bhnd_nv_asprintf(&prefix, "%lu:", alias_val); if (prefix == NULL) { error = ENOMEM; goto finished; } prefix_len = len; } else if (relpath_len > 0) { int len; /* Allocate the variable name prefix, appending '/' to the * relative path */ len = bhnd_nv_asprintf(&prefix, "%s/", relpath); if (prefix == NULL) { error = ENOMEM; goto finished; } prefix_len = len; } /* If prefixing of variable names is required, allocate a name * formatting buffer */ namebuf_size = 0; if (prefix != NULL) { size_t maxlen; /* Find the maximum name length */ maxlen = 0; prop = NULL; while ((prop = bhnd_nvram_plist_next(path_vars, prop))) { const char *name; name = bhnd_nvram_prop_name(prop); maxlen = bhnd_nv_ummax(strlen(name), maxlen); } /* Allocate name buffer (path-prefix + name + '\0') */ namebuf_size = prefix_len + maxlen + 1; namebuf = bhnd_nv_malloc(namebuf_size); if (namebuf == NULL) { error = ENOMEM; goto finished; } } /* Append all path variables to the export plist, prepending the * device-path prefix to the variable names, if required */ prop = NULL; while ((prop = bhnd_nvram_plist_next(path_vars, prop)) != NULL) { const char *name; /* Prepend device prefix to the variable name */ name = bhnd_nvram_prop_name(prop); if (prefix != NULL) { int len; /* * Write prefixed variable name to our name buffer. * * We precalcuate the size when scanning all names * above, so this should always succeed. */ len = snprintf(namebuf, namebuf_size, "%s%s", prefix, name); if (len < 0 || (size_t)len >= namebuf_size) BHND_NV_PANIC("invalid max_name_len"); name = namebuf; } /* Add property to export plist */ error = bhnd_nvram_plist_append_val(plist, name, bhnd_nvram_prop_val(prop)); if (error) goto finished; } /* Success */ error = 0; finished: if (prefix != NULL) bhnd_nv_free(prefix); if (namebuf != NULL) bhnd_nv_free(namebuf); if (path_vars != NULL) bhnd_nvram_plist_release(path_vars); return (error); } /** * Export a flat, ordered NVRAM property list representation of all NVRAM * properties at @p path. * * @param sc The NVRAM store instance. * @param path The NVRAM path to export, or NULL to select the root * path. * @param[out] cls On success, will be set to the backing data class * of @p sc. If the data class is are not desired, * a NULL pointer may be provided. * @param[out] props On success, will be set to a caller-owned property * list containing the exported properties. The caller is * responsible for releasing this value via * bhnd_nvram_plist_release(). * @param[out] options On success, will be set to a caller-owned property * list containing the current NVRAM serialization options * for @p sc. The caller is responsible for releasing this * value via bhnd_nvram_plist_release(). * @param flags Export flags. See BHND_NVSTORE_EXPORT_*. * * @retval 0 success * @retval EINVAL If @p flags is invalid. * @retval ENOENT The requested path was not found. * @retval ENOMEM If allocation fails. * @retval non-zero If export of @p path otherwise fails, a regular unix * error code will be returned. */ int bhnd_nvram_store_export(struct bhnd_nvram_store *sc, const char *path, bhnd_nvram_data_class **cls, bhnd_nvram_plist **props, bhnd_nvram_plist **options, uint32_t flags) { bhnd_nvram_plist *unordered; bhnd_nvstore_path *top; bhnd_nvram_prop *prop; const char *name; void *cookiep; size_t num_dpath_flags; int error; *props = NULL; unordered = NULL; num_dpath_flags = 0; if (options != NULL) *options = NULL; /* Default to exporting root path */ if (path == NULL) path = BHND_NVSTORE_ROOT_PATH; /* Default to exporting all properties */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMMITTED) && !BHND_NVSTORE_GET_FLAG(flags, EXPORT_UNCOMMITTED)) { flags |= BHND_NVSTORE_EXPORT_ALL_VARS; } /* Default to preserving the current device path encoding */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMPACT_DEVPATHS) && !BHND_NVSTORE_GET_FLAG(flags, EXPORT_EXPAND_DEVPATHS)) { flags |= BHND_NVSTORE_EXPORT_PRESERVE_DEVPATHS; } /* Exactly one device path encoding flag must be set */ if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMPACT_DEVPATHS)) num_dpath_flags++; if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_EXPAND_DEVPATHS)) num_dpath_flags++; if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_PRESERVE_DEVPATHS)) num_dpath_flags++; if (num_dpath_flags != 1) return (EINVAL); /* If EXPORT_DELETED is set, EXPORT_UNCOMMITTED must be set too */ if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_DELETED) && !BHND_NVSTORE_GET_FLAG(flags, EXPORT_DELETED)) { return (EINVAL); } /* Lock internal state before querying paths/properties */ BHND_NVSTORE_LOCK(sc); /* Fetch referenced path */ top = bhnd_nvstore_get_path(sc, path, strlen(path)); if (top == NULL) { error = ENOENT; goto failed; } /* Allocate new, empty property list */ if ((unordered = bhnd_nvram_plist_new()) == NULL) { error = ENOMEM; goto failed; } /* Export the top-level path first */ error = bhnd_nvram_store_export_child(sc, top, top, unordered, flags); if (error) goto failed; /* Attempt to export any children of the root path */ for (size_t i = 0; i < nitems(sc->paths); i++) { bhnd_nvstore_path *child; LIST_FOREACH(child, &sc->paths[i], np_link) { /* Top-level path was already exported */ if (child == top) continue; error = bhnd_nvram_store_export_child(sc, top, child, unordered, flags); if (error) goto failed; } } /* If requested, provide the current class and serialization options */ if (cls != NULL) *cls = bhnd_nvram_data_get_class(sc->data); if (options != NULL) *options = bhnd_nvram_plist_retain(sc->data_opts); /* * If we're re-encoding device paths, don't bother preserving the * existing NVRAM variable order; our variable names will not match * the existing backing NVRAM data. */ if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_PRESERVE_DEVPATHS)) { *props = unordered; unordered = NULL; goto finished; } /* * Re-order the flattened output to match the existing NVRAM variable * ordering. * * We append all new variables at the end of the input; this should * reduce the delta that needs to be written (e.g. to flash) when * committing NVRAM updates, and should result in a serialization * identical to the input serialization if uncommitted updates are * excluded from the export. */ if ((*props = bhnd_nvram_plist_new()) == NULL) { error = ENOMEM; goto failed; } /* Using the backing NVRAM data ordering to order all variables * currently defined in the backing store */ cookiep = NULL; while ((name = bhnd_nvram_data_next(sc->data, &cookiep))) { prop = bhnd_nvram_plist_get_prop(unordered, name); if (prop == NULL) continue; /* Append to ordered result */ if ((error = bhnd_nvram_plist_append(*props, prop))) goto failed; /* Remove from unordered list */ bhnd_nvram_plist_remove(unordered, name); } /* Any remaining variables are new, and should be appended to the * end of the export list */ prop = NULL; while ((prop = bhnd_nvram_plist_next(unordered, prop)) != NULL) { if ((error = bhnd_nvram_plist_append(*props, prop))) goto failed; } /* Export complete */ finished: BHND_NVSTORE_UNLOCK(sc); if (unordered != NULL) bhnd_nvram_plist_release(unordered); return (0); failed: BHND_NVSTORE_UNLOCK(sc); if (unordered != NULL) bhnd_nvram_plist_release(unordered); if (options != NULL && *options != NULL) bhnd_nvram_plist_release(*options); if (*props != NULL) bhnd_nvram_plist_release(*props); return (error); } /** * Encode all NVRAM properties at @p path, using the @p store's current NVRAM * data format. * * @param sc The NVRAM store instance. * @param path The NVRAM path to export, or NULL to select the root * path. * @param[out] data On success, will be set to the newly serialized value. * The caller is responsible for freeing this value * via bhnd_nvram_io_free(). * @param flags Export flags. See BHND_NVSTORE_EXPORT_*. * * @retval 0 success * @retval EINVAL If @p flags is invalid. * @retval ENOENT The requested path was not found. * @retval ENOMEM If allocation fails. * @retval non-zero If serialization of @p path otherwise fails, a regular * unix error code will be returned. */ int bhnd_nvram_store_serialize(struct bhnd_nvram_store *sc, const char *path, struct bhnd_nvram_io **data, uint32_t flags) { bhnd_nvram_plist *props; bhnd_nvram_plist *options; bhnd_nvram_data_class *cls; struct bhnd_nvram_io *io; void *outp; size_t olen; int error; props = NULL; options = NULL; io = NULL; /* Perform requested export */ error = bhnd_nvram_store_export(sc, path, &cls, &props, &options, flags); if (error) return (error); /* Determine serialized size */ error = bhnd_nvram_data_serialize(cls, props, options, NULL, &olen); if (error) goto failed; /* Allocate output buffer */ if ((io = bhnd_nvram_iobuf_empty(olen, olen)) == NULL) { error = ENOMEM; goto failed; } /* Fetch write pointer */ if ((error = bhnd_nvram_io_write_ptr(io, 0, &outp, olen, NULL))) goto failed; /* Perform serialization */ error = bhnd_nvram_data_serialize(cls, props, options, outp, &olen); if (error) goto failed; if ((error = bhnd_nvram_io_setsize(io, olen))) goto failed; /* Success */ bhnd_nvram_plist_release(props); bhnd_nvram_plist_release(options); *data = io; return (0); failed: if (props != NULL) bhnd_nvram_plist_release(props); if (options != NULL) bhnd_nvram_plist_release(options); if (io != NULL) bhnd_nvram_io_free(io); return (error); } /** * Read an NVRAM variable. * * @param sc The NVRAM parser state. * @param name The NVRAM variable name. * @param[out] outp On success, the requested value will be written * to this buffer. This argment may be NULL if * the value is not desired. * @param[in,out] olen The capacity of @p outp. On success, will be set * to the actual size of the requested value. * @param otype The requested data type to be written to * @p outp. * * @retval 0 success * @retval ENOENT The requested variable was not found. * @retval ENOMEM If @p outp is non-NULL and a buffer of @p olen is too * small to hold the requested value. * @retval non-zero If reading @p name otherwise fails, a regular unix * error code will be returned. */ int bhnd_nvram_store_getvar(struct bhnd_nvram_store *sc, const char *name, void *outp, size_t *olen, bhnd_nvram_type otype) { bhnd_nvstore_name_info info; bhnd_nvstore_path *path; bhnd_nvram_prop *prop; void *cookiep; int error; BHND_NVSTORE_LOCK(sc); /* Parse the variable name */ error = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_EXTERNAL, sc->data_caps, &info); if (error) goto finished; /* Fetch the variable's enclosing path entry */ if ((path = bhnd_nvstore_var_get_path(sc, &info)) == NULL) { error = ENOENT; goto finished; } /* Search uncommitted updates first */ prop = bhnd_nvstore_path_get_update(sc, path, info.name); if (prop != NULL) { if (bhnd_nvram_prop_is_null(prop)) { /* NULL denotes a pending deletion */ error = ENOENT; } else { error = bhnd_nvram_prop_encode(prop, outp, olen, otype); } goto finished; } /* Search the backing NVRAM data */ cookiep = bhnd_nvstore_path_data_lookup(sc, path, info.name); if (cookiep != NULL) { /* Found in backing store */ error = bhnd_nvram_data_getvar(sc->data, cookiep, outp, olen, otype); goto finished; } /* Not found */ error = ENOENT; finished: BHND_NVSTORE_UNLOCK(sc); return (error); } /** * Common bhnd_nvram_store_set*() and bhnd_nvram_store_unsetvar() * implementation. * * If @p value is NULL, the variable will be marked for deletion. */ static int bhnd_nvram_store_setval_common(struct bhnd_nvram_store *sc, const char *name, bhnd_nvram_val *value) { bhnd_nvstore_path *path; bhnd_nvstore_name_info info; int error; BHND_NVSTORE_LOCK_ASSERT(sc, MA_OWNED); /* Parse the variable name */ error = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_EXTERNAL, sc->data_caps, &info); if (error) return (error); /* Fetch the variable's enclosing path entry */ if ((path = bhnd_nvstore_var_get_path(sc, &info)) == NULL) return (error); /* Register the update entry */ return (bhnd_nvstore_path_register_update(sc, path, info.name, value)); } /** * Set an NVRAM variable. * * @param sc The NVRAM parser state. * @param name The NVRAM variable name. * @param value The new value. * * @retval 0 success * @retval ENOENT The requested variable @p name was not found. * @retval EINVAL If @p value is invalid. */ int bhnd_nvram_store_setval(struct bhnd_nvram_store *sc, const char *name, bhnd_nvram_val *value) { int error; BHND_NVSTORE_LOCK(sc); error = bhnd_nvram_store_setval_common(sc, name, value); BHND_NVSTORE_UNLOCK(sc); return (error); } /** * Set an NVRAM variable. * * @param sc The NVRAM parser state. * @param name The NVRAM variable name. * @param[out] inp The new value. * @param[in,out] ilen The size of @p inp. * @param itype The data type of @p inp. * * @retval 0 success * @retval ENOENT The requested variable @p name was not found. * @retval EINVAL If the new value is invalid. * @retval EINVAL If @p name is read-only. */ int bhnd_nvram_store_setvar(struct bhnd_nvram_store *sc, const char *name, const void *inp, size_t ilen, bhnd_nvram_type itype) { bhnd_nvram_val val; int error; error = bhnd_nvram_val_init(&val, NULL, inp, ilen, itype, BHND_NVRAM_VAL_FIXED|BHND_NVRAM_VAL_BORROW_DATA); if (error) { BHND_NV_LOG("error initializing value: %d\n", error); return (EINVAL); } BHND_NVSTORE_LOCK(sc); error = bhnd_nvram_store_setval_common(sc, name, &val); BHND_NVSTORE_UNLOCK(sc); bhnd_nvram_val_release(&val); return (error); } /** * Unset an NVRAM variable. * * @param sc The NVRAM parser state. * @param name The NVRAM variable name. * * @retval 0 success * @retval ENOENT The requested variable @p name was not found. * @retval EINVAL If @p name is read-only. */ int bhnd_nvram_store_unsetvar(struct bhnd_nvram_store *sc, const char *name) { int error; BHND_NVSTORE_LOCK(sc); error = bhnd_nvram_store_setval_common(sc, name, BHND_NVRAM_VAL_NULL); BHND_NVSTORE_UNLOCK(sc); return (error); } Index: head/sys/dev/bhnd/nvram/bhnd_nvram_value.c =================================================================== --- head/sys/dev/bhnd/nvram/bhnd_nvram_value.c (revision 350420) +++ head/sys/dev/bhnd/nvram/bhnd_nvram_value.c (revision 350421) @@ -1,1936 +1,1937 @@ /*- * Copyright (c) 2015-2016 Landon Fuller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #include +#include #include #ifdef _KERNEL #include #include #include #include #include #else /* !_KERNEL */ #include #include #include #include #include #endif /* _KERNEL */ #include "bhnd_nvram_private.h" #include "bhnd_nvram_valuevar.h" static int bhnd_nvram_val_fmt_filter(const bhnd_nvram_val_fmt **fmt, const void *inp, size_t ilen, bhnd_nvram_type itype); static void *bhnd_nvram_val_alloc_bytes(bhnd_nvram_val *value, size_t ilen, bhnd_nvram_type itype, uint32_t flags); static int bhnd_nvram_val_set(bhnd_nvram_val *value, const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags); static int bhnd_nvram_val_set_inline(bhnd_nvram_val *value, const void *inp, size_t ilen, bhnd_nvram_type itype); static int bhnd_nvram_val_encode_data(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype); static int bhnd_nvram_val_encode_int(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype); static int bhnd_nvram_val_encode_null(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype); static int bhnd_nvram_val_encode_bool(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype); static int bhnd_nvram_val_encode_string(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype); /** Initialize an empty value instance with @p _fmt, @p _storage, and * an implicit callee-owned reference */ #define BHND_NVRAM_VAL_INITIALIZER(_fmt, _storage) \ (bhnd_nvram_val) { \ .refs = 1, \ .val_storage = _storage, \ .fmt = _fmt, \ .data_storage = BHND_NVRAM_VAL_DATA_NONE, \ }; /** Assert that @p value's backing representation state has initialized * as empty. */ #define BHND_NVRAM_VAL_ASSERT_EMPTY(_value) \ BHND_NV_ASSERT( \ value->data_storage == BHND_NVRAM_VAL_DATA_NONE && \ value->data_len == 0 && \ value->data.ptr == NULL, \ ("previously initialized value")) /** Return true if BHND_NVRAM_VAL_BORROW_DATA or BHND_NVRAM_VAL_STATIC_DATA is * set in @p _flags (e.g. we should attempt to directly reference external * data */ #define BHND_NVRAM_VAL_EXTREF_BORROWED_DATA(_flags) \ (((_flags) & BHND_NVRAM_VAL_BORROW_DATA) || \ ((_flags) & BHND_NVRAM_VAL_STATIC_DATA)) /** Flags permitted when performing val-based initialization via * bhnd_nvram_val_convert_init() or bhnd_nvram_val_convert_new() */ #define BHND_NVRAM_VALID_CONV_FLAGS \ (BHND_NVRAM_VAL_FIXED | \ BHND_NVRAM_VAL_DYNAMIC | \ BHND_NVRAM_VAL_COPY_DATA) /** Returns true if @p _val must be copied in bhnd_nvram_val_copy(), false * if its reference count may be safely incremented */ #define BHND_NVRAM_VAL_NEED_COPY(_val) \ ((_val)->val_storage == BHND_NVRAM_VAL_STORAGE_AUTO || \ (_val)->data_storage == BHND_NVRAM_VAL_DATA_EXT_WEAK) volatile u_int refs; /**< reference count */ bhnd_nvram_val_storage val_storage; /**< value structure storage */ const bhnd_nvram_val_fmt *fmt; /**< value format */ bhnd_nvram_val_data_storage data_storage; /**< data storage */ bhnd_nvram_type data_type; /**< data type */ size_t data_len; /**< data size */ /* Shared NULL value instance */ bhnd_nvram_val bhnd_nvram_val_null = { .refs = 1, .val_storage = BHND_NVRAM_VAL_STORAGE_STATIC, .fmt = &bhnd_nvram_val_null_fmt, .data_storage = BHND_NVRAM_VAL_DATA_INLINE, .data_type = BHND_NVRAM_TYPE_NULL, .data_len = 0, }; /** * Return the human-readable name of @p fmt. */ const char * bhnd_nvram_val_fmt_name(const bhnd_nvram_val_fmt *fmt) { return (fmt->name); } /** * Return the default format for values of @p type. */ const bhnd_nvram_val_fmt * bhnd_nvram_val_default_fmt(bhnd_nvram_type type) { switch (type) { case BHND_NVRAM_TYPE_UINT8: return (&bhnd_nvram_val_uint8_fmt); case BHND_NVRAM_TYPE_UINT16: return (&bhnd_nvram_val_uint16_fmt); case BHND_NVRAM_TYPE_UINT32: return (&bhnd_nvram_val_uint32_fmt); case BHND_NVRAM_TYPE_UINT64: return (&bhnd_nvram_val_uint64_fmt); case BHND_NVRAM_TYPE_INT8: return (&bhnd_nvram_val_int8_fmt); case BHND_NVRAM_TYPE_INT16: return (&bhnd_nvram_val_int16_fmt); case BHND_NVRAM_TYPE_INT32: return (&bhnd_nvram_val_int32_fmt); case BHND_NVRAM_TYPE_INT64: return (&bhnd_nvram_val_int64_fmt); case BHND_NVRAM_TYPE_CHAR: return (&bhnd_nvram_val_char_fmt); case BHND_NVRAM_TYPE_STRING: return (&bhnd_nvram_val_string_fmt); case BHND_NVRAM_TYPE_BOOL: return (&bhnd_nvram_val_bool_fmt); case BHND_NVRAM_TYPE_NULL: return (&bhnd_nvram_val_null_fmt); case BHND_NVRAM_TYPE_DATA: return (&bhnd_nvram_val_data_fmt); case BHND_NVRAM_TYPE_UINT8_ARRAY: return (&bhnd_nvram_val_uint8_array_fmt); case BHND_NVRAM_TYPE_UINT16_ARRAY: return (&bhnd_nvram_val_uint16_array_fmt); case BHND_NVRAM_TYPE_UINT32_ARRAY: return (&bhnd_nvram_val_uint32_array_fmt); case BHND_NVRAM_TYPE_UINT64_ARRAY: return (&bhnd_nvram_val_uint64_array_fmt); case BHND_NVRAM_TYPE_INT8_ARRAY: return (&bhnd_nvram_val_int8_array_fmt); case BHND_NVRAM_TYPE_INT16_ARRAY: return (&bhnd_nvram_val_int16_array_fmt); case BHND_NVRAM_TYPE_INT32_ARRAY: return (&bhnd_nvram_val_int32_array_fmt); case BHND_NVRAM_TYPE_INT64_ARRAY: return (&bhnd_nvram_val_int64_array_fmt); case BHND_NVRAM_TYPE_CHAR_ARRAY: return (&bhnd_nvram_val_char_array_fmt); case BHND_NVRAM_TYPE_STRING_ARRAY: return (&bhnd_nvram_val_string_array_fmt); case BHND_NVRAM_TYPE_BOOL_ARRAY: return (&bhnd_nvram_val_bool_array_fmt); } /* Quiesce gcc4.2 */ BHND_NV_PANIC("bhnd nvram type %u unknown", type); } /** * Determine whether @p fmt (or new format delegated to by @p fmt) is * capable of direct initialization from buffer @p inp. * * @param[in,out] fmt Indirect pointer to the NVRAM value format. If * the format instance cannot handle the data type * directly, it may delegate to a new format * instance. On success, this parameter will be * set to the format that should be used when * performing initialization from @p inp. * @param inp Input data. * @param ilen Input data length. * @param itype Input data type. * * @retval 0 If initialization from @p inp is supported. * @retval EFTYPE If initialization from @p inp is unsupported. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. */ static int bhnd_nvram_val_fmt_filter(const bhnd_nvram_val_fmt **fmt, const void *inp, size_t ilen, bhnd_nvram_type itype) { const bhnd_nvram_val_fmt *ofmt, *nfmt; int error; nfmt = ofmt = *fmt; /* Validate alignment */ if ((error = bhnd_nvram_value_check_aligned(inp, ilen, itype))) return (error); /* If the format does not provide a filter function, it only supports * direct initialization from its native type */ if (ofmt->op_filter == NULL) { if (itype == ofmt->native_type) return (0); return (EFTYPE); } /* Use the filter function to determine whether direct initialization * from itype is permitted */ error = ofmt->op_filter(&nfmt, inp, ilen, itype); if (error) return (error); /* Retry filter with new format? */ if (ofmt != nfmt) { error = bhnd_nvram_val_fmt_filter(&nfmt, inp, ilen, itype); if (error) return (error); /* Success -- provide delegated format to caller */ *fmt = nfmt; } /* Value can be initialized with provided format and input type */ return (0); } /* Common initialization support for bhnd_nvram_val_init() and * bhnd_nvram_val_new() */ static int bhnd_nvram_val_init_common(bhnd_nvram_val *value, bhnd_nvram_val_storage val_storage, const bhnd_nvram_val_fmt *fmt, const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags) { void *outp; bhnd_nvram_type otype; size_t olen; int error; /* If the value format is unspecified, we use the default format * for the input data type */ if (fmt == NULL) fmt = bhnd_nvram_val_default_fmt(itype); /* Determine expected data type, and allow the format to delegate to * a new format instance */ if ((error = bhnd_nvram_val_fmt_filter(&fmt, inp, ilen, itype))) { /* Direct initialization from the provided input type is * not supported; alue must be initialized with the format's * native type */ otype = fmt->native_type; } else { /* Value can be initialized with provided input type */ otype = itype; } /* Initialize value instance */ *value = BHND_NVRAM_VAL_INITIALIZER(fmt, val_storage); /* If input data already in native format, init directly. */ if (otype == itype) { error = bhnd_nvram_val_set(value, inp, ilen, itype, flags); if (error) return (error); return (0); } /* Determine size when encoded in native format */ error = bhnd_nvram_value_coerce(inp, ilen, itype, NULL, &olen, otype); if (error) return (error); /* Fetch reference to (or allocate) an appropriately sized buffer */ outp = bhnd_nvram_val_alloc_bytes(value, olen, otype, flags); if (outp == NULL) return (ENOMEM); /* Perform encode */ error = bhnd_nvram_value_coerce(inp, ilen, itype, outp, &olen, otype); if (error) return (error); return (0); } /** * Initialize an externally allocated instance of @p value with @p fmt from the * given @p inp buffer of @p itype and @p ilen. * * On success, the caller owns a reference to @p value, and is responsible for * freeing any resources allocated for @p value via bhnd_nvram_val_release(). * * @param value The externally allocated value instance to be * initialized. * @param fmt The value's format, or NULL to use the default format * for @p itype. * @param inp Input buffer. * @param ilen Input buffer length. * @param itype Input buffer type. * @param flags Value flags (see BHND_NVRAM_VAL_*). * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval EFTYPE If @p fmt initialization from @p itype is unsupported. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. * @retval ERANGE If value coercion would overflow (or underflow) the * @p fmt representation. */ int bhnd_nvram_val_init(bhnd_nvram_val *value, const bhnd_nvram_val_fmt *fmt, const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags) { int error; error = bhnd_nvram_val_init_common(value, BHND_NVRAM_VAL_STORAGE_AUTO, fmt, inp, ilen, itype, flags); if (error) bhnd_nvram_val_release(value); return (error); } /** * Allocate a value instance with @p fmt, and attempt to initialize its internal * representation from the given @p inp buffer of @p itype and @p ilen. * * On success, the caller owns a reference to @p value, and is responsible for * freeing any resources allocated for @p value via bhnd_nvram_val_release(). * * @param[out] value On success, the allocated value instance. * @param fmt The value's format, or NULL to use the default format * for @p itype. * @param inp Input buffer. * @param ilen Input buffer length. * @param itype Input buffer type. * @param flags Value flags (see BHND_NVRAM_VAL_*). * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval EFTYPE If @p fmt initialization from @p itype is unsupported. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. * @retval ERANGE If value coercion would overflow (or underflow) the * @p fmt representation. */ int bhnd_nvram_val_new(bhnd_nvram_val **value, const bhnd_nvram_val_fmt *fmt, const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags) { int error; /* Allocate new instance */ if ((*value = bhnd_nv_malloc(sizeof(**value))) == NULL) return (ENOMEM); /* Perform common initialization. */ error = bhnd_nvram_val_init_common(*value, BHND_NVRAM_VAL_STORAGE_DYNAMIC, fmt, inp, ilen, itype, flags); if (error) { /* Will also free() the value allocation */ bhnd_nvram_val_release(*value); } return (error); } /* Common initialization support for bhnd_nvram_val_convert_init() and * bhnd_nvram_val_convert_new() */ static int bhnd_nvram_val_convert_common(bhnd_nvram_val *value, bhnd_nvram_val_storage val_storage, const bhnd_nvram_val_fmt *fmt, bhnd_nvram_val *src, uint32_t flags) { const void *inp; void *outp; bhnd_nvram_type itype, otype; size_t ilen, olen; int error; /* Determine whether direct initialization from the source value's * existing data type is supported by the new format */ inp = bhnd_nvram_val_bytes(src, &ilen, &itype); if (bhnd_nvram_val_fmt_filter(&fmt, inp, ilen, itype) == 0) { /* Adjust value flags based on the source data storage */ switch (src->data_storage) { case BHND_NVRAM_VAL_DATA_NONE: case BHND_NVRAM_VAL_DATA_INLINE: case BHND_NVRAM_VAL_DATA_EXT_WEAK: case BHND_NVRAM_VAL_DATA_EXT_ALLOC: break; case BHND_NVRAM_VAL_DATA_EXT_STATIC: /* If the source data has static storage duration, * we should apply that transitively */ if (flags & BHND_NVRAM_VAL_BORROW_DATA) flags |= BHND_NVRAM_VAL_STATIC_DATA; break; } /* Delegate to standard initialization */ return (bhnd_nvram_val_init_common(value, val_storage, fmt, inp, ilen, itype, flags)); } /* Value must be initialized with the format's native type */ otype = fmt->native_type; /* Initialize value instance */ *value = BHND_NVRAM_VAL_INITIALIZER(fmt, val_storage); /* Determine size when encoded in native format */ if ((error = bhnd_nvram_val_encode(src, NULL, &olen, otype))) return (error); /* Fetch reference to (or allocate) an appropriately sized buffer */ outp = bhnd_nvram_val_alloc_bytes(value, olen, otype, flags); if (outp == NULL) return (ENOMEM); /* Perform encode */ if ((error = bhnd_nvram_val_encode(src, outp, &olen, otype))) return (error); return (0); } /** * Initialize an externally allocated instance of @p value with @p fmt, and * attempt to initialize its internal representation from the given @p src * value. * * On success, the caller owns a reference to @p value, and is responsible for * freeing any resources allocated for @p value via bhnd_nvram_val_release(). * * @param value The externally allocated value instance to be * initialized. * @param fmt The value's format. * @param src Input value to be converted. * @param flags Value flags (see BHND_NVRAM_VAL_*). * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval EFTYPE If @p fmt initialization from @p src is unsupported. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. * @retval ERANGE If value coercion of @p src would overflow * (or underflow) the @p fmt representation. */ int bhnd_nvram_val_convert_init(bhnd_nvram_val *value, const bhnd_nvram_val_fmt *fmt, bhnd_nvram_val *src, uint32_t flags) { int error; error = bhnd_nvram_val_convert_common(value, BHND_NVRAM_VAL_STORAGE_AUTO, fmt, src, flags); if (error) bhnd_nvram_val_release(value); return (error); } /** * Allocate a value instance with @p fmt, and attempt to initialize its internal * representation from the given @p src value. * * On success, the caller owns a reference to @p value, and is responsible for * freeing any resources allocated for @p value via bhnd_nvram_val_release(). * * @param[out] value On success, the allocated value instance. * @param fmt The value's format. * @param src Input value to be converted. * @param flags Value flags (see BHND_NVRAM_VAL_*). * * @retval 0 success * @retval ENOMEM If allocation fails. * @retval EFTYPE If @p fmt initialization from @p src is unsupported. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. * @retval ERANGE If value coercion of @p src would overflow * (or underflow) the @p fmt representation. */ int bhnd_nvram_val_convert_new(bhnd_nvram_val **value, const bhnd_nvram_val_fmt *fmt, bhnd_nvram_val *src, uint32_t flags) { int error; /* Allocate new instance */ if ((*value = bhnd_nv_malloc(sizeof(**value))) == NULL) return (ENOMEM); /* Perform common initialization. */ error = bhnd_nvram_val_convert_common(*value, BHND_NVRAM_VAL_STORAGE_DYNAMIC, fmt, src, flags); if (error) { /* Will also free() the value allocation */ bhnd_nvram_val_release(*value); } return (error); } /** * Copy or retain a reference to @p value. * * On success, the caller is responsible for freeing the result via * bhnd_nvram_val_release(). * * @param value The value to be copied (or retained). * * @retval bhnd_nvram_val if @p value was successfully copied or retained. * @retval NULL if allocation failed. */ bhnd_nvram_val * bhnd_nvram_val_copy(bhnd_nvram_val *value) { bhnd_nvram_val *result; const void *bytes; bhnd_nvram_type type; size_t len; uint32_t flags; int error; switch (value->val_storage) { case BHND_NVRAM_VAL_STORAGE_STATIC: /* If static, can return as-is */ return (value); case BHND_NVRAM_VAL_STORAGE_DYNAMIC: if (!BHND_NVRAM_VAL_NEED_COPY(value)) { refcount_acquire(&value->refs); return (value); } /* Perform copy below */ break; case BHND_NVRAM_VAL_STORAGE_AUTO: BHND_NV_ASSERT(value->refs == 1, ("non-allocated value has " "active refcount (%u)", value->refs)); /* Perform copy below */ break; } /* Compute the new value's flags based on the source value */ switch (value->data_storage) { case BHND_NVRAM_VAL_DATA_NONE: case BHND_NVRAM_VAL_DATA_INLINE: case BHND_NVRAM_VAL_DATA_EXT_WEAK: case BHND_NVRAM_VAL_DATA_EXT_ALLOC: /* Copy the source data and permit additional allocation if the * value cannot be represented inline */ flags = BHND_NVRAM_VAL_COPY_DATA|BHND_NVRAM_VAL_DYNAMIC; break; case BHND_NVRAM_VAL_DATA_EXT_STATIC: flags = BHND_NVRAM_VAL_STATIC_DATA; break; default: BHND_NV_PANIC("invalid storage type: %d", value->data_storage); } /* Allocate new value copy */ bytes = bhnd_nvram_val_bytes(value, &len, &type); error = bhnd_nvram_val_new(&result, value->fmt, bytes, len, type, flags); if (error) { BHND_NV_LOG("copy failed: %d", error); return (NULL); } return (result); } /** * Release a reference to @p value. * * If this is the last reference, all associated resources will be freed. * * @param value The value to be released. */ void bhnd_nvram_val_release(bhnd_nvram_val *value) { BHND_NV_ASSERT(value->refs >= 1, ("value over-released")); /* Skip if value is static */ if (value->val_storage == BHND_NVRAM_VAL_STORAGE_STATIC) return; /* Drop reference */ if (!refcount_release(&value->refs)) return; /* Free allocated external representation data */ switch (value->data_storage) { case BHND_NVRAM_VAL_DATA_EXT_ALLOC: bhnd_nv_free(__DECONST(void *, value->data.ptr)); break; case BHND_NVRAM_VAL_DATA_NONE: case BHND_NVRAM_VAL_DATA_INLINE: case BHND_NVRAM_VAL_DATA_EXT_WEAK: case BHND_NVRAM_VAL_DATA_EXT_STATIC: /* Nothing to free */ break; } /* Free instance if dynamically allocated */ if (value->val_storage == BHND_NVRAM_VAL_STORAGE_DYNAMIC) bhnd_nv_free(value); } /** * Standard BHND_NVRAM_TYPE_NULL encoding implementation. */ static int bhnd_nvram_val_encode_null(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype) { size_t limit, nbytes; BHND_NV_ASSERT(itype == BHND_NVRAM_TYPE_NULL, ("unsupported type: %d", itype)); /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; nbytes = 0; /* Write to output */ switch (otype) { case BHND_NVRAM_TYPE_NULL: /* Can be directly encoded as a zero-length NULL value */ nbytes = 0; break; default: /* Not representable */ return (EFTYPE); } /* Provide required length */ *olen = nbytes; if (limit < *olen) { if (outp == NULL) return (0); return (ENOMEM); } return (0); } /** * Standard BHND_NVRAM_TYPE_BOOL encoding implementation. */ static int bhnd_nvram_val_encode_bool(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype) { bhnd_nvram_bool_t bval; size_t limit, nbytes, nelem; int error; BHND_NV_ASSERT(itype == BHND_NVRAM_TYPE_BOOL, ("unsupported type: %d", itype)); /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; /* Must be exactly one element in input */ if ((error = bhnd_nvram_value_nelem(inp, ilen, itype, &nelem))) return (error); if (nelem != 1) return (EFTYPE); /* Fetch (and normalize) boolean value */ bval = (*(const bhnd_nvram_bool_t *)inp != 0) ? true : false; /* Write to output */ switch (otype) { case BHND_NVRAM_TYPE_NULL: /* False can be directly encoded as a zero-length NULL value */ if (bval != false) return (EFTYPE); nbytes = 0; break; case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_STRING_ARRAY: { /* Can encode as "true" or "false" */ const char *str = bval ? "true" : "false"; nbytes = strlen(str) + 1; if (limit > nbytes) strcpy(outp, str); break; } default: /* If output type is an integer, we can delegate to standard * integer encoding to encode as zero or one. */ if (bhnd_nvram_is_int_type(otype)) { uint8_t ival = bval ? 1 : 0; return (bhnd_nvram_val_encode_int(&ival, sizeof(ival), BHND_NVRAM_TYPE_UINT8, outp, olen, otype)); } /* Otherwise not representable */ return (EFTYPE); } /* Provide required length */ *olen = nbytes; if (limit < *olen) { if (outp == NULL) return (0); return (ENOMEM); } return (0); } /** * Standard BHND_NVRAM_TYPE_DATA encoding implementation. */ static int bhnd_nvram_val_encode_data(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype) { BHND_NV_ASSERT(itype == BHND_NVRAM_TYPE_DATA, ("unsupported type: %d", itype)); /* Write to output */ switch (otype) { case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_STRING_ARRAY: /* If encoding as a string, produce an EFI-style hexadecimal * byte array (HF1F...) by interpreting the octet string * as an array of uint8 values */ return (bhnd_nvram_value_printf("H%[]02hhX", inp, ilen, BHND_NVRAM_TYPE_UINT8_ARRAY, outp, olen, "")); default: /* Fall back on direct interpretation as an array of 8-bit * integers array */ return (bhnd_nvram_value_coerce(inp, ilen, BHND_NVRAM_TYPE_UINT8_ARRAY, outp, olen, otype)); } } /** * Standard string/char array/char encoding implementation. * * Input type must be one of: * - BHND_NVRAM_TYPE_STRING * - BHND_NVRAM_TYPE_CHAR * - BHND_NVRAM_TYPE_CHAR_ARRAY */ static int bhnd_nvram_val_encode_string(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype) { const char *cstr; bhnd_nvram_type otype_base; size_t cstr_size, cstr_len; size_t limit, nbytes; BHND_NV_ASSERT( itype == BHND_NVRAM_TYPE_STRING || itype == BHND_NVRAM_TYPE_CHAR || itype == BHND_NVRAM_TYPE_CHAR_ARRAY, ("unsupported type: %d", itype)); cstr = inp; cstr_size = ilen; nbytes = 0; otype_base = bhnd_nvram_base_type(otype); /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; /* Determine string length, minus trailing NUL (if any) */ cstr_len = strnlen(cstr, cstr_size); /* Parse the string data and write to output */ switch (otype) { case BHND_NVRAM_TYPE_NULL: /* Only an empty string may be represented as a NULL value */ if (cstr_len != 0) return (EFTYPE); *olen = 0; return (0); case BHND_NVRAM_TYPE_CHAR: case BHND_NVRAM_TYPE_CHAR_ARRAY: /* String must contain exactly 1 non-terminating-NUL character * to be represented as a single char */ if (!bhnd_nvram_is_array_type(otype)) { if (cstr_len != 1) return (EFTYPE); } /* Copy out the characters directly (excluding trailing NUL) */ for (size_t i = 0; i < cstr_len; i++) { if (limit > nbytes) *((uint8_t *)outp + nbytes) = cstr[i]; nbytes++; } /* Provide required length */ *olen = nbytes; if (limit < *olen && outp != NULL) return (ENOMEM); return (0); case BHND_NVRAM_TYPE_BOOL: case BHND_NVRAM_TYPE_BOOL_ARRAY: { const char *p; size_t plen; bhnd_nvram_bool_t bval; /* Trim leading/trailing whitespace */ p = cstr; plen = bhnd_nvram_trim_field(&p, cstr_len, '\0'); /* Parse string representation */ if (strncasecmp(p, "true", plen) == 0 || strncasecmp(p, "yes", plen) == 0 || strncmp(p, "1", plen) == 0) { bval = true; } else if (strncasecmp(p, "false", plen) == 0 || strncasecmp(p, "no", plen) == 0 || strncmp(p, "0", plen) == 0) { bval = false; } else { /* Not a recognized boolean string */ return (EFTYPE); } /* Write to output */ nbytes = sizeof(bhnd_nvram_bool_t); if (limit >= nbytes) *((bhnd_nvram_bool_t *)outp) = bval; /* Provide required length */ *olen = nbytes; if (limit < *olen && outp != NULL) return (ENOMEM); return (0); } case BHND_NVRAM_TYPE_DATA: { const char *p; size_t plen, parsed_len; int error; /* Trim leading/trailing whitespace */ p = cstr; plen = bhnd_nvram_trim_field(&p, cstr_len, '\0'); /* Check for EFI-style hexadecimal byte array string format. * Must have a 'H' prefix */ if (plen < 1 || bhnd_nv_toupper(*p) != 'H') return (EFTYPE); /* Skip leading 'H' */ p++; plen--; /* Parse the input string's two-char octets until the end * of input is reached. The last octet may contain only * one char */ while (plen > 0) { uint8_t byte; size_t byte_len = sizeof(byte); /* Parse next two-character hex octet */ error = bhnd_nvram_parse_int(p, bhnd_nv_ummin(plen, 2), 16, &parsed_len, &byte, &byte_len, otype_base); if (error) { BHND_NV_DEBUG("error parsing '%.*s' as " "integer: %d\n", BHND_NV_PRINT_WIDTH(plen), p, error); return (error); } /* Write to output */ if (limit > nbytes) *((uint8_t *)outp + nbytes) = byte; nbytes++; /* Advance input */ p += parsed_len; plen -= parsed_len; } /* Provide required length */ *olen = nbytes; if (limit < *olen && outp != NULL) return (ENOMEM); return (0); } case BHND_NVRAM_TYPE_UINT8: case BHND_NVRAM_TYPE_UINT8_ARRAY: case BHND_NVRAM_TYPE_UINT16: case BHND_NVRAM_TYPE_UINT16_ARRAY: case BHND_NVRAM_TYPE_UINT32: case BHND_NVRAM_TYPE_UINT32_ARRAY: case BHND_NVRAM_TYPE_UINT64: case BHND_NVRAM_TYPE_UINT64_ARRAY: case BHND_NVRAM_TYPE_INT8: case BHND_NVRAM_TYPE_INT8_ARRAY: case BHND_NVRAM_TYPE_INT16: case BHND_NVRAM_TYPE_INT16_ARRAY: case BHND_NVRAM_TYPE_INT32: case BHND_NVRAM_TYPE_INT32_ARRAY: case BHND_NVRAM_TYPE_INT64: case BHND_NVRAM_TYPE_INT64_ARRAY: { const char *p; size_t plen, parsed_len; int error; /* Trim leading/trailing whitespace */ p = cstr; plen = bhnd_nvram_trim_field(&p, cstr_len, '\0'); /* Try to parse the integer value */ error = bhnd_nvram_parse_int(p, plen, 0, &parsed_len, outp, olen, otype_base); if (error) { BHND_NV_DEBUG("error parsing '%.*s' as integer: %d\n", BHND_NV_PRINT_WIDTH(plen), p, error); return (error); } /* Do additional bytes remain unparsed? */ if (plen != parsed_len) { BHND_NV_DEBUG("error parsing '%.*s' as a single " "integer value; trailing garbage '%.*s'\n", BHND_NV_PRINT_WIDTH(plen), p, BHND_NV_PRINT_WIDTH(plen-parsed_len), p+parsed_len); return (EFTYPE); } return (0); } case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_STRING_ARRAY: /* Copy out the string representation as-is */ *olen = cstr_size; /* Need additional space for trailing NUL? */ if (cstr_len == cstr_size) (*olen)++; /* Skip output? */ if (outp == NULL) return (0); /* Verify required length */ if (limit < *olen) return (ENOMEM); /* Copy and NUL terminate */ strncpy(outp, cstr, cstr_len); *((char *)outp + cstr_len) = '\0'; return (0); } BHND_NV_PANIC("unknown type %s", bhnd_nvram_type_name(otype)); } /** * Standard integer encoding implementation. */ static int bhnd_nvram_val_encode_int(const void *inp, size_t ilen, bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype) { bhnd_nvram_type otype_base; size_t limit, nbytes; bool itype_signed, otype_signed, otype_int; union { uint64_t u64; int64_t i64; } intv; BHND_NV_ASSERT(bhnd_nvram_is_int_type(itype), ("non-integer type")); /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; /* Fetch output type info */ otype_base = bhnd_nvram_base_type(otype); otype_int = bhnd_nvram_is_int_type(otype); otype_signed = bhnd_nvram_is_signed_type(otype_base); /* * Promote integer value to a common 64-bit representation. */ switch (itype) { case BHND_NVRAM_TYPE_UINT8: if (ilen != sizeof(uint8_t)) return (EFAULT); itype_signed = false; intv.u64 = *(const uint8_t *)inp; break; case BHND_NVRAM_TYPE_UINT16: if (ilen != sizeof(uint16_t)) return (EFAULT); itype_signed = false; intv.u64 = *(const uint16_t *)inp; break; case BHND_NVRAM_TYPE_UINT32: if (ilen != sizeof(uint32_t)) return (EFAULT); itype_signed = false; intv.u64 = *(const uint32_t *)inp; break; case BHND_NVRAM_TYPE_UINT64: if (ilen != sizeof(uint64_t)) return (EFAULT); itype_signed = false; intv.u64 = *(const uint64_t *)inp; break; case BHND_NVRAM_TYPE_INT8: if (ilen != sizeof(int8_t)) return (EFAULT); itype_signed = true; intv.i64 = *(const int8_t *)inp; break; case BHND_NVRAM_TYPE_INT16: if (ilen != sizeof(int16_t)) return (EFAULT); itype_signed = true; intv.i64 = *(const int16_t *)inp; break; case BHND_NVRAM_TYPE_INT32: if (ilen != sizeof(int32_t)) return (EFAULT); itype_signed = true; intv.i64 = *(const int32_t *)inp; break; case BHND_NVRAM_TYPE_INT64: if (ilen != sizeof(int32_t)) return (EFAULT); itype_signed = true; intv.i64 = *(const int32_t *)inp; break; default: BHND_NV_PANIC("invalid type %d\n", itype); } /* Perform signed/unsigned conversion */ if (itype_signed && otype_int && !otype_signed) { if (intv.i64 < 0) { /* Can't represent negative value */ BHND_NV_LOG("cannot represent %" PRId64 " as %s\n", intv.i64, bhnd_nvram_type_name(otype)); return (ERANGE); } /* Convert to unsigned representation */ intv.u64 = intv.i64; } else if (!itype_signed && otype_int && otype_signed) { /* Handle unsigned -> signed coercions */ if (intv.u64 > INT64_MAX) { /* Can't represent positive value */ BHND_NV_LOG("cannot represent %" PRIu64 " as %s\n", intv.u64, bhnd_nvram_type_name(otype)); return (ERANGE); } /* Convert to signed representation */ intv.i64 = intv.u64; } /* Write output */ switch (otype) { case BHND_NVRAM_TYPE_NULL: /* Cannot encode an integer value as NULL */ return (EFTYPE); case BHND_NVRAM_TYPE_BOOL: { bhnd_nvram_bool_t bval; if (intv.u64 == 0 || intv.u64 == 1) { bval = intv.u64; } else { /* Encoding as a bool would lose information */ return (ERANGE); } nbytes = sizeof(bhnd_nvram_bool_t); if (limit >= nbytes) *((bhnd_nvram_bool_t *)outp) = bval; break; } case BHND_NVRAM_TYPE_CHAR: case BHND_NVRAM_TYPE_CHAR_ARRAY: case BHND_NVRAM_TYPE_DATA: case BHND_NVRAM_TYPE_UINT8: case BHND_NVRAM_TYPE_UINT8_ARRAY: if (intv.u64 > UINT8_MAX) return (ERANGE); nbytes = sizeof(uint8_t); if (limit >= nbytes) *((uint8_t *)outp) = (uint8_t)intv.u64; break; case BHND_NVRAM_TYPE_UINT16: case BHND_NVRAM_TYPE_UINT16_ARRAY: if (intv.u64 > UINT16_MAX) return (ERANGE); nbytes = sizeof(uint16_t); if (limit >= nbytes) *((uint16_t *)outp) = (uint16_t)intv.u64; break; case BHND_NVRAM_TYPE_UINT32: case BHND_NVRAM_TYPE_UINT32_ARRAY: if (intv.u64 > UINT32_MAX) return (ERANGE); nbytes = sizeof(uint32_t); if (limit >= nbytes) *((uint32_t *)outp) = (uint32_t)intv.u64; break; case BHND_NVRAM_TYPE_UINT64: case BHND_NVRAM_TYPE_UINT64_ARRAY: nbytes = sizeof(uint64_t); if (limit >= nbytes) *((uint64_t *)outp) = intv.u64; break; case BHND_NVRAM_TYPE_INT8: case BHND_NVRAM_TYPE_INT8_ARRAY: if (intv.i64 < INT8_MIN || intv.i64 > INT8_MAX) return (ERANGE); nbytes = sizeof(int8_t); if (limit >= nbytes) *((int8_t *)outp) = (int8_t)intv.i64; break; case BHND_NVRAM_TYPE_INT16: case BHND_NVRAM_TYPE_INT16_ARRAY: if (intv.i64 < INT16_MIN || intv.i64 > INT16_MAX) return (ERANGE); nbytes = sizeof(int16_t); if (limit >= nbytes) *((int16_t *)outp) = (int16_t)intv.i64; break; case BHND_NVRAM_TYPE_INT32: case BHND_NVRAM_TYPE_INT32_ARRAY: if (intv.i64 < INT32_MIN || intv.i64 > INT32_MAX) return (ERANGE); nbytes = sizeof(int32_t); if (limit >= nbytes) *((int32_t *)outp) = (int32_t)intv.i64; break; case BHND_NVRAM_TYPE_INT64: case BHND_NVRAM_TYPE_INT64_ARRAY: nbytes = sizeof(int64_t); if (limit >= nbytes) *((int64_t *)outp) = intv.i64; break; case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_STRING_ARRAY: { ssize_t len; /* Attempt to write the entry + NUL */ if (otype_signed) { len = snprintf(outp, limit, "%" PRId64, intv.i64); } else { len = snprintf(outp, limit, "%" PRIu64, intv.u64); } if (len < 0) { BHND_NV_LOG("snprintf() failed: %zd\n", len); return (EFTYPE); } /* Set total length to the formatted string length, plus * trailing NUL */ nbytes = len + 1; break; } default: BHND_NV_LOG("unknown type %s\n", bhnd_nvram_type_name(otype)); return (EFTYPE); } /* Provide required length */ *olen = nbytes; if (limit < *olen) { if (outp == NULL) return (0); return (ENOMEM); } return (0); } /** * Encode the given @p value as @p otype, writing the result to @p outp. * * @param value The value to be encoded. * @param[out] outp On success, the value will be written to this * buffer. This argment may be NULL if the value is * not desired. * @param[in,out] olen The capacity of @p outp. On success, will be set * to the actual size of the requested value. * @param otype The data type to be written to @p outp. * * @retval 0 success * @retval ENOMEM If the @p outp is non-NULL, and the provided @p olen * is too small to hold the encoded value. * @retval EFTYPE If value coercion from @p value to @p otype is * impossible. * @retval ERANGE If value coercion would overflow (or underflow) the * a @p otype representation. */ int bhnd_nvram_val_encode(bhnd_nvram_val *value, void *outp, size_t *olen, bhnd_nvram_type otype) { /* Prefer format implementation */ if (value->fmt->op_encode != NULL) return (value->fmt->op_encode(value, outp, olen, otype)); return (bhnd_nvram_val_generic_encode(value, outp, olen, otype)); } /** * Encode the given @p value's element as @p otype, writing the result to * @p outp. * * @param inp The element to be be encoded. Must be a value * previously returned by bhnd_nvram_val_next() * or bhnd_nvram_val_elem(). * @param ilen The size of @p inp, as returned by * bhnd_nvram_val_next() or bhnd_nvram_val_elem(). * @param[out] outp On success, the value will be written to this * buffer. This argment may be NULL if the value is * not desired. * @param[in,out] olen The capacity of @p outp. On success, will be set * to the actual size of the requested value. * @param otype The data type to be written to @p outp. * * @retval 0 success * @retval ENOMEM If the @p outp is non-NULL, and the provided @p olen * is too small to hold the encoded value. * @retval EFTYPE If value coercion from @p value to @p otype is * impossible. * @retval ERANGE If value coercion would overflow (or underflow) the * a @p otype representation. */ int bhnd_nvram_val_encode_elem(bhnd_nvram_val *value, const void *inp, size_t ilen, void *outp, size_t *olen, bhnd_nvram_type otype) { /* Prefer format implementation */ if (value->fmt->op_encode_elem != NULL) { return (value->fmt->op_encode_elem(value, inp, ilen, outp, olen, otype)); } return (bhnd_nvram_val_generic_encode_elem(value, inp, ilen, outp, olen, otype)); } /** * Return the type, size, and a pointer to the internal representation * of @p value. * * @param value The value to be queried. * @param[out] olen Size of the returned data, in bytes. * @param[out] otype Data type. */ const void * bhnd_nvram_val_bytes(bhnd_nvram_val *value, size_t *olen, bhnd_nvram_type *otype) { /* Provide type and length */ *otype = value->data_type; *olen = value->data_len; switch (value->data_storage) { case BHND_NVRAM_VAL_DATA_EXT_ALLOC: case BHND_NVRAM_VAL_DATA_EXT_STATIC: case BHND_NVRAM_VAL_DATA_EXT_WEAK: /* Return a pointer to external storage */ return (value->data.ptr); case BHND_NVRAM_VAL_DATA_INLINE: /* Return a pointer to inline storage */ return (&value->data); case BHND_NVRAM_VAL_DATA_NONE: BHND_NV_PANIC("uninitialized value"); } BHND_NV_PANIC("unknown storage type: %d", value->data_storage); } /** * Iterate over all array elements in @p value. * * @param value The value to be iterated * @param prev A value pointer previously returned by * bhnd_nvram_val_next() or bhnd_nvram_val_elem(), * or NULL to begin iteration at the first element. * @param[in,out] olen If @p prev is non-NULL, @p olen must be a * pointer to the length previously returned by * bhnd_nvram_val_next() or bhnd_nvram_val_elem(). * On success, will be set to the next element's * length, in bytes. * * @retval non-NULL A borrowed reference to the element data. * @retval NULL If the end of the element array is reached. */ const void * bhnd_nvram_val_next(bhnd_nvram_val *value, const void *prev, size_t *olen) { /* Prefer the format implementation */ if (value->fmt->op_next != NULL) return (value->fmt->op_next(value, prev, olen)); return (bhnd_nvram_val_generic_next(value, prev, olen)); } /** * Return the value's data type. * * @param value The value to be queried. */ bhnd_nvram_type bhnd_nvram_val_type(bhnd_nvram_val *value) { return (value->data_type); } /** * Return value's element data type. * * @param value The value to be queried. */ bhnd_nvram_type bhnd_nvram_val_elem_type(bhnd_nvram_val *value) { return (bhnd_nvram_base_type(value->data_type)); } /** * Return the total number of elements represented by @p value. */ size_t bhnd_nvram_val_nelem(bhnd_nvram_val *value) { const void *bytes; bhnd_nvram_type type; size_t nelem, len; int error; /* Prefer format implementation */ if (value->fmt->op_nelem != NULL) return (value->fmt->op_nelem(value)); /* * If a custom op_next() is defined, bhnd_nvram_value_nelem() almost * certainly cannot produce a valid element count; it assumes a standard * data format that may not apply when custom iteration is required. * * Instead, use bhnd_nvram_val_next() to parse the backing data and * produce a total count. */ if (value->fmt->op_next != NULL) { const void *next; next = NULL; nelem = 0; while ((next = bhnd_nvram_val_next(value, next, &len)) != NULL) nelem++; return (nelem); } /* Otherwise, compute the standard element count */ bytes = bhnd_nvram_val_bytes(value, &len, &type); if ((error = bhnd_nvram_value_nelem(bytes, len, type, &nelem))) { /* Should always succeed */ BHND_NV_PANIC("error calculating element count for type '%s' " "with length %zu: %d\n", bhnd_nvram_type_name(type), len, error); } return (nelem); } /** * Generic implementation of bhnd_nvram_val_op_encode(), compatible with * all supported NVRAM data types. */ int bhnd_nvram_val_generic_encode(bhnd_nvram_val *value, void *outp, size_t *olen, bhnd_nvram_type otype) { const void *inp; bhnd_nvram_type itype; size_t ilen; const void *next; bhnd_nvram_type otype_base; size_t limit, nelem, nbytes; size_t next_len; int error; nbytes = 0; nelem = 0; otype_base = bhnd_nvram_base_type(otype); inp = bhnd_nvram_val_bytes(value, &ilen, &itype); /* * Normally, an array type is not universally representable as * non-array type. * * As exceptions, we support conversion directly to/from: * - CHAR_ARRAY/STRING: * ->STRING Interpret the character array as a * non-NUL-terminated string. * ->CHAR_ARRAY Trim the trailing NUL from the string. */ #define BHND_NV_IS_ISO_CONV(_lhs, _rhs) \ ((itype == BHND_NVRAM_TYPE_ ## _lhs && \ otype == BHND_NVRAM_TYPE_ ## _rhs) || \ (itype == BHND_NVRAM_TYPE_ ## _rhs && \ otype == BHND_NVRAM_TYPE_ ## _lhs)) if (BHND_NV_IS_ISO_CONV(CHAR_ARRAY, STRING)) { return (bhnd_nvram_val_encode_elem(value, inp, ilen, outp, olen, otype)); } #undef BHND_NV_IS_ISO_CONV /* * If both input and output are non-array types, try to encode them * without performing element iteration. */ if (!bhnd_nvram_is_array_type(itype) && !bhnd_nvram_is_array_type(otype)) { return (bhnd_nvram_val_encode_elem(value, inp, ilen, outp, olen, otype)); } /* Determine output byte limit */ if (outp != NULL) limit = *olen; else limit = 0; /* Iterate over our array elements and encode as the requested * type */ next = NULL; while ((next = bhnd_nvram_val_next(value, next, &next_len))) { void *elem_outp; size_t elem_nbytes; /* If the output type is not an array type, we can only encode * one element */ nelem++; if (nelem > 1 && !bhnd_nvram_is_array_type(otype)) { return (EFTYPE); } /* Determine output offset / limit */ if (nbytes >= limit) { elem_nbytes = 0; elem_outp = NULL; } else { elem_nbytes = limit - nbytes; elem_outp = (uint8_t *)outp + nbytes; } /* Attempt encode */ error = bhnd_nvram_val_encode_elem(value, next, next_len, elem_outp, &elem_nbytes, otype_base); /* If encoding failed for any reason other than ENOMEM (which * we'll detect and report below), return immediately */ if (error && error != ENOMEM) return (error); /* Add to total length */ if (SIZE_MAX - nbytes < elem_nbytes) return (EFTYPE); /* would overflow size_t */ nbytes += elem_nbytes; } /* Provide the actual length */ *olen = nbytes; /* If no output was requested, nothing left to do */ if (outp == NULL) return (0); /* Otherwise, report a memory error if the output buffer was too * small */ if (limit < nbytes) return (ENOMEM); return (0); } /** * Generic implementation of bhnd_nvram_val_op_encode_elem(), compatible with * all supported NVRAM data types. */ int bhnd_nvram_val_generic_encode_elem(bhnd_nvram_val *value, const void *inp, size_t ilen, void *outp, size_t *olen, bhnd_nvram_type otype) { bhnd_nvram_type itype; itype = bhnd_nvram_val_elem_type(value); switch (itype) { case BHND_NVRAM_TYPE_NULL: return (bhnd_nvram_val_encode_null(inp, ilen, itype, outp, olen, otype)); case BHND_NVRAM_TYPE_DATA: return (bhnd_nvram_val_encode_data(inp, ilen, itype, outp, olen, otype)); case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_CHAR: return (bhnd_nvram_val_encode_string(inp, ilen, itype, outp, olen, otype)); case BHND_NVRAM_TYPE_BOOL: return (bhnd_nvram_val_encode_bool(inp, ilen, itype, outp, olen, otype)); case BHND_NVRAM_TYPE_UINT8: case BHND_NVRAM_TYPE_UINT16: case BHND_NVRAM_TYPE_UINT32: case BHND_NVRAM_TYPE_UINT64: case BHND_NVRAM_TYPE_INT8: case BHND_NVRAM_TYPE_INT16: case BHND_NVRAM_TYPE_INT32: case BHND_NVRAM_TYPE_INT64: return (bhnd_nvram_val_encode_int(inp, ilen, itype, outp, olen, otype)); default: BHND_NV_PANIC("missing encode_elem() implementation"); } } /** * Generic implementation of bhnd_nvram_val_op_next(), compatible with * all supported NVRAM data types. */ const void * bhnd_nvram_val_generic_next(bhnd_nvram_val *value, const void *prev, size_t *olen) { const uint8_t *inp; bhnd_nvram_type itype; size_t ilen; /* Iterate over the backing representation */ inp = bhnd_nvram_val_bytes(value, &ilen, &itype); return (bhnd_nvram_value_array_next(inp, ilen, itype, prev, olen)); } /** * Initialize the representation of @p value with @p ptr. * * @param value The value to be initialized. * @param inp The external representation. * @param ilen The external representation length, in bytes. * @param itype The external representation's data type. * @param flags Value flags. * * @retval 0 success. * @retval ENOMEM if allocation fails * @retval EFTYPE if @p itype is not an array type, and @p ilen is not * equal to the size of a single element of @p itype. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. */ static int bhnd_nvram_val_set(bhnd_nvram_val *value, const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags) { void *bytes; int error; BHND_NVRAM_VAL_ASSERT_EMPTY(value); /* Validate alignment */ if ((error = bhnd_nvram_value_check_aligned(inp, ilen, itype))) return (error); /* Reference the external data */ if ((flags & BHND_NVRAM_VAL_BORROW_DATA) || (flags & BHND_NVRAM_VAL_STATIC_DATA)) { if (flags & BHND_NVRAM_VAL_STATIC_DATA) value->data_storage = BHND_NVRAM_VAL_DATA_EXT_STATIC; else value->data_storage = BHND_NVRAM_VAL_DATA_EXT_WEAK; value->data.ptr = inp; value->data_type = itype; value->data_len = ilen; return (0); } /* Fetch reference to (or allocate) an appropriately sized buffer */ bytes = bhnd_nvram_val_alloc_bytes(value, ilen, itype, flags); if (bytes == NULL) return (ENOMEM); /* Copy data */ memcpy(bytes, inp, ilen); return (0); } /** * Initialize the internal inline representation of @p value with a copy of * the data referenced by @p inp of @p itype. * * If @p inp is NULL, @p itype and @p ilen will be validated, but no data will * be copied. * * @param value The value to be initialized. * @param inp The input data to be copied, or NULL to verify * that data of @p ilen and @p itype can be represented * inline. * @param ilen The size of the external buffer to be allocated. * @param itype The type of the external buffer to be allocated. * * @retval 0 success * @retval ENOMEM if @p ilen is too large to be represented inline. * @retval EFAULT if @p ilen is not correctly aligned for elements of * @p itype. */ static int bhnd_nvram_val_set_inline(bhnd_nvram_val *value, const void *inp, size_t ilen, bhnd_nvram_type itype) { BHND_NVRAM_VAL_ASSERT_EMPTY(value); #define NV_STORE_INIT_INLINE() do { \ value->data_len = ilen; \ value->data_type = itype; \ } while(0) #define NV_STORE_INLINE(_type, _dest) do { \ if (ilen != sizeof(_type)) \ return (EFAULT); \ \ if (inp != NULL) { \ value->data._dest[0] = *(const _type *)inp; \ NV_STORE_INIT_INLINE(); \ } \ } while (0) #define NV_COPY_ARRRAY_INLINE(_type, _dest) do { \ if (ilen % sizeof(_type) != 0) \ return (EFAULT); \ \ if (ilen > nitems(value->data. _dest)) \ return (ENOMEM); \ \ if (inp == NULL) \ return (0); \ \ memcpy(&value->data._dest, inp, ilen); \ if (inp != NULL) { \ memcpy(&value->data._dest, inp, ilen); \ NV_STORE_INIT_INLINE(); \ } \ } while (0) /* Attempt to copy to inline storage */ switch (itype) { case BHND_NVRAM_TYPE_NULL: if (ilen != 0) return (EFAULT); /* Nothing to copy */ NV_STORE_INIT_INLINE(); return (0); case BHND_NVRAM_TYPE_CHAR: NV_STORE_INLINE(uint8_t, ch); return (0); case BHND_NVRAM_TYPE_BOOL: NV_STORE_INLINE(bhnd_nvram_bool_t, b); return(0); case BHND_NVRAM_TYPE_UINT8: case BHND_NVRAM_TYPE_INT8: NV_STORE_INLINE(uint8_t, u8); return (0); case BHND_NVRAM_TYPE_UINT16: case BHND_NVRAM_TYPE_INT16: NV_STORE_INLINE(uint16_t, u16); return (0); case BHND_NVRAM_TYPE_UINT32: case BHND_NVRAM_TYPE_INT32: NV_STORE_INLINE(uint32_t, u32); return (0); case BHND_NVRAM_TYPE_UINT64: case BHND_NVRAM_TYPE_INT64: NV_STORE_INLINE(uint32_t, u32); return (0); case BHND_NVRAM_TYPE_CHAR_ARRAY: NV_COPY_ARRRAY_INLINE(uint8_t, ch); return (0); case BHND_NVRAM_TYPE_DATA: case BHND_NVRAM_TYPE_UINT8_ARRAY: case BHND_NVRAM_TYPE_INT8_ARRAY: NV_COPY_ARRRAY_INLINE(uint8_t, u8); return (0); case BHND_NVRAM_TYPE_UINT16_ARRAY: case BHND_NVRAM_TYPE_INT16_ARRAY: NV_COPY_ARRRAY_INLINE(uint16_t, u16); return (0); case BHND_NVRAM_TYPE_UINT32_ARRAY: case BHND_NVRAM_TYPE_INT32_ARRAY: NV_COPY_ARRRAY_INLINE(uint32_t, u32); return (0); case BHND_NVRAM_TYPE_UINT64_ARRAY: case BHND_NVRAM_TYPE_INT64_ARRAY: NV_COPY_ARRRAY_INLINE(uint64_t, u64); return (0); case BHND_NVRAM_TYPE_BOOL_ARRAY: NV_COPY_ARRRAY_INLINE(bhnd_nvram_bool_t, b); return(0); case BHND_NVRAM_TYPE_STRING: case BHND_NVRAM_TYPE_STRING_ARRAY: if (ilen > sizeof(value->data.ch)) return (ENOMEM); if (inp != NULL) { memcpy(&value->data.ch, inp, ilen); NV_STORE_INIT_INLINE(); } return (0); } #undef NV_STORE_INIT_INLINE #undef NV_STORE_INLINE #undef NV_COPY_ARRRAY_INLINE BHND_NV_PANIC("unknown data type %d", itype); } /** * Initialize the internal representation of @p value with a buffer allocation * of @p len and @p itype, returning a pointer to the allocated buffer. * * If a buffer of @p len and @p itype can be represented inline, no * external buffer will be allocated, and instead a pointer to the inline * data representation will be returned. * * @param value The value to be initialized. * @param ilen The size of the external buffer to be allocated. * @param itype The type of the external buffer to be allocated. * @param flags Value flags. * * @retval non-null The newly allocated buffer. * @retval NULL If allocation failed. * @retval NULL If @p value is an externally allocated instance. */ static void * bhnd_nvram_val_alloc_bytes(bhnd_nvram_val *value, size_t ilen, bhnd_nvram_type itype, uint32_t flags) { void *ptr; BHND_NVRAM_VAL_ASSERT_EMPTY(value); /* Can we use inline storage? */ if (bhnd_nvram_val_set_inline(value, NULL, ilen, itype) == 0) { BHND_NV_ASSERT(sizeof(value->data) >= ilen, ("ilen exceeds inline storage")); value->data_type = itype; value->data_len = ilen; value->data_storage = BHND_NVRAM_VAL_DATA_INLINE; return (&value->data); } /* Is allocation permitted? */ if (!(flags & BHND_NVRAM_VAL_DYNAMIC)) return (NULL); /* Allocate external storage */ if ((ptr = bhnd_nv_malloc(ilen)) == NULL) return (NULL); value->data.ptr = ptr; value->data_len = ilen; value->data_type = itype; value->data_storage = BHND_NVRAM_VAL_DATA_EXT_ALLOC; return (ptr); } Index: head/sys/dev/bhnd/nvram/bhnd_nvram_value_prf.c =================================================================== --- head/sys/dev/bhnd/nvram/bhnd_nvram_value_prf.c (revision 350420) +++ head/sys/dev/bhnd/nvram/bhnd_nvram_value_prf.c (revision 350421) @@ -1,882 +1,883 @@ /*- * Copyright (c) 2015-2016 Landon Fuller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); #include +#include #include #ifdef _KERNEL #include #include #include #include #include #else /* !_KERNEL */ #include #include #include #include #include #endif /* _KERNEL */ #include "bhnd_nvram_private.h" #include "bhnd_nvram_valuevar.h" #ifdef _KERNEL #define bhnd_nv_hex2ascii(hex) hex2ascii(hex) #else /* !_KERNEL */ static char const bhnd_nv_hex2ascii[] = "0123456789abcdefghijklmnopqrstuvwxyz"; #define bhnd_nv_hex2ascii(hex) (bhnd_nv_hex2ascii[hex]) #endif /* _KERNEL */ /** * Maximum size, in bytes, of a string-encoded NVRAM integer value, not * including any prefix (0x, 0, etc). * * We assume the largest possible encoding is the base-2 representation * of a 64-bit integer. */ #define NV_NUMSTR_MAX ((sizeof(uint64_t) * CHAR_BIT) + 1) /** * Format a string representation of @p value using @p fmt, with, writing the * result to @p outp. * * @param value The value to be formatted. * @param fmt The format string. * @param[out] outp On success, the string will be written to this * buffer. This argment may be NULL if the value is * not desired. * @param[in,out] olen The capacity of @p outp. On success, will be set * to the actual number of bytes required for the * requested string encoding (including a trailing * NUL). * * Refer to bhnd_nvram_val_vprintf() for full format string documentation. * * @retval 0 success * @retval EINVAL If @p fmt contains unrecognized format string * specifiers. * @retval ENOMEM If the @p outp is non-NULL, and the provided @p olen * is too small to hold the encoded value. * @retval EFTYPE If value coercion from @p value to a single string * value via @p fmt is unsupported. * @retval ERANGE If value coercion of @p value would overflow (or * underflow) the representation defined by @p fmt. */ int bhnd_nvram_val_printf(bhnd_nvram_val *value, const char *fmt, char *outp, size_t *olen, ...) { va_list ap; int error; va_start(ap, olen); error = bhnd_nvram_val_vprintf(value, fmt, outp, olen, ap); va_end(ap); return (error); } /** * Format a string representation of the elements of @p value using @p fmt, * writing the result to @p outp. * * @param value The value to be formatted. * @param fmt The format string. * @param[out] outp On success, the string will be written to this * buffer. This argment may be NULL if the value is * not desired. * @param[in,out] olen The capacity of @p outp. On success, will be set * to the actual number of bytes required for the * requested string encoding (including a trailing * NUL). * @param ap Argument list. * * @par Format Strings * * Value format strings are similar, but not identical to, those used * by printf(3). * * Format specifier format: * %[repeat][flags][width][.precision][length modifier][specifier] * * The format specifier is interpreted as an encoding directive for an * individual value element; each format specifier will fetch the next element * from the value, encode the element as the appropriate type based on the * length modifiers and specifier, and then format the result as a string. * * For example, given a string value of '0x000F', and a format specifier of * '%#hhx', the value will be asked to encode its first element as * BHND_NVRAM_TYPE_UINT8. String formatting will then be applied to the 8-bit * unsigned integer representation, producing a string value of "0xF". * * Repeat: * - [digits] Repeatedly apply the format specifier to the input * value's elements up to `digits` times. The delimiter * must be passed as a string in the next variadic * argument. * - [] Repeatedly apply the format specifier to the input * value's elements until all elements have been. The * processed. The delimiter must be passed as a string in * the next variadic argument. * - [*] Repeatedly apply the format specifier to the input * value's elements. The repeat count is read from the * next variadic argument as a size_t value * * Flags: * - '#' use alternative form (e.g. 0x/0X prefixing of hex * strings). * - '0' zero padding * - '-' left adjust padding * - '+' include a sign character * - ' ' include a space in place of a sign character for * positive numbers. * * Width/Precision: * - digits minimum field width. * - * read the minimum field width from the next variadic * argument as a ssize_t value. A negative value enables * left adjustment. * - .digits field precision. * - .* read the field precision from the next variadic argument * as a ssize_t value. A negative value enables left * adjustment. * * Length Modifiers: * - 'hh', 'I8' Convert the value to an 8-bit signed or unsigned * integer. * - 'h', 'I16' Convert the value to an 16-bit signed or unsigned * integer. * - 'l', 'I32' Convert the value to an 32-bit signed or unsigned * integer. * - 'll', 'j', 'I64' Convert the value to an 64-bit signed or unsigned * integer. * * Data Specifiers: * - 'd', 'i' Convert and format as a signed decimal integer. * - 'u' Convert and format as an unsigned decimal integer. * - 'o' Convert and format as an unsigned octal integer. * - 'x' Convert and format as an unsigned hexadecimal integer, * using lowercase hex digits. * - 'X' Convert and format as an unsigned hexadecimal integer, * using uppercase hex digits. * - 's' Convert and format as a string. * - '%' Print a literal '%' character. * * @retval 0 success * @retval EINVAL If @p fmt contains unrecognized format string * specifiers. * @retval ENOMEM If the @p outp is non-NULL, and the provided @p olen * is too small to hold the encoded value. * @retval EFTYPE If value coercion from @p value to a single string * value via @p fmt is unsupported. * @retval ERANGE If value coercion of @p value would overflow (or * underflow) the representation defined by @p fmt. */ int bhnd_nvram_val_vprintf(bhnd_nvram_val *value, const char *fmt, char *outp, size_t *olen, va_list ap) { const void *elem; size_t elen; size_t limit, nbytes; int error; elem = NULL; /* Determine output byte limit */ nbytes = 0; if (outp != NULL) limit = *olen; else limit = 0; #define WRITE_CHAR(_c) do { \ if (limit > nbytes) \ *(outp + nbytes) = _c; \ \ if (nbytes == SIZE_MAX) \ return (EFTYPE); \ nbytes++; \ } while (0) /* Encode string value as per the format string */ for (const char *p = fmt; *p != '\0'; p++) { const char *delim; size_t precision, width, delim_len; u_long repeat, bits; bool alt_form, ladjust, have_precision; char padc, signc, lenc; padc = ' '; signc = '\0'; lenc = '\0'; delim = ""; delim_len = 0; ladjust = false; alt_form = false; have_precision = false; precision = 1; bits = 32; width = 0; repeat = 1; /* Copy all input to output until we hit a format specifier */ if (*p != '%') { WRITE_CHAR(*p); continue; } /* Hit '%' -- is this followed by an escaped '%' literal? */ p++; if (*p == '%') { WRITE_CHAR('%'); p++; continue; } /* Parse repeat specifier */ if (*p == '[') { p++; /* Determine repeat count */ if (*p == ']') { /* Repeat consumes all input */ repeat = bhnd_nvram_val_nelem(value); } else if (*p == '*') { /* Repeat is supplied as an argument */ repeat = va_arg(ap, size_t); p++; } else { char *endp; /* Repeat specified as argument */ repeat = strtoul(p, &endp, 10); if (p == endp) { BHND_NV_LOG("error parsing repeat " "count at '%s'", p); return (EINVAL); } /* Advance past repeat count */ p = endp; } /* Advance past terminating ']' */ if (*p != ']') { BHND_NV_LOG("error parsing repeat count at " "'%s'", p); return (EINVAL); } p++; delim = va_arg(ap, const char *); delim_len = strlen(delim); } /* Parse flags */ while (*p != '\0') { const char *np; bool stop; stop = false; np = p+1; switch (*p) { case '#': alt_form = true; break; case '0': padc = '0'; break; case '-': ladjust = true; break; case ' ': /* Must not override '+' */ if (signc != '+') signc = ' '; break; case '+': signc = '+'; break; default: /* Non-flag character */ stop = true; break; } if (stop) break; else p = np; } /* Parse minimum width */ if (*p == '*') { ssize_t arg; /* Width is supplied as an argument */ arg = va_arg(ap, int); /* Negative width argument is interpreted as * '-' flag followed by positive width */ if (arg < 0) { ladjust = true; arg = -arg; } width = arg; p++; } else if (bhnd_nv_isdigit(*p)) { uint32_t v; size_t len, parsed; /* Parse width value */ len = sizeof(v); error = bhnd_nvram_parse_int(p, strlen(p), 10, &parsed, &v, &len, BHND_NVRAM_TYPE_UINT32); if (error) { BHND_NV_LOG("error parsing width %s: %d\n", p, error); return (EINVAL); } /* Save width and advance input */ width = v; p += parsed; } /* Parse precision */ if (*p == '.') { uint32_t v; size_t len, parsed; p++; have_precision = true; if (*p == '*') { ssize_t arg; /* Precision is specified as an argument */ arg = va_arg(ap, int); /* Negative precision argument is interpreted * as '-' flag followed by positive * precision */ if (arg < 0) { ladjust = true; arg = -arg; } precision = arg; } else if (!bhnd_nv_isdigit(*p)) { /* Implicit precision of 0 */ precision = 0; } else { /* Parse precision value */ len = sizeof(v); error = bhnd_nvram_parse_int(p, strlen(p), 10, &parsed, &v, &len, BHND_NVRAM_TYPE_UINT32); if (error) { BHND_NV_LOG("error parsing width %s: " "%d\n", p, error); return (EINVAL); } /* Save precision and advance input */ precision = v; p += parsed; } } /* Parse length modifiers */ while (*p != '\0') { const char *np; bool stop; stop = false; np = p+1; switch (*p) { case 'h': if (lenc == '\0') { /* Set initial length value */ lenc = *p; bits = 16; } else if (lenc == *p && bits == 16) { /* Modify previous length value */ bits = 8; } else { BHND_NV_LOG("invalid length modifier " "%c\n", *p); return (EINVAL); } break; case 'l': if (lenc == '\0') { /* Set initial length value */ lenc = *p; bits = 32; } else if (lenc == *p && bits == 32) { /* Modify previous length value */ bits = 64; } else { BHND_NV_LOG("invalid length modifier " "%c\n", *p); return (EINVAL); } break; case 'j': /* Conflicts with all other length * specifications, and may only occur once */ if (lenc != '\0') { BHND_NV_LOG("invalid length modifier " "%c\n", *p); return (EINVAL); } lenc = *p; bits = 64; break; case 'I': { char *endp; /* Conflicts with all other length * specifications, and may only occur once */ if (lenc != '\0') { BHND_NV_LOG("invalid length modifier " "%c\n", *p); return (EINVAL); } lenc = *p; /* Parse the length specifier value */ p++; bits = strtoul(p, &endp, 10); if (p == endp) { BHND_NV_LOG("invalid size specifier: " "%s\n", p); return (EINVAL); } /* Advance input past the parsed integer */ np = endp; break; } default: /* Non-length modifier character */ stop = true; break; } if (stop) break; else p = np; } /* Parse conversion specifier and format the value(s) */ for (u_long n = 0; n < repeat; n++) { bhnd_nvram_type arg_type; size_t arg_size; size_t i; u_long base; bool is_signed, is_upper; is_signed = false; is_upper = false; base = 0; /* Fetch next element */ elem = bhnd_nvram_val_next(value, elem, &elen); if (elem == NULL) { BHND_NV_LOG("format string references more " "than %zu available value elements\n", bhnd_nvram_val_nelem(value)); return (EINVAL); } /* * If this is not the first value, append the delimiter. */ if (n > 0) { size_t nremain = 0; if (limit > nbytes) nremain = limit - nbytes; if (nremain >= delim_len) memcpy(outp + nbytes, delim, delim_len); /* Add delimiter length to the total byte count */ if (SIZE_MAX - nbytes < delim_len) return (EFTYPE); /* overflows size_t */ nbytes += delim_len; } /* Parse integer conversion specifiers */ switch (*p) { case 'd': case 'i': base = 10; is_signed = true; break; case 'u': base = 10; break; case 'o': base = 8; break; case 'x': base = 16; break; case 'X': base = 16; is_upper = true; break; } /* Format argument */ switch (*p) { #define NV_ENCODE_INT(_width) do { \ arg_type = (is_signed) ? BHND_NVRAM_TYPE_INT ## _width : \ BHND_NVRAM_TYPE_UINT ## _width; \ arg_size = sizeof(v.u ## _width); \ error = bhnd_nvram_val_encode_elem(value, elem, elen, \ &v.u ## _width, &arg_size, arg_type); \ if (error) { \ BHND_NV_LOG("error encoding argument as %s: %d\n", \ bhnd_nvram_type_name(arg_type), error); \ return (error); \ } \ \ if (is_signed) { \ if (v.i ## _width < 0) { \ add_neg = true; \ numval = (int64_t)-(v.i ## _width); \ } else { \ numval = (int64_t) (v.i ## _width); \ } \ } else { \ numval = v.u ## _width; \ } \ } while(0) case 'd': case 'i': case 'u': case 'o': case 'x': case 'X': { char numbuf[NV_NUMSTR_MAX]; char *sptr; uint64_t numval; size_t slen; bool add_neg; union { uint8_t u8; uint16_t u16; uint32_t u32; uint64_t u64; int8_t i8; int16_t i16; int32_t i32; int64_t i64; } v; add_neg = false; /* If precision is specified, it overrides * (and behaves identically) to a zero-prefixed * minimum width */ if (have_precision) { padc = '0'; width = precision; ladjust = false; } /* If zero-padding is used, value must be right * adjusted */ if (padc == '0') ladjust = false; /* Request encode to the appropriate integer * type, and then promote to common 64-bit * representation */ switch (bits) { case 8: NV_ENCODE_INT(8); break; case 16: NV_ENCODE_INT(16); break; case 32: NV_ENCODE_INT(32); break; case 64: NV_ENCODE_INT(64); break; default: BHND_NV_LOG("invalid length specifier: " "%lu\n", bits); return (EINVAL); } #undef NV_ENCODE_INT /* If a precision of 0 is specified and the * value is also zero, no characters should * be produced */ if (have_precision && precision == 0 && numval == 0) { break; } /* Emit string representation to local buffer */ BHND_NV_ASSERT(base <= 16, ("invalid base")); sptr = numbuf + nitems(numbuf) - 1; for (slen = 0; slen < sizeof(numbuf); slen++) { char c; uint64_t n; n = numval % base; c = bhnd_nv_hex2ascii(n); if (is_upper) c = bhnd_nv_toupper(c); sptr--; *sptr = c; numval /= (uint64_t)base; if (numval == 0) { slen++; break; } } arg_size = slen; /* Reserve space for 0/0x prefix? */ if (alt_form) { if (numval == 0) { /* If 0, no prefix */ alt_form = false; } else if (base == 8) { arg_size += 1; /* 0 */ } else if (base == 16) { arg_size += 2; /* 0x/0X */ } } /* Reserve space for ' ', '+', or '-' prefix? */ if (add_neg || signc != '\0') { if (add_neg) signc = '-'; arg_size++; } /* Right adjust (if using spaces) */ if (!ladjust && padc != '0') { for (i = arg_size; i < width; i++) WRITE_CHAR(padc); } if (signc != '\0') WRITE_CHAR(signc); if (alt_form) { if (base == 8) { WRITE_CHAR('0'); } else if (base == 16) { WRITE_CHAR('0'); if (is_upper) WRITE_CHAR('X'); else WRITE_CHAR('x'); } } /* Right adjust (if using zeros) */ if (!ladjust && padc == '0') { for (i = slen; i < width; i++) WRITE_CHAR(padc); } /* Write the string to our output buffer */ if (limit > nbytes && limit - nbytes >= slen) memcpy(outp + nbytes, sptr, slen); /* Update the total byte count */ if (SIZE_MAX - nbytes < arg_size) return (EFTYPE); /* overflows size_t */ nbytes += arg_size; /* Left adjust */ for (i = arg_size; ladjust && i < width; i++) WRITE_CHAR(padc); break; } case 's': { char *s; size_t slen; /* Query the total length of the element when * converted to a string */ arg_type = BHND_NVRAM_TYPE_STRING; error = bhnd_nvram_val_encode_elem(value, elem, elen, NULL, &arg_size, arg_type); if (error) { BHND_NV_LOG("error encoding argument " "as %s: %d\n", bhnd_nvram_type_name(arg_type), error); return (error); } /* Do not include trailing NUL in the string * length */ if (arg_size > 0) arg_size--; /* Right adjust */ for (i = arg_size; !ladjust && i < width; i++) WRITE_CHAR(padc); /* Determine output positition and remaining * buffer space */ if (limit > nbytes) { s = outp + nbytes; slen = limit - nbytes; } else { s = NULL; slen = 0; } /* Encode the string to our output buffer */ error = bhnd_nvram_val_encode_elem(value, elem, elen, s, &slen, arg_type); if (error && error != ENOMEM) { BHND_NV_LOG("error encoding argument " "as %s: %d\n", bhnd_nvram_type_name(arg_type), error); return (error); } /* Update the total byte count */ if (SIZE_MAX - nbytes < arg_size) return (EFTYPE); /* overflows size_t */ nbytes += arg_size; /* Left adjust */ for (i = arg_size; ladjust && i < width; i++) WRITE_CHAR(padc); break; } case 'c': { char c; arg_type = BHND_NVRAM_TYPE_CHAR; arg_size = bhnd_nvram_type_width(arg_type); /* Encode as single character */ error = bhnd_nvram_val_encode_elem(value, elem, elen, &c, &arg_size, arg_type); if (error) { BHND_NV_LOG("error encoding argument " "as %s: %d\n", bhnd_nvram_type_name(arg_type), error); return (error); } BHND_NV_ASSERT(arg_size == sizeof(c), ("invalid encoded size")); /* Right adjust */ for (i = arg_size; !ladjust && i < width; i++) WRITE_CHAR(padc); WRITE_CHAR(padc); /* Left adjust */ for (i = arg_size; ladjust && i < width; i++) WRITE_CHAR(padc); break; } } } } /* Append terminating NUL */ if (limit > nbytes) *(outp + nbytes) = '\0'; if (nbytes < SIZE_MAX) nbytes++; else return (EFTYPE); /* Report required space */ *olen = nbytes; if (limit < nbytes) { if (outp != NULL) return (ENOMEM); } return (0); } Index: head/sys/dev/drm2/drmP.h =================================================================== --- head/sys/dev/drm2/drmP.h (revision 350420) +++ head/sys/dev/drm2/drmP.h (revision 350421) @@ -1,1955 +1,1956 @@ /** * \file drmP.h * Private header for Direct Rendering Manager * * \author Rickard E. (Rik) Faith * \author Gareth Hughes */ /* * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * Copyright (c) 2009-2010, Code Aurora Forum. * All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #ifndef _DRM_P_H_ #define _DRM_P_H_ #if defined(_KERNEL) || defined(__KERNEL__) #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(CONFIG_AGP) || (defined(CONFIG_AGP_MODULE) && defined(MODULE)) #define __OS_HAS_AGP 1 #else #define __OS_HAS_AGP 0 #endif #if defined(CONFIG_MTRR) #define __OS_HAS_MTRR 1 #else #define __OS_HAS_MTRR 0 #endif struct drm_file; struct drm_device; #include #include #include "opt_drm.h" #include "opt_syscons.h" #ifdef DRM_DEBUG #undef DRM_DEBUG #define DRM_DEBUG_DEFAULT_ON 1 #endif /* DRM_DEBUG */ #define DRM_DEBUGBITS_DEBUG 0x1 #define DRM_DEBUGBITS_KMS 0x2 #define DRM_DEBUGBITS_FAILED_IOCTL 0x4 #undef DRM_LINUX #define DRM_LINUX 0 /***********************************************************************/ /** \name DRM template customization defaults */ /*@{*/ /* driver capabilities and requirements mask */ #define DRIVER_USE_AGP 0x1 #define DRIVER_REQUIRE_AGP 0x2 #define DRIVER_USE_MTRR 0x4 #define DRIVER_PCI_DMA 0x8 #define DRIVER_SG 0x10 #define DRIVER_HAVE_DMA 0x20 #define DRIVER_HAVE_IRQ 0x40 #define DRIVER_IRQ_SHARED 0x80 #define DRIVER_IRQ_VBL 0x100 #define DRIVER_DMA_QUEUE 0x200 #define DRIVER_FB_DMA 0x400 #define DRIVER_IRQ_VBL2 0x800 #define DRIVER_GEM 0x1000 #define DRIVER_MODESET 0x2000 #define DRIVER_PRIME 0x4000 #define DRIVER_BUS_PCI 0x1 #define DRIVER_BUS_PLATFORM 0x2 #define DRIVER_BUS_USB 0x3 /***********************************************************************/ /** \name Begin the DRM... */ /*@{*/ #define DRM_DEBUG_CODE 2 /**< Include debugging code if > 1, then also include looping detection. */ #define DRM_MAGIC_HASH_ORDER 4 /**< Size of key hash table. Must be power of 2. */ #define DRM_KERNEL_CONTEXT 0 /**< Change drm_resctx if changed */ #define DRM_RESERVED_CONTEXTS 1 /**< Change drm_resctx if changed */ #define DRM_LOOPING_LIMIT 5000000 #define DRM_TIME_SLICE (HZ/20) /**< Time slice for GLXContexts */ #define DRM_LOCK_SLICE 1 /**< Time slice for lock, in jiffies */ #define DRM_FLAG_DEBUG 0x01 #define DRM_MAX_CTXBITMAP (PAGE_SIZE * 8) #define DRM_MAP_HASH_OFFSET 0x10000000 /*@}*/ /***********************************************************************/ /** \name Macros to make printk easier */ /*@{*/ /** * Error output. * * \param fmt printf() like format string. * \param arg arguments */ #define DRM_ERROR(fmt, ...) \ printf("error: [" DRM_NAME ":pid%d:%s] *ERROR* " fmt, \ DRM_CURRENTPID, __func__ , ##__VA_ARGS__) #define DRM_WARNING(fmt, ...) printf("warning: [" DRM_NAME "] " fmt , ##__VA_ARGS__) #define DRM_INFO(fmt, ...) printf("info: [" DRM_NAME "] " fmt , ##__VA_ARGS__) /** * Debug output. * * \param fmt printf() like format string. * \param arg arguments */ #define DRM_DEBUG(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_DEBUG) != 0) \ printf("[" DRM_NAME ":pid%d:%s] " fmt, DRM_CURRENTPID, \ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_DEBUG_DRIVER(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME ":KMS:pid%d:%s] " fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_DEBUG_KMS(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME ":KMS:pid%d:%s] " fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:pid%d:%s]" fmt, DRM_CURRENTPID, \ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG_KMS(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:KMS:pid%d:%s]" fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG_MODE(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:pid%d:%s]" fmt, DRM_CURRENTPID, \ __func__ , ##__VA_ARGS__); \ } while (0) #define DRM_LOG_DRIVER(fmt, ...) do { \ if ((drm_debug & DRM_DEBUGBITS_KMS) != 0) \ printf("[" DRM_NAME "]:KMS:pid%d:%s]" fmt, DRM_CURRENTPID,\ __func__ , ##__VA_ARGS__); \ } while (0) /*@}*/ /***********************************************************************/ /** \name Internal types and structures */ /*@{*/ #define DRM_ARRAY_SIZE(x) ARRAY_SIZE(x) #define DRM_LEFTCOUNT(x) (((x)->rp + (x)->count - (x)->wp) % ((x)->count + 1)) #define DRM_BUFCOUNT(x) ((x)->count - DRM_LEFTCOUNT(x)) #define DRM_IF_VERSION(maj, min) (maj << 16 | min) /** * Test that the hardware lock is held by the caller, returning otherwise. * * \param dev DRM device. * \param filp file pointer of the caller. */ #define LOCK_TEST_WITH_RETURN( dev, _file_priv ) \ do { \ if (!_DRM_LOCK_IS_HELD(_file_priv->master->lock.hw_lock->lock) || \ _file_priv->master->lock.file_priv != _file_priv) { \ DRM_ERROR( "%s called without lock held, held %d owner %p %p\n",\ __func__, _DRM_LOCK_IS_HELD(_file_priv->master->lock.hw_lock->lock),\ _file_priv->master->lock.file_priv, _file_priv); \ return -EINVAL; \ } \ } while (0) /** * Ioctl function type. * * \param inode device inode. * \param file_priv DRM file private pointer. * \param cmd command. * \param arg argument. */ typedef int drm_ioctl_t(struct drm_device *dev, void *data, struct drm_file *file_priv); #define DRM_IOCTL_NR(n) ((n) & 0xff) #define DRM_MAJOR 226 #define DRM_AUTH 0x1 #define DRM_MASTER 0x2 #define DRM_ROOT_ONLY 0x4 #define DRM_CONTROL_ALLOW 0x8 #define DRM_UNLOCKED 0x10 struct drm_ioctl_desc { unsigned long cmd; int flags; drm_ioctl_t *func; unsigned int cmd_drv; }; /** * Creates a driver or general drm_ioctl_desc array entry for the given * ioctl, for use by drm_ioctl(). */ #define DRM_IOCTL_DEF(ioctl, _func, _flags) \ [DRM_IOCTL_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0} #define DRM_IOCTL_DEF_DRV(ioctl, _func, _flags) \ [DRM_IOCTL_NR(DRM_##ioctl)] = {.cmd = DRM_##ioctl, .func = _func, .flags = _flags, .cmd_drv = DRM_IOCTL_##ioctl} struct drm_magic_entry { struct list_head head; struct drm_hash_item hash_item; struct drm_file *priv; }; /** * DMA buffer. */ struct drm_buf { int idx; /**< Index into master buflist */ int total; /**< Buffer size */ int order; /**< log-base-2(total) */ int used; /**< Amount of buffer in use (for DMA) */ unsigned long offset; /**< Byte offset (used internally) */ void *address; /**< Address of buffer */ unsigned long bus_address; /**< Bus address of buffer */ struct drm_buf *next; /**< Kernel-only: used for free list */ __volatile__ int waiting; /**< On kernel DMA queue */ __volatile__ int pending; /**< On hardware DMA queue */ struct drm_file *file_priv; /**< Private of holding file descr */ int context; /**< Kernel queue for this buffer */ int while_locked; /**< Dispatch this buffer while locked */ enum { DRM_LIST_NONE = 0, DRM_LIST_FREE = 1, DRM_LIST_WAIT = 2, DRM_LIST_PEND = 3, DRM_LIST_PRIO = 4, DRM_LIST_RECLAIM = 5 } list; /**< Which list we're on */ int dev_priv_size; /**< Size of buffer private storage */ void *dev_private; /**< Per-buffer private storage */ }; struct drm_freelist { int initialized; /**< Freelist in use */ atomic_t count; /**< Number of free buffers */ struct drm_buf *next; /**< End pointer */ #ifdef FREEBSD_NOTYET wait_queue_head_t waiting; /**< Processes waiting on free bufs */ #endif /* defined(FREEBSD_NOTYET) */ int low_mark; /**< Low water mark */ int high_mark; /**< High water mark */ #ifdef FREEBSD_NOTYET atomic_t wfh; /**< If waiting for high mark */ spinlock_t lock; #endif /* defined(FREEBSD_NOTYET) */ }; typedef struct drm_dma_handle { void *vaddr; bus_addr_t busaddr; bus_dma_tag_t tag; bus_dmamap_t map; } drm_dma_handle_t; /** * Buffer entry. There is one of this for each buffer size order. */ struct drm_buf_entry { int buf_size; /**< size */ int buf_count; /**< number of buffers */ struct drm_buf *buflist; /**< buffer list */ int seg_count; int page_order; struct drm_dma_handle **seglist; struct drm_freelist freelist; }; /* Event queued up for userspace to read */ struct drm_pending_event { struct drm_event *event; struct list_head link; struct drm_file *file_priv; pid_t pid; /* pid of requester, no guarantee it's valid by the time we deliver the event, for tracing only */ void (*destroy)(struct drm_pending_event *event); }; /* initial implementaton using a linked list - todo hashtab */ struct drm_prime_file_private { struct list_head head; struct mtx lock; }; struct drm_file { int authenticated; pid_t pid; uid_t uid; drm_magic_t magic; unsigned long ioctl_count; struct list_head lhead; struct drm_minor *minor; unsigned long lock_count; void *driver_priv; struct drm_gem_names object_names; int is_master; /* this file private is a master for a minor */ struct drm_master *master; /* master this node is currently associated with N.B. not always minor->master */ struct list_head fbs; struct selinfo event_poll; struct list_head event_list; int event_space; struct drm_prime_file_private prime; }; /** * Lock data. */ struct drm_lock_data { struct drm_hw_lock *hw_lock; /**< Hardware lock */ /** Private of lock holder's file (NULL=kernel) */ struct drm_file *file_priv; wait_queue_head_t lock_queue; /**< Queue of blocked processes */ unsigned long lock_time; /**< Time of last lock in jiffies */ struct mtx spinlock; uint32_t kernel_waiters; uint32_t user_waiters; int idle_has_lock; }; /** * DMA data. */ struct drm_device_dma { struct drm_buf_entry bufs[DRM_MAX_ORDER + 1]; /**< buffers, grouped by their size order */ int buf_count; /**< total number of buffers */ struct drm_buf **buflist; /**< Vector of pointers into drm_device_dma::bufs */ int seg_count; int page_count; /**< number of pages */ unsigned long *pagelist; /**< page list */ unsigned long byte_count; enum { _DRM_DMA_USE_AGP = 0x01, _DRM_DMA_USE_SG = 0x02, _DRM_DMA_USE_FB = 0x04, _DRM_DMA_USE_PCI_RO = 0x08 } flags; }; /** * AGP memory entry. Stored as a doubly linked list. */ struct drm_agp_mem { unsigned long handle; /**< handle */ DRM_AGP_MEM *memory; unsigned long bound; /**< address */ int pages; struct list_head head; }; /** * AGP data. * * \sa drm_agp_init() and drm_device::agp. */ struct drm_agp_head { DRM_AGP_KERN agp_info; /**< AGP device information */ struct list_head memory; unsigned long mode; /**< AGP mode */ device_t bridge; int enabled; /**< whether the AGP bus as been enabled */ int acquired; /**< whether the AGP device has been acquired */ unsigned long base; int agp_mtrr; int cant_use_aperture; }; /** * Scatter-gather memory. */ struct drm_sg_mem { vm_offset_t vaddr; vm_paddr_t *busaddr; vm_pindex_t pages; }; struct drm_sigdata { int context; struct drm_hw_lock *lock; }; /** * Kernel side of a mapping */ #define DRM_MAP_HANDLE_BITS (sizeof(void *) == 4 ? 4 : 24) #define DRM_MAP_HANDLE_SHIFT (sizeof(void *) * 8 - DRM_MAP_HANDLE_BITS) struct drm_local_map { resource_size_t offset; /**< Requested physical address (0 for SAREA)*/ unsigned long size; /**< Requested physical size (bytes) */ enum drm_map_type type; /**< Type of memory to map */ enum drm_map_flags flags; /**< Flags */ void *handle; /**< User-space: "Handle" to pass to mmap() */ /**< Kernel-space: kernel-virtual address */ int mtrr; /**< MTRR slot used */ /* Private data */ drm_dma_handle_t *dmah; }; typedef struct drm_local_map drm_local_map_t; /** * Mappings list */ struct drm_map_list { struct list_head head; /**< list head */ struct drm_hash_item hash; struct drm_local_map *map; /**< mapping */ uint64_t user_token; struct drm_master *master; struct drm_mm_node *file_offset_node; /**< fake offset */ }; /** * Context handle list */ struct drm_ctx_list { struct list_head head; /**< list head */ drm_context_t handle; /**< context handle */ struct drm_file *tag; /**< associated fd private data */ }; /* location of GART table */ #define DRM_ATI_GART_MAIN 1 #define DRM_ATI_GART_FB 2 #define DRM_ATI_GART_PCI 1 #define DRM_ATI_GART_PCIE 2 #define DRM_ATI_GART_IGP 3 struct drm_ati_pcigart_info { int gart_table_location; int gart_reg_if; void *addr; dma_addr_t bus_addr; dma_addr_t table_mask; struct drm_dma_handle *table_handle; struct drm_local_map mapping; int table_size; struct drm_dma_handle *dmah; /* handle for ATI PCIGART table FIXME */ }; /** * GEM specific mm private for tracking GEM objects */ struct drm_gem_mm { struct unrhdr *idxunr; struct drm_open_hash offset_hash; /**< User token hash table for maps */ }; /** * This structure defines the drm_mm memory object, which will be used by the * DRM for its buffer objects. */ struct drm_gem_object { /** Reference count of this object */ u_int refcount; /** Handle count of this object. Each handle also holds a reference */ atomic_t handle_count; /* number of handles on this object */ /** Related drm device */ struct drm_device *dev; /** File representing the shmem storage: filp in Linux parlance */ vm_object_t vm_obj; /* Mapping info for this object */ bool on_map; struct drm_hash_item map_list; /** * Size of the object, in bytes. Immutable over the object's * lifetime. */ size_t size; /** * Global name for this object, starts at 1. 0 means unnamed. * Access is covered by the object_name_lock in the related drm_device */ int name; /** * Memory domains. These monitor which caches contain read/write data * related to the object. When transitioning from one set of domains * to another, the driver is called to ensure that caches are suitably * flushed and invalidated */ uint32_t read_domains; uint32_t write_domain; /** * While validating an exec operation, the * new read/write domain values are computed here. * They will be transferred to the above values * at the point that any cache flushing occurs */ uint32_t pending_read_domains; uint32_t pending_write_domain; void *driver_private; #ifdef FREEBSD_NOTYET /* dma buf exported from this GEM object */ struct dma_buf *export_dma_buf; /* dma buf attachment backing this object */ struct dma_buf_attachment *import_attach; #endif /* FREEBSD_NOTYET */ }; #include /* per-master structure */ struct drm_master { u_int refcount; /* refcount for this master */ struct list_head head; /**< each minor contains a list of masters */ struct drm_minor *minor; /**< link back to minor we are a master for */ char *unique; /**< Unique identifier: e.g., busid */ int unique_len; /**< Length of unique field */ int unique_size; /**< amount allocated */ int blocked; /**< Blocked due to VC switch? */ /** \name Authentication */ /*@{ */ struct drm_open_hash magiclist; struct list_head magicfree; /*@} */ struct drm_lock_data lock; /**< Information on hardware lock */ void *driver_priv; /**< Private structure for driver to use */ }; /* Size of ringbuffer for vblank timestamps. Just double-buffer * in initial implementation. */ #define DRM_VBLANKTIME_RBSIZE 2 /* Flags and return codes for get_vblank_timestamp() driver function. */ #define DRM_CALLED_FROM_VBLIRQ 1 #define DRM_VBLANKTIME_SCANOUTPOS_METHOD (1 << 0) #define DRM_VBLANKTIME_INVBL (1 << 1) /* get_scanout_position() return flags */ #define DRM_SCANOUTPOS_VALID (1 << 0) #define DRM_SCANOUTPOS_INVBL (1 << 1) #define DRM_SCANOUTPOS_ACCURATE (1 << 2) struct drm_bus { int bus_type; int (*get_irq)(struct drm_device *dev); void (*free_irq)(struct drm_device *dev); const char *(*get_name)(struct drm_device *dev); int (*set_busid)(struct drm_device *dev, struct drm_master *master); int (*set_unique)(struct drm_device *dev, struct drm_master *master, struct drm_unique *unique); int (*irq_by_busid)(struct drm_device *dev, struct drm_irq_busid *p); /* hooks that are for PCI */ int (*agp_init)(struct drm_device *dev); }; /** * DRM driver structure. This structure represent the common code for * a family of cards. There will one drm_device for each card present * in this family */ struct drm_driver { int (*load) (struct drm_device *, unsigned long flags); int (*firstopen) (struct drm_device *); int (*open) (struct drm_device *, struct drm_file *); void (*preclose) (struct drm_device *, struct drm_file *file_priv); void (*postclose) (struct drm_device *, struct drm_file *); void (*lastclose) (struct drm_device *); int (*unload) (struct drm_device *); int (*suspend) (struct drm_device *, pm_message_t state); int (*resume) (struct drm_device *); int (*dma_ioctl) (struct drm_device *dev, void *data, struct drm_file *file_priv); int (*dma_quiescent) (struct drm_device *); int (*context_dtor) (struct drm_device *dev, int context); /** * get_vblank_counter - get raw hardware vblank counter * @dev: DRM device * @crtc: counter to fetch * * Driver callback for fetching a raw hardware vblank counter for @crtc. * If a device doesn't have a hardware counter, the driver can simply * return the value of drm_vblank_count. The DRM core will account for * missed vblank events while interrupts where disabled based on system * timestamps. * * Wraparound handling and loss of events due to modesetting is dealt * with in the DRM core code. * * RETURNS * Raw vblank counter value. */ u32 (*get_vblank_counter) (struct drm_device *dev, int crtc); /** * enable_vblank - enable vblank interrupt events * @dev: DRM device * @crtc: which irq to enable * * Enable vblank interrupts for @crtc. If the device doesn't have * a hardware vblank counter, this routine should be a no-op, since * interrupts will have to stay on to keep the count accurate. * * RETURNS * Zero on success, appropriate errno if the given @crtc's vblank * interrupt cannot be enabled. */ int (*enable_vblank) (struct drm_device *dev, int crtc); /** * disable_vblank - disable vblank interrupt events * @dev: DRM device * @crtc: which irq to enable * * Disable vblank interrupts for @crtc. If the device doesn't have * a hardware vblank counter, this routine should be a no-op, since * interrupts will have to stay on to keep the count accurate. */ void (*disable_vblank) (struct drm_device *dev, int crtc); /** * Called by \c drm_device_is_agp. Typically used to determine if a * card is really attached to AGP or not. * * \param dev DRM device handle * * \returns * One of three values is returned depending on whether or not the * card is absolutely \b not AGP (return of 0), absolutely \b is AGP * (return of 1), or may or may not be AGP (return of 2). */ int (*device_is_agp) (struct drm_device *dev); /** * Called by vblank timestamping code. * * Return the current display scanout position from a crtc. * * \param dev DRM device. * \param crtc Id of the crtc to query. * \param *vpos Target location for current vertical scanout position. * \param *hpos Target location for current horizontal scanout position. * * Returns vpos as a positive number while in active scanout area. * Returns vpos as a negative number inside vblank, counting the number * of scanlines to go until end of vblank, e.g., -1 means "one scanline * until start of active scanout / end of vblank." * * \return Flags, or'ed together as follows: * * DRM_SCANOUTPOS_VALID = Query successful. * DRM_SCANOUTPOS_INVBL = Inside vblank. * DRM_SCANOUTPOS_ACCURATE = Returned position is accurate. A lack of * this flag means that returned position may be offset by a constant * but unknown small number of scanlines wrt. real scanout position. * */ int (*get_scanout_position) (struct drm_device *dev, int crtc, int *vpos, int *hpos); /** * Called by \c drm_get_last_vbltimestamp. Should return a precise * timestamp when the most recent VBLANK interval ended or will end. * * Specifically, the timestamp in @vblank_time should correspond as * closely as possible to the time when the first video scanline of * the video frame after the end of VBLANK will start scanning out, * the time immediately after end of the VBLANK interval. If the * @crtc is currently inside VBLANK, this will be a time in the future. * If the @crtc is currently scanning out a frame, this will be the * past start time of the current scanout. This is meant to adhere * to the OpenML OML_sync_control extension specification. * * \param dev dev DRM device handle. * \param crtc crtc for which timestamp should be returned. * \param *max_error Maximum allowable timestamp error in nanoseconds. * Implementation should strive to provide timestamp * with an error of at most *max_error nanoseconds. * Returns true upper bound on error for timestamp. * \param *vblank_time Target location for returned vblank timestamp. * \param flags 0 = Defaults, no special treatment needed. * \param DRM_CALLED_FROM_VBLIRQ = Function is called from vblank * irq handler. Some drivers need to apply some workarounds * for gpu-specific vblank irq quirks if flag is set. * * \returns * Zero if timestamping isn't supported in current display mode or a * negative number on failure. A positive status code on success, * which describes how the vblank_time timestamp was computed. */ int (*get_vblank_timestamp) (struct drm_device *dev, int crtc, int *max_error, struct timeval *vblank_time, unsigned flags); /* these have to be filled in */ irqreturn_t(*irq_handler) (DRM_IRQ_ARGS); void (*irq_preinstall) (struct drm_device *dev); int (*irq_postinstall) (struct drm_device *dev); void (*irq_uninstall) (struct drm_device *dev); void (*set_version) (struct drm_device *dev, struct drm_set_version *sv); /* Master routines */ int (*master_create)(struct drm_device *dev, struct drm_master *master); void (*master_destroy)(struct drm_device *dev, struct drm_master *master); /** * master_set is called whenever the minor master is set. * master_drop is called whenever the minor master is dropped. */ int (*master_set)(struct drm_device *dev, struct drm_file *file_priv, bool from_open); void (*master_drop)(struct drm_device *dev, struct drm_file *file_priv, bool from_release); /** * Driver-specific constructor for drm_gem_objects, to set up * obj->driver_private. * * Returns 0 on success. */ int (*gem_init_object) (struct drm_gem_object *obj); void (*gem_free_object) (struct drm_gem_object *obj); int (*gem_open_object) (struct drm_gem_object *, struct drm_file *); void (*gem_close_object) (struct drm_gem_object *, struct drm_file *); #ifdef FREEBSD_NOTYET /* prime: */ /* export handle -> fd (see drm_gem_prime_handle_to_fd() helper) */ int (*prime_handle_to_fd)(struct drm_device *dev, struct drm_file *file_priv, uint32_t handle, uint32_t flags, int *prime_fd); /* import fd -> handle (see drm_gem_prime_fd_to_handle() helper) */ int (*prime_fd_to_handle)(struct drm_device *dev, struct drm_file *file_priv, int prime_fd, uint32_t *handle); /* export GEM -> dmabuf */ struct dma_buf * (*gem_prime_export)(struct drm_device *dev, struct drm_gem_object *obj, int flags); /* import dmabuf -> GEM */ struct drm_gem_object * (*gem_prime_import)(struct drm_device *dev, struct dma_buf *dma_buf); #endif /* defined(FREEBSD_NOTYET) */ /* dumb alloc support */ int (*dumb_create)(struct drm_file *file_priv, struct drm_device *dev, struct drm_mode_create_dumb *args); int (*dumb_map_offset)(struct drm_file *file_priv, struct drm_device *dev, uint32_t handle, uint64_t *offset); int (*dumb_destroy)(struct drm_file *file_priv, struct drm_device *dev, uint32_t handle); /* Driver private ops for this object */ struct cdev_pager_ops *gem_pager_ops; int (*sysctl_init)(struct drm_device *dev, struct sysctl_ctx_list *ctx, struct sysctl_oid *top); void (*sysctl_cleanup)(struct drm_device *dev); int major; int minor; int patchlevel; char *name; char *desc; char *date; u32 driver_features; int dev_priv_size; struct drm_ioctl_desc *ioctls; int num_ioctls; struct drm_bus *bus; #ifdef COMPAT_FREEBSD32 struct drm_ioctl_desc *compat_ioctls; int *num_compat_ioctls; #endif int buf_priv_size; }; #define DRM_MINOR_UNASSIGNED 0 #define DRM_MINOR_LEGACY 1 #define DRM_MINOR_CONTROL 2 #define DRM_MINOR_RENDER 3 /** * DRM minor structure. This structure represents a drm minor number. */ struct drm_minor { int index; /**< Minor device number */ int type; /**< Control or render */ struct cdev *device; /**< Device number for mknod */ device_t kdev; /**< OS device */ struct drm_device *dev; struct drm_master *master; /* currently active master for this node */ struct list_head master_list; struct drm_mode_group mode_group; struct sigio *buf_sigio; /* Processes waiting for SIGIO */ }; /* mode specified on the command line */ struct drm_cmdline_mode { bool specified; bool refresh_specified; bool bpp_specified; int xres, yres; int bpp; int refresh; bool rb; bool interlace; bool cvt; bool margins; enum drm_connector_force force; }; struct drm_pending_vblank_event { struct drm_pending_event base; int pipe; struct drm_event_vblank event; }; /** * DRM device structure. This structure represent a complete card that * may contain multiple heads. */ struct drm_device { int if_version; /**< Highest interface version set */ /** \name Locks */ /*@{ */ struct mtx count_lock; /**< For inuse, drm_device::open_count, drm_device::buf_use */ struct sx dev_struct_lock; /**< For others */ /*@} */ /** \name Usage Counters */ /*@{ */ int open_count; /**< Outstanding files open */ atomic_t ioctl_count; /**< Outstanding IOCTLs pending */ atomic_t vma_count; /**< Outstanding vma areas open */ int buf_use; /**< Buffers in use -- cannot alloc */ atomic_t buf_alloc; /**< Buffer allocation in progress */ /*@} */ /** \name Performance counters */ /*@{ */ unsigned long counters; enum drm_stat_type types[15]; atomic_t counts[15]; /*@} */ struct list_head filelist; /** \name Memory management */ /*@{ */ struct list_head maplist; /**< Linked list of regions */ int map_count; /**< Number of mappable regions */ struct drm_open_hash map_hash; /**< User token hash table for maps */ /** \name Context handle management */ /*@{ */ struct list_head ctxlist; /**< Linked list of context handles */ int ctx_count; /**< Number of context handles */ struct mtx ctxlist_mutex; /**< For ctxlist */ drm_local_map_t **context_sareas; int max_context; unsigned long *ctx_bitmap; /*@} */ /** \name DMA support */ /*@{ */ struct drm_device_dma *dma; /**< Optional pointer for DMA support */ /*@} */ /** \name Context support */ /*@{ */ int irq_enabled; /**< True if irq handler is enabled */ atomic_t context_flag; /**< Context swapping flag */ atomic_t interrupt_flag; /**< Interruption handler flag */ atomic_t dma_flag; /**< DMA dispatch flag */ wait_queue_head_t context_wait; /**< Processes waiting on ctx switch */ int last_checked; /**< Last context checked for DMA */ int last_context; /**< Last current context */ unsigned long last_switch; /**< jiffies at last context switch */ /*@} */ /** \name VBLANK IRQ support */ /*@{ */ /* * At load time, disabling the vblank interrupt won't be allowed since * old clients may not call the modeset ioctl and therefore misbehave. * Once the modeset ioctl *has* been called though, we can safely * disable them when unused. */ int vblank_disable_allowed; atomic_t *_vblank_count; /**< number of VBLANK interrupts (driver must alloc the right number of counters) */ struct timeval *_vblank_time; /**< timestamp of current vblank_count (drivers must alloc right number of fields) */ struct mtx vblank_time_lock; /**< Protects vblank count and time updates during vblank enable/disable */ struct mtx vbl_lock; atomic_t *vblank_refcount; /* number of users of vblank interruptsper crtc */ u32 *last_vblank; /* protected by dev->vbl_lock, used */ /* for wraparound handling */ int *vblank_enabled; /* so we don't call enable more than once per disable */ int *vblank_inmodeset; /* Display driver is setting mode */ u32 *last_vblank_wait; /* Last vblank seqno waited per CRTC */ struct callout vblank_disable_callout; u32 max_vblank_count; /**< size of vblank counter register */ /** * List of events */ struct list_head vblank_event_list; struct mtx event_lock; /*@} */ struct drm_agp_head *agp; /**< AGP data */ device_t dev; /* Device instance from newbus */ uint16_t pci_device; /* PCI device id */ uint16_t pci_vendor; /* PCI vendor id */ uint16_t pci_subdevice; /* PCI subsystem device id */ uint16_t pci_subvendor; /* PCI subsystem vendor id */ struct drm_sg_mem *sg; /**< Scatter gather memory */ unsigned int num_crtcs; /**< Number of CRTCs on this device */ void *dev_private; /**< device private data */ void *mm_private; struct drm_sigdata sigdata; /**< For block_all_signals */ sigset_t sigmask; struct drm_driver *driver; struct drm_local_map *agp_buffer_map; unsigned int agp_buffer_token; struct drm_minor *control; /**< Control node for card */ struct drm_minor *primary; /**< render type primary screen head */ struct drm_mode_config mode_config; /**< Current mode config */ /** \name GEM information */ /*@{ */ struct sx object_name_lock; struct drm_gem_names object_names; /*@} */ int switch_power_state; atomic_t unplugged; /* device has been unplugged or gone away */ /* Locks */ struct mtx dma_lock; /* protects dev->dma */ struct mtx irq_lock; /* protects irq condition checks */ /* Context support */ int irq; /* Interrupt used by board */ int msi_enabled; /* MSI enabled */ int irqrid; /* Interrupt used by board */ struct resource *irqr; /* Resource for interrupt used by board */ void *irqh; /* Handle from bus_setup_intr */ /* Storage of resource pointers for drm_get_resource_* */ #define DRM_MAX_PCI_RESOURCE 6 struct resource *pcir[DRM_MAX_PCI_RESOURCE]; int pcirid[DRM_MAX_PCI_RESOURCE]; struct mtx pcir_lock; int pci_domain; int pci_bus; int pci_slot; int pci_func; /* Sysctl support */ struct drm_sysctl_info *sysctl; int sysctl_node_idx; void *drm_ttm_bdev; void *sysctl_private; char busid_str[128]; int modesetting; const drm_pci_id_list_t *id_entry; /* PCI ID, name, and chipset private */ }; #define DRM_SWITCH_POWER_ON 0 #define DRM_SWITCH_POWER_OFF 1 #define DRM_SWITCH_POWER_CHANGING 2 static __inline__ int drm_core_check_feature(struct drm_device *dev, int feature) { return ((dev->driver->driver_features & feature) ? 1 : 0); } static inline int drm_dev_to_irq(struct drm_device *dev) { return dev->driver->bus->get_irq(dev); } #if __OS_HAS_AGP static inline int drm_core_has_AGP(struct drm_device *dev) { return drm_core_check_feature(dev, DRIVER_USE_AGP); } #else #define drm_core_has_AGP(dev) (0) #endif #if __OS_HAS_MTRR static inline int drm_core_has_MTRR(struct drm_device *dev) { return drm_core_check_feature(dev, DRIVER_USE_MTRR); } #define DRM_MTRR_WC MDF_WRITECOMBINE int drm_mtrr_add(unsigned long offset, unsigned long size, unsigned int flags); int drm_mtrr_del(int handle, unsigned long offset, unsigned long size, unsigned int flags); #else #define drm_core_has_MTRR(dev) (0) #define DRM_MTRR_WC 0 static inline int drm_mtrr_add(unsigned long offset, unsigned long size, unsigned int flags) { return 0; } static inline int drm_mtrr_del(int handle, unsigned long offset, unsigned long size, unsigned int flags) { return 0; } #endif /******************************************************************/ /** \name Internal function definitions */ /*@{*/ /* Driver support (drm_drv.h) */ d_ioctl_t drm_ioctl; extern int drm_lastclose(struct drm_device *dev); /* Device support (drm_fops.h) */ extern struct sx drm_global_mutex; d_open_t drm_open; d_read_t drm_read; extern void drm_release(void *data); /* Mapping support (drm_vm.h) */ d_mmap_t drm_mmap; int drm_mmap_single(struct cdev *kdev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **obj_res, int nprot); d_poll_t drm_poll; /* Misc. IOCTL support (drm_ioctl.h) */ extern int drm_irq_by_busid(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getunique(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_setunique(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getmap(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getclient(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getstats(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getcap(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_setversion(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_noop(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Context IOCTL support (drm_context.h) */ extern int drm_resctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_addctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_modctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_switchctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_newctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_rmctx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_ctxbitmap_init(struct drm_device *dev); extern void drm_ctxbitmap_cleanup(struct drm_device *dev); extern void drm_ctxbitmap_free(struct drm_device *dev, int ctx_handle); extern int drm_setsareactx(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_getsareactx(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Authentication IOCTL support (drm_auth.h) */ extern int drm_getmagic(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_authmagic(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_remove_magic(struct drm_master *master, drm_magic_t magic); /* Cache management (drm_cache.c) */ void drm_clflush_pages(vm_page_t *pages, unsigned long num_pages); void drm_clflush_virt_range(char *addr, unsigned long length); /* Locking IOCTL support (drm_lock.h) */ extern int drm_lock(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_unlock(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_lock_free(struct drm_lock_data *lock_data, unsigned int context); extern void drm_idlelock_take(struct drm_lock_data *lock_data); extern void drm_idlelock_release(struct drm_lock_data *lock_data); /* * These are exported to drivers so that they can implement fencing using * DMA quiscent + idle. DMA quiescent usually requires the hardware lock. */ extern int drm_i_have_hw_lock(struct drm_device *dev, struct drm_file *file_priv); /* Buffer management support (drm_bufs.h) */ extern int drm_addbufs_agp(struct drm_device *dev, struct drm_buf_desc * request); extern int drm_addbufs_pci(struct drm_device *dev, struct drm_buf_desc * request); extern int drm_addmap(struct drm_device *dev, resource_size_t offset, unsigned int size, enum drm_map_type type, enum drm_map_flags flags, struct drm_local_map **map_ptr); extern int drm_addmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_rmmap(struct drm_device *dev, struct drm_local_map *map); extern int drm_rmmap_locked(struct drm_device *dev, struct drm_local_map *map); extern int drm_rmmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_addbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_infobufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_markbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_freebufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_mapbufs(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_order(unsigned long size); /* DMA support (drm_dma.h) */ extern int drm_dma_setup(struct drm_device *dev); extern void drm_dma_takedown(struct drm_device *dev); extern void drm_free_buffer(struct drm_device *dev, struct drm_buf * buf); extern void drm_core_reclaim_buffers(struct drm_device *dev, struct drm_file *filp); /* IRQ support (drm_irq.h) */ extern int drm_control(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_irq_install(struct drm_device *dev); extern int drm_irq_uninstall(struct drm_device *dev); extern int drm_vblank_init(struct drm_device *dev, int num_crtcs); extern int drm_wait_vblank(struct drm_device *dev, void *data, struct drm_file *filp); extern int drm_vblank_wait(struct drm_device *dev, unsigned int *vbl_seq); extern u32 drm_vblank_count(struct drm_device *dev, int crtc); extern u32 drm_vblank_count_and_time(struct drm_device *dev, int crtc, struct timeval *vblanktime); extern void drm_send_vblank_event(struct drm_device *dev, int crtc, struct drm_pending_vblank_event *e); extern bool drm_handle_vblank(struct drm_device *dev, int crtc); extern int drm_vblank_get(struct drm_device *dev, int crtc); extern void drm_vblank_put(struct drm_device *dev, int crtc); extern void drm_vblank_off(struct drm_device *dev, int crtc); extern void drm_vblank_cleanup(struct drm_device *dev); extern u32 drm_get_last_vbltimestamp(struct drm_device *dev, int crtc, struct timeval *tvblank, unsigned flags); extern int drm_calc_vbltimestamp_from_scanoutpos(struct drm_device *dev, int crtc, int *max_error, struct timeval *vblank_time, unsigned flags, struct drm_crtc *refcrtc); extern void drm_calc_timestamping_constants(struct drm_crtc *crtc); extern bool drm_mode_parse_command_line_for_connector(const char *mode_option, struct drm_connector *connector, struct drm_cmdline_mode *mode); extern struct drm_display_mode * drm_mode_create_from_cmdline_mode(struct drm_device *dev, struct drm_cmdline_mode *cmd); /* Modesetting support */ extern void drm_vblank_pre_modeset(struct drm_device *dev, int crtc); extern void drm_vblank_post_modeset(struct drm_device *dev, int crtc); extern int drm_modeset_ctl(struct drm_device *dev, void *data, struct drm_file *file_priv); /* Stub support (drm_stub.h) */ extern int drm_setmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_dropmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); struct drm_master *drm_master_create(struct drm_minor *minor); extern struct drm_master *drm_master_get(struct drm_master *master); extern void drm_master_put(struct drm_master **master); extern void drm_put_dev(struct drm_device *dev); extern int drm_put_minor(struct drm_minor **minor); extern void drm_unplug_dev(struct drm_device *dev); extern unsigned int drm_debug; extern unsigned int drm_notyet; extern unsigned int drm_vblank_offdelay; extern unsigned int drm_timestamp_precision; extern unsigned int drm_timestamp_monotonic; extern struct drm_local_map *drm_getsarea(struct drm_device *dev); #ifdef FREEBSD_NOTYET extern int drm_gem_prime_handle_to_fd(struct drm_device *dev, struct drm_file *file_priv, uint32_t handle, uint32_t flags, int *prime_fd); extern int drm_gem_prime_fd_to_handle(struct drm_device *dev, struct drm_file *file_priv, int prime_fd, uint32_t *handle); extern int drm_prime_handle_to_fd_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_prime_fd_to_handle_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_prime_sg_to_page_addr_arrays(struct sg_table *sgt, vm_page_t *pages, dma_addr_t *addrs, int max_pages); extern struct sg_table *drm_prime_pages_to_sg(vm_page_t *pages, int nr_pages); extern void drm_prime_gem_destroy(struct drm_gem_object *obj, struct sg_table *sg); void drm_prime_init_file_private(struct drm_prime_file_private *prime_fpriv); void drm_prime_destroy_file_private(struct drm_prime_file_private *prime_fpriv); int drm_prime_add_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf, uint32_t handle); int drm_prime_lookup_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf, uint32_t *handle); void drm_prime_remove_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf); int drm_prime_add_dma_buf(struct drm_device *dev, struct drm_gem_object *obj); int drm_prime_lookup_obj(struct drm_device *dev, struct dma_buf *buf, struct drm_gem_object **obj); #endif /* FREEBSD_NOTYET */ /* Scatter Gather Support (drm_scatter.h) */ extern void drm_sg_cleanup(struct drm_sg_mem * entry); extern int drm_sg_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request); extern int drm_sg_free(struct drm_device *dev, void *data, struct drm_file *file_priv); /* ATI PCIGART support (ati_pcigart.h) */ extern int drm_ati_pcigart_init(struct drm_device *dev, struct drm_ati_pcigart_info * gart_info); extern int drm_ati_pcigart_cleanup(struct drm_device *dev, struct drm_ati_pcigart_info * gart_info); extern drm_dma_handle_t *drm_pci_alloc(struct drm_device *dev, size_t size, size_t align, dma_addr_t maxaddr); extern void __drm_pci_free(struct drm_device *dev, drm_dma_handle_t * dmah); extern void drm_pci_free(struct drm_device *dev, drm_dma_handle_t * dmah); /* Graphics Execution Manager library functions (drm_gem.c) */ int drm_gem_init(struct drm_device *dev); void drm_gem_destroy(struct drm_device *dev); void drm_gem_object_release(struct drm_gem_object *obj); void drm_gem_object_free(struct drm_gem_object *obj); struct drm_gem_object *drm_gem_object_alloc(struct drm_device *dev, size_t size); int drm_gem_object_init(struct drm_device *dev, struct drm_gem_object *obj, size_t size); int drm_gem_private_object_init(struct drm_device *dev, struct drm_gem_object *obj, size_t size); void drm_gem_object_handle_free(struct drm_gem_object *obj); int drm_gem_mmap_single(struct drm_device *dev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **obj_res, int nprot); void drm_gem_pager_dtr(void *obj); #include static inline void drm_gem_object_reference(struct drm_gem_object *obj) { KASSERT(obj->refcount > 0, ("Dangling obj %p", obj)); refcount_acquire(&obj->refcount); } static inline void drm_gem_object_unreference(struct drm_gem_object *obj) { if (obj == NULL) return; if (refcount_release(&obj->refcount)) drm_gem_object_free(obj); } static inline void drm_gem_object_unreference_unlocked(struct drm_gem_object *obj) { if (obj != NULL) { struct drm_device *dev = obj->dev; DRM_LOCK(dev); drm_gem_object_unreference(obj); DRM_UNLOCK(dev); } } int drm_gem_handle_create(struct drm_file *file_priv, struct drm_gem_object *obj, u32 *handlep); int drm_gem_handle_delete(struct drm_file *filp, u32 handle); static inline void drm_gem_object_handle_reference(struct drm_gem_object *obj) { drm_gem_object_reference(obj); atomic_inc(&obj->handle_count); } static inline void drm_gem_object_handle_unreference(struct drm_gem_object *obj) { if (obj == NULL) return; if (atomic_read(&obj->handle_count) == 0) return; /* * Must bump handle count first as this may be the last * ref, in which case the object would disappear before we * checked for a name */ if (atomic_dec_and_test(&obj->handle_count)) drm_gem_object_handle_free(obj); drm_gem_object_unreference(obj); } static inline void drm_gem_object_handle_unreference_unlocked(struct drm_gem_object *obj) { if (obj == NULL) return; if (atomic_read(&obj->handle_count) == 0) return; /* * Must bump handle count first as this may be the last * ref, in which case the object would disappear before we * checked for a name */ if (atomic_dec_and_test(&obj->handle_count)) drm_gem_object_handle_free(obj); drm_gem_object_unreference_unlocked(obj); } void drm_gem_free_mmap_offset(struct drm_gem_object *obj); int drm_gem_create_mmap_offset(struct drm_gem_object *obj); struct drm_gem_object *drm_gem_object_lookup(struct drm_device *dev, struct drm_file *filp, u32 handle); int drm_gem_close_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_gem_flink_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int drm_gem_open_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); void drm_gem_open(struct drm_device *dev, struct drm_file *file_private); void drm_gem_release(struct drm_device *dev, struct drm_file *file_private); extern void drm_core_ioremap(struct drm_local_map *map, struct drm_device *dev); extern void drm_core_ioremap_wc(struct drm_local_map *map, struct drm_device *dev); extern void drm_core_ioremapfree(struct drm_local_map *map, struct drm_device *dev); static __inline__ struct drm_local_map *drm_core_findmap(struct drm_device *dev, unsigned int token) { struct drm_map_list *_entry; list_for_each_entry(_entry, &dev->maplist, head) if (_entry->user_token == token) return _entry->map; return NULL; } static __inline__ void drm_core_dropmap(struct drm_local_map *map) { } #include extern int drm_fill_in_dev(struct drm_device *dev, struct drm_driver *driver); extern void drm_cancel_fill_in_dev(struct drm_device *dev); int drm_get_minor(struct drm_device *dev, struct drm_minor **minor, int type); /*@}*/ /* PCI section */ int drm_pci_device_is_agp(struct drm_device *dev); int drm_pci_device_is_pcie(struct drm_device *dev); extern int drm_get_pci_dev(device_t kdev, struct drm_device *dev, struct drm_driver *driver); #define DRM_PCIE_SPEED_25 1 #define DRM_PCIE_SPEED_50 2 #define DRM_PCIE_SPEED_80 4 extern int drm_pcie_get_speed_cap_mask(struct drm_device *dev, u32 *speed_mask); #define drm_can_sleep() (DRM_HZ & 1) /* Platform section */ int drm_get_platform_dev(device_t kdev, struct drm_device *dev, struct drm_driver *driver); /* FreeBSD specific -- should be moved to drm_os_freebsd.h */ #define DRM_GEM_MAPPING_MASK (3ULL << 62) #define DRM_GEM_MAPPING_KEY (2ULL << 62) /* Non-canonical address form */ #define DRM_GEM_MAX_IDX 0x3fffff #define DRM_GEM_MAPPING_IDX(o) (((o) >> 40) & DRM_GEM_MAX_IDX) #define DRM_GEM_MAPPING_OFF(i) (((uint64_t)(i)) << 40) #define DRM_GEM_MAPPING_MAPOFF(o) \ ((o) & ~(DRM_GEM_MAPPING_OFF(DRM_GEM_MAX_IDX) | DRM_GEM_MAPPING_KEY)) SYSCTL_DECL(_hw_drm); #define DRM_DEV_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP) #define DRM_DEV_UID UID_ROOT #define DRM_DEV_GID GID_VIDEO #define DRM_WAKEUP(w) wakeup((void *)w) #define DRM_WAKEUP_INT(w) wakeup(w) #define DRM_INIT_WAITQUEUE(queue) do {(void)(queue);} while (0) #define DRM_CURPROC curthread #define DRM_STRUCTPROC struct thread #define DRM_SPINTYPE struct mtx #define DRM_SPININIT(l,name) mtx_init(l, name, NULL, MTX_DEF) #define DRM_SPINUNINIT(l) mtx_destroy(l) #define DRM_SPINLOCK(l) mtx_lock(l) #define DRM_SPINUNLOCK(u) mtx_unlock(u) #define DRM_SPINLOCK_IRQSAVE(l, irqflags) do { \ mtx_lock(l); \ (void)irqflags; \ } while (0) #define DRM_SPINUNLOCK_IRQRESTORE(u, irqflags) mtx_unlock(u) #define DRM_SPINLOCK_ASSERT(l) mtx_assert(l, MA_OWNED) #define DRM_LOCK_SLEEP(dev, chan, flags, msg, timeout) \ (sx_sleep((chan), &(dev)->dev_struct_lock, (flags), (msg), (timeout))) #if defined(INVARIANTS) #define DRM_LOCK_ASSERT(dev) sx_assert(&(dev)->dev_struct_lock, SA_XLOCKED) #define DRM_UNLOCK_ASSERT(dev) sx_assert(&(dev)->dev_struct_lock, SA_UNLOCKED) #else #define DRM_LOCK_ASSERT(d) #define DRM_UNLOCK_ASSERT(d) #endif #define DRM_SYSCTL_HANDLER_ARGS (SYSCTL_HANDLER_ARGS) enum { DRM_IS_NOT_AGP, DRM_IS_AGP, DRM_MIGHT_BE_AGP }; #define DRM_VERIFYAREA_READ( uaddr, size ) \ (!useracc(__DECONST(caddr_t, uaddr), size, VM_PROT_READ)) #define DRM_COPY_TO_USER(user, kern, size) \ copyout(kern, user, size) #define DRM_COPY_FROM_USER(kern, user, size) \ copyin(user, kern, size) #define DRM_COPY_FROM_USER_UNCHECKED(arg1, arg2, arg3) \ copyin(arg2, arg1, arg3) #define DRM_COPY_TO_USER_UNCHECKED(arg1, arg2, arg3) \ copyout(arg2, arg1, arg3) #define DRM_GET_USER_UNCHECKED(val, uaddr) \ ((val) = fuword32(uaddr), 0) #define DRM_GET_PRIV_SAREA(_dev, _ctx, _map) do { \ (_map) = (_dev)->context_sareas[_ctx]; \ } while(0) /* Returns -errno to shared code */ #define DRM_WAIT_ON( ret, queue, timeout, condition ) \ for ( ret = 0 ; !ret && !(condition) ; ) { \ DRM_UNLOCK(dev); \ mtx_lock(&dev->irq_lock); \ if (!(condition)) \ ret = -mtx_sleep(&(queue), &dev->irq_lock, \ PCATCH, "drmwtq", (timeout)); \ if (ret == -ERESTART) \ ret = -ERESTARTSYS; \ mtx_unlock(&dev->irq_lock); \ DRM_LOCK(dev); \ } #define dev_err(dev, fmt, ...) \ device_printf((dev), "error: " fmt, ## __VA_ARGS__) #define dev_warn(dev, fmt, ...) \ device_printf((dev), "warning: " fmt, ## __VA_ARGS__) #define dev_info(dev, fmt, ...) \ device_printf((dev), "info: " fmt, ## __VA_ARGS__) #define dev_dbg(dev, fmt, ...) do { \ if ((drm_debug& DRM_DEBUGBITS_KMS) != 0) { \ device_printf((dev), "debug: " fmt, ## __VA_ARGS__); \ } \ } while (0) struct drm_msi_blacklist_entry { int vendor; int device; }; struct drm_vblank_info { wait_queue_head_t queue; /* vblank wait queue */ atomic_t count; /* number of VBLANK interrupts */ /* (driver must alloc the right number of counters) */ atomic_t refcount; /* number of users of vblank interrupts */ u32 last; /* protected by dev->vbl_lock, used */ /* for wraparound handling */ int enabled; /* so we don't call enable more than */ /* once per disable */ int inmodeset; /* Display driver is setting mode */ }; #ifndef DMA_BIT_MASK #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : (1ULL<<(n)) - 1) #endif #define upper_32_bits(n) ((u32)(((n) >> 16) >> 16)) enum dmi_field { DMI_NONE, DMI_BIOS_VENDOR, DMI_BIOS_VERSION, DMI_BIOS_DATE, DMI_SYS_VENDOR, DMI_PRODUCT_NAME, DMI_PRODUCT_VERSION, DMI_PRODUCT_SERIAL, DMI_PRODUCT_UUID, DMI_BOARD_VENDOR, DMI_BOARD_NAME, DMI_BOARD_VERSION, DMI_BOARD_SERIAL, DMI_BOARD_ASSET_TAG, DMI_CHASSIS_VENDOR, DMI_CHASSIS_TYPE, DMI_CHASSIS_VERSION, DMI_CHASSIS_SERIAL, DMI_CHASSIS_ASSET_TAG, DMI_STRING_MAX, }; struct dmi_strmatch { unsigned char slot; char substr[79]; }; struct dmi_system_id { int (*callback)(const struct dmi_system_id *); const char *ident; struct dmi_strmatch matches[4]; }; #define DMI_MATCH(a, b) {(a), (b)} bool dmi_check_system(const struct dmi_system_id *); /* Device setup support (drm_drv.c) */ int drm_probe_helper(device_t kdev, const drm_pci_id_list_t *idlist); int drm_attach_helper(device_t kdev, const drm_pci_id_list_t *idlist, struct drm_driver *driver); int drm_generic_suspend(device_t kdev); int drm_generic_resume(device_t kdev); int drm_generic_detach(device_t kdev); void drm_event_wakeup(struct drm_pending_event *e); int drm_add_busid_modesetting(struct drm_device *dev, struct sysctl_ctx_list *ctx, struct sysctl_oid *top); /* Buffer management support (drm_bufs.c) */ unsigned long drm_get_resource_start(struct drm_device *dev, unsigned int resource); unsigned long drm_get_resource_len(struct drm_device *dev, unsigned int resource); /* IRQ support (drm_irq.c) */ irqreturn_t drm_irq_handler(DRM_IRQ_ARGS); void drm_driver_irq_preinstall(struct drm_device *dev); void drm_driver_irq_postinstall(struct drm_device *dev); void drm_driver_irq_uninstall(struct drm_device *dev); /* sysctl support (drm_sysctl.h) */ extern int drm_sysctl_init(struct drm_device *dev); extern int drm_sysctl_cleanup(struct drm_device *dev); int drm_version(struct drm_device *dev, void *data, struct drm_file *file_priv); /* consistent PCI memory functions (drm_pci.c) */ int drm_pci_set_busid(struct drm_device *dev, struct drm_master *master); int drm_pci_set_unique(struct drm_device *dev, struct drm_master *master, struct drm_unique *u); int drm_pci_agp_init(struct drm_device *dev); int drm_pci_enable_msi(struct drm_device *dev); void drm_pci_disable_msi(struct drm_device *dev); struct ttm_bo_device; int ttm_bo_mmap_single(struct ttm_bo_device *bdev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **obj_res, int nprot); struct ttm_buffer_object; void ttm_bo_release_mmap(struct ttm_buffer_object *bo); #if __OS_HAS_AGP /* Memory management support (drm_memory.h) */ extern void drm_free_agp(DRM_AGP_MEM * handle, int pages); extern int drm_bind_agp(DRM_AGP_MEM * handle, unsigned int start); #ifdef FREEBSD_NOTYET extern DRM_AGP_MEM *drm_agp_bind_pages(struct drm_device *dev, struct page **pages, unsigned long num_pages, uint32_t gtt_offset, uint32_t type); #endif /* FREEBSD_NOTYET */ extern int drm_unbind_agp(DRM_AGP_MEM * handle); /* AGP/GART support (drm_agpsupport.h) */ extern struct drm_agp_head *drm_agp_init(struct drm_device *dev); extern int drm_agp_acquire(struct drm_device *dev); extern int drm_agp_acquire_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_release(struct drm_device *dev); extern int drm_agp_release_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_enable(struct drm_device *dev, struct drm_agp_mode mode); extern int drm_agp_enable_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_info(struct drm_device *dev, struct drm_agp_info *info); extern int drm_agp_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_alloc(struct drm_device *dev, struct drm_agp_buffer *request); extern int drm_agp_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_free(struct drm_device *dev, struct drm_agp_buffer *request); extern int drm_agp_free_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_unbind(struct drm_device *dev, struct drm_agp_binding *request); extern int drm_agp_unbind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); extern int drm_agp_bind(struct drm_device *dev, struct drm_agp_binding *request); extern int drm_agp_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); #else static inline void drm_free_agp(DRM_AGP_MEM * handle, int pages) { } static inline int drm_bind_agp(DRM_AGP_MEM * handle, unsigned int start) { return -ENODEV; } static inline int drm_unbind_agp(DRM_AGP_MEM * handle) { return -ENODEV; } #ifdef FREEBSD_NOTYET static inline struct agp_memory *drm_agp_bind_pages(struct drm_device *dev, struct page **pages, unsigned long num_pages, uint32_t gtt_offset, uint32_t type) { return NULL; } #endif static inline struct drm_agp_head *drm_agp_init(struct drm_device *dev) { return NULL; } static inline void drm_agp_clear(struct drm_device *dev) { } static inline int drm_agp_acquire(struct drm_device *dev) { return -ENODEV; } static inline int drm_agp_acquire_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_release(struct drm_device *dev) { return -ENODEV; } static inline int drm_agp_release_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_enable(struct drm_device *dev, struct drm_agp_mode mode) { return -ENODEV; } static inline int drm_agp_enable_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_info(struct drm_device *dev, struct drm_agp_info *info) { return -ENODEV; } static inline int drm_agp_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_alloc(struct drm_device *dev, struct drm_agp_buffer *request) { return -ENODEV; } static inline int drm_agp_alloc_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_free(struct drm_device *dev, struct drm_agp_buffer *request) { return -ENODEV; } static inline int drm_agp_free_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_unbind(struct drm_device *dev, struct drm_agp_binding *request) { return -ENODEV; } static inline int drm_agp_unbind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } static inline int drm_agp_bind(struct drm_device *dev, struct drm_agp_binding *request) { return -ENODEV; } static inline int drm_agp_bind_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return -ENODEV; } #endif /* __OS_HAS_AGP */ #endif /* __KERNEL__ */ #endif Index: head/sys/fs/devfs/devfs_vnops.c =================================================================== --- head/sys/fs/devfs/devfs_vnops.c (revision 350420) +++ head/sys/fs/devfs/devfs_vnops.c (revision 350421) @@ -1,1995 +1,1996 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 * * $FreeBSD$ */ /* * TODO: * mkdir: want it ? */ #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct vop_vector devfs_vnodeops; static struct vop_vector devfs_specops; static struct fileops devfs_ops_f; #include #include #include #include #include #include static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data"); struct mtx devfs_de_interlock; MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF); struct sx clone_drain_lock; SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock"); struct mtx cdevpriv_mtx; MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF); SYSCTL_DECL(_vfs_devfs); static int devfs_dotimes; SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW, &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision"); /* * Update devfs node timestamp. Note that updates are unlocked and * stat(2) could see partially updated times. */ static void devfs_timestamp(struct timespec *tsp) { time_t ts; if (devfs_dotimes) { vfs_timestamp(tsp); } else { ts = time_second; if (tsp->tv_sec != ts) { tsp->tv_sec = ts; tsp->tv_nsec = 0; } } } static int devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp, int *ref) { *dswp = devvn_refthread(fp->f_vnode, devp, ref); if (*devp != fp->f_data) { if (*dswp != NULL) dev_relthread(*devp, *ref); return (ENXIO); } KASSERT((*devp)->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp))); if (*dswp == NULL) return (ENXIO); curthread->td_fpop = fp; return (0); } int devfs_get_cdevpriv(void **datap) { struct file *fp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (EBADF); p = fp->f_cdevpriv; if (p != NULL) { error = 0; *datap = p->cdpd_data; } else error = ENOENT; return (error); } int devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr) { struct file *fp; struct cdev_priv *cdp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (ENOENT); cdp = cdev2priv((struct cdev *)fp->f_data); p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK); p->cdpd_data = priv; p->cdpd_dtr = priv_dtr; p->cdpd_fp = fp; mtx_lock(&cdevpriv_mtx); if (fp->f_cdevpriv == NULL) { LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list); fp->f_cdevpriv = p; mtx_unlock(&cdevpriv_mtx); error = 0; } else { mtx_unlock(&cdevpriv_mtx); free(p, M_CDEVPDATA); error = EBUSY; } return (error); } void devfs_destroy_cdevpriv(struct cdev_privdata *p) { mtx_assert(&cdevpriv_mtx, MA_OWNED); KASSERT(p->cdpd_fp->f_cdevpriv == p, ("devfs_destoy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p)); p->cdpd_fp->f_cdevpriv = NULL; LIST_REMOVE(p, cdpd_list); mtx_unlock(&cdevpriv_mtx); (p->cdpd_dtr)(p->cdpd_data); free(p, M_CDEVPDATA); } static void devfs_fpdrop(struct file *fp) { struct cdev_privdata *p; mtx_lock(&cdevpriv_mtx); if ((p = fp->f_cdevpriv) == NULL) { mtx_unlock(&cdevpriv_mtx); return; } devfs_destroy_cdevpriv(p); } void devfs_clear_cdevpriv(void) { struct file *fp; fp = curthread->td_fpop; if (fp == NULL) return; devfs_fpdrop(fp); } /* * On success devfs_populate_vp() returns with dmp->dm_lock held. */ static int devfs_populate_vp(struct vnode *vp) { struct devfs_dirent *de; struct devfs_mount *dmp; int locked; ASSERT_VOP_LOCKED(vp, "devfs_populate_vp"); dmp = VFSTODEVFS(vp->v_mount); locked = VOP_ISLOCKED(vp); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); /* Can't call devfs_populate() with the vnode lock held. */ VOP_UNLOCK(vp, 0); devfs_populate(dmp); sx_xunlock(&dmp->dm_lock); vn_lock(vp, locked | LK_RETRY); sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ERESTART); } if ((vp->v_iflag & VI_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } de = vp->v_data; KASSERT(de != NULL, ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed")); if ((de->de_flags & DE_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } return (0); } static int devfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct devfs_mount *dmp; char *buf = ap->a_buf; int *buflen = ap->a_buflen; struct devfs_dirent *dd, *de; int i, error; dmp = VFSTODEVFS(vp->v_mount); error = devfs_populate_vp(vp); if (error != 0) return (error); i = *buflen; dd = vp->v_data; if (vp->v_type == VCHR) { i -= strlen(dd->de_cdp->cdp_c.si_name); if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_cdp->cdp_c.si_name, buf + i, strlen(dd->de_cdp->cdp_c.si_name)); de = dd->de_dir; } else if (vp->v_type == VDIR) { if (dd == dmp->dm_rootdir) { *dvp = vp; vref(*dvp); goto finished; } i -= dd->de_dirent->d_namlen; if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen); de = dd; } else { error = ENOENT; goto finished; } *buflen = i; de = devfs_parent_dirent(de); if (de == NULL) { error = ENOENT; goto finished; } mtx_lock(&devfs_de_interlock); *dvp = de->de_vnode; if (*dvp != NULL) { VI_LOCK(*dvp); mtx_unlock(&devfs_de_interlock); vholdl(*dvp); VI_UNLOCK(*dvp); vref(*dvp); vdrop(*dvp); } else { mtx_unlock(&devfs_de_interlock); error = ENOENT; } finished: sx_xunlock(&dmp->dm_lock); return (error); } /* * Construct the fully qualified path name relative to the mountpoint. * If a NULL cnp is provided, no '/' is appended to the resulting path. */ char * devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd, struct componentname *cnp) { int i; struct devfs_dirent *de; sx_assert(&dmp->dm_lock, SA_LOCKED); i = SPECNAMELEN; buf[i] = '\0'; if (cnp != NULL) i -= cnp->cn_namelen; if (i < 0) return (NULL); if (cnp != NULL) bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_rootdir) { if (cnp != NULL || i < SPECNAMELEN) { i--; if (i < 0) return (NULL); buf[i] = '/'; } i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = devfs_parent_dirent(de); if (de == NULL) return (NULL); } return (buf + i); } static int devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp, struct devfs_dirent *de) { int not_found; not_found = 0; if (de->de_flags & DE_DOOMED) not_found = 1; if (DEVFS_DE_DROP(de)) { KASSERT(not_found == 1, ("DEVFS de dropped but not doomed")); devfs_dirent_free(de); } if (DEVFS_DMP_DROP(dmp)) { KASSERT(not_found == 1, ("DEVFS mount struct freed before dirent")); not_found = 2; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } if (not_found == 1 || (drop_dm_lock && not_found != 2)) sx_unlock(&dmp->dm_lock); return (not_found); } static void devfs_insmntque_dtr(struct vnode *vp, void *arg) { struct devfs_dirent *de; de = (struct devfs_dirent *)arg; mtx_lock(&devfs_de_interlock); vp->v_data = NULL; de->de_vnode = NULL; mtx_unlock(&devfs_de_interlock); vgone(vp); vput(vp); } /* * devfs_allocv shall be entered with dmp->dm_lock held, and it drops * it on return. */ int devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode, struct vnode **vpp) { int error; struct vnode *vp; struct cdev *dev; struct devfs_mount *dmp; struct cdevsw *dsw; dmp = VFSTODEVFS(mp); if (de->de_flags & DE_DOOMED) { sx_xunlock(&dmp->dm_lock); return (ENOENT); } loop: DEVFS_DE_HOLD(de); DEVFS_DMP_HOLD(dmp); mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { VI_LOCK(vp); mtx_unlock(&devfs_de_interlock); sx_xunlock(&dmp->dm_lock); vget(vp, lockmode | LK_INTERLOCK | LK_RETRY, curthread); sx_xlock(&dmp->dm_lock); if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } else if ((vp->v_iflag & VI_DOOMED) != 0) { mtx_lock(&devfs_de_interlock); if (de->de_vnode == vp) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vput(vp); goto loop; } sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } mtx_unlock(&devfs_de_interlock); if (de->de_dirent->d_type == DT_CHR) { if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) { devfs_allocv_drop_refs(1, dmp, de); return (ENOENT); } dev = &de->de_cdp->cdp_c; } else { dev = NULL; } error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp); if (error != 0) { devfs_allocv_drop_refs(1, dmp, de); printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; VI_LOCK(vp); dev_lock(); dev_refl(dev); /* XXX: v_rdev should be protect by vnode lock */ vp->v_rdev = dev; KASSERT(vp->v_usecount == 1, ("%s %d (%d)\n", __func__, __LINE__, vp->v_usecount)); dev->si_usecount += vp->v_usecount; /* Special casing of ttys for deadfs. Probably redundant. */ dsw = dev->si_devsw; if (dsw != NULL && (dsw->d_flags & D_TTY) != 0) vp->v_vflag |= VV_ISTTY; dev_unlock(); VI_UNLOCK(vp); if ((dev->si_flags & SI_ETERNAL) != 0) vp->v_vflag |= VV_ETERNALDEV; vp->v_op = &devfs_specops; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS); VN_LOCK_ASHARE(vp); mtx_lock(&devfs_de_interlock); vp->v_data = de; de->de_vnode = vp; mtx_unlock(&devfs_de_interlock); error = insmntque1(vp, mp, devfs_insmntque_dtr, de); if (error != 0) { (void) devfs_allocv_drop_refs(1, dmp, de); return (error); } if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } #ifdef MAC mac_devfs_vnode_associate(mp, de, vp); #endif sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } static int devfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct proc *p; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_accmode, ap->a_cred, NULL); if (error == 0) return (0); if (error != EACCES) return (error); p = ap->a_td->td_proc; /* We do, however, allow access to the controlling terminal */ PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == de->de_cdp) error = 0; PROC_UNLOCK(p); return (error); } _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0, "devfs-only flag reuse failed"); static int devfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct proc *p; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; int dflags, error, ref, vp_locked; /* * XXX: Don't call d_close() if we were called because of * XXX: insmntque1() failure. */ if (vp->v_data == NULL) return (0); /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (td != NULL) { p = td->td_proc; PROC_LOCK(p); if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; sx_xlock(&proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); VI_LOCK(vp); if (count_dev(dev) == 2 && (vp->v_iflag & VI_DOOMED) == 0) { p->p_session->s_ttyvp = NULL; p->p_session->s_ttydp = NULL; oldvp = vp; } VI_UNLOCK(vp); SESS_UNLOCK(p->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) vrele(oldvp); } else PROC_UNLOCK(p); } /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. */ dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); dflags = 0; VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { /* Forced close. */ dflags |= FREVOKE | FNONBLOCK; } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. */ } else if (count_dev(dev) > 1) { VI_UNLOCK(vp); dev_relthread(dev, ref); return (0); } if (count_dev(dev) == 1) dflags |= FLASTCLOSE; vholdl(vp); VI_UNLOCK(vp); vp_locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td); dev_relthread(dev, ref); vn_lock(vp, vp_locked | LK_RETRY); vdrop(vp); return (error); } static int devfs_close_f(struct file *fp, struct thread *td) { int error; struct file *fpop; /* * NB: td may be NULL if this descriptor is closed due to * garbage collection from a closed UNIX domain socket. */ fpop = curthread->td_fpop; curthread->td_fpop = fp; error = vnops.fo_close(fp, td); curthread->td_fpop = fpop; /* * The f_cdevpriv cannot be assigned non-NULL value while we * are destroying the file. */ if (fp->f_cdevpriv != NULL) devfs_fpdrop(fp); return (error); } static int devfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct devfs_dirent *de; struct devfs_mount *dmp; struct cdev *dev; struct timeval boottime; int error; error = devfs_populate_vp(vp); if (error != 0) return (error); dmp = VFSTODEVFS(vp->v_mount); sx_xunlock(&dmp->dm_lock); de = vp->v_data; KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp)); if (vp->v_type == VDIR) { de = de->de_dir; KASSERT(de != NULL, ("Null dir dirent in devfs_getattr vp=%p", vp)); } vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; getboottime(&boottime); #define fix(aa) \ do { \ if ((aa).tv_sec <= 3600) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = cdev2priv(dev)->cdp_inode; } vap->va_gen = 0; vap->va_flags = 0; vap->va_filerev = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* ARGSUSED */ static int devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct file *fpop; int error; fpop = td->td_fpop; td->td_fpop = fp; error = vnops.fo_ioctl(fp, com, data, cred, td); td->td_fpop = fpop; return (error); } void * fiodgname_buf_get_ptr(void *fgnp, u_long com) { union { struct fiodgname_arg fgn; #ifdef COMPAT_FREEBSD32 struct fiodgname_arg32 fgn32; #endif } *fgnup; fgnup = fgnp; switch (com) { case FIODGNAME: return (fgnup->fgn.buf); #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: return ((void *)(uintptr_t)fgnup->fgn32.buf); #endif default: panic("Unhandled ioctl command %ld", com); } } static int devfs_ioctl(struct vop_ioctl_args *ap) { struct fiodgname_arg *fgn; struct vnode *vpold, *vp; struct cdevsw *dsw; struct thread *td; struct cdev *dev; int error, ref, i; const char *p; u_long com; vp = ap->a_vp; com = ap->a_command; td = ap->a_td; dsw = devvn_refthread(vp, &dev, &ref); if (dsw == NULL) return (ENXIO); KASSERT(dev->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(dev))); switch (com) { case FIODTYPE: *(int *)ap->a_data = dsw->d_flags & D_TYPEMASK; error = 0; break; case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif fgn = ap->a_data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i); break; default: error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); } dev_relthread(dev, ref); if (error == ENOIOCTL) error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { /* Do nothing if reassigning same control tty */ sx_slock(&proctree_lock); if (td->td_proc->p_session->s_ttyvp == vp) { sx_sunlock(&proctree_lock); return (0); } vpold = td->td_proc->p_session->s_ttyvp; VREF(vp); SESS_LOCK(td->td_proc->p_session); td->td_proc->p_session->s_ttyvp = vp; td->td_proc->p_session->s_ttydp = cdev2priv(dev); SESS_UNLOCK(td->td_proc->p_session); sx_sunlock(&proctree_lock); /* Get rid of reference to old control tty */ if (vpold) vrele(vpold); } return (error); } /* ARGSUSED */ static int devfs_kqfilter_f(struct file *fp, struct knote *kn) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; struct thread *td; td = curthread; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error) return (error); error = dsw->d_kqfilter(dev, kn); td->td_fpop = fpop; dev_relthread(dev, ref); return (error); } static inline int devfs_prison_check(struct devfs_dirent *de, struct thread *td) { struct cdev_priv *cdp; struct ucred *dcr; struct proc *p; int error; cdp = de->de_cdp; if (cdp == NULL) return (0); dcr = cdp->cdp_c.si_cred; if (dcr == NULL) return (0); error = prison_check(td->td_ucred, dcr); if (error == 0) return (0); /* We do, however, allow access to the controlling terminal */ p = td->td_proc; PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == cdp) error = 0; PROC_UNLOCK(p); return (error); } static int devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct mount *mp; struct cdev *cdev; int error, flags, nameiop, dvplocked; char specname[SPECNAMELEN + 1], *pname; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; mp = dvp->v_mount; dmp = VFSTODEVFS(mp); dd = dvp->v_data; *vpp = NULLVP; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); de = devfs_parent_dirent(dd); if (de == NULL) return (ENOENT); dvplocked = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; vn_lock(dvp, dvplocked | LK_RETRY); return (error); } dd = dvp->v_data; de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0); while (de == NULL) { /* While(...) so we can use break */ if (nameiop == DELETE) return (ENOENT); /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. */ pname = devfs_fqpn(specname, dmp, dd, cnp); if (pname == NULL) break; cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); sx_slock(&clone_drain_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); sx_sunlock(&clone_drain_lock); if (cdev == NULL) sx_xlock(&dmp->dm_lock); else if (devfs_populate_vp(dvp) != 0) { *dm_unlock = 0; sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } else sx_xunlock(&dmp->dm_lock); dev_rel(cdev); return (ENOENT); } if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); if (cdev != NULL) dev_rel(cdev); return (ENOENT); } if (cdev == NULL) break; dev_lock(); dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx]; if (dde != NULL && *dde != NULL) de = *dde; dev_unlock(); dev_rel(cdev); break; } if (de == NULL || de->de_flags & DE_WHITEOUT) { if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } if (devfs_prison_check(de, td)) return (ENOENT); if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; return (error); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; int dm_unlock; if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOTDIR); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dm_unlock = 1; j = devfs_lookupx(ap, &dm_unlock); if (dm_unlock == 1) sx_xunlock(&dmp->dm_lock); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; /* * The only type of node we should be creating here is a * character device, for anything else return EOPNOTSUPP. */ if (ap->a_vap->va_type != VCHR) return (EOPNOTSUPP); dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); cnp = ap->a_cnp; vpp = ap->a_vpp; dd = dvp->v_data; error = ENOENT; sx_xlock(&dmp->dm_lock); TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (de->de_dirent->d_type == DT_CHR && (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp); return (error); notfound: sx_xunlock(&dmp->dm_lock); return (error); } /* ARGSUSED */ static int devfs_open(struct vop_open_args *ap) { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; struct file *fp = ap->a_fp; int error, ref, vlocked; struct cdevsw *dsw; struct file *fpop; struct mtx *mtxp; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. */ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); if (fp == NULL && dsw->d_fdopen != NULL) { dev_relthread(dev, ref); return (ENXIO); } vlocked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); fpop = td->td_fpop; td->td_fpop = fp; if (fp != NULL) { fp->f_data = dev; fp->f_vnode = vp; } if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); /* Clean up any cdevpriv upon error. */ if (error != 0) devfs_clear_cdevpriv(); td->td_fpop = fpop; vn_lock(vp, vlocked | LK_RETRY); dev_relthread(dev, ref); if (error != 0) { if (error == ERESTART) error = EINTR; return (error); } #if 0 /* /dev/console */ KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp")); #else if (fp == NULL) return (error); #endif if (fp->f_ops == &badfileops) finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f); mtxp = mtx_pool_find(mtxpool_sleep, fp); /* * Hint to the dofilewrite() to not force the buffer draining * on the writer to the file. Most likely, the write would * not need normal buffers. */ mtx_lock(mtxp); fp->f_vnread_flags |= FDEVFS_VNODE; mtx_unlock(mtxp); return (error); } static int devfs_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = INT_MAX; return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_MAX_CANON: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_CANON; return (0); } return (EINVAL); case _PC_MAX_INPUT: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_INPUT; return (0); } return (EINVAL); case _PC_VDISABLE: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = _POSIX_VDISABLE; return (0); } return (EINVAL); case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistant label storage. */ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_poll(fp, events, cred, td); return (error); } error = dsw->d_poll(dev, events, td); td->td_fpop = fpop; dev_relthread(dev, ref); return(error); } /* * Print out the contents of a special device vnode. */ static int devfs_print(struct vop_print_args *ap) { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } static int devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int ioflag, error, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_read(fp, uio, cred, flags, td); return (error); } resid = uio->uio_resid; ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); error = dsw->d_read(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) devfs_timestamp(&dev->si_atime); td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF); return (error); } static int devfs_readdir(struct vop_readdir_args *ap) { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off; int *tmp_ncookies = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * XXX: This is a temporary hack to get around this filesystem not * supporting cookies. We store the location of the ncookies pointer * in a temporary variable before calling vfs_subr.c:vfs_read_dirent() * and set the number of cookies to 0. We then set the pointer to * NULL so that vfs_read_dirent doesn't try to call realloc() on * ap->a_cookies. Later in this function, we restore the ap->a_ncookies * pointer to its original location before returning to the caller. */ if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } dmp = VFSTODEVFS(ap->a_vp->v_mount); if (devfs_populate_vp(ap->a_vp) != 0) { if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (EIO); } error = 0; de = ap->a_vp->v_data; off = 0; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); if (dd->de_flags & (DE_COVERED | DE_WHITEOUT)) continue; if (devfs_prison_check(dd, uio->uio_td)) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp)); if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; /* NOTE: d_off is the offset for the *next* entry. */ dp->d_off = off + dp->d_reclen; if (off >= uio->uio_offset) { error = vfs_read_dirent(ap, dp, off); if (error) break; } off += dp->d_reclen; } sx_xunlock(&dmp->dm_lock); uio->uio_offset = off; /* * Restore ap->a_ncookies if it wasn't originally NULL in the first * place. */ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } static int devfs_readlink(struct vop_readlink_args *ap) { struct devfs_dirent *de; de = ap->a_vp->v_data; return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio)); } static int devfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; mtx_lock(&devfs_de_interlock); de = vp->v_data; if (de != NULL) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vnode_destroy_vobject(vp); return (0); } static int devfs_reclaim_vchr(struct vop_reclaim_args *ap) { struct vnode *vp; struct cdev *dev; vp = ap->a_vp; MPASS(vp->v_type == VCHR); devfs_reclaim(ap); VI_LOCK(vp); dev_lock(); dev = vp->v_rdev; vp->v_rdev = NULL; if (dev != NULL) dev->si_usecount -= vp->v_usecount; dev_unlock(); VI_UNLOCK(vp); if (dev != NULL) dev_rel(dev); return (0); } static int devfs_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); ASSERT_VOP_ELOCKED(dvp, "devfs_remove"); ASSERT_VOP_ELOCKED(vp, "devfs_remove"); sx_xlock(&dmp->dm_lock); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_cdp == NULL) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_dirent->d_type == DT_LNK) { de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) de_covered->de_flags &= ~DE_COVERED; } /* We need to unlock dvp because devfs_delete() may lock it. */ VOP_UNLOCK(vp, 0); if (dvp != vp) VOP_UNLOCK(dvp, 0); devfs_delete(dmp, de, 0); sx_xunlock(&dmp->dm_lock); if (dvp != vp) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else { de->de_flags |= DE_WHITEOUT; sx_xunlock(&dmp->dm_lock); } return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. * */ static int devfs_revoke(struct vop_revoke_args *ap) { struct vnode *vp = ap->a_vp, *vp2; struct cdev *dev; struct cdev_priv *cdp; struct devfs_dirent *de; u_int i; KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL")); dev = vp->v_rdev; cdp = cdev2priv(dev); dev_lock(); cdp->cdp_inuse++; dev_unlock(); vhold(vp); vgone(vp); vdrop(vp); VOP_UNLOCK(vp,0); loop: for (;;) { mtx_lock(&devfs_de_interlock); dev_lock(); vp2 = NULL; for (i = 0; i <= cdp->cdp_maxdirent; i++) { de = cdp->cdp_dirents[i]; if (de == NULL) continue; vp2 = de->de_vnode; if (vp2 != NULL) { dev_unlock(); VI_LOCK(vp2); mtx_unlock(&devfs_de_interlock); if (vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) goto loop; vhold(vp2); vgone(vp2); vdrop(vp2); vput(vp2); break; } } if (vp2 != NULL) { continue; } dev_unlock(); mtx_unlock(&devfs_de_interlock); break; } dev_lock(); cdp->cdp_inuse--; if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) { TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); } else dev_unlock(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (0); } static int devfs_rioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct devfs_mount *dmp; int error; vp = ap->a_vp; vn_lock(vp, LK_SHARED | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); return (EBADF); } dmp = VFSTODEVFS(vp->v_mount); sx_xlock(&dmp->dm_lock); VOP_UNLOCK(vp, 0); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td); sx_xunlock(&dmp->dm_lock); return (error); } static int devfs_rread(struct vop_read_args *ap) { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(struct vop_setattr_args *ap) { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; td = curthread; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } error = devfs_populate_vp(vp); if (error != 0) return (error); de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) { error = priv_check(td, PRIV_VFS_CHOWN); if (error != 0) goto ret; } de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != de->de_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto ret; } de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { error = vn_utimes_perm(vp, vap, ap->a_cred, td); if (error != 0) goto ret; if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } ret: sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock); return (error); } #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_vnode_relabel(ap->a_cred, vp, ap->a_label); mac_devfs_update(vp->v_mount, de, vp); return (0); } #endif static int devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td) { return (vnops.fo_stat(fp, sb, cred, td)); } static int devfs_symlink(struct vop_symlink_args *ap) { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered, *de_dotdot; struct devfs_mount *dmp; error = priv_check(curthread, PRIV_DEVFS_SYMLINK); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOENT); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_flags = DE_USER; de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = alloc_unr(devfs_inos); de->de_dir = dd; de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; de->de_symlink = malloc(i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); #ifdef MAC mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) { if ((de_covered->de_flags & DE_USER) != 0) { devfs_delete(dmp, de, DEVFS_DEL_NORECURSE); sx_xunlock(&dmp->dm_lock); return (EEXIST); } KASSERT((de_covered->de_flags & DE_COVERED) == 0, ("devfs_symlink: entry %p already covered", de_covered)); de_covered->de_flags |= DE_COVERED; } de_dotdot = TAILQ_FIRST(&dd->de_dlist); /* "." */ de_dotdot = TAILQ_NEXT(de_dotdot, de_list); /* ".." */ TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list); devfs_dir_ref_de(dmp, dd); devfs_rules_apply(dmp, de); return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp)); } static int devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { return (vnops.fo_truncate(fp, length, cred, td)); } static int devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int error, ioflag, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_write(fp, uio, cred, flags, td); return (error); } KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); resid = uio->uio_resid; error = dsw->d_write(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { devfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF); return (error); } static int devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; struct mount *mp; struct vnode *vp; struct file *fpop; vm_object_t object; vm_prot_t maxprot; int error, ref; vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * The one exception is that D_MMAP_ANON devices * (i.e. /dev/zero) permit private writable mappings. * * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests * as well as updating maxprot to permit writing for * D_MMAP_ANON devices rather than doing that here. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) return (error); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff, &object); td->td_fpop = fpop; dev_relthread(dev, ref); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); return (error); } dev_t dev2udev(struct cdev *x) { if (x == NULL) return (NODEV); return (cdev2priv(x)->cdp_inode); } static struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_truncate = devfs_truncate_f, .fo_ioctl = devfs_ioctl_f, .fo_poll = devfs_poll_f, .fo_kqfilter = devfs_kqfilter_f, .fo_stat = devfs_stat_f, .fo_close = devfs_close_f, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = devfs_mmap_f, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; /* Vops for non-CHR vnodes in /dev. */ static struct vop_vector devfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_rioctl, .vop_lookup = devfs_lookup, .vop_mknod = devfs_mknod, .vop_pathconf = devfs_pathconf, .vop_read = devfs_rread, .vop_readdir = devfs_readdir, .vop_readlink = devfs_readlink, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_revoke = devfs_revoke, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_symlink = devfs_symlink, .vop_vptocnp = devfs_vptocnp, }; /* Vops for VCHR vnodes in /dev. */ static struct vop_vector devfs_specops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_bmap = VOP_PANIC, .vop_close = devfs_close, .vop_create = VOP_PANIC, .vop_fsync = vop_stdfsync, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_ioctl, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = devfs_open, .vop_pathconf = devfs_pathconf, .vop_poll = dead_poll, .vop_print = devfs_print, .vop_read = dead_read, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = devfs_reclaim_vchr, .vop_remove = devfs_remove, .vop_rename = VOP_PANIC, .vop_revoke = devfs_revoke, .vop_rmdir = VOP_PANIC, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_strategy = VOP_PANIC, .vop_symlink = VOP_PANIC, .vop_vptocnp = devfs_vptocnp, .vop_write = dead_write, }; /* * Our calling convention to the device drivers used to be that we passed * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ * flags instead since that's what open(), close() and ioctl() takes and * we don't really want vnode.h in device drivers. * We solved the source compatibility by redefining some vnode flags to * be the same as the fcntl ones and by sending down the bitwise OR of * the respective fcntl/vnode flags. These CTASSERTS make sure nobody * pulls the rug out under this. */ CTASSERT(O_NONBLOCK == IO_NDELAY); CTASSERT(O_FSYNC == IO_SYNC); Index: head/sys/fs/ext2fs/ext2_vnops.c =================================================================== --- head/sys/fs/ext2fs/ext2_vnops.c (revision 350420) +++ head/sys/fs/ext2fs/ext2_vnops.c (revision 350421) @@ -1,2346 +1,2347 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.7 (Berkeley) 2/3/94 * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 * $FreeBSD$ */ #include "opt_suiddir.h" #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DECLARE(ext2fs); /* * ext2fs trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(ext2fs, , vnops, trace, "int", "char*"); static int ext2_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *); static void ext2_itimes_locked(struct vnode *); static vop_access_t ext2_access; static int ext2_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ext2_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ext2_close; static vop_create_t ext2_create; static vop_fsync_t ext2_fsync; static vop_getattr_t ext2_getattr; static vop_ioctl_t ext2_ioctl; static vop_link_t ext2_link; static vop_mkdir_t ext2_mkdir; static vop_mknod_t ext2_mknod; static vop_open_t ext2_open; static vop_pathconf_t ext2_pathconf; static vop_print_t ext2_print; static vop_read_t ext2_read; static vop_readlink_t ext2_readlink; static vop_remove_t ext2_remove; static vop_rename_t ext2_rename; static vop_rmdir_t ext2_rmdir; static vop_setattr_t ext2_setattr; static vop_strategy_t ext2_strategy; static vop_symlink_t ext2_symlink; static vop_write_t ext2_write; static vop_deleteextattr_t ext2_deleteextattr; static vop_getextattr_t ext2_getextattr; static vop_listextattr_t ext2_listextattr; static vop_setextattr_t ext2_setextattr; static vop_vptofh_t ext2_vptofh; static vop_close_t ext2fifo_close; static vop_kqfilter_t ext2fifo_kqfilter; /* Global vfs data structures for ext2. */ struct vop_vector ext2_vnodeops = { .vop_default = &default_vnodeops, .vop_access = ext2_access, .vop_bmap = ext2_bmap, .vop_cachedlookup = ext2_lookup, .vop_close = ext2_close, .vop_create = ext2_create, .vop_fsync = ext2_fsync, .vop_getpages = vnode_pager_local_getpages, .vop_getpages_async = vnode_pager_local_getpages_async, .vop_getattr = ext2_getattr, .vop_inactive = ext2_inactive, .vop_ioctl = ext2_ioctl, .vop_link = ext2_link, .vop_lookup = vfs_cache_lookup, .vop_mkdir = ext2_mkdir, .vop_mknod = ext2_mknod, .vop_open = ext2_open, .vop_pathconf = ext2_pathconf, .vop_poll = vop_stdpoll, .vop_print = ext2_print, .vop_read = ext2_read, .vop_readdir = ext2_readdir, .vop_readlink = ext2_readlink, .vop_reallocblks = ext2_reallocblks, .vop_reclaim = ext2_reclaim, .vop_remove = ext2_remove, .vop_rename = ext2_rename, .vop_rmdir = ext2_rmdir, .vop_setattr = ext2_setattr, .vop_strategy = ext2_strategy, .vop_symlink = ext2_symlink, .vop_write = ext2_write, .vop_deleteextattr = ext2_deleteextattr, .vop_getextattr = ext2_getextattr, .vop_listextattr = ext2_listextattr, .vop_setextattr = ext2_setextattr, #ifdef UFS_ACL .vop_getacl = ext2_getacl, .vop_setacl = ext2_setacl, .vop_aclcheck = ext2_aclcheck, #endif /* UFS_ACL */ .vop_vptofh = ext2_vptofh, }; struct vop_vector ext2_fifoops = { .vop_default = &fifo_specops, .vop_access = ext2_access, .vop_close = ext2fifo_close, .vop_fsync = ext2_fsync, .vop_getattr = ext2_getattr, .vop_inactive = ext2_inactive, .vop_kqfilter = ext2fifo_kqfilter, .vop_pathconf = ext2_pathconf, .vop_print = ext2_print, .vop_read = VOP_PANIC, .vop_reclaim = ext2_reclaim, .vop_setattr = ext2_setattr, .vop_write = VOP_PANIC, .vop_vptofh = ext2_vptofh, }; /* * A virgin directory (no blushing please). * Note that the type and namlen fields are reversed relative to ext2. * Also, we don't use `struct odirtemplate', since it would just cause * endianness problems. */ static struct dirtemplate mastertemplate = { 0, 12, 1, EXT2_FT_DIR, ".", 0, DIRBLKSIZ - 12, 2, EXT2_FT_DIR, ".." }; static struct dirtemplate omastertemplate = { 0, 12, 1, EXT2_FT_UNKNOWN, ".", 0, DIRBLKSIZ - 12, 2, EXT2_FT_UNKNOWN, ".." }; static void ext2_itimes_locked(struct vnode *vp) { struct inode *ip; struct timespec ts; ASSERT_VI_LOCKED(vp, __func__); ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_type == VBLK || vp->v_type == VCHR)) ip->i_flag |= IN_LAZYMOD; else ip->i_flag |= IN_MODIFIED; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { vfs_timestamp(&ts); if (ip->i_flag & IN_ACCESS) { ip->i_atime = ts.tv_sec; ip->i_atimensec = ts.tv_nsec; } if (ip->i_flag & IN_UPDATE) { ip->i_mtime = ts.tv_sec; ip->i_mtimensec = ts.tv_nsec; ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) { ip->i_ctime = ts.tv_sec; ip->i_ctimensec = ts.tv_nsec; } } ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } void ext2_itimes(struct vnode *vp) { VI_LOCK(vp); ext2_itimes_locked(vp); VI_UNLOCK(vp); } /* * Create a regular file */ static int ext2_create(struct vop_create_args *ap) { int error; error = ext2_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); if (error != 0) return (error); if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp); return (0); } static int ext2_open(struct vop_open_args *ap) { if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); vnode_create_vobject(ap->a_vp, VTOI(ap->a_vp)->i_size, ap->a_td); return (0); } /* * Close called. * * Update the times on the inode. */ static int ext2_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; VI_LOCK(vp); if (vp->v_usecount > 1) ext2_itimes_locked(vp); VI_UNLOCK(vp); return (0); } static int ext2_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; if (vp->v_type == VBLK || vp->v_type == VCHR) return (EOPNOTSUPP); /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } /* If immutable bit set, nobody gets to write it. */ if ((accmode & VWRITE) && (ip->i_flags & (SF_IMMUTABLE | SF_SNAPSHOT))) return (EPERM); error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, ap->a_accmode, ap->a_cred, NULL); return (error); } static int ext2_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct vattr *vap = ap->a_vap; ext2_itimes(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ip->i_devvp->v_rdev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = ip->i_rdev; vap->va_size = ip->i_size; vap->va_atime.tv_sec = ip->i_atime; vap->va_atime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_atimensec : 0; vap->va_mtime.tv_sec = ip->i_mtime; vap->va_mtime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_mtimensec : 0; vap->va_ctime.tv_sec = ip->i_ctime; vap->va_ctime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_ctimensec : 0; if E2DI_HAS_XTIME(ip) { vap->va_birthtime.tv_sec = ip->i_birthtime; vap->va_birthtime.tv_nsec = ip->i_birthnsec; } vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_bytes = dbtob((u_quad_t)ip->i_blocks); vap->va_type = IFTOVT(ip->i_mode); vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ static int ext2_setattr(struct vop_setattr_args *ap) { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct thread *td = curthread; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { /* Disallow flags not supported by ext2fs. */ if (vap->va_flags & ~(SF_APPEND | SF_IMMUTABLE | UF_NODUMP)) return (EOPNOTSUPP); if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Unprivileged processes and privileged processes in * jail() are not permitted to unset system flags, or * modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } } else { if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND) || ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE)) return (EPERM); } ip->i_flags = vap->va_flags; ip->i_flag |= IN_CHANGE; if (ip->i_flags & (IMMUTABLE | APPEND)) return (0); } if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ext2_chown(vp, vap->va_uid, vap->va_gid, cred, td)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } if ((error = ext2_truncate(vp, vap->va_size, 0, cred, td)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * From utimes(2): * If times is NULL, ... The caller must be the owner of * the file, have permission to write the file, or be the * super-user. * If times is non-NULL, ... The caller must be the owner of * the file or be the super-user. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred, td)))) return (error); ip->i_flag |= IN_CHANGE | IN_MODIFIED; if (vap->va_atime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_ACCESS; ip->i_atime = vap->va_atime.tv_sec; ip->i_atimensec = vap->va_atime.tv_nsec; } if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_UPDATE; ip->i_mtime = vap->va_mtime.tv_sec; ip->i_mtimensec = vap->va_mtime.tv_nsec; } ip->i_birthtime = vap->va_birthtime.tv_sec; ip->i_birthnsec = vap->va_birthtime.tv_nsec; error = ext2_update(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = ext2_chmod(vp, (int)vap->va_mode, cred, td); } return (error); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ext2_chmod(struct vnode *vp, int mode, struct ucred *cred, struct thread *td) { struct inode *ip = VTOI(vp); int error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { error = priv_check_cred(cred, PRIV_VFS_STICKYFILE); if (error) return (EFTYPE); } if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID); if (error) return (error); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ext2_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct thread *td) { struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * To change the owner of a file, or change the group of a file * to a group of which we are not a member, the caller must * have privilege. */ if (uid != ip->i_uid || (gid != ip->i_gid && !groupmember(gid, cred))) { error = priv_check_cred(cred, PRIV_VFS_CHOWN); if (error) return (error); } ogid = ip->i_gid; ouid = ip->i_uid; ip->i_gid = gid; ip->i_uid = uid; ip->i_flag |= IN_CHANGE; if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID) != 0) ip->i_mode &= ~(ISUID | ISGID); } return (0); } /* * Synch an open file. */ /* ARGSUSED */ static int ext2_fsync(struct vop_fsync_args *ap) { /* * Flush all dirty buffers associated with a vnode. */ vop_stdfsync(ap); return (ext2_update(ap->a_vp, ap->a_waitfor == MNT_WAIT)); } /* * Mknod vnode call */ /* ARGSUSED */ static int ext2_mknod(struct vop_mknod_args *ap) { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; error = ext2_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ if (!(ip->i_flag & IN_E4EXTENTS)) ip->i_rdev = vap->va_rdev; } /* * Remove inode, then reload it through VFS_VGET so it is * checked to see if it is an alias of an existing entry in * the inode cache. XXX I don't believe this is necessary now. */ (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); vput(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { *vpp = NULL; return (error); } return (0); } static int ext2_remove(struct vop_remove_args *ap) { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } error = ext2_dirremove(dvp, ap->a_cnp); if (error == 0) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: return (error); } /* * link vnode call */ static int ext2_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; int error; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_link: no name"); #endif ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= EXT4_LINK_MAX) { error = EMLINK; goto out; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } ip->i_nlink++; ip->i_flag |= IN_CHANGE; error = ext2_update(vp, !DOINGASYNC(vp)); if (!error) error = ext2_direnter(ip, tdvp, cnp); if (error) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: return (error); } static int ext2_inc_nlink(struct inode *ip) { ip->i_nlink++; if (S_ISDIR(ip->i_mode) && EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK) && ip->i_nlink > 1) { if (ip->i_nlink >= EXT4_LINK_MAX || ip->i_nlink == 2) ip->i_nlink = 1; } else if (ip->i_nlink > EXT4_LINK_MAX) { ip->i_nlink--; return (EMLINK); } return (0); } static void ext2_dec_nlink(struct inode *ip) { if (!S_ISDIR(ip->i_mode) || ip->i_nlink > 2) ip->i_nlink--; } /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ static int ext2_rename(struct vop_rename_args *ap) { struct vnode *tvp = ap->a_tvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct inode *ip, *xp, *dp; struct dirtemplate *dirbuf; int doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0; u_char namlen; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ext2_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (error); } if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto abortit; } /* * Renaming a file to itself has no effect. The upper layers should * not call us in that case. Temporarily just warn if they do. */ if (fvp == tvp) { SDT_PROBE2(ext2fs, , vnops, trace, 1, "rename: fvp == tvp (can't happen)"); error = 0; goto abortit; } if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) goto abortit; dp = VTOI(fdvp); ip = VTOI(fvp); if (ip->i_nlink >= EXT4_LINK_MAX && !EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) { VOP_UNLOCK(fvp, 0); error = EMLINK; goto abortit; } if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { VOP_UNLOCK(fvp, 0); error = EPERM; goto abortit; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || (ip->i_flag & IN_RENAME)) { VOP_UNLOCK(fvp, 0); error = EINVAL; goto abortit; } ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory++; } vrele(fdvp); /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ ext2_inc_nlink(ip); ip->i_flag |= IN_CHANGE; if ((error = ext2_update(fvp, !DOINGASYNC(fvp))) != 0) { VOP_UNLOCK(fvp, 0); goto bad; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to checkpath(). */ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); VOP_UNLOCK(fvp, 0); if (oldparent != dp->i_number) newparent = dp->i_number; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); error = ext2_checkpath(ip, dp, tcnp->cn_cred); if (error) goto out; VREF(tdvp); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; vrele(tdvp); dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); } /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (xp == NULL) { if (dp->i_devvp != ip->i_devvp) panic("ext2_rename: EXDEV"); /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. */ if (doingdirectory && newparent) { error = ext2_inc_nlink(dp); if (error) goto bad; dp->i_flag |= IN_CHANGE; error = ext2_update(tdvp, !DOINGASYNC(tdvp)); if (error) goto bad; } error = ext2_direnter(ip, tdvp, tcnp); if (error) { if (doingdirectory && newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; (void)ext2_update(tdvp, 1); } goto bad; } vput(tdvp); } else { if (xp->i_devvp != dp->i_devvp || xp->i_devvp != ip->i_devvp) panic("ext2_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (xp->i_number == ip->i_number) panic("ext2_rename: same file"); /* * If the parent directory is "sticky", then the user must * own the parent directory, or the destination of the rename, * otherwise the destination may not be changed (except by * root). This implements append-only directories. */ if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 && tcnp->cn_cred->cr_uid != dp->i_uid && xp->i_uid != tcnp->cn_cred->cr_uid) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode & IFMT) == IFDIR) { if (!ext2_dirempty(xp, dp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } error = ext2_dirrewrite(dp, ip, tcnp); if (error) goto bad; /* * If the target directory is in the same * directory as the source directory, * decrement the link count on the parent * of the target directory. */ if (doingdirectory && !newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; } vput(tdvp); /* * Adjust the link count of the target to * reflect the dirrewrite above. If this is * a directory it is empty and there are * no links to it, so we can squash the inode and * any space associated with it. We disallowed * renaming over top of a directory with links to * it above, as the remaining link would point to * a directory without "." or ".." entries. */ ext2_dec_nlink(xp); if (doingdirectory) { if (--xp->i_nlink != 0) panic("ext2_rename: linked directory"); error = ext2_truncate(tvp, (off_t)0, IO_SYNC, tcnp->cn_cred, tcnp->cn_thread); } xp->i_flag |= IN_CHANGE; vput(tvp); xp = NULL; } /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; VREF(fdvp); error = relookup(fdvp, &fvp, fcnp); if (error == 0) vrele(fdvp); if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); } else { /* * From name has disappeared. IN_RENAME is not sufficient * to protect against directory races due to timing windows, * so we can't panic here. */ vrele(ap->a_fvp); return (0); } /* * Ensure that the directory entry still exists and has not * changed while the new name has been entered. If the source is * a file then the entry may have been unlinked or renamed. In * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; its link * count of three would cause a rmdir to fail with ENOTEMPTY. * The IN_RENAME flag ensures that it cannot be moved by another * rename. */ if (xp != ip) { /* * From name resolves to a different inode. IN_RENAME is * not sufficient protection against timing window races * so we can't panic here. */ } else { /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; dirbuf = malloc(dp->i_e2fs->e2fs_bsize, M_TEMP, M_WAITOK | M_ZERO); if (!dirbuf) { error = ENOMEM; goto bad; } error = vn_rdwr(UIO_READ, fvp, (caddr_t)dirbuf, ip->i_e2fs->e2fs_bsize, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, tcnp->cn_cred, NOCRED, NULL, NULL); if (error == 0) { /* Like ufs little-endian: */ namlen = dirbuf->dotdot_type; if (namlen != 2 || dirbuf->dotdot_name[0] != '.' || dirbuf->dotdot_name[1] != '.') { ext2_dirbad(xp, (doff_t)12, "rename: mangled dir"); } else { dirbuf->dotdot_ino = newparent; /* * dirblock 0 could be htree root, * try both csum update functions. */ ext2_dirent_csum_set(ip, (struct ext2fs_direct_2 *)dirbuf); ext2_dx_csum_set(ip, (struct ext2fs_direct_2 *)dirbuf); (void)vn_rdwr(UIO_WRITE, fvp, (caddr_t)dirbuf, ip->i_e2fs->e2fs_bsize, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, tcnp->cn_cred, NOCRED, NULL, NULL); cache_purge(fdvp); } } free(dirbuf, M_TEMP); } error = ext2_dirremove(fdvp, fcnp); if (!error) { ext2_dec_nlink(xp); xp->i_flag |= IN_CHANGE; } xp->i_flag &= ~IN_RENAME; } if (dp) vput(fdvp); if (xp) vput(fvp); vrele(ap->a_fvp); return (error); bad: if (xp) vput(ITOV(xp)); vput(ITOV(dp)); out: if (doingdirectory) ip->i_flag &= ~IN_RENAME; if (vn_lock(fvp, LK_EXCLUSIVE) == 0) { ext2_dec_nlink(ip); ip->i_flag |= IN_CHANGE; ip->i_flag &= ~IN_RENAME; vput(fvp); } else vrele(fvp); return (error); } #ifdef UFS_ACL static int ext2_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, mode_t dmode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *dacl, *acl; acl = acl_alloc(M_WAITOK); dacl = acl_alloc(M_WAITOK); /* * Retrieve default ACL from parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. If the ACL is empty, fall through to * the "not defined or available" case. */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); ip->i_mode = dmode; *dacl = *acl; ext2_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = dmode; error = 0; goto out; default: goto out; } error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); if (error == 0) error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above * was supposed to free acl. */ #ifdef DEBUG printf("ext2_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); #endif /* DEBUG */ break; default: goto out; } out: acl_free(acl); acl_free(dacl); return (error); } static int ext2_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, mode_t mode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *acl; acl = acl_alloc(M_WAITOK); /* * Retrieve default ACL for parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. */ if (acl->acl_cnt != 0) { /* * Two possible ways for default ACL to not * be present. First, the EA can be * undefined, or second, the default ACL can * be blank. If it's blank, fall through to * the it's not defined case. */ mode = acl_posix1e_newfilemode(mode, acl); ip->i_mode = mode; ext2_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = mode; error = 0; goto out; default: goto out; } error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above was * supposed to free acl. */ printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()\n"); /* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); return (error); } #endif /* UFS_ACL */ /* * Mkdir system call */ static int ext2_mkdir(struct vop_mkdir_args *ap) { struct m_ext2fs *fs; struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; struct vnode *tvp; struct dirtemplate dirtemplate, *dtp; char *buf = NULL; int error, dmode; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_mkdir: no name"); #endif dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= EXT4_LINK_MAX && !EXT2_HAS_RO_COMPAT_FEATURE(dp->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ext2_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = ext2_valloc(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); fs = ip->i_e2fs; ip->i_gid = dp->i_gid; #ifdef SUIDDIR { /* * if we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; } else { ip->i_uid = cnp->cn_cred->cr_uid; } } #else ip->i_uid = cnp->cn_cred->cr_uid; #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_nlink = 2; if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; error = ext2_update(tvp, 1); /* * Bump link count in parent directory * to reflect work done below. Should * be done before reference is created * so reparation is possible if we crash. */ ext2_inc_nlink(dp); dp->i_flag |= IN_CHANGE; error = ext2_update(dvp, !DOINGASYNC(dvp)); if (error) goto bad; /* Initialize directory with "." and ".." from static template. */ if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs, EXT2F_INCOMPAT_FTYPE)) dtp = &mastertemplate; else dtp = &omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; /* * note that in ext2 DIRBLKSIZ == blocksize, not DEV_BSIZE so let's * just redefine it - for this function only */ #undef DIRBLKSIZ #define DIRBLKSIZ VTOI(dvp)->i_e2fs->e2fs_bsize dirtemplate.dotdot_reclen = DIRBLKSIZ - 12; buf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK | M_ZERO); if (!buf) { error = ENOMEM; ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; goto bad; } if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { dirtemplate.dotdot_reclen -= sizeof(struct ext2fs_direct_tail); ext2_init_dirent_tail(EXT2_DIRENT_TAIL(buf, DIRBLKSIZ)); } memcpy(buf, &dirtemplate, sizeof(dirtemplate)); ext2_dirent_csum_set(ip, (struct ext2fs_direct_2 *)buf); error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)buf, DIRBLKSIZ, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, cnp->cn_cred, NOCRED, NULL, NULL); if (error) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; goto bad; } if (DIRBLKSIZ > VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) /* XXX should grow with balloc() */ panic("ext2_mkdir: blksize"); else { ip->i_size = DIRBLKSIZ; ip->i_flag |= IN_CHANGE; } #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ext2_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* UFS_ACL */ /* Directory set up, now install its entry in the parent directory. */ error = ext2_direnter(ip, dvp, cnp); if (error) { ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; } bad: /* * No need to do an explicit VOP_TRUNCATE here, vrele will do this * for us because we set the link count to 0. */ if (error) { ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); } else *ap->a_vpp = tvp; out: free(buf, M_TEMP); return (error); #undef DIRBLKSIZ #define DIRBLKSIZ DEV_BSIZE } /* * Rmdir system call. */ static int ext2_rmdir(struct vop_rmdir_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error; ip = VTOI(vp); dp = VTOI(dvp); /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ if (!ext2_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ error = ext2_dirremove(dvp, cnp); if (error) goto out; ext2_dec_nlink(dp); dp->i_flag |= IN_CHANGE; cache_purge(dvp); VOP_UNLOCK(dvp, 0); /* * Truncate inode. The only stuff left * in the directory is "." and "..". */ ip->i_nlink = 0; error = ext2_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred, cnp->cn_thread); cache_purge(ITOV(ip)); if (vn_lock(dvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(vp, 0); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } out: return (error); } /* * symlink -- make a symbolic link */ static int ext2_symlink(struct vop_symlink_args *ap) { struct vnode *vp, **vpp = ap->a_vpp; struct inode *ip; int len, error; error = ext2_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); if (len < vp->v_mount->mnt_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, (char *)ip->i_shortlink, len); ip->i_size = len; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else error = vn_rdwr(UIO_WRITE, vp, __DECONST(void *, ap->a_target), len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, ap->a_cnp->cn_cred, NOCRED, NULL, NULL); if (error) vput(vp); return (error); } /* * Return target name of a symbolic link */ static int ext2_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); int isize; isize = ip->i_size; if (isize < vp->v_mount->mnt_maxsymlinklen) { uiomove((char *)ip->i_shortlink, isize, ap->a_uio); return (0); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the ext2_bmaparray() operation may not * deadlock on memory. See ext2_bmap() for details. */ static int ext2_strategy(struct vop_strategy_args *ap) { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; struct bufobj *bo; daddr_t blkno; int error; if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ext2_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { if (VTOI(ap->a_vp)->i_flag & IN_E4EXTENTS) error = ext4_bmapext(vp, bp->b_lblkno, &blkno, NULL, NULL); else error = ext2_bmaparray(vp, bp->b_lblkno, &blkno, NULL, NULL); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); bo = VFSTOEXT2(vp->v_mount)->um_bo; BO_STRATEGY(bo, bp); return (0); } /* * Print out the contents of an inode. */ static int ext2_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); vn_printf(ip->i_devvp, "\tino %ju", (uintmax_t)ip->i_number); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ static int ext2fifo_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; VI_LOCK(vp); if (vp->v_usecount > 1) ext2_itimes_locked(vp); VI_UNLOCK(vp); return (fifo_specops.vop_close(ap)); } /* * Kqfilter wrapper for fifos. * * Fall through to ext2 kqfilter routines if needed */ static int ext2fifo_kqfilter(struct vop_kqfilter_args *ap) { int error; error = fifo_specops.vop_kqfilter(ap); if (error) error = vfs_kqfilter(ap); return (error); } /* * Return POSIX pathconf information applicable to ext2 filesystems. */ static int ext2_pathconf(struct vop_pathconf_args *ap) { int error = 0; switch (ap->a_name) { case _PC_LINK_MAX: if (EXT2_HAS_RO_COMPAT_FEATURE(VTOI(ap->a_vp)->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) *ap->a_retval = INT_MAX; else *ap->a_retval = EXT4_LINK_MAX; break; case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; break; case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) *ap->a_retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; break; case _PC_NO_TRUNC: *ap->a_retval = 1; break; #ifdef UFS_ACL case _PC_ACL_EXTENDED: if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; break; case _PC_ACL_PATH_MAX: if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; break; #endif /* UFS_ACL */ case _PC_MIN_HOLE_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_PRIO_IO: *ap->a_retval = 0; break; case _PC_SYNC_IO: *ap->a_retval = 0; break; case _PC_ALLOC_SIZE_MIN: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize; break; case _PC_FILESIZEBITS: *ap->a_retval = 64; break; case _PC_REC_INCR_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_MAX_XFER_SIZE: *ap->a_retval = -1; /* means ``unlimited'' */ break; case _PC_REC_MIN_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_XFER_ALIGN: *ap->a_retval = PAGE_SIZE; break; case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; break; default: error = vop_stdpathconf(ap); break; } return (error); } /* * Vnode operation to remove a named attribute. */ static int ext2_deleteextattr(struct vop_deleteextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) return (error); error = ENOATTR; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_delete(ip, ap->a_attrnamespace, ap->a_name); if (error != ENOATTR) return (error); } if (ip->i_facl) error = ext2_extattr_block_delete(ip, ap->a_attrnamespace, ap->a_name); return (error); } /* * Vnode operation to retrieve a named extended attribute. */ static int ext2_getextattr(struct vop_getextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); if (ap->a_size != NULL) *ap->a_size = 0; error = ENOATTR; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_get(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size); if (error != ENOATTR) return (error); } if (ip->i_facl) error = ext2_extattr_block_get(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio, ap->a_size); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int ext2_listextattr(struct vop_listextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error) return (error); if (ap->a_size != NULL) *ap->a_size = 0; if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_list(ip, ap->a_attrnamespace, ap->a_uio, ap->a_size); if (error) return (error); } if (ip->i_facl) error = ext2_extattr_block_list(ip, ap->a_attrnamespace, ap->a_uio, ap->a_size); return (error); } /* * Vnode operation to set a named attribute. */ static int ext2_setextattr(struct vop_setextattr_args *ap) { struct inode *ip; struct m_ext2fs *fs; int error; ip = VTOI(ap->a_vp); fs = ip->i_e2fs; if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR)) return (EOPNOTSUPP); if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error) return (error); error = ext2_extattr_valid_attrname(ap->a_attrnamespace, ap->a_name); if (error) return (error); if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) { error = ext2_extattr_inode_set(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio); if (error != ENOSPC) return (error); } error = ext2_extattr_block_set(ip, ap->a_attrnamespace, ap->a_name, ap->a_uio); return (error); } /* * Vnode pointer to File handle */ /* ARGSUSED */ static int ext2_vptofh(struct vop_vptofh_args *ap) { struct inode *ip; struct ufid *ufhp; ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ext2_vinit(struct mount *mntp, struct vop_vector *fifoops, struct vnode **vpp) { struct inode *ip; struct vnode *vp; vp = *vpp; ip = VTOI(vp); vp->v_type = IFTOVT(ip->i_mode); /* * Only unallocated inodes should be of type VNON. */ if (ip->i_mode != 0 && vp->v_type == VNON) return (EINVAL); if (vp->v_type == VFIFO) vp->v_op = fifoops; if (ip->i_number == EXT2_ROOTINO) vp->v_vflag |= VV_ROOT; ip->i_modrev = init_va_filerev(); *vpp = vp; return (0); } /* * Allocate a new inode. */ static int ext2_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct inode *ip, *pdir; struct vnode *tvp; int error; pdir = VTOI(dvp); #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ext2_makeinode: no name"); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; error = ext2_valloc(dvp, mode, cnp->cn_cred, &tvp); if (error) { return (error); } ip = VTOI(tvp); ip->i_gid = pdir->i_gid; #ifdef SUIDDIR { /* * if we are * not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TOO root, (would subvert quotas) * then go ahead and give it to the other user. * Note that this drops off the execute bits for security. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; mode &= ~07111; } else { ip->i_uid = cnp->cn_cred->cr_uid; } } #else ip->i_uid = cnp->cn_cred->cr_uid; #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_nlink = 1; if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred)) { if (priv_check_cred(cnp->cn_cred, PRIV_VFS_RETAINSUGID)) ip->i_mode &= ~ISGID; } if (cnp->cn_flags & ISWHITEOUT) ip->i_flags |= UF_OPAQUE; /* * Make sure inode goes to disk before directory entry. */ error = ext2_update(tvp, !DOINGASYNC(tvp)); if (error) goto bad; #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ext2_do_posix1e_acl_inheritance_file(dvp, tvp, mode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* UFS_ACL */ error = ext2_direnter(ip, dvp, cnp); if (error) goto bad; *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); return (error); } /* * Vnode op for reading. */ static int ext2_read(struct vop_read_args *ap) { struct vnode *vp; struct inode *ip; struct uio *uio; struct m_ext2fs *fs; struct buf *bp; daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error, orig_resid, seqcount; int ioflag; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_READ) panic("%s: mode", "ext2_read"); if (vp->v_type == VLNK) { if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) panic("%s: short symlink", "ext2_read"); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("%s: type %d", "ext2_read", vp->v_type); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ext2_read: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ext2_read: uio->uio_offset < 0")); fs = ip->i_e2fs; if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->e2fs_maxfilesize) return (EOVERFLOW); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = blksize(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->e2fs_fsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= ip->i_size) error = bread(vp, lbn, size, NOCRED, &bp); else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, blkoffset + uio->uio_resid, seqcount, 0, &bp); } else if (seqcount > 1) { u_int nextsize = blksize(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); if (error) { brelse(bp); bp = NULL; break; } /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; vfs_bio_brelse(bp, ioflag); } /* * This can only happen in the case of an error because the loop * above resets bp to NULL on each iteration and on normal * completion has not set a new value into it. so it must have come * from a 'break' statement */ if (bp != NULL) vfs_bio_brelse(bp, ioflag); if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) ip->i_flag |= IN_ACCESS; return (error); } static int ext2_ioctl(struct vop_ioctl_args *ap) { switch (ap->a_command) { case FIOSEEKDATA: case FIOSEEKHOLE: return (vn_bmap_seekhole(ap->a_vp, ap->a_command, (off_t *)ap->a_data, ap->a_cred)); default: return (ENOTTY); } } /* * Vnode op for writing. */ static int ext2_write(struct vop_write_args *ap) { struct vnode *vp; struct uio *uio; struct inode *ip; struct m_ext2fs *fs; struct buf *bp; daddr_t lbn; off_t osize; int blkoffset, error, flags, ioflag, resid, size, seqcount, xfersize; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; seqcount = ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef INVARIANTS if (uio->uio_rw != UIO_WRITE) panic("%s: mode", "ext2_write"); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: /* XXX differs from ffs -- this is called from ext2_mkdir(). */ if ((ioflag & IO_SYNC) == 0) panic("ext2_write: nonsync dir write"); break; default: panic("ext2_write: type %p %d (%jd,%jd)", (void *)vp, vp->v_type, (intmax_t)uio->uio_offset, (intmax_t)uio->uio_resid); } KASSERT(uio->uio_resid >= 0, ("ext2_write: uio->uio_resid < 0")); KASSERT(uio->uio_offset >= 0, ("ext2_write: uio->uio_offset < 0")); fs = ip->i_e2fs; if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->e2fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); resid = uio->uio_resid; osize = ip->i_size; if (seqcount > BA_SEQMAX) flags = BA_SEQMAX << BA_SEQSHIFT; else flags = seqcount << BA_SEQSHIFT; if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->e2fs_fsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->e2fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; error = ext2_balloc(ip, lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); if (error != 0) break; if ((ioflag & (IO_SYNC | IO_INVAL)) == (IO_SYNC | IO_INVAL)) bp->b_flags |= B_NOCACHE; if (uio->uio_offset + xfersize > ip->i_size) ip->i_size = uio->uio_offset + xfersize; size = blksize(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); /* * If the buffer is not already filled and we encounter an * error while trying to fill it, we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland mmap. * * Note that we need only clear buffers with a transfer size * equal to the block size because buffers with a shorter * transfer size were cleared above by the call to ext2_balloc() * with the BA_CLRBUF flag set. * * If the source region for uiomove identically mmaps the * buffer, uiomove() performed the NOP copy, and the buffer * content remains valid because the page fault handler * validated the pages. */ if (error != 0 && (bp->b_flags & B_CACHE) == 0 && fs->e2fs_bsize == xfersize) vfs_bio_clrbuf(bp); vfs_bio_set_flags(bp, ioflag); /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else if (xfersize + blkoffset == fs->e2fs_fsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; cluster_write(vp, bp, ip->i_size, seqcount, 0); } else { bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } if (error || xfersize == 0) break; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ap->a_cred) { if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) ip->i_mode &= ~(ISUID | ISGID); } if (error) { if (ioflag & IO_UNIT) { (void)ext2_truncate(vp, osize, ioflag & IO_SYNC, ap->a_cred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } if (uio->uio_resid != resid) { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (ioflag & IO_SYNC) error = ext2_update(vp, 1); } return (error); } Index: head/sys/fs/fuse/fuse_vnops.c =================================================================== --- head/sys/fs/fuse/fuse_vnops.c (revision 350420) +++ head/sys/fs/fuse/fuse_vnops.c (revision 350421) @@ -1,2375 +1,2376 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_node.h" #include "fuse_param.h" #include "fuse_io.h" #include SDT_PROVIDER_DECLARE(fuse); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fuse, , vnops, trace, "int", "char*"); /* vnode ops */ static vop_access_t fuse_vnop_access; static vop_close_t fuse_vnop_close; static vop_create_t fuse_vnop_create; static vop_deleteextattr_t fuse_vnop_deleteextattr; static vop_fsync_t fuse_vnop_fsync; static vop_getattr_t fuse_vnop_getattr; static vop_getextattr_t fuse_vnop_getextattr; static vop_inactive_t fuse_vnop_inactive; static vop_link_t fuse_vnop_link; static vop_listextattr_t fuse_vnop_listextattr; static vop_lookup_t fuse_vnop_lookup; static vop_mkdir_t fuse_vnop_mkdir; static vop_mknod_t fuse_vnop_mknod; static vop_open_t fuse_vnop_open; static vop_pathconf_t fuse_vnop_pathconf; static vop_read_t fuse_vnop_read; static vop_readdir_t fuse_vnop_readdir; static vop_readlink_t fuse_vnop_readlink; static vop_reclaim_t fuse_vnop_reclaim; static vop_remove_t fuse_vnop_remove; static vop_rename_t fuse_vnop_rename; static vop_rmdir_t fuse_vnop_rmdir; static vop_setattr_t fuse_vnop_setattr; static vop_setextattr_t fuse_vnop_setextattr; static vop_strategy_t fuse_vnop_strategy; static vop_symlink_t fuse_vnop_symlink; static vop_write_t fuse_vnop_write; static vop_getpages_t fuse_vnop_getpages; static vop_putpages_t fuse_vnop_putpages; static vop_print_t fuse_vnop_print; struct vop_vector fuse_vnops = { .vop_default = &default_vnodeops, .vop_access = fuse_vnop_access, .vop_close = fuse_vnop_close, .vop_create = fuse_vnop_create, .vop_deleteextattr = fuse_vnop_deleteextattr, .vop_fsync = fuse_vnop_fsync, .vop_getattr = fuse_vnop_getattr, .vop_getextattr = fuse_vnop_getextattr, .vop_inactive = fuse_vnop_inactive, .vop_link = fuse_vnop_link, .vop_listextattr = fuse_vnop_listextattr, .vop_lookup = fuse_vnop_lookup, .vop_mkdir = fuse_vnop_mkdir, .vop_mknod = fuse_vnop_mknod, .vop_open = fuse_vnop_open, .vop_pathconf = fuse_vnop_pathconf, .vop_read = fuse_vnop_read, .vop_readdir = fuse_vnop_readdir, .vop_readlink = fuse_vnop_readlink, .vop_reclaim = fuse_vnop_reclaim, .vop_remove = fuse_vnop_remove, .vop_rename = fuse_vnop_rename, .vop_rmdir = fuse_vnop_rmdir, .vop_setattr = fuse_vnop_setattr, .vop_setextattr = fuse_vnop_setextattr, .vop_strategy = fuse_vnop_strategy, .vop_symlink = fuse_vnop_symlink, .vop_write = fuse_vnop_write, .vop_getpages = fuse_vnop_getpages, .vop_putpages = fuse_vnop_putpages, .vop_print = fuse_vnop_print, }; static u_long fuse_lookup_cache_hits = 0; SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_hits, CTLFLAG_RD, &fuse_lookup_cache_hits, 0, "number of positive cache hits in lookup"); static u_long fuse_lookup_cache_misses = 0; SYSCTL_ULONG(_vfs_fusefs, OID_AUTO, lookup_cache_misses, CTLFLAG_RD, &fuse_lookup_cache_misses, 0, "number of cache misses in lookup"); int fuse_lookup_cache_enable = 1; SYSCTL_INT(_vfs_fusefs, OID_AUTO, lookup_cache_enable, CTLFLAG_RW, &fuse_lookup_cache_enable, 0, "if non-zero, enable lookup cache"); /* * XXX: This feature is highly experimental and can bring to instabilities, * needs revisiting before to be enabled by default. */ static int fuse_reclaim_revoked = 0; SYSCTL_INT(_vfs_fusefs, OID_AUTO, reclaim_revoked, CTLFLAG_RW, &fuse_reclaim_revoked, 0, ""); uma_zone_t fuse_pbuf_zone; #define fuse_vm_page_lock(m) vm_page_lock((m)); #define fuse_vm_page_unlock(m) vm_page_unlock((m)); #define fuse_vm_page_lock_queues() ((void)0) #define fuse_vm_page_unlock_queues() ((void)0) /* struct vnop_access_args { struct vnode *a_vp; #if VOP_ACCESS_TAKES_ACCMODE_T accmode_t a_accmode; #else int a_mode; #endif struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; int accmode = ap->a_accmode; struct ucred *cred = ap->a_cred; struct fuse_access_param facp; struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp)); int err; if (fuse_isdeadfs(vp)) { if (vnode_isvroot(vp)) { return 0; } return ENXIO; } if (!(data->dataflags & FSESS_INITED)) { if (vnode_isvroot(vp)) { if (priv_check_cred(cred, PRIV_VFS_ADMIN) || (fuse_match_cred(data->daemoncred, cred) == 0)) { return 0; } } return EBADF; } if (vnode_islnk(vp)) { return 0; } bzero(&facp, sizeof(facp)); err = fuse_internal_access(vp, accmode, &facp, ap->a_td, ap->a_cred); return err; } /* struct vnop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp; struct ucred *cred = ap->a_cred; int fflag = ap->a_fflag; fufh_type_t fufh_type; if (fuse_isdeadfs(vp)) { return 0; } if (vnode_isdir(vp)) { if (fuse_filehandle_valid(vp, FUFH_RDONLY)) { fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); } return 0; } if (fflag & IO_NDELAY) { return 0; } fufh_type = fuse_filehandle_xlate_from_fflags(fflag); if (!fuse_filehandle_valid(vp, fufh_type)) { int i; for (i = 0; i < FUFH_MAXTYPE; i++) if (fuse_filehandle_valid(vp, i)) break; if (i == FUFH_MAXTYPE) panic("FUSE: fufh type %d found to be invalid in close" " (fflag=0x%x)\n", fufh_type, fflag); } if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, cred); } return 0; } /* struct vnop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_create(struct vop_create_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; struct fuse_open_in *foi; struct fuse_entry_out *feo; struct fuse_dispatcher fdi; struct fuse_dispatcher *fdip = &fdi; int err; struct mount *mp = vnode_mount(dvp); uint64_t parentnid = VTOFUD(dvp)->nid; mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode); uint64_t x_fh_id; uint32_t x_open_flags; if (fuse_isdeadfs(dvp)) { return ENXIO; } bzero(&fdi, sizeof(fdi)); /* XXX: Will we ever want devices ? */ if ((vap->va_type != VREG)) { printf("fuse_vnop_create: unsupported va_type %d\n", vap->va_type); return (EINVAL); } fdisp_init(fdip, sizeof(*foi) + cnp->cn_namelen + 1); if (!fsess_isimpl(mp, FUSE_CREATE)) { SDT_PROBE2(fuse, , vnops, trace, 1, "eh, daemon doesn't implement create?"); return (EINVAL); } fdisp_make(fdip, FUSE_CREATE, vnode_mount(dvp), parentnid, td, cred); foi = fdip->indata; foi->mode = mode; foi->flags = O_CREAT | O_RDWR; memcpy((char *)fdip->indata + sizeof(*foi), cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdip->indata)[sizeof(*foi) + cnp->cn_namelen] = '\0'; err = fdisp_wait_answ(fdip); if (err) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_CREATE); goto out; } feo = fdip->answ; if ((err = fuse_internal_checkentry(feo, VREG))) { goto out; } err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, VREG); if (err) { struct fuse_release_in *fri; uint64_t nodeid = feo->nodeid; uint64_t fh_id = ((struct fuse_open_out *)(feo + 1))->fh; fdisp_init(fdip, sizeof(*fri)); fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred); fri = fdip->indata; fri->fh = fh_id; fri->flags = OFLAGS(mode); fuse_insert_callback(fdip->tick, fuse_internal_forget_callback); fuse_insert_message(fdip->tick); return err; } ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create"); fdip->answ = feo + 1; x_fh_id = ((struct fuse_open_out *)(feo + 1))->fh; x_open_flags = ((struct fuse_open_out *)(feo + 1))->open_flags; fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, x_fh_id); fuse_vnode_open(*vpp, x_open_flags, td); cache_purge_negative(dvp); out: fdisp_destroy(fdip); return err; } /* * Our vnop_fsync roughly corresponds to the FUSE_FSYNC method. The Linux * version of FUSE also has a FUSE_FLUSH method. * * On Linux, fsync() synchronizes a file's complete in-core state with that * on disk. The call is not supposed to return until the system has completed * that action or until an error is detected. * * Linux also has an fdatasync() call that is similar to fsync() but is not * required to update the metadata such as access time and modification time. */ /* struct vnop_fsync_args { struct vnodeop_desc *a_desc; struct vnode * a_vp; struct ucred * a_cred; int a_waitfor; struct thread * a_td; }; */ static int fuse_vnop_fsync(struct vop_fsync_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_filehandle *fufh; struct fuse_vnode_data *fvdat = VTOFUD(vp); int type, err = 0; if (fuse_isdeadfs(vp)) { return 0; } if ((err = vop_stdfsync(ap))) return err; if (!fsess_isimpl(vnode_mount(vp), (vnode_vtype(vp) == VDIR ? FUSE_FSYNCDIR : FUSE_FSYNC))) { goto out; } for (type = 0; type < FUFH_MAXTYPE; type++) { fufh = &(fvdat->fufh[type]); if (FUFH_IS_VALID(fufh)) { fuse_internal_fsync(vp, td, NULL, fufh); } } out: return 0; } /* struct vnop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_attr_out *fao; int err = 0; int dataflags; struct fuse_dispatcher fdi; dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags; /* Note that we are not bailing out on a dead file system just yet. */ if (!(dataflags & FSESS_INITED)) { if (!vnode_isvroot(vp)) { fdata_set_dead(fuse_get_mpdata(vnode_mount(vp))); err = ENOTCONN; return err; } else { goto fake; } } fdisp_init(&fdi, 0); if ((err = fdisp_simple_putget_vp(&fdi, FUSE_GETATTR, vp, td, cred))) { if ((err == ENOTCONN) && vnode_isvroot(vp)) { /* see comment in fuse_vfsop_statfs() */ fdisp_destroy(&fdi); goto fake; } if (err == ENOENT) { fuse_internal_vnode_disappear(vp); } goto out; } fao = (struct fuse_attr_out *)fdi.answ; fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, fao->attr_valid_nsec, vap); if (vap->va_type != vnode_vtype(vp)) { fuse_internal_vnode_disappear(vp); err = ENOENT; goto out; } if ((fvdat->flag & FN_SIZECHANGE) != 0) vap->va_size = fvdat->filesize; if (vnode_isreg(vp) && (fvdat->flag & FN_SIZECHANGE) == 0) { /* * This is for those cases when the file size changed without us * knowing, and we want to catch up. */ off_t new_filesize = ((struct fuse_attr_out *) fdi.answ)->attr.size; if (fvdat->filesize != new_filesize) { fuse_vnode_setsize(vp, new_filesize); fvdat->flag &= ~FN_SIZECHANGE; } } out: fdisp_destroy(&fdi); return err; fake: bzero(vap, sizeof(*vap)); vap->va_type = vnode_vtype(vp); return 0; } /* struct vnop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; */ static int fuse_vnop_inactive(struct vop_inactive_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh = NULL; int type, need_flush = 1; for (type = 0; type < FUFH_MAXTYPE; type++) { fufh = &(fvdat->fufh[type]); if (FUFH_IS_VALID(fufh)) { if (need_flush && vp->v_type == VREG) { if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) { fuse_vnode_savesize(vp, NULL); } if (fuse_data_cache_invalidate || (fvdat->flag & FN_REVOKED) != 0) fuse_io_invalbuf(vp, td); else fuse_io_flushbuf(vp, MNT_WAIT, td); need_flush = 0; } fuse_filehandle_close(vp, type, td, NULL); } } if ((fvdat->flag & FN_REVOKED) != 0 && fuse_reclaim_revoked) { vrecycle(vp); } return 0; } /* struct vnop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_link(struct vop_link_args *ap) { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = VTOVA(vp); struct fuse_dispatcher fdi; struct fuse_entry_out *feo; struct fuse_link_in fli; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_mount(tdvp) != vnode_mount(vp)) { return EXDEV; } /* * This is a seatbelt check to protect naive userspace filesystems from * themselves and the limitations of the FUSE IPC protocol. If a * filesystem does not allow attribute caching, assume it is capable of * validating that nlink does not overflow. */ if (vap != NULL && vap->va_nlink >= FUSE_LINK_MAX) return EMLINK; fli.oldnodeid = VTOI(vp); fdisp_init(&fdi, 0); fuse_internal_newentry_makerequest(vnode_mount(tdvp), VTOI(tdvp), cnp, FUSE_LINK, &fli, sizeof(fli), &fdi); if ((err = fdisp_wait_answ(&fdi))) { goto out; } feo = fdi.answ; err = fuse_internal_checkentry(feo, vnode_vtype(vp)); out: fdisp_destroy(&fdi); return err; } /* struct vnop_lookup_args { struct vnodeop_desc *a_desc; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; */ int fuse_vnop_lookup(struct vop_lookup_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct thread *td = cnp->cn_thread; struct ucred *cred = cnp->cn_cred; int nameiop = cnp->cn_nameiop; int flags = cnp->cn_flags; int wantparent = flags & (LOCKPARENT | WANTPARENT); int islastcn = flags & ISLASTCN; struct mount *mp = vnode_mount(dvp); int err = 0; int lookup_err = 0; struct vnode *vp = NULL; struct fuse_dispatcher fdi; enum fuse_opcode op; uint64_t nid; struct fuse_access_param facp; if (fuse_isdeadfs(dvp)) { *vpp = NULL; return ENXIO; } if (!vnode_isdir(dvp)) { return ENOTDIR; } if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP)) { return EROFS; } /* * We do access check prior to doing anything else only in the case * when we are at fs root (we'd like to say, "we are at the first * component", but that's not exactly the same... nevermind). * See further comments at further access checks. */ bzero(&facp, sizeof(facp)); if (vnode_isvroot(dvp)) { /* early permission check hack */ if ((err = fuse_internal_access(dvp, VEXEC, &facp, td, cred))) { return err; } } if (flags & ISDOTDOT) { nid = VTOFUD(dvp)->parent_nid; if (nid == 0) { return ENOENT; } fdisp_init(&fdi, 0); op = FUSE_GETATTR; goto calldaemon; } else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') { nid = VTOI(dvp); fdisp_init(&fdi, 0); op = FUSE_GETATTR; goto calldaemon; } else if (fuse_lookup_cache_enable) { err = cache_lookup(dvp, vpp, cnp, NULL, NULL); switch (err) { case -1: /* positive match */ atomic_add_acq_long(&fuse_lookup_cache_hits, 1); return 0; case 0: /* no match in cache */ atomic_add_acq_long(&fuse_lookup_cache_misses, 1); break; case ENOENT: /* negative match */ /* fall through */ default: return err; } } nid = VTOI(dvp); fdisp_init(&fdi, cnp->cn_namelen + 1); op = FUSE_LOOKUP; calldaemon: fdisp_make(&fdi, op, mp, nid, td, cred); if (op == FUSE_LOOKUP) { memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; } lookup_err = fdisp_wait_answ(&fdi); if ((op == FUSE_LOOKUP) && !lookup_err) { /* lookup call succeeded */ nid = ((struct fuse_entry_out *)fdi.answ)->nodeid; if (!nid) { /* * zero nodeid is the same as "not found", * but it's also cacheable (which we keep * keep on doing not as of writing this) */ lookup_err = ENOENT; } else if (nid == FUSE_ROOT_ID) { lookup_err = EINVAL; } } if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT || op != FUSE_LOOKUP)) { fdisp_destroy(&fdi); return lookup_err; } /* lookup_err, if non-zero, must be ENOENT at this point */ if (lookup_err) { if ((nameiop == CREATE || nameiop == RENAME) && islastcn /* && directory dvp has not been removed */ ) { if (vfs_isrdonly(mp)) { err = EROFS; goto out; } #if 0 /* THINK_ABOUT_THIS */ if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { goto out; } #endif /* * Possibly record the position of a slot in the * directory large enough for the new component name. * This can be recorded in the vnode private data for * dvp. Set the SAVENAME flag to hold onto the * pathname for use later in VOP_CREATE or VOP_RENAME. */ cnp->cn_flags |= SAVENAME; err = EJUSTRETURN; goto out; } /* Consider inserting name into cache. */ /* * No we can't use negative caching, as the fs * changes are out of our control. * False positives' falseness turns out just as things * go by, but false negatives' falseness doesn't. * (and aiding the caching mechanism with extra control * mechanisms comes quite close to beating the whole purpose * caching...) */ #if 0 if ((cnp->cn_flags & MAKEENTRY) != 0) { SDT_PROBE2(fuse, , vnops, trace, 1, "inserting NULL into cache"); cache_enter(dvp, NULL, cnp); } #endif err = ENOENT; goto out; } else { /* !lookup_err */ struct fuse_entry_out *feo = NULL; struct fuse_attr *fattr = NULL; if (op == FUSE_GETATTR) { fattr = &((struct fuse_attr_out *)fdi.answ)->attr; } else { feo = (struct fuse_entry_out *)fdi.answ; fattr = &(feo->attr); } /* * If deleting, and at end of pathname, return parameters * which can be used to remove file. If the wantparent flag * isn't set, we return only the directory, otherwise we go on * and lock the inode, being careful with ".". */ if (nameiop == DELETE && islastcn) { /* * Check for write access on directory. */ facp.xuid = fattr->uid; facp.facc_flags |= FACCESS_STICKY; err = fuse_internal_access(dvp, VWRITE, &facp, td, cred); facp.facc_flags &= ~FACCESS_XQUERIES; if (err) { goto out; } if (nid == VTOI(dvp)) { vref(dvp); *vpp = dvp; } else { err = fuse_vnode_get(dvp->v_mount, feo, nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) goto out; *vpp = vp; } /* * Save the name for use in VOP_RMDIR and VOP_REMOVE * later. */ cnp->cn_flags |= SAVENAME; goto out; } /* * If rewriting (RENAME), return the inode and the * information required to rewrite the present directory * Must get inode of directory entry to verify it's a * regular file, or empty directory. */ if (nameiop == RENAME && wantparent && islastcn) { #if 0 /* THINK_ABOUT_THIS */ if ((err = fuse_internal_access(dvp, VWRITE, cred, td, &facp))) { goto out; } #endif /* * Check for "." */ if (nid == VTOI(dvp)) { err = EISDIR; goto out; } err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) { goto out; } *vpp = vp; /* * Save the name for use in VOP_RENAME later. */ cnp->cn_flags |= SAVENAME; goto out; } if (flags & ISDOTDOT) { struct mount *mp; int ltype; /* * Expanded copy of vn_vget_ino() so that * fuse_vnode_get() can be used. */ mp = dvp->v_mount; ltype = VOP_ISLOCKED(dvp); err = vfs_busy(mp, MBF_NOWAIT); if (err != 0) { vfs_ref(mp); VOP_UNLOCK(dvp, 0); err = vfs_busy(mp, 0); vn_lock(dvp, ltype | LK_RETRY); vfs_rel(mp); if (err) goto out; if ((dvp->v_iflag & VI_DOOMED) != 0) { err = ENOENT; vfs_unbusy(mp); goto out; } } VOP_UNLOCK(dvp, 0); err = fuse_vnode_get(vnode_mount(dvp), feo, nid, NULL, &vp, cnp, IFTOVT(fattr->mode)); vfs_unbusy(mp); vn_lock(dvp, ltype | LK_RETRY); if ((dvp->v_iflag & VI_DOOMED) != 0) { if (err == 0) vput(vp); err = ENOENT; } if (err) goto out; *vpp = vp; } else if (nid == VTOI(dvp)) { vref(dvp); *vpp = dvp; } else { struct fuse_vnode_data *fvdat; err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp, &vp, cnp, IFTOVT(fattr->mode)); if (err) { goto out; } fuse_vnode_setparent(vp, dvp); /* * In the case where we are looking up a FUSE node * represented by an existing cached vnode, and the * true size reported by FUSE_LOOKUP doesn't match * the vnode's cached size, fix the vnode cache to * match the real object size. * * This can occur via FUSE distributed filesystems, * irregular files, etc. */ fvdat = VTOFUD(vp); if (vnode_isreg(vp) && fattr->size != fvdat->filesize) { /* * The FN_SIZECHANGE flag reflects a dirty * append. If userspace lets us know our cache * is invalid, that write was lost. (Dirty * writes that do not cause append are also * lost, but we don't detect them here.) * * XXX: Maybe disable WB caching on this mount. */ if (fvdat->flag & FN_SIZECHANGE) printf("%s: WB cache incoherent on " "%s!\n", __func__, vnode_mount(vp)->mnt_stat.f_mntonname); (void)fuse_vnode_setsize(vp, fattr->size); fvdat->flag &= ~FN_SIZECHANGE; } *vpp = vp; } if (op == FUSE_GETATTR) { struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ; fuse_internal_cache_attrs(*vpp, &fao->attr, fao->attr_valid, fao->attr_valid_nsec, NULL); } else { struct fuse_entry_out *feo = (struct fuse_entry_out*)fdi.answ; fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL); } /* Insert name into cache if appropriate. */ /* * Nooo, caching is evil. With caching, we can't avoid stale * information taking over the playground (cached info is not * just positive/negative, it does have qualitative aspects, * too). And a (VOP/FUSE)_GETATTR is always thrown anyway, when * walking down along cached path components, and that's not * any cheaper than FUSE_LOOKUP. This might change with * implementing kernel side attr caching, but... In Linux, * lookup results are not cached, and the daemon is bombarded * with FUSE_LOOKUPS on and on. This shows that by design, the * daemon is expected to handle frequent lookup queries * efficiently, do its caching in userspace, and so on. * * So just leave the name cache alone. */ /* * Well, now I know, Linux caches lookups, but with a * timeout... So it's the same thing as attribute caching: * we can deal with it when implement timeouts. */ #if 0 if (cnp->cn_flags & MAKEENTRY) { cache_enter(dvp, *vpp, cnp); } #endif } out: if (!lookup_err) { /* No lookup error; need to clean up. */ if (err) { /* Found inode; exit with no vnode. */ if (op == FUSE_LOOKUP) { fuse_internal_forget_send(vnode_mount(dvp), td, cred, nid, 1); } fdisp_destroy(&fdi); return err; } else { #ifndef NO_EARLY_PERM_CHECK_HACK if (!islastcn) { /* * We have the attributes of the next item * *now*, and it's a fact, and we do not * have to do extra work for it (ie, beg the * daemon), and it neither depends on such * accidental things like attr caching. So * the big idea: check credentials *now*, * not at the beginning of the next call to * lookup. * * The first item of the lookup chain (fs root) * won't be checked then here, of course, as * its never "the next". But go and see that * the root is taken care about at the very * beginning of this function. * * Now, given we want to do the access check * this way, one might ask: so then why not * do the access check just after fetching * the inode and its attributes from the * daemon? Why bother with producing the * corresponding vnode at all if something * is not OK? We know what's the deal as * soon as we get those attrs... There is * one bit of info though not given us by * the daemon: whether his response is * authoritative or not... His response should * be ignored if something is mounted over * the dir in question. But that can be * known only by having the vnode... */ int tmpvtype = vnode_vtype(*vpp); bzero(&facp, sizeof(facp)); /*the early perm check hack */ facp.facc_flags |= FACCESS_VA_VALID; if ((tmpvtype != VDIR) && (tmpvtype != VLNK)) { err = ENOTDIR; } if (!err && !vnode_mountedhere(*vpp)) { err = fuse_internal_access(*vpp, VEXEC, &facp, td, cred); } if (err) { if (tmpvtype == VLNK) SDT_PROBE2(fuse, , vnops, trace, 1, "weird, permission " "error with a symlink?"); vput(*vpp); *vpp = NULL; } } #endif } } fdisp_destroy(&fdi); return err; } /* struct vnop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mkdir(struct vop_mkdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct vattr *vap = ap->a_vap; struct fuse_mkdir_in fmdi; if (fuse_isdeadfs(dvp)) { return ENXIO; } fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode); return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi, sizeof(fmdi), VDIR)); } /* struct vnop_mknod_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; */ static int fuse_vnop_mknod(struct vop_mknod_args *ap) { return (EINVAL); } /* struct vnop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; int a_fdidx; / struct file *a_fp; }; */ static int fuse_vnop_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; int mode = ap->a_mode; struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; fufh_type_t fufh_type; struct fuse_vnode_data *fvdat; int error, isdir = 0; int32_t fuse_open_flags; if (fuse_isdeadfs(vp)) { return ENXIO; } if ((mode & (FREAD | FWRITE)) == 0) return EINVAL; fvdat = VTOFUD(vp); if (vnode_isdir(vp)) { isdir = 1; } fuse_open_flags = 0; if (isdir) { fufh_type = FUFH_RDONLY; } else { fufh_type = fuse_filehandle_xlate_from_fflags(mode); /* * For WRONLY opens, force DIRECT_IO. This is necessary * since writing a partial block through the buffer cache * will result in a read of the block and that read won't * be allowed by the WRONLY open. */ if (fufh_type == FUFH_WRONLY || (fvdat->flag & FN_DIRECTIO) != 0) fuse_open_flags = FOPEN_DIRECT_IO; } if (fuse_filehandle_validrw(vp, fufh_type) != FUFH_INVALID) { fuse_vnode_open(vp, fuse_open_flags, td); return 0; } error = fuse_filehandle_open(vp, fufh_type, NULL, td, cred); return error; } static int fuse_vnop_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = MIN(LONG_MAX, FUSE_LINK_MAX); return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } } /* struct vnop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_read(struct vop_read_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } return fuse_io_dispatch(vp, uio, ioflag, cred); } /* struct vnop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *ncookies; u_long **a_cookies; }; */ static int fuse_vnop_readdir(struct vop_readdir_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_filehandle *fufh = NULL; struct fuse_iov cookediov; int err = 0; int freefufh = 0; if (fuse_isdeadfs(vp)) { return ENXIO; } if ( /* XXXIP ((uio_iovcnt(uio) > 1)) || */ (uio_resid(uio) < sizeof(struct dirent))) { return EINVAL; } if (!fuse_filehandle_valid(vp, FUFH_RDONLY)) { SDT_PROBE2(fuse, , vnops, trace, 1, "calling readdir() before open()"); err = fuse_filehandle_open(vp, FUFH_RDONLY, &fufh, NULL, cred); freefufh = 1; } else { err = fuse_filehandle_get(vp, FUFH_RDONLY, &fufh); } if (err) { return (err); } #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1) fiov_init(&cookediov, DIRCOOKEDSIZE); err = fuse_internal_readdir(vp, uio, fufh, &cookediov); fiov_teardown(&cookediov); if (freefufh) { fuse_filehandle_close(vp, FUFH_RDONLY, NULL, cred); } return err; } /* struct vnop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; }; */ static int fuse_vnop_readlink(struct vop_readlink_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct ucred *cred = ap->a_cred; struct fuse_dispatcher fdi; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (!vnode_islnk(vp)) { return EINVAL; } fdisp_init(&fdi, 0); err = fdisp_simple_putget_vp(&fdi, FUSE_READLINK, vp, curthread, cred); if (err) { goto out; } if (((char *)fdi.answ)[0] == '/' && fuse_get_mpdata(vnode_mount(vp))->dataflags & FSESS_PUSH_SYMLINKS_IN) { char *mpth = vnode_mount(vp)->mnt_stat.f_mntonname; err = uiomove(mpth, strlen(mpth), uio); } if (!err) { err = uiomove(fdi.answ, fdi.iosize, uio); } out: fdisp_destroy(&fdi); return err; } /* struct vnop_reclaim_args { struct vnode *a_vp; struct thread *a_td; }; */ static int fuse_vnop_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct thread *td = ap->a_td; struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh = NULL; int type; if (!fvdat) { panic("FUSE: no vnode data during recycling"); } for (type = 0; type < FUFH_MAXTYPE; type++) { fufh = &(fvdat->fufh[type]); if (FUFH_IS_VALID(fufh)) { printf("FUSE: vnode being reclaimed but fufh (type=%d) is valid", type); fuse_filehandle_close(vp, type, td, NULL); } } if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) { fuse_internal_forget_send(vnode_mount(vp), td, NULL, VTOI(vp), fvdat->nlookup); } fuse_vnode_setparent(vp, NULL); cache_purge(vp); vfs_hash_remove(vp); vnode_destroy_vobject(vp); fuse_vnode_destroy(vp); return 0; } /* struct vnop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; */ static int fuse_vnop_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct componentname *cnp = ap->a_cnp; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (vnode_isdir(vp)) { return EPERM; } cache_purge(vp); err = fuse_internal_remove(dvp, vp, cnp, FUSE_UNLINK); if (err == 0) fuse_internal_vnode_disappear(vp); return err; } /* struct vnop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; }; */ static int fuse_vnop_rename(struct vop_rename_args *ap) { struct vnode *fdvp = ap->a_fdvp; struct vnode *fvp = ap->a_fvp; struct componentname *fcnp = ap->a_fcnp; struct vnode *tdvp = ap->a_tdvp; struct vnode *tvp = ap->a_tvp; struct componentname *tcnp = ap->a_tcnp; struct fuse_data *data; int err = 0; if (fuse_isdeadfs(fdvp)) { return ENXIO; } if (fvp->v_mount != tdvp->v_mount || (tvp && fvp->v_mount != tvp->v_mount)) { SDT_PROBE2(fuse, , vnops, trace, 1, "cross-device rename"); err = EXDEV; goto out; } cache_purge(fvp); /* * FUSE library is expected to check if target directory is not * under the source directory in the file system tree. * Linux performs this check at VFS level. */ data = fuse_get_mpdata(vnode_mount(tdvp)); sx_xlock(&data->rename_lock); err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp); if (err == 0) { if (tdvp != fdvp) fuse_vnode_setparent(fvp, tdvp); if (tvp != NULL) fuse_vnode_setparent(tvp, NULL); } sx_unlock(&data->rename_lock); if (tvp != NULL && tvp != fvp) { cache_purge(tvp); } if (vnode_isdir(fvp)) { if ((tvp != NULL) && vnode_isdir(tvp)) { cache_purge(tdvp); } cache_purge(fdvp); } out: if (tdvp == tvp) { vrele(tdvp); } else { vput(tdvp); } if (tvp != NULL) { vput(tvp); } vrele(fdvp); vrele(fvp); return err; } /* struct vnop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } *ap; */ static int fuse_vnop_rmdir(struct vop_rmdir_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; int err; if (fuse_isdeadfs(vp)) { return ENXIO; } if (VTOFUD(vp) == VTOFUD(dvp)) { return EINVAL; } err = fuse_internal_remove(dvp, vp, ap->a_cnp, FUSE_RMDIR); if (err == 0) fuse_internal_vnode_disappear(vp); return err; } /* struct vnop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct ucred *cred = ap->a_cred; struct thread *td = curthread; struct fuse_dispatcher fdi; struct fuse_setattr_in *fsai; struct fuse_access_param facp; int err = 0; enum vtype vtyp; int sizechanged = 0; uint64_t newsize = 0; if (fuse_isdeadfs(vp)) { return ENXIO; } fdisp_init(&fdi, sizeof(*fsai)); fdisp_make_vp(&fdi, FUSE_SETATTR, vp, td, cred); fsai = fdi.indata; fsai->valid = 0; bzero(&facp, sizeof(facp)); facp.xuid = vap->va_uid; facp.xgid = vap->va_gid; if (vap->va_uid != (uid_t)VNOVAL) { facp.facc_flags |= FACCESS_CHOWN; fsai->uid = vap->va_uid; fsai->valid |= FATTR_UID; } if (vap->va_gid != (gid_t)VNOVAL) { facp.facc_flags |= FACCESS_CHOWN; fsai->gid = vap->va_gid; fsai->valid |= FATTR_GID; } if (vap->va_size != VNOVAL) { struct fuse_filehandle *fufh = NULL; /*Truncate to a new value. */ fsai->size = vap->va_size; sizechanged = 1; newsize = vap->va_size; fsai->valid |= FATTR_SIZE; fuse_filehandle_getrw(vp, FUFH_WRONLY, &fufh); if (fufh) { fsai->fh = fufh->fh_id; fsai->valid |= FATTR_FH; } } if (vap->va_atime.tv_sec != VNOVAL) { fsai->atime = vap->va_atime.tv_sec; fsai->atimensec = vap->va_atime.tv_nsec; fsai->valid |= FATTR_ATIME; } if (vap->va_mtime.tv_sec != VNOVAL) { fsai->mtime = vap->va_mtime.tv_sec; fsai->mtimensec = vap->va_mtime.tv_nsec; fsai->valid |= FATTR_MTIME; } if (vap->va_mode != (mode_t)VNOVAL) { fsai->mode = vap->va_mode & ALLPERMS; fsai->valid |= FATTR_MODE; } if (!fsai->valid) { goto out; } vtyp = vnode_vtype(vp); if (fsai->valid & FATTR_SIZE && vtyp == VDIR) { err = EISDIR; goto out; } if (vfs_isrdonly(vnode_mount(vp)) && (fsai->valid & ~FATTR_SIZE || vtyp == VREG)) { err = EROFS; goto out; } if (fsai->valid & ~FATTR_SIZE) { /*err = fuse_internal_access(vp, VADMIN, context, &facp); */ /*XXX */ err = 0; } facp.facc_flags &= ~FACCESS_XQUERIES; if (err && !(fsai->valid & ~(FATTR_ATIME | FATTR_MTIME)) && vap->va_vaflags & VA_UTIMES_NULL) { err = fuse_internal_access(vp, VWRITE, &facp, td, cred); } if (err) goto out; if ((err = fdisp_wait_answ(&fdi))) goto out; vtyp = IFTOVT(((struct fuse_attr_out *)fdi.answ)->attr.mode); if (vnode_vtype(vp) != vtyp) { if (vnode_vtype(vp) == VNON && vtyp != VNON) { SDT_PROBE2(fuse, , vnops, trace, 1, "FUSE: Dang! " "vnode_vtype is VNON and vtype isn't."); } else { /* * STALE vnode, ditch * * The vnode has changed its type "behind our back". * There's nothing really we can do, so let us just * force an internal revocation and tell the caller to * try again, if interested. */ fuse_internal_vnode_disappear(vp); err = EAGAIN; } } if (err == 0) { struct fuse_attr_out *fao = (struct fuse_attr_out*)fdi.answ; fuse_internal_cache_attrs(vp, &fao->attr, fao->attr_valid, fao->attr_valid_nsec, NULL); } out: fdisp_destroy(&fdi); if (!err && sizechanged) { fuse_vnode_setsize(vp, newsize); VTOFUD(vp)->flag &= ~FN_SIZECHANGE; } return err; } /* struct vnop_strategy_args { struct vnode *a_vp; struct buf *a_bp; }; */ static int fuse_vnop_strategy(struct vop_strategy_args *ap) { struct vnode *vp = ap->a_vp; struct buf *bp = ap->a_bp; if (!vp || fuse_isdeadfs(vp)) { bp->b_ioflags |= BIO_ERROR; bp->b_error = ENXIO; bufdone(bp); return ENXIO; } if (bp->b_iocmd == BIO_WRITE) fuse_vnode_refreshsize(vp, NOCRED); (void)fuse_io_strategy(vp, bp); /* * This is a dangerous function. If returns error, that might mean a * panic. We prefer pretty much anything over being forced to panic * by a malicious daemon (a demon?). So we just return 0 anyway. You * should never mind this: this function has its own error * propagation mechanism via the argument buffer, so * not-that-melodramatic residents of the call chain still will be * able to know what to do. */ return 0; } /* struct vnop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; */ static int fuse_vnop_symlink(struct vop_symlink_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; const char *target = ap->a_target; struct fuse_dispatcher fdi; int err; size_t len; if (fuse_isdeadfs(dvp)) { return ENXIO; } /* * Unlike the other creator type calls, here we have to create a message * where the name of the new entry comes first, and the data describing * the entry comes second. * Hence we can't rely on our handy fuse_internal_newentry() routine, * but put together the message manually and just call the core part. */ len = strlen(target) + 1; fdisp_init(&fdi, len + cnp->cn_namelen + 1); fdisp_make_vp(&fdi, FUSE_SYMLINK, dvp, curthread, NULL); memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen); ((char *)fdi.indata)[cnp->cn_namelen] = '\0'; memcpy((char *)fdi.indata + cnp->cn_namelen + 1, target, len); err = fuse_internal_newentry_core(dvp, vpp, cnp, VLNK, &fdi); fdisp_destroy(&fdi); return err; } /* struct vnop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; */ static int fuse_vnop_write(struct vop_write_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; int ioflag = ap->a_ioflag; struct ucred *cred = ap->a_cred; if (fuse_isdeadfs(vp)) { return ENXIO; } fuse_vnode_refreshsize(vp, cred); if (VTOFUD(vp)->flag & FN_DIRECTIO) { ioflag |= IO_DIRECT; } return fuse_io_dispatch(vp, uio, ioflag, cred); } SDT_PROBE_DEFINE1(fuse, , vnops, vnop_getpages_error, "int"); /* struct vnop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; }; */ static int fuse_vnop_getpages(struct vop_getpages_args *ap) { int i, error, nextoff, size, toff, count, npages; struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; struct vnode *vp; struct thread *td; struct ucred *cred; vm_page_t *pages; vp = ap->a_vp; KASSERT(vp->v_object, ("objectless vp passed to getpages")); td = curthread; /* XXX */ cred = curthread->td_ucred; /* XXX */ pages = ap->a_m; npages = ap->a_count; if (!fsess_opt_mmap(vnode_mount(vp))) { SDT_PROBE2(fuse, , vnops, trace, 1, "called on non-cacheable vnode??\n"); return (VM_PAGER_ERROR); } /* * If the last page is partially valid, just return it and allow * the pager to zero-out the blanks. Partially valid pages can * only occur at the file EOF. * * XXXGL: is that true for FUSE, which is a local filesystem, * but still somewhat disconnected from the kernel? */ VM_OBJECT_WLOCK(vp->v_object); if (pages[npages - 1]->valid != 0 && --npages == 0) goto out; VM_OBJECT_WUNLOCK(vp->v_object); /* * We use only the kva address for the buffer, but this is extremely * convenient and fast. */ bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK); kva = (vm_offset_t)bp->b_data; pmap_qenter(kva, pages, npages); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, npages); count = npages << PAGE_SHIFT; iov.iov_base = (caddr_t)kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); pmap_qremove(kva, npages); uma_zfree(fuse_pbuf_zone, bp); if (error && (uio.uio_resid == count)) { SDT_PROBE1(fuse, , vnops, vnop_getpages_error, error); return VM_PAGER_ERROR; } /* * Calculate the number of bytes read and validate only that number * of bytes. Note that due to pending writes, size may be 0. This * does not mean that the remaining data is invalid! */ size = count - uio.uio_resid; VM_OBJECT_WLOCK(vp->v_object); fuse_vm_page_lock_queues(); for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { vm_page_t m; nextoff = toff + PAGE_SIZE; m = pages[i]; if (nextoff <= size) { /* * Read operation filled an entire page */ m->valid = VM_PAGE_BITS_ALL; KASSERT(m->dirty == 0, ("fuse_getpages: page %p is dirty", m)); } else if (size > toff) { /* * Read operation filled a partial page. */ m->valid = 0; vm_page_set_valid_range(m, 0, size - toff); KASSERT(m->dirty == 0, ("fuse_getpages: page %p is dirty", m)); } else { /* * Read operation was short. If no error occurred * we may have hit a zero-fill section. We simply * leave valid set to 0. */ ; } } fuse_vm_page_unlock_queues(); out: VM_OBJECT_WUNLOCK(vp->v_object); if (ap->a_rbehind) *ap->a_rbehind = 0; if (ap->a_rahead) *ap->a_rahead = 0; return (VM_PAGER_OK); } /* struct vnop_putpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; vm_ooffset_t a_offset; }; */ static int fuse_vnop_putpages(struct vop_putpages_args *ap) { struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; int i, error, npages, count; off_t offset; int *rtvals; struct vnode *vp; struct thread *td; struct ucred *cred; vm_page_t *pages; vm_ooffset_t fsize; vp = ap->a_vp; KASSERT(vp->v_object, ("objectless vp passed to putpages")); fsize = vp->v_object->un_pager.vnp.vnp_size; td = curthread; /* XXX */ cred = curthread->td_ucred; /* XXX */ pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); offset = IDX_TO_OFF(pages[0]->pindex); if (!fsess_opt_mmap(vnode_mount(vp))) { SDT_PROBE2(fuse, , vnops, trace, 1, "called on non-cacheable vnode??\n"); } for (i = 0; i < npages; i++) rtvals[i] = VM_PAGER_AGAIN; /* * When putting pages, do not extend file past EOF. */ if (offset + count > fsize) { count = fsize - offset; if (count < 0) count = 0; } /* * We use only the kva address for the buffer, but this is extremely * convenient and fast. */ bp = uma_zalloc(fuse_pbuf_zone, M_WAITOK); kva = (vm_offset_t)bp->b_data; pmap_qenter(kva, pages, npages); VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, count); iov.iov_base = (caddr_t)kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; error = fuse_io_dispatch(vp, &uio, IO_DIRECT, cred); pmap_qremove(kva, npages); uma_zfree(fuse_pbuf_zone, bp); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; for (i = 0; i < nwritten; i++) { rtvals[i] = VM_PAGER_OK; VM_OBJECT_WLOCK(pages[i]->object); vm_page_undirty(pages[i]); VM_OBJECT_WUNLOCK(pages[i]->object); } } return rtvals[0]; } static const char extattr_namespace_separator = '.'; /* struct vop_getextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_getextattr(struct vop_getextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_getxattr_in *get_xattr_in; struct fuse_getxattr_out *get_xattr_out; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; char *attr_str; size_t len; int err; if (fuse_isdeadfs(vp)) return (ENXIO); /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len + sizeof(*get_xattr_in)); fdisp_make_vp(&fdi, FUSE_GETXATTR, vp, td, cred); get_xattr_in = fdi.indata; /* * Check to see whether we're querying the available size or * issuing the actual request. If we pass in 0, we get back struct * fuse_getxattr_out. If we pass in a non-zero size, we get back * that much data, without the struct fuse_getxattr_out header. */ if (uio == NULL) get_xattr_in->size = 0; else get_xattr_in->size = uio->uio_resid; attr_str = (char *)fdi.indata + sizeof(*get_xattr_in); snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_GETXATTR); goto out; } get_xattr_out = fdi.answ; if (ap->a_size != NULL) *ap->a_size = get_xattr_out->size; if (uio != NULL) err = uiomove(fdi.answ, fdi.iosize, uio); out: fdisp_destroy(&fdi); return (err); } /* struct vop_setextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct uio *a_uio; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_setextattr(struct vop_setextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_setxattr_in *set_xattr_in; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; size_t len; char *attr_str; int err; if (fuse_isdeadfs(vp)) return (ENXIO); /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len + sizeof(*set_xattr_in) + uio->uio_resid); fdisp_make_vp(&fdi, FUSE_SETXATTR, vp, td, cred); set_xattr_in = fdi.indata; set_xattr_in->size = uio->uio_resid; attr_str = (char *)fdi.indata + sizeof(*set_xattr_in); snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = uiomove((char *)fdi.indata + sizeof(*set_xattr_in) + len, uio->uio_resid, uio); if (err != 0) { goto out; } err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_SETXATTR); goto out; } out: fdisp_destroy(&fdi); return (err); } /* * The Linux / FUSE extended attribute list is simply a collection of * NUL-terminated strings. The FreeBSD extended attribute list is a single * byte length followed by a non-NUL terminated string. So, this allows * conversion of the Linux / FUSE format to the FreeBSD format in place. * Linux attribute names are reported with the namespace as a prefix (e.g. * "user.attribute_name"), but in FreeBSD they are reported without the * namespace prefix (e.g. "attribute_name"). So, we're going from: * * user.attr_name1\0user.attr_name2\0 * * to: * * attr_name1attr_name2 * * Where "" is a single byte number of characters in the attribute name. * * Args: * prefix - exattr namespace prefix string * list, list_len - input list with namespace prefixes * bsd_list, bsd_list_len - output list compatible with bsd vfs */ static int fuse_xattrlist_convert(char *prefix, const char *list, int list_len, char *bsd_list, int *bsd_list_len) { int len, pos, dist_to_next, prefix_len; pos = 0; *bsd_list_len = 0; prefix_len = strlen(prefix); while (pos < list_len && list[pos] != '\0') { dist_to_next = strlen(&list[pos]) + 1; if (bcmp(&list[pos], prefix, prefix_len) == 0 && list[pos + prefix_len] == extattr_namespace_separator) { len = dist_to_next - (prefix_len + sizeof(extattr_namespace_separator)) - 1; if (len >= EXTATTR_MAXNAMELEN) return (ENAMETOOLONG); bsd_list[*bsd_list_len] = len; memcpy(&bsd_list[*bsd_list_len + 1], &list[pos + prefix_len + sizeof(extattr_namespace_separator)], len); *bsd_list_len += len + 1; } pos += dist_to_next; } return (0); } /* struct vop_listextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; struct uio *a_uio; size_t *a_size; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_listextattr(struct vop_listextattr_args *ap) { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct fuse_dispatcher fdi; struct fuse_listxattr_in *list_xattr_in; struct fuse_listxattr_out *list_xattr_out; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; size_t len; char *prefix; char *attr_str; char *bsd_list = NULL; char *linux_list; int bsd_list_len; int linux_list_len; int err; if (fuse_isdeadfs(vp)) return (ENXIO); /* * Add space for a NUL and the period separator if enabled. * Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + 1; fdisp_init(&fdi, sizeof(*list_xattr_in) + len); fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); /* * Retrieve Linux / FUSE compatible list size. */ list_xattr_in = fdi.indata; list_xattr_in->size = 0; attr_str = (char *)fdi.indata + sizeof(*list_xattr_in); snprintf(attr_str, len, "%s%c", prefix, extattr_namespace_separator); err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_LISTXATTR); goto out; } list_xattr_out = fdi.answ; linux_list_len = list_xattr_out->size; if (linux_list_len == 0) { if (ap->a_size != NULL) *ap->a_size = linux_list_len; goto out; } /* * Retrieve Linux / FUSE compatible list values. */ fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred); list_xattr_in = fdi.indata; list_xattr_in->size = linux_list_len + sizeof(*list_xattr_out); attr_str = (char *)fdi.indata + sizeof(*list_xattr_in); snprintf(attr_str, len, "%s%c", prefix, extattr_namespace_separator); err = fdisp_wait_answ(&fdi); if (err != 0) goto out; linux_list = fdi.answ; linux_list_len = fdi.iosize; /* * Retrieve the BSD compatible list values. * The Linux / FUSE attribute list format isn't the same * as FreeBSD's format. So we need to transform it into * FreeBSD's format before giving it to the user. */ bsd_list = malloc(linux_list_len, M_TEMP, M_WAITOK); err = fuse_xattrlist_convert(prefix, linux_list, linux_list_len, bsd_list, &bsd_list_len); if (err != 0) goto out; if (ap->a_size != NULL) *ap->a_size = bsd_list_len; if (uio != NULL) err = uiomove(bsd_list, bsd_list_len, uio); out: free(bsd_list, M_TEMP); fdisp_destroy(&fdi); return (err); } /* struct vop_deleteextattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_attrnamespace; const char *a_name; struct ucred *a_cred; struct thread *a_td; }; */ static int fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap) { struct vnode *vp = ap->a_vp; struct fuse_dispatcher fdi; struct mount *mp = vnode_mount(vp); struct thread *td = ap->a_td; struct ucred *cred = ap->a_cred; char *prefix; size_t len; char *attr_str; int err; if (fuse_isdeadfs(vp)) return (ENXIO); /* Default to looking for user attributes. */ if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) prefix = EXTATTR_NAMESPACE_SYSTEM_STRING; else prefix = EXTATTR_NAMESPACE_USER_STRING; len = strlen(prefix) + sizeof(extattr_namespace_separator) + strlen(ap->a_name) + 1; fdisp_init(&fdi, len); fdisp_make_vp(&fdi, FUSE_REMOVEXATTR, vp, td, cred); attr_str = fdi.indata; snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator, ap->a_name); err = fdisp_wait_answ(&fdi); if (err != 0) { if (err == ENOSYS) fsess_set_notimpl(mp, FUSE_REMOVEXATTR); } fdisp_destroy(&fdi); return (err); } /* struct vnop_print_args { struct vnode *a_vp; }; */ static int fuse_vnop_print(struct vop_print_args *ap) { struct fuse_vnode_data *fvdat = VTOFUD(ap->a_vp); printf("nodeid: %ju, parent nodeid: %ju, nlookup: %ju, flag: %#x\n", (uintmax_t)VTOILLU(ap->a_vp), (uintmax_t)fvdat->parent_nid, (uintmax_t)fvdat->nlookup, fvdat->flag); return 0; } Index: head/sys/kern/kern_event.c =================================================================== --- head/sys/kern/kern_event.c (revision 350420) +++ head/sys/kern/kern_event.c (revision 350421) @@ -1,2740 +1,2741 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * Copyright (c) 2009 Apple, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_kqueue.h" #ifdef COMPAT_FREEBSD11 #define _WANT_FREEBSD11_KEVENT #endif #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. This possibly * should be made into a per proc lock. */ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue_ctx); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag); static int kqueue_acquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static void kqueue_destroy(struct kqueue *kq); static void kqueue_drain(struct kqueue *kq, struct thread *td); static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int mflag); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); struct g_kevent_args; static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name); static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static fo_fill_kinfo_t kqueue_fill_kinfo; static struct fileops kqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = kqueue_fill_kinfo, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_drop_detached(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote *knote_alloc(int mflag); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static void filt_timerstart(struct knote *kn, sbintime_t to); static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type); static int filt_timervalidate(struct knote *kn, sbintime_t *to); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); static void filt_userdetach(struct knote *kn); static int filt_user(struct knote *kn, long hint); static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type); static struct filterops file_filtops = { .f_isfd = 1, .f_attach = filt_fileattach, }; static struct filterops kqread_filtops = { .f_isfd = 1, .f_detach = filt_kqdetach, .f_event = filt_kqueue, }; /* XXX - move to kern_proc.c? */ static struct filterops proc_filtops = { .f_isfd = 0, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; static struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = filt_timer, .f_touch = filt_timertouch, }; static struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, }; static uma_zone_t knote_zone; static unsigned int kq_ncallouts = 0; static unsigned int kq_calloutmax = 4 * 1024; SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not influx ? */ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while(0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) static struct knlist * kn_list_lock(struct knote *kn) { struct knlist *knl; knl = kn->kn_knlist; if (knl != NULL) knl->kl_lock(knl->kl_lockarg); return (knl); } static void kn_list_unlock(struct knlist *knl) { bool do_free; if (knl == NULL) return; do_free = knl->kl_autodestroy && knlist_empty(knl); knl->kl_unlock(knl->kl_lockarg); if (do_free) { knlist_destroy(knl); free(knl, M_KQUEUE); } } static bool kn_in_flux(struct knote *kn) { return (kn->kn_influx > 0); } static void kn_enter_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx < INT_MAX); kn->kn_influx++; } static bool kn_leave_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx > 0); kn->kn_influx--; return (kn->kn_influx == 0); } #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ knl->kl_assert_locked((knl)->kl_lockarg); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ knl->kl_assert_unlocked((knl)->kl_lockarg); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while(0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #ifndef KN_HASHSIZE #define KN_HASHSIZE 64 /* XXX should be tunable */ #endif #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { .f_isfd = 0, .f_attach = filt_nullattach, }; /* XXX - make SYSINIT to add these, and move into respective modules. */ extern struct filterops sig_filtops; extern struct filterops fs_filtops; /* * Table for for all system-defined filters. */ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { struct filterops *for_fop; int for_nolock; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { { &file_filtops, 1 }, /* EVFILT_READ */ { &file_filtops, 1 }, /* EVFILT_WRITE */ { &null_filtops }, /* EVFILT_AIO */ { &file_filtops, 1 }, /* EVFILT_VNODE */ { &proc_filtops, 1 }, /* EVFILT_PROC */ { &sig_filtops, 1 }, /* EVFILT_SIGNAL */ { &timer_filtops, 1 }, /* EVFILT_TIMER */ { &file_filtops, 1 }, /* EVFILT_PROCDESC */ { &fs_filtops, 1 }, /* EVFILT_FS */ { &null_filtops }, /* EVFILT_LIO */ { &user_filtops, 1 }, /* EVFILT_USER */ { &null_filtops }, /* EVFILT_SENDFILE */ { &file_filtops, 1 }, /* EVFILT_EMPTY */ }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. */ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? */ static int filt_procattach(struct knote *kn) { struct proc *p; int error; bool exiting, immediate; exiting = immediate = false; if (kn->kn_sfflags & NOTE_EXIT) p = pfind_any(kn->kn_id); else p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if (p->p_flag & P_WEXIT) exiting = true; if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * Internal flag indicating registration done by kernel for the * purposes of getting a NOTE_CHILD notification. */ if (kn->kn_flags & EV_FLAG2) { kn->kn_flags &= ~EV_FLAG2; kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); immediate = true; /* Force immediate activation of child note. */ } /* * Internal flag indicating registration done by kernel (for other than * NOTE_CHILD). */ if (kn->kn_flags & EV_FLAG1) { kn->kn_flags &= ~EV_FLAG1; } knlist_add(p->p_klist, kn, 1); /* * Immediately activate any child notes or, in the case of a zombie * target process, exit notes. The latter is necessary to handle the * case where the target process, e.g. a child, dies before the kevent * is registered. */ if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { knlist_remove(kn->kn_knlist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? */ static int filt_proc(struct knote *kn, long hint) { struct proc *p; u_int event; p = kn->kn_ptr.p_proc; if (p == NULL) /* already activated, from attach filter */ return (0); /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; kn->kn_ptr.p_proc = NULL; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } /* * Called when the process forked. It mostly does the same as the * knote(), activating all knotes registered to be activated when the * process forked. Additionally, for each knote attached to the * parent, check whether user wants to track the new process. If so * attach a new knote to it, and immediately report an event with the * child's pid. */ void knote_fork(struct knlist *list, int pid) { struct kqueue *kq; struct knote *kn; struct kevent kev; int error; MPASS(list != NULL); KNL_ASSERT_LOCKED(list); if (SLIST_EMPTY(&list->kl_list)) return; memset(&kev, 0, sizeof(kev)); SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { KQ_UNLOCK(kq); continue; } /* * The same as knote(), activate the event. */ if ((kn->kn_sfflags & NOTE_TRACK) == 0) { if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK(kq); continue; } /* * The NOTE_TRACK case. In addition to the activation * of the event, we need to register new events to * track the child. Drop the locks in preparation for * the call to kqueue_register(). */ kn_enter_flux(kn); KQ_UNLOCK(kq); list->kl_unlock(list->kl_lockarg); /* * Activate existing knote and register tracking knotes with * new process. * * First register a knote to get just the child notice. This * must be a separate note from a potential NOTE_EXIT * notification since both NOTE_CHILD and NOTE_EXIT are defined * to use the data field (in conflicting ways). */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; /* * Then register another knote to track other potential events * from the new process. */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 0); list->kl_lock(list->kl_lockarg); KQ_LOCK(kq); kn_leave_flux(kn); KQ_UNLOCK_FLUX(kq); } } /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. */ #define NOTE_TIMER_PRECMASK \ (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) static sbintime_t timer2sbintime(intptr_t data, int flags) { int64_t secs; /* * Macros for converting to the fractional second portion of an * sbintime_t using 64bit multiplication to improve precision. */ #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) switch (flags & NOTE_TIMER_PRECMASK) { case NOTE_SECONDS: #ifdef __LP64__ if (data > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return ((sbintime_t)data << 32); case NOTE_MSECONDS: /* FALLTHROUGH */ case 0: if (data >= 1000) { secs = data / 1000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | MS_TO_SBT(data % 1000)); } return (MS_TO_SBT(data)); case NOTE_USECONDS: if (data >= 1000000) { secs = data / 1000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000)); } return (US_TO_SBT(data)); case NOTE_NSECONDS: if (data >= 1000000000) { secs = data / 1000000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000000)); } return (NS_TO_SBT(data)); default: break; } return (-1); } struct kq_timer_cb_data { struct callout c; sbintime_t next; /* next timer event fires at */ sbintime_t to; /* precalculated timer period, 0 for abs */ }; static void filt_timerexpire(void *knx) { struct knote *kn; struct kq_timer_cb_data *kc; kn = knx; kn->kn_data++; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ if ((kn->kn_flags & EV_ONESHOT) != 0) return; kc = kn->kn_ptr.p_v; if (kc->to == 0) return; kc->next += kc->to; callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); } /* * data contains amount of time to sleep */ static int filt_timervalidate(struct knote *kn, sbintime_t *to) { struct bintime bt; sbintime_t sbt; if (kn->kn_sdata < 0) return (EINVAL); if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) kn->kn_sdata = 1; /* * The only fflags values supported are the timer unit * (precision) and the absolute time indicator. */ if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) return (EINVAL); *to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { getboottimebin(&bt); sbt = bttosbt(bt); *to -= sbt; } if (*to < 0) return (EINVAL); return (0); } static int filt_timerattach(struct knote *kn) { struct kq_timer_cb_data *kc; sbintime_t to; unsigned int ncallouts; int error; error = filt_timervalidate(kn, &to); if (error != 0) return (error); do { ncallouts = kq_ncallouts; if (ncallouts >= kq_calloutmax) return (ENOMEM); } while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1)); if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); callout_init(&kc->c, 1); filt_timerstart(kn, to); return (0); } static void filt_timerstart(struct knote *kn, sbintime_t to) { struct kq_timer_cb_data *kc; kc = kn->kn_ptr.p_v; if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { kc->next = to; kc->to = 0; } else { kc->next = to + sbinuptime(); kc->to = to; } callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE); } static void filt_timerdetach(struct knote *kn) { struct kq_timer_cb_data *kc; unsigned int old __unused; kc = kn->kn_ptr.p_v; callout_drain(&kc->c); free(kc, M_KQUEUE); old = atomic_fetchadd_int(&kq_ncallouts, -1); KASSERT(old > 0, ("Number of callouts cannot become negative")); kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ } static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type) { struct kq_timer_cb_data *kc; struct kqueue *kq; sbintime_t to; int error; switch (type) { case EVENT_REGISTER: /* Handle re-added timers that update data/fflags */ if (kev->flags & EV_ADD) { kc = kn->kn_ptr.p_v; /* Drain any existing callout. */ callout_drain(&kc->c); /* Throw away any existing undelivered record * of the timer expiration. This is done under * the presumption that if a process is * re-adding this timer with new parameters, * it is no longer interested in what may have * happened under the old parameters. If it is * interested, it can wait for the expiration, * delete the old timer definition, and then * add the new one. * * This has to be done while the kq is locked: * - if enqueued, dequeue * - make it no longer active * - clear the count of expiration events */ kq = kn->kn_kq; KQ_LOCK(kq); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); kn->kn_status &= ~KN_ACTIVE; kn->kn_data = 0; KQ_UNLOCK(kq); /* Reschedule timer based on new data/fflags */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; error = filt_timervalidate(kn, &to); if (error != 0) { kn->kn_flags |= EV_ERROR; kn->kn_data = error; } else filt_timerstart(kn, to); } break; case EVENT_PROCESS: *kev = kn->kn_kevent; if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_timertouch() - invalid type (%ld)", type); break; } } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } static int filt_userattach(struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) kn->kn_hookid = 1; else kn->kn_hookid = 0; return (0); } static void filt_userdetach(__unused struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ } static int filt_user(struct knote *kn, __unused long hint) { return (kn->kn_hookid); } static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) { u_int ffctrl; switch (type) { case EVENT_REGISTER: if (kev->fflags & NOTE_TRIGGER) kn->kn_hookid = 1; ffctrl = kev->fflags & NOTE_FFCTRLMASK; kev->fflags &= NOTE_FFLAGSMASK; switch (ffctrl) { case NOTE_FFNOP: break; case NOTE_FFAND: kn->kn_sfflags &= kev->fflags; break; case NOTE_FFOR: kn->kn_sfflags |= kev->fflags; break; case NOTE_FFCOPY: kn->kn_sfflags = kev->fflags; break; default: /* XXX Return error? */ break; } kn->kn_sdata = kev->data; if (kev->flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; case EVENT_PROCESS: *kev = kn->kn_kevent; kev->fflags = kn->kn_sfflags; kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_usertouch() - invalid type (%ld)", type); break; } } int sys_kqueue(struct thread *td, struct kqueue_args *uap) { return (kern_kqueue(td, 0, NULL)); } static void kqueue_init(struct kqueue *kq) { mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&kq->kq_head); knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); } int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; struct ucred *cred; int fd, error; fdp = td->td_proc->p_fd; cred = td->td_ucred; if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) return (ENOMEM); error = falloc_caps(td, &fp, &fd, flags, fcaps); if (error != 0) { chgkqcnt(cred->cr_ruidinfo, -1, 0); return (error); } /* An extra reference on `fp' has been held for us by falloc(). */ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); kqueue_init(kq); kq->kq_fdp = fdp; kq->kq_cred = crhold(cred); FILEDESC_XLOCK(fdp); TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; return (0); } struct g_kevent_args { int fd; void *changelist; int nchanges; void *eventlist; int nevents; const struct timespec *timeout; }; int sys_kevent(struct thread *td, struct kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent_copyout, .k_copyin = kevent_copyin, .kevent_size = sizeof(struct kevent), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent")); } static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name) { struct timespec ts, *tsp; #ifdef KTRACE struct kevent *eventlist = uap->eventlist; #endif int error; if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist, uap->nchanges, k_ops->kevent_size); #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, k_ops, tsp); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, eventlist, td->td_retval[0], k_ops->kevent_size); #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. */ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } #ifdef COMPAT_FREEBSD11 static int kevent11_copyout(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { kev11.ident = kevp->ident; kev11.filter = kevp->filter; kev11.flags = kevp->flags; kev11.fflags = kevp->fflags; kev11.data = kevp->data; kev11.udata = kevp->udata; error = copyout(&kev11, uap->eventlist, sizeof(kev11)); if (error != 0) break; uap->eventlist++; kevp++; } return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent11_copyin(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct kevent_freebsd11 kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { error = copyin(uap->changelist, &kev11, sizeof(kev11)); if (error != 0) break; kevp->ident = kev11.ident; kevp->filter = kev11.filter; kevp->flags = kev11.flags; kevp->fflags = kev11.fflags; kevp->data = (uintptr_t)kev11.data; kevp->udata = kev11.udata; bzero(&kevp->ext, sizeof(kevp->ext)); uap->changelist++; kevp++; } return (error); } int freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent11_copyout, .k_copyin = kevent11_copyin, .kevent_size = sizeof(struct kevent_freebsd11), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11")); } #endif int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { cap_rights_t rights; struct file *fp; int error; cap_rights_init(&rights); if (nchanges > 0) cap_rights_set(&rights, CAP_KQUEUE_CHANGE); if (nevents > 0) cap_rights_set(&rights, CAP_KQUEUE_EVENT); error = fget(td, fd, &rights, &fp); if (error != 0) return (error); error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); fdrop(fp, td); return (error); } static int kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; int i, n, nerrors, error; nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) return (error); changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, M_WAITOK); if (error || (kevp->flags & EV_RECEIPT)) { if (nevents == 0) return (error); kevp->flags = EV_ERROR; kevp->data = error; (void)k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; return (0); } return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); } int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kqueue *kq; int error; error = kqueue_acquire(fp, &kq); if (error != 0) return (error); error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); kqueue_release(kq, 0); return (error); } /* * Performs a kevent() call on a temporarily created kqueue. This can be * used to perform one-shot polling, similar to poll() and select(). */ int kern_kevent_anonymous(struct thread *td, int nevents, struct kevent_copyops *k_ops) { struct kqueue kq = {}; int error; kqueue_init(&kq); kq.kq_refcnt = 1; error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); kqueue_drain(&kq, td); kqueue_destroy(&kq); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (error); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; if (sysfilt_ops[~filt].for_nolock) return sysfilt_ops[~filt].for_fop; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; if (sysfilt_ops[~filt].for_nolock) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_acquire) must be held. */ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag) { struct filterops *fops; struct file *fp; struct knote *kn, *tkn; struct knlist *knl; int error, filt, event; int haskqglobal, filedesc_unlock; if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) return (EINVAL); fp = NULL; kn = NULL; knl = NULL; error = 0; haskqglobal = 0; filedesc_unlock = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; if (kev->flags & EV_ADD) { /* * Prevent waiting with locks. Non-sleepable * allocation failures are handled in the loop, only * if the spare knote appears to be actually required. */ tkn = knote_alloc(mflag); } else { tkn = NULL; } findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); if (kev->ident > INT_MAX) error = EBADF; else error = fget(td, kev->ident, &cap_event_rights, &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, M_NOWAIT) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, mflag); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * If we add some intelligence about what we are doing, * we should be able to support events on ourselves. * We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } /* * Pre-lock the filedesc before the global * lock mutex, see the comment in * kqueue_close(). */ FILEDESC_XLOCK(td->td_proc->p_fd); filedesc_unlock = 1; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) { error = kqueue_expand(kq, fops, kev->ident, mflag); if (error != 0) goto done; } KQ_LOCK(kq); /* * If possible, find an existing knote to use for this kevent. */ if (kev->filter == EVFILT_PROC && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { /* This is an internal creation of a process tracking * note. Don't attempt to coalesce this with an * existing note. */ ; } else if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stabilize. */ if (kn != NULL && kn_in_flux(kn)) { KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) { FILEDESC_XUNLOCK(td->td_proc->p_fd); filedesc_unlock = 0; } kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); if (fp != NULL) { fdrop(fp, td); fp = NULL; } goto findkn; } /* * kn now contains the matching knote, or NULL if no match */ if (kn == NULL) { if (kev->flags & EV_ADD) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); kn->kn_status = KN_DETACHED; if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; kn_enter_flux(kn); error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop_detached(kn, td); goto done; } knl = kn_list_lock(kn); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ KQ_UNLOCK(kq); error = ENOENT; goto done; } } if (kev->flags & EV_DELETE) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); goto done; } if (kev->flags & EV_FORCEONESHOT) { kn->kn_flags |= EV_ONESHOT; KNOTE_ACTIVATE(kn, 1); } if ((kev->flags & EV_ENABLE) != 0) kn->kn_status &= ~KN_DISABLED; else if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; /* * The user may change some filter values after the initial EV_ADD, * but doing so will not reset any filter which has already been * triggered. */ kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); knl = kn_list_lock(kn); kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) { fops->f_touch(kn, kev, EVENT_REGISTER); } else { kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; } done_ev_add: /* * We can get here with kn->kn_knlist == NULL. This can happen when * the initial attach event decides that the event is "completed" * already, e.g., filt_procattach() is called on a zombie process. It * will call filt_proc() which will remove it from the list, and NULL * kn_knlist. * * KN_DISABLED will be stable while the knote is in flux, so the * unlocked read will not race with an update. */ if ((kn->kn_status & KN_DISABLED) == 0) event = kn->kn_fop->f_event(kn, 0); else event = 0; KQ_LOCK(kq); if (event) kn->kn_status |= KN_ACTIVE; if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == KN_ACTIVE) knote_enqueue(kn); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) FILEDESC_XUNLOCK(td->td_proc->p_fd); if (fp != NULL) fdrop(fp, td); knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_acquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) return (EBADF); *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); return (EBADF); } kq->kq_refcnt++; KQ_UNLOCK(kq); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } static void kqueue_schedtask(struct kqueue *kq) { KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. */ static int kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident, int mflag) { struct klist *list, *tmp_knhash, *to_free; u_long tmp_knhashmask; int error, fd, size; KQ_NOTOWNED(kq); error = 0; to_free = NULL; if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; list = malloc(size * sizeof(*list), M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = list; error = EBADF; } else if (kq->kq_knlistsize > fd) { to_free = list; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof(*list)); to_free = kq->kq_knlist; kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof(*list), (size - kq->kq_knlistsize) * sizeof(*list)); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask, (mflag & M_WAITOK) != 0 ? HASH_WAITOK : HASH_NOWAIT); if (tmp_knhash == NULL) return (ENOMEM); KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = tmp_knhash; error = EBADF; } else if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { to_free = tmp_knhash; } KQ_UNLOCK(kq); } } free(to_free, M_KQUEUE); KQ_NOTOWNED(kq); return (error); } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. * We treat KN_MARKER knotes as if they are in flux. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct knote *kn, *marker; struct knlist *knl; sbintime_t asbt, rsbt; int count, error, haskqglobal, influx, nkev, touch; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; rsbt = 0; if (tsp != NULL) { if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000) { error = EINVAL; goto done_nl; } if (timespecisset(tsp)) { if (tsp->tv_sec <= INT32_MAX) { rsbt = tstosbt(*tsp); if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = 0; rsbt >>= tc_precexp; } else asbt = 0; } else asbt = -1; } else asbt = 0; marker = knote_alloc(M_WAITOK); marker->kn_status = KN_MARKER; KQ_LOCK(kq); retry: kevp = keva; if (kq->kq_count == 0) { if (asbt == -1) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", asbt, rsbt, C_ABSOLUTE); } if (error == 0) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); influx = 0; while (count) { KQ_OWNED(kq); kn = TAILQ_FIRST(&kq->kq_head); if ((kn->kn_status == KN_MARKER && kn != marker) || kn_in_flux(kn)) { if (influx) { influx = 0; KQ_FLUX_WAKEUP(kq); } kq->kq_state |= KQ_FLUXWAIT; error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); continue; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if (kn == marker) { KQ_FLUX_WAKEUP(kq); if (count == maxevents) goto retry; goto done; } KASSERT(!kn_in_flux(kn), ("knote %p is unexpectedly in flux", kn)); if ((kn->kn_flags & EV_DROP) == EV_DROP) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked it as in flux. */ knote_drop(kn, td); KQ_LOCK(kq); continue; } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked the knote as being in flux. */ *kevp = kn->kn_kevent; knote_drop(kn, td); KQ_LOCK(kq); kn = NULL; } else { kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); knl = kn_list_lock(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_SCAN); kn_leave_flux(kn); kq->kq_count--; kn_list_unlock(knl); influx = 1; continue; } touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL); if (touch) kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); else *kevp = kn->kn_kevent; KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { /* * Manually clear knotes who weren't * 'touch'ed. */ if (touch == 0 && kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } if (kn->kn_flags & EV_DISPATCH) kn->kn_status |= KN_DISABLED; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); influx = 1; } /* we are returning a copy to the user */ kevp++; nkev++; count--; if (nkev == KQ_NEVENTS) { influx = 0; KQ_UNLOCK_FLUX(kq); error = k_ops->k_copyout(k_ops->arg, keva, nkev); nkev = 0; kevp = keva; KQ_LOCK(kq); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: KQ_OWNED(kq); KQ_UNLOCK_FLUX(kq); knote_free(marker); done_nl: KQ_NOTOWNED(kq); if (nkev != 0) error = k_ops->k_copyout(k_ops->arg, keva, nkev); td->td_retval[0] = maxevents - count; return (error); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { /* * Enabling sigio causes two major problems: * 1) infinite recursion: * Synopsys: kevent is being used to track signals and have FIOASYNC * set. On receipt of a signal this will cause a kqueue to recurse * into itself over and over. Sending the sigio causes the kqueue * to become ready, which in turn posts sigio again, forever. * Solution: this can be solved by setting a flag in the kqueue that * we have a SIGIO in progress. * 2) locking problems: * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts * us above the proc and pgrp locks. * Solution: Post a signal using an async mechanism, being sure to * record a generation count in the delivery so that we do not deliver * a signal to the wrong process. * * Note, these two mechanisms are somewhat mutually exclusive! */ #if 0 struct kqueue *kq; kq = fp->f_data; switch (cmd) { case FIOASYNC: if (*(int *)data) { kq->kq_state |= KQ_ASYNC; } else { kq->kq_state &= ~KQ_ASYNC; } return (0); case FIOSETOWN: return (fsetown(*(int *)data, &kq->kq_sigio)); case FIOGETOWN: *(int *)data = fgetown(&kq->kq_sigio); return (0); } #endif return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct kqueue *kq; int revents = 0; int error; if ((error = kqueue_acquire(fp, &kq))) return POLLERR; KQ_LOCK(kq); if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); if (SEL_WAITING(&kq->kq_sel)) kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); KQ_UNLOCK(kq); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { bzero((void *)st, sizeof *st); /* * We no longer return kq_count because the unlocked value is useless. * If you spent all this time getting the count, why not spend your * syscall better by calling kevent? * * XXX - This is needed for libc_r. */ st->st_mode = S_IFIFO; return (0); } static void kqueue_drain(struct kqueue *kq, struct thread *td) { struct knote *kn; int i; KQ_LOCK(kq); KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, ("kqueue already closing")); kq->kq_state |= KQ_CLOSING; if (kq->kq_refcnt > 1) msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); KASSERT(knlist_empty(&kq->kq_sel.si_note), ("kqueue's knlist not empty")); for (i = 0; i < kq->kq_knlistsize; i++) { while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } if (kq->kq_knhashmask != 0) { for (i = 0; i <= kq->kq_knhashmask; i++) { while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo2", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } } if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { kq->kq_state |= KQ_TASKDRAIN; msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); } static void kqueue_destroy(struct kqueue *kq) { KASSERT(kq->kq_fdp == NULL, ("kqueue still attached to a file descriptor")); seldrain(&kq->kq_sel); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); if (kq->kq_knhash != NULL) free(kq->kq_knhash, M_KQUEUE); if (kq->kq_knlist != NULL) free(kq->kq_knlist, M_KQUEUE); funsetown(&kq->kq_sigio); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; struct filedesc *fdp; int error; int filedesc_unlock; if ((error = kqueue_acquire(fp, &kq))) return error; kqueue_drain(kq, td); /* * We could be called due to the knote_drop() doing fdrop(), * called from kqueue_register(). In this case the global * lock is owned, and filedesc sx is locked before, to not * take the sleepable lock after non-sleepable. */ fdp = kq->kq_fdp; kq->kq_fdp = NULL; if (!sx_xlocked(FILEDESC_LOCK(fdp))) { FILEDESC_XLOCK(fdp); filedesc_unlock = 1; } else filedesc_unlock = 0; TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); if (filedesc_unlock) FILEDESC_XUNLOCK(fdp); kqueue_destroy(kq); chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); crfree(kq->kq_cred); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static int kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_KQUEUE; return (0); } static void kqueue_wakeup(struct kqueue *kq) { KQ_OWNED(kq); if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { pgsigio(&kq->kq_sigio, SIGIO, 0); } } /* * Walk down a list of knotes, activating them if their event has triggered. * * There is a possibility to optimize in the case of one kq watching another. * Instead of scheduling a task to wake it up, you could pass enough state * down the chain to make up the parent kqueue. Make this code functional * first. */ void knote(struct knlist *list, long hint, int lockflags) { struct kqueue *kq; struct knote *kn, *tkn; int error; if (list == NULL) return; KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED); if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_lock(list->kl_lockarg); /* * If we unlock the list lock (and enter influx), we can * eliminate the kqueue scheduling, but this will introduce * four lock/unlock's for each knote to test. Also, marker * would be needed to keep iteration position, since filters * or other threads could remove events. */ SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { /* * Do not process the influx notes, except for * the influx coming from the kq unlock in the * kqueue_scan(). In the later case, we do * not interfere with the scan, since the code * fragment in kqueue_scan() locks the knlist, * and cannot proceed until we finished. */ KQ_UNLOCK(kq); } else if ((lockflags & KNF_NOKQLOCK) != 0) { kn_enter_flux(kn); KQ_UNLOCK(kq); error = kn->kn_fop->f_event(kn, hint); KQ_LOCK(kq); kn_leave_flux(kn); if (error) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK_FLUX(kq); } else { if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK(kq); } } if ((lockflags & KNF_LISTLOCKED) == 0) list->kl_unlock(list->kl_lockarg); } /* * add a knote to a knlist */ void knlist_add(struct knlist *knl, struct knote *kn, int islocked) { KNL_ASSERT_LOCK(knl, islocked); KQ_NOTOWNED(kn->kn_kq); KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p was not detached", kn)); if (!islocked) knl->kl_lock(knl->kl_lockarg); SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext); if (!islocked) knl->kl_unlock(knl->kl_lockarg); KQ_LOCK(kn->kn_kq); kn->kn_knlist = knl; kn->kn_status &= ~KN_DETACHED; KQ_UNLOCK(kn->kn_kq); } static void knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked) { KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked")); KNL_ASSERT_LOCK(knl, knlislocked); mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED); KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) == 0, ("knote %p was already detached", kn)); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) kn_list_unlock(knl); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove knote from the specified knlist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return (SLIST_EMPTY(&knl->kl_list)); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static void knlist_mtx_assert_locked(void *arg) { mtx_assert((struct mtx *)arg, MA_OWNED); } static void knlist_mtx_assert_unlocked(void *arg) { mtx_assert((struct mtx *)arg, MA_NOTOWNED); } static void knlist_rw_rlock(void *arg) { rw_rlock((struct rwlock *)arg); } static void knlist_rw_runlock(void *arg) { rw_runlock((struct rwlock *)arg); } static void knlist_rw_assert_locked(void *arg) { rw_assert((struct rwlock *)arg, RA_LOCKED); } static void knlist_rw_assert_unlocked(void *arg) { rw_assert((struct rwlock *)arg, RA_UNLOCKED); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_assert_locked == NULL) knl->kl_assert_locked = knlist_mtx_assert_locked; else knl->kl_assert_locked = kl_assert_locked; if (kl_assert_unlocked == NULL) knl->kl_assert_unlocked = knlist_mtx_assert_unlocked; else knl->kl_assert_unlocked = kl_assert_unlocked; knl->kl_autodestroy = 0; SLIST_INIT(&knl->kl_list); } void knlist_init_mtx(struct knlist *knl, struct mtx *lock) { knlist_init(knl, lock, NULL, NULL, NULL, NULL); } struct knlist * knlist_alloc(struct mtx *lock) { struct knlist *knl; knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK); knlist_init_mtx(knl, lock); return (knl); } void knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock) { knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock, knlist_rw_assert_locked, knlist_rw_assert_unlocked); } void knlist_destroy(struct knlist *knl) { KASSERT(KNLIST_EMPTY(knl), ("destroying knlist %p with knotes on it", knl)); } void knlist_detach(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); knl->kl_autodestroy = 1; if (knlist_empty(knl)) { knlist_destroy(knl); free(knl, M_KQUEUE); } } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". */ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl)); if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reacquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop_detached(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= EV_EOF | EV_ONESHOT; KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still in flux knotes remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn_in_flux(kn), ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * Remove all knotes referencing a specified fd must be called with FILEDESC * lock. This prevents a race where a new fd comes along and occupies the * entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_XLOCK_ASSERT(fdp); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. */ TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn_in_flux(kn)) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn_enter_flux(kn); KQ_UNLOCK(kq); influx = 1; knote_drop(kn, td); KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn)); KQ_OWNED(kq); if ((kq->kq_state & KQ_CLOSING) != 0) return (EBADF); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return (ENOMEM); list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return (ENOMEM); list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return (0); } static void knote_drop(struct knote *kn, struct thread *td) { if ((kn->kn_status & KN_DETACHED) == 0) kn->kn_fop->f_detach(kn); knote_drop_detached(kn, td); } static void knote_drop_detached(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p still attached", kn)); KQ_NOTOWNED(kq); KQ_LOCK(kq); KASSERT(kn->kn_influx == 1, ("knote_drop called on %p with influx %d", kn, kn->kn_influx)); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); static struct knote * knote_alloc(int mflag) { return (uma_zalloc(knote_zone, mflag | M_ZERO)); } static void knote_free(struct knote *kn) { uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag) { struct kqueue *kq; struct file *fp; cap_rights_t rights; int error; error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp); if (error != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; error = kqueue_register(kq, kev, td, mflag); kqueue_release(kq, 0); noacquire: fdrop(fp, td); return (error); } Index: head/sys/kern/kern_sig.c =================================================================== --- head/sys/kern/kern_sig.c (revision 350420) +++ head/sys/kern/kern_sig.c (revision 350421) @@ -1,3858 +1,3859 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , signal__send, "struct thread *", "struct proc *", "int"); SDT_PROBE_DEFINE2(proc, , , signal__clear, "int", "ksiginfo_t *"); SDT_PROBE_DEFINE3(proc, , , signal__discard, "struct thread *", "struct proc *", "int"); static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static int sig_suspend_threads(struct thread *, struct proc *, int); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, int prop); static void sigqueue_start(void); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { .f_isfd = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); static int kern_lognosys = 0; SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0, "Log invalid syscalls"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) static int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); static int capmode_coredump; SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, &capmode_coredump, 0, "Allow processes in capability mode to dump core"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); static int coredump_devctl = 0; SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, 0, "Generate a devctl notification when processes coredump"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SIGPROP_KILL 0x01 /* terminates process by default */ #define SIGPROP_CORE 0x02 /* ditto and coredumps */ #define SIGPROP_STOP 0x04 /* suspend process */ #define SIGPROP_TTYSTOP 0x08 /* ditto, from tty */ #define SIGPROP_IGNORE 0x10 /* ignore by default */ #define SIGPROP_CONT 0x20 /* continue if suspended */ #define SIGPROP_CANTMASK 0x40 /* non-maskable, catchable */ static int sigproptbl[NSIG] = { [SIGHUP] = SIGPROP_KILL, [SIGINT] = SIGPROP_KILL, [SIGQUIT] = SIGPROP_KILL | SIGPROP_CORE, [SIGILL] = SIGPROP_KILL | SIGPROP_CORE, [SIGTRAP] = SIGPROP_KILL | SIGPROP_CORE, [SIGABRT] = SIGPROP_KILL | SIGPROP_CORE, [SIGEMT] = SIGPROP_KILL | SIGPROP_CORE, [SIGFPE] = SIGPROP_KILL | SIGPROP_CORE, [SIGKILL] = SIGPROP_KILL, [SIGBUS] = SIGPROP_KILL | SIGPROP_CORE, [SIGSEGV] = SIGPROP_KILL | SIGPROP_CORE, [SIGSYS] = SIGPROP_KILL | SIGPROP_CORE, [SIGPIPE] = SIGPROP_KILL, [SIGALRM] = SIGPROP_KILL, [SIGTERM] = SIGPROP_KILL, [SIGURG] = SIGPROP_IGNORE, [SIGSTOP] = SIGPROP_STOP, [SIGTSTP] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGCONT] = SIGPROP_IGNORE | SIGPROP_CONT, [SIGCHLD] = SIGPROP_IGNORE, [SIGTTIN] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGTTOU] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGIO] = SIGPROP_IGNORE, [SIGXCPU] = SIGPROP_KILL, [SIGXFSZ] = SIGPROP_KILL, [SIGVTALRM] = SIGPROP_KILL, [SIGPROF] = SIGPROP_KILL, [SIGWINCH] = SIGPROP_IGNORE, [SIGINFO] = SIGPROP_IGNORE, [SIGUSR1] = SIGPROP_KILL, [SIGUSR2] = SIGPROP_KILL, }; static void reschedule_signals(struct proc *p, sigset_t block, int flags); static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); } ksiginfo_t * ksiginfo_alloc(int wait) { int flags; flags = M_ZERO; if (! wait) flags |= M_NOWAIT; if (ksiginfo_zone != NULL) return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags)); return (NULL); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline int ksiginfo_tryfree(ksiginfo_t *ksi) { if (!(ksi->ksi_flags & KSI_EXT)) { uma_zfree(ksiginfo_zone, ksi); return (1); } return (0); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); SIGEMPTYSET(list->sq_ptrace); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. * Return: * 0 - signal not found * others - signal number */ static int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_ptrace, signo)) { count++; SIGDELSET(sq->sq_ptrace, signo); si->ksi_flags |= KSI_PTRACE; } if (SIGISMEMBER(sq->sq_kill, signo)) { count++; if (count == 1) SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo) && !SIGISMEMBER(sq->sq_ptrace, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } static int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); /* * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path * for these signals. */ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(0)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if (ret != 0) { if ((si->ksi_flags & KSI_PTRACE) != 0) { SIGADDSET(sq->sq_ptrace, signo); ret = 0; goto out_set_bit; } else if ((si->ksi_flags & KSI_TRAP) != 0 || (si->ksi_flags & KSI_SIGQ) == 0) { SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } return (ret); } out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); SIGEMPTYSET(sq->sq_ptrace); } static void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set) { sigset_t tmp; struct proc *p1, *p2; ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_ptrace; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_ptrace, tmp); SIGSETNAND(src->sq_ptrace, tmp); tmp = src->sq_signals; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); } #if 0 static void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } #endif static void sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_ptrace, *set); SIGSETNAND(sq->sq_signals, *set); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ static void sigqueue_delete_set_proc(struct proc *p, const sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } static void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to thread td, the current * thread, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (SIGPENDING(td)) { thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; thread_unlock(td); } } /* * Returns 1 (true) if altstack is configured for the thread, and the * passed stack bottom address falls into the altstack range. Handles * the 43 compat special case where the alt stack size is zero. */ int sigonstack(size_t sp) { struct thread *td; td = curthread; if ((td->td_pflags & TDP_ALTSTACK) == 0) return (0); #if defined(COMPAT_43) if (td->td_sigstk.ss_size == 0) return ((td->td_sigstk.ss_flags & SS_ONSTACK) != 0); #endif return (sp >= (size_t)td->td_sigstk.ss_sp && sp < td->td_sigstk.ss_size + (size_t)td->td_sigstk.ss_sp); } static __inline int sigprop(int sig) { if (sig > 0 && sig < nitems(sigproptbl)) return (sigproptbl[sig]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } static bool sigact_flag_test(const struct sigaction *act, int flag) { /* * SA_SIGINFO is reset when signal disposition is set to * ignore or default. Other flags are kept according to user * settings. */ return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO || ((__sighandler_t *)act->sa_sigaction != SIG_IGN && (__sighandler_t *)act->sa_sigaction != SIG_DFL))); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags) { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); if (act != NULL && act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK | SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER | SA_NOCLDWAIT | SA_SIGINFO)) != 0) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { memset(oact, 0, sizeof(*oact)); oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (sigact_flag_test(act, SA_SIGINFO)) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!sigact_flag_test(act, SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (sigact_flag_test(act, SA_ONSTACK)) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (sigact_flag_test(act, SA_RESETHAND)) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (sigact_flag_test(act, SA_NODEFER)) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SIGPROP_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sys_sigaction(struct thread *td, struct sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(struct thread *td, struct osigaction_args *uap) { struct osigaction sa; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(struct thread *td, struct osigreturn_args *uap) { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(struct proc *p) { int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SIGPROP_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset specified signal to the default disposition. */ static void sigdflt(struct sigacts *ps, int sig) { mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { sigset_t osigignore; struct sigacts *ps; int sig; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); sigdflt(ps, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0) sigqueue_delete_proc(p, sig); } /* * As CloudABI processes cannot modify signal handlers, fully * reset all signals to their default behavior. Do ignore * SIGPIPE, as it would otherwise be impossible to recover from * writes to broken pipes and sockets. */ if (SV_PROC_ABI(p) == SV_ABI_CLOUDABI) { osigignore = ps->ps_sigignore; while (SIGNOTEMPTY(osigignore)) { sig = sig_ffs(&osigignore); SIGDELSET(osigignore, sig); if (sig != SIGPIPE) sigdflt(ps, sig); } SIGADDSET(ps->ps_sigignore, SIGPIPE); } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td = curthread; MPASS(td->td_proc == p); td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset, int flags) { sigset_t new_block, oset1; struct proc *p; int error; p = td->td_proc; if ((flags & SIGPROCMASK_PROC_LOCKED) != 0) PROC_LOCK_ASSERT(p, MA_OWNED); else PROC_LOCK(p); mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; SIGSETOR(td->td_sigmask, *set); new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); goto out; case SIG_SETMASK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; if (flags & SIGPROCMASK_OLD) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); signotify(td); break; default: error = EINVAL; goto out; } /* * The new_block set contains signals that were not previously * blocked, but are blocked now. * * In case we block any signal that was not previously blocked * for td, and process has the signal pending, try to schedule * signal delivery to some thread that does not block the * signal, possibly waking it up. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, flags); } out: if (!(flags & SIGPROCMASK_PROC_LOCKED)) PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap) { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(struct thread *td, struct osigprocmask_args *uap) { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sys_sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT) error = ERESTART; if (error == ERESTART) return (error); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } static void proc_td_siginfo_capture(struct thread *td, siginfo_t *si) { struct thread *thr; FOREACH_THREAD_IN_PROC(td->td_proc, thr) { if (thr == td) thr->td_si = *si; else thr->td_si.si_signo = 0; } } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t saved_mask, new_block; struct proc *p; int error, sig, timo, timevalid = 0; struct timespec rts, ets, ts; struct timeval tv; p = td->td_proc; error = 0; ets.tv_sec = 0; ets.tv_nsec = 0; if (timeout != NULL) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; getnanouptime(&rts); timespecadd(&rts, timeout, &ets); } } ksiginfo_init(ksi); /* Some signals can not be waited for. */ SIG_CANTMASK(waitset); ps = p->p_sigacts; PROC_LOCK(p); saved_mask = td->td_sigmask; SIGSETNAND(td->td_sigmask, waitset); for (;;) { mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); KASSERT(sig >= 0, ("sig %d", sig)); if (sig != 0 && SIGISMEMBER(waitset, sig)) { if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 || sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) { error = 0; break; } } if (error != 0) break; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout != NULL) { if (!timevalid) { error = EINVAL; break; } getnanouptime(&rts); if (timespeccmp(&rts, &ets, >=)) { error = EAGAIN; break; } timespecsub(&ets, &rts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); timo = tvtohz(&tv); } else { timo = 0; } error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo); if (timeout != NULL) { if (error == ERESTART) { /* Timeout can not be restarted. */ error = EINTR; } else if (error == EAGAIN) { /* We will calculate timeout by ourself. */ error = 0; } } } new_block = saved_mask; SIGSETNAND(new_block, td->td_sigmask); td->td_sigmask = saved_mask; /* * Fewer signals can be delivered to us, reschedule signal * notification. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, 0); if (error == 0) { SDT_PROBE2(proc, , , signal__clear, sig, ksi); if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code); } #endif if (sig == SIGKILL) { proc_td_siginfo_capture(td, &ksi->ksi_info); sigexit(td, sig); } } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sys_sigpending(struct thread *td, struct sigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(struct thread *td, struct osigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(struct thread *td, struct osigvec_args *uap) { struct sigvec vec; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(struct thread *td, struct osigblock_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(struct thread *td, struct osigsetmask_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap) { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; int has_sig, sig; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask, SIGPROCMASK_PROC_LOCKED); td->td_pflags |= TDP_OLDMASK; /* * Process signals now. Otherwise, we can get spurious wakeup * due to signal entered process queue, but delivered to other * thread. But sigsuspend should return only on signal * delivery. */ (p->p_sysent->sv_set_syscall_retval)(td, EINTR); for (has_sig = 0; !has_sig;) { while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; thread_suspend_check(0); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); has_sig += postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); } PROC_UNLOCK(p); td->td_errno = EINTR; td->td_pflags |= TDP_NERRNO; return (EJUSTRETURN); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(struct thread *td, struct osigsuspend_args *uap) { sigset_t mask; OSIG2SIG(uap->mask, mask); return (kern_sigsuspend(td, mask)); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(struct thread *td, struct osigstack_args *uap) { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap) { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi) { struct proc *p; struct pgrp *pgrp; int err; int ret; ret = ESRCH; if (all) { /* * broadcast */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc || p->p_state == PRS_NEW) { continue; } PROC_LOCK(p); err = p_cansignal(td, p, sig); if (err == 0) { if (sig) pksignal(p, sig, ksi); ret = err; } else if (ret == ESRCH) ret = err; PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } err = p_cansignal(td, p, sig); if (err == 0) { if (sig) pksignal(p, sig, ksi); ret = err; } else if (ret == ESRCH) ret = err; PROC_UNLOCK(p); } PGRP_UNLOCK(pgrp); } return (ret); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int sys_kill(struct thread *td, struct kill_args *uap) { ksiginfo_t ksi; struct proc *p; int error; /* * A process in capability mode can send signals only to himself. * The main rationale behind this is that abort(3) is implemented as * kill(getpid(), SIGABRT). */ if (IN_CAPABILITY_MODE(td) && uap->pid != td->td_proc->p_pid) return (ECAPMODE); AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->pid > 0) { /* kill single process */ if ((p = pfind_any(uap->pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) pksignal(p, uap->signum, &ksi); PROC_UNLOCK(p); return (error); } switch (uap->pid) { case -1: /* broadcast signal */ return (killpg1(td, uap->signum, 0, 1, &ksi)); case 0: /* signal own process group */ return (killpg1(td, uap->signum, 0, 0, &ksi)); default: /* negative explicit process group */ return (killpg1(td, uap->signum, -uap->pid, 0, &ksi)); } /* NOTREACHED */ } int sys_pdkill(struct thread *td, struct pdkill_args *uap) { struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_FD(uap->fd); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); error = procdesc_find(td, uap->fd, &cap_pdkill_rights, &p); if (error) return (error); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) kern_psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(struct thread *td, struct okillpg_args *uap) { ksiginfo_t ksi; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; return (killpg1(td, uap->signum, uap->pgid, 0, &ksi)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sys_sigqueue(struct thread *td, struct sigqueue_args *uap) { union sigval sv; sv.sival_ptr = uap->value; return (kern_sigqueue(td, uap->pid, uap->signum, &sv)); } int kern_sigqueue(struct thread *td, pid_t pid, int signum, union sigval *value) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (pid <= 0) return (EINVAL); if ((p = pfind_any(pid)) == NULL) return (ESRCH); error = p_cansignal(td, p, signum); if (error == 0 && signum != 0) { ksiginfo_init(&ksi); ksi.ksi_flags = KSI_SIGQ; ksi.ksi_signo = signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value = *value; error = pksignal(p, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. */ void gsignal(int pgid, int sig, ksiginfo_t *ksi) { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0, ksi); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi) { struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && (checkctty == 0 || p->p_flag & P_CONTROLT)) pksignal(p, sig, ksi); PROC_UNLOCK(p); } } } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. Should be called after * mach-specific routine, because sysent->sv_sendsig() needs correct * ps_siginfo and signal mask. */ static void postsig_done(int sig, struct thread *td, struct sigacts *ps) { sigset_t mask; mtx_assert(&ps->ps_mtx, MA_OWNED); td->td_ru.ru_nsignals++; mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(mask, sig); kern_sigprocmask(td, SIG_BLOCK, &mask, NULL, SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED); if (SIGISMEMBER(ps->ps_sigreset, sig)) sigdflt(ps, sig); } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; int sig; int code; p = td->td_proc; sig = ksi->ksi_signo; code = ksi->ksi_code; KASSERT(_SIG_VALID(sig), ("invalid signal")); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(td->td_sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); p->p_sig = sig; /* XXX to verify code */ tdsendsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, int prop) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if current thread can handle the signal without * switching context to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; break; } } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void kern_psignal(struct proc *p, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(p, NULL, sig, &ksi); } int pksignal(struct proc *p, int sig, ksiginfo_t *ksi) { return (tdsendsignal(p, NULL, sig, ksi)); } /* Utility function for finding a thread to send signal event to. */ int sigev_findtd(struct proc *p ,struct sigevent *sigev, struct thread **ttd) { struct thread *td; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = tdfind(sigev->sigev_notify_thread_id, p->p_pid); if (td == NULL) return (ESRCH); *ttd = td; } else { *ttd = NULL; PROC_LOCK(p); } return (0); } void tdsignal(struct thread *td, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(td->td_proc, td, sig, &ksi); } void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi) { (void) tdsendsignal(td->td_proc, td, sig, ksi); } int tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; int wakeup_swapper; MPASS(td == NULL || p == td->td_proc); PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) panic("%s(): invalid signal %d", __func__, sig); KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__)); /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { td = sigtd(p, sig, prop); sigqueue = &p->p_sigqueue; } else sigqueue = &td->td_sigqueue; SDT_PROBE3(proc, , , signal__send, td, p, sig); /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { SDT_PROBE3(proc, , , signal__discard, td, p, sig); mtx_unlock(&ps->ps_mtx); if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; mtx_unlock(&ps->ps_mtx); if (prop & SIGPROP_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SIGPROP_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SIGPROP_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); /* SIGKILL: Remove procfs STOPEVENTs. */ if (sig == SIGKILL) { /* from procfs_ioctl.c: PIOCBIC */ p->p_stops = 0; /* from procfs_ioctl.c: PIOCCONT */ p->p_step = 0; wakeup(&p->p_step); } /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { KASSERT(!(p->p_flag & P_WEXIT), ("signal to stopped but exiting process")); if (sig == SIGKILL) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SIGPROP_CONT) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; PROC_SLOCK(p); if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xsig = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ PROC_SUNLOCK(p); goto runfast; } /* * The signal is not ignored or caught. */ thread_unsuspend(p); PROC_SUNLOCK(p); goto out; } if (prop & SIGPROP_STOP) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ wakeup_swapper = 0; PROC_SLOCK(p); thread_lock(td); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) wakeup_swapper = sleepq_abort(td, intrval); thread_unlock(td); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { tdsigwakeup(td, sig, action, intrval); goto out; } MPASS(action == SIG_DFL); if (prop & SIGPROP_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); wakeup_swapper = sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xsig); } else PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); goto out; } } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: tdsigwakeup(td, sig, action, intrval); PROC_SLOCK(p); thread_unsuspend(p); PROC_SUNLOCK(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; int prop; int wakeup_swapper; wakeup_swapper = 0; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); PROC_SLOCK(p); thread_lock(td); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. Be careful to avoid bumping the * priority of the idle thread, since we still allow to signal * kernel processes. */ if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) goto out; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SIGPROP_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); return; } /* * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) goto out; /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); wakeup_swapper = sleepq_abort(td, intrval); } else { /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif } out: PROC_SUNLOCK(p); thread_unlock(td); if (wakeup_swapper) kick_proc0(); } static int sig_suspend_threads(struct thread *td, struct proc *p, int sending) { struct thread *td2; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); MPASS(sending || td == curthread); wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_SBDRY) { /* * Once a thread is asleep with * TDF_SBDRY and without TDF_SERESTART * or TDF_SEINTR set, it should never * become suspended due to this check. */ KASSERT(!TD_IS_SUSPENDED(td2), ("thread with deferred stops suspended")); if (TD_SBDRY_INTR(td2)) wakeup_swapper |= sleepq_abort(td2, TD_SBDRY_ERRNO(td2)); } else if (!TD_IS_SUSPENDED(td2)) { thread_suspend_one(td2); } } else if (!TD_IS_SUSPENDED(td2)) { if (sending || td != td2) td2->td_flags |= TDF_ASTPENDING; #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } return (wakeup_swapper); } /* * Stop the process for an event deemed interesting to the debugger. If si is * non-NULL, this is a signal exchange; the new signal requested by the * debugger will be returned for handling. If si is NULL, this is some other * type of interesting event. The debugger may request a signal be delivered in * that case as well, however it will be deferred until it can be handled. */ int ptracestop(struct thread *td, int sig, ksiginfo_t *si) { struct proc *p = td->td_proc; struct thread *td2; ksiginfo_t ksi; int prop; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); td->td_xsig = sig; if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) { td->td_dbgflags |= TDB_XSIG; CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d", td->td_tid, p->p_pid, td->td_dbgflags, sig); PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) { if (P_KILLED(p)) { /* * Ensure that, if we've been PT_KILLed, the * exit status reflects that. Another thread * may also be in ptracestop(), having just * received the SIGKILL, but this thread was * unsuspended first. */ td->td_dbgflags &= ~TDB_XSIG; td->td_xsig = SIGKILL; p->p_ptevents = 0; break; } if (p->p_flag & P_SINGLE_EXIT && !(td->td_dbgflags & TDB_EXIT)) { /* * Ignore ptrace stops except for thread exit * events when the process exits. */ td->td_dbgflags &= ~TDB_XSIG; PROC_SUNLOCK(p); return (0); } /* * Make wait(2) work. Ensure that right after the * attach, the thread which was decided to become the * leader of attach gets reported to the waiter. * Otherwise, just avoid overwriting another thread's * assignment to p_xthread. If another thread has * already set p_xthread, the current thread will get * a chance to report itself upon the next iteration. */ if ((td->td_dbgflags & TDB_FSTP) != 0 || ((p->p_flag2 & P2_PTRACE_FSTP) == 0 && p->p_xthread == NULL)) { p->p_xsig = sig; p->p_xthread = td; /* * If we are on sleepqueue already, * let sleepqueue code decide if it * needs to go sleep after attach. */ if (td->td_wchan == NULL) td->td_dbgflags &= ~TDB_FSTP; p->p_flag2 &= ~P2_PTRACE_FSTP; p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE; sig_suspend_threads(td, p, 0); } if ((td->td_dbgflags & TDB_STOPATFORK) != 0) { td->td_dbgflags &= ~TDB_STOPATFORK; } stopme: thread_suspend_switch(td, p); if (p->p_xthread == td) p->p_xthread = NULL; if (!(p->p_flag & P_TRACED)) break; if (td->td_dbgflags & TDB_SUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); } if (si != NULL && sig == td->td_xsig) { /* Parent wants us to take the original signal unchanged. */ si->ksi_flags |= KSI_HEAD; if (sigqueue_add(&td->td_sigqueue, sig, si) != 0) si->ksi_signo = 0; } else if (td->td_xsig != 0) { /* * If parent wants us to take a new signal, then it will leave * it in td->td_xsig; otherwise we just look for signals again. */ ksiginfo_init(&ksi); ksi.ksi_signo = td->td_xsig; ksi.ksi_flags |= KSI_PTRACE; prop = sigprop(td->td_xsig); td2 = sigtd(p, td->td_xsig, prop); tdsendsignal(p, td2, td->td_xsig, &ksi); if (td != td2) return (0); } return (td->td_xsig); } static void reschedule_signals(struct proc *p, sigset_t block, int flags) { struct sigacts *ps; struct thread *td; int sig; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (SIGISEMPTY(p->p_siglist)) return; SIGSETAND(block, p->p_siglist); while ((sig = sig_ffs(&block)) != 0) { SIGDELSET(block, sig); td = sigtd(p, sig, 0); signotify(td); if (!(flags & SIGPROCMASK_PS_LOCKED)) mtx_lock(&ps->ps_mtx); if (p->p_flag & P_TRACED || (SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig))) tdsigwakeup(td, sig, SIG_CATCH, (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART)); if (!(flags & SIGPROCMASK_PS_LOCKED)) mtx_unlock(&ps->ps_mtx); } } void tdsigcleanup(struct thread *td) { struct proc *p; sigset_t unblocked; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_flush(&td->td_sigqueue); if (p->p_numthreads == 1) return; /* * Since we cannot handle signals, notify signal post code * about this by filling the sigmask. * * Also, if needed, wake up thread(s) that do not block the * same signals as the exiting thread, since the thread might * have been selected for delivery and woken up. */ SIGFILLSET(unblocked); SIGSETNAND(unblocked, td->td_sigmask); SIGFILLSET(td->td_sigmask); reschedule_signals(p, unblocked, 0); } static int sigdeferstop_curr_flags(int cflags) { MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 || (cflags & TDF_SBDRY) != 0); return (cflags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)); } /* * Defer the delivery of SIGSTOP for the current thread, according to * the requested mode. Returns previous flags, which must be restored * by sigallowstop(). * * TDF_SBDRY, TDF_SEINTR, and TDF_SERESTART flags are only set and * cleared by the current thread, which allow the lock-less read-only * accesses below. */ int sigdeferstop_impl(int mode) { struct thread *td; int cflags, nflags; td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); switch (mode) { case SIGDEFERSTOP_NOP: nflags = cflags; break; case SIGDEFERSTOP_OFF: nflags = 0; break; case SIGDEFERSTOP_SILENT: nflags = (cflags | TDF_SBDRY) & ~(TDF_SEINTR | TDF_SERESTART); break; case SIGDEFERSTOP_EINTR: nflags = (cflags | TDF_SBDRY | TDF_SEINTR) & ~TDF_SERESTART; break; case SIGDEFERSTOP_ERESTART: nflags = (cflags | TDF_SBDRY | TDF_SERESTART) & ~TDF_SEINTR; break; default: panic("sigdeferstop: invalid mode %x", mode); break; } if (cflags == nflags) return (SIGDEFERSTOP_VAL_NCHG); thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | nflags; thread_unlock(td); return (cflags); } /* * Restores the STOP handling mode, typically permitting the delivery * of SIGSTOP for the current thread. This does not immediately * suspend if a stop was posted. Instead, the thread will suspend * either via ast() or a subsequent interruptible sleep. */ void sigallowstop_impl(int prev) { struct thread *td; int cflags; KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop")); KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("sigallowstop: incorrect previous mode %x", prev)); td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); if (cflags != prev) { thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | prev; thread_unlock(td); } } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(struct thread *td) { struct proc *p; struct sigacts *ps; struct sigqueue *queue; sigset_t sigpending; ksiginfo_t ksi; int prop, sig, traced; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); sigpending = td->td_sigqueue.sq_signals; SIGSETOR(sigpending, p->p_sigqueue.sq_signals); SIGSETNAND(sigpending, td->td_sigmask); if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED && (p->p_flag2 & P2_PTRACE_FSTP) != 0 && SIGISMEMBER(sigpending, SIGSTOP)) { /* * If debugger just attached, always consume * SIGSTOP from ptrace(PT_ATTACH) first, to * execute the debugger attach ritual in * order. */ sig = SIGSTOP; td->td_dbgflags |= TDB_FSTP; } else { sig = sig_ffs(&sigpending); } if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) { sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); continue; } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) { /* * If traced, always stop. * Remove old signal from queue before the stop. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ queue = &td->td_sigqueue; ksiginfo_init(&ksi); if (sigqueue_get(queue, sig, &ksi) == 0) { queue = &p->p_sigqueue; sigqueue_get(queue, sig, &ksi); } td->td_si = ksi.ksi_info; mtx_unlock(&ps->ps_mtx); sig = ptracestop(td, sig, &ksi); mtx_lock(&ps->ps_mtx); td->td_si.si_signo = 0; /* * Keep looking if the debugger discarded or * replaced the signal. */ if (sig == 0) continue; /* * If the signal became masked, re-queue it. */ if (SIGISMEMBER(td->td_sigmask, sig)) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(&p->p_sigqueue, sig, &ksi); continue; } /* * If the traced bit got turned off, requeue * the signal and go back up to the top to * rescan signals. This ensures that p_sig* * and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(queue, sig, &ksi); continue; } } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process with * default action, stop here, then clear the signal. * Traced or exiting processes should ignore stops. * Additionally, a member of an orphaned process group * should ignore tty stops. */ if (prop & SIGPROP_STOP) { if (p->p_flag & (P_TRACED | P_WEXIT | P_SINGLE_EXIT) || (p->p_pgrp->pg_jobc == 0 && prop & SIGPROP_TTYSTOP)) break; /* == ignore */ if (TD_SBDRY_INTR(td)) { KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); return (-1); } mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p, 0); thread_suspend_switch(td, p); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); goto next; } else if (prop & SIGPROP_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SIGPROP_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */ sigqueue_delete(&p->p_sigqueue, sig); next:; } /* NOTREACHED */ } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ int postsig(int sig) { struct thread *td; struct proc *p; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; KASSERT(sig != 0, ("postsig")); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 && sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0) return (0); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code); #endif if ((p->p_stops & S_SIG) != 0) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); proc_td_siginfo_capture(td, &ksi.ksi_info); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN, ("postsig action %p", action)); KASSERT(!SIGISMEMBER(td->td_sigmask, sig), ("postsig action: blocked sig %d", sig)); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; if (p->p_sig == sig) { p->p_sig = 0; } (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } return (1); } void proc_wkilled(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_WKILLED) == 0) { p->p_flag |= P_WKILLED; /* * Notify swapper that there is a process to swap in. * The notification is racy, at worst it would take 10 * seconds for the swapper process to notice. */ if ((p->p_flag & (P_INMEM | P_SWAPPINGIN)) == 0) wakeup(&proc0); } } /* * Kill the current process for stated reason. */ void killproc(struct proc *p, char *why) { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), jid %d, uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, p->p_ucred->cr_uid, why); proc_wkilled(p); kern_psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(struct thread *td, int sig) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SIGPROP_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), jid %d, uid %d: exited on " "signal %d%s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, td->td_ucred->cr_uid, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else PROC_UNLOCK(p); exit1(td, 0, sig); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } pksignal(p->p_pptr, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int sig) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, sig); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xsig); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason, status; if (WCOREDUMP(p->p_xsig)) { reason = CLD_DUMPED; status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { reason = CLD_KILLED; status = WTERMSIG(p->p_xsig); } else { reason = CLD_EXITED; status = p->p_xexit; } /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). */ sigparent(p, reason, status); } #define MAX_NUM_CORE_FILES 100000 #ifndef NUM_CORE_FILES #define NUM_CORE_FILES 5 #endif CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES); static int num_cores = NUM_CORE_FILES; static int sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) { int error; int new_val; new_val = num_cores; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val > MAX_NUM_CORE_FILES) new_val = MAX_NUM_CORE_FILES; if (new_val < 0) new_val = 0; num_cores = new_val; return (0); } SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof(int), sysctl_debug_num_cores_check, "I", "Maximum number of generated process corefiles while using index format"); #define GZIP_SUFFIX ".gz" #define ZSTD_SUFFIX ".zst" int compress_user_cores = 0; static int sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) { int error, val; val = compress_user_cores; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && !compressor_avail(val)) return (EINVAL); compress_user_cores = val; return (error); } SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_compress_user_cores, "I", "Enable compression of user corefiles (" __XSTRING(COMPRESS_GZIP) " = gzip, " __XSTRING(COMPRESS_ZSTD) " = zstd)"); int compress_user_cores_level = 6; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, &compress_user_cores_level, 0, "Corefile compression level"); /* * Protect the access to corefilename[] by allproc_lock. */ #define corefilename_lock allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); static int sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) { int error; sx_xlock(&corefilename_lock); error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), req); sx_xunlock(&corefilename_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", "Process corefile name format string"); static void vnode_close_locked(struct thread *td, struct vnode *vp) { VOP_UNLOCK(vp, 0); vn_close(vp, FWRITE, td->td_ucred, td); } /* * If the core format has a %I in it, then we need to check * for existing corefiles before defining a name. * To do this we iterate over 0..ncores to find a * non-existing core file name to use. If all core files are * already used we choose the oldest one. */ static int corefile_open_last(struct thread *td, char *name, int indexpos, int indexlen, int ncores, struct vnode **vpp) { struct vnode *oldvp, *nextvp, *vp; struct vattr vattr; struct nameidata nd; int error, i, flags, oflags, cmode; char ch; struct timespec lasttime; nextvp = oldvp = NULL; cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); for (i = 0; i < ncores; i++) { flags = O_CREAT | FWRITE | O_NOFOLLOW; ch = name[indexpos + indexlen]; (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen, i); name[indexpos + indexlen] = ch; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error != 0) break; vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if ((flags & O_CREAT) == O_CREAT) { nextvp = vp; break; } error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) { vnode_close_locked(td, vp); break; } if (oldvp == NULL || lasttime.tv_sec > vattr.va_mtime.tv_sec || (lasttime.tv_sec == vattr.va_mtime.tv_sec && lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) { if (oldvp != NULL) vnode_close_locked(td, oldvp); oldvp = vp; lasttime = vattr.va_mtime; } else { vnode_close_locked(td, vp); } } if (oldvp != NULL) { if (nextvp == NULL) { if ((td->td_proc->p_flag & P_SUGID) != 0) { error = EFAULT; vnode_close_locked(td, oldvp); } else { nextvp = oldvp; } } else { vnode_close_locked(td, oldvp); } } if (error != 0) { if (nextvp != NULL) vnode_close_locked(td, oldvp); } else { *vpp = nextvp; } return (error); } /* * corefile_open(comm, uid, pid, td, compress, vpp, namep) * Expand the name described in corefilename, using name, uid, and pid * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). */ static int corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, int compress, int signum, struct vnode **vpp, char **namep) { struct sbuf sb; struct nameidata nd; const char *format; char *hostname, *name; int cmode, error, flags, i, indexpos, indexlen, oflags, ncores; hostname = NULL; format = corefilename; name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexlen = 0; indexpos = -1; ncores = num_cores; (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); sx_slock(&corefilename_lock); for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': sbuf_putc(&sb, '%'); break; case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, M_TEMP, M_WAITOK); } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); sbuf_printf(&sb, "%s", hostname); break; case 'I': /* autoincrementing index */ if (indexpos != -1) { sbuf_printf(&sb, "%%I"); break; } indexpos = sbuf_len(&sb); sbuf_printf(&sb, "%u", ncores - 1); indexlen = sbuf_len(&sb) - indexpos; break; case 'N': /* process name */ sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); break; case 'S': /* signal number */ sbuf_printf(&sb, "%i", signum); break; case 'U': /* user id */ sbuf_printf(&sb, "%u", uid); break; default: log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); break; } break; default: sbuf_putc(&sb, format[i]); break; } } sx_sunlock(&corefilename_lock); free(hostname, M_TEMP); if (compress == COMPRESS_GZIP) sbuf_printf(&sb, GZIP_SUFFIX); else if (compress == COMPRESS_ZSTD) sbuf_printf(&sb, ZSTD_SUFFIX); if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); free(name, M_TEMP); return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); if (indexpos != -1) { error = corefile_open_last(td, name, indexpos, indexlen, ncores, vpp); if (error != 0) { log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " "on initial open test, error = %d\n", pid, comm, uid, name, error); } } else { cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); flags = O_CREAT | FWRITE | O_NOFOLLOW; if ((td->td_proc->p_flag & P_SUGID) != 0) flags |= O_EXCL; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error == 0) { *vpp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); } } if (error != 0) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } *namep = name; return (0); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vnode *vp; struct flock lf; struct vattr vattr; int error, error1, locked; char *name; /* name of corefile */ void *rl_cookie; off_t limit; char *fullpath, *freepath = NULL; struct sbuf *sb; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); _STOPEVENT(p, S_CORE, 0); if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || (p->p_flag2 & P2_NOTRACE) != 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(td, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress_user_cores, p->p_sig, &vp, &name); if (error != 0) return (error); /* * Don't dump to non-regular files or files with links. * Do not dump into system files. Effective user must own the corefile. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 || vattr.va_uid != cred->cr_uid) { VOP_UNLOCK(vp, 0); error = EFAULT; goto out; } VOP_UNLOCK(vp, 0); /* Postpone other writers, including core dumps of other processes. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp, 0); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); if (p->p_sysent->sv_coredump != NULL) { error = p->p_sysent->sv_coredump(td, vp, limit, 0); } else { error = ENOSYS; } if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } vn_rangelock_unlock(vp, rl_cookie); /* * Notify the userland helper that a process triggered a core dump. * This allows the helper to run an automated debugging session. */ if (error != 0 || coredump_devctl == 0) goto out; sb = sbuf_new_auto(); if (vn_fullpath_global(td, p->p_textvp, &fullpath, &freepath) != 0) goto out2; sbuf_printf(sb, "comm=\""); devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_printf(sb, "\" core=\""); /* * We can't lookup core file vp directly. When we're replacing a core, and * other random times, we flush the name cache, so it will fail. Instead, * if the path of the core is relative, add the current dir in front if it. */ if (name[0] != '/') { fullpath = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if (kern___getcwd(td, fullpath, UIO_SYSSPACE, MAXPATHLEN, MAXPATHLEN) != 0) { free(fullpath, M_TEMP); goto out2; } devctl_safe_quote_sb(sb, fullpath); free(fullpath, M_TEMP); sbuf_putc(sb, '/'); } devctl_safe_quote_sb(sb, name); sbuf_printf(sb, "\""); if (sbuf_finish(sb) == 0) devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); out2: sbuf_delete(sb); out: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(struct thread *td, struct nosys_args *args) { struct proc *p; p = td->td_proc; PROC_LOCK(p); tdsignal(td, SIGSYS); PROC_UNLOCK(p); if (kern_lognosys == 1 || kern_lognosys == 3) { uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } if (kern_lognosys == 2 || kern_lognosys == 3) { printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(struct sigio **sigiop, int sig, int checkctty) { ksiginfo_t ksi; struct sigio *sigio; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) kern_psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) kern_psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); refcount_init(&ps->ps_refcnt, 1); mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { if (refcount_release(&ps->ps_refcnt) == 0) return; mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } struct sigacts * sigacts_hold(struct sigacts *ps) { refcount_acquire(&ps->ps_refcnt); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { return (ps->ps_refcnt > 1); } Index: head/sys/kern/sys_process.c =================================================================== --- head/sys/kern/sys_process.c (revision 350420) +++ head/sys/kern/sys_process.c (revision 350421) @@ -1,1557 +1,1558 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1994, Sean Eric Fagan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include struct ptrace_io_desc32 { int piod_op; uint32_t piod_offs; uint32_t piod_addr; uint32_t piod_len; }; struct ptrace_sc_ret32 { uint32_t sr_retval[2]; int sr_error; }; struct ptrace_vm_entry32 { int pve_entry; int pve_timestamp; uint32_t pve_start; uint32_t pve_end; uint32_t pve_offset; u_int pve_prot; u_int pve_pathlen; int32_t pve_fileid; u_int pve_fsid; uint32_t pve_path; }; #endif /* * Functions implemented using PROC_ACTION(): * * proc_read_regs(proc, regs) * Get the current user-visible register set from the process * and copy it into the regs structure (). * The process is stopped at the time read_regs is called. * * proc_write_regs(proc, regs) * Update the current register set from the passed in regs * structure. Take care to avoid clobbering special CPU * registers or privileged bits in the PSL. * Depending on the architecture this may have fix-up work to do, * especially if the IAR or PCW are modified. * The process is stopped at the time write_regs is called. * * proc_read_fpregs, proc_write_fpregs * deal with the floating point register set, otherwise as above. * * proc_read_dbregs, proc_write_dbregs * deal with the processor debug register set, otherwise as above. * * proc_sstep(proc) * Arrange for the process to trap after executing a single instruction. */ #define PROC_ACTION(action) do { \ int error; \ \ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); \ if ((td->td_proc->p_flag & P_INMEM) == 0) \ error = EIO; \ else \ error = (action); \ return (error); \ } while(0) int proc_read_regs(struct thread *td, struct reg *regs) { PROC_ACTION(fill_regs(td, regs)); } int proc_write_regs(struct thread *td, struct reg *regs) { PROC_ACTION(set_regs(td, regs)); } int proc_read_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(fill_dbregs(td, dbregs)); } int proc_write_dbregs(struct thread *td, struct dbreg *dbregs) { PROC_ACTION(set_dbregs(td, dbregs)); } /* * Ptrace doesn't support fpregs at all, and there are no security holes * or translations for fpregs, so we can just copy them. */ int proc_read_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(fill_fpregs(td, fpregs)); } int proc_write_fpregs(struct thread *td, struct fpreg *fpregs) { PROC_ACTION(set_fpregs(td, fpregs)); } #ifdef COMPAT_FREEBSD32 /* For 32 bit binaries, we need to expose the 32 bit regs layouts. */ int proc_read_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(fill_regs32(td, regs32)); } int proc_write_regs32(struct thread *td, struct reg32 *regs32) { PROC_ACTION(set_regs32(td, regs32)); } int proc_read_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(fill_dbregs32(td, dbregs32)); } int proc_write_dbregs32(struct thread *td, struct dbreg32 *dbregs32) { PROC_ACTION(set_dbregs32(td, dbregs32)); } int proc_read_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(fill_fpregs32(td, fpregs32)); } int proc_write_fpregs32(struct thread *td, struct fpreg32 *fpregs32) { PROC_ACTION(set_fpregs32(td, fpregs32)); } #endif int proc_sstep(struct thread *td) { PROC_ACTION(ptrace_single_step(td)); } int proc_rwmem(struct proc *p, struct uio *uio) { vm_map_t map; vm_offset_t pageno; /* page number */ vm_prot_t reqprot; int error, fault_flags, page_offset, writing; /* * Assert that someone has locked this vmspace. (Should be * curthread but we can't assert that.) This keeps the process * from exiting out from under us until this operation completes. */ PROC_ASSERT_HELD(p); PROC_LOCK_ASSERT(p, MA_NOTOWNED); /* * The map we want... */ map = &p->p_vmspace->vm_map; /* * If we are writing, then we request vm_fault() to create a private * copy of each page. Since these copies will not be writeable by the * process, we must explicity request that they be dirtied. */ writing = uio->uio_rw == UIO_WRITE; reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ; fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL; /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? */ do { vm_offset_t uva; u_int len; vm_page_t m; uva = (vm_offset_t)uio->uio_offset; /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); /* * Fault and hold the page on behalf of the process. */ error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m); if (error != KERN_SUCCESS) { if (error == KERN_RESOURCE_SHORTAGE) error = ENOMEM; else error = EFAULT; break; } /* * Now do the i/o move. */ error = uiomove_fromphys(&m, page_offset, len, uio); /* Make the I-cache coherent for breakpoints. */ if (writing && error == 0) { vm_map_lock_read(map); if (vm_map_check_protection(map, pageno, pageno + PAGE_SIZE, VM_PROT_EXECUTE)) vm_sync_icache(map, uva, len); vm_map_unlock_read(map); } /* * Release the page. */ vm_page_lock(m); if (vm_page_unwire(m, PQ_ACTIVE) && m->object == NULL) vm_page_free(m); vm_page_unlock(m); } while (error == 0 && uio->uio_resid > 0); return (error); } static ssize_t proc_iop(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len, enum uio_rw rw) { struct iovec iov; struct uio uio; ssize_t slen; MPASS(len < SSIZE_MAX); slen = (ssize_t)len; iov.iov_base = (caddr_t)buf; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = va; uio.uio_resid = slen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = rw; uio.uio_td = td; proc_rwmem(p, &uio); if (uio.uio_resid == slen) return (-1); return (slen - uio.uio_resid); } ssize_t proc_readmem(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len) { return (proc_iop(td, p, va, buf, len, UIO_READ)); } ssize_t proc_writemem(struct thread *td, struct proc *p, vm_offset_t va, void *buf, size_t len) { return (proc_iop(td, p, va, buf, len, UIO_WRITE)); } static int ptrace_vm_entry(struct thread *td, struct proc *p, struct ptrace_vm_entry *pve) { struct vattr vattr; vm_map_t map; vm_map_entry_t entry; vm_object_t obj, tobj, lobj; struct vmspace *vm; struct vnode *vp; char *freepath, *fullpath; u_int pathlen; int error, index; error = 0; obj = NULL; vm = vmspace_acquire_ref(p); map = &vm->vm_map; vm_map_lock_read(map); do { entry = map->header.next; index = 0; while (index < pve->pve_entry && entry != &map->header) { entry = entry->next; index++; } if (index != pve->pve_entry) { error = EINVAL; break; } KASSERT((map->header.eflags & MAP_ENTRY_IS_SUB_MAP) == 0, ("Submap in map header")); while ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) { entry = entry->next; index++; } if (entry == &map->header) { error = ENOENT; break; } /* We got an entry. */ pve->pve_entry = index + 1; pve->pve_timestamp = map->timestamp; pve->pve_start = entry->start; pve->pve_end = entry->end - 1; pve->pve_offset = entry->offset; pve->pve_prot = entry->protection; /* Backing object's path needed? */ if (pve->pve_pathlen == 0) break; pathlen = pve->pve_pathlen; pve->pve_pathlen = 0; obj = entry->object.vm_object; if (obj != NULL) VM_OBJECT_RLOCK(obj); } while (0); vm_map_unlock_read(map); pve->pve_fsid = VNOVAL; pve->pve_fileid = VNOVAL; if (error == 0 && obj != NULL) { lobj = obj; for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) { if (tobj != obj) VM_OBJECT_RLOCK(tobj); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); lobj = tobj; pve->pve_offset += tobj->backing_object_offset; } vp = vm_object_vnode(lobj); if (vp != NULL) vref(vp); if (lobj != obj) VM_OBJECT_RUNLOCK(lobj); VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { freepath = NULL; fullpath = NULL; vn_fullpath(td, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &vattr, td->td_ucred) == 0) { pve->pve_fileid = vattr.va_fileid; pve->pve_fsid = vattr.va_fsid; } vput(vp); if (fullpath != NULL) { pve->pve_pathlen = strlen(fullpath) + 1; if (pve->pve_pathlen <= pathlen) { error = copyout(fullpath, pve->pve_path, pve->pve_pathlen); } else error = ENAMETOOLONG; } if (freepath != NULL) free(freepath, M_TEMP); } } vmspace_free(vm); if (error == 0) CTR3(KTR_PTRACE, "PT_VM_ENTRY: pid %d, entry %d, start %p", p->p_pid, pve->pve_entry, pve->pve_start); return (error); } #ifdef COMPAT_FREEBSD32 static int ptrace_vm_entry32(struct thread *td, struct proc *p, struct ptrace_vm_entry32 *pve32) { struct ptrace_vm_entry pve; int error; pve.pve_entry = pve32->pve_entry; pve.pve_pathlen = pve32->pve_pathlen; pve.pve_path = (void *)(uintptr_t)pve32->pve_path; error = ptrace_vm_entry(td, p, &pve); if (error == 0) { pve32->pve_entry = pve.pve_entry; pve32->pve_timestamp = pve.pve_timestamp; pve32->pve_start = pve.pve_start; pve32->pve_end = pve.pve_end; pve32->pve_offset = pve.pve_offset; pve32->pve_prot = pve.pve_prot; pve32->pve_fileid = pve.pve_fileid; pve32->pve_fsid = pve.pve_fsid; } pve32->pve_pathlen = pve.pve_pathlen; return (error); } static void ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl, struct ptrace_lwpinfo32 *pl32) { bzero(pl32, sizeof(*pl32)); pl32->pl_lwpid = pl->pl_lwpid; pl32->pl_event = pl->pl_event; pl32->pl_flags = pl->pl_flags; pl32->pl_sigmask = pl->pl_sigmask; pl32->pl_siglist = pl->pl_siglist; siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo); strcpy(pl32->pl_tdname, pl->pl_tdname); pl32->pl_child_pid = pl->pl_child_pid; pl32->pl_syscall_code = pl->pl_syscall_code; pl32->pl_syscall_narg = pl->pl_syscall_narg; } static void ptrace_sc_ret_to32(const struct ptrace_sc_ret *psr, struct ptrace_sc_ret32 *psr32) { bzero(psr32, sizeof(*psr32)); psr32->sr_retval[0] = psr->sr_retval[0]; psr32->sr_retval[1] = psr->sr_retval[1]; psr32->sr_error = psr->sr_error; } #endif /* COMPAT_FREEBSD32 */ /* * Process debugging system call. */ #ifndef _SYS_SYSPROTO_H_ struct ptrace_args { int req; pid_t pid; caddr_t addr; int data; }; #endif #ifdef COMPAT_FREEBSD32 /* * This CPP subterfuge is to try and reduce the number of ifdefs in * the body of the code. * COPYIN(uap->addr, &r.reg, sizeof r.reg); * becomes either: * copyin(uap->addr, &r.reg, sizeof r.reg); * or * copyin(uap->addr, &r.reg32, sizeof r.reg32); * .. except this is done at runtime. */ #define BZERO(a, s) wrap32 ? \ bzero(a ## 32, s ## 32) : \ bzero(a, s) #define COPYIN(u, k, s) wrap32 ? \ copyin(u, k ## 32, s ## 32) : \ copyin(u, k, s) #define COPYOUT(k, u, s) wrap32 ? \ copyout(k ## 32, u, s ## 32) : \ copyout(k, u, s) #else #define BZERO(a, s) bzero(a, s) #define COPYIN(u, k, s) copyin(u, k, s) #define COPYOUT(k, u, s) copyout(k, u, s) #endif int sys_ptrace(struct thread *td, struct ptrace_args *uap) { /* * XXX this obfuscation is to reduce stack usage, but the register * structs may be too large to put on the stack anyway. */ union { struct ptrace_io_desc piod; struct ptrace_lwpinfo pl; struct ptrace_vm_entry pve; struct dbreg dbreg; struct fpreg fpreg; struct reg reg; #ifdef COMPAT_FREEBSD32 struct dbreg32 dbreg32; struct fpreg32 fpreg32; struct reg32 reg32; struct ptrace_io_desc32 piod32; struct ptrace_lwpinfo32 pl32; struct ptrace_vm_entry32 pve32; #endif char args[sizeof(td->td_sa.args)]; struct ptrace_sc_ret psr; int ptevents; } r; void *addr; int error = 0; #ifdef COMPAT_FREEBSD32 int wrap32 = 0; if (SV_CURPROC_FLAG(SV_ILP32)) wrap32 = 1; #endif AUDIT_ARG_PID(uap->pid); AUDIT_ARG_CMD(uap->req); AUDIT_ARG_VALUE(uap->data); addr = &r; switch (uap->req) { case PT_GET_EVENT_MASK: case PT_LWPINFO: case PT_GET_SC_ARGS: case PT_GET_SC_RET: break; case PT_GETREGS: BZERO(&r.reg, sizeof r.reg); break; case PT_GETFPREGS: BZERO(&r.fpreg, sizeof r.fpreg); break; case PT_GETDBREGS: BZERO(&r.dbreg, sizeof r.dbreg); break; case PT_SETREGS: error = COPYIN(uap->addr, &r.reg, sizeof r.reg); break; case PT_SETFPREGS: error = COPYIN(uap->addr, &r.fpreg, sizeof r.fpreg); break; case PT_SETDBREGS: error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg); break; case PT_SET_EVENT_MASK: if (uap->data != sizeof(r.ptevents)) error = EINVAL; else error = copyin(uap->addr, &r.ptevents, uap->data); break; case PT_IO: error = COPYIN(uap->addr, &r.piod, sizeof r.piod); break; case PT_VM_ENTRY: error = COPYIN(uap->addr, &r.pve, sizeof r.pve); break; default: addr = uap->addr; break; } if (error) return (error); error = kern_ptrace(td, uap->req, uap->pid, addr, uap->data); if (error) return (error); switch (uap->req) { case PT_VM_ENTRY: error = COPYOUT(&r.pve, uap->addr, sizeof r.pve); break; case PT_IO: error = COPYOUT(&r.piod, uap->addr, sizeof r.piod); break; case PT_GETREGS: error = COPYOUT(&r.reg, uap->addr, sizeof r.reg); break; case PT_GETFPREGS: error = COPYOUT(&r.fpreg, uap->addr, sizeof r.fpreg); break; case PT_GETDBREGS: error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg); break; case PT_GET_EVENT_MASK: /* NB: The size in uap->data is validated in kern_ptrace(). */ error = copyout(&r.ptevents, uap->addr, uap->data); break; case PT_LWPINFO: /* NB: The size in uap->data is validated in kern_ptrace(). */ error = copyout(&r.pl, uap->addr, uap->data); break; case PT_GET_SC_ARGS: error = copyout(r.args, uap->addr, MIN(uap->data, sizeof(r.args))); break; case PT_GET_SC_RET: error = copyout(&r.psr, uap->addr, MIN(uap->data, sizeof(r.psr))); break; } return (error); } #undef COPYIN #undef COPYOUT #undef BZERO #ifdef COMPAT_FREEBSD32 /* * PROC_READ(regs, td2, addr); * becomes either: * proc_read_regs(td2, addr); * or * proc_read_regs32(td2, addr); * .. except this is done at runtime. There is an additional * complication in that PROC_WRITE disallows 32 bit consumers * from writing to 64 bit address space targets. */ #define PROC_READ(w, t, a) wrap32 ? \ proc_read_ ## w ## 32(t, a) : \ proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) wrap32 ? \ (safe ? proc_write_ ## w ## 32(t, a) : EINVAL ) : \ proc_write_ ## w (t, a) #else #define PROC_READ(w, t, a) proc_read_ ## w (t, a) #define PROC_WRITE(w, t, a) proc_write_ ## w (t, a) #endif void proc_set_traced(struct proc *p, bool stop) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_TRACED; if (stop) p->p_flag2 |= P2_PTRACE_FSTP; p->p_ptevents = PTRACE_DEFAULT; } int kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data) { struct iovec iov; struct uio uio; struct proc *curp, *p, *pp; struct thread *td2 = NULL, *td3; struct ptrace_io_desc *piod = NULL; struct ptrace_lwpinfo *pl; struct ptrace_sc_ret *psr; int error, num, tmp; int proctree_locked = 0; lwpid_t tid = 0, *buf; #ifdef COMPAT_FREEBSD32 int wrap32 = 0, safe = 0; struct ptrace_io_desc32 *piod32 = NULL; struct ptrace_lwpinfo32 *pl32 = NULL; struct ptrace_sc_ret32 *psr32 = NULL; union { struct ptrace_lwpinfo pl; struct ptrace_sc_ret psr; } r; #endif curp = td->td_proc; /* Lock proctree before locking the process. */ switch (req) { case PT_TRACE_ME: case PT_ATTACH: case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_FOLLOW_FORK: case PT_LWP_EVENTS: case PT_GET_EVENT_MASK: case PT_SET_EVENT_MASK: case PT_DETACH: case PT_GET_SC_ARGS: sx_xlock(&proctree_lock); proctree_locked = 1; break; default: break; } if (req == PT_TRACE_ME) { p = td->td_proc; PROC_LOCK(p); } else { if (pid <= PID_MAX) { if ((p = pfind(pid)) == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } } else { td2 = tdfind(pid, -1); if (td2 == NULL) { if (proctree_locked) sx_xunlock(&proctree_lock); return (ESRCH); } p = td2->td_proc; tid = pid; pid = p->p_pid; } } AUDIT_ARG_PROCESS(p); if ((p->p_flag & P_WEXIT) != 0) { error = ESRCH; goto fail; } if ((error = p_cansee(td, p)) != 0) goto fail; if ((error = p_candebug(td, p)) != 0) goto fail; /* * System processes can't be debugged. */ if ((p->p_flag & P_SYSTEM) != 0) { error = EINVAL; goto fail; } if (tid == 0) { if ((p->p_flag & P_STOPPED_TRACE) != 0) { KASSERT(p->p_xthread != NULL, ("NULL p_xthread")); td2 = p->p_xthread; } else { td2 = FIRST_THREAD_IN_PROC(p); } tid = td2->td_tid; } #ifdef COMPAT_FREEBSD32 /* * Test if we're a 32 bit client and what the target is. * Set the wrap controls accordingly. */ if (SV_CURPROC_FLAG(SV_ILP32)) { if (SV_PROC_FLAG(td2->td_proc, SV_ILP32)) safe = 1; wrap32 = 1; } #endif /* * Permissions check */ switch (req) { case PT_TRACE_ME: /* * Always legal, when there is a parent process which * could trace us. Otherwise, reject. */ if ((p->p_flag & P_TRACED) != 0) { error = EBUSY; goto fail; } if (p->p_pptr == initproc) { error = EPERM; goto fail; } break; case PT_ATTACH: /* Self */ if (p == td->td_proc) { error = EINVAL; goto fail; } /* Already traced */ if (p->p_flag & P_TRACED) { error = EBUSY; goto fail; } /* Can't trace an ancestor if you're being traced. */ if (curp->p_flag & P_TRACED) { for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) { if (pp == p) { error = EINVAL; goto fail; } } } /* OK */ break; case PT_CLEARSTEP: /* Allow thread to clear single step for itself */ if (td->td_tid == tid) break; /* FALLTHROUGH */ default: /* not being traced... */ if ((p->p_flag & P_TRACED) == 0) { error = EPERM; goto fail; } /* not being traced by YOU */ if (p->p_pptr != td->td_proc) { error = EBUSY; goto fail; } /* not currently stopped */ if ((p->p_flag & P_STOPPED_TRACE) == 0 || p->p_suspcount != p->p_numthreads || (p->p_flag & P_WAITED) == 0) { error = EBUSY; goto fail; } /* OK */ break; } /* Keep this process around until we finish this request. */ _PHOLD(p); #ifdef FIX_SSTEP /* * Single step fixup ala procfs */ FIX_SSTEP(td2); #endif /* * Actually do the requests */ td->td_retval[0] = 0; switch (req) { case PT_TRACE_ME: /* set my trace flag and "owner" so it can read/write me */ proc_set_traced(p, false); if (p->p_flag & P_PPWAIT) p->p_flag |= P_PPTRACE; CTR1(KTR_PTRACE, "PT_TRACE_ME: pid %d", p->p_pid); break; case PT_ATTACH: /* security check done above */ /* * It would be nice if the tracing relationship was separate * from the parent relationship but that would require * another set of links in the proc struct or for "wait" * to scan the entire proc table. To make life easier, * we just re-parent the process we're trying to trace. * The old parent is remembered so we can put things back * on a "detach". */ proc_set_traced(p, true); if (p->p_pptr != td->td_proc) { proc_reparent(p, td->td_proc, false); } CTR2(KTR_PTRACE, "PT_ATTACH: pid %d, oppid %d", p->p_pid, p->p_oppid); sx_xunlock(&proctree_lock); proctree_locked = 0; MPASS(p->p_xthread == NULL); MPASS((p->p_flag & P_STOPPED_TRACE) == 0); /* * If already stopped due to a stop signal, clear the * existing stop before triggering a traced SIGSTOP. */ if ((p->p_flag & P_STOPPED_SIG) != 0) { PROC_SLOCK(p); p->p_flag &= ~(P_STOPPED_SIG | P_WAITED); thread_unsuspend(p); PROC_SUNLOCK(p); } kern_psignal(p, SIGSTOP); break; case PT_CLEARSTEP: CTR2(KTR_PTRACE, "PT_CLEARSTEP: tid %d (pid %d)", td2->td_tid, p->p_pid); error = ptrace_clear_single_step(td2); break; case PT_SETSTEP: CTR2(KTR_PTRACE, "PT_SETSTEP: tid %d (pid %d)", td2->td_tid, p->p_pid); error = ptrace_single_step(td2); break; case PT_SUSPEND: CTR2(KTR_PTRACE, "PT_SUSPEND: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_SUSPEND; thread_lock(td2); td2->td_flags |= TDF_NEEDSUSPCHK; thread_unlock(td2); break; case PT_RESUME: CTR2(KTR_PTRACE, "PT_RESUME: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags &= ~TDB_SUSPEND; break; case PT_FOLLOW_FORK: CTR3(KTR_PTRACE, "PT_FOLLOW_FORK: pid %d %s -> %s", p->p_pid, p->p_ptevents & PTRACE_FORK ? "enabled" : "disabled", data ? "enabled" : "disabled"); if (data) p->p_ptevents |= PTRACE_FORK; else p->p_ptevents &= ~PTRACE_FORK; break; case PT_LWP_EVENTS: CTR3(KTR_PTRACE, "PT_LWP_EVENTS: pid %d %s -> %s", p->p_pid, p->p_ptevents & PTRACE_LWP ? "enabled" : "disabled", data ? "enabled" : "disabled"); if (data) p->p_ptevents |= PTRACE_LWP; else p->p_ptevents &= ~PTRACE_LWP; break; case PT_GET_EVENT_MASK: if (data != sizeof(p->p_ptevents)) { error = EINVAL; break; } CTR2(KTR_PTRACE, "PT_GET_EVENT_MASK: pid %d mask %#x", p->p_pid, p->p_ptevents); *(int *)addr = p->p_ptevents; break; case PT_SET_EVENT_MASK: if (data != sizeof(p->p_ptevents)) { error = EINVAL; break; } tmp = *(int *)addr; if ((tmp & ~(PTRACE_EXEC | PTRACE_SCE | PTRACE_SCX | PTRACE_FORK | PTRACE_LWP | PTRACE_VFORK)) != 0) { error = EINVAL; break; } CTR3(KTR_PTRACE, "PT_SET_EVENT_MASK: pid %d mask %#x -> %#x", p->p_pid, p->p_ptevents, tmp); p->p_ptevents = tmp; break; case PT_GET_SC_ARGS: CTR1(KTR_PTRACE, "PT_GET_SC_ARGS: pid %d", p->p_pid); if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) == 0 #ifdef COMPAT_FREEBSD32 || (wrap32 && !safe) #endif ) { error = EINVAL; break; } bzero(addr, sizeof(td2->td_sa.args)); #ifdef COMPAT_FREEBSD32 if (wrap32) for (num = 0; num < nitems(td2->td_sa.args); num++) ((uint32_t *)addr)[num] = (uint32_t) td2->td_sa.args[num]; else #endif bcopy(td2->td_sa.args, addr, td2->td_sa.narg * sizeof(register_t)); break; case PT_GET_SC_RET: if ((td2->td_dbgflags & (TDB_SCX)) == 0 #ifdef COMPAT_FREEBSD32 || (wrap32 && !safe) #endif ) { error = EINVAL; break; } #ifdef COMPAT_FREEBSD32 if (wrap32) { psr = &r.psr; psr32 = addr; } else #endif psr = addr; bzero(psr, sizeof(*psr)); psr->sr_error = td2->td_errno; if (psr->sr_error == 0) { psr->sr_retval[0] = td2->td_retval[0]; psr->sr_retval[1] = td2->td_retval[1]; } #ifdef COMPAT_FREEBSD32 if (wrap32) ptrace_sc_ret_to32(psr, psr32); #endif CTR4(KTR_PTRACE, "PT_GET_SC_RET: pid %d error %d retval %#lx,%#lx", p->p_pid, psr->sr_error, psr->sr_retval[0], psr->sr_retval[1]); break; case PT_STEP: case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: case PT_DETACH: /* Zero means do not send any signal */ if (data < 0 || data > _SIG_MAXSIG) { error = EINVAL; break; } switch (req) { case PT_STEP: CTR3(KTR_PTRACE, "PT_STEP: tid %d (pid %d), sig = %d", td2->td_tid, p->p_pid, data); error = ptrace_single_step(td2); if (error) goto out; break; case PT_CONTINUE: case PT_TO_SCE: case PT_TO_SCX: case PT_SYSCALL: if (addr != (void *)1) { error = ptrace_set_pc(td2, (u_long)(uintfptr_t)addr); if (error) goto out; } switch (req) { case PT_TO_SCE: p->p_ptevents |= PTRACE_SCE; CTR4(KTR_PTRACE, "PT_TO_SCE: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_TO_SCX: p->p_ptevents |= PTRACE_SCX; CTR4(KTR_PTRACE, "PT_TO_SCX: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_SYSCALL: p->p_ptevents |= PTRACE_SYSCALL; CTR4(KTR_PTRACE, "PT_SYSCALL: pid %d, events = %#x, PC = %#lx, sig = %d", p->p_pid, p->p_ptevents, (u_long)(uintfptr_t)addr, data); break; case PT_CONTINUE: CTR3(KTR_PTRACE, "PT_CONTINUE: pid %d, PC = %#lx, sig = %d", p->p_pid, (u_long)(uintfptr_t)addr, data); break; } break; case PT_DETACH: /* * Reset the process parent. * * NB: This clears P_TRACED before reparenting * a detached process back to its original * parent. Otherwise the debugee will be set * as an orphan of the debugger. */ p->p_flag &= ~(P_TRACED | P_WAITED); if (p->p_oppid != p->p_pptr->p_pid) { PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); pp = proc_realparent(p); proc_reparent(p, pp, false); if (pp == initproc) p->p_sigparent = SIGCHLD; CTR3(KTR_PTRACE, "PT_DETACH: pid %d reparented to pid %d, sig %d", p->p_pid, pp->p_pid, data); } else CTR2(KTR_PTRACE, "PT_DETACH: pid %d, sig %d", p->p_pid, data); p->p_ptevents = 0; FOREACH_THREAD_IN_PROC(p, td3) { if ((td3->td_dbgflags & TDB_FSTP) != 0) { sigqueue_delete(&td3->td_sigqueue, SIGSTOP); } td3->td_dbgflags &= ~(TDB_XSIG | TDB_FSTP | TDB_SUSPEND); } if ((p->p_flag2 & P2_PTRACE_FSTP) != 0) { sigqueue_delete(&p->p_sigqueue, SIGSTOP); p->p_flag2 &= ~P2_PTRACE_FSTP; } /* should we send SIGCHLD? */ /* childproc_continued(p); */ break; } sx_xunlock(&proctree_lock); proctree_locked = 0; sendsig: MPASS(proctree_locked == 0); /* * Clear the pending event for the thread that just * reported its event (p_xthread). This may not be * the thread passed to PT_CONTINUE, PT_STEP, etc. if * the debugger is resuming a different thread. * * Deliver any pending signal via the reporting thread. */ MPASS(p->p_xthread != NULL); p->p_xthread->td_dbgflags &= ~TDB_XSIG; p->p_xthread->td_xsig = data; p->p_xthread = NULL; p->p_xsig = data; /* * P_WKILLED is insurance that a PT_KILL/SIGKILL * always works immediately, even if another thread is * unsuspended first and attempts to handle a * different signal or if the POSIX.1b style signal * queue cannot accommodate any new signals. */ if (data == SIGKILL) proc_wkilled(p); /* * Unsuspend all threads. To leave a thread * suspended, use PT_SUSPEND to suspend it before * continuing the process. */ PROC_SLOCK(p); p->p_flag &= ~(P_STOPPED_TRACE | P_STOPPED_SIG | P_WAITED); thread_unsuspend(p); PROC_SUNLOCK(p); break; case PT_WRITE_I: case PT_WRITE_D: td2->td_dbgflags |= TDB_USERWR; PROC_UNLOCK(p); error = 0; if (proc_writemem(td, p, (off_t)(uintptr_t)addr, &data, sizeof(int)) != sizeof(int)) error = ENOMEM; else CTR3(KTR_PTRACE, "PT_WRITE: pid %d: %p <= %#x", p->p_pid, addr, data); PROC_LOCK(p); break; case PT_READ_I: case PT_READ_D: PROC_UNLOCK(p); error = tmp = 0; if (proc_readmem(td, p, (off_t)(uintptr_t)addr, &tmp, sizeof(int)) != sizeof(int)) error = ENOMEM; else CTR3(KTR_PTRACE, "PT_READ: pid %d: %p >= %#x", p->p_pid, addr, tmp); td->td_retval[0] = tmp; PROC_LOCK(p); break; case PT_IO: #ifdef COMPAT_FREEBSD32 if (wrap32) { piod32 = addr; iov.iov_base = (void *)(uintptr_t)piod32->piod_addr; iov.iov_len = piod32->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod32->piod_offs; uio.uio_resid = piod32->piod_len; } else #endif { piod = addr; iov.iov_base = piod->piod_addr; iov.iov_len = piod->piod_len; uio.uio_offset = (off_t)(uintptr_t)piod->piod_offs; uio.uio_resid = piod->piod_len; } uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_segflg = UIO_USERSPACE; uio.uio_td = td; #ifdef COMPAT_FREEBSD32 tmp = wrap32 ? piod32->piod_op : piod->piod_op; #else tmp = piod->piod_op; #endif switch (tmp) { case PIOD_READ_D: case PIOD_READ_I: CTR3(KTR_PTRACE, "PT_IO: pid %d: READ (%p, %#x)", p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid); uio.uio_rw = UIO_READ; break; case PIOD_WRITE_D: case PIOD_WRITE_I: CTR3(KTR_PTRACE, "PT_IO: pid %d: WRITE (%p, %#x)", p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid); td2->td_dbgflags |= TDB_USERWR; uio.uio_rw = UIO_WRITE; break; default: error = EINVAL; goto out; } PROC_UNLOCK(p); error = proc_rwmem(p, &uio); #ifdef COMPAT_FREEBSD32 if (wrap32) piod32->piod_len -= uio.uio_resid; else #endif piod->piod_len -= uio.uio_resid; PROC_LOCK(p); break; case PT_KILL: CTR1(KTR_PTRACE, "PT_KILL: pid %d", p->p_pid); data = SIGKILL; goto sendsig; /* in PT_CONTINUE above */ case PT_SETREGS: CTR2(KTR_PTRACE, "PT_SETREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(regs, td2, addr); break; case PT_GETREGS: CTR2(KTR_PTRACE, "PT_GETREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(regs, td2, addr); break; case PT_SETFPREGS: CTR2(KTR_PTRACE, "PT_SETFPREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(fpregs, td2, addr); break; case PT_GETFPREGS: CTR2(KTR_PTRACE, "PT_GETFPREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(fpregs, td2, addr); break; case PT_SETDBREGS: CTR2(KTR_PTRACE, "PT_SETDBREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); td2->td_dbgflags |= TDB_USERWR; error = PROC_WRITE(dbregs, td2, addr); break; case PT_GETDBREGS: CTR2(KTR_PTRACE, "PT_GETDBREGS: tid %d (pid %d)", td2->td_tid, p->p_pid); error = PROC_READ(dbregs, td2, addr); break; case PT_LWPINFO: if (data <= 0 || #ifdef COMPAT_FREEBSD32 (!wrap32 && data > sizeof(*pl)) || (wrap32 && data > sizeof(*pl32))) { #else data > sizeof(*pl)) { #endif error = EINVAL; break; } #ifdef COMPAT_FREEBSD32 if (wrap32) { pl = &r.pl; pl32 = addr; } else #endif pl = addr; bzero(pl, sizeof(*pl)); pl->pl_lwpid = td2->td_tid; pl->pl_event = PL_EVENT_NONE; pl->pl_flags = 0; if (td2->td_dbgflags & TDB_XSIG) { pl->pl_event = PL_EVENT_SIGNAL; if (td2->td_si.si_signo != 0 && #ifdef COMPAT_FREEBSD32 ((!wrap32 && data >= offsetof(struct ptrace_lwpinfo, pl_siginfo) + sizeof(pl->pl_siginfo)) || (wrap32 && data >= offsetof(struct ptrace_lwpinfo32, pl_siginfo) + sizeof(struct siginfo32))) #else data >= offsetof(struct ptrace_lwpinfo, pl_siginfo) + sizeof(pl->pl_siginfo) #endif ){ pl->pl_flags |= PL_FLAG_SI; pl->pl_siginfo = td2->td_si; } } if (td2->td_dbgflags & TDB_SCE) pl->pl_flags |= PL_FLAG_SCE; else if (td2->td_dbgflags & TDB_SCX) pl->pl_flags |= PL_FLAG_SCX; if (td2->td_dbgflags & TDB_EXEC) pl->pl_flags |= PL_FLAG_EXEC; if (td2->td_dbgflags & TDB_FORK) { pl->pl_flags |= PL_FLAG_FORKED; pl->pl_child_pid = td2->td_dbg_forked; if (td2->td_dbgflags & TDB_VFORK) pl->pl_flags |= PL_FLAG_VFORKED; } else if ((td2->td_dbgflags & (TDB_SCX | TDB_VFORK)) == TDB_VFORK) pl->pl_flags |= PL_FLAG_VFORK_DONE; if (td2->td_dbgflags & TDB_CHILD) pl->pl_flags |= PL_FLAG_CHILD; if (td2->td_dbgflags & TDB_BORN) pl->pl_flags |= PL_FLAG_BORN; if (td2->td_dbgflags & TDB_EXIT) pl->pl_flags |= PL_FLAG_EXITED; pl->pl_sigmask = td2->td_sigmask; pl->pl_siglist = td2->td_siglist; strcpy(pl->pl_tdname, td2->td_name); if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) != 0) { pl->pl_syscall_code = td2->td_sa.code; pl->pl_syscall_narg = td2->td_sa.narg; } else { pl->pl_syscall_code = 0; pl->pl_syscall_narg = 0; } #ifdef COMPAT_FREEBSD32 if (wrap32) ptrace_lwpinfo_to32(pl, pl32); #endif CTR6(KTR_PTRACE, "PT_LWPINFO: tid %d (pid %d) event %d flags %#x child pid %d syscall %d", td2->td_tid, p->p_pid, pl->pl_event, pl->pl_flags, pl->pl_child_pid, pl->pl_syscall_code); break; case PT_GETNUMLWPS: CTR2(KTR_PTRACE, "PT_GETNUMLWPS: pid %d: %d threads", p->p_pid, p->p_numthreads); td->td_retval[0] = p->p_numthreads; break; case PT_GETLWPLIST: CTR3(KTR_PTRACE, "PT_GETLWPLIST: pid %d: data %d, actual %d", p->p_pid, data, p->p_numthreads); if (data <= 0) { error = EINVAL; break; } num = imin(p->p_numthreads, data); PROC_UNLOCK(p); buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK); tmp = 0; PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, td2) { if (tmp >= num) break; buf[tmp++] = td2->td_tid; } PROC_UNLOCK(p); error = copyout(buf, addr, tmp * sizeof(lwpid_t)); free(buf, M_TEMP); if (!error) td->td_retval[0] = tmp; PROC_LOCK(p); break; case PT_VM_TIMESTAMP: CTR2(KTR_PTRACE, "PT_VM_TIMESTAMP: pid %d: timestamp %d", p->p_pid, p->p_vmspace->vm_map.timestamp); td->td_retval[0] = p->p_vmspace->vm_map.timestamp; break; case PT_VM_ENTRY: PROC_UNLOCK(p); #ifdef COMPAT_FREEBSD32 if (wrap32) error = ptrace_vm_entry32(td, p, addr); else #endif error = ptrace_vm_entry(td, p, addr); PROC_LOCK(p); break; default: #ifdef __HAVE_PTRACE_MACHDEP if (req >= PT_FIRSTMACH) { PROC_UNLOCK(p); error = cpu_ptrace(td2, req, addr, data); PROC_LOCK(p); } else #endif /* Unknown request. */ error = EINVAL; break; } out: /* Drop our hold on this process now that the request has completed. */ _PRELE(p); fail: PROC_UNLOCK(p); if (proctree_locked) sx_xunlock(&proctree_lock); return (error); } #undef PROC_READ #undef PROC_WRITE /* * Stop a process because of a debugging event; * stay stopped until p->p_step is cleared * (cleared by PIOCCONT in procfs). */ void stopevent(struct proc *p, unsigned int event, unsigned int val) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_step = 1; CTR3(KTR_PTRACE, "stopevent: pid %d event %u val %u", p->p_pid, event, val); do { if (event != S_EXIT) p->p_xsig = val; p->p_xthread = NULL; p->p_stype = event; /* Which event caused the stop? */ wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */ msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0); } while (p->p_step); } Index: head/sys/kern/uipc_shm.c =================================================================== --- head/sys/kern/uipc_shm.c (revision 350420) +++ head/sys/kern/uipc_shm.c (revision 350421) @@ -1,1180 +1,1181 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2) and shm_unlink(2). While most of the implementation is * here, vm_mmap.c contains mapping logic changes. * * posixshmcontrol(1) allows users to inspect the state of the memory * objects. Per-uid swap resource limit controls total amount of * memory that user can consume for anonymous objects, including * shared. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr64 shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_ioctl_t shm_ioctl; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; /* File descriptor operations. */ struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = shm_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; FEATURE(posix_shm, "POSIX shared memory"); static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); VM_OBJECT_WLOCK(obj); /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Parallel reads of the page content from disk are prevented * by exclusive busy. * * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. */ m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (m->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(m); if (vm_pager_has_page(obj, idx, NULL, NULL)) { rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { printf( "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n", obj, idx, m->valid, rv); vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); return (EIO); } } else vm_page_zero_invalid(m, TRUE); vm_page_xunbusy(m); } vm_page_lock(m); vm_page_wire(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) { VM_OBJECT_WLOCK(obj); vm_page_dirty(m); vm_pager_page_unswapped(m); VM_OBJECT_WUNLOCK(obj); } vm_page_lock(m); vm_page_unwire(m, PQ_ACTIVE); vm_page_unlock(m); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); if ((flags & FOF_OFFSET) == 0) { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); } error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } int shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { switch (com) { case FIONBIO: case FIOASYNC: /* * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work, * just like it would on an unlinked regular file */ return (0); default: return (ENOTTY); } } static int shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. */ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; sb->st_blocks = howmany(sb->st_size, sb->st_blksize); mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; sb->st_nlink = shmfd->shm_object->ref_count; return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } int shm_dotruncate(struct shmfd *shmfd, off_t length) { vm_object_t object; vm_page_t m; vm_pindex_t idx, nobjsize; vm_ooffset_t delta; int base, rv; KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; VM_OBJECT_WLOCK(object); if (length == shmfd->shm_size) { VM_OBJECT_WUNLOCK(object); return (0); } nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) { VM_OBJECT_WUNLOCK(object); return (EBUSY); } /* * Zero the truncated part of the last page. */ base = length & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(length); retry: m = vm_page_lookup(object, idx); if (m != NULL) { if (vm_page_sleep_if_busy(m, "shmtrc")) goto retry; } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) goto retry; rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); vm_page_lock(m); if (rv == VM_PAGER_OK) { /* * Since the page was not resident, * and therefore not recently * accessed, immediately enqueue it * for asynchronous laundering. The * current operation is not regarded * as an access. */ vm_page_launder(m); vm_page_unlock(m); vm_page_xunbusy(m); } else { vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("shm_dotruncate: page %p is invalid", m)); vm_page_dirty(m); vm_pager_page_unswapped(m); } } delta = IDX_TO_OFF(object->size - nobjsize); /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Toss pages from swap. */ if (object->type == OBJT_SWAP) swap_pager_freespace(object, nobjsize, delta); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { /* Try to reserve additional swap space. */ delta = IDX_TO_OFF(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) { VM_OBJECT_WUNLOCK(object); return (ENOMEM); } object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; VM_OBJECT_WUNLOCK(object); return (0); } /* * shmfd object management including creation and reference counting * routines. */ struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode) { struct shmfd *shmfd; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); shmfd->shm_object->pg_color = 0; VM_OBJECT_WLOCK(shmfd->shm_object); vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT); VM_OBJECT_WUNLOCK(shmfd->shm_object); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; shmfd->shm_ino = alloc_unr64(&shm_ino_unr); refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } void shm_drop(struct shmfd *shmfd) { if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); vm_object_deallocate(shmfd->shm_object); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. */ int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred, NULL); mtx_unlock(&shm_timestamp_lock); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. */ static void shm_init(void *arg) { mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); new_unrhdr64(&shm_ino_unr, 1); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); return (0); } } return (ENOENT); } int kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, struct filecaps *fcaps) { struct filedesc *fdp; struct shmfd *shmfd; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; mode_t cmode; int fd, error; #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) return (ECAPMODE); #endif AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); fdp = td->td_proc->p_fd; cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc_caps(td, &fp, &fd, O_CLOEXEC, fcaps); if (error) return (error); /* A SHM_ANON path pointer creates an anonymous object. */ if (userpath == SHM_ANON) { /* A read-only anonymous object is pointless. */ if ((flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode); } else { path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(userpath, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); #ifdef KTRACE if (error == 0 && KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. */ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_SHMFD); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode); shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif shm_dotruncate(shmfd, 0); } if (error == 0) shm_hold(shmfd); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* System calls. */ int sys_shm_open(struct thread *td, struct shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags, uap->mode, NULL)); } int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->path, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_TEMP); return (error); } int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; vm_prot_t maxprot; int error; shmfd = fp->f_data; maxprot = VM_PROT_NONE; /* FREAD should always be set. */ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; /* Don't permit shared writable mappings on read-only descriptors. */ if ((flags & MAP_SHARED) != 0 && (maxprot & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); maxprot &= cap_maxprot; /* See comment in vn_mmap(). */ if ( #ifdef _LP64 objsize > OFF_MAX || #endif foff < 0 || foff > OFF_MAX - objsize) return (EINVAL); #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) return (error); #endif mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, shmfd->shm_object, foff, FALSE, td); if (error != 0) vm_object_deallocate(shmfd->shm_object); return (error); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. * Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed. */ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list) { const char *path, *pr_path; size_t pr_pathlen; bool visible; sx_assert(&shm_dict_lock, SA_LOCKED); kif->kf_type = KF_TYPE_SHM; kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { if (shmfd->shm_path != NULL) { path = shmfd->shm_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); visible = strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/'; if (list && !visible) return (EPERM); if (visible) path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } } return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp __unused) { int res; sx_slock(&shm_dict_lock); res = shm_fill_kinfo_locked(fp->f_data, kif, false); sx_sunlock(&shm_dict_lock); return (res); } static int sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS) { struct shm_mapping *shmm; struct sbuf sb; struct kinfo_file kif; u_long i; ssize_t curlen; int error, error2; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); curlen = 0; error = 0; sx_slock(&shm_dict_lock); for (i = 0; i < shm_hash + 1; i++) { LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) { error = shm_fill_kinfo_locked(shmm->sm_shmfd, &kif, true); if (error == EPERM) continue; if (error != 0) break; pack_kinfo(&kif); if (req->oldptr != NULL && kif.kf_structsize + curlen > req->oldlen) break; error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ? 0 : ENOMEM; if (error != 0) break; curlen += kif.kf_structsize; } } sx_sunlock(&shm_dict_lock); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list, CTLFLAG_RD | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE, NULL, 0, sysctl_posix_shm_list, "", "POSIX SHM list"); Index: head/sys/mips/broadcom/bhnd_nexus.c =================================================================== --- head/sys/mips/broadcom/bhnd_nexus.c (revision 350420) +++ head/sys/mips/broadcom/bhnd_nexus.c (revision 350421) @@ -1,281 +1,282 @@ /*- * Copyright (c) 2015-2016 Landon Fuller * Copyright (c) 2017 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Landon Fuller * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); /* * bhnd(4) driver mix-in providing shared common methods for * bhnd bus devices attached via a MIPS root nexus. */ #include #include #include #include +#include #include #include #include #include #include #include #include #include #include "bcm_machdep.h" #include "bcm_mipsvar.h" #include "bhnd_nexusvar.h" /** * Default bhnd_nexus implementation of BHND_BUS_GET_SERVICE_REGISTRY(). */ static struct bhnd_service_registry * bhnd_nexus_get_service_registry(device_t dev, device_t child) { struct bcm_platform *bp = bcm_get_platform(); return (&bp->services); } /** * Default bhnd_nexus implementation of BHND_BUS_ACTIVATE_RESOURCE(). */ static int bhnd_nexus_activate_resource(device_t dev, device_t child, int type, int rid, struct bhnd_resource *r) { int error; /* Always direct */ if ((error = bus_activate_resource(child, type, rid, r->res))) return (error); r->direct = true; return (0); } /** * Default bhnd_nexus implementation of BHND_BUS_DEACTIVATE_RESOURCE(). */ static int bhnd_nexus_deactivate_resource(device_t dev, device_t child, int type, int rid, struct bhnd_resource *r) { int error; /* Always direct */ KASSERT(r->direct, ("indirect resource delegated to bhnd_nexus\n")); if ((error = bus_deactivate_resource(child, type, rid, r->res))) return (error); r->direct = false; return (0); } /** * Default bhnd_nexus implementation of BHND_BUS_IS_HW_DISABLED(). */ static bool bhnd_nexus_is_hw_disabled(device_t dev, device_t child) { struct bcm_platform *bp; struct bhnd_chipid *cid; bp = bcm_get_platform(); cid = &bp->cid; /* The BCM4706 low-cost package leaves secondary GMAC cores * floating */ if (cid->chip_id == BHND_CHIPID_BCM4706 && cid->chip_pkg == BHND_PKGID_BCM4706L && bhnd_get_device(child) == BHND_COREID_4706_GMAC && bhnd_get_core_unit(child) != 0) { return (true); } return (false); } /** * Default bhnd_nexus implementation of BHND_BUS_AGET_ATTACH_TYPE(). */ static bhnd_attach_type bhnd_nexus_get_attach_type(device_t dev, device_t child) { return (BHND_ATTACH_NATIVE); } /** * Default bhnd_nexus implementation of BHND_BUS_GET_CHIPID(). */ static const struct bhnd_chipid * bhnd_nexus_get_chipid(device_t dev, device_t child) { return (&bcm_get_platform()->cid); } /** * Default bhnd_nexus implementation of BHND_BUS_READ_BOARD_INFO(). */ static int bhnd_nexus_read_board_info(device_t dev, device_t child, struct bhnd_board_info *info) { int error; /* Initialize with NVRAM-derived values */ if ((error = bhnd_bus_generic_read_board_info(dev, child, info))) return (error); /* The board vendor should default to PCI_VENDOR_BROADCOM if not * otherwise specified */ if (info->board_vendor == 0) info->board_vendor = PCI_VENDOR_BROADCOM; return (0); } /** * Default bhnd_nexus implementation of BHND_BUS_MAP_INTR(). */ static int bhnd_nexus_map_intr(device_t dev, device_t child, u_int intr, rman_res_t *irq) { struct bcm_mips_intr_map_data *imd; u_int ivec; uintptr_t xref; int error; /* Fetch the backplane interrupt vector */ if ((error = bhnd_get_intr_ivec(child, intr, &ivec))) { device_printf(dev, "error fetching ivec for intr %u: %d\n", intr, error); return (error); } /* Determine our interrupt domain */ xref = BHND_BUS_GET_INTR_DOMAIN(dev, child, false); KASSERT(xref != 0, ("missing interrupt domain")); /* Allocate our map data */ imd = (struct bcm_mips_intr_map_data *)intr_alloc_map_data( INTR_MAP_DATA_BCM_MIPS, sizeof(*imd), M_WAITOK | M_ZERO); imd->ivec = ivec; /* Map the IRQ */ *irq = intr_map_irq(NULL, xref, &imd->mdata); return (0); } /** * Default bhnd_nexus implementation of BHND_BUS_UNMAP_INTR(). */ static void bhnd_nexus_unmap_intr(device_t dev, device_t child, rman_res_t irq) { if (irq > UINT_MAX) panic("invalid irq: %ju", (uintmax_t)irq); intr_unmap_irq(irq); } /** * Default bhnd_nexus implementation of BHND_BUS_GET_DMA_TRANSLATION(). */ static int bhnd_nexus_get_dma_translation(device_t dev, device_t child, u_int width, uint32_t flags, bus_dma_tag_t *dmat, struct bhnd_dma_translation *translation) { struct bcm_platform *bp = bcm_get_platform(); /* We don't (currently) support any flags */ if (flags != 0x0) return (ENOENT); KASSERT(width > 0 && width <= BHND_DMA_ADDR_64BIT, ("invalid width %u", width)); /* Is the requested width supported? */ if (width > BHND_DMA_ADDR_32BIT) { /* Backplane must support 64-bit addressing */ if (!(bp->cid.chip_caps & BHND_CAP_BP64)) width = BHND_DMA_ADDR_32BIT; } /* No DMA address translation required */ if (dmat != NULL) *dmat = bus_get_dma_tag(dev); if (translation != NULL) { *translation = (struct bhnd_dma_translation) { .base_addr = 0x0, .addr_mask = BHND_DMA_ADDR_BITMASK(width), .addrext_mask = 0 }; } return (0); } static device_method_t bhnd_nexus_methods[] = { /* bhnd interface */ DEVMETHOD(bhnd_bus_get_service_registry,bhnd_nexus_get_service_registry), DEVMETHOD(bhnd_bus_register_provider, bhnd_bus_generic_sr_register_provider), DEVMETHOD(bhnd_bus_deregister_provider, bhnd_bus_generic_sr_deregister_provider), DEVMETHOD(bhnd_bus_retain_provider, bhnd_bus_generic_sr_retain_provider), DEVMETHOD(bhnd_bus_release_provider, bhnd_bus_generic_sr_release_provider), DEVMETHOD(bhnd_bus_activate_resource, bhnd_nexus_activate_resource), DEVMETHOD(bhnd_bus_deactivate_resource, bhnd_nexus_deactivate_resource), DEVMETHOD(bhnd_bus_is_hw_disabled, bhnd_nexus_is_hw_disabled), DEVMETHOD(bhnd_bus_get_attach_type, bhnd_nexus_get_attach_type), DEVMETHOD(bhnd_bus_get_chipid, bhnd_nexus_get_chipid), DEVMETHOD(bhnd_bus_get_dma_translation, bhnd_nexus_get_dma_translation), DEVMETHOD(bhnd_bus_get_intr_domain, bhnd_bus_generic_get_intr_domain), DEVMETHOD(bhnd_bus_map_intr, bhnd_nexus_map_intr), DEVMETHOD(bhnd_bus_read_board_info, bhnd_nexus_read_board_info), DEVMETHOD(bhnd_bus_unmap_intr, bhnd_nexus_unmap_intr), DEVMETHOD_END }; DEFINE_CLASS_0(bhnd, bhnd_nexus_driver, bhnd_nexus_methods, sizeof(struct bhnd_softc)); Index: head/sys/rpc/svc_vc.c =================================================================== --- head/sys/rpc/svc_vc.c (revision 350420) +++ head/sys/rpc/svc_vc.c (revision 350421) @@ -1,994 +1,995 @@ /* $NetBSD: svc_vc.c,v 1.7 2000/08/03 00:01:53 fvdl Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009, Sun Microsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of Sun Microsystems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #if defined(LIBC_SCCS) && !defined(lint) static char *sccsid2 = "@(#)svc_tcp.c 1.21 87/08/11 Copyr 1984 Sun Micro"; static char *sccsid = "@(#)svc_tcp.c 2.2 88/08/01 4.0 RPCSRC"; #endif #include __FBSDID("$FreeBSD$"); /* * svc_vc.c, Server side for Connection Oriented based RPC. * * Actually implements two flavors of transporter - * a tcp rendezvouser (a listner and connection establisher) * and a record/tcp stream. */ #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static bool_t svc_vc_rendezvous_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *); static void svc_vc_rendezvous_destroy(SVCXPRT *); static bool_t svc_vc_null(void); static void svc_vc_destroy(SVCXPRT *); static enum xprt_stat svc_vc_stat(SVCXPRT *); static bool_t svc_vc_ack(SVCXPRT *, uint32_t *); static bool_t svc_vc_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static bool_t svc_vc_reply(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *seq); static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in); static bool_t svc_vc_rendezvous_control (SVCXPRT *xprt, const u_int rq, void *in); static void svc_vc_backchannel_destroy(SVCXPRT *); static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *); static bool_t svc_vc_backchannel_recv(SVCXPRT *, struct rpc_msg *, struct sockaddr **, struct mbuf **); static bool_t svc_vc_backchannel_reply(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *); static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in); static SVCXPRT *svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr); static int svc_vc_accept(struct socket *head, struct socket **sop); static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag); static int svc_vc_rendezvous_soupcall(struct socket *, void *, int); static struct xp_ops svc_vc_rendezvous_ops = { .xp_recv = svc_vc_rendezvous_recv, .xp_stat = svc_vc_rendezvous_stat, .xp_reply = (bool_t (*)(SVCXPRT *, struct rpc_msg *, struct sockaddr *, struct mbuf *, uint32_t *))svc_vc_null, .xp_destroy = svc_vc_rendezvous_destroy, .xp_control = svc_vc_rendezvous_control }; static struct xp_ops svc_vc_ops = { .xp_recv = svc_vc_recv, .xp_stat = svc_vc_stat, .xp_ack = svc_vc_ack, .xp_reply = svc_vc_reply, .xp_destroy = svc_vc_destroy, .xp_control = svc_vc_control }; static struct xp_ops svc_vc_backchannel_ops = { .xp_recv = svc_vc_backchannel_recv, .xp_stat = svc_vc_backchannel_stat, .xp_reply = svc_vc_backchannel_reply, .xp_destroy = svc_vc_backchannel_destroy, .xp_control = svc_vc_backchannel_control }; /* * Usage: * xprt = svc_vc_create(sock, send_buf_size, recv_buf_size); * * Creates, registers, and returns a (rpc) tcp based transporter. * Once *xprt is initialized, it is registered as a transporter * see (svc.h, xprt_register). This routine returns * a NULL if a problem occurred. * * The filedescriptor passed in is expected to refer to a bound, but * not yet connected socket. * * Since streams do buffered io similar to stdio, the caller can specify * how big the send and receive buffers are via the second and third parms; * 0 => use the system default. */ SVCXPRT * svc_vc_create(SVCPOOL *pool, struct socket *so, size_t sendsize, size_t recvsize) { SVCXPRT *xprt; struct sockaddr* sa; int error; SOCK_LOCK(so); if (so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED)) { SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); CURVNET_RESTORE(); if (error) return (NULL); xprt = svc_vc_create_conn(pool, so, sa); free(sa, M_SONAME); return (xprt); } SOCK_UNLOCK(so); xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = so; xprt->xp_p1 = NULL; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_rendezvous_ops; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); CURVNET_RESTORE(); if (error) { goto cleanup_svc_vc_create; } memcpy(&xprt->xp_ltaddr, sa, sa->sa_len); free(sa, M_SONAME); xprt_register(xprt); solisten(so, -1, curthread); SOLISTEN_LOCK(so); xprt->xp_upcallset = 1; solisten_upcall_set(so, svc_vc_rendezvous_soupcall, xprt); SOLISTEN_UNLOCK(so); return (xprt); cleanup_svc_vc_create: sx_destroy(&xprt->xp_lock); svc_xprt_free(xprt); return (NULL); } /* * Create a new transport for a socket optained via soaccept(). */ SVCXPRT * svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr) { SVCXPRT *xprt; struct cf_conn *cd; struct sockaddr* sa = NULL; struct sockopt opt; int one = 1; int error; bzero(&opt, sizeof(struct sockopt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = SOL_SOCKET; opt.sopt_name = SO_KEEPALIVE; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error) { return (NULL); } if (so->so_proto->pr_protocol == IPPROTO_TCP) { bzero(&opt, sizeof(struct sockopt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_TCP; opt.sopt_name = TCP_NODELAY; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error) { return (NULL); } } cd = mem_alloc(sizeof(*cd)); cd->strm_stat = XPRT_IDLE; xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = so; xprt->xp_p1 = cd; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_ops; /* * See http://www.connectathon.org/talks96/nfstcp.pdf - client * has a 5 minute timer, server has a 6 minute timer. */ xprt->xp_idletimeout = 6 * 60; memcpy(&xprt->xp_rtaddr, raddr, raddr->sa_len); CURVNET_SET(so->so_vnet); error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); CURVNET_RESTORE(); if (error) goto cleanup_svc_vc_create; memcpy(&xprt->xp_ltaddr, sa, sa->sa_len); free(sa, M_SONAME); xprt_register(xprt); SOCKBUF_LOCK(&so->so_rcv); xprt->xp_upcallset = 1; soupcall_set(so, SO_RCV, svc_vc_soupcall, xprt); SOCKBUF_UNLOCK(&so->so_rcv); /* * Throw the transport into the active list in case it already * has some data buffered. */ sx_xlock(&xprt->xp_lock); xprt_active(xprt); sx_xunlock(&xprt->xp_lock); return (xprt); cleanup_svc_vc_create: sx_destroy(&xprt->xp_lock); svc_xprt_free(xprt); mem_free(cd, sizeof(*cd)); return (NULL); } /* * Create a new transport for a backchannel on a clnt_vc socket. */ SVCXPRT * svc_vc_create_backchannel(SVCPOOL *pool) { SVCXPRT *xprt = NULL; struct cf_conn *cd = NULL; cd = mem_alloc(sizeof(*cd)); cd->strm_stat = XPRT_IDLE; xprt = svc_xprt_alloc(); sx_init(&xprt->xp_lock, "xprt->xp_lock"); xprt->xp_pool = pool; xprt->xp_socket = NULL; xprt->xp_p1 = cd; xprt->xp_p2 = NULL; xprt->xp_ops = &svc_vc_backchannel_ops; return (xprt); } /* * This does all of the accept except the final call to soaccept. The * caller will call soaccept after dropping its locks (soaccept may * call malloc). */ int svc_vc_accept(struct socket *head, struct socket **sop) { struct socket *so; int error = 0; short nbio; /* XXXGL: shouldn't that be an assertion? */ if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(curthread->td_ucred, head); if (error != 0) goto done; #endif /* * XXXGL: we want non-blocking semantics. The socket could be a * socket created by kernel as well as socket shared with userland, * so we can't be sure about presense of SS_NBIO. We also shall not * toggle it on the socket, since that may surprise userland. So we * set SS_NBIO only temporarily. */ SOLISTEN_LOCK(head); nbio = head->so_state & SS_NBIO; head->so_state |= SS_NBIO; error = solisten_dequeue(head, &so, 0); head->so_state &= (nbio & ~SS_NBIO); if (error) goto done; so->so_state |= nbio; *sop = so; /* connection has been removed from the listen queue */ KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); done: return (error); } /*ARGSUSED*/ static bool_t svc_vc_rendezvous_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct socket *so = NULL; struct sockaddr *sa = NULL; int error; SVCXPRT *new_xprt; /* * The socket upcall calls xprt_active() which will eventually * cause the server to call us here. We attempt to accept a * connection from the socket and turn it into a new * transport. If the accept fails, we have drained all pending * connections so we call xprt_inactive(). */ sx_xlock(&xprt->xp_lock); error = svc_vc_accept(xprt->xp_socket, &so); if (error == EWOULDBLOCK) { /* * We must re-test for new connections after taking * the lock to protect us in the case where a new * connection arrives after our call to accept fails * with EWOULDBLOCK. */ SOLISTEN_LOCK(xprt->xp_socket); if (TAILQ_EMPTY(&xprt->xp_socket->sol_comp)) xprt_inactive_self(xprt); SOLISTEN_UNLOCK(xprt->xp_socket); sx_xunlock(&xprt->xp_lock); return (FALSE); } if (error) { SOLISTEN_LOCK(xprt->xp_socket); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(xprt->xp_socket, SO_RCV); } SOLISTEN_UNLOCK(xprt->xp_socket); xprt_inactive_self(xprt); sx_xunlock(&xprt->xp_lock); return (FALSE); } sx_xunlock(&xprt->xp_lock); sa = NULL; error = soaccept(so, &sa); if (error) { /* * XXX not sure if I need to call sofree or soclose here. */ if (sa) free(sa, M_SONAME); return (FALSE); } /* * svc_vc_create_conn will call xprt_register - we don't need * to do anything with the new connection except derefence it. */ new_xprt = svc_vc_create_conn(xprt->xp_pool, so, sa); if (!new_xprt) { soclose(so); } else { SVC_RELEASE(new_xprt); } free(sa, M_SONAME); return (FALSE); /* there is never an rpc msg to be processed */ } /*ARGSUSED*/ static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *xprt) { return (XPRT_IDLE); } static void svc_vc_destroy_common(SVCXPRT *xprt) { if (xprt->xp_socket) (void)soclose(xprt->xp_socket); if (xprt->xp_netid) (void) mem_free(xprt->xp_netid, strlen(xprt->xp_netid) + 1); svc_xprt_free(xprt); } static void svc_vc_rendezvous_destroy(SVCXPRT *xprt) { SOLISTEN_LOCK(xprt->xp_socket); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; solisten_upcall_set(xprt->xp_socket, NULL, NULL); } SOLISTEN_UNLOCK(xprt->xp_socket); svc_vc_destroy_common(xprt); } static void svc_vc_destroy(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(xprt->xp_socket, SO_RCV); } SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); svc_vc_destroy_common(xprt); if (cd->mreq) m_freem(cd->mreq); if (cd->mpending) m_freem(cd->mpending); mem_free(cd, sizeof(*cd)); } static void svc_vc_backchannel_destroy(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; struct mbuf *m, *m2; svc_xprt_free(xprt); m = cd->mreq; while (m != NULL) { m2 = m; m = m->m_nextpkt; m_freem(m2); } mem_free(cd, sizeof(*cd)); } /*ARGSUSED*/ static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static bool_t svc_vc_rendezvous_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in) { return (FALSE); } static enum xprt_stat svc_vc_stat(SVCXPRT *xprt) { struct cf_conn *cd; cd = (struct cf_conn *)(xprt->xp_p1); if (cd->strm_stat == XPRT_DIED) return (XPRT_DIED); if (cd->mreq != NULL && cd->resid == 0 && cd->eor) return (XPRT_MOREREQS); if (soreadable(xprt->xp_socket)) return (XPRT_MOREREQS); return (XPRT_IDLE); } static bool_t svc_vc_ack(SVCXPRT *xprt, uint32_t *ack) { *ack = atomic_load_acq_32(&xprt->xp_snt_cnt); *ack -= sbused(&xprt->xp_socket->so_snd); return (TRUE); } static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *xprt) { struct cf_conn *cd; cd = (struct cf_conn *)(xprt->xp_p1); if (cd->mreq != NULL) return (XPRT_MOREREQS); return (XPRT_IDLE); } /* * If we have an mbuf chain in cd->mpending, try to parse a record from it, * leaving the result in cd->mreq. If we don't have a complete record, leave * the partial result in cd->mreq and try to read more from the socket. */ static int svc_vc_process_pending(SVCXPRT *xprt) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct socket *so = xprt->xp_socket; struct mbuf *m; /* * If cd->resid is non-zero, we have part of the * record already, otherwise we are expecting a record * marker. */ if (!cd->resid && cd->mpending) { /* * See if there is enough data buffered to * make up a record marker. Make sure we can * handle the case where the record marker is * split across more than one mbuf. */ size_t n = 0; uint32_t header; m = cd->mpending; while (n < sizeof(uint32_t) && m) { n += m->m_len; m = m->m_next; } if (n < sizeof(uint32_t)) { so->so_rcv.sb_lowat = sizeof(uint32_t) - n; return (FALSE); } m_copydata(cd->mpending, 0, sizeof(header), (char *)&header); header = ntohl(header); cd->eor = (header & 0x80000000) != 0; cd->resid = header & 0x7fffffff; m_adj(cd->mpending, sizeof(uint32_t)); } /* * Start pulling off mbufs from cd->mpending * until we either have a complete record or * we run out of data. We use m_split to pull * data - it will pull as much as possible and * split the last mbuf if necessary. */ while (cd->mpending && cd->resid) { m = cd->mpending; if (cd->mpending->m_next || cd->mpending->m_len > cd->resid) cd->mpending = m_split(cd->mpending, cd->resid, M_WAITOK); else cd->mpending = NULL; if (cd->mreq) m_last(cd->mreq)->m_next = m; else cd->mreq = m; while (m) { cd->resid -= m->m_len; m = m->m_next; } } /* * Block receive upcalls if we have more data pending, * otherwise report our need. */ if (cd->mpending) so->so_rcv.sb_lowat = INT_MAX; else so->so_rcv.sb_lowat = imax(1, imin(cd->resid, so->so_rcv.sb_hiwat / 2)); return (TRUE); } static bool_t svc_vc_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct uio uio; struct mbuf *m; struct socket* so = xprt->xp_socket; XDR xdrs; int error, rcvflag; uint32_t xid_plus_direction[2]; /* * Serialise access to the socket and our own record parsing * state. */ sx_xlock(&xprt->xp_lock); for (;;) { /* If we have no request ready, check pending queue. */ while (cd->mpending && (cd->mreq == NULL || cd->resid != 0 || !cd->eor)) { if (!svc_vc_process_pending(xprt)) break; } /* Process and return complete request in cd->mreq. */ if (cd->mreq != NULL && cd->resid == 0 && cd->eor) { /* * Now, check for a backchannel reply. * The XID is in the first uint32_t of the reply * and the message direction is the second one. */ if ((cd->mreq->m_len >= sizeof(xid_plus_direction) || m_length(cd->mreq, NULL) >= sizeof(xid_plus_direction)) && xprt->xp_p2 != NULL) { m_copydata(cd->mreq, 0, sizeof(xid_plus_direction), (char *)xid_plus_direction); xid_plus_direction[0] = ntohl(xid_plus_direction[0]); xid_plus_direction[1] = ntohl(xid_plus_direction[1]); /* Check message direction. */ if (xid_plus_direction[1] == REPLY) { clnt_bck_svccall(xprt->xp_p2, cd->mreq, xid_plus_direction[0]); cd->mreq = NULL; continue; } } xdrmbuf_create(&xdrs, cd->mreq, XDR_DECODE); cd->mreq = NULL; /* Check for next request in a pending queue. */ svc_vc_process_pending(xprt); if (cd->mreq == NULL || cd->resid != 0) { SOCKBUF_LOCK(&so->so_rcv); if (!soreadable(so)) xprt_inactive_self(xprt); SOCKBUF_UNLOCK(&so->so_rcv); } sx_xunlock(&xprt->xp_lock); if (! xdr_callmsg(&xdrs, msg)) { XDR_DESTROY(&xdrs); return (FALSE); } *addrp = NULL; *mp = xdrmbuf_getall(&xdrs); XDR_DESTROY(&xdrs); return (TRUE); } /* * The socket upcall calls xprt_active() which will eventually * cause the server to call us here. We attempt to * read as much as possible from the socket and put * the result in cd->mpending. If the read fails, * we have drained both cd->mpending and the socket so * we can call xprt_inactive(). */ uio.uio_resid = 1000000000; uio.uio_td = curthread; m = NULL; rcvflag = MSG_DONTWAIT; error = soreceive(so, NULL, &uio, &m, NULL, &rcvflag); if (error == EWOULDBLOCK) { /* * We must re-test for readability after * taking the lock to protect us in the case * where a new packet arrives on the socket * after our call to soreceive fails with * EWOULDBLOCK. */ SOCKBUF_LOCK(&so->so_rcv); if (!soreadable(so)) xprt_inactive_self(xprt); SOCKBUF_UNLOCK(&so->so_rcv); sx_xunlock(&xprt->xp_lock); return (FALSE); } if (error) { SOCKBUF_LOCK(&so->so_rcv); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(so, SO_RCV); } SOCKBUF_UNLOCK(&so->so_rcv); xprt_inactive_self(xprt); cd->strm_stat = XPRT_DIED; sx_xunlock(&xprt->xp_lock); return (FALSE); } if (!m) { /* * EOF - the other end has closed the socket. */ xprt_inactive_self(xprt); cd->strm_stat = XPRT_DIED; sx_xunlock(&xprt->xp_lock); return (FALSE); } if (cd->mpending) m_last(cd->mpending)->m_next = m; else cd->mpending = m; } } static bool_t svc_vc_backchannel_recv(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr **addrp, struct mbuf **mp) { struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1; struct ct_data *ct; struct mbuf *m; XDR xdrs; sx_xlock(&xprt->xp_lock); ct = (struct ct_data *)xprt->xp_p2; if (ct == NULL) { sx_xunlock(&xprt->xp_lock); return (FALSE); } mtx_lock(&ct->ct_lock); m = cd->mreq; if (m == NULL) { xprt_inactive_self(xprt); mtx_unlock(&ct->ct_lock); sx_xunlock(&xprt->xp_lock); return (FALSE); } cd->mreq = m->m_nextpkt; mtx_unlock(&ct->ct_lock); sx_xunlock(&xprt->xp_lock); xdrmbuf_create(&xdrs, m, XDR_DECODE); if (! xdr_callmsg(&xdrs, msg)) { XDR_DESTROY(&xdrs); return (FALSE); } *addrp = NULL; *mp = xdrmbuf_getall(&xdrs); XDR_DESTROY(&xdrs); return (TRUE); } static bool_t svc_vc_reply(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr *addr, struct mbuf *m, uint32_t *seq) { XDR xdrs; struct mbuf *mrep; bool_t stat = TRUE; int error, len; /* * Leave space for record mark. */ mrep = m_gethdr(M_WAITOK, MT_DATA); mrep->m_data += sizeof(uint32_t); xdrmbuf_create(&xdrs, mrep, XDR_ENCODE); if (msg->rm_reply.rp_stat == MSG_ACCEPTED && msg->rm_reply.rp_acpt.ar_stat == SUCCESS) { if (!xdr_replymsg(&xdrs, msg)) stat = FALSE; else xdrmbuf_append(&xdrs, m); } else { stat = xdr_replymsg(&xdrs, msg); } if (stat) { m_fixhdr(mrep); /* * Prepend a record marker containing the reply length. */ M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK); len = mrep->m_pkthdr.len; *mtod(mrep, uint32_t *) = htonl(0x80000000 | (len - sizeof(uint32_t))); atomic_add_32(&xprt->xp_snd_cnt, len); error = sosend(xprt->xp_socket, NULL, NULL, mrep, NULL, 0, curthread); if (!error) { atomic_add_rel_32(&xprt->xp_snt_cnt, len); if (seq) *seq = xprt->xp_snd_cnt; stat = TRUE; } else atomic_subtract_32(&xprt->xp_snd_cnt, len); } else { m_freem(mrep); } XDR_DESTROY(&xdrs); return (stat); } static bool_t svc_vc_backchannel_reply(SVCXPRT *xprt, struct rpc_msg *msg, struct sockaddr *addr, struct mbuf *m, uint32_t *seq) { struct ct_data *ct; XDR xdrs; struct mbuf *mrep; bool_t stat = TRUE; int error; /* * Leave space for record mark. */ mrep = m_gethdr(M_WAITOK, MT_DATA); mrep->m_data += sizeof(uint32_t); xdrmbuf_create(&xdrs, mrep, XDR_ENCODE); if (msg->rm_reply.rp_stat == MSG_ACCEPTED && msg->rm_reply.rp_acpt.ar_stat == SUCCESS) { if (!xdr_replymsg(&xdrs, msg)) stat = FALSE; else xdrmbuf_append(&xdrs, m); } else { stat = xdr_replymsg(&xdrs, msg); } if (stat) { m_fixhdr(mrep); /* * Prepend a record marker containing the reply length. */ M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK); *mtod(mrep, uint32_t *) = htonl(0x80000000 | (mrep->m_pkthdr.len - sizeof(uint32_t))); sx_xlock(&xprt->xp_lock); ct = (struct ct_data *)xprt->xp_p2; if (ct != NULL) error = sosend(ct->ct_socket, NULL, NULL, mrep, NULL, 0, curthread); else error = EPIPE; sx_xunlock(&xprt->xp_lock); if (!error) { stat = TRUE; } } else { m_freem(mrep); } XDR_DESTROY(&xdrs); return (stat); } static bool_t svc_vc_null() { return (FALSE); } static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag) { SVCXPRT *xprt = (SVCXPRT *) arg; if (soreadable(xprt->xp_socket)) xprt_active(xprt); return (SU_OK); } static int svc_vc_rendezvous_soupcall(struct socket *head, void *arg, int waitflag) { SVCXPRT *xprt = (SVCXPRT *) arg; if (!TAILQ_EMPTY(&head->sol_comp)) xprt_active(xprt); return (SU_OK); } #if 0 /* * Get the effective UID of the sending process. Used by rpcbind, keyserv * and rpc.yppasswdd on AF_LOCAL. */ int __rpc_get_local_uid(SVCXPRT *transp, uid_t *uid) { int sock, ret; gid_t egid; uid_t euid; struct sockaddr *sa; sock = transp->xp_fd; sa = (struct sockaddr *)transp->xp_rtaddr; if (sa->sa_family == AF_LOCAL) { ret = getpeereid(sock, &euid, &egid); if (ret == 0) *uid = euid; return (ret); } else return (-1); } #endif