diff --git a/sys/arm/ti/ti_pruss.c b/sys/arm/ti/ti_pruss.c index b6895a8a9cef..9f8bcedbdb05 100644 --- a/sys/arm/ti/ti_pruss.c +++ b/sys/arm/ti/ti_pruss.c @@ -1,843 +1,843 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Rui Paulo * Copyright (c) 2017 Manuel Stuehn * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG #define DPRINTF(fmt, ...) do { \ printf("%s: ", __func__); \ printf(fmt, __VA_ARGS__); \ } while (0) #else #define DPRINTF(fmt, ...) 
#endif static d_open_t ti_pruss_irq_open; static d_read_t ti_pruss_irq_read; static d_poll_t ti_pruss_irq_poll; static device_probe_t ti_pruss_probe; static device_attach_t ti_pruss_attach; static device_detach_t ti_pruss_detach; static void ti_pruss_intr(void *); static d_open_t ti_pruss_open; static d_mmap_t ti_pruss_mmap; static void ti_pruss_irq_kqread_detach(struct knote *); static int ti_pruss_irq_kqevent(struct knote *, long); static d_kqfilter_t ti_pruss_irq_kqfilter; static void ti_pruss_privdtor(void *data); #define TI_PRUSS_PRU_IRQS 2 #define TI_PRUSS_HOST_IRQS 8 #define TI_PRUSS_IRQS (TI_PRUSS_HOST_IRQS+TI_PRUSS_PRU_IRQS) #define TI_PRUSS_EVENTS 64 #define NOT_SET_STR "NONE" #define TI_TS_ARRAY 16 struct ctl { size_t cnt; size_t idx; }; struct ts_ring_buf { struct ctl ctl; uint64_t ts[TI_TS_ARRAY]; }; struct ti_pruss_irqsc { struct mtx sc_mtx; struct cdev *sc_pdev; struct selinfo sc_selinfo; int8_t channel; int8_t last; int8_t event; bool enable; struct ts_ring_buf tstamps; }; static struct cdevsw ti_pruss_cdevirq = { .d_version = D_VERSION, .d_name = "ti_pruss_irq", .d_open = ti_pruss_irq_open, .d_read = ti_pruss_irq_read, .d_poll = ti_pruss_irq_poll, .d_kqfilter = ti_pruss_irq_kqfilter, }; struct ti_pruss_softc { struct mtx sc_mtx; struct resource *sc_mem_res; struct resource *sc_irq_res[TI_PRUSS_HOST_IRQS]; void *sc_intr[TI_PRUSS_HOST_IRQS]; struct ti_pruss_irqsc sc_irq_devs[TI_PRUSS_IRQS]; bus_space_tag_t sc_bt; bus_space_handle_t sc_bh; struct cdev *sc_pdev; struct selinfo sc_selinfo; bool sc_glob_irqen; }; static struct cdevsw ti_pruss_cdevsw = { .d_version = D_VERSION, .d_name = "ti_pruss", .d_open = ti_pruss_open, .d_mmap = ti_pruss_mmap, }; static device_method_t ti_pruss_methods[] = { DEVMETHOD(device_probe, ti_pruss_probe), DEVMETHOD(device_attach, ti_pruss_attach), DEVMETHOD(device_detach, ti_pruss_detach), DEVMETHOD_END }; static driver_t ti_pruss_driver = { "ti_pruss", ti_pruss_methods, sizeof(struct ti_pruss_softc) }; DRIVER_MODULE(ti_pruss, simplebus, ti_pruss_driver, 0, 0); MODULE_DEPEND(ti_pruss, ti_sysc, 1, 1, 1); MODULE_DEPEND(ti_pruss, ti_prm, 1, 1, 1); static struct resource_spec ti_pruss_irq_spec[] = { { SYS_RES_IRQ, 0, RF_ACTIVE }, { SYS_RES_IRQ, 1, RF_ACTIVE }, { SYS_RES_IRQ, 2, RF_ACTIVE }, { SYS_RES_IRQ, 3, RF_ACTIVE }, { SYS_RES_IRQ, 4, RF_ACTIVE }, { SYS_RES_IRQ, 5, RF_ACTIVE }, { SYS_RES_IRQ, 6, RF_ACTIVE }, { SYS_RES_IRQ, 7, RF_ACTIVE }, { -1, 0, 0 } }; CTASSERT(TI_PRUSS_HOST_IRQS == nitems(ti_pruss_irq_spec) - 1); static int ti_pruss_irq_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct ctl* irqs; struct ti_pruss_irqsc *sc; sc = dev->si_drv1; irqs = malloc(sizeof(struct ctl), M_DEVBUF, M_WAITOK); irqs->cnt = sc->tstamps.ctl.cnt; irqs->idx = sc->tstamps.ctl.idx; return devfs_set_cdevpriv(irqs, ti_pruss_privdtor); } static void ti_pruss_privdtor(void *data) { free(data, M_DEVBUF); } static int ti_pruss_irq_poll(struct cdev *dev, int events, struct thread *td) { struct ctl* irqs; struct ti_pruss_irqsc *sc; sc = dev->si_drv1; devfs_get_cdevpriv((void**)&irqs); if (events & (POLLIN | POLLRDNORM)) { if (sc->tstamps.ctl.cnt != irqs->cnt) return events & (POLLIN | POLLRDNORM); else selrecord(td, &sc->sc_selinfo); } return 0; } static int ti_pruss_irq_read(struct cdev *cdev, struct uio *uio, int ioflag) { const size_t ts_len = sizeof(uint64_t); struct ti_pruss_irqsc* irq; struct ctl* priv; int error = 0; size_t idx; ssize_t level; irq = cdev->si_drv1; if (uio->uio_resid < ts_len) return (EINVAL); error = 
devfs_get_cdevpriv((void**)&priv); if (error) return (error); mtx_lock(&irq->sc_mtx); if (irq->tstamps.ctl.cnt - priv->cnt > TI_TS_ARRAY) { priv->cnt = irq->tstamps.ctl.cnt; priv->idx = irq->tstamps.ctl.idx; mtx_unlock(&irq->sc_mtx); return (ENXIO); } do { idx = priv->idx; level = irq->tstamps.ctl.idx - idx; if (level < 0) level += TI_TS_ARRAY; if (level == 0) { if (ioflag & O_NONBLOCK) { mtx_unlock(&irq->sc_mtx); return (EWOULDBLOCK); } error = msleep(irq, &irq->sc_mtx, PCATCH | PDROP, "pruirq", 0); if (error) return error; mtx_lock(&irq->sc_mtx); } }while(level == 0); mtx_unlock(&irq->sc_mtx); error = uiomove(&irq->tstamps.ts[idx], ts_len, uio); if (++idx == TI_TS_ARRAY) idx = 0; priv->idx = idx; atomic_add_32(&priv->cnt, 1); return (error); } static struct ti_pruss_irq_arg { int irq; struct ti_pruss_softc *sc; } ti_pruss_irq_args[TI_PRUSS_IRQS]; static __inline uint32_t ti_pruss_reg_read(struct ti_pruss_softc *sc, uint32_t reg) { return (bus_space_read_4(sc->sc_bt, sc->sc_bh, reg)); } static __inline void ti_pruss_reg_write(struct ti_pruss_softc *sc, uint32_t reg, uint32_t val) { bus_space_write_4(sc->sc_bt, sc->sc_bh, reg, val); } static __inline void ti_pruss_interrupts_clear(struct ti_pruss_softc *sc) { /* disable global interrupt */ ti_pruss_reg_write(sc, PRUSS_INTC_GER, 0 ); /* clear all events */ ti_pruss_reg_write(sc, PRUSS_INTC_SECR0, 0xFFFFFFFF); ti_pruss_reg_write(sc, PRUSS_INTC_SECR1, 0xFFFFFFFF); /* disable all host interrupts */ ti_pruss_reg_write(sc, PRUSS_INTC_HIER, 0); } static __inline int ti_pruss_interrupts_enable(struct ti_pruss_softc *sc, int8_t irq, bool enable) { if (enable && ((sc->sc_irq_devs[irq].channel == -1) || (sc->sc_irq_devs[irq].event== -1))) { device_printf( sc->sc_pdev->si_drv1, "Interrupt chain not fully configured, not possible to enable\n" ); return (EINVAL); } sc->sc_irq_devs[irq].enable = enable; if (sc->sc_irq_devs[irq].sc_pdev) { destroy_dev(sc->sc_irq_devs[irq].sc_pdev); sc->sc_irq_devs[irq].sc_pdev = NULL; } if (enable) { sc->sc_irq_devs[irq].sc_pdev = make_dev(&ti_pruss_cdevirq, 0, UID_ROOT, GID_WHEEL, 0600, "pruss%d.irq%d", device_get_unit(sc->sc_pdev->si_drv1), irq); sc->sc_irq_devs[irq].sc_pdev->si_drv1 = &sc->sc_irq_devs[irq]; sc->sc_irq_devs[irq].tstamps.ctl.idx = 0; } uint32_t reg = enable ? PRUSS_INTC_HIEISR : PRUSS_INTC_HIDISR; ti_pruss_reg_write(sc, reg, sc->sc_irq_devs[irq].channel); reg = enable ? 
PRUSS_INTC_EISR : PRUSS_INTC_EICR; ti_pruss_reg_write(sc, reg, sc->sc_irq_devs[irq].event ); return (0); } static __inline void ti_pruss_map_write(struct ti_pruss_softc *sc, uint32_t basereg, uint8_t index, uint8_t content) { const size_t regadr = basereg + index & ~0x03; const size_t bitpos = (index & 0x03) * 8; uint32_t rmw = ti_pruss_reg_read(sc, regadr); rmw = (rmw & ~( 0xF << bitpos)) | ( (content & 0xF) << bitpos); ti_pruss_reg_write(sc, regadr, rmw); } static int ti_pruss_event_map( SYSCTL_HANDLER_ARGS ) { struct ti_pruss_softc *sc; const int8_t irq = arg2; int err; char event[sizeof(NOT_SET_STR)]; sc = arg1; if(sc->sc_irq_devs[irq].event == -1) bcopy(NOT_SET_STR, event, sizeof(event)); else snprintf(event, sizeof(event), "%d", sc->sc_irq_devs[irq].event); err = sysctl_handle_string(oidp, event, sizeof(event), req); if(err != 0) return (err); if (req->newptr) { // write event if (strcmp(NOT_SET_STR, event) == 0) { ti_pruss_interrupts_enable(sc, irq, false); sc->sc_irq_devs[irq].event = -1; } else { if (sc->sc_irq_devs[irq].channel == -1) { device_printf( sc->sc_pdev->si_drv1, "corresponding channel not configured\n"); return (ENXIO); } const int8_t channelnr = sc->sc_irq_devs[irq].channel; const int8_t eventnr = strtol( event, NULL, 10 ); // TODO: check if strol is valid if (eventnr > TI_PRUSS_EVENTS || eventnr < 0) { device_printf( sc->sc_pdev->si_drv1, "Event number %d not valid (0 - %d)", channelnr, TI_PRUSS_EVENTS -1); return (EINVAL); } sc->sc_irq_devs[irq].channel = channelnr; sc->sc_irq_devs[irq].event = eventnr; // event[nr] <= channel ti_pruss_map_write(sc, PRUSS_INTC_CMR_BASE, eventnr, channelnr); } } return (err); } static int ti_pruss_channel_map(SYSCTL_HANDLER_ARGS) { struct ti_pruss_softc *sc; int err; char channel[sizeof(NOT_SET_STR)]; const int8_t irq = arg2; sc = arg1; if (sc->sc_irq_devs[irq].channel == -1) bcopy(NOT_SET_STR, channel, sizeof(channel)); else snprintf(channel, sizeof(channel), "%d", sc->sc_irq_devs[irq].channel); err = sysctl_handle_string(oidp, channel, sizeof(channel), req); if (err != 0) return (err); if (req->newptr) { // write event if (strcmp(NOT_SET_STR, channel) == 0) { ti_pruss_interrupts_enable(sc, irq, false); ti_pruss_reg_write(sc, PRUSS_INTC_HIDISR, sc->sc_irq_devs[irq].channel); sc->sc_irq_devs[irq].channel = -1; } else { const int8_t channelnr = strtol(channel, NULL, 10); // TODO: check if strol is valid if (channelnr > TI_PRUSS_IRQS || channelnr < 0) { device_printf(sc->sc_pdev->si_drv1, "Channel number %d not valid (0 - %d)", channelnr, TI_PRUSS_IRQS-1); return (EINVAL); } sc->sc_irq_devs[irq].channel = channelnr; sc->sc_irq_devs[irq].last = -1; // channel[nr] <= irqnr ti_pruss_map_write(sc, PRUSS_INTC_HMR_BASE, irq, channelnr); } } return (err); } static int ti_pruss_interrupt_enable(SYSCTL_HANDLER_ARGS) { struct ti_pruss_softc *sc; int err; bool irqenable; const int8_t irq = arg2; sc = arg1; irqenable = sc->sc_irq_devs[arg2].enable; err = sysctl_handle_bool(oidp, &irqenable, arg2, req); if (err != 0) return (err); if (req->newptr) // write enable return ti_pruss_interrupts_enable(sc, irq, irqenable); return (err); } static int ti_pruss_global_interrupt_enable(SYSCTL_HANDLER_ARGS) { struct ti_pruss_softc *sc; int err; bool glob_irqen; sc = arg1; glob_irqen = sc->sc_glob_irqen; err = sysctl_handle_bool(oidp, &glob_irqen, arg2, req); if (err != 0) return (err); if (req->newptr) { sc->sc_glob_irqen = glob_irqen; ti_pruss_reg_write(sc, PRUSS_INTC_GER, glob_irqen); } return (err); } static int ti_pruss_probe(device_t dev) { if 
(!ofw_bus_status_okay(dev)) return (ENXIO); if (ofw_bus_is_compatible(dev, "ti,pruss-v1") || ofw_bus_is_compatible(dev, "ti,pruss-v2")) { device_set_desc(dev, "TI Programmable Realtime Unit Subsystem"); return (BUS_PROBE_DEFAULT); } return (ENXIO); } static int ti_pruss_attach(device_t dev) { struct ti_pruss_softc *sc; int rid, i, err, ncells; phandle_t node; clk_t l3_gclk, pruss_ocp_gclk; phandle_t ti_prm_ref, *cells; device_t ti_prm_dev; rid = 0; sc = device_get_softc(dev); node = ofw_bus_get_node(device_get_parent(dev)); if (node <= 0) { device_printf(dev, "Cant get ofw node\n"); return (ENXIO); } /* * Follow activate pattern from sys/arm/ti/am335x/am335x_prcm.c * by Damjan Marion */ /* Set MODULEMODE to ENABLE(2) */ /* Wait for MODULEMODE to become ENABLE(2) */ if (ti_sysc_clock_enable(device_get_parent(dev)) != 0) { device_printf(dev, "Could not enable PRUSS clock\n"); return (ENXIO); } /* Set CLKTRCTRL to SW_WKUP(2) */ /* Wait for the 200 MHz OCP clock to become active */ /* Wait for the 200 MHz IEP clock to become active */ /* Wait for the 192 MHz UART clock to become active */ /* * At the moment there is no reference to CM_PER_PRU_ICSS_CLKSTCTRL@140 * in the devicetree. The register reset state are SW_WKUP(2) as default * so at the moment ignore setting this register. */ /* Select L3F as OCP clock */ /* Get the clock and set the parent */ err = clk_get_by_name(dev, "l3_gclk", &l3_gclk); if (err) { device_printf(dev, "Cant get l3_gclk err %d\n", err); return (ENXIO); } err = clk_get_by_name(dev, "pruss_ocp_gclk@530", &pruss_ocp_gclk); if (err) { device_printf(dev, "Cant get pruss_ocp_gclk@530 err %d\n", err); return (ENXIO); } err = clk_set_parent_by_clk(pruss_ocp_gclk, l3_gclk); if (err) { device_printf(dev, "Cant set pruss_ocp_gclk parent to l3_gclk err %d\n", err); return (ENXIO); } /* Clear the RESET bit */ /* Find the ti_prm */ /* #reset-cells should not been used in this way but... 
*/ err = ofw_bus_parse_xref_list_alloc(node, "resets", "#reset-cells", 0, &ti_prm_ref, &ncells, &cells); OF_prop_free(cells); if (err) { device_printf(dev, "Cant fetch \"resets\" reference %x\n", err); return (ENXIO); } ti_prm_dev = OF_device_from_xref(ti_prm_ref); if (ti_prm_dev == NULL) { device_printf(dev, "Cant get device from \"resets\"\n"); return (ENXIO); } err = ti_prm_reset(ti_prm_dev); if (err) { device_printf(dev, "ti_prm_reset failed %d\n", err); return (ENXIO); } /* End of clock activation */ mtx_init(&sc->sc_mtx, "TI PRUSS", NULL, MTX_DEF); sc->sc_mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (sc->sc_mem_res == NULL) { device_printf(dev, "could not allocate memory resource\n"); return (ENXIO); } struct sysctl_ctx_list *clist = device_get_sysctl_ctx(dev); if (!clist) return (EINVAL); struct sysctl_oid *poid; poid = device_get_sysctl_tree( dev ); if (!poid) return (EINVAL); sc->sc_glob_irqen = false; struct sysctl_oid *irq_root = SYSCTL_ADD_NODE(clist, SYSCTL_CHILDREN(poid), OID_AUTO, "irq", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "PRUSS Host Interrupts"); SYSCTL_ADD_PROC(clist, SYSCTL_CHILDREN(poid), OID_AUTO, "global_interrupt_enable", CTLFLAG_RW | CTLTYPE_U8 | CTLFLAG_NEEDGIANT, sc, 0, ti_pruss_global_interrupt_enable, "CU", "Global interrupt enable"); sc->sc_bt = rman_get_bustag(sc->sc_mem_res); sc->sc_bh = rman_get_bushandle(sc->sc_mem_res); if (bus_alloc_resources(dev, ti_pruss_irq_spec, sc->sc_irq_res) != 0) { device_printf(dev, "could not allocate interrupt resource\n"); ti_pruss_detach(dev); return (ENXIO); } ti_pruss_interrupts_clear(sc); for (i = 0; i < TI_PRUSS_IRQS; i++) { char name[8]; snprintf(name, sizeof(name), "%d", i); struct sysctl_oid *irq_nodes = SYSCTL_ADD_NODE(clist, SYSCTL_CHILDREN(irq_root), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "PRUSS Interrupts"); SYSCTL_ADD_PROC(clist, SYSCTL_CHILDREN(irq_nodes), OID_AUTO, "channel", CTLFLAG_RW | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, sc, i, ti_pruss_channel_map, "A", "Channel attached to this irq"); SYSCTL_ADD_PROC(clist, SYSCTL_CHILDREN(irq_nodes), OID_AUTO, "event", CTLFLAG_RW | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, sc, i, ti_pruss_event_map, "A", "Event attached to this irq"); SYSCTL_ADD_PROC(clist, SYSCTL_CHILDREN(irq_nodes), OID_AUTO, "enable", CTLFLAG_RW | CTLTYPE_U8 | CTLFLAG_NEEDGIANT, sc, i, ti_pruss_interrupt_enable, "CU", "Enable/Disable interrupt"); sc->sc_irq_devs[i].event = -1; sc->sc_irq_devs[i].channel = -1; sc->sc_irq_devs[i].tstamps.ctl.idx = 0; if (i < TI_PRUSS_HOST_IRQS) { ti_pruss_irq_args[i].irq = i; ti_pruss_irq_args[i].sc = sc; if (bus_setup_intr(dev, sc->sc_irq_res[i], INTR_MPSAFE | INTR_TYPE_MISC, NULL, ti_pruss_intr, &ti_pruss_irq_args[i], &sc->sc_intr[i]) != 0) { device_printf(dev, "unable to setup the interrupt handler\n"); ti_pruss_detach(dev); return (ENXIO); } mtx_init(&sc->sc_irq_devs[i].sc_mtx, "TI PRUSS IRQ", NULL, MTX_DEF); knlist_init_mtx(&sc->sc_irq_devs[i].sc_selinfo.si_note, &sc->sc_irq_devs[i].sc_mtx); } } if (ti_pruss_reg_read(sc, PRUSS_AM33XX_INTC) == PRUSS_AM33XX_REV) device_printf(dev, "AM33xx PRU-ICSS\n"); sc->sc_pdev = make_dev(&ti_pruss_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "pruss%d", device_get_unit(dev)); sc->sc_pdev->si_drv1 = dev; /* Acc. to datasheet always write 1 to polarity registers */ ti_pruss_reg_write(sc, PRUSS_INTC_SIPR0, 0xFFFFFFFF); ti_pruss_reg_write(sc, PRUSS_INTC_SIPR1, 0xFFFFFFFF); /* Acc. 
to datasheet always write 0 to event type registers */ ti_pruss_reg_write(sc, PRUSS_INTC_SITR0, 0); ti_pruss_reg_write(sc, PRUSS_INTC_SITR1, 0); return (0); } static int ti_pruss_detach(device_t dev) { struct ti_pruss_softc *sc = device_get_softc(dev); ti_pruss_interrupts_clear(sc); for (int i = 0; i < TI_PRUSS_HOST_IRQS; i++) { ti_pruss_interrupts_enable( sc, i, false ); if (sc->sc_intr[i]) bus_teardown_intr(dev, sc->sc_irq_res[i], sc->sc_intr[i]); if (sc->sc_irq_res[i]) bus_release_resource(dev, SYS_RES_IRQ, rman_get_rid(sc->sc_irq_res[i]), sc->sc_irq_res[i]); knlist_clear(&sc->sc_irq_devs[i].sc_selinfo.si_note, 0); mtx_lock(&sc->sc_irq_devs[i].sc_mtx); if (!knlist_empty(&sc->sc_irq_devs[i].sc_selinfo.si_note)) printf("IRQ %d KQueue not empty!\n", i ); mtx_unlock(&sc->sc_irq_devs[i].sc_mtx); knlist_destroy(&sc->sc_irq_devs[i].sc_selinfo.si_note); mtx_destroy(&sc->sc_irq_devs[i].sc_mtx); } mtx_destroy(&sc->sc_mtx); if (sc->sc_mem_res) bus_release_resource(dev, SYS_RES_MEMORY, rman_get_rid(sc->sc_mem_res), sc->sc_mem_res); if (sc->sc_pdev) destroy_dev(sc->sc_pdev); return (0); } static void ti_pruss_intr(void *arg) { int val; struct ti_pruss_irq_arg *iap = arg; struct ti_pruss_softc *sc = iap->sc; /* * Interrupts pr1_host_intr[0:7] are mapped to * Host-2 to Host-9 of PRU-ICSS IRQ-controller. */ const int pru_int = iap->irq + TI_PRUSS_PRU_IRQS; const int pru_int_mask = (1 << pru_int); const int pru_channel = sc->sc_irq_devs[pru_int].channel; const int pru_event = sc->sc_irq_devs[pru_channel].event; val = ti_pruss_reg_read(sc, PRUSS_INTC_HIER); if (!(val & pru_int_mask)) return; ti_pruss_reg_write(sc, PRUSS_INTC_HIDISR, pru_int); ti_pruss_reg_write(sc, PRUSS_INTC_SICR, pru_event); ti_pruss_reg_write(sc, PRUSS_INTC_HIEISR, pru_int); struct ti_pruss_irqsc* irq = &sc->sc_irq_devs[pru_channel]; size_t wr = irq->tstamps.ctl.idx; struct timespec ts; nanouptime(&ts); irq->tstamps.ts[wr] = ts.tv_sec * 1000000000 + ts.tv_nsec; if (++wr == TI_TS_ARRAY) wr = 0; atomic_add_32(&irq->tstamps.ctl.cnt, 1); irq->tstamps.ctl.idx = wr; KNOTE_UNLOCKED(&irq->sc_selinfo.si_note, pru_int); wakeup(irq); selwakeup(&irq->sc_selinfo); } static int ti_pruss_open(struct cdev *cdev __unused, int oflags __unused, int devtype __unused, struct thread *td __unused) { return (0); } static int ti_pruss_mmap(struct cdev *cdev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { device_t dev = cdev->si_drv1; struct ti_pruss_softc *sc = device_get_softc(dev); if (offset >= rman_get_size(sc->sc_mem_res)) return (ENOSPC); *paddr = rman_get_start(sc->sc_mem_res) + offset; *memattr = VM_MEMATTR_UNCACHEABLE; return (0); } -static struct filterops ti_pruss_kq_read = { +static const struct filterops ti_pruss_kq_read = { .f_isfd = 1, .f_detach = ti_pruss_irq_kqread_detach, .f_event = ti_pruss_irq_kqevent, }; static void ti_pruss_irq_kqread_detach(struct knote *kn) { struct ti_pruss_irqsc *sc = kn->kn_hook; knlist_remove(&sc->sc_selinfo.si_note, kn, 0); } static int ti_pruss_irq_kqevent(struct knote *kn, long hint) { struct ti_pruss_irqsc* irq_sc; int notify; irq_sc = kn->kn_hook; if (hint > 0) kn->kn_data = hint - 2; if (hint > 0 || irq_sc->last > 0) notify = 1; else notify = 0; irq_sc->last = hint; return (notify); } static int ti_pruss_irq_kqfilter(struct cdev *cdev, struct knote *kn) { struct ti_pruss_irqsc *sc = cdev->si_drv1; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_hook = sc; kn->kn_fop = &ti_pruss_kq_read; knlist_add(&sc->sc_selinfo.si_note, kn, 0); break; default: return (EINVAL); } return (0); } 
diff --git a/sys/cam/scsi/scsi_pass.c b/sys/cam/scsi/scsi_pass.c index ff48bed30e68..72035e1e0d0e 100644 --- a/sys/cam/scsi/scsi_pass.c +++ b/sys/cam/scsi/scsi_pass.c @@ -1,2274 +1,2274 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1997, 1998, 2000 Justin T. Gibbs. * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef enum { PASS_FLAG_OPEN = 0x01, PASS_FLAG_LOCKED = 0x02, PASS_FLAG_INVALID = 0x04, PASS_FLAG_INITIAL_PHYSPATH = 0x08, PASS_FLAG_ZONE_INPROG = 0x10, PASS_FLAG_ZONE_VALID = 0x20, PASS_FLAG_UNMAPPED_CAPABLE = 0x40, PASS_FLAG_ABANDONED_REF_SET = 0x80 } pass_flags; typedef enum { PASS_STATE_NORMAL } pass_state; typedef enum { PASS_CCB_BUFFER_IO, PASS_CCB_QUEUED_IO } pass_ccb_types; #define ccb_type ppriv_field0 #define ccb_ioreq ppriv_ptr1 /* * The maximum number of memory segments we preallocate. 
*/ #define PASS_MAX_SEGS 16 typedef enum { PASS_IO_NONE = 0x00, PASS_IO_USER_SEG_MALLOC = 0x01, PASS_IO_KERN_SEG_MALLOC = 0x02, PASS_IO_ABANDONED = 0x04 } pass_io_flags; struct pass_io_req { union ccb ccb; union ccb *alloced_ccb; union ccb *user_ccb_ptr; camq_entry user_periph_links; ccb_ppriv_area user_periph_priv; struct cam_periph_map_info mapinfo; pass_io_flags flags; ccb_flags data_flags; int num_user_segs; bus_dma_segment_t user_segs[PASS_MAX_SEGS]; int num_kern_segs; bus_dma_segment_t kern_segs[PASS_MAX_SEGS]; bus_dma_segment_t *user_segptr; bus_dma_segment_t *kern_segptr; int num_bufs; uint32_t dirs[CAM_PERIPH_MAXMAPS]; uint32_t lengths[CAM_PERIPH_MAXMAPS]; uint8_t *user_bufs[CAM_PERIPH_MAXMAPS]; uint8_t *kern_bufs[CAM_PERIPH_MAXMAPS]; struct bintime start_time; TAILQ_ENTRY(pass_io_req) links; }; struct pass_softc { pass_state state; pass_flags flags; uint8_t pd_type; int open_count; u_int maxio; struct devstat *device_stats; struct cdev *dev; struct cdev *alias_dev; struct task add_physpath_task; struct task shutdown_kqueue_task; struct selinfo read_select; TAILQ_HEAD(, pass_io_req) incoming_queue; TAILQ_HEAD(, pass_io_req) active_queue; TAILQ_HEAD(, pass_io_req) abandoned_queue; TAILQ_HEAD(, pass_io_req) done_queue; struct cam_periph *periph; char zone_name[12]; char io_zone_name[12]; uma_zone_t pass_zone; uma_zone_t pass_io_zone; size_t io_zone_size; }; static d_open_t passopen; static d_close_t passclose; static d_ioctl_t passioctl; static d_ioctl_t passdoioctl; static d_poll_t passpoll; static d_kqfilter_t passkqfilter; static void passreadfiltdetach(struct knote *kn); static int passreadfilt(struct knote *kn, long hint); static periph_init_t passinit; static periph_ctor_t passregister; static periph_oninv_t passoninvalidate; static periph_dtor_t passcleanup; static periph_start_t passstart; static void pass_shutdown_kqueue(void *context, int pending); static void pass_add_physpath(void *context, int pending); static void passasync(void *callback_arg, uint32_t code, struct cam_path *path, void *arg); static void passdone(struct cam_periph *periph, union ccb *done_ccb); static int passcreatezone(struct cam_periph *periph); static void passiocleanup(struct pass_softc *softc, struct pass_io_req *io_req); static int passcopysglist(struct cam_periph *periph, struct pass_io_req *io_req, ccb_flags direction); static int passmemsetup(struct cam_periph *periph, struct pass_io_req *io_req); static int passmemdone(struct cam_periph *periph, struct pass_io_req *io_req); static int passerror(union ccb *ccb, uint32_t cam_flags, uint32_t sense_flags); static int passsendccb(struct cam_periph *periph, union ccb *ccb, union ccb *inccb); static void passflags(union ccb *ccb, uint32_t *cam_flags, uint32_t *sense_flags); static struct periph_driver passdriver = { passinit, "pass", TAILQ_HEAD_INITIALIZER(passdriver.units), /* generation */ 0 }; PERIPHDRIVER_DECLARE(pass, passdriver); static struct cdevsw pass_cdevsw = { .d_version = D_VERSION, .d_flags = D_TRACKCLOSE, .d_open = passopen, .d_close = passclose, .d_ioctl = passioctl, .d_poll = passpoll, .d_kqfilter = passkqfilter, .d_name = "pass", }; -static struct filterops passread_filtops = { +static const struct filterops passread_filtops = { .f_isfd = 1, .f_detach = passreadfiltdetach, .f_event = passreadfilt }; static MALLOC_DEFINE(M_SCSIPASS, "scsi_pass", "scsi passthrough buffers"); static void passinit(void) { cam_status status; /* * Install a global async callback. This callback will * receive async callbacks like "new device found". 
*/ status = xpt_register_async(AC_FOUND_DEVICE, passasync, NULL, NULL); if (status != CAM_REQ_CMP) { printf("pass: Failed to attach master async callback " "due to status 0x%x!\n", status); } } static void passrejectios(struct cam_periph *periph) { struct pass_io_req *io_req, *io_req2; struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; /* * The user can no longer get status for I/O on the done queue, so * clean up all outstanding I/O on the done queue. */ TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) { TAILQ_REMOVE(&softc->done_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } /* * The underlying device is gone, so we can't issue these I/Os. * The devfs node has been shut down, so we can't return status to * the user. Free any I/O left on the incoming queue. */ TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links, io_req2) { TAILQ_REMOVE(&softc->incoming_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } /* * Normally we would put I/Os on the abandoned queue and acquire a * reference when we saw the final close. But, the device went * away and devfs may have moved everything off to deadfs by the * time the I/O done callback is called; as a result, we won't see * any more closes. So, if we have any active I/Os, we need to put * them on the abandoned queue. When the abandoned queue is empty, * we'll release the remaining reference (see below) to the peripheral. */ TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links, io_req2) { TAILQ_REMOVE(&softc->active_queue, io_req, links); io_req->flags |= PASS_IO_ABANDONED; TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req, links); } /* * If we put any I/O on the abandoned queue, acquire a reference. */ if ((!TAILQ_EMPTY(&softc->abandoned_queue)) && ((softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0)) { cam_periph_doacquire(periph); softc->flags |= PASS_FLAG_ABANDONED_REF_SET; } } static void passdevgonecb(void *arg) { struct cam_periph *periph; struct mtx *mtx; struct pass_softc *softc; int i; periph = (struct cam_periph *)arg; mtx = cam_periph_mtx(periph); mtx_lock(mtx); softc = (struct pass_softc *)periph->softc; KASSERT(softc->open_count >= 0, ("Negative open count %d", softc->open_count)); /* * When we get this callback, we will get no more close calls from * devfs. So if we have any dangling opens, we need to release the * reference held for that particular context. */ for (i = 0; i < softc->open_count; i++) cam_periph_release_locked(periph); softc->open_count = 0; /* * Release the reference held for the device node, it is gone now. * Accordingly, inform all queued I/Os of their fate. */ cam_periph_release_locked(periph); passrejectios(periph); /* * We reference the SIM lock directly here, instead of using * cam_periph_unlock(). The reason is that the final call to * cam_periph_release_locked() above could result in the periph * getting freed. If that is the case, dereferencing the periph * with a cam_periph_unlock() call would cause a page fault. */ mtx_unlock(mtx); /* * We have to remove our kqueue context from a thread because it * may sleep. It would be nice if we could get a callback from * kqueue when it is done cleaning up resources. */ taskqueue_enqueue(taskqueue_thread, &softc->shutdown_kqueue_task); } static void passoninvalidate(struct cam_periph *periph) { struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; /* * De-register any async callbacks. 
*/ xpt_register_async(0, passasync, periph, periph->path); softc->flags |= PASS_FLAG_INVALID; /* * Tell devfs this device has gone away, and ask for a callback * when it has cleaned up its state. */ destroy_dev_sched_cb(softc->dev, passdevgonecb, periph); } static void passcleanup(struct cam_periph *periph) { struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; cam_periph_assert(periph, MA_OWNED); KASSERT(TAILQ_EMPTY(&softc->active_queue), ("%s called when there are commands on the active queue!\n", __func__)); KASSERT(TAILQ_EMPTY(&softc->abandoned_queue), ("%s called when there are commands on the abandoned queue!\n", __func__)); KASSERT(TAILQ_EMPTY(&softc->incoming_queue), ("%s called when there are commands on the incoming queue!\n", __func__)); KASSERT(TAILQ_EMPTY(&softc->done_queue), ("%s called when there are commands on the done queue!\n", __func__)); devstat_remove_entry(softc->device_stats); cam_periph_unlock(periph); /* * We call taskqueue_drain() for the physpath task to make sure it * is complete. We drop the lock because this can potentially * sleep. XXX KDM that is bad. Need a way to get a callback when * a taskqueue is drained. * * Note that we don't drain the kqueue shutdown task queue. This * is because we hold a reference on the periph for kqueue, and * release that reference from the kqueue shutdown task queue. So * we cannot come into this routine unless we've released that * reference. Also, because that could be the last reference, we * could be called from the cam_periph_release() call in * pass_shutdown_kqueue(). In that case, the taskqueue_drain() * would deadlock. It would be preferable if we had a way to * get a callback when a taskqueue is done. */ taskqueue_drain(taskqueue_thread, &softc->add_physpath_task); /* * It should be safe to destroy the zones from here, because all * of the references to this peripheral have been freed, and all * I/O has been terminated and freed. We check the zones for NULL * because they may not have been allocated yet if the device went * away before any asynchronous I/O has been issued. */ if (softc->pass_zone != NULL) uma_zdestroy(softc->pass_zone); if (softc->pass_io_zone != NULL) uma_zdestroy(softc->pass_io_zone); cam_periph_lock(periph); free(softc, M_DEVBUF); } static void pass_shutdown_kqueue(void *context, int pending) { struct cam_periph *periph; struct pass_softc *softc; periph = context; softc = periph->softc; knlist_clear(&softc->read_select.si_note, /*is_locked*/ 0); knlist_destroy(&softc->read_select.si_note); /* * Release the reference we held for kqueue. */ cam_periph_release(periph); } static void pass_add_physpath(void *context, int pending) { struct cam_periph *periph; struct pass_softc *softc; struct mtx *mtx; char *physpath; /* * If we have one, create a devfs alias for our * physical path. */ periph = context; softc = periph->softc; physpath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK); mtx = cam_periph_mtx(periph); mtx_lock(mtx); if (periph->flags & CAM_PERIPH_INVALID) goto out; if (xpt_getattr(physpath, MAXPATHLEN, "GEOM::physpath", periph->path) == 0 && strlen(physpath) != 0) { mtx_unlock(mtx); make_dev_physpath_alias(MAKEDEV_WAITOK | MAKEDEV_CHECKNAME, &softc->alias_dev, softc->dev, softc->alias_dev, physpath); mtx_lock(mtx); } out: /* * Now that we've made our alias, we no longer have to have a * reference to the device. 
*/ if ((softc->flags & PASS_FLAG_INITIAL_PHYSPATH) == 0) softc->flags |= PASS_FLAG_INITIAL_PHYSPATH; /* * We always acquire a reference to the periph before queueing this * task queue function, so it won't go away before we run. */ while (pending-- > 0) cam_periph_release_locked(periph); mtx_unlock(mtx); free(physpath, M_DEVBUF); } static void passasync(void *callback_arg, uint32_t code, struct cam_path *path, void *arg) { struct cam_periph *periph; periph = (struct cam_periph *)callback_arg; switch (code) { case AC_FOUND_DEVICE: { struct ccb_getdev *cgd; cam_status status; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) break; /* * Allocate a peripheral instance for * this device and start the probe * process. */ status = cam_periph_alloc(passregister, passoninvalidate, passcleanup, passstart, "pass", CAM_PERIPH_BIO, path, passasync, AC_FOUND_DEVICE, cgd); if (status != CAM_REQ_CMP && status != CAM_REQ_INPROG) { const struct cam_status_entry *entry; entry = cam_fetch_status_entry(status); printf("passasync: Unable to attach new device " "due to status %#x: %s\n", status, entry ? entry->status_text : "Unknown"); } break; } case AC_ADVINFO_CHANGED: { uintptr_t buftype; buftype = (uintptr_t)arg; if (buftype == CDAI_TYPE_PHYS_PATH) { struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; /* * Acquire a reference to the periph before we * start the taskqueue, so that we don't run into * a situation where the periph goes away before * the task queue has a chance to run. */ if (cam_periph_acquire(periph) != 0) break; taskqueue_enqueue(taskqueue_thread, &softc->add_physpath_task); } break; } default: cam_periph_async(periph, code, path, arg); break; } } static cam_status passregister(struct cam_periph *periph, void *arg) { struct pass_softc *softc; struct ccb_getdev *cgd; struct ccb_pathinq cpi; struct make_dev_args args; int error, no_tags; cgd = (struct ccb_getdev *)arg; if (cgd == NULL) { printf("%s: no getdev CCB, can't register device\n", __func__); return(CAM_REQ_CMP_ERR); } softc = (struct pass_softc *)malloc(sizeof(*softc), M_DEVBUF, M_NOWAIT); if (softc == NULL) { printf("%s: Unable to probe new device. " "Unable to allocate softc\n", __func__); return(CAM_REQ_CMP_ERR); } bzero(softc, sizeof(*softc)); softc->state = PASS_STATE_NORMAL; if (cgd->protocol == PROTO_SCSI || cgd->protocol == PROTO_ATAPI) softc->pd_type = SID_TYPE(&cgd->inq_data); else if (cgd->protocol == PROTO_SATAPM) softc->pd_type = T_ENCLOSURE; else softc->pd_type = T_DIRECT; periph->softc = softc; softc->periph = periph; TAILQ_INIT(&softc->incoming_queue); TAILQ_INIT(&softc->active_queue); TAILQ_INIT(&softc->abandoned_queue); TAILQ_INIT(&softc->done_queue); snprintf(softc->zone_name, sizeof(softc->zone_name), "%s%d", periph->periph_name, periph->unit_number); snprintf(softc->io_zone_name, sizeof(softc->io_zone_name), "%s%dIO", periph->periph_name, periph->unit_number); softc->io_zone_size = maxphys; knlist_init_mtx(&softc->read_select.si_note, cam_periph_mtx(periph)); xpt_path_inq(&cpi, periph->path); if (cpi.maxio == 0) softc->maxio = DFLTPHYS; /* traditional default */ else if (cpi.maxio > maxphys) softc->maxio = maxphys; /* for safety */ else softc->maxio = cpi.maxio; /* real value */ if (cpi.hba_misc & PIM_UNMAPPED) softc->flags |= PASS_FLAG_UNMAPPED_CAPABLE; /* * We pass in 0 for a blocksize, since we don't * know what the blocksize of this device is, if * it even has a blocksize. 
*/ cam_periph_unlock(periph); no_tags = (cgd->inq_data.flags & SID_CmdQue) == 0; softc->device_stats = devstat_new_entry("pass", periph->unit_number, 0, DEVSTAT_NO_BLOCKSIZE | (no_tags ? DEVSTAT_NO_ORDERED_TAGS : 0), softc->pd_type | XPORT_DEVSTAT_TYPE(cpi.transport) | DEVSTAT_TYPE_PASS, DEVSTAT_PRIORITY_PASS); /* * Initialize the taskqueue handler for shutting down kqueue. */ TASK_INIT(&softc->shutdown_kqueue_task, /*priority*/ 0, pass_shutdown_kqueue, periph); /* * Acquire a reference to the periph that we can release once we've * cleaned up the kqueue. */ if (cam_periph_acquire(periph) != 0) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } /* * Acquire a reference to the periph before we create the devfs * instance for it. We'll release this reference once the devfs * instance has been freed. */ if (cam_periph_acquire(periph) != 0) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } /* Register the device */ make_dev_args_init(&args); args.mda_devsw = &pass_cdevsw; args.mda_unit = periph->unit_number; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0600; args.mda_si_drv1 = periph; args.mda_flags = MAKEDEV_NOWAIT; error = make_dev_s(&args, &softc->dev, "%s%d", periph->periph_name, periph->unit_number); if (error != 0) { cam_periph_lock(periph); cam_periph_release_locked(periph); return (CAM_REQ_CMP_ERR); } /* * Hold a reference to the periph before we create the physical * path alias so it can't go away. */ if (cam_periph_acquire(periph) != 0) { xpt_print(periph->path, "%s: lost periph during " "registration!\n", __func__); cam_periph_lock(periph); return (CAM_REQ_CMP_ERR); } cam_periph_lock(periph); TASK_INIT(&softc->add_physpath_task, /*priority*/0, pass_add_physpath, periph); /* * See if physical path information is already available. */ taskqueue_enqueue(taskqueue_thread, &softc->add_physpath_task); /* * Add an async callback so that we get notified if * this device goes away or its physical path * (stored in the advanced info data of the EDT) has * changed. */ xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED, passasync, periph, periph->path); if (bootverbose) xpt_announce_periph(periph, NULL); return(CAM_REQ_CMP); } static int passopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct cam_periph *periph; struct pass_softc *softc; int error; periph = (struct cam_periph *)dev->si_drv1; if (cam_periph_acquire(periph) != 0) return (ENXIO); cam_periph_lock(periph); softc = (struct pass_softc *)periph->softc; if (softc->flags & PASS_FLAG_INVALID) { cam_periph_release_locked(periph); cam_periph_unlock(periph); return(ENXIO); } /* * Don't allow access when we're running at a high securelevel. */ error = securelevel_gt(td->td_ucred, 1); if (error) { cam_periph_release_locked(periph); cam_periph_unlock(periph); return(error); } /* * Only allow read-write access. */ if (((flags & FWRITE) == 0) || ((flags & FREAD) == 0)) { cam_periph_release_locked(periph); cam_periph_unlock(periph); return(EPERM); } /* * We don't allow nonblocking access. 
*/ if ((flags & O_NONBLOCK) != 0) { xpt_print(periph->path, "can't do nonblocking access\n"); cam_periph_release_locked(periph); cam_periph_unlock(periph); return(EINVAL); } softc->open_count++; cam_periph_unlock(periph); return (error); } static int passclose(struct cdev *dev, int flag, int fmt, struct thread *td) { struct cam_periph *periph; struct pass_softc *softc; struct mtx *mtx; periph = (struct cam_periph *)dev->si_drv1; mtx = cam_periph_mtx(periph); mtx_lock(mtx); softc = periph->softc; softc->open_count--; if (softc->open_count == 0) { struct pass_io_req *io_req, *io_req2; TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) { TAILQ_REMOVE(&softc->done_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links, io_req2) { TAILQ_REMOVE(&softc->incoming_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } /* * If there are any active I/Os, we need to forcibly acquire a * reference to the peripheral so that we don't go away * before they complete. We'll release the reference when * the abandoned queue is empty. */ io_req = TAILQ_FIRST(&softc->active_queue); if ((io_req != NULL) && (softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0) { cam_periph_doacquire(periph); softc->flags |= PASS_FLAG_ABANDONED_REF_SET; } /* * Since the I/O in the active queue is not under our * control, just set a flag so that we can clean it up when * it completes and put it on the abandoned queue. This * will prevent our sending spurious completions in the * event that the device is opened again before these I/Os * complete. */ TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links, io_req2) { TAILQ_REMOVE(&softc->active_queue, io_req, links); io_req->flags |= PASS_IO_ABANDONED; TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req, links); } } cam_periph_release_locked(periph); /* * We reference the lock directly here, instead of using * cam_periph_unlock(). The reason is that the call to * cam_periph_release_locked() above could result in the periph * getting freed. If that is the case, dereferencing the periph * with a cam_periph_unlock() call would cause a page fault. * * cam_periph_release() avoids this problem using the same method, * but we're manually acquiring and dropping the lock here to * protect the open count and avoid another lock acquisition and * release. */ mtx_unlock(mtx); return (0); } static void passstart(struct cam_periph *periph, union ccb *start_ccb) { struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; switch (softc->state) { case PASS_STATE_NORMAL: { struct pass_io_req *io_req; /* * Check for any queued I/O requests that require an * allocated slot. */ io_req = TAILQ_FIRST(&softc->incoming_queue); if (io_req == NULL) { xpt_release_ccb(start_ccb); break; } TAILQ_REMOVE(&softc->incoming_queue, io_req, links); TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links); /* * Merge the user's CCB into the allocated CCB. */ xpt_merge_ccb(start_ccb, &io_req->ccb); start_ccb->ccb_h.ccb_type = PASS_CCB_QUEUED_IO; start_ccb->ccb_h.ccb_ioreq = io_req; start_ccb->ccb_h.cbfcnp = passdone; io_req->alloced_ccb = start_ccb; binuptime(&io_req->start_time); devstat_start_transaction(softc->device_stats, &io_req->start_time); xpt_action(start_ccb); /* * If we have any more I/O waiting, schedule ourselves again. 
*/ if (!TAILQ_EMPTY(&softc->incoming_queue)) xpt_schedule(periph, CAM_PRIORITY_NORMAL); break; } default: break; } } static void passdone(struct cam_periph *periph, union ccb *done_ccb) { struct pass_softc *softc; struct ccb_scsiio *csio; softc = (struct pass_softc *)periph->softc; cam_periph_assert(periph, MA_OWNED); csio = &done_ccb->csio; switch (csio->ccb_h.ccb_type) { case PASS_CCB_QUEUED_IO: { struct pass_io_req *io_req; io_req = done_ccb->ccb_h.ccb_ioreq; #if 0 xpt_print(periph->path, "%s: called for user CCB %p\n", __func__, io_req->user_ccb_ptr); #endif if (((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) && ((io_req->flags & PASS_IO_ABANDONED) == 0)) { int error; uint32_t cam_flags, sense_flags; passflags(done_ccb, &cam_flags, &sense_flags); error = passerror(done_ccb, cam_flags, sense_flags); if (error == ERESTART) { KASSERT(((sense_flags & SF_NO_RETRY) == 0), ("passerror returned ERESTART with no retry requested\n")); return; } } /* * Copy the allocated CCB contents back to the malloced CCB * so we can give status back to the user when he requests it. */ bcopy(done_ccb, &io_req->ccb, sizeof(*done_ccb)); /* * Log data/transaction completion with devstat(9). */ switch (done_ccb->ccb_h.func_code) { case XPT_SCSI_IO: devstat_end_transaction(softc->device_stats, done_ccb->csio.dxfer_len - done_ccb->csio.resid, done_ccb->csio.tag_action & 0x3, ((done_ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) ? DEVSTAT_NO_DATA : (done_ccb->ccb_h.flags & CAM_DIR_OUT) ? DEVSTAT_WRITE : DEVSTAT_READ, NULL, &io_req->start_time); break; case XPT_ATA_IO: devstat_end_transaction(softc->device_stats, done_ccb->ataio.dxfer_len - done_ccb->ataio.resid, 0, /* Not used in ATA */ ((done_ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) ? DEVSTAT_NO_DATA : (done_ccb->ccb_h.flags & CAM_DIR_OUT) ? DEVSTAT_WRITE : DEVSTAT_READ, NULL, &io_req->start_time); break; case XPT_SMP_IO: /* * XXX KDM this isn't quite right, but there isn't * currently an easy way to represent a bidirectional * transfer in devstat. The only way to do it * and have the byte counts come out right would * mean that we would have to record two * transactions, one for the request and one for the * response. For now, so that we report something, * just treat the entire thing as a read. */ devstat_end_transaction(softc->device_stats, done_ccb->smpio.smp_request_len + done_ccb->smpio.smp_response_len, DEVSTAT_TAG_SIMPLE, DEVSTAT_READ, NULL, &io_req->start_time); break; default: devstat_end_transaction(softc->device_stats, 0, DEVSTAT_TAG_NONE, DEVSTAT_NO_DATA, NULL, &io_req->start_time); break; } /* * In the normal case, take the completed I/O off of the * active queue and put it on the done queue. Notitfy the * user that we have a completed I/O. */ if ((io_req->flags & PASS_IO_ABANDONED) == 0) { TAILQ_REMOVE(&softc->active_queue, io_req, links); TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links); selwakeuppri(&softc->read_select, PRIBIO); KNOTE_LOCKED(&softc->read_select.si_note, 0); } else { /* * In the case of an abandoned I/O (final close * without fetching the I/O), take it off of the * abandoned queue and free it. */ TAILQ_REMOVE(&softc->abandoned_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); /* * Release the done_ccb here, since we may wind up * freeing the peripheral when we decrement the * reference count below. */ xpt_release_ccb(done_ccb); /* * If the abandoned queue is empty, we can release * our reference to the periph since we won't have * any more completions coming. 
*/ if ((TAILQ_EMPTY(&softc->abandoned_queue)) && (softc->flags & PASS_FLAG_ABANDONED_REF_SET)) { softc->flags &= ~PASS_FLAG_ABANDONED_REF_SET; cam_periph_release_locked(periph); } /* * We have already released the CCB, so we can * return. */ return; } break; } } xpt_release_ccb(done_ccb); } static int passcreatezone(struct cam_periph *periph) { struct pass_softc *softc; int error; error = 0; softc = (struct pass_softc *)periph->softc; cam_periph_assert(periph, MA_OWNED); KASSERT(((softc->flags & PASS_FLAG_ZONE_VALID) == 0), ("%s called when the pass(4) zone is valid!\n", __func__)); KASSERT((softc->pass_zone == NULL), ("%s called when the pass(4) zone is allocated!\n", __func__)); if ((softc->flags & PASS_FLAG_ZONE_INPROG) == 0) { /* * We're the first context through, so we need to create * the pass(4) UMA zone for I/O requests. */ softc->flags |= PASS_FLAG_ZONE_INPROG; /* * uma_zcreate() does a blocking (M_WAITOK) allocation, * so we cannot hold a mutex while we call it. */ cam_periph_unlock(periph); softc->pass_zone = uma_zcreate(softc->zone_name, sizeof(struct pass_io_req), NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/ 0); softc->pass_io_zone = uma_zcreate(softc->io_zone_name, softc->io_zone_size, NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/ 0); cam_periph_lock(periph); if ((softc->pass_zone == NULL) || (softc->pass_io_zone == NULL)) { if (softc->pass_zone == NULL) xpt_print(periph->path, "unable to allocate " "IO Req UMA zone\n"); else xpt_print(periph->path, "unable to allocate " "IO UMA zone\n"); softc->flags &= ~PASS_FLAG_ZONE_INPROG; goto bailout; } /* * Set the flags appropriately and notify any other waiters. */ softc->flags &= ~PASS_FLAG_ZONE_INPROG; softc->flags |= PASS_FLAG_ZONE_VALID; wakeup(&softc->pass_zone); } else { /* * In this case, the UMA zone has not yet been created, but * another context is in the process of creating it. We * need to sleep until the creation is either done or has * failed. */ while ((softc->flags & PASS_FLAG_ZONE_INPROG) && ((softc->flags & PASS_FLAG_ZONE_VALID) == 0)) { error = msleep(&softc->pass_zone, cam_periph_mtx(periph), PRIBIO, "paszon", 0); if (error != 0) goto bailout; } /* * If the zone creation failed, no luck for the user. 
*/ if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0){ error = ENOMEM; goto bailout; } } bailout: return (error); } static void passiocleanup(struct pass_softc *softc, struct pass_io_req *io_req) { union ccb *ccb; uint8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; int i, numbufs; ccb = &io_req->ccb; switch (ccb->ccb_h.func_code) { case XPT_DEV_MATCH: numbufs = min(io_req->num_bufs, 2); if (numbufs == 1) { data_ptrs[0] = (uint8_t **)&ccb->cdm.matches; } else { data_ptrs[0] = (uint8_t **)&ccb->cdm.patterns; data_ptrs[1] = (uint8_t **)&ccb->cdm.matches; } break; case XPT_SCSI_IO: case XPT_CONT_TARGET_IO: data_ptrs[0] = &ccb->csio.data_ptr; numbufs = min(io_req->num_bufs, 1); break; case XPT_ATA_IO: data_ptrs[0] = &ccb->ataio.data_ptr; numbufs = min(io_req->num_bufs, 1); break; case XPT_SMP_IO: numbufs = min(io_req->num_bufs, 2); data_ptrs[0] = &ccb->smpio.smp_request; data_ptrs[1] = &ccb->smpio.smp_response; break; case XPT_DEV_ADVINFO: numbufs = min(io_req->num_bufs, 1); data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; break; case XPT_NVME_IO: case XPT_NVME_ADMIN: data_ptrs[0] = &ccb->nvmeio.data_ptr; numbufs = min(io_req->num_bufs, 1); break; default: /* allow ourselves to be swapped once again */ return; break; /* NOTREACHED */ } if (io_req->flags & PASS_IO_USER_SEG_MALLOC) { free(io_req->user_segptr, M_SCSIPASS); io_req->user_segptr = NULL; } /* * We only want to free memory we malloced. */ if (io_req->data_flags == CAM_DATA_VADDR) { for (i = 0; i < io_req->num_bufs; i++) { if (io_req->kern_bufs[i] == NULL) continue; free(io_req->kern_bufs[i], M_SCSIPASS); io_req->kern_bufs[i] = NULL; } } else if (io_req->data_flags == CAM_DATA_SG) { for (i = 0; i < io_req->num_kern_segs; i++) { if ((uint8_t *)(uintptr_t) io_req->kern_segptr[i].ds_addr == NULL) continue; uma_zfree(softc->pass_io_zone, (uint8_t *)(uintptr_t) io_req->kern_segptr[i].ds_addr); io_req->kern_segptr[i].ds_addr = 0; } } if (io_req->flags & PASS_IO_KERN_SEG_MALLOC) { free(io_req->kern_segptr, M_SCSIPASS); io_req->kern_segptr = NULL; } if (io_req->data_flags != CAM_DATA_PADDR) { for (i = 0; i < numbufs; i++) { /* * Restore the user's buffer pointers to their * previous values. 
*/ if (io_req->user_bufs[i] != NULL) *data_ptrs[i] = io_req->user_bufs[i]; } } } static int passcopysglist(struct cam_periph *periph, struct pass_io_req *io_req, ccb_flags direction) { bus_size_t kern_watermark, user_watermark, len_to_copy; bus_dma_segment_t *user_sglist, *kern_sglist; int i, j, error; error = 0; kern_watermark = 0; user_watermark = 0; len_to_copy = 0; user_sglist = io_req->user_segptr; kern_sglist = io_req->kern_segptr; for (i = 0, j = 0; i < io_req->num_user_segs && j < io_req->num_kern_segs;) { uint8_t *user_ptr, *kern_ptr; len_to_copy = min(user_sglist[i].ds_len -user_watermark, kern_sglist[j].ds_len - kern_watermark); user_ptr = (uint8_t *)(uintptr_t)user_sglist[i].ds_addr; user_ptr = user_ptr + user_watermark; kern_ptr = (uint8_t *)(uintptr_t)kern_sglist[j].ds_addr; kern_ptr = kern_ptr + kern_watermark; user_watermark += len_to_copy; kern_watermark += len_to_copy; if (direction == CAM_DIR_IN) { error = copyout(kern_ptr, user_ptr, len_to_copy); if (error != 0) { xpt_print(periph->path, "%s: copyout of %u " "bytes from %p to %p failed with " "error %d\n", __func__, len_to_copy, kern_ptr, user_ptr, error); goto bailout; } } else { error = copyin(user_ptr, kern_ptr, len_to_copy); if (error != 0) { xpt_print(periph->path, "%s: copyin of %u " "bytes from %p to %p failed with " "error %d\n", __func__, len_to_copy, user_ptr, kern_ptr, error); goto bailout; } } if (user_sglist[i].ds_len == user_watermark) { i++; user_watermark = 0; } if (kern_sglist[j].ds_len == kern_watermark) { j++; kern_watermark = 0; } } bailout: return (error); } static int passmemsetup(struct cam_periph *periph, struct pass_io_req *io_req) { union ccb *ccb; struct pass_softc *softc; int numbufs, i; uint8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; uint32_t lengths[CAM_PERIPH_MAXMAPS]; uint32_t dirs[CAM_PERIPH_MAXMAPS]; uint32_t num_segs; uint16_t *seg_cnt_ptr; size_t maxmap; int error; cam_periph_assert(periph, MA_NOTOWNED); softc = periph->softc; error = 0; ccb = &io_req->ccb; maxmap = 0; num_segs = 0; seg_cnt_ptr = NULL; switch(ccb->ccb_h.func_code) { case XPT_DEV_MATCH: if (ccb->cdm.match_buf_len == 0) { printf("%s: invalid match buffer length 0\n", __func__); return(EINVAL); } if (ccb->cdm.pattern_buf_len > 0) { data_ptrs[0] = (uint8_t **)&ccb->cdm.patterns; lengths[0] = ccb->cdm.pattern_buf_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = (uint8_t **)&ccb->cdm.matches; lengths[1] = ccb->cdm.match_buf_len; dirs[1] = CAM_DIR_IN; numbufs = 2; } else { data_ptrs[0] = (uint8_t **)&ccb->cdm.matches; lengths[0] = ccb->cdm.match_buf_len; dirs[0] = CAM_DIR_IN; numbufs = 1; } io_req->data_flags = CAM_DATA_VADDR; break; case XPT_SCSI_IO: case XPT_CONT_TARGET_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); /* * The user shouldn't be able to supply a bio. */ if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO) return (EINVAL); io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK; data_ptrs[0] = &ccb->csio.data_ptr; lengths[0] = ccb->csio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; num_segs = ccb->csio.sglist_cnt; seg_cnt_ptr = &ccb->csio.sglist_cnt; numbufs = 1; maxmap = softc->maxio; break; case XPT_ATA_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); /* * We only support a single virtual address for ATA I/O. 
*/ if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR) return (EINVAL); io_req->data_flags = CAM_DATA_VADDR; data_ptrs[0] = &ccb->ataio.data_ptr; lengths[0] = ccb->ataio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 1; maxmap = softc->maxio; break; case XPT_SMP_IO: io_req->data_flags = CAM_DATA_VADDR; data_ptrs[0] = &ccb->smpio.smp_request; lengths[0] = ccb->smpio.smp_request_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = &ccb->smpio.smp_response; lengths[1] = ccb->smpio.smp_response_len; dirs[1] = CAM_DIR_IN; numbufs = 2; maxmap = softc->maxio; break; case XPT_DEV_ADVINFO: if (ccb->cdai.bufsiz == 0) return (0); io_req->data_flags = CAM_DATA_VADDR; data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; lengths[0] = ccb->cdai.bufsiz; dirs[0] = CAM_DIR_IN; numbufs = 1; break; case XPT_NVME_ADMIN: case XPT_NVME_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return (0); io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK; data_ptrs[0] = &ccb->nvmeio.data_ptr; lengths[0] = ccb->nvmeio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; num_segs = ccb->nvmeio.sglist_cnt; seg_cnt_ptr = &ccb->nvmeio.sglist_cnt; numbufs = 1; maxmap = softc->maxio; break; default: return(EINVAL); break; /* NOTREACHED */ } io_req->num_bufs = numbufs; /* * If there is a maximum, check to make sure that the user's * request fits within the limit. In general, we should only have * a maximum length for requests that go to hardware. Otherwise it * is whatever we're able to malloc. */ for (i = 0; i < numbufs; i++) { io_req->user_bufs[i] = *data_ptrs[i]; io_req->dirs[i] = dirs[i]; io_req->lengths[i] = lengths[i]; if (maxmap == 0) continue; if (lengths[i] <= maxmap) continue; xpt_print(periph->path, "%s: data length %u > max allowed %u " "bytes\n", __func__, lengths[i], maxmap); error = EINVAL; goto bailout; } switch (io_req->data_flags) { case CAM_DATA_VADDR: /* Map or copy the buffer into kernel address space */ for (i = 0; i < numbufs; i++) { uint8_t *tmp_buf; /* * If for some reason no length is specified, we * don't need to allocate anything. */ if (io_req->lengths[i] == 0) continue; tmp_buf = malloc(lengths[i], M_SCSIPASS, M_WAITOK | M_ZERO); io_req->kern_bufs[i] = tmp_buf; *data_ptrs[i] = tmp_buf; #if 0 xpt_print(periph->path, "%s: malloced %p len %u, user " "buffer %p, operation: %s\n", __func__, tmp_buf, lengths[i], io_req->user_bufs[i], (dirs[i] == CAM_DIR_IN) ? "read" : "write"); #endif /* * We only need to copy in if the user is writing. */ if (dirs[i] != CAM_DIR_OUT) continue; error = copyin(io_req->user_bufs[i], io_req->kern_bufs[i], lengths[i]); if (error != 0) { xpt_print(periph->path, "%s: copy of user " "buffer from %p to %p failed with " "error %d\n", __func__, io_req->user_bufs[i], io_req->kern_bufs[i], error); goto bailout; } } break; case CAM_DATA_PADDR: /* Pass down the pointer as-is */ break; case CAM_DATA_SG: { size_t sg_length, size_to_go, alloc_size; uint32_t num_segs_needed; /* * Copy the user S/G list in, and then copy in the * individual segments. */ /* * We shouldn't see this, but check just in case. */ if (numbufs != 1) { xpt_print(periph->path, "%s: cannot currently handle " "more than one S/G list per CCB\n", __func__); error = EINVAL; goto bailout; } /* * We have to have at least one segment. */ if (num_segs == 0) { xpt_print(periph->path, "%s: CAM_DATA_SG flag set, " "but sglist_cnt=0!\n", __func__); error = EINVAL; goto bailout; } /* * Make sure the user specified the total length and didn't * just leave it to us to decode the S/G list. 
*/ if (lengths[0] == 0) { xpt_print(periph->path, "%s: no dxfer_len specified, " "but CAM_DATA_SG flag is set!\n", __func__); error = EINVAL; goto bailout; } /* * We allocate buffers in io_zone_size increments for an * S/G list. This will generally be maxphys. */ if (lengths[0] <= softc->io_zone_size) num_segs_needed = 1; else { num_segs_needed = lengths[0] / softc->io_zone_size; if ((lengths[0] % softc->io_zone_size) != 0) num_segs_needed++; } /* Figure out the size of the S/G list */ sg_length = num_segs * sizeof(bus_dma_segment_t); io_req->num_user_segs = num_segs; io_req->num_kern_segs = num_segs_needed; /* Save the user's S/G list pointer for later restoration */ io_req->user_bufs[0] = *data_ptrs[0]; /* * If we have enough segments allocated by default to handle * the length of the user's S/G list, */ if (num_segs > PASS_MAX_SEGS) { io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) * num_segs, M_SCSIPASS, M_WAITOK | M_ZERO); io_req->flags |= PASS_IO_USER_SEG_MALLOC; } else io_req->user_segptr = io_req->user_segs; error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length); if (error != 0) { xpt_print(periph->path, "%s: copy of user S/G list " "from %p to %p failed with error %d\n", __func__, *data_ptrs[0], io_req->user_segptr, error); goto bailout; } if (num_segs_needed > PASS_MAX_SEGS) { io_req->kern_segptr = malloc(sizeof(bus_dma_segment_t) * num_segs_needed, M_SCSIPASS, M_WAITOK | M_ZERO); io_req->flags |= PASS_IO_KERN_SEG_MALLOC; } else { io_req->kern_segptr = io_req->kern_segs; } /* * Allocate the kernel S/G list. */ for (size_to_go = lengths[0], i = 0; size_to_go > 0 && i < num_segs_needed; i++, size_to_go -= alloc_size) { uint8_t *kern_ptr; alloc_size = min(size_to_go, softc->io_zone_size); kern_ptr = uma_zalloc(softc->pass_io_zone, M_WAITOK); io_req->kern_segptr[i].ds_addr = (bus_addr_t)(uintptr_t)kern_ptr; io_req->kern_segptr[i].ds_len = alloc_size; } if (size_to_go > 0) { printf("%s: size_to_go = %zu, software error!\n", __func__, size_to_go); error = EINVAL; goto bailout; } *data_ptrs[0] = (uint8_t *)io_req->kern_segptr; *seg_cnt_ptr = io_req->num_kern_segs; /* * We only need to copy data here if the user is writing. */ if (dirs[0] == CAM_DIR_OUT) error = passcopysglist(periph, io_req, dirs[0]); break; } case CAM_DATA_SG_PADDR: { size_t sg_length; /* * We shouldn't see this, but check just in case. */ if (numbufs != 1) { printf("%s: cannot currently handle more than one " "S/G list per CCB\n", __func__); error = EINVAL; goto bailout; } /* * We have to have at least one segment. */ if (num_segs == 0) { xpt_print(periph->path, "%s: CAM_DATA_SG_PADDR flag " "set, but sglist_cnt=0!\n", __func__); error = EINVAL; goto bailout; } /* * Make sure the user specified the total length and didn't * just leave it to us to decode the S/G list. 
*/ if (lengths[0] == 0) { xpt_print(periph->path, "%s: no dxfer_len specified, " "but CAM_DATA_SG flag is set!\n", __func__); error = EINVAL; goto bailout; } /* Figure out the size of the S/G list */ sg_length = num_segs * sizeof(bus_dma_segment_t); io_req->num_user_segs = num_segs; io_req->num_kern_segs = io_req->num_user_segs; /* Save the user's S/G list pointer for later restoration */ io_req->user_bufs[0] = *data_ptrs[0]; if (num_segs > PASS_MAX_SEGS) { io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) * num_segs, M_SCSIPASS, M_WAITOK | M_ZERO); io_req->flags |= PASS_IO_USER_SEG_MALLOC; } else io_req->user_segptr = io_req->user_segs; io_req->kern_segptr = io_req->user_segptr; error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length); if (error != 0) { xpt_print(periph->path, "%s: copy of user S/G list " "from %p to %p failed with error %d\n", __func__, *data_ptrs[0], io_req->user_segptr, error); goto bailout; } break; } default: case CAM_DATA_BIO: /* * A user shouldn't be attaching a bio to the CCB. It * isn't a user-accessible structure. */ error = EINVAL; break; } bailout: if (error != 0) passiocleanup(softc, io_req); return (error); } static int passmemdone(struct cam_periph *periph, struct pass_io_req *io_req) { struct pass_softc *softc; int error; int i; error = 0; softc = (struct pass_softc *)periph->softc; switch (io_req->data_flags) { case CAM_DATA_VADDR: /* * Copy back to the user buffer if this was a read. */ for (i = 0; i < io_req->num_bufs; i++) { if (io_req->dirs[i] != CAM_DIR_IN) continue; error = copyout(io_req->kern_bufs[i], io_req->user_bufs[i], io_req->lengths[i]); if (error != 0) { xpt_print(periph->path, "Unable to copy %u " "bytes from %p to user address %p\n", io_req->lengths[i], io_req->kern_bufs[i], io_req->user_bufs[i]); goto bailout; } } break; case CAM_DATA_PADDR: /* Do nothing. The pointer is a physical address already */ break; case CAM_DATA_SG: /* * Copy back to the user buffer if this was a read. * Restore the user's S/G list buffer pointer. */ if (io_req->dirs[0] == CAM_DIR_IN) error = passcopysglist(periph, io_req, io_req->dirs[0]); break; case CAM_DATA_SG_PADDR: /* * Restore the user's S/G list buffer pointer. No need to * copy. */ break; default: case CAM_DATA_BIO: error = EINVAL; break; } bailout: /* * Reset the user's pointers to their original values and free * allocated memory. */ passiocleanup(softc, io_req); return (error); } static int passioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { int error; if ((error = passdoioctl(dev, cmd, addr, flag, td)) == ENOTTY) { error = cam_compat_ioctl(dev, cmd, addr, flag, td, passdoioctl); } return (error); } static int passdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct cam_periph *periph; struct pass_softc *softc; int error; uint32_t priority; periph = (struct cam_periph *)dev->si_drv1; cam_periph_lock(periph); softc = (struct pass_softc *)periph->softc; error = 0; switch (cmd) { case CAMIOCOMMAND: { union ccb *inccb; union ccb *ccb; int ccb_malloced; inccb = (union ccb *)addr; #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) if (inccb->ccb_h.func_code == XPT_SCSI_IO) inccb->csio.bio = NULL; #endif if (inccb->ccb_h.flags & CAM_UNLOCKED) { error = EINVAL; break; } /* * Some CCB types, like scan bus and scan lun can only go * through the transport layer device. 
*/ if (inccb->ccb_h.func_code & XPT_FC_XPT_ONLY) { xpt_print(periph->path, "CCB function code %#x is " "restricted to the XPT device\n", inccb->ccb_h.func_code); error = ENODEV; break; } /* Compatibility for RL/priority-unaware code. */ priority = inccb->ccb_h.pinfo.priority; if (priority <= CAM_PRIORITY_OOB) priority += CAM_PRIORITY_OOB + 1; /* * Non-immediate CCBs need a CCB from the per-device pool * of CCBs, which is scheduled by the transport layer. * Immediate CCBs and user-supplied CCBs should just be * malloced. */ if ((inccb->ccb_h.func_code & XPT_FC_QUEUED) && ((inccb->ccb_h.func_code & XPT_FC_USER_CCB) == 0)) { ccb = cam_periph_getccb(periph, priority); ccb_malloced = 0; } else { ccb = xpt_alloc_ccb_nowait(); if (ccb != NULL) xpt_setup_ccb(&ccb->ccb_h, periph->path, priority); ccb_malloced = 1; } if (ccb == NULL) { xpt_print(periph->path, "unable to allocate CCB\n"); error = ENOMEM; break; } error = passsendccb(periph, ccb, inccb); if (ccb_malloced) xpt_free_ccb(ccb); else xpt_release_ccb(ccb); break; } case CAMIOQUEUE: { struct pass_io_req *io_req; union ccb **user_ccb, *ccb; xpt_opcode fc; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { error = ENOTTY; goto bailout; } #endif if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0) { error = passcreatezone(periph); if (error != 0) goto bailout; } /* * We're going to do a blocking allocation for this I/O * request, so we have to drop the lock. */ cam_periph_unlock(periph); io_req = uma_zalloc(softc->pass_zone, M_WAITOK | M_ZERO); ccb = &io_req->ccb; user_ccb = (union ccb **)addr; /* * Unlike the CAMIOCOMMAND ioctl above, we only have a * pointer to the user's CCB, so we have to copy the whole * thing in to a buffer we have allocated (above) instead * of allowing the ioctl code to malloc a buffer and copy * it in. * * This is an advantage for this asynchronous interface, * since we don't want the memory to get freed while the * CCB is outstanding. */ #if 0 xpt_print(periph->path, "Copying user CCB %p to " "kernel address %p\n", *user_ccb, ccb); #endif error = copyin(*user_ccb, ccb, sizeof(*ccb)); if (error != 0) { xpt_print(periph->path, "Copy of user CCB %p to " "kernel address %p failed with error %d\n", *user_ccb, ccb, error); goto camioqueue_error; } #if defined(BUF_TRACKING) || defined(FULL_BUF_TRACKING) if (ccb->ccb_h.func_code == XPT_SCSI_IO) ccb->csio.bio = NULL; #endif if (ccb->ccb_h.flags & CAM_UNLOCKED) { error = EINVAL; goto camioqueue_error; } if (ccb->ccb_h.flags & CAM_CDB_POINTER) { if (ccb->csio.cdb_len > IOCDBLEN) { error = EINVAL; goto camioqueue_error; } error = copyin(ccb->csio.cdb_io.cdb_ptr, ccb->csio.cdb_io.cdb_bytes, ccb->csio.cdb_len); if (error != 0) goto camioqueue_error; ccb->ccb_h.flags &= ~CAM_CDB_POINTER; } /* * Some CCB types, like scan bus and scan lun can only go * through the transport layer device. */ if (ccb->ccb_h.func_code & XPT_FC_XPT_ONLY) { xpt_print(periph->path, "CCB function code %#x is " "restricted to the XPT device\n", ccb->ccb_h.func_code); error = ENODEV; goto camioqueue_error; } /* * Save the user's CCB pointer as well as his linked list * pointers and peripheral private area so that we can * restore these later. */ io_req->user_ccb_ptr = *user_ccb; io_req->user_periph_links = ccb->ccb_h.periph_links; io_req->user_periph_priv = ccb->ccb_h.periph_priv; /* * Now that we've saved the user's values, we can set our * own peripheral private entry. */ ccb->ccb_h.ccb_ioreq = io_req; /* Compatibility for RL/priority-unaware code. 
*/ priority = ccb->ccb_h.pinfo.priority; if (priority <= CAM_PRIORITY_OOB) priority += CAM_PRIORITY_OOB + 1; /* * Setup fields in the CCB like the path and the priority. * The path in particular cannot be done in userland, since * it is a pointer to a kernel data structure. */ xpt_setup_ccb_flags(&ccb->ccb_h, periph->path, priority, ccb->ccb_h.flags); /* * Setup our done routine. There is no way for the user to * have a valid pointer here. */ ccb->ccb_h.cbfcnp = passdone; fc = ccb->ccb_h.func_code; /* * If this function code has memory that can be mapped in * or out, we need to call passmemsetup(). */ if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO) || (fc == XPT_SMP_IO) || (fc == XPT_DEV_MATCH) || (fc == XPT_DEV_ADVINFO) || (fc == XPT_NVME_ADMIN) || (fc == XPT_NVME_IO)) { error = passmemsetup(periph, io_req); if (error != 0) goto camioqueue_error; } else io_req->mapinfo.num_bufs_used = 0; cam_periph_lock(periph); /* * Everything goes on the incoming queue initially. */ TAILQ_INSERT_TAIL(&softc->incoming_queue, io_req, links); /* * If the CCB is queued, and is not a user CCB, then * we need to allocate a slot for it. Call xpt_schedule() * so that our start routine will get called when a CCB is * available. */ if ((fc & XPT_FC_QUEUED) && ((fc & XPT_FC_USER_CCB) == 0)) { xpt_schedule(periph, priority); break; } /* * At this point, the CCB in question is either an * immediate CCB (like XPT_DEV_ADVINFO) or it is a user CCB * and therefore should be malloced, not allocated via a slot. * Remove the CCB from the incoming queue and add it to the * active queue. */ TAILQ_REMOVE(&softc->incoming_queue, io_req, links); TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links); xpt_action(ccb); /* * If this is not a queued CCB (i.e. it is an immediate CCB), * then it is already done. We need to put it on the done * queue for the user to fetch. */ if ((fc & XPT_FC_QUEUED) == 0) { TAILQ_REMOVE(&softc->active_queue, io_req, links); TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links); } break; camioqueue_error: uma_zfree(softc->pass_zone, io_req); cam_periph_lock(periph); break; } case CAMIOGET: { union ccb **user_ccb; struct pass_io_req *io_req; int old_error; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { error = ENOTTY; goto bailout; } #endif user_ccb = (union ccb **)addr; old_error = 0; io_req = TAILQ_FIRST(&softc->done_queue); if (io_req == NULL) { error = ENOENT; break; } /* * Remove the I/O from the done queue. */ TAILQ_REMOVE(&softc->done_queue, io_req, links); /* * We have to drop the lock during the copyout because the * copyout can result in VM faults that require sleeping. */ cam_periph_unlock(periph); /* * Do any needed copies (e.g. for reads) and revert the * pointers in the CCB back to the user's pointers. */ error = passmemdone(periph, io_req); old_error = error; io_req->ccb.ccb_h.periph_links = io_req->user_periph_links; io_req->ccb.ccb_h.periph_priv = io_req->user_periph_priv; #if 0 xpt_print(periph->path, "Copying to user CCB %p from " "kernel address %p\n", *user_ccb, &io_req->ccb); #endif error = copyout(&io_req->ccb, *user_ccb, sizeof(union ccb)); if (error != 0) { xpt_print(periph->path, "Copy to user CCB %p from " "kernel address %p failed with error %d\n", *user_ccb, &io_req->ccb, error); } /* * Prefer the first error we got back, and make sure we * don't overwrite bad status with good. */ if (old_error != 0) error = old_error; cam_periph_lock(periph); /* * At this point, if there was an error, we could potentially * re-queue the I/O and try again. But why? 
The error * would almost certainly happen again. We might as well * not leak memory. */ uma_zfree(softc->pass_zone, io_req); break; } default: error = cam_periph_ioctl(periph, cmd, addr, passerror); break; } bailout: cam_periph_unlock(periph); return(error); } static int passpoll(struct cdev *dev, int poll_events, struct thread *td) { struct cam_periph *periph; struct pass_softc *softc; int revents; periph = (struct cam_periph *)dev->si_drv1; softc = (struct pass_softc *)periph->softc; revents = poll_events & (POLLOUT | POLLWRNORM); if ((poll_events & (POLLIN | POLLRDNORM)) != 0) { cam_periph_lock(periph); if (!TAILQ_EMPTY(&softc->done_queue)) { revents |= poll_events & (POLLIN | POLLRDNORM); } cam_periph_unlock(periph); if (revents == 0) selrecord(td, &softc->read_select); } return (revents); } static int passkqfilter(struct cdev *dev, struct knote *kn) { struct cam_periph *periph; struct pass_softc *softc; periph = (struct cam_periph *)dev->si_drv1; softc = (struct pass_softc *)periph->softc; kn->kn_hook = (caddr_t)periph; kn->kn_fop = &passread_filtops; knlist_add(&softc->read_select.si_note, kn, 0); return (0); } static void passreadfiltdetach(struct knote *kn) { struct cam_periph *periph; struct pass_softc *softc; periph = (struct cam_periph *)kn->kn_hook; softc = (struct pass_softc *)periph->softc; knlist_remove(&softc->read_select.si_note, kn, 0); } static int passreadfilt(struct knote *kn, long hint) { struct cam_periph *periph; struct pass_softc *softc; int retval; periph = (struct cam_periph *)kn->kn_hook; softc = (struct pass_softc *)periph->softc; cam_periph_assert(periph, MA_OWNED); if (TAILQ_EMPTY(&softc->done_queue)) retval = 0; else retval = 1; return (retval); } /* * Generally, "ccb" should be the CCB supplied by the kernel. "inccb" * should be the CCB that is copied in from the user. */ static int passsendccb(struct cam_periph *periph, union ccb *ccb, union ccb *inccb) { struct pass_softc *softc; struct cam_periph_map_info mapinfo; uint8_t *cmd; xpt_opcode fc; int error; softc = (struct pass_softc *)periph->softc; /* * There are some fields in the CCB header that need to be * preserved, the rest we get from the user. */ xpt_merge_ccb(ccb, inccb); if (ccb->ccb_h.flags & CAM_CDB_POINTER) { cmd = __builtin_alloca(ccb->csio.cdb_len); error = copyin(ccb->csio.cdb_io.cdb_ptr, cmd, ccb->csio.cdb_len); if (error) return (error); ccb->csio.cdb_io.cdb_ptr = cmd; } /* * Let cam_periph_mapmem do a sanity check on the data pointer format. * Even if no data transfer is needed, it's a cheap check and it * simplifies the code. */ fc = ccb->ccb_h.func_code; if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO) || (fc == XPT_SMP_IO) || (fc == XPT_DEV_MATCH) || (fc == XPT_DEV_ADVINFO) || (fc == XPT_MMC_IO) || (fc == XPT_NVME_ADMIN) || (fc == XPT_NVME_IO)) { bzero(&mapinfo, sizeof(mapinfo)); /* * cam_periph_mapmem calls into proc and vm functions that can * sleep as well as trigger I/O, so we can't hold the lock. * Dropping it here is reasonably safe. */ cam_periph_unlock(periph); error = cam_periph_mapmem(ccb, &mapinfo, softc->maxio); cam_periph_lock(periph); /* * cam_periph_mapmem returned an error, we can't continue. * Return the error to the user. */ if (error) return(error); } else /* Ensure that the unmap call later on is a no-op. */ mapinfo.num_bufs_used = 0; /* * If the user wants us to perform any error recovery, then honor * that request. Otherwise, it's up to the user to perform any * error recovery. 
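	 * passflags() below selects CAM_RETRY_SELTO and SF_RETRY_UA when
	 * CAM_PASS_ERR_RECOVER is set, and otherwise disables retries and
	 * recovery entirely.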
*/ { uint32_t cam_flags, sense_flags; passflags(ccb, &cam_flags, &sense_flags); cam_periph_runccb(ccb, passerror, cam_flags, sense_flags, softc->device_stats); } cam_periph_unlock(periph); error = cam_periph_unmapmem(ccb, &mapinfo); cam_periph_lock(periph); ccb->ccb_h.cbfcnp = NULL; ccb->ccb_h.periph_priv = inccb->ccb_h.periph_priv; bcopy(ccb, inccb, sizeof(union ccb)); return (error); } /* * Set the cam_flags and sense_flags based on whether or not the request wants * error recovery. In order to log errors via devctl, we need to do at least * minimal recovery. We do this by not retrying unit attention (we let the * requester do it, or not, if appropriate) and specifically asking for no * recovery, like we do during device probing. */ static void passflags(union ccb *ccb, uint32_t *cam_flags, uint32_t *sense_flags) { if ((ccb->ccb_h.flags & CAM_PASS_ERR_RECOVER) != 0) { *cam_flags = CAM_RETRY_SELTO; *sense_flags = SF_RETRY_UA | SF_NO_PRINT; } else { *cam_flags = 0; *sense_flags = SF_NO_RETRY | SF_NO_RECOVERY | SF_NO_PRINT; } } static int passerror(union ccb *ccb, uint32_t cam_flags, uint32_t sense_flags) { return(cam_periph_error(ccb, cam_flags, sense_flags)); } diff --git a/sys/cam/scsi/scsi_target.c b/sys/cam/scsi/scsi_target.c index 6872e3a2a93b..278fcd908d7b 100644 --- a/sys/cam/scsi/scsi_target.c +++ b/sys/cam/scsi/scsi_target.c @@ -1,1169 +1,1169 @@ /*- * Generic SCSI Target Kernel Mode Driver * * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002 Nate Lawson. * Copyright (c) 1998, 1999, 2001, 2002 Justin T. Gibbs. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include /* Includes to support callout */ #include #include #include #include #include #include #include #include /* Transaction information attached to each CCB sent by the user */ struct targ_cmd_descr { struct cam_periph_map_info mapinfo; TAILQ_ENTRY(targ_cmd_descr) tqe; union ccb *user_ccb; int priority; int func_code; }; /* Offset into the private CCB area for storing our descriptor */ #define targ_descr periph_priv.entries[1].ptr TAILQ_HEAD(descr_queue, targ_cmd_descr); typedef enum { TARG_STATE_RESV = 0x00, /* Invalid state */ TARG_STATE_OPENED = 0x01, /* Device opened, softc initialized */ TARG_STATE_LUN_ENABLED = 0x02 /* Device enabled for a path */ } targ_state; /* Per-instance device software context */ struct targ_softc { /* CCBs (CTIOs, ATIOs, INOTs) pending on the controller */ struct ccb_queue pending_ccb_queue; /* Command descriptors awaiting CTIO resources from the XPT */ struct descr_queue work_queue; /* Command descriptors that have been aborted back to the user. */ struct descr_queue abort_queue; /* * Queue of CCBs that have been copied out to userland, but our * userland daemon has not yet seen. */ struct ccb_queue user_ccb_queue; struct cam_periph *periph; struct cam_path *path; targ_state state; u_int maxio; struct selinfo read_select; struct devstat device_stats; }; static d_open_t targopen; static d_read_t targread; static d_write_t targwrite; static d_ioctl_t targioctl; static d_poll_t targpoll; static d_kqfilter_t targkqfilter; static void targreadfiltdetach(struct knote *kn); static int targreadfilt(struct knote *kn, long hint); -static struct filterops targread_filtops = { +static const struct filterops targread_filtops = { .f_isfd = 1, .f_detach = targreadfiltdetach, .f_event = targreadfilt, }; static struct cdevsw targ_cdevsw = { .d_version = D_VERSION, .d_open = targopen, .d_read = targread, .d_write = targwrite, .d_ioctl = targioctl, .d_poll = targpoll, .d_name = "targ", .d_kqfilter = targkqfilter }; static cam_status targendislun(struct cam_path *path, int enable, int grp6_len, int grp7_len); static cam_status targenable(struct targ_softc *softc, struct cam_path *path, int grp6_len, int grp7_len); static cam_status targdisable(struct targ_softc *softc); static periph_ctor_t targctor; static periph_dtor_t targdtor; static periph_start_t targstart; static int targusermerge(struct targ_softc *softc, struct targ_cmd_descr *descr, union ccb *ccb); static int targsendccb(struct targ_softc *softc, union ccb *ccb, struct targ_cmd_descr *descr); static void targdone(struct cam_periph *periph, union ccb *done_ccb); static int targreturnccb(struct targ_softc *softc, union ccb *ccb); static union ccb * targgetccb(struct targ_softc *softc, xpt_opcode type, int priority); static void targfreeccb(struct targ_softc *softc, union ccb *ccb); static struct targ_cmd_descr * targgetdescr(struct targ_softc *softc); static periph_init_t targinit; static void targasync(void *callback_arg, uint32_t code, struct cam_path *path, void *arg); static void abort_all_pending(struct targ_softc *softc); static void notify_user(struct targ_softc *softc); static int targcamstatus(cam_status status); static size_t targccblen(xpt_opcode func_code); static struct periph_driver targdriver = { targinit, "targ", TAILQ_HEAD_INITIALIZER(targdriver.units), /* generation */ 0 }; PERIPHDRIVER_DECLARE(targ, targdriver); static MALLOC_DEFINE(M_TARG, "TARG", "TARG data"); /* Disable LUN if enabled and 
teardown softc */ static void targcdevdtor(void *data) { struct targ_softc *softc; struct cam_periph *periph; softc = data; if (softc->periph == NULL) { printf("%s: destroying non-enabled target\n", __func__); free(softc, M_TARG); return; } /* * Acquire a hold on the periph so that it doesn't go away before * we are ready at the end of the function. */ periph = softc->periph; cam_periph_acquire(periph); cam_periph_lock(periph); (void)targdisable(softc); if (softc->periph != NULL) { cam_periph_invalidate(softc->periph); softc->periph = NULL; } cam_periph_unlock(periph); cam_periph_release(periph); free(softc, M_TARG); } /* * Create softc and initialize it. There is no locking here because a * periph doesn't get created until an ioctl is issued to do so, and * that can't happen until this method returns. */ static int targopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct targ_softc *softc; /* Allocate its softc, initialize it */ softc = malloc(sizeof(*softc), M_TARG, M_WAITOK | M_ZERO); softc->state = TARG_STATE_OPENED; softc->periph = NULL; softc->path = NULL; TAILQ_INIT(&softc->pending_ccb_queue); TAILQ_INIT(&softc->work_queue); TAILQ_INIT(&softc->abort_queue); TAILQ_INIT(&softc->user_ccb_queue); knlist_init_mtx(&softc->read_select.si_note, NULL); devfs_set_cdevpriv(softc, targcdevdtor); return (0); } /* Enable/disable LUNs, set debugging level */ static int targioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct targ_softc *softc; cam_status status; devfs_get_cdevpriv((void **)&softc); switch (cmd) { case TARGIOCENABLE: { struct ioc_enable_lun *new_lun; struct cam_path *path; new_lun = (struct ioc_enable_lun *)addr; status = xpt_create_path(&path, /*periph*/NULL, new_lun->path_id, new_lun->target_id, new_lun->lun_id); if (status != CAM_REQ_CMP) { printf("Couldn't create path, status %#x\n", status); break; } xpt_path_lock(path); status = targenable(softc, path, new_lun->grp6_len, new_lun->grp7_len); xpt_path_unlock(path); xpt_free_path(path); break; } case TARGIOCDISABLE: if (softc->periph == NULL) { status = CAM_DEV_NOT_THERE; break; } cam_periph_lock(softc->periph); status = targdisable(softc); cam_periph_unlock(softc->periph); break; case TARGIOCDEBUG: { struct ccb_debug cdbg; /* If no periph available, disallow debugging changes */ if ((softc->state & TARG_STATE_LUN_ENABLED) == 0) { status = CAM_DEV_NOT_THERE; break; } bzero(&cdbg, sizeof cdbg); if (*((int *)addr) != 0) cdbg.flags = CAM_DEBUG_PERIPH; else cdbg.flags = CAM_DEBUG_NONE; xpt_setup_ccb(&cdbg.ccb_h, softc->path, CAM_PRIORITY_NORMAL); cdbg.ccb_h.func_code = XPT_DEBUG; cdbg.ccb_h.cbfcnp = targdone; xpt_action((union ccb *)&cdbg); status = cdbg.ccb_h.status & CAM_STATUS_MASK; break; } default: status = CAM_PROVIDE_FAIL; break; } return (targcamstatus(status)); } /* Writes are always ready, reads wait for user_ccb_queue or abort_queue */ static int targpoll(struct cdev *dev, int poll_events, struct thread *td) { struct targ_softc *softc; int revents; devfs_get_cdevpriv((void **)&softc); /* Poll for write() is always ok. */ revents = poll_events & (POLLOUT | POLLWRNORM); if ((poll_events & (POLLIN | POLLRDNORM)) != 0) { /* Poll for read() depends on user and abort queues. */ cam_periph_lock(softc->periph); if (!TAILQ_EMPTY(&softc->user_ccb_queue) || !TAILQ_EMPTY(&softc->abort_queue)) { revents |= poll_events & (POLLIN | POLLRDNORM); } cam_periph_unlock(softc->periph); /* Only sleep if the user didn't poll for write. 
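		 * selrecord() only registers this thread against
		 * read_select; notify_user() wakes it later via
		 * selwakeuppri() once a CCB or aborted descriptor is ready.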
*/ if (revents == 0) selrecord(td, &softc->read_select); } return (revents); } static int targkqfilter(struct cdev *dev, struct knote *kn) { struct targ_softc *softc; devfs_get_cdevpriv((void **)&softc); kn->kn_hook = (caddr_t)softc; kn->kn_fop = &targread_filtops; knlist_add(&softc->read_select.si_note, kn, 0); return (0); } static void targreadfiltdetach(struct knote *kn) { struct targ_softc *softc; softc = (struct targ_softc *)kn->kn_hook; knlist_remove(&softc->read_select.si_note, kn, 0); } /* Notify the user's kqueue when the user queue or abort queue gets a CCB */ static int targreadfilt(struct knote *kn, long hint) { struct targ_softc *softc; int retval; softc = (struct targ_softc *)kn->kn_hook; cam_periph_lock(softc->periph); retval = !TAILQ_EMPTY(&softc->user_ccb_queue) || !TAILQ_EMPTY(&softc->abort_queue); cam_periph_unlock(softc->periph); return (retval); } /* Send the HBA the enable/disable message */ static cam_status targendislun(struct cam_path *path, int enable, int grp6_len, int grp7_len) { struct ccb_en_lun en_ccb; cam_status status; /* Tell the lun to begin answering selects */ memset(&en_ccb, 0, sizeof(en_ccb)); xpt_setup_ccb(&en_ccb.ccb_h, path, CAM_PRIORITY_NORMAL); en_ccb.ccb_h.func_code = XPT_EN_LUN; /* Don't need support for any vendor specific commands */ en_ccb.grp6_len = grp6_len; en_ccb.grp7_len = grp7_len; en_ccb.enable = enable ? 1 : 0; xpt_action((union ccb *)&en_ccb); status = en_ccb.ccb_h.status & CAM_STATUS_MASK; if (status != CAM_REQ_CMP) { xpt_print(path, "%sable lun CCB rejected, status %#x\n", enable ? "en" : "dis", status); } return (status); } /* Enable target mode on a LUN, given its path */ static cam_status targenable(struct targ_softc *softc, struct cam_path *path, int grp6_len, int grp7_len) { struct cam_periph *periph; struct ccb_pathinq cpi; cam_status status; if ((softc->state & TARG_STATE_LUN_ENABLED) != 0) return (CAM_LUN_ALRDY_ENA); /* Make sure SIM supports target mode */ xpt_path_inq(&cpi, path); status = cpi.ccb_h.status & CAM_STATUS_MASK; if (status != CAM_REQ_CMP) { printf("pathinq failed, status %#x\n", status); goto enable_fail; } if ((cpi.target_sprt & PIT_PROCESSOR) == 0) { printf("controller does not support target mode\n"); status = CAM_FUNC_NOTAVAIL; goto enable_fail; } if (cpi.maxio == 0) softc->maxio = DFLTPHYS; /* traditional default */ else if (cpi.maxio > maxphys) softc->maxio = maxphys; /* for safety */ else softc->maxio = cpi.maxio; /* real value */ /* Destroy any periph on our path if it is disabled */ periph = cam_periph_find(path, "targ"); if (periph != NULL) { struct targ_softc *del_softc; del_softc = (struct targ_softc *)periph->softc; if ((del_softc->state & TARG_STATE_LUN_ENABLED) == 0) { cam_periph_invalidate(del_softc->periph); del_softc->periph = NULL; } else { printf("Requested path still in use by targ%d\n", periph->unit_number); status = CAM_LUN_ALRDY_ENA; goto enable_fail; } } /* Create a periph instance attached to this path */ status = cam_periph_alloc(targctor, NULL, targdtor, targstart, "targ", CAM_PERIPH_BIO, path, targasync, 0, softc); if (status != CAM_REQ_CMP) { printf("cam_periph_alloc failed, status %#x\n", status); goto enable_fail; } /* Ensure that the periph now exists. 
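	 * cam_periph_alloc() just returned CAM_REQ_CMP, so a failed lookup
	 * here can only mean an internal inconsistency and is treated as
	 * fatal.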
*/ if (cam_periph_find(path, "targ") == NULL) { panic("targenable: succeeded but no periph?"); /* NOTREACHED */ } /* Send the enable lun message */ status = targendislun(path, /*enable*/1, grp6_len, grp7_len); if (status != CAM_REQ_CMP) { printf("enable lun failed, status %#x\n", status); goto enable_fail; } softc->state |= TARG_STATE_LUN_ENABLED; enable_fail: return (status); } /* Disable this softc's target instance if enabled */ static cam_status targdisable(struct targ_softc *softc) { cam_status status; if ((softc->state & TARG_STATE_LUN_ENABLED) == 0) return (CAM_REQ_CMP); CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targdisable\n")); /* Abort any ccbs pending on the controller */ abort_all_pending(softc); /* Disable this lun */ status = targendislun(softc->path, /*enable*/0, /*grp6_len*/0, /*grp7_len*/0); if (status == CAM_REQ_CMP) softc->state &= ~TARG_STATE_LUN_ENABLED; else printf("Disable lun failed, status %#x\n", status); return (status); } /* Initialize a periph (called from cam_periph_alloc) */ static cam_status targctor(struct cam_periph *periph, void *arg) { struct targ_softc *softc; /* Store pointer to softc for periph-driven routines */ softc = (struct targ_softc *)arg; periph->softc = softc; softc->periph = periph; softc->path = periph->path; return (CAM_REQ_CMP); } static void targdtor(struct cam_periph *periph) { struct targ_softc *softc; struct ccb_hdr *ccb_h; struct targ_cmd_descr *descr; softc = (struct targ_softc *)periph->softc; /* * targdisable() aborts CCBs back to the user and leaves them * on user_ccb_queue and abort_queue in case the user is still * interested in them. We free them now. */ while ((ccb_h = TAILQ_FIRST(&softc->user_ccb_queue)) != NULL) { TAILQ_REMOVE(&softc->user_ccb_queue, ccb_h, periph_links.tqe); targfreeccb(softc, (union ccb *)ccb_h); } while ((descr = TAILQ_FIRST(&softc->abort_queue)) != NULL) { TAILQ_REMOVE(&softc->abort_queue, descr, tqe); free(descr, M_TARG); } softc->periph = NULL; softc->path = NULL; periph->softc = NULL; } /* Receive CCBs from user mode proc and send them to the HBA */ static int targwrite(struct cdev *dev, struct uio *uio, int ioflag) { union ccb *user_ccb; struct targ_softc *softc; struct targ_cmd_descr *descr; int write_len, error; int func_code, priority; devfs_get_cdevpriv((void **)&softc); write_len = error = 0; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("write - uio_resid %zd\n", uio->uio_resid)); while (uio->uio_resid >= sizeof(user_ccb) && error == 0) { union ccb *ccb; error = uiomove((caddr_t)&user_ccb, sizeof(user_ccb), uio); if (error != 0) { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("write - uiomove failed (%d)\n", error)); break; } priority = fuword32(&user_ccb->ccb_h.pinfo.priority); if (priority == CAM_PRIORITY_NONE) { error = EINVAL; break; } func_code = fuword32(&user_ccb->ccb_h.func_code); switch (func_code) { case XPT_ACCEPT_TARGET_IO: case XPT_IMMED_NOTIFY: case XPT_IMMEDIATE_NOTIFY: cam_periph_lock(softc->periph); ccb = targgetccb(softc, func_code, priority); descr = (struct targ_cmd_descr *)ccb->ccb_h.targ_descr; descr->user_ccb = user_ccb; descr->func_code = func_code; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Sent ATIO/INOT (%p)\n", user_ccb)); xpt_action(ccb); TAILQ_INSERT_TAIL(&softc->pending_ccb_queue, &ccb->ccb_h, periph_links.tqe); cam_periph_unlock(softc->periph); break; default: cam_periph_lock(softc->periph); if ((func_code & XPT_FC_QUEUED) != 0) { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Sending queued ccb %#x (%p)\n", func_code, user_ccb)); descr = targgetdescr(softc); 
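			/*
			 * Illustrative only (hypothetical userland sketch,
			 * not part of this driver): a target-mode daemon
			 * submits CCBs by writing an array of pointers to
			 * its own CCB structures, e.g.
			 *
			 *	union ccb *list[1] = { &my_ctio };
			 *	(void)write(targ_fd, list, sizeof(list));
			 *
			 * where "targ_fd" and "my_ctio" are made-up names.
			 * Each pointer is fetched by the uiomove() above;
			 * queued function codes are recorded in the
			 * descriptor and scheduled below so targstart() can
			 * pair them with a periph CCB.
			 */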
descr->user_ccb = user_ccb; descr->priority = priority; descr->func_code = func_code; TAILQ_INSERT_TAIL(&softc->work_queue, descr, tqe); xpt_schedule(softc->periph, priority); } else { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Sending inline ccb %#x (%p)\n", func_code, user_ccb)); ccb = targgetccb(softc, func_code, priority); descr = (struct targ_cmd_descr *) ccb->ccb_h.targ_descr; descr->user_ccb = user_ccb; descr->priority = priority; descr->func_code = func_code; if (targusermerge(softc, descr, ccb) != EFAULT) targsendccb(softc, ccb, descr); targreturnccb(softc, ccb); } cam_periph_unlock(softc->periph); break; } write_len += sizeof(user_ccb); } /* * If we've successfully taken in some amount of * data, return success for that data first. If * an error is persistent, it will be reported * on the next write. */ if (error != 0 && write_len == 0) return (error); if (write_len == 0 && uio->uio_resid != 0) return (ENOSPC); return (0); } /* Process requests (descrs) via the periph-supplied CCBs */ static void targstart(struct cam_periph *periph, union ccb *start_ccb) { struct targ_softc *softc; struct targ_cmd_descr *descr, *next_descr; int error; softc = (struct targ_softc *)periph->softc; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targstart %p\n", start_ccb)); descr = TAILQ_FIRST(&softc->work_queue); if (descr == NULL) { xpt_release_ccb(start_ccb); } else { TAILQ_REMOVE(&softc->work_queue, descr, tqe); next_descr = TAILQ_FIRST(&softc->work_queue); /* Initiate a transaction using the descr and supplied CCB */ error = targusermerge(softc, descr, start_ccb); if (error == 0) error = targsendccb(softc, start_ccb, descr); if (error != 0) { xpt_print(periph->path, "targsendccb failed, err %d\n", error); xpt_release_ccb(start_ccb); (void)suword(&descr->user_ccb->ccb_h.status, CAM_REQ_CMP_ERR); TAILQ_INSERT_TAIL(&softc->abort_queue, descr, tqe); notify_user(softc); } /* If we have more work to do, stay scheduled */ if (next_descr != NULL) xpt_schedule(periph, next_descr->priority); } } static int targusermerge(struct targ_softc *softc, struct targ_cmd_descr *descr, union ccb *ccb) { struct ccb_hdr *u_ccbh, *k_ccbh; size_t ccb_len; int error; u_ccbh = &descr->user_ccb->ccb_h; k_ccbh = &ccb->ccb_h; /* * There are some fields in the CCB header that need to be * preserved, the rest we get from the user ccb. 
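	 * The header is rebuilt with xpt_setup_ccb() and retry_count, flags
	 * and timeout are fetched from userland with fuword32(); only the
	 * portion after the header is bulk copied in.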
(See xpt_merge_ccb) */ xpt_setup_ccb(k_ccbh, softc->path, descr->priority); k_ccbh->retry_count = fuword32(&u_ccbh->retry_count); k_ccbh->func_code = descr->func_code; k_ccbh->flags = fuword32(&u_ccbh->flags); k_ccbh->timeout = fuword32(&u_ccbh->timeout); ccb_len = targccblen(k_ccbh->func_code) - sizeof(struct ccb_hdr); error = copyin(u_ccbh + 1, k_ccbh + 1, ccb_len); if (error != 0) { k_ccbh->status = CAM_REQ_CMP_ERR; return (error); } /* Translate usermode abort_ccb pointer to its kernel counterpart */ if (k_ccbh->func_code == XPT_ABORT) { struct ccb_abort *cab; struct ccb_hdr *ccb_h; cab = (struct ccb_abort *)ccb; TAILQ_FOREACH(ccb_h, &softc->pending_ccb_queue, periph_links.tqe) { struct targ_cmd_descr *ab_descr; ab_descr = (struct targ_cmd_descr *)ccb_h->targ_descr; if (ab_descr->user_ccb == cab->abort_ccb) { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Changing abort for %p to %p\n", cab->abort_ccb, ccb_h)); cab->abort_ccb = (union ccb *)ccb_h; break; } } /* CCB not found, set appropriate status */ if (ccb_h == NULL) { k_ccbh->status = CAM_PATH_INVALID; error = ESRCH; } } return (error); } /* Build and send a kernel CCB formed from descr->user_ccb */ static int targsendccb(struct targ_softc *softc, union ccb *ccb, struct targ_cmd_descr *descr) { struct cam_periph_map_info *mapinfo; struct ccb_hdr *ccb_h; int error; ccb_h = &ccb->ccb_h; mapinfo = &descr->mapinfo; mapinfo->num_bufs_used = 0; /* * There's no way for the user to have a completion * function, so we put our own completion function in here. * We also stash in a reference to our descriptor so targreturnccb() * can find our mapping info. */ ccb_h->cbfcnp = targdone; ccb_h->targ_descr = descr; if ((ccb_h->func_code == XPT_CONT_TARGET_IO) || (ccb_h->func_code == XPT_DEV_MATCH)) { error = cam_periph_mapmem(ccb, mapinfo, softc->maxio); /* * cam_periph_mapmem returned an error, we can't continue. * Return the error to the user. */ if (error) { ccb_h->status = CAM_REQ_CMP_ERR; mapinfo->num_bufs_used = 0; return (error); } } /* * Once queued on the pending CCB list, this CCB will be protected * by our error recovery handler. 
*/ CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("sendccb %p\n", ccb)); if (XPT_FC_IS_QUEUED(ccb)) { TAILQ_INSERT_TAIL(&softc->pending_ccb_queue, ccb_h, periph_links.tqe); } xpt_action(ccb); return (0); } /* Completion routine for CCBs (called at splsoftcam) */ static void targdone(struct cam_periph *periph, union ccb *done_ccb) { struct targ_softc *softc; CAM_DEBUG(periph->path, CAM_DEBUG_PERIPH, ("targdone %p\n", done_ccb)); softc = (struct targ_softc *)periph->softc; TAILQ_REMOVE(&softc->pending_ccb_queue, &done_ccb->ccb_h, periph_links.tqe); /* If we're no longer enabled, throw away CCB */ if ((softc->state & TARG_STATE_LUN_ENABLED) == 0) { targfreeccb(softc, done_ccb); return; } /* abort_all_pending() waits for pending queue to be empty */ if (TAILQ_EMPTY(&softc->pending_ccb_queue)) wakeup(&softc->pending_ccb_queue); switch (done_ccb->ccb_h.func_code) { /* All FC_*_QUEUED CCBs go back to userland */ case XPT_IMMED_NOTIFY: case XPT_IMMEDIATE_NOTIFY: case XPT_ACCEPT_TARGET_IO: case XPT_CONT_TARGET_IO: TAILQ_INSERT_TAIL(&softc->user_ccb_queue, &done_ccb->ccb_h, periph_links.tqe); cam_periph_unlock(softc->periph); notify_user(softc); cam_periph_lock(softc->periph); break; default: panic("targdone: impossible xpt opcode %#x", done_ccb->ccb_h.func_code); /* NOTREACHED */ } } /* Return CCBs to the user from the user queue and abort queue */ static int targread(struct cdev *dev, struct uio *uio, int ioflag) { struct descr_queue *abort_queue; struct targ_cmd_descr *user_descr; struct targ_softc *softc; struct ccb_queue *user_queue; struct ccb_hdr *ccb_h; union ccb *user_ccb; int read_len, error; error = 0; read_len = 0; devfs_get_cdevpriv((void **)&softc); user_queue = &softc->user_ccb_queue; abort_queue = &softc->abort_queue; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targread\n")); /* If no data is available, wait or return immediately */ cam_periph_lock(softc->periph); ccb_h = TAILQ_FIRST(user_queue); user_descr = TAILQ_FIRST(abort_queue); while (ccb_h == NULL && user_descr == NULL) { if ((ioflag & IO_NDELAY) == 0) { error = cam_periph_sleep(softc->periph, user_queue, PRIBIO | PCATCH, "targrd", 0); ccb_h = TAILQ_FIRST(user_queue); user_descr = TAILQ_FIRST(abort_queue); if (error != 0) { if (error == ERESTART) { continue; } else { goto read_fail; } } } else { cam_periph_unlock(softc->periph); return (EAGAIN); } } /* Data is available so fill the user's buffer */ while (ccb_h != NULL) { struct targ_cmd_descr *descr; if (uio->uio_resid < sizeof(user_ccb)) break; TAILQ_REMOVE(user_queue, ccb_h, periph_links.tqe); descr = (struct targ_cmd_descr *)ccb_h->targ_descr; user_ccb = descr->user_ccb; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targread ccb %p (%p)\n", ccb_h, user_ccb)); error = targreturnccb(softc, (union ccb *)ccb_h); if (error != 0) goto read_fail; cam_periph_unlock(softc->periph); error = uiomove((caddr_t)&user_ccb, sizeof(user_ccb), uio); cam_periph_lock(softc->periph); if (error != 0) goto read_fail; read_len += sizeof(user_ccb); ccb_h = TAILQ_FIRST(user_queue); } /* Flush out any aborted descriptors */ while (user_descr != NULL) { if (uio->uio_resid < sizeof(user_ccb)) break; TAILQ_REMOVE(abort_queue, user_descr, tqe); user_ccb = user_descr->user_ccb; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targread aborted descr %p (%p)\n", user_descr, user_ccb)); if (suword(&user_ccb->ccb_h.status, CAM_REQ_ABORTED) != 0) { error = EFAULT; goto read_fail; } cam_periph_unlock(softc->periph); error = uiomove((caddr_t)&user_ccb, sizeof(user_ccb), uio); cam_periph_lock(softc->periph); if (error != 0) 
goto read_fail; read_len += sizeof(user_ccb); user_descr = TAILQ_FIRST(abort_queue); } /* * If we've successfully read some amount of data, don't report an * error. If the error is persistent, it will be reported on the * next read(). */ if (read_len == 0 && uio->uio_resid != 0) error = ENOSPC; read_fail: cam_periph_unlock(softc->periph); return (error); } /* Copy completed ccb back to the user */ static int targreturnccb(struct targ_softc *softc, union ccb *ccb) { struct targ_cmd_descr *descr; struct ccb_hdr *u_ccbh; size_t ccb_len; int error; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("targreturnccb %p\n", ccb)); descr = (struct targ_cmd_descr *)ccb->ccb_h.targ_descr; u_ccbh = &descr->user_ccb->ccb_h; /* Copy out the central portion of the ccb_hdr */ error = copyout(&ccb->ccb_h.retry_count, &u_ccbh->retry_count, offsetof(struct ccb_hdr, periph_priv) - offsetof(struct ccb_hdr, retry_count)); if (error != 0) { xpt_print(softc->path, "targreturnccb - CCB header copyout failed (%d)\n", error); } /* Copy out the rest of the ccb (after the ccb_hdr) */ ccb_len = targccblen(ccb->ccb_h.func_code) - sizeof(struct ccb_hdr); if (descr->mapinfo.num_bufs_used != 0) { int error1; error1 = cam_periph_unmapmem(ccb, &descr->mapinfo); if (error == 0) error = error1; } if (error == 0) { error = copyout(&ccb->ccb_h + 1, u_ccbh + 1, ccb_len); if (error != 0) { xpt_print(softc->path, "targreturnccb - CCB copyout failed (%d)\n", error); } } /* Free CCB or send back to devq. */ targfreeccb(softc, ccb); return (error); } static union ccb * targgetccb(struct targ_softc *softc, xpt_opcode type, int priority) { union ccb *ccb; int ccb_len; ccb_len = targccblen(type); ccb = malloc(ccb_len, M_TARG, M_NOWAIT | M_ZERO); CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("getccb %p\n", ccb)); if (ccb == NULL) { return (ccb); } xpt_setup_ccb(&ccb->ccb_h, softc->path, priority); ccb->ccb_h.func_code = type; ccb->ccb_h.cbfcnp = targdone; ccb->ccb_h.targ_descr = targgetdescr(softc); if (ccb->ccb_h.targ_descr == NULL) { free (ccb, M_TARG); ccb = NULL; } return (ccb); } static void targfreeccb(struct targ_softc *softc, union ccb *ccb) { CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH, ("targfreeccb descr %p and\n", ccb->ccb_h.targ_descr)); free(ccb->ccb_h.targ_descr, M_TARG); switch (ccb->ccb_h.func_code) { case XPT_ACCEPT_TARGET_IO: case XPT_IMMED_NOTIFY: case XPT_IMMEDIATE_NOTIFY: CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH, ("freeing ccb %p\n", ccb)); free(ccb, M_TARG); break; default: /* Send back CCB if we got it from the periph */ if (XPT_FC_IS_QUEUED(ccb)) { CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH, ("returning queued ccb %p\n", ccb)); xpt_release_ccb(ccb); } else { CAM_DEBUG_PRINT(CAM_DEBUG_PERIPH, ("freeing ccb %p\n", ccb)); free(ccb, M_TARG); } break; } } static struct targ_cmd_descr * targgetdescr(struct targ_softc *softc) { struct targ_cmd_descr *descr; descr = malloc(sizeof(*descr), M_TARG, M_NOWAIT); if (descr) { descr->mapinfo.num_bufs_used = 0; } return (descr); } static void targinit(void) { struct cdev *dev; /* Add symbolic link to targ0 for compatibility. */ dev = make_dev(&targ_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "targ"); make_dev_alias(dev, "targ0"); } static void targasync(void *callback_arg, uint32_t code, struct cam_path *path, void *arg) { /* All events are handled in usermode by INOTs */ panic("targasync() called, should be an INOT instead"); } /* Cancel all pending requests and CCBs awaiting work. 
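 * Descriptors still waiting for CCB resources are moved to the abort queue
 * for the user to collect; CCBs already submitted are aborted with XPT_ABORT
 * and come back through targdone().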
*/ static void abort_all_pending(struct targ_softc *softc) { struct targ_cmd_descr *descr; struct ccb_abort cab; struct ccb_hdr *ccb_h; CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("abort_all_pending\n")); /* First abort the descriptors awaiting resources */ while ((descr = TAILQ_FIRST(&softc->work_queue)) != NULL) { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Aborting descr from workq %p\n", descr)); TAILQ_REMOVE(&softc->work_queue, descr, tqe); TAILQ_INSERT_TAIL(&softc->abort_queue, descr, tqe); } /* * Then abort all pending CCBs. * targdone() will return the aborted CCB via user_ccb_queue */ memset(&cab, 0, sizeof(cab)); xpt_setup_ccb(&cab.ccb_h, softc->path, CAM_PRIORITY_NORMAL); cab.ccb_h.func_code = XPT_ABORT; cab.ccb_h.status = CAM_REQ_CMP_ERR; TAILQ_FOREACH(ccb_h, &softc->pending_ccb_queue, periph_links.tqe) { CAM_DEBUG(softc->path, CAM_DEBUG_PERIPH, ("Aborting pending CCB %p\n", ccb_h)); cab.abort_ccb = (union ccb *)ccb_h; xpt_action((union ccb *)&cab); if (cab.ccb_h.status != CAM_REQ_CMP) { xpt_print(cab.ccb_h.path, "Unable to abort CCB, status %#x\n", cab.ccb_h.status); } } /* If we aborted at least one pending CCB ok, wait for it. */ if (cab.ccb_h.status == CAM_REQ_CMP) { cam_periph_sleep(softc->periph, &softc->pending_ccb_queue, PRIBIO | PCATCH, "tgabrt", 0); } /* If we aborted anything from the work queue, wakeup user. */ if (!TAILQ_EMPTY(&softc->user_ccb_queue) || !TAILQ_EMPTY(&softc->abort_queue)) { cam_periph_unlock(softc->periph); notify_user(softc); cam_periph_lock(softc->periph); } } /* Notify the user that data is ready */ static void notify_user(struct targ_softc *softc) { /* * Notify users sleeping via poll(), kqueue(), and * blocking read(). */ selwakeuppri(&softc->read_select, PRIBIO); KNOTE_UNLOCKED(&softc->read_select.si_note, 0); wakeup(&softc->user_ccb_queue); } /* Convert CAM status to errno values */ static int targcamstatus(cam_status status) { switch (status & CAM_STATUS_MASK) { case CAM_REQ_CMP: /* CCB request completed without error */ return (0); case CAM_REQ_INPROG: /* CCB request is in progress */ return (EINPROGRESS); case CAM_REQ_CMP_ERR: /* CCB request completed with an error */ return (EIO); case CAM_PROVIDE_FAIL: /* Unable to provide requested capability */ return (ENOTTY); case CAM_FUNC_NOTAVAIL: /* The requested function is not available */ return (ENOTSUP); case CAM_LUN_ALRDY_ENA: /* LUN is already enabled for target mode */ return (EADDRINUSE); case CAM_PATH_INVALID: /* Supplied Path ID is invalid */ case CAM_DEV_NOT_THERE: /* SCSI Device Not Installed/there */ return (ENOENT); case CAM_REQ_ABORTED: /* CCB request aborted by the host */ return (ECANCELED); case CAM_CMD_TIMEOUT: /* Command timeout */ return (ETIMEDOUT); case CAM_REQUEUE_REQ: /* Requeue to preserve transaction ordering */ return (EAGAIN); case CAM_REQ_INVALID: /* CCB request was invalid */ return (EINVAL); case CAM_RESRC_UNAVAIL: /* Resource Unavailable */ return (ENOMEM); case CAM_BUSY: /* CAM subsystem is busy */ case CAM_UA_ABORT: /* Unable to abort CCB request */ return (EBUSY); default: return (ENXIO); } } static size_t targccblen(xpt_opcode func_code) { int len; /* Codes we expect to see as a target */ switch (func_code) { case XPT_CONT_TARGET_IO: case XPT_SCSI_IO: len = sizeof(struct ccb_scsiio); break; case XPT_ACCEPT_TARGET_IO: len = sizeof(struct ccb_accept_tio); break; case XPT_IMMED_NOTIFY: len = sizeof(struct ccb_immed_notify); break; case XPT_IMMEDIATE_NOTIFY: len = sizeof(struct ccb_immediate_notify); break; case XPT_REL_SIMQ: len = sizeof(struct ccb_relsim); break; 
case XPT_PATH_INQ: len = sizeof(struct ccb_pathinq); break; case XPT_DEBUG: len = sizeof(struct ccb_debug); break; case XPT_ABORT: len = sizeof(struct ccb_abort); break; case XPT_EN_LUN: len = sizeof(struct ccb_en_lun); break; default: len = sizeof(union ccb); break; } return (len); } diff --git a/sys/compat/linuxkpi/common/include/linux/file.h b/sys/compat/linuxkpi/common/include/linux/file.h index f94e3d89ced1..f6e988c2d88e 100644 --- a/sys/compat/linuxkpi/common/include/linux/file.h +++ b/sys/compat/linuxkpi/common/include/linux/file.h @@ -1,185 +1,185 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013-2017 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _LINUXKPI_LINUX_FILE_H_ #define _LINUXKPI_LINUX_FILE_H_ #include #include #include #include #include #include #include #include struct linux_file; #undef file -extern struct fileops linuxfileops; +extern const struct fileops linuxfileops; static inline struct linux_file * linux_fget(unsigned int fd) { struct file *file; /* lookup file pointer by file descriptor index */ if (fget_unlocked(curthread, fd, &cap_no_rights, &file) != 0) return (NULL); /* check if file handle really belongs to us */ if (file->f_data == NULL || file->f_ops != &linuxfileops) { fdrop(file, curthread); return (NULL); } return ((struct linux_file *)file->f_data); } extern void linux_file_free(struct linux_file *filp); static inline void fput(struct linux_file *filp) { if (refcount_release(filp->_file == NULL ? &filp->f_count : &filp->_file->f_count)) { linux_file_free(filp); } } static inline unsigned int file_count(struct linux_file *filp) { return (filp->_file == NULL ? filp->f_count : filp->_file->f_count); } static inline void put_unused_fd(unsigned int fd) { struct file *file; if (fget_unlocked(curthread, fd, &cap_no_rights, &file) != 0) { return; } /* * NOTE: We should only get here when the "fd" has not been * installed, so no need to free the associated Linux file * structure. 
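	 * The reference obtained by fget_unlocked() above is dropped right
	 * after the descriptor slot is released.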
*/ fdclose(curthread, file, fd); /* drop extra reference */ fdrop(file, curthread); } static inline void fd_install(unsigned int fd, struct linux_file *filp) { struct file *file; if (fget_unlocked(curthread, fd, &cap_no_rights, &file) != 0) { filp->_file = NULL; } else { filp->_file = file; finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); /* transfer reference count from "filp" to "file" */ while (refcount_release(&filp->f_count) == 0) refcount_acquire(&file->f_count); } /* drop the extra reference */ fput(filp); } static inline int get_unused_fd(void) { struct file *file; int error; int fd; error = falloc(curthread, &file, &fd, 0); if (error) return -error; /* drop the extra reference */ fdrop(file, curthread); return fd; } static inline int get_unused_fd_flags(int flags) { struct file *file; int error; int fd; error = falloc(curthread, &file, &fd, flags); if (error) return -error; /* drop the extra reference */ fdrop(file, curthread); return fd; } extern struct linux_file *linux_file_alloc(void); static inline struct linux_file * alloc_file(int mode, const struct file_operations *fops) { struct linux_file *filp; filp = linux_file_alloc(); filp->f_op = fops; filp->f_mode = mode; return (filp); } struct fd { struct linux_file *linux_file; }; static inline void fdput(struct fd fd) { fput(fd.linux_file); } static inline struct fd fdget(unsigned int fd) { struct linux_file *f = linux_fget(fd); return (struct fd){f}; } #define file linux_file #define fget(...) linux_fget(__VA_ARGS__) #endif /* _LINUXKPI_LINUX_FILE_H_ */ diff --git a/sys/compat/linuxkpi/common/src/linux_compat.c b/sys/compat/linuxkpi/common/src/linux_compat.c index fe1a545c6a3a..1fc71c55469a 100644 --- a/sys/compat/linuxkpi/common/src/linux_compat.c +++ b/sys/compat/linuxkpi/common/src/linux_compat.c @@ -1,2876 +1,2876 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013-2021 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include "opt_global.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif #include #ifdef XENHVM #undef xen_pv_domain #undef xen_initial_domain /* xen/xen-os.h redefines __must_check */ #undef __must_check #include #endif SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "LinuxKPI parameters"); int linuxkpi_debug; SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN, &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable."); int linuxkpi_rcu_debug; SYSCTL_INT(_compat_linuxkpi, OID_AUTO, rcu_debug, CTLFLAG_RWTUN, &linuxkpi_rcu_debug, 0, "Set to enable RCU warning. Clear to disable."); int linuxkpi_warn_dump_stack = 0; SYSCTL_INT(_compat_linuxkpi, OID_AUTO, warn_dump_stack, CTLFLAG_RWTUN, &linuxkpi_warn_dump_stack, 0, "Set to enable stack traces from WARN_ON(). Clear to disable."); static struct timeval lkpi_net_lastlog; static int lkpi_net_curpps; static int lkpi_net_maxpps = 99; SYSCTL_INT(_compat_linuxkpi, OID_AUTO, net_ratelimit, CTLFLAG_RWTUN, &lkpi_net_maxpps, 0, "Limit number of LinuxKPI net messages per second."); MALLOC_DEFINE(M_KMALLOC, "lkpikmalloc", "Linux kmalloc compat"); #include /* Undo Linux compat changes. 
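 * The LinuxKPI headers #define identifiers such as file and cdev (see
 * linux/file.h); undefine them here so this file can use the native
 * FreeBSD names.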
*/ #undef RB_ROOT #undef file #undef cdev #define RB_ROOT(head) (head)->rbh_root static void linux_destroy_dev(struct linux_cdev *); static void linux_cdev_deref(struct linux_cdev *ldev); static struct vm_area_struct *linux_cdev_handle_find(void *handle); cpumask_t cpu_online_mask; static cpumask_t **static_single_cpu_mask; static cpumask_t *static_single_cpu_mask_lcs; struct kobject linux_class_root; struct device linux_root_device; struct class linux_class_misc; struct list_head pci_drivers; struct list_head pci_devices; spinlock_t pci_lock; struct uts_namespace init_uts_ns; unsigned long linux_timer_hz_mask; wait_queue_head_t linux_bit_waitq; wait_queue_head_t linux_var_waitq; int panic_cmp(struct rb_node *one, struct rb_node *two) { panic("no cmp"); } RB_GENERATE(linux_root, rb_node, __entry, panic_cmp); #define START(node) ((node)->start) #define LAST(node) ((node)->last) INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long,, START, LAST,, lkpi_interval_tree) static void linux_device_release(struct device *dev) { pr_debug("linux_device_release: %s\n", dev_name(dev)); kfree(dev); } static ssize_t linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct class_attribute *dattr; ssize_t error; dattr = container_of(attr, struct class_attribute, attr); error = -EIO; if (dattr->show) error = dattr->show(container_of(kobj, struct class, kobj), dattr, buf); return (error); } static ssize_t linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct class_attribute *dattr; ssize_t error; dattr = container_of(attr, struct class_attribute, attr); error = -EIO; if (dattr->store) error = dattr->store(container_of(kobj, struct class, kobj), dattr, buf, count); return (error); } static void linux_class_release(struct kobject *kobj) { struct class *class; class = container_of(kobj, struct class, kobj); if (class->class_release) class->class_release(class); } static const struct sysfs_ops linux_class_sysfs = { .show = linux_class_show, .store = linux_class_store, }; const struct kobj_type linux_class_ktype = { .release = linux_class_release, .sysfs_ops = &linux_class_sysfs }; static void linux_dev_release(struct kobject *kobj) { struct device *dev; dev = container_of(kobj, struct device, kobj); /* This is the precedence defined by linux. */ if (dev->release) dev->release(dev); else if (dev->class && dev->class->dev_release) dev->class->dev_release(dev); } static ssize_t linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct device_attribute *dattr; ssize_t error; dattr = container_of(attr, struct device_attribute, attr); error = -EIO; if (dattr->show) error = dattr->show(container_of(kobj, struct device, kobj), dattr, buf); return (error); } static ssize_t linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct device_attribute *dattr; ssize_t error; dattr = container_of(attr, struct device_attribute, attr); error = -EIO; if (dattr->store) error = dattr->store(container_of(kobj, struct device, kobj), dattr, buf, count); return (error); } static const struct sysfs_ops linux_dev_sysfs = { .show = linux_dev_show, .store = linux_dev_store, }; const struct kobj_type linux_dev_ktype = { .release = linux_dev_release, .sysfs_ops = &linux_dev_sysfs }; struct device * device_create(struct class *class, struct device *parent, dev_t devt, void *drvdata, const char *fmt, ...) 
{ struct device *dev; va_list args; dev = kzalloc(sizeof(*dev), M_WAITOK); dev->parent = parent; dev->class = class; dev->devt = devt; dev->driver_data = drvdata; dev->release = linux_device_release; va_start(args, fmt); kobject_set_name_vargs(&dev->kobj, fmt, args); va_end(args); device_register(dev); return (dev); } struct device * device_create_groups_vargs(struct class *class, struct device *parent, dev_t devt, void *drvdata, const struct attribute_group **groups, const char *fmt, va_list args) { struct device *dev = NULL; int retval = -ENODEV; if (class == NULL || IS_ERR(class)) goto error; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { retval = -ENOMEM; goto error; } dev->devt = devt; dev->class = class; dev->parent = parent; dev->groups = groups; dev->release = device_create_release; /* device_initialize() needs the class and parent to be set */ device_initialize(dev); dev_set_drvdata(dev, drvdata); retval = kobject_set_name_vargs(&dev->kobj, fmt, args); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } struct class * class_create(struct module *owner, const char *name) { struct class *class; int error; class = kzalloc(sizeof(*class), M_WAITOK); class->owner = owner; class->name = name; class->class_release = linux_class_kfree; error = class_register(class); if (error) { kfree(class); return (NULL); } return (class); } static void linux_kq_lock(void *arg) { spinlock_t *s = arg; spin_lock(s); } static void linux_kq_unlock(void *arg) { spinlock_t *s = arg; spin_unlock(s); } static void linux_kq_assert_lock(void *arg, int what) { #ifdef INVARIANTS spinlock_t *s = arg; if (what == LA_LOCKED) mtx_assert(s, MA_OWNED); else mtx_assert(s, MA_NOTOWNED); #endif } static void linux_file_kqfilter_poll(struct linux_file *, int); struct linux_file * linux_file_alloc(void) { struct linux_file *filp; filp = kzalloc(sizeof(*filp), GFP_KERNEL); /* set initial refcount */ filp->f_count = 1; /* setup fields needed by kqueue support */ spin_lock_init(&filp->f_kqlock); knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock, linux_kq_lock, linux_kq_unlock, linux_kq_assert_lock); return (filp); } void linux_file_free(struct linux_file *filp) { if (filp->_file == NULL) { if (filp->f_op != NULL && filp->f_op->release != NULL) filp->f_op->release(filp->f_vnode, filp); if (filp->f_shmem != NULL) vm_object_deallocate(filp->f_shmem); kfree_rcu(filp, rcu); } else { /* * The close method of the character device or file * will free the linux_file structure: */ _fdrop(filp->_file, curthread); } } struct linux_cdev * cdev_alloc(void) { struct linux_cdev *cdev; cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK); kobject_init(&cdev->kobj, &linux_cdev_ktype); cdev->refs = 1; return (cdev); } static int linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct vm_area_struct *vmap; vmap = linux_cdev_handle_find(vm_obj->handle); MPASS(vmap != NULL); MPASS(vmap->vm_private_data == vm_obj->handle); if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) { vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset; vm_page_t page; if (((*mres)->flags & PG_FICTITIOUS) != 0) { /* * If the passed in result page is a fake * page, update it with the new physical * address. */ page = *mres; vm_page_updatefake(page, paddr, vm_obj->memattr); } else { /* * Replace the passed in "mres" page with our * own fake page and free up the all of the * original pages. 
*/ VM_OBJECT_WUNLOCK(vm_obj); page = vm_page_getfake(paddr, vm_obj->memattr); VM_OBJECT_WLOCK(vm_obj); vm_page_replace(page, vm_obj, (*mres)->pindex, *mres); *mres = page; } vm_page_valid(page); return (VM_PAGER_OK); } return (VM_PAGER_FAIL); } static int linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { struct vm_area_struct *vmap; int err; /* get VM area structure */ vmap = linux_cdev_handle_find(vm_obj->handle); MPASS(vmap != NULL); MPASS(vmap->vm_private_data == vm_obj->handle); VM_OBJECT_WUNLOCK(vm_obj); linux_set_current(curthread); down_write(&vmap->vm_mm->mmap_sem); if (unlikely(vmap->vm_ops == NULL)) { err = VM_FAULT_SIGBUS; } else { struct vm_fault vmf; /* fill out VM fault structure */ vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx); vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0; vmf.pgoff = 0; vmf.page = NULL; vmf.vma = vmap; vmap->vm_pfn_count = 0; vmap->vm_pfn_pcount = &vmap->vm_pfn_count; vmap->vm_obj = vm_obj; err = vmap->vm_ops->fault(&vmf); while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) { kern_yield(PRI_USER); err = vmap->vm_ops->fault(&vmf); } } /* translate return code */ switch (err) { case VM_FAULT_OOM: err = VM_PAGER_AGAIN; break; case VM_FAULT_SIGBUS: err = VM_PAGER_BAD; break; case VM_FAULT_NOPAGE: /* * By contract the fault handler will return having * busied all the pages itself. If pidx is already * found in the object, it will simply xbusy the first * page and return with vm_pfn_count set to 1. */ *first = vmap->vm_pfn_first; *last = *first + vmap->vm_pfn_count - 1; err = VM_PAGER_OK; break; default: err = VM_PAGER_ERROR; break; } up_write(&vmap->vm_mm->mmap_sem); VM_OBJECT_WLOCK(vm_obj); return (err); } static struct rwlock linux_vma_lock; static TAILQ_HEAD(, vm_area_struct) linux_vma_head = TAILQ_HEAD_INITIALIZER(linux_vma_head); static void linux_cdev_handle_free(struct vm_area_struct *vmap) { /* Drop reference on vm_file */ if (vmap->vm_file != NULL) fput(vmap->vm_file); /* Drop reference on mm_struct */ mmput(vmap->vm_mm); kfree(vmap); } static void linux_cdev_handle_remove(struct vm_area_struct *vmap) { rw_wlock(&linux_vma_lock); TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry); rw_wunlock(&linux_vma_lock); } static struct vm_area_struct * linux_cdev_handle_find(void *handle) { struct vm_area_struct *vmap; rw_rlock(&linux_vma_lock); TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) { if (vmap->vm_private_data == handle) break; } rw_runlock(&linux_vma_lock); return (vmap); } static int linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { MPASS(linux_cdev_handle_find(handle) != NULL); *color = 0; return (0); } static void linux_cdev_pager_dtor(void *handle) { const struct vm_operations_struct *vm_ops; struct vm_area_struct *vmap; vmap = linux_cdev_handle_find(handle); MPASS(vmap != NULL); /* * Remove handle before calling close operation to prevent * other threads from reusing the handle pointer. 
*/ linux_cdev_handle_remove(vmap); down_write(&vmap->vm_mm->mmap_sem); vm_ops = vmap->vm_ops; if (likely(vm_ops != NULL)) vm_ops->close(vmap); up_write(&vmap->vm_mm->mmap_sem); linux_cdev_handle_free(vmap); } static struct cdev_pager_ops linux_cdev_pager_ops[2] = { { /* OBJT_MGTDEVICE */ .cdev_pg_populate = linux_cdev_pager_populate, .cdev_pg_ctor = linux_cdev_pager_ctor, .cdev_pg_dtor = linux_cdev_pager_dtor }, { /* OBJT_DEVICE */ .cdev_pg_fault = linux_cdev_pager_fault, .cdev_pg_ctor = linux_cdev_pager_ctor, .cdev_pg_dtor = linux_cdev_pager_dtor }, }; int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { vm_object_t obj; vm_page_t m; obj = vma->vm_obj; if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0) return (-ENOTSUP); VM_OBJECT_RLOCK(obj); for (m = vm_page_find_least(obj, OFF_TO_IDX(address)); m != NULL && m->pindex < OFF_TO_IDX(address + size); m = TAILQ_NEXT(m, listq)) pmap_remove_all(m); VM_OBJECT_RUNLOCK(obj); return (0); } void vma_set_file(struct vm_area_struct *vma, struct linux_file *file) { struct linux_file *tmp; /* Changing an anonymous vma with this is illegal */ get_file(file); tmp = vma->vm_file; vma->vm_file = file; fput(tmp); } static struct file_operations dummy_ldev_ops = { /* XXXKIB */ }; static struct linux_cdev dummy_ldev = { .ops = &dummy_ldev_ops, }; #define LDEV_SI_DTR 0x0001 #define LDEV_SI_REF 0x0002 static void linux_get_fop(struct linux_file *filp, const struct file_operations **fop, struct linux_cdev **dev) { struct linux_cdev *ldev; u_int siref; ldev = filp->f_cdev; *fop = filp->f_op; if (ldev != NULL) { if (ldev->kobj.ktype == &linux_cdev_static_ktype) { refcount_acquire(&ldev->refs); } else { for (siref = ldev->siref;;) { if ((siref & LDEV_SI_DTR) != 0) { ldev = &dummy_ldev; *fop = ldev->ops; siref = ldev->siref; MPASS((ldev->siref & LDEV_SI_DTR) == 0); } else if (atomic_fcmpset_int(&ldev->siref, &siref, siref + LDEV_SI_REF)) { break; } } } } *dev = ldev; } static void linux_drop_fop(struct linux_cdev *ldev) { if (ldev == NULL) return; if (ldev->kobj.ktype == &linux_cdev_static_ktype) { linux_cdev_deref(ldev); } else { MPASS(ldev->kobj.ktype == &linux_cdev_ktype); MPASS((ldev->siref & ~LDEV_SI_DTR) != 0); atomic_subtract_int(&ldev->siref, LDEV_SI_REF); } } #define OPW(fp,td,code) ({ \ struct file *__fpop; \ __typeof(code) __retval; \ \ __fpop = (td)->td_fpop; \ (td)->td_fpop = (fp); \ __retval = (code); \ (td)->td_fpop = __fpop; \ __retval; \ }) static int linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, struct file *file) { struct linux_cdev *ldev; struct linux_file *filp; const struct file_operations *fop; int error; ldev = dev->si_drv1; filp = linux_file_alloc(); filp->f_dentry = &filp->f_dentry_store; filp->f_op = ldev->ops; filp->f_mode = file->f_flag; filp->f_flags = file->f_flag; filp->f_vnode = file->f_vnode; filp->_file = file; refcount_acquire(&ldev->refs); filp->f_cdev = ldev; linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->open != NULL) { error = -fop->open(file->f_vnode, filp); if (error != 0) { linux_drop_fop(ldev); linux_cdev_deref(filp->f_cdev); kfree(filp); return (error); } } /* hold on to the vnode - used for fstat() */ vhold(filp->f_vnode); /* release the file from devfs */ finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); linux_drop_fop(ldev); return (ENXIO); } #define LINUX_IOCTL_MIN_PTR 0x10000UL #define LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX) static inline int linux_remap_address(void **uaddr, size_t len) { uintptr_t uaddr_val = 
(uintptr_t)(*uaddr); if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR && uaddr_val < LINUX_IOCTL_MAX_PTR)) { struct task_struct *pts = current; if (pts == NULL) { *uaddr = NULL; return (1); } /* compute data offset */ uaddr_val -= LINUX_IOCTL_MIN_PTR; /* check that length is within bounds */ if ((len > IOCPARM_MAX) || (uaddr_val + len) > pts->bsd_ioctl_len) { *uaddr = NULL; return (1); } /* re-add kernel buffer address */ uaddr_val += (uintptr_t)pts->bsd_ioctl_data; /* update address location */ *uaddr = (void *)uaddr_val; return (1); } return (0); } int linux_copyin(const void *uaddr, void *kaddr, size_t len) { if (linux_remap_address(__DECONST(void **, &uaddr), len)) { if (uaddr == NULL) return (-EFAULT); memcpy(kaddr, uaddr, len); return (0); } return (-copyin(uaddr, kaddr, len)); } int linux_copyout(const void *kaddr, void *uaddr, size_t len) { if (linux_remap_address(&uaddr, len)) { if (uaddr == NULL) return (-EFAULT); memcpy(uaddr, kaddr, len); return (0); } return (-copyout(kaddr, uaddr, len)); } size_t linux_clear_user(void *_uaddr, size_t _len) { uint8_t *uaddr = _uaddr; size_t len = _len; /* make sure uaddr is aligned before going into the fast loop */ while (((uintptr_t)uaddr & 7) != 0 && len > 7) { if (subyte(uaddr, 0)) return (_len); uaddr++; len--; } /* zero 8 bytes at a time */ while (len > 7) { #ifdef __LP64__ if (suword64(uaddr, 0)) return (_len); #else if (suword32(uaddr, 0)) return (_len); if (suword32(uaddr + 4, 0)) return (_len); #endif uaddr += 8; len -= 8; } /* zero fill end, if any */ while (len > 0) { if (subyte(uaddr, 0)) return (_len); uaddr++; len--; } return (0); } int linux_access_ok(const void *uaddr, size_t len) { uintptr_t saddr; uintptr_t eaddr; /* get start and end address */ saddr = (uintptr_t)uaddr; eaddr = (uintptr_t)uaddr + len; /* verify addresses are valid for userspace */ return ((saddr == eaddr) || (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS)); } /* * This function should return either EINTR or ERESTART depending on * the signal type sent to this thread: */ static int linux_get_error(struct task_struct *task, int error) { /* check for signal type interrupt code */ if (error == EINTR || error == ERESTARTSYS || error == ERESTART) { error = -linux_schedule_get_interrupt_value(task); if (error == 0) error = EINTR; } return (error); } static int linux_file_ioctl_sub(struct file *fp, struct linux_file *filp, const struct file_operations *fop, u_long cmd, caddr_t data, struct thread *td) { struct task_struct *task = current; unsigned size; int error; size = IOCPARM_LEN(cmd); /* refer to logic in sys_ioctl() */ if (size > 0) { /* * Setup hint for linux_copyin() and linux_copyout(). * * Background: Linux code expects a user-space address * while FreeBSD supplies a kernel-space address. 
*/ task->bsd_ioctl_data = data; task->bsd_ioctl_len = size; data = (void *)LINUX_IOCTL_MIN_PTR; } else { /* fetch user-space pointer */ data = *(void **)data; } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { /* try the compat IOCTL handler first */ if (fop->compat_ioctl != NULL) { error = -OPW(fp, td, fop->compat_ioctl(filp, cmd, (u_long)data)); } else { error = ENOTTY; } /* fallback to the regular IOCTL handler, if any */ if (error == ENOTTY && fop->unlocked_ioctl != NULL) { error = -OPW(fp, td, fop->unlocked_ioctl(filp, cmd, (u_long)data)); } } else #endif { if (fop->unlocked_ioctl != NULL) { error = -OPW(fp, td, fop->unlocked_ioctl(filp, cmd, (u_long)data)); } else { error = ENOTTY; } } if (size > 0) { task->bsd_ioctl_data = NULL; task->bsd_ioctl_len = 0; } if (error == EWOULDBLOCK) { /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE); } else { error = linux_get_error(task, error); } return (error); } #define LINUX_POLL_TABLE_NORMAL ((poll_table *)1) /* * This function atomically updates the poll wakeup state and returns * the previous state at the time of update. */ static uint8_t linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate) { int c, old; c = v->counter; while ((old = atomic_cmpxchg(v, c, pstate[c])) != c) c = old; return (c); } static int linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */ [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY, [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */ }; struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_QUEUED: linux_poll_wakeup(filp); return (1); default: return (0); } } void linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY, [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */ [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED, }; /* check if we are called inside the select system call */ if (p == LINUX_POLL_TABLE_NORMAL) selrecord(curthread, &filp->f_selinfo); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_INIT: /* NOTE: file handles can only belong to one wait-queue */ filp->f_wait_queue.wqh = wqh; filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback; add_wait_queue(wqh, &filp->f_wait_queue.wq); atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED); break; default: break; } } static void linux_poll_wait_dequeue(struct linux_file *filp) { static const uint8_t state[LINUX_FWQ_STATE_MAX] = { [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */ [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT, [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT, [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT, }; seldrain(&filp->f_selinfo); switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { case LINUX_FWQ_STATE_NOT_READY: case LINUX_FWQ_STATE_QUEUED: case LINUX_FWQ_STATE_READY: remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq); break; default: break; } } void linux_poll_wakeup(struct linux_file *filp) { /* this function should be 
NULL-safe */ if (filp == NULL) return; selwakeup(&filp->f_selinfo); spin_lock(&filp->f_kqlock); filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ | LINUX_KQ_FLAG_NEED_WRITE; /* make sure the "knote" gets woken up */ KNOTE_LOCKED(&filp->f_selinfo.si_note, 1); spin_unlock(&filp->f_kqlock); } static void linux_file_kqfilter_detach(struct knote *kn) { struct linux_file *filp = kn->kn_hook; spin_lock(&filp->f_kqlock); knlist_remove(&filp->f_selinfo.si_note, kn, 1); spin_unlock(&filp->f_kqlock); } static int linux_file_kqfilter_read_event(struct knote *kn, long hint) { struct linux_file *filp = kn->kn_hook; mtx_assert(&filp->f_kqlock, MA_OWNED); return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0); } static int linux_file_kqfilter_write_event(struct knote *kn, long hint) { struct linux_file *filp = kn->kn_hook; mtx_assert(&filp->f_kqlock, MA_OWNED); return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0); } -static struct filterops linux_dev_kqfiltops_read = { +static const struct filterops linux_dev_kqfiltops_read = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_read_event, }; -static struct filterops linux_dev_kqfiltops_write = { +static const struct filterops linux_dev_kqfiltops_write = { .f_isfd = 1, .f_detach = linux_file_kqfilter_detach, .f_event = linux_file_kqfilter_write_event, }; static void linux_file_kqfilter_poll(struct linux_file *filp, int kqflags) { struct thread *td; const struct file_operations *fop; struct linux_cdev *ldev; int temp; if ((filp->f_kqflags & kqflags) == 0) return; td = curthread; linux_get_fop(filp, &fop, &ldev); /* get the latest polling state */ temp = OPW(filp->_file, td, fop->poll(filp, NULL)); linux_drop_fop(ldev); spin_lock(&filp->f_kqlock); /* clear kqflags */ filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ | LINUX_KQ_FLAG_NEED_WRITE); /* update kqflags */ if ((temp & (POLLIN | POLLOUT)) != 0) { if ((temp & POLLIN) != 0) filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ; if ((temp & POLLOUT) != 0) filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE; /* make sure the "knote" gets woken up */ KNOTE_LOCKED(&filp->f_selinfo.si_note, 0); } spin_unlock(&filp->f_kqlock); } static int linux_file_kqfilter(struct file *file, struct knote *kn) { struct linux_file *filp; struct thread *td; int error; td = curthread; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; if (filp->f_op->poll == NULL) return (EINVAL); spin_lock(&filp->f_kqlock); switch (kn->kn_filter) { case EVFILT_READ: filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ; kn->kn_fop = &linux_dev_kqfiltops_read; kn->kn_hook = filp; knlist_add(&filp->f_selinfo.si_note, kn, 1); error = 0; break; case EVFILT_WRITE: filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE; kn->kn_fop = &linux_dev_kqfiltops_write; kn->kn_hook = filp; knlist_add(&filp->f_selinfo.si_note, kn, 1); error = 0; break; default: error = EINVAL; break; } spin_unlock(&filp->f_kqlock); if (error == 0) { linux_set_current(td); /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE); } return (error); } static int linux_file_mmap_single(struct file *fp, const struct file_operations *fop, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot, bool is_shared, struct thread *td) { struct task_struct *task; struct vm_area_struct *vmap; struct mm_struct *mm; struct linux_file *filp; vm_memattr_t attr; int error; filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; if (fop->mmap == NULL) return (EOPNOTSUPP); 
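	/*
	 * Editorial sketch, not part of this change: the fop->mmap hook
	 * invoked further down is expected to describe the mapping in the
	 * vm_area_struct it is handed. A hypothetical LinuxKPI consumer
	 * (the names foo_mmap and foo_vm_ops are illustrative only) would do
	 * something like:
	 *
	 *	static int
	 *	foo_mmap(struct linux_file *file, struct vm_area_struct *vma)
	 *	{
	 *		vma->vm_ops = &foo_vm_ops;	(open/close/fault)
	 *		vma->vm_private_data = file->private_data;
	 *		return (0);
	 *	}
	 *
	 * Depending on whether vm_ops->fault is set, the code below then
	 * backs the mapping with an OBJT_MGTDEVICE or OBJT_DEVICE pager
	 * object via cdev_pager_allocate().
	 */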
linux_set_current(td); /* * The same VM object might be shared by multiple processes * and the mm_struct is usually freed when a process exits. * * The atomic reference below makes sure the mm_struct is * available as long as the vmap is in the linux_vma_head. */ task = current; mm = task->mm; if (atomic_inc_not_zero(&mm->mm_users) == 0) return (EINVAL); vmap = kzalloc(sizeof(*vmap), GFP_KERNEL); vmap->vm_start = 0; vmap->vm_end = size; vmap->vm_pgoff = *offset / PAGE_SIZE; vmap->vm_pfn = 0; vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL); if (is_shared) vmap->vm_flags |= VM_SHARED; vmap->vm_ops = NULL; vmap->vm_file = get_file(filp); vmap->vm_mm = mm; if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) { error = linux_get_error(task, EINTR); } else { error = -OPW(fp, td, fop->mmap(filp, vmap)); error = linux_get_error(task, error); up_write(&vmap->vm_mm->mmap_sem); } if (error != 0) { linux_cdev_handle_free(vmap); return (error); } attr = pgprot2cachemode(vmap->vm_page_prot); if (vmap->vm_ops != NULL) { struct vm_area_struct *ptr; void *vm_private_data; bool vm_no_fault; if (vmap->vm_ops->open == NULL || vmap->vm_ops->close == NULL || vmap->vm_private_data == NULL) { /* free allocated VM area struct */ linux_cdev_handle_free(vmap); return (EINVAL); } vm_private_data = vmap->vm_private_data; rw_wlock(&linux_vma_lock); TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) { if (ptr->vm_private_data == vm_private_data) break; } /* check if there is an existing VM area struct */ if (ptr != NULL) { /* check if the VM area structure is invalid */ if (ptr->vm_ops == NULL || ptr->vm_ops->open == NULL || ptr->vm_ops->close == NULL) { error = ESTALE; vm_no_fault = 1; } else { error = EEXIST; vm_no_fault = (ptr->vm_ops->fault == NULL); } } else { /* insert VM area structure into list */ TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry); error = 0; vm_no_fault = (vmap->vm_ops->fault == NULL); } rw_wunlock(&linux_vma_lock); if (error != 0) { /* free allocated VM area struct */ linux_cdev_handle_free(vmap); /* check for stale VM area struct */ if (error != EEXIST) return (error); } /* check if there is no fault handler */ if (vm_no_fault) { *object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE, &linux_cdev_pager_ops[1], size, nprot, *offset, td->td_ucred); } else { *object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE, &linux_cdev_pager_ops[0], size, nprot, *offset, td->td_ucred); } /* check if allocating the VM object failed */ if (*object == NULL) { if (error == 0) { /* remove VM area struct from list */ linux_cdev_handle_remove(vmap); /* free allocated VM area struct */ linux_cdev_handle_free(vmap); } return (EINVAL); } } else { struct sglist *sg; sg = sglist_alloc(1, M_WAITOK); sglist_append_phys(sg, (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len); *object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len, nprot, 0, td->td_ucred); linux_cdev_handle_free(vmap); if (*object == NULL) { sglist_free(sg); return (EINVAL); } } if (attr != VM_MEMATTR_DEFAULT) { VM_OBJECT_WLOCK(*object); vm_object_set_memattr(*object, attr); VM_OBJECT_WUNLOCK(*object); } *offset = 0; return (0); } struct cdevsw linuxcdevsw = { .d_version = D_VERSION, .d_fdopen = linux_dev_fdopen, .d_name = "lkpidev", }; static int linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; ssize_t bytes; int error; error = 0; filp = (struct linux_file *)file->f_data; filp->f_flags 
= file->f_flag; /* XXX no support for I/O vectors currently */ if (uio->uio_iovcnt != 1) return (EOPNOTSUPP); if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->read != NULL) { bytes = OPW(file, td, fop->read(filp, uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; } else { error = linux_get_error(current, -bytes); } } else error = ENXIO; /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ); linux_drop_fop(ldev); return (error); } static int linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; ssize_t bytes; int error; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; /* XXX no support for I/O vectors currently */ if (uio->uio_iovcnt != 1) return (EOPNOTSUPP); if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->write != NULL) { bytes = OPW(file, td, fop->write(filp, uio->uio_iov->iov_base, uio->uio_iov->iov_len, &uio->uio_offset)); if (bytes >= 0) { uio->uio_iov->iov_base = ((uint8_t *)uio->uio_iov->iov_base) + bytes; uio->uio_iov->iov_len -= bytes; uio->uio_resid -= bytes; error = 0; } else { error = linux_get_error(current, -bytes); } } else error = ENXIO; /* update kqfilter status, if any */ linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE); linux_drop_fop(ldev); return (error); } static int linux_file_poll(struct file *file, int events, struct ucred *active_cred, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; int revents; filp = (struct linux_file *)file->f_data; filp->f_flags = file->f_flag; linux_set_current(td); linux_get_fop(filp, &fop, &ldev); if (fop->poll != NULL) { revents = OPW(file, td, fop->poll(filp, LINUX_POLL_TABLE_NORMAL)) & events; } else { revents = 0; } linux_drop_fop(ldev); return (revents); } static int linux_file_close(struct file *file, struct thread *td) { struct linux_file *filp; int (*release)(struct inode *, struct linux_file *); const struct file_operations *fop; struct linux_cdev *ldev; int error; filp = (struct linux_file *)file->f_data; KASSERT(file_count(filp) == 0, ("File refcount(%d) is not zero", file_count(filp))); if (td == NULL) td = curthread; error = 0; filp->f_flags = file->f_flag; linux_set_current(td); linux_poll_wait_dequeue(filp); linux_get_fop(filp, &fop, &ldev); /* * Always use the real release function, if any, to avoid * leaking device resources: */ release = filp->f_op->release; if (release != NULL) error = -OPW(file, td, release(filp->f_vnode, filp)); funsetown(&filp->f_sigio); if (filp->f_vnode != NULL) vdrop(filp->f_vnode); linux_drop_fop(ldev); ldev = filp->f_cdev; if (ldev != NULL) linux_cdev_deref(ldev); linux_synchronize_rcu(RCU_TYPE_REGULAR); kfree(filp); return (error); } static int linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; struct fiodgname_arg *fgn; const char *p; int error, i; error = 0; filp = (struct linux_file *)fp->f_data; filp->f_flags = fp->f_flag; linux_get_fop(filp, &fop, &ldev); linux_set_current(td); switch (cmd) { 
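	/*
	 * Editorial sketch, not part of this change: for cmds that are not
	 * handled as FreeBSD FIO* ioctls below, linux_file_ioctl_sub() stages
	 * the kernel buffer in task->bsd_ioctl_data and hands the driver the
	 * LINUX_IOCTL_MIN_PTR cookie as its "user" pointer. A hypothetical
	 * consumer (foo_ioctl and FOO_GET_INFO are illustrative only) copies
	 * through that pointer exactly as it would on Linux:
	 *
	 *	static long
	 *	foo_ioctl(struct linux_file *file, unsigned int cmd,
	 *	    unsigned long arg)
	 *	{
	 *		struct foo_info info = {0};
	 *
	 *		if (cmd != FOO_GET_INFO)
	 *			return (-ENOTTY);
	 *		if (copy_to_user((void *)arg, &info, sizeof(info)))
	 *			return (-EFAULT);
	 *		return (0);
	 *	}
	 *
	 * copy_to_user()/copy_from_user() end up in linux_copyout()/
	 * linux_copyin(), where linux_remap_address() translates the cookie
	 * back to the staged kernel buffer.
	 */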
case FIONBIO: break; case FIOASYNC: if (fop->fasync == NULL) break; error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); break; case FIOSETOWN: error = fsetown(*(int *)data, &filp->f_sigio); if (error == 0) { if (fop->fasync == NULL) break; error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC)); } break; case FIOGETOWN: *(int *)data = fgetown(&filp->f_sigio); break; case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) { error = ENXIO; break; } fgn = data; p = devtoname(filp->f_cdev->cdev); i = strlen(p) + 1; if (i > fgn->len) { error = EINVAL; break; } error = copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i); break; default: error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td); break; } linux_drop_fop(ldev); return (error); } static int linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t maxprot, int flags, struct file *fp, vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp) { /* * Character devices do not provide private mappings * of any kind: */ if ((maxprot & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0) return (EINVAL); return (linux_file_mmap_single(fp, fop, foff, objsize, objp, (int)prot, (flags & MAP_SHARED) ? true : false, td)); } static int linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct linux_file *filp; const struct file_operations *fop; struct linux_cdev *ldev; struct mount *mp; struct vnode *vp; vm_object_t object; vm_prot_t maxprot; int error; filp = (struct linux_file *)fp->f_data; vp = filp->f_vnode; if (vp == NULL) return (EOPNOTSUPP); /* * Ensure that file and memory protections are * compatible. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE * requests rather than doing it here. 
*/ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; linux_get_fop(filp, &fop, &ldev); error = linux_file_mmap_sub(td, size, prot, maxprot, flags, fp, &foff, fop, &object); if (error != 0) goto out; error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); out: linux_drop_fop(ldev); return (error); } static int linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct linux_file *filp; struct vnode *vp; int error; filp = (struct linux_file *)fp->f_data; if (filp->f_vnode == NULL) return (EOPNOTSUPP); vp = filp->f_vnode; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_STAT(vp, sb, curthread->td_ucred, NOCRED); VOP_UNLOCK(vp); return (error); } static int linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct linux_file *filp; struct vnode *vp; int error; filp = fp->f_data; vp = filp->f_vnode; if (vp == NULL) { error = 0; kif->kf_type = KF_TYPE_DEV; } else { vref(vp); FILEDESC_SUNLOCK(fdp); error = vn_fill_kinfo_vnode(vp, kif); vrele(vp); kif->kf_type = KF_TYPE_VNODE; FILEDESC_SLOCK(fdp); } return (error); } unsigned int linux_iminor(struct inode *inode) { struct linux_cdev *ldev; if (inode == NULL || inode->v_rdev == NULL || inode->v_rdev->si_devsw != &linuxcdevsw) return (-1U); ldev = inode->v_rdev->si_drv1; if (ldev == NULL) return (-1U); return (minor(ldev->dev)); } static int linux_file_kcmp(struct file *fp1, struct file *fp2, struct thread *td) { struct linux_file *filp1, *filp2; if (fp2->f_type != DTYPE_DEV) return (3); filp1 = fp1->f_data; filp2 = fp2->f_data; return (kcmp_cmp((uintptr_t)filp1->f_cdev, (uintptr_t)filp2->f_cdev)); } -struct fileops linuxfileops = { +const struct fileops linuxfileops = { .fo_read = linux_file_read, .fo_write = linux_file_write, .fo_truncate = invfo_truncate, .fo_kqfilter = linux_file_kqfilter, .fo_stat = linux_file_stat, .fo_fill_kinfo = linux_file_fill_kinfo, .fo_poll = linux_file_poll, .fo_close = linux_file_close, .fo_ioctl = linux_file_ioctl, .fo_mmap = linux_file_mmap, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_cmp = linux_file_kcmp, .fo_flags = DFLAG_PASSABLE, }; /* * Hash of vmmap addresses. This is infrequently accessed and does not * need to be particularly large. This is done because we must store the * caller's idea of the map size to properly unmap. 
*/ struct vmmap { LIST_ENTRY(vmmap) vm_next; void *vm_addr; unsigned long vm_size; }; struct vmmaphd { struct vmmap *lh_first; }; #define VMMAP_HASH_SIZE 64 #define VMMAP_HASH_MASK (VMMAP_HASH_SIZE - 1) #define VM_HASH(addr) ((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE]; static struct mtx vmmaplock; static void vmmap_add(void *addr, unsigned long size) { struct vmmap *vmmap; vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL); mtx_lock(&vmmaplock); vmmap->vm_size = size; vmmap->vm_addr = addr; LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next); mtx_unlock(&vmmaplock); } static struct vmmap * vmmap_remove(void *addr) { struct vmmap *vmmap; mtx_lock(&vmmaplock); LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next) if (vmmap->vm_addr == addr) break; if (vmmap) LIST_REMOVE(vmmap, vm_next); mtx_unlock(&vmmaplock); return (vmmap); } #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv) void * _ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr) { void *addr; addr = pmap_mapdev_attr(phys_addr, size, attr); if (addr == NULL) return (NULL); vmmap_add(addr, size); return (addr); } #endif void iounmap(void *addr) { struct vmmap *vmmap; vmmap = vmmap_remove(addr); if (vmmap == NULL) return; #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv) pmap_unmapdev(addr, vmmap->vm_size); #endif kfree(vmmap); } void * vmap(struct page **pages, unsigned int count, unsigned long flags, int prot) { vm_offset_t off; size_t size; size = count * PAGE_SIZE; off = kva_alloc(size); if (off == 0) return (NULL); vmmap_add((void *)off, size); pmap_qenter(off, pages, count); return ((void *)off); } void vunmap(void *addr) { struct vmmap *vmmap; vmmap = vmmap_remove(addr); if (vmmap == NULL) return; pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE); kva_free((vm_offset_t)addr, vmmap->vm_size); kfree(vmmap); } static char * devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) { unsigned int len; char *p; va_list aq; va_copy(aq, ap); len = vsnprintf(NULL, 0, fmt, aq); va_end(aq); if (dev != NULL) p = devm_kmalloc(dev, len + 1, gfp); else p = kmalloc(len + 1, gfp); if (p != NULL) vsnprintf(p, len + 1, fmt, ap); return (p); } char * kvasprintf(gfp_t gfp, const char *fmt, va_list ap) { return (devm_kvasprintf(NULL, gfp, fmt, ap)); } char * lkpi_devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = devm_kvasprintf(dev, gfp, fmt, ap); va_end(ap); return (p); } char * kasprintf(gfp_t gfp, const char *fmt, ...) 
{ va_list ap; char *p; va_start(ap, fmt); p = kvasprintf(gfp, fmt, ap); va_end(ap); return (p); } static void linux_timer_callback_wrapper(void *context) { struct timer_list *timer; timer = context; /* the timer is about to be shutdown permanently */ if (timer->function == NULL) return; if (linux_set_current_flags(curthread, M_NOWAIT)) { /* try again later */ callout_reset(&timer->callout, 1, &linux_timer_callback_wrapper, timer); return; } timer->function(timer->data); } int mod_timer(struct timer_list *timer, int expires) { int ret; timer->expires = expires; ret = callout_reset(&timer->callout, linux_timer_jiffies_until(expires), &linux_timer_callback_wrapper, timer); MPASS(ret == 0 || ret == 1); return (ret == 1); } void add_timer(struct timer_list *timer) { callout_reset(&timer->callout, linux_timer_jiffies_until(timer->expires), &linux_timer_callback_wrapper, timer); } void add_timer_on(struct timer_list *timer, int cpu) { callout_reset_on(&timer->callout, linux_timer_jiffies_until(timer->expires), &linux_timer_callback_wrapper, timer, cpu); } int del_timer(struct timer_list *timer) { if (callout_stop(&(timer)->callout) == -1) return (0); return (1); } int del_timer_sync(struct timer_list *timer) { if (callout_drain(&(timer)->callout) == -1) return (0); return (1); } int timer_delete_sync(struct timer_list *timer) { return (del_timer_sync(timer)); } int timer_shutdown_sync(struct timer_list *timer) { timer->function = NULL; return (del_timer_sync(timer)); } /* greatest common divisor, Euclid equation */ static uint64_t lkpi_gcd_64(uint64_t a, uint64_t b) { uint64_t an; uint64_t bn; while (b != 0) { an = b; bn = a % b; a = an; b = bn; } return (a); } uint64_t lkpi_nsec2hz_rem; uint64_t lkpi_nsec2hz_div = 1000000000ULL; uint64_t lkpi_nsec2hz_max; uint64_t lkpi_usec2hz_rem; uint64_t lkpi_usec2hz_div = 1000000ULL; uint64_t lkpi_usec2hz_max; uint64_t lkpi_msec2hz_rem; uint64_t lkpi_msec2hz_div = 1000ULL; uint64_t lkpi_msec2hz_max; static void linux_timer_init(void *arg) { uint64_t gcd; /* * Compute an internal HZ value which can divide 2**32 to * avoid timer rounding problems when the tick value wraps * around 2**32: */ linux_timer_hz_mask = 1; while (linux_timer_hz_mask < (unsigned long)hz) linux_timer_hz_mask *= 2; linux_timer_hz_mask--; /* compute some internal constants */ lkpi_nsec2hz_rem = hz; lkpi_usec2hz_rem = hz; lkpi_msec2hz_rem = hz; gcd = lkpi_gcd_64(lkpi_nsec2hz_rem, lkpi_nsec2hz_div); lkpi_nsec2hz_rem /= gcd; lkpi_nsec2hz_div /= gcd; lkpi_nsec2hz_max = -1ULL / lkpi_nsec2hz_rem; gcd = lkpi_gcd_64(lkpi_usec2hz_rem, lkpi_usec2hz_div); lkpi_usec2hz_rem /= gcd; lkpi_usec2hz_div /= gcd; lkpi_usec2hz_max = -1ULL / lkpi_usec2hz_rem; gcd = lkpi_gcd_64(lkpi_msec2hz_rem, lkpi_msec2hz_div); lkpi_msec2hz_rem /= gcd; lkpi_msec2hz_div /= gcd; lkpi_msec2hz_max = -1ULL / lkpi_msec2hz_rem; } SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL); void linux_complete_common(struct completion *c, int all) { int wakeup_swapper; sleepq_lock(c); if (all) { c->done = UINT_MAX; wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0); } else { if (c->done != UINT_MAX) c->done++; wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0); } sleepq_release(c); if (wakeup_swapper) kick_proc0(); } /* * Indefinite wait for done != 0 with or without signals. 
*/ int linux_wait_for_common(struct completion *c, int flags) { struct task_struct *task; int error; if (SCHEDULER_STOPPED()) return (0); task = current; if (flags != 0) flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; else flags = SLEEPQ_SLEEP; error = 0; for (;;) { sleepq_lock(c); if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); if (flags & SLEEPQ_INTERRUPTIBLE) { DROP_GIANT(); error = -sleepq_wait_sig(c, 0); PICKUP_GIANT(); if (error != 0) { linux_schedule_save_interrupt_value(task, error); error = -ERESTARTSYS; goto intr; } } else { DROP_GIANT(); sleepq_wait(c, 0); PICKUP_GIANT(); } } if (c->done != UINT_MAX) c->done--; sleepq_release(c); intr: return (error); } /* * Time limited wait for done != 0 with or without signals. */ int linux_wait_for_timeout_common(struct completion *c, int timeout, int flags) { struct task_struct *task; int end = jiffies + timeout; int error; if (SCHEDULER_STOPPED()) return (0); task = current; if (flags != 0) flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP; else flags = SLEEPQ_SLEEP; for (;;) { sleepq_lock(c); if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); sleepq_set_timeout(c, linux_timer_jiffies_until(end)); DROP_GIANT(); if (flags & SLEEPQ_INTERRUPTIBLE) error = -sleepq_timedwait_sig(c, 0); else error = -sleepq_timedwait(c, 0); PICKUP_GIANT(); if (error != 0) { /* check for timeout */ if (error == -EWOULDBLOCK) { error = 0; /* timeout */ } else { /* signal happened */ linux_schedule_save_interrupt_value(task, error); error = -ERESTARTSYS; } goto done; } } if (c->done != UINT_MAX) c->done--; sleepq_release(c); /* return how many jiffies are left */ error = linux_timer_jiffies_until(end); done: return (error); } int linux_try_wait_for_completion(struct completion *c) { int isdone; sleepq_lock(c); isdone = (c->done != 0); if (c->done != 0 && c->done != UINT_MAX) c->done--; sleepq_release(c); return (isdone); } int linux_completion_done(struct completion *c) { int isdone; sleepq_lock(c); isdone = (c->done != 0); sleepq_release(c); return (isdone); } static void linux_cdev_deref(struct linux_cdev *ldev) { if (refcount_release(&ldev->refs) && ldev->kobj.ktype == &linux_cdev_ktype) kfree(ldev); } static void linux_cdev_release(struct kobject *kobj) { struct linux_cdev *cdev; struct kobject *parent; cdev = container_of(kobj, struct linux_cdev, kobj); parent = kobj->parent; linux_destroy_dev(cdev); linux_cdev_deref(cdev); kobject_put(parent); } static void linux_cdev_static_release(struct kobject *kobj) { struct cdev *cdev; struct linux_cdev *ldev; ldev = container_of(kobj, struct linux_cdev, kobj); cdev = ldev->cdev; if (cdev != NULL) { destroy_dev(cdev); ldev->cdev = NULL; } kobject_put(kobj->parent); } int linux_cdev_device_add(struct linux_cdev *ldev, struct device *dev) { int ret; if (dev->devt != 0) { /* Set parent kernel object. 
*/ ldev->kobj.parent = &dev->kobj; /* * Unlike Linux we require the kobject of the * character device structure to have a valid name * before calling this function: */ if (ldev->kobj.name == NULL) return (-EINVAL); ret = cdev_add(ldev, dev->devt, 1); if (ret) return (ret); } ret = device_add(dev); if (ret != 0 && dev->devt != 0) cdev_del(ldev); return (ret); } void linux_cdev_device_del(struct linux_cdev *ldev, struct device *dev) { device_del(dev); if (dev->devt != 0) cdev_del(ldev); } static void linux_destroy_dev(struct linux_cdev *ldev) { if (ldev->cdev == NULL) return; MPASS((ldev->siref & LDEV_SI_DTR) == 0); MPASS(ldev->kobj.ktype == &linux_cdev_ktype); atomic_set_int(&ldev->siref, LDEV_SI_DTR); while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0) pause("ldevdtr", hz / 4); destroy_dev(ldev->cdev); ldev->cdev = NULL; } const struct kobj_type linux_cdev_ktype = { .release = linux_cdev_release, }; const struct kobj_type linux_cdev_static_ktype = { .release = linux_cdev_static_release, }; static void linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; if (linkstate == LINK_STATE_UP) nb->notifier_call(nb, NETDEV_UP, &ni); else nb->notifier_call(nb, NETDEV_DOWN, &ni); } static void linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_REGISTER, &ni); } static void linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_UNREGISTER, &ni); } static void linux_handle_iflladdr_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_CHANGEADDR, &ni); } static void linux_handle_ifaddr_event(void *arg, struct ifnet *ifp) { struct notifier_block *nb; struct netdev_notifier_info ni; nb = arg; ni.ifp = ifp; ni.dev = (struct net_device *)ifp; nb->notifier_call(nb, NETDEV_CHANGEIFADDR, &ni); } int register_netdevice_notifier(struct notifier_block *nb) { nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER( ifnet_link_event, linux_handle_ifnet_link_event, nb, 0); nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER( ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0); nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER( ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0); nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER( iflladdr_event, linux_handle_iflladdr_event, nb, 0); return (0); } int register_inetaddr_notifier(struct notifier_block *nb) { nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER( ifaddr_event, linux_handle_ifaddr_event, nb, 0); return (0); } int unregister_netdevice_notifier(struct notifier_block *nb) { EVENTHANDLER_DEREGISTER(ifnet_link_event, nb->tags[NETDEV_UP]); EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nb->tags[NETDEV_REGISTER]); EVENTHANDLER_DEREGISTER(ifnet_departure_event, nb->tags[NETDEV_UNREGISTER]); EVENTHANDLER_DEREGISTER(iflladdr_event, nb->tags[NETDEV_CHANGEADDR]); return (0); } int unregister_inetaddr_notifier(struct notifier_block *nb) { EVENTHANDLER_DEREGISTER(ifaddr_event, nb->tags[NETDEV_CHANGEIFADDR]); return (0); } struct list_sort_thunk { int (*cmp)(void *, 
struct list_head *, struct list_head *); void *priv; }; static inline int linux_le_cmp(const void *d1, const void *d2, void *priv) { struct list_head *le1, *le2; struct list_sort_thunk *thunk; thunk = priv; le1 = *(__DECONST(struct list_head **, d1)); le2 = *(__DECONST(struct list_head **, d2)); return ((thunk->cmp)(thunk->priv, le1, le2)); } void list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, struct list_head *a, struct list_head *b)) { struct list_sort_thunk thunk; struct list_head **ar, *le; size_t count, i; count = 0; list_for_each(le, head) count++; ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK); i = 0; list_for_each(le, head) ar[i++] = le; thunk.cmp = cmp; thunk.priv = priv; qsort_r(ar, count, sizeof(struct list_head *), linux_le_cmp, &thunk); INIT_LIST_HEAD(head); for (i = 0; i < count; i++) list_add_tail(ar[i], head); free(ar, M_KMALLOC); } #if defined(__i386__) || defined(__amd64__) int linux_wbinvd_on_all_cpus(void) { pmap_invalidate_cache(); return (0); } #endif int linux_on_each_cpu(void callback(void *), void *data) { smp_rendezvous(smp_no_rendezvous_barrier, callback, smp_no_rendezvous_barrier, data); return (0); } int linux_in_atomic(void) { return ((curthread->td_pflags & TDP_NOFAULTING) != 0); } struct linux_cdev * linux_find_cdev(const char *name, unsigned major, unsigned minor) { dev_t dev = MKDEV(major, minor); struct cdev *cdev; dev_lock(); LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) { struct linux_cdev *ldev = cdev->si_drv1; if (ldev->dev == dev && strcmp(kobject_name(&ldev->kobj), name) == 0) { break; } } dev_unlock(); return (cdev != NULL ? cdev->si_drv1 : NULL); } int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct linux_cdev *cdev; int ret = 0; int i; for (i = baseminor; i < baseminor + count; i++) { cdev = cdev_alloc(); cdev->ops = fops; kobject_set_name(&cdev->kobj, name); ret = cdev_add(cdev, makedev(major, i), 1); if (ret != 0) break; } return (ret); } int __register_chrdev_p(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops, uid_t uid, gid_t gid, int mode) { struct linux_cdev *cdev; int ret = 0; int i; for (i = baseminor; i < baseminor + count; i++) { cdev = cdev_alloc(); cdev->ops = fops; kobject_set_name(&cdev->kobj, name); ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode); if (ret != 0) break; } return (ret); } void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct linux_cdev *cdevp; int i; for (i = baseminor; i < baseminor + count; i++) { cdevp = linux_find_cdev(name, major, i); if (cdevp != NULL) cdev_del(cdevp); } } void linux_dump_stack(void) { #ifdef STACK struct stack st; stack_save(&st); stack_print(&st); #endif } int linuxkpi_net_ratelimit(void) { return (ppsratecheck(&lkpi_net_lastlog, &lkpi_net_curpps, lkpi_net_maxpps)); } struct io_mapping * io_mapping_create_wc(resource_size_t base, unsigned long size) { struct io_mapping *mapping; mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); if (mapping == NULL) return (NULL); return (io_mapping_init_wc(mapping, base, size)); } /* We likely want a linuxkpi_device.c at some point. */ bool device_can_wakeup(struct device *dev) { if (dev == NULL) return (false); /* * XXX-BZ iwlwifi queries it as part of enabling WoWLAN. * Normally this would be based on a bool in dev->power.XXX. * Check such as PCI PCIM_PCAP_*PME. 
We have no way to enable this yet. * We may get away by directly calling into bsddev for as long as * we can assume PCI only avoiding changing struct device breaking KBI. */ pr_debug("%s:%d: not enabled; see comment.\n", __func__, __LINE__); return (false); } static void devm_device_group_remove(struct device *dev, void *p) { const struct attribute_group **dr = p; const struct attribute_group *group = *dr; sysfs_remove_group(&dev->kobj, group); } int lkpi_devm_device_add_group(struct device *dev, const struct attribute_group *group) { const struct attribute_group **dr; int ret; dr = devres_alloc(devm_device_group_remove, sizeof(*dr), GFP_KERNEL); if (dr == NULL) return (-ENOMEM); ret = sysfs_create_group(&dev->kobj, group); if (ret == 0) { *dr = group; devres_add(dev, dr); } else devres_free(dr); return (ret); } #if defined(__i386__) || defined(__amd64__) bool linux_cpu_has_clflush; struct cpuinfo_x86 boot_cpu_data; struct cpuinfo_x86 *__cpu_data; #endif cpumask_t * lkpi_get_static_single_cpu_mask(int cpuid) { KASSERT((cpuid >= 0 && cpuid <= mp_maxid), ("%s: invalid cpuid %d\n", __func__, cpuid)); KASSERT(!CPU_ABSENT(cpuid), ("%s: cpu with cpuid %d is absent\n", __func__, cpuid)); return (static_single_cpu_mask[cpuid]); } bool lkpi_xen_initial_domain(void) { #ifdef XENHVM return (xen_initial_domain()); #else return (false); #endif } bool lkpi_xen_pv_domain(void) { #ifdef XENHVM return (xen_pv_domain()); #else return (false); #endif } static void linux_compat_init(void *arg) { struct sysctl_oid *rootoid; int i; #if defined(__i386__) || defined(__amd64__) static const uint32_t x86_vendors[X86_VENDOR_NUM] = { [X86_VENDOR_INTEL] = CPU_VENDOR_INTEL, [X86_VENDOR_CYRIX] = CPU_VENDOR_CYRIX, [X86_VENDOR_AMD] = CPU_VENDOR_AMD, [X86_VENDOR_UMC] = CPU_VENDOR_UMC, [X86_VENDOR_CENTAUR] = CPU_VENDOR_CENTAUR, [X86_VENDOR_TRANSMETA] = CPU_VENDOR_TRANSMETA, [X86_VENDOR_NSC] = CPU_VENDOR_NSC, [X86_VENDOR_HYGON] = CPU_VENDOR_HYGON, }; uint8_t x86_vendor = X86_VENDOR_UNKNOWN; for (i = 0; i < X86_VENDOR_NUM; i++) { if (cpu_vendor_id != 0 && cpu_vendor_id == x86_vendors[i]) { x86_vendor = i; break; } } linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH); boot_cpu_data.x86_clflush_size = cpu_clflush_line_size; boot_cpu_data.x86_max_cores = mp_ncpus; boot_cpu_data.x86 = CPUID_TO_FAMILY(cpu_id); boot_cpu_data.x86_model = CPUID_TO_MODEL(cpu_id); boot_cpu_data.x86_vendor = x86_vendor; __cpu_data = mallocarray(mp_maxid + 1, sizeof(*__cpu_data), M_KMALLOC, M_WAITOK | M_ZERO); CPU_FOREACH(i) { __cpu_data[i].x86_clflush_size = cpu_clflush_line_size; __cpu_data[i].x86_max_cores = mp_ncpus; __cpu_data[i].x86 = CPUID_TO_FAMILY(cpu_id); __cpu_data[i].x86_model = CPUID_TO_MODEL(cpu_id); __cpu_data[i].x86_vendor = x86_vendor; } #endif rw_init(&linux_vma_lock, "lkpi-vma-lock"); rootoid = SYSCTL_ADD_ROOT_NODE(NULL, OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys"); kobject_init(&linux_class_root, &linux_class_ktype); kobject_set_name(&linux_class_root, "class"); linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class"); kobject_init(&linux_root_device.kobj, &linux_dev_ktype); kobject_set_name(&linux_root_device.kobj, "device"); linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "device"); linux_root_device.bsddev = root_bus; linux_class_misc.name = "misc"; class_register(&linux_class_misc); INIT_LIST_HEAD(&pci_drivers); INIT_LIST_HEAD(&pci_devices); 
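	/*
	 * Editorial note, not part of this change: rough memory math for the
	 * single-CPU mask setup further below, assuming 64-bit words
	 * (_BITSET_BITS == 64) and CPU_SETSIZE == 256, i.e.
	 * __bitset_words(CPU_SETSIZE) == 4 (the CPU_SETSIZE value is an
	 * assumption for illustration):
	 *  - plain scheme: one cpuset_t per CPU, 4 * 8 = 32 bytes each,
	 *    so 256 CPUs would need 8192 bytes;
	 *  - overlapping scheme: 64 spans of (2 * 4 - 1) * 8 = 56 bytes,
	 *    3584 bytes total regardless of the CPU count.
	 * The crossover coded below is mp_ncpus >= 2 * _BITSET_BITS.
	 */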
spin_lock_init(&pci_lock); mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF); for (i = 0; i < VMMAP_HASH_SIZE; i++) LIST_INIT(&vmmaphead[i]); init_waitqueue_head(&linux_bit_waitq); init_waitqueue_head(&linux_var_waitq); CPU_COPY(&all_cpus, &cpu_online_mask); /* * Generate a single-CPU cpumask_t for each CPU (possibly) in the system. * CPUs are indexed from 0..(mp_maxid). The entry for cpuid 0 will only * have itself in the cpumask, cpuid 1 only itself on entry 1, and so on. * This is used by cpumask_of() (and possibly others in the future) for, * e.g., drivers to pass hints to irq_set_affinity_hint(). */ static_single_cpu_mask = mallocarray(mp_maxid + 1, sizeof(static_single_cpu_mask), M_KMALLOC, M_WAITOK | M_ZERO); /* * When the number of CPUs reaches a threshold, we start to save memory * given the sets are static by overlapping those having their single * bit set at the same position in a bitset word. Asymptotically, this * regular scheme is in O(n²) whereas the overlapping one is in O(n) * only with n being the maximum number of CPUs, so the gain will become * huge quite quickly. The threshold for 64-bit architectures is 128 * CPUs. */ if (mp_ncpus < (2 * _BITSET_BITS)) { cpumask_t *sscm_ptr; /* * This represents 'mp_ncpus * __bitset_words(CPU_SETSIZE) * * (_BITSET_BITS / 8)' bytes (for comparison with the * overlapping scheme). */ static_single_cpu_mask_lcs = mallocarray(mp_ncpus, sizeof(*static_single_cpu_mask_lcs), M_KMALLOC, M_WAITOK | M_ZERO); sscm_ptr = static_single_cpu_mask_lcs; CPU_FOREACH(i) { static_single_cpu_mask[i] = sscm_ptr++; CPU_SET(i, static_single_cpu_mask[i]); } } else { /* Pointer to a bitset word. */ __typeof(((cpuset_t *)NULL)->__bits[0]) *bwp; /* * Allocate memory for (static) spans of 'cpumask_t' ('cpuset_t' * really) with a single bit set that can be reused for all * single CPU masks by making them start at different offsets. * We need '__bitset_words(CPU_SETSIZE) - 1' bitset words before * the word having its single bit set, and the same amount * after. */ static_single_cpu_mask_lcs = mallocarray(_BITSET_BITS, (2 * __bitset_words(CPU_SETSIZE) - 1) * (_BITSET_BITS / 8), M_KMALLOC, M_WAITOK | M_ZERO); /* * We rely below on cpuset_t and the bitset generic * implementation assigning words in the '__bits' array in the * same order of bits (i.e., little-endian ordering, not to be * confused with machine endianness, which concerns bits in * words and other integers). This is an imperfect test, but it * will detect a change to big-endian ordering. */ _Static_assert( __bitset_word(_BITSET_BITS + 1, _BITSET_BITS) == 1, "Assumes a bitset implementation that is little-endian " "on its words"); /* Initialize the single bit of each static span. */ bwp = (__typeof(bwp))static_single_cpu_mask_lcs + (__bitset_words(CPU_SETSIZE) - 1); for (i = 0; i < _BITSET_BITS; i++) { CPU_SET(i, (cpuset_t *)bwp); bwp += (2 * __bitset_words(CPU_SETSIZE) - 1); } /* * Finally set all CPU masks to the proper word in their * relevant span. */ CPU_FOREACH(i) { bwp = (__typeof(bwp))static_single_cpu_mask_lcs; /* Find the non-zero word of the relevant span. */ bwp += (2 * __bitset_words(CPU_SETSIZE) - 1) * (i % _BITSET_BITS) + __bitset_words(CPU_SETSIZE) - 1; /* Shift to find the CPU mask start. 
*/ bwp -= (i / _BITSET_BITS); static_single_cpu_mask[i] = (cpuset_t *)bwp; } } strlcpy(init_uts_ns.name.release, osrelease, sizeof(init_uts_ns.name.release)); } SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL); static void linux_compat_uninit(void *arg) { linux_kobject_kfree_name(&linux_class_root); linux_kobject_kfree_name(&linux_root_device.kobj); linux_kobject_kfree_name(&linux_class_misc.kobj); free(static_single_cpu_mask_lcs, M_KMALLOC); free(static_single_cpu_mask, M_KMALLOC); #if defined(__i386__) || defined(__amd64__) free(__cpu_data, M_KMALLOC); #endif mtx_destroy(&vmmaplock); spin_lock_destroy(&pci_lock); rw_destroy(&linux_vma_lock); } SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL); /* * NOTE: Linux frequently uses "unsigned long" for pointer to integer * conversion and vice versa, where in FreeBSD "uintptr_t" would be * used. Assert these types have the same size, else some parts of the * LinuxKPI may not work like expected: */ CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t)); diff --git a/sys/dev/beri/beri_ring.c b/sys/dev/beri/beri_ring.c index 6d48a411da38..eed002a3dfd6 100644 --- a/sys/dev/beri/beri_ring.c +++ b/sys/dev/beri/beri_ring.c @@ -1,525 +1,525 @@ /*- * Copyright (c) 2014 Ruslan Bukin * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * SRI-Cambridge BERI soft processor <-> ARM core ring buffer. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define READ4(_sc, _reg) \ bus_read_4((_sc)->res[0], _reg) #define WRITE4(_sc, _reg, _val) \ bus_write_4((_sc)->res[0], _reg, _val) #define CDES_INT_EN (1 << 15) #define CDES_CAUSE_MASK 0x3 #define CDES_CAUSE_SHIFT 13 #define DEVNAME_MAXLEN 256 typedef struct { uint16_t cdes; uint16_t interrupt_level; uint16_t in; uint16_t out; } control_reg_t; struct beri_softc { struct resource *res[3]; bus_space_tag_t bst; bus_space_handle_t bsh; struct cdev *cdev; device_t dev; void *read_ih; void *write_ih; struct selinfo beri_rsel; struct mtx beri_mtx; int opened; char devname[DEVNAME_MAXLEN]; int control_read; int control_write; int data_read; int data_write; int data_size; }; static struct resource_spec beri_spec[] = { { SYS_RES_MEMORY, 0, RF_ACTIVE }, { SYS_RES_IRQ, 0, RF_ACTIVE }, { SYS_RES_IRQ, 1, RF_ACTIVE }, { -1, 0 } }; static control_reg_t get_control_reg(struct beri_softc *sc, int dir) { uint32_t offset; uint16_t dst[4]; control_reg_t c; uint16_t *cp; int i; cp = (uint16_t *)&c; offset = dir ? sc->control_write : sc->control_read; ((uint32_t *)dst)[0] = READ4(sc, offset); ((uint32_t *)dst)[1] = READ4(sc, offset + 4); for (i = 0; i < 4; i++) cp[i] = dst[3 - i]; return (c); } static void set_control_reg(struct beri_softc *sc, int dir, control_reg_t *c) { uint32_t offset; uint16_t src[4]; uint16_t *cp; int i; cp = (uint16_t *)c; for (i = 0; i < 4; i++) src[3 - i] = cp[i]; offset = dir ? sc->control_write : sc->control_read; WRITE4(sc, offset + 0, ((uint32_t *)src)[0]); WRITE4(sc, offset + 4, ((uint32_t *)src)[1]); } static int get_stock(struct beri_softc *sc, int dir, control_reg_t *c) { uint32_t fill; fill = (c->in - c->out + sc->data_size) % sc->data_size; if (dir) return (sc->data_size - fill - 1); else return (fill); } static void beri_intr_write(void *arg) { struct beri_softc *sc; control_reg_t c; sc = arg; c = get_control_reg(sc, 1); if (c.cdes & CDES_INT_EN) { c.cdes &= ~(CDES_INT_EN); set_control_reg(sc, 1, &c); } mtx_lock(&sc->beri_mtx); selwakeuppri(&sc->beri_rsel, PZERO + 1); KNOTE_LOCKED(&sc->beri_rsel.si_note, 0); mtx_unlock(&sc->beri_mtx); } static void beri_intr_read(void *arg) { struct beri_softc *sc; control_reg_t c; sc = arg; c = get_control_reg(sc, 0); if (c.cdes & CDES_INT_EN) { c.cdes &= ~(CDES_INT_EN); set_control_reg(sc, 0, &c); } mtx_lock(&sc->beri_mtx); selwakeuppri(&sc->beri_rsel, PZERO + 1); KNOTE_LOCKED(&sc->beri_rsel.si_note, 0); mtx_unlock(&sc->beri_mtx); } static int beri_open(struct cdev *dev, int flags __unused, int fmt __unused, struct thread *td __unused) { struct beri_softc *sc; control_reg_t c; sc = dev->si_drv1; if (sc->opened) return (1); /* Setup interrupt handlers */ if (bus_setup_intr(sc->dev, sc->res[1], INTR_TYPE_BIO | INTR_MPSAFE, NULL, beri_intr_read, sc, &sc->read_ih)) { device_printf(sc->dev, "Unable to setup read intr\n"); return (1); } if (bus_setup_intr(sc->dev, sc->res[2], INTR_TYPE_BIO | INTR_MPSAFE, NULL, beri_intr_write, sc, &sc->write_ih)) { device_printf(sc->dev, "Unable to setup write intr\n"); return (1); } sc->opened = 1; /* Clear write buffer */ c = get_control_reg(sc, 1); c.in = c.out; c.cdes = 0; set_control_reg(sc, 1, &c); /* Clear read buffer */ c = get_control_reg(sc, 0); c.out = c.in; c.cdes = 0; set_control_reg(sc, 0, &c); return (0); } static int beri_close(struct cdev *dev, int flags __unused, int fmt 
__unused, struct thread *td __unused) { struct beri_softc *sc; sc = dev->si_drv1; if (sc->opened) { sc->opened = 0; /* Unsetup interrupt handlers */ bus_teardown_intr(sc->dev, sc->res[1], sc->read_ih); bus_teardown_intr(sc->dev, sc->res[2], sc->write_ih); } return (0); } static int beri_rdwr(struct cdev *dev, struct uio *uio, int ioflag) { struct beri_softc *sc; uint32_t offset; control_reg_t c; uint16_t *ptr; uint8_t *dst; int stock; int dir; int amount; int count; sc = dev->si_drv1; dir = uio->uio_rw ? 1 : 0; c = get_control_reg(sc, dir); stock = get_stock(sc, dir, &c); if (stock < uio->uio_resid) { device_printf(sc->dev, "Err: no data/space available\n"); return (1); } amount = uio->uio_resid; ptr = dir ? &c.in : &c.out; count = (sc->data_size - *ptr); offset = dir ? sc->data_write : sc->data_read; dst = (uint8_t *)(sc->bsh + offset); if (amount <= count) { uiomove(dst + *ptr, amount, uio); } else { uiomove(dst + *ptr, count, uio); uiomove(dst, (amount - count), uio); } *ptr = (*ptr + amount) % sc->data_size; set_control_reg(sc, dir, &c); return (0); } static int beri_kqread(struct knote *kn, long hint) { struct beri_softc *sc; control_reg_t c; int stock; sc = kn->kn_hook; c = get_control_reg(sc, 0); stock = get_stock(sc, 0, &c); if (stock) { kn->kn_data = stock; return (1); } kn->kn_data = 0; /* Wait at least one new byte in buffer */ c.interrupt_level = 1; /* Enable interrupts */ c.cdes |= (CDES_INT_EN); set_control_reg(sc, 0, &c); return (0); } static int beri_kqwrite(struct knote *kn, long hint) { struct beri_softc *sc; control_reg_t c; int stock; sc = kn->kn_hook; c = get_control_reg(sc, 1); stock = get_stock(sc, 1, &c); if (stock) { kn->kn_data = stock; return (1); } kn->kn_data = 0; /* Wait at least one free position in buffer */ c.interrupt_level = sc->data_size - 2; /* Enable interrupts */ c.cdes |= (CDES_INT_EN); set_control_reg(sc, 1, &c); return (0); } static void beri_kqdetach(struct knote *kn) { struct beri_softc *sc; sc = kn->kn_hook; knlist_remove(&sc->beri_rsel.si_note, kn, 0); } -static struct filterops beri_read_filterops = { +static const struct filterops beri_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = beri_kqdetach, .f_event = beri_kqread, }; -static struct filterops beri_write_filterops = { +static const struct filterops beri_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = beri_kqdetach, .f_event = beri_kqwrite, }; static int beri_kqfilter(struct cdev *dev, struct knote *kn) { struct beri_softc *sc; sc = dev->si_drv1; switch(kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &beri_read_filterops; break; case EVFILT_WRITE: kn->kn_fop = &beri_write_filterops; break; default: return(EINVAL); } kn->kn_hook = sc; knlist_add(&sc->beri_rsel.si_note, kn, 0); return (0); } static struct cdevsw beri_cdevsw = { .d_version = D_VERSION, .d_open = beri_open, .d_close = beri_close, .d_write = beri_rdwr, .d_read = beri_rdwr, .d_kqfilter = beri_kqfilter, .d_name = "beri ring buffer", }; static int parse_fdt(struct beri_softc *sc) { pcell_t dts_value[0]; phandle_t node; int len; if ((node = ofw_bus_get_node(sc->dev)) == -1) return (ENXIO); /* get device name */ if (OF_getprop(ofw_bus_get_node(sc->dev), "device_name", &sc->devname, sizeof(sc->devname)) <= 0) { device_printf(sc->dev, "Can't get device_name\n"); return (ENXIO); } if ((len = OF_getproplen(node, "data_size")) <= 0) return (ENXIO); OF_getencprop(node, "data_size", dts_value, len); sc->data_size = dts_value[0]; if ((len = OF_getproplen(node, "data_read")) <= 0) return (ENXIO); 
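The ring accounting used by get_stock() and beri_rdwr() above boils down to a couple of lines of modular arithmetic; a minimal standalone sketch (hypothetical buffer size, illustrative only):

#include <assert.h>
#include <stdio.h>

int
main(void)
{
        const unsigned int size = 8;    /* made-up ring size */
        unsigned int in = 6, out = 3;   /* example indices */
        unsigned int fill, space;

        /* Bytes available to the reader. */
        fill = (in - out + size) % size;
        /* Bytes the writer may add; one slot stays free. */
        space = size - fill - 1;
        assert(fill == 3 && space == 4);
        printf("fill=%u space=%u\n", fill, space);
        return (0);
}

Keeping one slot permanently unused is what lets the driver treat "in == out" as unambiguously empty without a separate element counter.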
OF_getencprop(node, "data_read", dts_value, len); sc->data_read = dts_value[0]; if ((len = OF_getproplen(node, "data_write")) <= 0) return (ENXIO); OF_getencprop(node, "data_write", dts_value, len); sc->data_write = dts_value[0]; if ((len = OF_getproplen(node, "control_read")) <= 0) return (ENXIO); OF_getencprop(node, "control_read", dts_value, len); sc->control_read = dts_value[0]; if ((len = OF_getproplen(node, "control_write")) <= 0) return (ENXIO); OF_getencprop(node, "control_write", dts_value, len); sc->control_write = dts_value[0]; return (0); } static int beri_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (!ofw_bus_is_compatible(dev, "sri-cambridge,beri-ring")) return (ENXIO); device_set_desc(dev, "SRI-Cambridge BERI ring buffer"); return (BUS_PROBE_DEFAULT); } static int beri_attach(device_t dev) { struct beri_softc *sc; sc = device_get_softc(dev); sc->dev = dev; if (bus_alloc_resources(dev, beri_spec, sc->res)) { device_printf(dev, "could not allocate resources\n"); return (ENXIO); } /* Memory interface */ sc->bst = rman_get_bustag(sc->res[0]); sc->bsh = rman_get_bushandle(sc->res[0]); if (parse_fdt(sc)) { device_printf(sc->dev, "Can't get FDT values\n"); return (ENXIO); } sc->cdev = make_dev(&beri_cdevsw, 0, UID_ROOT, GID_WHEEL, S_IRWXU, "%s", sc->devname); if (sc->cdev == NULL) { device_printf(dev, "Failed to create character device.\n"); return (ENXIO); } sc->cdev->si_drv1 = sc; mtx_init(&sc->beri_mtx, "beri_mtx", NULL, MTX_DEF); knlist_init_mtx(&sc->beri_rsel.si_note, &sc->beri_mtx); return (0); } static device_method_t beri_methods[] = { DEVMETHOD(device_probe, beri_probe), DEVMETHOD(device_attach, beri_attach), { 0, 0 } }; static driver_t beri_driver = { "beri_ring", beri_methods, sizeof(struct beri_softc), }; DRIVER_MODULE(beri_ring, simplebus, beri_driver, 0, 0); diff --git a/sys/dev/cyapa/cyapa.c b/sys/dev/cyapa/cyapa.c index 307cd4d35b2e..50fa4faa560a 100644 --- a/sys/dev/cyapa/cyapa.c +++ b/sys/dev/cyapa/cyapa.c @@ -1,1818 +1,1818 @@ /* * Copyright (c) 2014 The DragonFly Project. All rights reserved. * * This code is derived from software contributed to The DragonFly Project * by Matthew Dillon and was subsequently ported, * modified and enhanced for FreeBSD by Michael Gmelin . * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * 3. Neither the name of The DragonFly Project nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific, prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include /* * CYAPA - Cypress APA trackpad with I2C Interface driver * * Based on DragonFlyBSD's cyapa driver, which referenced the linux * cyapa.c driver to figure out the bootstrapping and commands. * * Unable to locate any datasheet for the device. * * * Trackpad layout: * * 2/3 1/3 * +--------------------+------------+ * | | Middle | * | | Button | * | Left | | * | Button +------------+ * | | Right | * | | Button | * +--------------------+............| * | Thumb/Button Area | 15% * +---------------------------------+ * * * FEATURES * * IMPS/2 emulation - Emulates the IntelliMouse protocol. * * Jitter supression - Implements 2-pixel hysteresis with memory. * * Jump detecion - Detect jumps caused by touchpad. * * Two finger scrolling - Use two fingers for Z axis scrolling. * * Button down/2nd finger - While one finger clicks and holds down the * touchpad, the second one can be used to move * the mouse cursor. Useful for drawing or * selecting text. * * Thumb/Button Area - The lower 15%* of the trackpad will not affect * the mouse cursor position. This allows for high * precision clicking, by controlling the cursor * with the index finger and pushing/holding the * pad down with the thumb. * * can be changed using sysctl * * Track-pad button - Push physical button. Left 2/3rds of the pad * will issue a LEFT button event, upper right * corner will issue a MIDDLE button event, * lower right corner will issue a RIGHT button * event. Optional tap support can be enabled * and configured using sysctl. * * WARNINGS * * These trackpads get confused when three or more fingers are down on the * same horizontal axis and will start to glitch the finger detection. * Removing your hand for a few seconds will allow the trackpad to * recalibrate. Generally speaking, when using three or more fingers * please try to place at least one finger off-axis (a little above or * below) the other two. 
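The button-zone split sketched in the layout above can be made concrete with a tiny standalone example (hypothetical resolution, illustrative only; it mirrors the single-finger click case handled later in cyapa_raw_input(), and does not model the thumb area):

#include <assert.h>

enum zone { ZONE_LEFT, ZONE_MIDDLE, ZONE_RIGHT };

/* Left 2/3rds -> left button, upper right -> middle, lower right -> right. */
static enum zone
click_zone(int x, int y, int resx, int resy)
{
        if (x < resx * 2 / 3)
                return (ZONE_LEFT);
        else if (y < resy / 2)
                return (ZONE_MIDDLE);
        else
                return (ZONE_RIGHT);
}

int
main(void)
{
        const int resx = 1200, resy = 600;      /* made-up resolution */

        assert(click_zone(100, 300, resx, resy) == ZONE_LEFT);
        assert(click_zone(1000, 100, resx, resy) == ZONE_MIDDLE);
        assert(click_zone(1000, 500, resx, resy) == ZONE_RIGHT);
        return (0);
}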
*/ #include "opt_evdev.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef EVDEV_SUPPORT #include #include #endif #include "iicbus_if.h" #include "bus_if.h" #include "device_if.h" #define CYAPA_BUFSIZE 128 /* power of 2 */ #define CYAPA_BUFMASK (CYAPA_BUFSIZE - 1) #define ZSCALE 15 #define TIME_TO_IDLE (hz * 10) #define TIME_TO_RESET (hz * 3) static MALLOC_DEFINE(M_CYAPA, "cyapa", "CYAPA device data"); struct cyapa_fifo { int rindex; int windex; char buf[CYAPA_BUFSIZE]; }; struct cyapa_softc { device_t dev; int count; /* >0 if device opened */ struct cdev *devnode; struct selinfo selinfo; struct mtx mutex; struct intr_config_hook intr_hook; #ifdef EVDEV_SUPPORT struct evdev_dev *evdev; #endif int cap_resx; int cap_resy; int cap_phyx; int cap_phyy; uint8_t cap_buttons; int detaching; /* driver is detaching */ int poll_thread_running; /* poll thread is running */ /* PS/2 mouse emulation */ int track_x; /* current tracking */ int track_y; int track_z; int track_z_ticks; uint16_t track_but; char track_id; /* first finger id */ int track_nfingers; int delta_x; /* accumulation -> report */ int delta_y; int delta_z; int fuzz_x; int fuzz_y; int fuzz_z; int touch_x; /* touch down coordinates */ int touch_y; int touch_z; int finger1_ticks; int finger2_ticks; int finger3_ticks; uint16_t reported_but; struct cyapa_fifo rfifo; /* device->host */ struct cyapa_fifo wfifo; /* host->device */ uint8_t ps2_cmd; /* active p2_cmd waiting for data */ uint8_t ps2_acked; int active_tick; int data_signal; int blocked; int isselect; int reporting_mode; /* 0=disabled 1=enabled */ int scaling_mode; /* 0=1:1 1=2:1 */ int remote_mode; /* 0 for streaming mode */ int zenabled; /* z-axis enabled (mode 1 or 2) */ mousehw_t hw; /* hardware information */ mousemode_t mode; /* mode */ int poll_ticks; }; struct cyapa_cdevpriv { struct cyapa_softc *sc; }; #define CYPOLL_SHUTDOWN 0x0001 static void cyapa_poll_thread(void *arg); static int cyapa_raw_input(struct cyapa_softc *sc, struct cyapa_regs *regs, int freq); static void cyapa_set_power_mode(struct cyapa_softc *sc, int mode); static int fifo_empty(struct cyapa_softc *sc, struct cyapa_fifo *fifo); static size_t fifo_ready(struct cyapa_softc *sc, struct cyapa_fifo *fifo); static char *fifo_read(struct cyapa_softc *sc, struct cyapa_fifo *fifo, size_t n); static char *fifo_write(struct cyapa_softc *sc, struct cyapa_fifo *fifo, size_t n); static uint8_t fifo_read_char(struct cyapa_softc *sc, struct cyapa_fifo *fifo); static void fifo_write_char(struct cyapa_softc *sc, struct cyapa_fifo *fifo, uint8_t c); static size_t fifo_space(struct cyapa_softc *sc, struct cyapa_fifo *fifo); static void fifo_reset(struct cyapa_softc *sc, struct cyapa_fifo *fifo); static int cyapa_fuzz(int delta, int *fuzz); static int cyapa_idle_freq = 1; SYSCTL_INT(_debug, OID_AUTO, cyapa_idle_freq, CTLFLAG_RW, &cyapa_idle_freq, 0, "Scan frequency in idle mode"); static int cyapa_slow_freq = 20; SYSCTL_INT(_debug, OID_AUTO, cyapa_slow_freq, CTLFLAG_RW, &cyapa_slow_freq, 0, "Scan frequency in slow mode "); static int cyapa_norm_freq = 100; SYSCTL_INT(_debug, OID_AUTO, cyapa_norm_freq, CTLFLAG_RW, &cyapa_norm_freq, 0, "Normal scan frequency"); static int cyapa_minpressure = 12; SYSCTL_INT(_debug, OID_AUTO, cyapa_minpressure, CTLFLAG_RW, &cyapa_minpressure, 0, "Minimum pressure to detect finger"); static int cyapa_enable_tapclick = 
0; SYSCTL_INT(_debug, OID_AUTO, cyapa_enable_tapclick, CTLFLAG_RW, &cyapa_enable_tapclick, 0, "Enable tap to click"); static int cyapa_tapclick_min_ticks = 1; SYSCTL_INT(_debug, OID_AUTO, cyapa_tapclick_min_ticks, CTLFLAG_RW, &cyapa_tapclick_min_ticks, 0, "Minimum tap duration for click"); static int cyapa_tapclick_max_ticks = 8; SYSCTL_INT(_debug, OID_AUTO, cyapa_tapclick_max_ticks, CTLFLAG_RW, &cyapa_tapclick_max_ticks, 0, "Maximum tap duration for click"); static int cyapa_move_min_ticks = 4; SYSCTL_INT(_debug, OID_AUTO, cyapa_move_min_ticks, CTLFLAG_RW, &cyapa_move_min_ticks, 0, "Minimum ticks before cursor position is changed"); static int cyapa_scroll_wait_ticks = 0; SYSCTL_INT(_debug, OID_AUTO, cyapa_scroll_wait_ticks, CTLFLAG_RW, &cyapa_scroll_wait_ticks, 0, "Wait N ticks before starting to scroll"); static int cyapa_scroll_stick_ticks = 15; SYSCTL_INT(_debug, OID_AUTO, cyapa_scroll_stick_ticks, CTLFLAG_RW, &cyapa_scroll_stick_ticks, 0, "Prevent cursor move on single finger for N ticks after scroll"); static int cyapa_thumbarea_percent = 15; SYSCTL_INT(_debug, OID_AUTO, cyapa_thumbarea_percent, CTLFLAG_RW, &cyapa_thumbarea_percent, 0, "Size of bottom thumb area in percent"); static int cyapa_debug = 0; SYSCTL_INT(_debug, OID_AUTO, cyapa_debug, CTLFLAG_RW, &cyapa_debug, 0, "Enable debugging"); static int cyapa_reset = 0; SYSCTL_INT(_debug, OID_AUTO, cyapa_reset, CTLFLAG_RW, &cyapa_reset, 0, "Reset track pad"); static int cyapa_read_bytes(device_t dev, uint8_t reg, uint8_t *val, int cnt) { uint16_t addr = iicbus_get_addr(dev); struct iic_msg msgs[] = { { addr, IIC_M_WR | IIC_M_NOSTOP, 1, ® }, { addr, IIC_M_RD, cnt, val }, }; return (iicbus_transfer(dev, msgs, nitems(msgs))); } static int cyapa_write_bytes(device_t dev, uint8_t reg, const uint8_t *val, int cnt) { uint16_t addr = iicbus_get_addr(dev); struct iic_msg msgs[] = { { addr, IIC_M_WR | IIC_M_NOSTOP, 1, ® }, { addr, IIC_M_WR | IIC_M_NOSTART, cnt, __DECONST(uint8_t *, val) }, }; return (iicbus_transfer(dev, msgs, nitems(msgs))); } static void cyapa_lock(struct cyapa_softc *sc) { mtx_lock(&sc->mutex); } static void cyapa_unlock(struct cyapa_softc *sc) { mtx_unlock(&sc->mutex); } #define CYAPA_LOCK_ASSERT(sc) mtx_assert(&(sc)->mutex, MA_OWNED); /* * Notify if possible receive data ready. Must be called * with sc->mutex held (cyapa_lock(sc)). */ static void cyapa_notify(struct cyapa_softc *sc) { CYAPA_LOCK_ASSERT(sc); if (sc->data_signal || !fifo_empty(sc, &sc->rfifo)) { KNOTE_LOCKED(&sc->selinfo.si_note, 0); if (sc->blocked || sc->isselect) { if (sc->blocked) { sc->blocked = 0; wakeup(&sc->blocked); } if (sc->isselect) { sc->isselect = 0; selwakeup(&sc->selinfo); } } } } /* * Initialize the device */ static int init_device(device_t dev, struct cyapa_cap *cap, int probe) { static char bl_exit[] = { 0x00, 0xff, 0xa5, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; static char bl_deactivate[] = { 0x00, 0xff, 0x3b, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; struct cyapa_boot_regs boot; int error; int retries; /* Get status */ error = cyapa_read_bytes(dev, CMD_BOOT_STATUS, (void *)&boot, sizeof(boot)); if (error) goto done; /* * Bootstrap the device if necessary. It can take up to 2 seconds * for the device to fully initialize. */ retries = 20; while ((boot.stat & CYAPA_STAT_RUNNING) == 0 && retries > 0) { if (boot.boot & CYAPA_BOOT_BUSY) { /* Busy, wait loop. 
*/ } else if (boot.error & CYAPA_ERROR_BOOTLOADER) { /* Magic */ error = cyapa_write_bytes(dev, CMD_BOOT_STATUS, bl_deactivate, sizeof(bl_deactivate)); if (error) goto done; } else { /* Magic */ error = cyapa_write_bytes(dev, CMD_BOOT_STATUS, bl_exit, sizeof(bl_exit)); if (error) goto done; } pause("cyapab1", (hz * 2) / 10); --retries; error = cyapa_read_bytes(dev, CMD_BOOT_STATUS, (void *)&boot, sizeof(boot)); if (error) goto done; } if (retries == 0) { device_printf(dev, "Unable to bring device out of bootstrap\n"); error = ENXIO; goto done; } /* Check identity */ if (cap) { error = cyapa_read_bytes(dev, CMD_QUERY_CAPABILITIES, (void *)cap, sizeof(*cap)); if (strncmp(cap->prod_ida, "CYTRA", 5) != 0) { device_printf(dev, "Product ID \"%5.5s\" mismatch\n", cap->prod_ida); error = ENXIO; } } error = cyapa_read_bytes(dev, CMD_BOOT_STATUS, (void *)&boot, sizeof(boot)); if (probe == 0) /* official init */ device_printf(dev, "cyapa init status %02x\n", boot.stat); else if (probe == 2) device_printf(dev, "cyapa reset status %02x\n", boot.stat); done: if (error) device_printf(dev, "Unable to initialize\n"); return (error); } /* * Start the polling thread */ static void cyapa_start(void *xdev) { struct cyapa_softc *sc; device_t dev = xdev; sc = device_get_softc(dev); config_intrhook_disestablish(&sc->intr_hook); /* Setup input event tracking */ cyapa_set_power_mode(sc, CMD_POWER_MODE_IDLE); /* Start the polling thread */ kthread_add(cyapa_poll_thread, sc, NULL, NULL, 0, 0, "cyapa-poll"); } static int cyapa_probe(device_t); static int cyapa_attach(device_t); static int cyapa_detach(device_t); static void cyapa_cdevpriv_dtor(void*); static device_method_t cyapa_methods[] = { /* device interface */ DEVMETHOD(device_probe, cyapa_probe), DEVMETHOD(device_attach, cyapa_attach), DEVMETHOD(device_detach, cyapa_detach), DEVMETHOD_END }; static driver_t cyapa_driver = { "cyapa", cyapa_methods, sizeof(struct cyapa_softc), }; static d_open_t cyapaopen; static d_ioctl_t cyapaioctl; static d_read_t cyaparead; static d_write_t cyapawrite; static d_kqfilter_t cyapakqfilter; static d_poll_t cyapapoll; static struct cdevsw cyapa_cdevsw = { .d_version = D_VERSION, .d_open = cyapaopen, .d_ioctl = cyapaioctl, .d_read = cyaparead, .d_write = cyapawrite, .d_kqfilter = cyapakqfilter, .d_poll = cyapapoll, }; static int cyapa_probe(device_t dev) { struct cyapa_cap cap; int addr; int error; addr = iicbus_get_addr(dev); /* * 0x67 - cypress trackpad on the acer c720 * (other devices might use other ids). 
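One detail worth calling out when comparing the comment with the check that follows: FreeBSD's iicbus convention is to carry the left-shifted (8-bit) slave address, so the 7-bit address 0x67 mentioned above shows up in the code as 0xce. A one-line standalone illustration (illustrative only):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        uint8_t addr7 = 0x67;   /* 7-bit I2C address from the comment */

        /* 8-bit form as used on the iicbus side. */
        assert((uint8_t)(addr7 << 1) == 0xce);
        return (0);
}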
*/ if (addr != 0xce) return (ENXIO); error = init_device(dev, &cap, 1); if (error != 0) return (ENXIO); device_set_desc(dev, "Cypress APA I2C Trackpad"); return (BUS_PROBE_VENDOR); } static int cyapa_attach(device_t dev) { struct cyapa_softc *sc; struct cyapa_cap cap; int unit; int addr; sc = device_get_softc(dev); sc->reporting_mode = 1; unit = device_get_unit(dev); addr = iicbus_get_addr(dev); if (init_device(dev, &cap, 0)) return (ENXIO); mtx_init(&sc->mutex, "cyapa", NULL, MTX_DEF); sc->dev = dev; knlist_init_mtx(&sc->selinfo.si_note, &sc->mutex); sc->cap_resx = ((cap.max_abs_xy_high << 4) & 0x0F00) | cap.max_abs_x_low; sc->cap_resy = ((cap.max_abs_xy_high << 8) & 0x0F00) | cap.max_abs_y_low; sc->cap_phyx = ((cap.phy_siz_xy_high << 4) & 0x0F00) | cap.phy_siz_x_low; sc->cap_phyy = ((cap.phy_siz_xy_high << 8) & 0x0F00) | cap.phy_siz_y_low; sc->cap_buttons = cap.buttons >> 3 & (CYAPA_FNGR_LEFT | CYAPA_FNGR_RIGHT | CYAPA_FNGR_MIDDLE); device_printf(dev, "%5.5s-%6.6s-%2.2s buttons=%c%c%c res=%dx%d\n", cap.prod_ida, cap.prod_idb, cap.prod_idc, ((sc->cap_buttons & CYAPA_FNGR_LEFT) ? 'L' : '-'), ((sc->cap_buttons & CYAPA_FNGR_MIDDLE) ? 'M' : '-'), ((sc->cap_buttons & CYAPA_FNGR_RIGHT) ? 'R' : '-'), sc->cap_resx, sc->cap_resy); sc->hw.buttons = 5; sc->hw.iftype = MOUSE_IF_PS2; sc->hw.type = MOUSE_MOUSE; sc->hw.model = MOUSE_MODEL_INTELLI; sc->hw.hwid = addr; sc->mode.protocol = MOUSE_PROTO_PS2; sc->mode.rate = 100; sc->mode.resolution = 4; sc->mode.accelfactor = 1; sc->mode.level = 0; sc->mode.packetsize = MOUSE_PS2_PACKETSIZE; sc->intr_hook.ich_func = cyapa_start; sc->intr_hook.ich_arg = sc->dev; #ifdef EVDEV_SUPPORT sc->evdev = evdev_alloc(); evdev_set_name(sc->evdev, device_get_desc(sc->dev)); evdev_set_phys(sc->evdev, device_get_nameunit(sc->dev)); evdev_set_id(sc->evdev, BUS_I2C, 0, 0, 1); evdev_set_flag(sc->evdev, EVDEV_FLAG_MT_STCOMPAT); evdev_set_flag(sc->evdev, EVDEV_FLAG_MT_AUTOREL); evdev_support_event(sc->evdev, EV_SYN); evdev_support_event(sc->evdev, EV_ABS); evdev_support_event(sc->evdev, EV_KEY); evdev_support_prop(sc->evdev, INPUT_PROP_POINTER); if (sc->cap_buttons & CYAPA_FNGR_LEFT) evdev_support_key(sc->evdev, BTN_LEFT); if (sc->cap_buttons & CYAPA_FNGR_RIGHT) evdev_support_key(sc->evdev, BTN_RIGHT); if (sc->cap_buttons & CYAPA_FNGR_MIDDLE) evdev_support_key(sc->evdev, BTN_MIDDLE); if (sc->cap_buttons == CYAPA_FNGR_LEFT) evdev_support_prop(sc->evdev, INPUT_PROP_BUTTONPAD); evdev_support_abs(sc->evdev, ABS_MT_SLOT, 0, CYAPA_MAX_MT - 1, 0, 0, 0); evdev_support_abs(sc->evdev, ABS_MT_TRACKING_ID, -1, 15, 0, 0, 0); evdev_support_abs(sc->evdev, ABS_MT_POSITION_X, 0, sc->cap_resx, 0, 0, sc->cap_phyx != 0 ? sc->cap_resx / sc->cap_phyx : 0); evdev_support_abs(sc->evdev, ABS_MT_POSITION_Y, 0, sc->cap_resy, 0, 0, sc->cap_phyy != 0 ? 
sc->cap_resy / sc->cap_phyy : 0); evdev_support_abs(sc->evdev, ABS_MT_PRESSURE, 0, 255, 0, 0, 0); if (evdev_register(sc->evdev) != 0) { mtx_destroy(&sc->mutex); return (ENOMEM); } #endif /* Postpone start of the polling thread until sleep is available */ if (config_intrhook_establish(&sc->intr_hook) != 0) { #ifdef EVDEV_SUPPORT evdev_free(sc->evdev); #endif mtx_destroy(&sc->mutex); return (ENOMEM); } sc->devnode = make_dev(&cyapa_cdevsw, unit, UID_ROOT, GID_WHEEL, 0600, "cyapa%d", unit); sc->devnode->si_drv1 = sc; return (0); } static int cyapa_detach(device_t dev) { struct cyapa_softc *sc; sc = device_get_softc(dev); /* Cleanup poller thread */ cyapa_lock(sc); while (sc->poll_thread_running) { sc->detaching = 1; mtx_sleep(&sc->detaching, &sc->mutex, PCATCH, "cyapadet", hz); } cyapa_unlock(sc); destroy_dev(sc->devnode); knlist_clear(&sc->selinfo.si_note, 0); seldrain(&sc->selinfo); knlist_destroy(&sc->selinfo.si_note); #ifdef EVDEV_SUPPORT evdev_free(sc->evdev); #endif mtx_destroy(&sc->mutex); return (0); } /* * USER DEVICE I/O FUNCTIONS */ static int cyapaopen(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct cyapa_cdevpriv *priv; int error; priv = malloc(sizeof(*priv), M_CYAPA, M_WAITOK | M_ZERO); priv->sc = dev->si_drv1; error = devfs_set_cdevpriv(priv, cyapa_cdevpriv_dtor); if (error == 0) { cyapa_lock(priv->sc); priv->sc->count++; cyapa_unlock(priv->sc); } else free(priv, M_CYAPA); return (error); } static void cyapa_cdevpriv_dtor(void *data) { struct cyapa_cdevpriv *priv; priv = data; KASSERT(priv != NULL, ("cyapa cdevpriv should not be NULL!")); cyapa_lock(priv->sc); priv->sc->count--; cyapa_unlock(priv->sc); free(priv, M_CYAPA); } static int cyaparead(struct cdev *dev, struct uio *uio, int ioflag) { struct cyapa_softc *sc; int error; int didread; size_t n; char* ptr; sc = dev->si_drv1; /* If buffer is empty, load a new event if it is ready */ cyapa_lock(sc); again: if (fifo_empty(sc, &sc->rfifo) && (sc->data_signal || sc->delta_x || sc->delta_y || sc->track_but != sc->reported_but)) { uint8_t c0; uint16_t but; int delta_x; int delta_y; int delta_z; /* Accumulate delta_x, delta_y */ sc->data_signal = 0; delta_x = sc->delta_x; delta_y = sc->delta_y; delta_z = sc->delta_z; if (delta_x > 255) { delta_x = 255; sc->data_signal = 1; } if (delta_x < -256) { delta_x = -256; sc->data_signal = 1; } if (delta_y > 255) { delta_y = 255; sc->data_signal = 1; } if (delta_y < -256) { delta_y = -256; sc->data_signal = 1; } if (delta_z > 255) { delta_z = 255; sc->data_signal = 1; } if (delta_z < -256) { delta_z = -256; sc->data_signal = 1; } but = sc->track_but; /* Adjust baseline for next calculation */ sc->delta_x -= delta_x; sc->delta_y -= delta_y; sc->delta_z -= delta_z; sc->reported_but = but; /* * Fuzz reduces movement jitter by introducing some * hysteresis. It operates without cumulative error so * if you swish around quickly and return your finger to * where it started, so to will the mouse. 
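The effect of this hysteresis is easiest to see with concrete numbers. The standalone sketch below (illustrative only) uses a local copy of the helper that appears further down as cyapa_fuzz(): single-count jitter is swallowed completely, while a larger move gives up at most one count.

#include <assert.h>

/* Local copy of the hysteresis helper (cyapa_fuzz()), for illustration. */
static int
fuzz(int delta, int *fuzzp)
{
        int f = *fuzzp;

        if (f >= 0 && delta < 0) {
                ++delta;
                --f;
        } else if (f <= 0 && delta > 0) {
                --delta;
                ++f;
        }
        *fuzzp = f;
        return (delta);
}

int
main(void)
{
        int f = 0, d;

        /* +1/-1 jitter is absorbed entirely... */
        d = fuzz(+1, &f);
        assert(d == 0 && f == 1);
        d = fuzz(-1, &f);
        assert(d == 0 && f == 0);
        /* ...while a real move only loses a single count. */
        d = fuzz(+5, &f);
        assert(d == 4 && f == 1);
        return (0);
}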
*/ delta_x = cyapa_fuzz(delta_x, &sc->fuzz_x); delta_y = cyapa_fuzz(delta_y, &sc->fuzz_y); delta_z = cyapa_fuzz(delta_z, &sc->fuzz_z); /* * Generate report */ c0 = 0; if (delta_x < 0) c0 |= 0x10; if (delta_y < 0) c0 |= 0x20; c0 |= 0x08; if (but & CYAPA_FNGR_LEFT) c0 |= 0x01; if (but & CYAPA_FNGR_MIDDLE) c0 |= 0x04; if (but & CYAPA_FNGR_RIGHT) c0 |= 0x02; fifo_write_char(sc, &sc->rfifo, c0); fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_x); fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_y); switch(sc->zenabled) { case 1: /* Z axis all 8 bits */ fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_z); break; case 2: /* * Z axis low 4 bits + 4th button and 5th button * (high 2 bits must be left 0). Auto-scale * delta_z to fit to avoid a wrong-direction * overflow (don't try to retain the remainder). */ while (delta_z > 7 || delta_z < -8) delta_z >>= 1; c0 = (uint8_t)delta_z & 0x0F; fifo_write_char(sc, &sc->rfifo, c0); break; default: /* basic PS/2 */ break; } cyapa_notify(sc); } /* Blocking / Non-blocking */ error = 0; didread = (uio->uio_resid == 0); while ((ioflag & IO_NDELAY) == 0 && fifo_empty(sc, &sc->rfifo)) { if (sc->data_signal) goto again; sc->blocked = 1; error = mtx_sleep(&sc->blocked, &sc->mutex, PCATCH, "cyablk", 0); if (error) break; } /* Return any buffered data */ while (error == 0 && uio->uio_resid && (n = fifo_ready(sc, &sc->rfifo)) > 0) { if (n > uio->uio_resid) n = uio->uio_resid; ptr = fifo_read(sc, &sc->rfifo, 0); cyapa_unlock(sc); error = uiomove(ptr, n, uio); cyapa_lock(sc); if (error) break; fifo_read(sc, &sc->rfifo, n); didread = 1; } cyapa_unlock(sc); if (error == 0 && didread == 0) { error = EWOULDBLOCK; } return (didread ? 0 : error); } static int cyapawrite(struct cdev *dev, struct uio *uio, int ioflag) { struct cyapa_softc *sc; int error; int cmd_completed; size_t n; uint8_t c0; char* ptr; sc = dev->si_drv1; again: /* * Copy data from userland. This will also cross-over the end * of the fifo and keep filling. 
*/ cyapa_lock(sc); while ((n = fifo_space(sc, &sc->wfifo)) > 0 && uio->uio_resid) { if (n > uio->uio_resid) n = uio->uio_resid; ptr = fifo_write(sc, &sc->wfifo, 0); cyapa_unlock(sc); error = uiomove(ptr, n, uio); cyapa_lock(sc); if (error) break; fifo_write(sc, &sc->wfifo, n); } /* Handle commands */ cmd_completed = (fifo_ready(sc, &sc->wfifo) != 0); while (fifo_ready(sc, &sc->wfifo) && cmd_completed && error == 0) { if (sc->ps2_cmd == 0) sc->ps2_cmd = fifo_read_char(sc, &sc->wfifo); switch(sc->ps2_cmd) { case 0xE6: /* SET SCALING 1:1 */ sc->scaling_mode = 0; fifo_write_char(sc, &sc->rfifo, 0xFA); break; case 0xE7: /* SET SCALING 2:1 */ sc->scaling_mode = 1; fifo_write_char(sc, &sc->rfifo, 0xFA); break; case 0xE8: /* SET RESOLUTION +1 byte */ if (sc->ps2_acked == 0) { sc->ps2_acked = 1; fifo_write_char(sc, &sc->rfifo, 0xFA); } if (fifo_ready(sc, &sc->wfifo) == 0) { cmd_completed = 0; break; } sc->mode.resolution = fifo_read_char(sc, &sc->wfifo); fifo_write_char(sc, &sc->rfifo, 0xFA); break; case 0xE9: /* * STATUS REQUEST * * byte1: * bit 7 0 * bit 6 Mode (1=remote mode, 0=stream mode) * bit 5 Enable (data reporting enabled) * bit 4 Scaling (0=1:1 1=2:1) * bit 3 0 * bit 2 LEFT BUTTON (1 if pressed) * bit 1 MIDDLE BUTTON (1 if pressed) * bit 0 RIGHT BUTTON (1 if pressed) * * byte2: resolution counts/mm * byte3: sample rate */ c0 = 0; if (sc->remote_mode) c0 |= 0x40; if (sc->reporting_mode) c0 |= 0x20; if (sc->scaling_mode) c0 |= 0x10; if (sc->track_but & CYAPA_FNGR_LEFT) c0 |= 0x04; if (sc->track_but & CYAPA_FNGR_MIDDLE) c0 |= 0x02; if (sc->track_but & CYAPA_FNGR_RIGHT) c0 |= 0x01; fifo_write_char(sc, &sc->rfifo, 0xFA); fifo_write_char(sc, &sc->rfifo, c0); fifo_write_char(sc, &sc->rfifo, 0x00); fifo_write_char(sc, &sc->rfifo, 100); break; case 0xEA: /* Set stream mode and reset movement counters */ sc->remote_mode = 0; fifo_write_char(sc, &sc->rfifo, 0xFA); sc->delta_x = 0; sc->delta_y = 0; sc->delta_z = 0; break; case 0xEB: /* * Read Data (if in remote mode). If not in remote * mode force an event. */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->data_signal = 1; break; case 0xEC: /* Reset Wrap Mode (ignored) */ fifo_write_char(sc, &sc->rfifo, 0xFA); break; case 0xEE: /* Set Wrap Mode (ignored) */ fifo_write_char(sc, &sc->rfifo, 0xFA); break; case 0xF0: /* Set Remote Mode */ sc->remote_mode = 1; fifo_write_char(sc, &sc->rfifo, 0xFA); sc->delta_x = 0; sc->delta_y = 0; sc->delta_z = 0; break; case 0xF2: /* * Get Device ID * * If we send 0x00 - normal PS/2 mouse, no Z-axis * * If we send 0x03 - Intellimouse, data packet has * an additional Z movement byte (8 bits signed). * (also reset movement counters) * * If we send 0x04 - Now includes z-axis and the * 4th and 5th mouse buttons. */ fifo_write_char(sc, &sc->rfifo, 0xFA); switch(sc->zenabled) { case 1: fifo_write_char(sc, &sc->rfifo, 0x03); break; case 2: fifo_write_char(sc, &sc->rfifo, 0x04); break; default: fifo_write_char(sc, &sc->rfifo, 0x00); break; } sc->delta_x = 0; sc->delta_y = 0; sc->delta_z = 0; break; case 0xF3: /* * Set Sample Rate * * byte1: the sample rate */ if (sc->ps2_acked == 0) { sc->ps2_acked = 1; fifo_write_char(sc, &sc->rfifo, 0xFA); } if (fifo_ready(sc, &sc->wfifo) == 0) { cmd_completed = 0; break; } sc->mode.rate = fifo_read_char(sc, &sc->wfifo); fifo_write_char(sc, &sc->rfifo, 0xFA); /* * zenabling sequence: 200,100,80 (device id 0x03) * 200,200,80 (device id 0x04) * * We support id 0x03 (no 4th or 5th button). * We support id 0x04 (w/ 4th and 5th button). 
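The rate-"knock" sequences listed above are easier to follow as a small state machine. The sketch below (standalone, illustrative only; it models the zenabled transitions in this case) shows how 200,100,80 selects the wheel-only protocol and 200,200,80 the wheel-plus-buttons one:

#include <assert.h>

/* Model of the sample-rate knock: returns 1 for id 0x03, 2 for id 0x04. */
static int
knock(const int *rates, int n)
{
        int z = 0, i;

        for (i = 0; i < n; i++) {
                if (z == 0 && rates[i] == 200)
                        z = -1;
                else if (z == -1 && rates[i] == 100)
                        z = -2;
                else if (z == -1 && rates[i] == 200)
                        z = -3;
                else if (z == -2 && rates[i] == 80)
                        z = 1;          /* z-axis mode */
                else if (z == -3 && rates[i] == 80)
                        z = 2;          /* z-axis + buttons 4/5 */
        }
        return (z);
}

int
main(void)
{
        const int wheel[] = { 200, 100, 80 };
        const int fivebtn[] = { 200, 200, 80 };

        assert(knock(wheel, 3) == 1);
        assert(knock(fivebtn, 3) == 2);
        return (0);
}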
*/ if (sc->zenabled == 0 && sc->mode.rate == 200) sc->zenabled = -1; else if (sc->zenabled == -1 && sc->mode.rate == 100) sc->zenabled = -2; else if (sc->zenabled == -1 && sc->mode.rate == 200) sc->zenabled = -3; else if (sc->zenabled == -2 && sc->mode.rate == 80) sc->zenabled = 1; /* z-axis mode */ else if (sc->zenabled == -3 && sc->mode.rate == 80) sc->zenabled = 2; /* z-axis+but4/5 */ if (sc->mode.level) sc->zenabled = 1; break; case 0xF4: /* Enable data reporting. Only effects stream mode. */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->reporting_mode = 1; break; case 0xF5: /* * Disable data reporting. Only effects stream mode * and is ignored right now. */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->reporting_mode = 1; break; case 0xF6: /* * SET DEFAULTS * * (reset sampling rate, resolution, scaling and * enter stream mode) */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->mode.rate = 100; sc->mode.resolution = 4; sc->scaling_mode = 0; sc->reporting_mode = 1; sc->remote_mode = 0; sc->delta_x = 0; sc->delta_y = 0; sc->delta_z = 0; /* signal */ break; case 0xFE: /* * RESEND * * Force a resend by guaranteeing that reported_but * differs from track_but. */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->data_signal = 1; break; case 0xFF: /* * RESET */ fifo_reset(sc, &sc->rfifo); /* should we do this? */ fifo_reset(sc, &sc->wfifo); /* should we do this? */ fifo_write_char(sc, &sc->rfifo, 0xFA); sc->delta_x = 0; sc->delta_y = 0; sc->delta_z = 0; sc->zenabled = 0; sc->mode.level = 0; break; default: printf("unknown command %02x\n", sc->ps2_cmd); break; } if (cmd_completed) { sc->ps2_cmd = 0; sc->ps2_acked = 0; } cyapa_notify(sc); } cyapa_unlock(sc); if (error == 0 && (cmd_completed || uio->uio_resid)) goto again; return (error); } static void cyapafiltdetach(struct knote *); static int cyapafilt(struct knote *, long); -static struct filterops cyapa_filtops = { +static const struct filterops cyapa_filtops = { .f_isfd = 1, .f_detach = cyapafiltdetach, .f_event = cyapafilt }; static int cyapakqfilter(struct cdev *dev, struct knote *kn) { struct cyapa_softc *sc; struct knlist *knlist; sc = dev->si_drv1; switch(kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &cyapa_filtops; kn->kn_hook = (void *)sc; break; default: return (EOPNOTSUPP); } knlist = &sc->selinfo.si_note; knlist_add(knlist, kn, 0); return (0); } static int cyapapoll(struct cdev *dev, int events, struct thread *td) { struct cyapa_softc *sc; int revents; sc = dev->si_drv1; revents = 0; cyapa_lock(sc); if (events & (POLLIN | POLLRDNORM)) { if (sc->data_signal || !fifo_empty(sc, &sc->rfifo)) revents = events & (POLLIN | POLLRDNORM); else { sc->isselect = 1; selrecord(td, &sc->selinfo); } } cyapa_unlock(sc); return (revents); } static void cyapafiltdetach(struct knote *kn) { struct cyapa_softc *sc; struct knlist *knlist; sc = (struct cyapa_softc *)kn->kn_hook; knlist = &sc->selinfo.si_note; knlist_remove(knlist, kn, 0); } static int cyapafilt(struct knote *kn, long hint) { struct cyapa_softc *sc; int ready; sc = (struct cyapa_softc *)kn->kn_hook; cyapa_lock(sc); ready = fifo_ready(sc, &sc->rfifo) || sc->data_signal; cyapa_unlock(sc); return (ready); } static int cyapaioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct cyapa_softc *sc; int error; sc = dev->si_drv1; error = 0; cyapa_lock(sc); switch (cmd) { case MOUSE_GETHWINFO: *(mousehw_t *)data = sc->hw; if (sc->mode.level == 0) ((mousehw_t *)data)->model = MOUSE_MODEL_GENERIC; break; case MOUSE_GETMODE: *(mousemode_t *)data = sc->mode; ((mousemode_t 
*)data)->resolution = MOUSE_RES_LOW - sc->mode.resolution; switch (sc->mode.level) { case 0: ((mousemode_t *)data)->protocol = MOUSE_PROTO_PS2; ((mousemode_t *)data)->packetsize = MOUSE_PS2_PACKETSIZE; break; case 2: ((mousemode_t *)data)->protocol = MOUSE_PROTO_PS2; ((mousemode_t *)data)->packetsize = MOUSE_PS2_PACKETSIZE + 1; break; } break; case MOUSE_GETLEVEL: *(int *)data = sc->mode.level; break; case MOUSE_SETLEVEL: if ((*(int *)data < 0) && (*(int *)data > 2)) { error = EINVAL; break; } sc->mode.level = *(int *)data ? 2 : 0; sc->zenabled = sc->mode.level ? 1 : 0; break; default: error = ENOTTY; break; } cyapa_unlock(sc); return (error); } /* * MAJOR SUPPORT FUNCTIONS */ static void cyapa_poll_thread(void *arg) { struct cyapa_softc *sc; struct cyapa_regs regs; device_t bus; /* iicbus */ int error; int freq; int isidle; int pstate; int npstate; int last_reset; sc = arg; freq = cyapa_norm_freq; isidle = 0; pstate = CMD_POWER_MODE_IDLE; last_reset = ticks; bus = device_get_parent(sc->dev); cyapa_lock(sc); sc->poll_thread_running = 1; while (!sc->detaching) { cyapa_unlock(sc); error = iicbus_request_bus(bus, sc->dev, IIC_WAIT); if (error == 0) { error = cyapa_read_bytes(sc->dev, CMD_DEV_STATUS, (void *)®s, sizeof(regs)); if (error == 0) { isidle = cyapa_raw_input(sc, ®s, freq); } /* * For some reason the device can crap-out. If it * drops back into bootstrap mode try to reinitialize * it. */ if (cyapa_reset || ((regs.stat & CYAPA_STAT_RUNNING) == 0 && (unsigned)(ticks - last_reset) > TIME_TO_RESET)) { cyapa_reset = 0; last_reset = ticks; init_device(sc->dev, NULL, 2); } iicbus_release_bus(bus, sc->dev); } pause("cyapw", hz / freq); ++sc->poll_ticks; if (sc->count == 0) { freq = cyapa_idle_freq; npstate = CMD_POWER_MODE_IDLE; } else if (isidle) { freq = cyapa_slow_freq; npstate = CMD_POWER_MODE_IDLE; } else { freq = cyapa_norm_freq; npstate = CMD_POWER_MODE_FULL; } if (pstate != npstate) { pstate = npstate; cyapa_set_power_mode(sc, pstate); if (cyapa_debug) { switch(pstate) { case CMD_POWER_MODE_OFF: printf("cyapa: power off\n"); break; case CMD_POWER_MODE_IDLE: printf("cyapa: power idle\n"); break; case CMD_POWER_MODE_FULL: printf("cyapa: power full\n"); break; } } } cyapa_lock(sc); } sc->poll_thread_running = 0; cyapa_unlock(sc); kthread_exit(); } static int cyapa_raw_input(struct cyapa_softc *sc, struct cyapa_regs *regs, int freq) { int nfingers; int afingers; /* actual fingers after culling */ int i; int j; int isidle; int thumbarea_begin; int seen_thumb; int x; int y; int z; int newfinger; int lessfingers; int click_x; int click_y; uint16_t but; /* high bits used for simulated but4/but5 */ thumbarea_begin = sc->cap_resy - ((sc->cap_resy * cyapa_thumbarea_percent) / 100); click_x = click_y = 0; /* * If the device is not running the rest of the status * means something else, set fingers to 0. */ if ((regs->stat & CYAPA_STAT_RUNNING) == 0) { regs->fngr = 0; } /* Process fingers/movement */ nfingers = CYAPA_FNGR_NUMFINGERS(regs->fngr); afingers = nfingers; if (cyapa_debug) { printf("stat %02x buttons %c%c%c nfngrs=%d ", regs->stat, ((regs->fngr & CYAPA_FNGR_LEFT) ? 'L' : '-'), ((regs->fngr & CYAPA_FNGR_MIDDLE) ? 'M' : '-'), ((regs->fngr & CYAPA_FNGR_RIGHT) ? 
'R' : '-'), nfingers); } #ifdef EVDEV_SUPPORT if (evdev_rcpt_mask & EVDEV_RCPT_HW_MOUSE) { for (i = 0; i < nfingers; ++i) { int slot = evdev_mt_id_to_slot( sc->evdev, regs->touch[i].id); if (slot == -1) { if (cyapa_debug) printf("Slot overflow for i=%d\n", regs->touch[i].id); continue; } evdev_push_abs(sc->evdev, ABS_MT_SLOT, slot); evdev_push_abs(sc->evdev, ABS_MT_TRACKING_ID, regs->touch[i].id); evdev_push_abs(sc->evdev, ABS_MT_POSITION_X, CYAPA_TOUCH_X(regs, i)); evdev_push_abs(sc->evdev, ABS_MT_POSITION_Y, CYAPA_TOUCH_Y(regs, i)); evdev_push_abs(sc->evdev, ABS_MT_PRESSURE, CYAPA_TOUCH_P(regs, i)); } if (sc->cap_buttons & CYAPA_FNGR_LEFT) evdev_push_key(sc->evdev, BTN_LEFT, regs->fngr & CYAPA_FNGR_LEFT); if (sc->cap_buttons & CYAPA_FNGR_RIGHT) evdev_push_key(sc->evdev, BTN_RIGHT, regs->fngr & CYAPA_FNGR_RIGHT); if (sc->cap_buttons & CYAPA_FNGR_MIDDLE) evdev_push_key(sc->evdev, BTN_MIDDLE, regs->fngr & CYAPA_FNGR_MIDDLE); evdev_sync(sc->evdev); } #endif seen_thumb = 0; for (i = 0; i < afingers; ) { if (cyapa_debug) { printf(" [x=%04d y=%04d p=%d i=%d]", CYAPA_TOUCH_X(regs, i), CYAPA_TOUCH_Y(regs, i), CYAPA_TOUCH_P(regs, i), regs->touch[i].id); } if ((CYAPA_TOUCH_Y(regs, i) > thumbarea_begin && seen_thumb) || CYAPA_TOUCH_P(regs, i) < cyapa_minpressure) { --afingers; if (i < afingers) { regs->touch[i] = regs->touch[i+1]; continue; } } else { if (CYAPA_TOUCH_Y(regs, i) > thumbarea_begin) seen_thumb = 1; } ++i; } nfingers = afingers; /* Tracking for local solutions */ cyapa_lock(sc); /* * Track timing for finger-downs. Used to detect false-3-finger * button-down. */ switch(afingers) { case 0: break; case 1: if (sc->track_nfingers == 0) sc->finger1_ticks = sc->poll_ticks; break; case 2: if (sc->track_nfingers <= 0) sc->finger1_ticks = sc->poll_ticks; if (sc->track_nfingers <= 1) sc->finger2_ticks = sc->poll_ticks; break; case 3: default: if (sc->track_nfingers <= 0) sc->finger1_ticks = sc->poll_ticks; if (sc->track_nfingers <= 1) sc->finger2_ticks = sc->poll_ticks; if (sc->track_nfingers <= 2) sc->finger3_ticks = sc->poll_ticks; break; } newfinger = sc->track_nfingers < afingers; lessfingers = sc->track_nfingers > afingers; sc->track_nfingers = afingers; /* * Lookup and track finger indexes in the touch[] array. */ if (afingers == 0) { click_x = sc->track_x; click_y = sc->track_y; sc->track_x = -1; sc->track_y = -1; sc->track_z = -1; sc->fuzz_x = 0; sc->fuzz_y = 0; sc->fuzz_z = 0; sc->touch_x = -1; sc->touch_y = -1; sc->touch_z = -1; sc->track_id = -1; sc->track_but = 0; i = 0; j = 0; } else { /* * The id assigned on touch can move around in the array, * find it. If that finger is lifted up, assign some other * finger for mouse tracking and reset track_x and track_y * to avoid a mouse jump. * * If >= 2 fingers are down be sure not to assign i and * j to the same index. */ for (i = 0; i < nfingers; ++i) { if (sc->track_id == regs->touch[i].id) break; } if (i == nfingers) { i = 0; sc->track_x = -1; sc->track_y = -1; sc->track_z = -1; while (CYAPA_TOUCH_Y(regs, i) >= thumbarea_begin && i < nfingers) ++i; if (i == nfingers) { i = 0; } sc->track_id = regs->touch[i].id; } else if ((sc->track_but || CYAPA_TOUCH_Y(regs, i) >= thumbarea_begin) && newfinger && afingers == 2) { j = regs->touch[0].id == sc->track_id ? 
1 : 0; if (CYAPA_TOUCH_Y(regs, j) < thumbarea_begin) { i = j; sc->track_x = -1; sc->track_y = -1; sc->track_z = -1; sc->track_id = regs->touch[i].id; } } } /* Two finger scrolling - reset after timeout */ if (sc->track_z != -1 && afingers != 2 && (sc->poll_ticks - sc->track_z_ticks) > cyapa_scroll_stick_ticks) { sc->track_z = -1; sc->track_z_ticks = 0; } /* Initiate two finger scrolling */ if (!(regs->fngr & CYAPA_FNGR_LEFT) && ((afingers && sc->track_z != -1) || (afingers == 2 && CYAPA_TOUCH_Y(regs, 0) < thumbarea_begin && CYAPA_TOUCH_Y(regs, 1) < thumbarea_begin))) { if (afingers == 2 && (sc->poll_ticks - sc->finger2_ticks) > cyapa_scroll_wait_ticks) { z = (CYAPA_TOUCH_Y(regs, 0) + CYAPA_TOUCH_Y(regs, 1)) >> 1; sc->delta_z += z / ZSCALE - sc->track_z; if (sc->track_z == -1) { sc->delta_z = 0; } if (sc->touch_z == -1) sc->touch_z = z; /* not used atm */ sc->track_z = z / ZSCALE; sc->track_z_ticks = sc->poll_ticks; } } else if (afingers) { /* Normal pad position reporting */ x = CYAPA_TOUCH_X(regs, i); y = CYAPA_TOUCH_Y(regs, i); click_x = x; click_y = y; if (sc->track_x != -1 && sc->track_y < thumbarea_begin && (afingers > 1 || (sc->poll_ticks - sc->finger1_ticks) >= cyapa_move_min_ticks || freq < cyapa_norm_freq)) { sc->delta_x += x - sc->track_x; sc->delta_y -= y - sc->track_y; if (sc->delta_x > sc->cap_resx) sc->delta_x = sc->cap_resx; if (sc->delta_x < -sc->cap_resx) sc->delta_x = -sc->cap_resx; if (sc->delta_y > sc->cap_resy) sc->delta_y = sc->cap_resy; if (sc->delta_y < -sc->cap_resy) sc->delta_y = -sc->cap_resy; if (abs(sc->delta_y) > sc->cap_resy / 2 || abs(sc->delta_x) > sc->cap_resx / 2) { if (cyapa_debug) printf("Detected jump by %i %i\n", sc->delta_x, sc->delta_y); sc->delta_x = sc->delta_y = 0; } } if (sc->touch_x == -1) { sc->touch_x = x; sc->touch_y = y; } sc->track_x = x; sc->track_y = y; } /* Select finger (L = 2/3x, M = 1/3u, R = 1/3d) */ int is_tapclick = (cyapa_enable_tapclick && lessfingers && afingers == 0 && sc->poll_ticks - sc->finger1_ticks >= cyapa_tapclick_min_ticks && sc->poll_ticks - sc->finger1_ticks < cyapa_tapclick_max_ticks); if (regs->fngr & CYAPA_FNGR_LEFT || is_tapclick) { if (sc->track_but) { but = sc->track_but; } else if (afingers == 1) { if (click_x < sc->cap_resx * 2 / 3) but = CYAPA_FNGR_LEFT; else if (click_y < sc->cap_resy / 2) but = CYAPA_FNGR_MIDDLE; else but = CYAPA_FNGR_RIGHT; } else if (is_tapclick) { if (click_x < sc->cap_resx * 2 / 3 || cyapa_enable_tapclick < 2) but = CYAPA_FNGR_LEFT; else if (click_y < sc->cap_resy / 2 && cyapa_enable_tapclick > 2) but = CYAPA_FNGR_MIDDLE; else but = CYAPA_FNGR_RIGHT; } else { but = CYAPA_FNGR_LEFT; } } else { but = 0; } /* * Detect state change from last reported state and * determine if we have gone idle. 
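A side note on the idle test that follows: the elapsed-time comparisons are written as unsigned differences so they keep giving the right answer when the tick counter wraps around. A minimal standalone illustration of the idea (using uint32_t rather than the kernel's ticks variable):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        uint32_t then = UINT32_MAX - 5;         /* just before wrap-around */
        uint32_t now = 4;                       /* 10 ticks later */

        /* Modular arithmetic still yields the elapsed count. */
        assert(now - then == 10);
        return (0);
}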
*/ sc->track_but = but; if (sc->delta_x || sc->delta_y || sc->delta_z || sc->track_but != sc->reported_but) { sc->active_tick = ticks; if (sc->remote_mode == 0 && sc->reporting_mode) sc->data_signal = 1; isidle = 0; } else if ((unsigned)(ticks - sc->active_tick) >= TIME_TO_IDLE) { sc->active_tick = ticks - TIME_TO_IDLE; /* prevent overflow */ isidle = 1; } else { isidle = 0; } cyapa_notify(sc); cyapa_unlock(sc); if (cyapa_debug) printf("%i >> %i << %i\n", isidle, sc->track_id, sc->delta_y); return (isidle); } static void cyapa_set_power_mode(struct cyapa_softc *sc, int mode) { uint8_t data; device_t bus; int error; bus = device_get_parent(sc->dev); error = iicbus_request_bus(bus, sc->dev, IIC_WAIT); if (error == 0) { error = cyapa_read_bytes(sc->dev, CMD_POWER_MODE, &data, 1); data = (data & ~0xFC) | mode; if (error == 0) { error = cyapa_write_bytes(sc->dev, CMD_POWER_MODE, &data, 1); } iicbus_release_bus(bus, sc->dev); } } /* * FIFO FUNCTIONS */ /* * Returns non-zero if the fifo is empty */ static int fifo_empty(struct cyapa_softc *sc, struct cyapa_fifo *fifo) { CYAPA_LOCK_ASSERT(sc); return (fifo->rindex == fifo->windex); } /* * Returns the number of characters available for reading from * the fifo without wrapping the fifo buffer. */ static size_t fifo_ready(struct cyapa_softc *sc, struct cyapa_fifo *fifo) { size_t n; CYAPA_LOCK_ASSERT(sc); n = CYAPA_BUFSIZE - (fifo->rindex & CYAPA_BUFMASK); if (n > (size_t)(fifo->windex - fifo->rindex)) n = (size_t)(fifo->windex - fifo->rindex); return (n); } /* * Returns a read pointer into the fifo and then bumps * rindex. The FIFO must have at least 'n' characters in * it. The value (n) can cause the index to wrap but users * of the buffer should never supply a value for (n) that wraps * the buffer. */ static char * fifo_read(struct cyapa_softc *sc, struct cyapa_fifo *fifo, size_t n) { char *ptr; CYAPA_LOCK_ASSERT(sc); if (n > (CYAPA_BUFSIZE - (fifo->rindex & CYAPA_BUFMASK))) { printf("fifo_read: overflow\n"); return (fifo->buf); } ptr = fifo->buf + (fifo->rindex & CYAPA_BUFMASK); fifo->rindex += n; return (ptr); } static uint8_t fifo_read_char(struct cyapa_softc *sc, struct cyapa_fifo *fifo) { uint8_t c; CYAPA_LOCK_ASSERT(sc); if (fifo->rindex == fifo->windex) { printf("fifo_read_char: overflow\n"); c = 0; } else { c = fifo->buf[fifo->rindex & CYAPA_BUFMASK]; ++fifo->rindex; } return (c); } /* * Write a character to the FIFO. The character will be discarded * if the FIFO is full. */ static void fifo_write_char(struct cyapa_softc *sc, struct cyapa_fifo *fifo, uint8_t c) { CYAPA_LOCK_ASSERT(sc); if (fifo->windex - fifo->rindex < CYAPA_BUFSIZE) { fifo->buf[fifo->windex & CYAPA_BUFMASK] = c; ++fifo->windex; } } /* * Return the amount of space available for writing without wrapping * the fifo. 
*/ static size_t fifo_space(struct cyapa_softc *sc, struct cyapa_fifo *fifo) { size_t n; CYAPA_LOCK_ASSERT(sc); n = CYAPA_BUFSIZE - (fifo->windex & CYAPA_BUFMASK); if (n > (size_t)(CYAPA_BUFSIZE - (fifo->windex - fifo->rindex))) n = (size_t)(CYAPA_BUFSIZE - (fifo->windex - fifo->rindex)); return (n); } static char * fifo_write(struct cyapa_softc *sc, struct cyapa_fifo *fifo, size_t n) { char *ptr; CYAPA_LOCK_ASSERT(sc); ptr = fifo->buf + (fifo->windex & CYAPA_BUFMASK); fifo->windex += n; return (ptr); } static void fifo_reset(struct cyapa_softc *sc, struct cyapa_fifo *fifo) { CYAPA_LOCK_ASSERT(sc); fifo->rindex = 0; fifo->windex = 0; } /* * Fuzz handling */ static int cyapa_fuzz(int delta, int *fuzzp) { int fuzz; fuzz = *fuzzp; if (fuzz >= 0 && delta < 0) { ++delta; --fuzz; } else if (fuzz <= 0 && delta > 0) { --delta; ++fuzz; } *fuzzp = fuzz; return (delta); } DRIVER_MODULE(cyapa, iicbus, cyapa_driver, NULL, NULL); MODULE_DEPEND(cyapa, iicbus, IICBUS_MINVER, IICBUS_PREFVER, IICBUS_MAXVER); #ifdef EVDEV_SUPPORT MODULE_DEPEND(cyapa, evdev, 1, 1, 1); #endif MODULE_VERSION(cyapa, 1); diff --git a/sys/dev/evdev/cdev.c b/sys/dev/evdev/cdev.c index c9a8258a03a9..9fe1299a0937 100644 --- a/sys/dev/evdev/cdev.c +++ b/sys/dev/evdev/cdev.c @@ -1,951 +1,951 @@ /*- * Copyright (c) 2014 Jakub Wojciech Klama * Copyright (c) 2015-2016 Vladimir Kondratyev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_evdev.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include struct input_event32 { struct timeval32 time; uint16_t type; uint16_t code; int32_t value; }; #endif #ifdef EVDEV_DEBUG #define debugf(client, fmt, args...) printf("evdev cdev: "fmt"\n", ##args) #else #define debugf(client, fmt, args...) 
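For context on how the per-client queue below is consumed, here is a minimal userland reader (illustrative only; the device path is just an example) that pulls whole struct input_event records, the smallest unit evdev_read() will hand out for a non-zero read:

#include <dev/evdev/input.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        struct input_event ev;
        int fd;

        fd = open("/dev/input/event0", O_RDONLY);       /* example path */
        if (fd == -1)
                err(1, "open");
        /* Read one whole event record at a time. */
        while (read(fd, &ev, sizeof(ev)) == (ssize_t)sizeof(ev))
                printf("type=%#x code=%#x value=%d\n",
                    (unsigned int)ev.type, (unsigned int)ev.code,
                    (int)ev.value);
        close(fd);
        return (0);
}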
#endif #define DEF_RING_REPORTS 8 static d_open_t evdev_open; static d_read_t evdev_read; static d_write_t evdev_write; static d_ioctl_t evdev_ioctl; static d_poll_t evdev_poll; static d_kqfilter_t evdev_kqfilter; static int evdev_kqread(struct knote *kn, long hint); static void evdev_kqdetach(struct knote *kn); static void evdev_dtor(void *); static int evdev_ioctl_eviocgbit(struct evdev_dev *, int, int, caddr_t, struct thread *); static void evdev_client_filter_queue(struct evdev_client *, uint16_t); static struct cdevsw evdev_cdevsw = { .d_version = D_VERSION, .d_open = evdev_open, .d_read = evdev_read, .d_write = evdev_write, .d_ioctl = evdev_ioctl, .d_poll = evdev_poll, .d_kqfilter = evdev_kqfilter, .d_name = "evdev", }; -static struct filterops evdev_cdev_filterops = { +static const struct filterops evdev_cdev_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = evdev_kqdetach, .f_event = evdev_kqread, }; static int evdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct evdev_dev *evdev = dev->si_drv1; struct evdev_client *client; size_t buffer_size; int ret; if (evdev == NULL) return (ENODEV); /* Initialize client structure */ buffer_size = evdev->ev_report_size * DEF_RING_REPORTS; client = malloc(offsetof(struct evdev_client, ec_buffer) + sizeof(struct input_event) * buffer_size, M_EVDEV, M_WAITOK | M_ZERO); /* Initialize ring buffer */ client->ec_buffer_size = buffer_size; client->ec_buffer_head = 0; client->ec_buffer_tail = 0; client->ec_buffer_ready = 0; client->ec_evdev = evdev; mtx_init(&client->ec_buffer_mtx, "evclient", "evdev", MTX_DEF); knlist_init_mtx(&client->ec_selp.si_note, &client->ec_buffer_mtx); ret = EVDEV_LIST_LOCK_SIG(evdev); if (ret != 0) goto out; /* Avoid race with evdev_unregister */ if (dev->si_drv1 == NULL) ret = ENODEV; else ret = evdev_register_client(evdev, client); EVDEV_LIST_UNLOCK(evdev); out: if (ret == 0) ret = devfs_set_cdevpriv(client, evdev_dtor); else client->ec_revoked = true; if (ret != 0) { debugf(client, "cannot register evdev client"); evdev_dtor(client); } return (ret); } static void evdev_dtor(void *data) { struct evdev_client *client = (struct evdev_client *)data; EVDEV_LIST_LOCK(client->ec_evdev); if (!client->ec_revoked) evdev_dispose_client(client->ec_evdev, client); EVDEV_LIST_UNLOCK(client->ec_evdev); if (client->ec_evdev->ev_lock_type != EV_LOCK_MTX) epoch_wait_preempt(INPUT_EPOCH); knlist_clear(&client->ec_selp.si_note, 0); seldrain(&client->ec_selp); knlist_destroy(&client->ec_selp.si_note); funsetown(&client->ec_sigio); mtx_destroy(&client->ec_buffer_mtx); free(client, M_EVDEV); } static int evdev_read(struct cdev *dev, struct uio *uio, int ioflag) { struct evdev_client *client; union { struct input_event t; #ifdef COMPAT_FREEBSD32 struct input_event32 t32; #endif } event; struct input_event *head; size_t evsize; int ret = 0; int remaining; ret = devfs_get_cdevpriv((void **)&client); if (ret != 0) return (ret); debugf(client, "read %zd bytes by thread %d", uio->uio_resid, uio->uio_td->td_tid); if (client->ec_revoked) return (ENODEV); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) evsize = sizeof(struct input_event32); else #endif evsize = sizeof(struct input_event); /* Zero-sized reads are allowed for error checking */ if (uio->uio_resid != 0 && uio->uio_resid < evsize) return (EINVAL); remaining = uio->uio_resid / evsize; EVDEV_CLIENT_LOCKQ(client); if (EVDEV_CLIENT_EMPTYQ(client)) { if (ioflag & O_NONBLOCK) ret = EWOULDBLOCK; else { if (remaining != 0) { client->ec_blocked = true; ret = 
mtx_sleep(client, &client->ec_buffer_mtx, PCATCH, "evread", 0); if (ret == 0 && client->ec_revoked) ret = ENODEV; } } } while (ret == 0 && !EVDEV_CLIENT_EMPTYQ(client) && remaining > 0) { head = client->ec_buffer + client->ec_buffer_head; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { bzero(&event.t32, sizeof(struct input_event32)); TV_CP(*head, event.t32, time); CP(*head, event.t32, type); CP(*head, event.t32, code); CP(*head, event.t32, value); } else #endif bcopy(head, &event.t, evsize); client->ec_buffer_head = (client->ec_buffer_head + 1) % client->ec_buffer_size; remaining--; EVDEV_CLIENT_UNLOCKQ(client); ret = uiomove(&event, evsize, uio); EVDEV_CLIENT_LOCKQ(client); } EVDEV_CLIENT_UNLOCKQ(client); return (ret); } static int evdev_write(struct cdev *dev, struct uio *uio, int ioflag) { struct evdev_dev *evdev = dev->si_drv1; struct evdev_client *client; union { struct input_event t; #ifdef COMPAT_FREEBSD32 struct input_event32 t32; #endif } event; size_t evsize; int ret = 0; ret = devfs_get_cdevpriv((void **)&client); if (ret != 0) return (ret); debugf(client, "write %zd bytes by thread %d", uio->uio_resid, uio->uio_td->td_tid); if (client->ec_revoked || evdev == NULL) return (ENODEV); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) evsize = sizeof(struct input_event32); else #endif evsize = sizeof(struct input_event); if (uio->uio_resid % evsize != 0) { debugf(client, "write size not multiple of input_event size"); return (EINVAL); } while (uio->uio_resid > 0 && ret == 0) { ret = uiomove(&event, evsize, uio); if (ret == 0) { #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ret = evdev_inject_event(evdev, event.t32.type, event.t32.code, event.t32.value); else #endif ret = evdev_inject_event(evdev, event.t.type, event.t.code, event.t.value); } } return (ret); } static int evdev_poll(struct cdev *dev, int events, struct thread *td) { struct evdev_client *client; int ret; int revents = 0; ret = devfs_get_cdevpriv((void **)&client); if (ret != 0) return (POLLNVAL); debugf(client, "poll by thread %d", td->td_tid); if (client->ec_revoked) return (POLLHUP); if (events & (POLLIN | POLLRDNORM)) { EVDEV_CLIENT_LOCKQ(client); if (!EVDEV_CLIENT_EMPTYQ(client)) revents = events & (POLLIN | POLLRDNORM); else { client->ec_selected = true; selrecord(td, &client->ec_selp); } EVDEV_CLIENT_UNLOCKQ(client); } return (revents); } static int evdev_kqfilter(struct cdev *dev, struct knote *kn) { struct evdev_client *client; int ret; ret = devfs_get_cdevpriv((void **)&client); if (ret != 0) return (ret); if (client->ec_revoked) return (ENODEV); switch(kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &evdev_cdev_filterops; break; default: return(EINVAL); } kn->kn_hook = (caddr_t)client; knlist_add(&client->ec_selp.si_note, kn, 0); return (0); } static int evdev_kqread(struct knote *kn, long hint) { struct evdev_client *client; int ret; client = (struct evdev_client *)kn->kn_hook; EVDEV_CLIENT_LOCKQ_ASSERT(client); if (client->ec_revoked) { kn->kn_flags |= EV_EOF; ret = 1; } else { kn->kn_data = EVDEV_CLIENT_SIZEQ(client) * sizeof(struct input_event); ret = !EVDEV_CLIENT_EMPTYQ(client); } return (ret); } static void evdev_kqdetach(struct knote *kn) { struct evdev_client *client; client = (struct evdev_client *)kn->kn_hook; knlist_remove(&client->ec_selp.si_note, kn, 0); } static int evdev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct evdev_dev *evdev = dev->si_drv1; struct evdev_client *client; struct input_keymap_entry *ke; struct epoch_tracker 
et; int ret, len, limit, type_num; uint32_t code; size_t nvalues; ret = devfs_get_cdevpriv((void **)&client); if (ret != 0) return (ret); if (client->ec_revoked || evdev == NULL) return (ENODEV); /* * Fix evdev state corrupted with discarding of kdb events. * EVIOCGKEY and EVIOCGLED ioctls can suffer from this. */ if (evdev->ev_kdb_active) { EVDEV_LOCK(evdev); if (evdev->ev_kdb_active) { evdev->ev_kdb_active = false; if (evdev->ev_lock_type == EV_LOCK_EXT_EPOCH) epoch_enter_preempt(INPUT_EPOCH, &et); evdev_restore_after_kdb(evdev); if (evdev->ev_lock_type == EV_LOCK_EXT_EPOCH) epoch_exit_preempt(INPUT_EPOCH, &et); } EVDEV_UNLOCK(evdev); } /* file I/O ioctl handling */ switch (cmd) { case FIOSETOWN: return (fsetown(*(int *)data, &client->ec_sigio)); case FIOGETOWN: *(int *)data = fgetown(&client->ec_sigio); return (0); case FIONBIO: return (0); case FIOASYNC: if (*(int *)data) client->ec_async = true; else client->ec_async = false; return (0); case FIONREAD: EVDEV_CLIENT_LOCKQ(client); *(int *)data = EVDEV_CLIENT_SIZEQ(client) * sizeof(struct input_event); EVDEV_CLIENT_UNLOCKQ(client); return (0); } len = IOCPARM_LEN(cmd); debugf(client, "ioctl called: cmd=0x%08lx, data=%p", cmd, data); /* evdev fixed-length ioctls handling */ switch (cmd) { case EVIOCGVERSION: *(int *)data = EV_VERSION; return (0); case EVIOCGID: debugf(client, "EVIOCGID: bus=%d vendor=0x%04x product=0x%04x", evdev->ev_id.bustype, evdev->ev_id.vendor, evdev->ev_id.product); memcpy(data, &evdev->ev_id, sizeof(struct input_id)); return (0); case EVIOCGREP: if (!evdev_event_supported(evdev, EV_REP)) return (ENOTSUP); memcpy(data, evdev->ev_rep, sizeof(evdev->ev_rep)); return (0); case EVIOCSREP: if (!evdev_event_supported(evdev, EV_REP)) return (ENOTSUP); evdev_inject_event(evdev, EV_REP, REP_DELAY, ((int *)data)[0]); evdev_inject_event(evdev, EV_REP, REP_PERIOD, ((int *)data)[1]); return (0); case EVIOCGKEYCODE: /* Fake unsupported ioctl */ return (0); case EVIOCGKEYCODE_V2: if (evdev->ev_methods == NULL || evdev->ev_methods->ev_get_keycode == NULL) return (ENOTSUP); ke = (struct input_keymap_entry *)data; evdev->ev_methods->ev_get_keycode(evdev, ke); return (0); case EVIOCSKEYCODE: /* Fake unsupported ioctl */ return (0); case EVIOCSKEYCODE_V2: if (evdev->ev_methods == NULL || evdev->ev_methods->ev_set_keycode == NULL) return (ENOTSUP); ke = (struct input_keymap_entry *)data; evdev->ev_methods->ev_set_keycode(evdev, ke); return (0); case EVIOCGABS(0) ... EVIOCGABS(ABS_MAX): if (evdev->ev_absinfo == NULL) return (EINVAL); memcpy(data, &evdev->ev_absinfo[cmd - EVIOCGABS(0)], sizeof(struct input_absinfo)); return (0); case EVIOCSABS(0) ... 
EVIOCSABS(ABS_MAX): if (evdev->ev_absinfo == NULL) return (EINVAL); code = cmd - EVIOCSABS(0); /* mt-slot number can not be changed */ if (code == ABS_MT_SLOT) return (EINVAL); EVDEV_LOCK(evdev); evdev_set_absinfo(evdev, code, (struct input_absinfo *)data); EVDEV_UNLOCK(evdev); return (0); case EVIOCSFF: case EVIOCRMFF: case EVIOCGEFFECTS: /* Fake unsupported ioctls */ return (0); case EVIOCGRAB: EVDEV_LOCK(evdev); if (*(int *)data) ret = evdev_grab_client(evdev, client); else ret = evdev_release_client(evdev, client); EVDEV_UNLOCK(evdev); return (ret); case EVIOCREVOKE: if (*(int *)data != 0) return (EINVAL); EVDEV_LIST_LOCK(evdev); if (dev->si_drv1 != NULL && !client->ec_revoked) { evdev_dispose_client(evdev, client); evdev_revoke_client(client); } EVDEV_LIST_UNLOCK(evdev); return (0); case EVIOCSCLOCKID: switch (*(int *)data) { case CLOCK_REALTIME: client->ec_clock_id = EV_CLOCK_REALTIME; return (0); case CLOCK_MONOTONIC: client->ec_clock_id = EV_CLOCK_MONOTONIC; return (0); default: return (EINVAL); } } /* evdev variable-length ioctls handling */ switch (IOCBASECMD(cmd)) { case EVIOCGNAME(0): /* Linux evdev does not terminate truncated strings with 0 */ limit = MIN(strlen(evdev->ev_name) + 1, len); memcpy(data, evdev->ev_name, limit); td->td_retval[0] = limit; return (0); case EVIOCGPHYS(0): if (evdev->ev_shortname[0] == 0) return (ENOENT); limit = MIN(strlen(evdev->ev_shortname) + 1, len); memcpy(data, evdev->ev_shortname, limit); td->td_retval[0] = limit; return (0); case EVIOCGUNIQ(0): if (evdev->ev_serial[0] == 0) return (ENOENT); limit = MIN(strlen(evdev->ev_serial) + 1, len); memcpy(data, evdev->ev_serial, limit); td->td_retval[0] = limit; return (0); case EVIOCGPROP(0): limit = MIN(len, bitstr_size(INPUT_PROP_CNT)); memcpy(data, evdev->ev_prop_flags, limit); td->td_retval[0] = limit; return (0); case EVIOCGMTSLOTS(0): /* EVIOCGMTSLOTS always returns 0 on success */ if (evdev->ev_mt == NULL) return (EINVAL); if (len < sizeof(uint32_t)) return (EINVAL); code = *(uint32_t *)data; if (!ABS_IS_MT(code)) return (EINVAL); nvalues = MIN(len / sizeof(int32_t) - 1, MAXIMAL_MT_SLOT(evdev) + 1); for (int i = 0; i < nvalues; i++) ((int32_t *)data)[i + 1] = evdev_mt_get_value(evdev, i, code); return (0); case EVIOCGKEY(0): limit = MIN(len, bitstr_size(KEY_CNT)); EVDEV_LOCK(evdev); evdev_client_filter_queue(client, EV_KEY); memcpy(data, evdev->ev_key_states, limit); EVDEV_UNLOCK(evdev); td->td_retval[0] = limit; return (0); case EVIOCGLED(0): limit = MIN(len, bitstr_size(LED_CNT)); EVDEV_LOCK(evdev); evdev_client_filter_queue(client, EV_LED); memcpy(data, evdev->ev_led_states, limit); EVDEV_UNLOCK(evdev); td->td_retval[0] = limit; return (0); case EVIOCGSND(0): limit = MIN(len, bitstr_size(SND_CNT)); EVDEV_LOCK(evdev); evdev_client_filter_queue(client, EV_SND); memcpy(data, evdev->ev_snd_states, limit); EVDEV_UNLOCK(evdev); td->td_retval[0] = limit; return (0); case EVIOCGSW(0): limit = MIN(len, bitstr_size(SW_CNT)); EVDEV_LOCK(evdev); evdev_client_filter_queue(client, EV_SW); memcpy(data, evdev->ev_sw_states, limit); EVDEV_UNLOCK(evdev); td->td_retval[0] = limit; return (0); case EVIOCGBIT(0, 0) ... 
EVIOCGBIT(EV_MAX, 0): type_num = IOCBASECMD(cmd) - EVIOCGBIT(0, 0); debugf(client, "EVIOCGBIT(%d): data=%p, len=%d", type_num, data, len); return (evdev_ioctl_eviocgbit(evdev, type_num, len, data, td)); } return (EINVAL); } static int evdev_ioctl_eviocgbit(struct evdev_dev *evdev, int type, int len, caddr_t data, struct thread *td) { unsigned long *bitmap; int limit; switch (type) { case 0: bitmap = evdev->ev_type_flags; limit = EV_CNT; break; case EV_KEY: bitmap = evdev->ev_key_flags; limit = KEY_CNT; break; case EV_REL: bitmap = evdev->ev_rel_flags; limit = REL_CNT; break; case EV_ABS: bitmap = evdev->ev_abs_flags; limit = ABS_CNT; break; case EV_MSC: bitmap = evdev->ev_msc_flags; limit = MSC_CNT; break; case EV_LED: bitmap = evdev->ev_led_flags; limit = LED_CNT; break; case EV_SND: bitmap = evdev->ev_snd_flags; limit = SND_CNT; break; case EV_SW: bitmap = evdev->ev_sw_flags; limit = SW_CNT; break; case EV_FF: /* * We don't support EV_FF now, so let's * just fake it returning only zeros. */ bzero(data, len); td->td_retval[0] = len; return (0); default: return (ENOTTY); } /* * Clear ioctl data buffer in case it's bigger than * bitmap size */ bzero(data, len); limit = bitstr_size(limit); len = MIN(limit, len); memcpy(data, bitmap, len); td->td_retval[0] = len; return (0); } void evdev_revoke_client(struct evdev_client *client) { EVDEV_LIST_LOCK_ASSERT(client->ec_evdev); client->ec_revoked = true; } void evdev_notify_event(struct evdev_client *client) { EVDEV_CLIENT_LOCKQ_ASSERT(client); if (client->ec_blocked) { client->ec_blocked = false; wakeup(client); } if (client->ec_selected) { client->ec_selected = false; selwakeup(&client->ec_selp); } KNOTE_LOCKED(&client->ec_selp.si_note, 0); if (client->ec_async && client->ec_sigio != NULL) pgsigio(&client->ec_sigio, SIGIO, 0); } int evdev_cdev_create(struct evdev_dev *evdev) { struct make_dev_args mda; int ret, unit = 0; make_dev_args_init(&mda); mda.mda_flags = MAKEDEV_WAITOK | MAKEDEV_CHECKNAME; mda.mda_devsw = &evdev_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = evdev; /* Try to coexist with cuse-backed input/event devices */ while ((ret = make_dev_s(&mda, &evdev->ev_cdev, "input/event%d", unit)) == EEXIST) unit++; if (ret == 0) evdev->ev_unit = unit; return (ret); } int evdev_cdev_destroy(struct evdev_dev *evdev) { destroy_dev(evdev->ev_cdev); return (0); } static void evdev_client_gettime(struct evdev_client *client, struct timeval *tv) { switch (client->ec_clock_id) { case EV_CLOCK_BOOTTIME: /* * XXX: FreeBSD does not support true POSIX monotonic clock. * So aliase EV_CLOCK_BOOTTIME to EV_CLOCK_MONOTONIC. */ case EV_CLOCK_MONOTONIC: microuptime(tv); break; case EV_CLOCK_REALTIME: default: microtime(tv); break; } } void evdev_client_push(struct evdev_client *client, uint16_t type, uint16_t code, int32_t value) { struct timeval time; size_t count, head, tail, ready; EVDEV_CLIENT_LOCKQ_ASSERT(client); head = client->ec_buffer_head; tail = client->ec_buffer_tail; ready = client->ec_buffer_ready; count = client->ec_buffer_size; /* If queue is full drop its content and place SYN_DROPPED event */ if ((tail + 1) % count == head) { debugf(client, "client %p: buffer overflow", client); head = (tail + count - 1) % count; client->ec_buffer[head] = (struct input_event) { .type = EV_SYN, .code = SYN_DROPPED, .value = 0 }; /* * XXX: Here is a small race window from now till the end of * report. The queue is empty but client has been already * notified of data readyness. Can be fixed in two ways: * 1. 
Implement bulk insert so queue lock would not be dropped * till the SYN_REPORT event. * 2. Insert SYN_REPORT just now and skip remaining events */ client->ec_buffer_head = head; client->ec_buffer_ready = head; } client->ec_buffer[tail].type = type; client->ec_buffer[tail].code = code; client->ec_buffer[tail].value = value; client->ec_buffer_tail = (tail + 1) % count; /* Allow users to read events only after report has been completed */ if (type == EV_SYN && code == SYN_REPORT) { evdev_client_gettime(client, &time); for (; ready != client->ec_buffer_tail; ready = (ready + 1) % count) client->ec_buffer[ready].time = time; client->ec_buffer_ready = client->ec_buffer_tail; } } void evdev_client_dumpqueue(struct evdev_client *client) { struct input_event *event; size_t i, head, tail, ready, size; head = client->ec_buffer_head; tail = client->ec_buffer_tail; ready = client->ec_buffer_ready; size = client->ec_buffer_size; printf("evdev client: %p\n", client); printf("event queue: head=%zu ready=%zu tail=%zu size=%zu\n", head, ready, tail, size); printf("queue contents:\n"); for (i = 0; i < size; i++) { event = &client->ec_buffer[i]; printf("%zu: ", i); if (i < head || i > tail) printf("unused\n"); else printf("type=%d code=%d value=%d ", event->type, event->code, event->value); if (i == head) printf("<- head\n"); else if (i == tail) printf("<- tail\n"); else if (i == ready) printf("<- ready\n"); else printf("\n"); } } static void evdev_client_filter_queue(struct evdev_client *client, uint16_t type) { struct input_event *event; size_t head, tail, count, i; bool last_was_syn = false; EVDEV_CLIENT_LOCKQ(client); i = head = client->ec_buffer_head; tail = client->ec_buffer_tail; count = client->ec_buffer_size; client->ec_buffer_ready = client->ec_buffer_tail; while (i != client->ec_buffer_tail) { event = &client->ec_buffer[i]; i = (i + 1) % count; /* Skip event of given type */ if (event->type == type) continue; /* Remove empty SYN_REPORT events */ if (event->type == EV_SYN && event->code == SYN_REPORT) { if (last_was_syn) continue; else client->ec_buffer_ready = (tail + 1) % count; } /* Rewrite entry */ memcpy(&client->ec_buffer[tail], event, sizeof(struct input_event)); last_was_syn = (event->type == EV_SYN && event->code == SYN_REPORT); tail = (tail + 1) % count; } client->ec_buffer_head = i; client->ec_buffer_tail = tail; EVDEV_CLIENT_UNLOCKQ(client); } diff --git a/sys/dev/evdev/uinput.c b/sys/dev/evdev/uinput.c index 3bf0e91b7360..9ac9fee8a157 100644 --- a/sys/dev/evdev/uinput.c +++ b/sys/dev/evdev/uinput.c @@ -1,714 +1,714 @@ /*- * Copyright (c) 2014 Jakub Wojciech Klama * Copyright (c) 2015-2016 Vladimir Kondratyev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_evdev.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef UINPUT_DEBUG #define debugf(state, fmt, args...) printf("uinput: " fmt "\n", ##args) #else #define debugf(state, fmt, args...) #endif #define UINPUT_BUFFER_SIZE 16 #define UINPUT_LOCK(state) sx_xlock(&(state)->ucs_lock) #define UINPUT_UNLOCK(state) sx_unlock(&(state)->ucs_lock) #define UINPUT_LOCK_ASSERT(state) sx_assert(&(state)->ucs_lock, SA_LOCKED) #define UINPUT_EMPTYQ(state) \ ((state)->ucs_buffer_head == (state)->ucs_buffer_tail) enum uinput_state { UINPUT_NEW = 0, UINPUT_CONFIGURED, UINPUT_RUNNING }; static evdev_event_t uinput_ev_event; static d_open_t uinput_open; static d_read_t uinput_read; static d_write_t uinput_write; static d_ioctl_t uinput_ioctl; static d_poll_t uinput_poll; static d_kqfilter_t uinput_kqfilter; static void uinput_dtor(void *); static int uinput_kqread(struct knote *kn, long hint); static void uinput_kqdetach(struct knote *kn); static struct cdevsw uinput_cdevsw = { .d_version = D_VERSION, .d_open = uinput_open, .d_read = uinput_read, .d_write = uinput_write, .d_ioctl = uinput_ioctl, .d_poll = uinput_poll, .d_kqfilter = uinput_kqfilter, .d_name = "uinput", }; static struct cdev *uinput_cdev; -static struct evdev_methods uinput_ev_methods = { +static const struct evdev_methods uinput_ev_methods = { .ev_open = NULL, .ev_close = NULL, .ev_event = uinput_ev_event, }; -static struct filterops uinput_filterops = { +static const struct filterops uinput_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = uinput_kqdetach, .f_event = uinput_kqread, }; struct uinput_cdev_state { enum uinput_state ucs_state; struct evdev_dev * ucs_evdev; struct sx ucs_lock; size_t ucs_buffer_head; size_t ucs_buffer_tail; struct selinfo ucs_selp; bool ucs_blocked; bool ucs_selected; struct input_event ucs_buffer[UINPUT_BUFFER_SIZE]; }; static void uinput_enqueue_event(struct uinput_cdev_state *, uint16_t, uint16_t, int32_t); static int uinput_setup_provider(struct uinput_cdev_state *, struct uinput_user_dev *); static int uinput_cdev_create(void); static void uinput_notify(struct uinput_cdev_state *); static void uinput_knllock(void *arg) { struct sx *sx = arg; sx_xlock(sx); } static void uinput_knlunlock(void *arg) { struct sx *sx = arg; sx_unlock(sx); } static void uinput_knl_assert_lock(void *arg, int what) { if (what == LA_LOCKED) sx_assert((struct sx*)arg, SA_XLOCKED); else sx_assert((struct sx*)arg, SA_UNLOCKED); } static void uinput_ev_event(struct evdev_dev *evdev, uint16_t type, uint16_t code, int32_t value) { struct uinput_cdev_state *state = evdev_get_softc(evdev); if (type == EV_LED) evdev_push_event(evdev, type, code, value); UINPUT_LOCK(state); if (state->ucs_state == UINPUT_RUNNING) { uinput_enqueue_event(state, type, code, value); uinput_notify(state); } UINPUT_UNLOCK(state); } static void uinput_enqueue_event(struct uinput_cdev_state *state, uint16_t type, uint16_t code, 
int32_t value) { size_t head, tail; UINPUT_LOCK_ASSERT(state); head = state->ucs_buffer_head; tail = (state->ucs_buffer_tail + 1) % UINPUT_BUFFER_SIZE; microtime(&state->ucs_buffer[tail].time); state->ucs_buffer[tail].type = type; state->ucs_buffer[tail].code = code; state->ucs_buffer[tail].value = value; state->ucs_buffer_tail = tail; /* If queue is full remove oldest event */ if (tail == head) { debugf(state, "state %p: buffer overflow", state); head = (head + 1) % UINPUT_BUFFER_SIZE; state->ucs_buffer_head = head; } } static int uinput_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct uinput_cdev_state *state; state = malloc(sizeof(struct uinput_cdev_state), M_EVDEV, M_WAITOK | M_ZERO); state->ucs_evdev = evdev_alloc(); sx_init(&state->ucs_lock, "uinput"); knlist_init(&state->ucs_selp.si_note, &state->ucs_lock, uinput_knllock, uinput_knlunlock, uinput_knl_assert_lock); devfs_set_cdevpriv(state, uinput_dtor); return (0); } static void uinput_dtor(void *data) { struct uinput_cdev_state *state = (struct uinput_cdev_state *)data; evdev_free(state->ucs_evdev); knlist_clear(&state->ucs_selp.si_note, 0); seldrain(&state->ucs_selp); knlist_destroy(&state->ucs_selp.si_note); sx_destroy(&state->ucs_lock); free(data, M_EVDEV); } static int uinput_read(struct cdev *dev, struct uio *uio, int ioflag) { struct uinput_cdev_state *state; struct input_event *event; int remaining, ret; ret = devfs_get_cdevpriv((void **)&state); if (ret != 0) return (ret); debugf(state, "read %zd bytes by thread %d", uio->uio_resid, uio->uio_td->td_tid); /* Zero-sized reads are allowed for error checking */ if (uio->uio_resid != 0 && uio->uio_resid < sizeof(struct input_event)) return (EINVAL); remaining = uio->uio_resid / sizeof(struct input_event); UINPUT_LOCK(state); if (state->ucs_state != UINPUT_RUNNING) ret = EINVAL; if (ret == 0 && UINPUT_EMPTYQ(state)) { if (ioflag & O_NONBLOCK) ret = EWOULDBLOCK; else { if (remaining != 0) { state->ucs_blocked = true; ret = sx_sleep(state, &state->ucs_lock, PCATCH, "uiread", 0); } } } while (ret == 0 && !UINPUT_EMPTYQ(state) && remaining > 0) { event = &state->ucs_buffer[state->ucs_buffer_head]; state->ucs_buffer_head = (state->ucs_buffer_head + 1) % UINPUT_BUFFER_SIZE; remaining--; ret = uiomove(event, sizeof(struct input_event), uio); } UINPUT_UNLOCK(state); return (ret); } static int uinput_write(struct cdev *dev, struct uio *uio, int ioflag) { struct uinput_cdev_state *state; struct uinput_user_dev userdev; struct input_event event; int ret = 0; ret = devfs_get_cdevpriv((void **)&state); if (ret != 0) return (ret); debugf(state, "write %zd bytes by thread %d", uio->uio_resid, uio->uio_td->td_tid); UINPUT_LOCK(state); if (state->ucs_state != UINPUT_RUNNING) { /* Process written struct uinput_user_dev */ if (uio->uio_resid != sizeof(struct uinput_user_dev)) { debugf(state, "write size not multiple of " "struct uinput_user_dev size"); ret = EINVAL; } else { ret = uiomove(&userdev, sizeof(struct uinput_user_dev), uio); if (ret == 0) uinput_setup_provider(state, &userdev); } } else { /* Process written event */ if (uio->uio_resid % sizeof(struct input_event) != 0) { debugf(state, "write size not multiple of " "struct input_event size"); ret = EINVAL; } while (ret == 0 && uio->uio_resid > 0) { uiomove(&event, sizeof(struct input_event), uio); ret = evdev_push_event(state->ucs_evdev, event.type, event.code, event.value); } } UINPUT_UNLOCK(state); return (ret); } static int uinput_setup_dev(struct uinput_cdev_state *state, struct input_id *id, char *name, 
uint32_t ff_effects_max) { if (name[0] == 0) return (EINVAL); evdev_set_name(state->ucs_evdev, name); evdev_set_id(state->ucs_evdev, id->bustype, id->vendor, id->product, id->version); state->ucs_state = UINPUT_CONFIGURED; return (0); } static int uinput_setup_provider(struct uinput_cdev_state *state, struct uinput_user_dev *udev) { struct input_absinfo absinfo; int i, ret; debugf(state, "setup_provider called, udev=%p", udev); ret = uinput_setup_dev(state, &udev->id, udev->name, udev->ff_effects_max); if (ret) return (ret); bzero(&absinfo, sizeof(struct input_absinfo)); for (i = 0; i < ABS_CNT; i++) { if (!bit_test(state->ucs_evdev->ev_abs_flags, i)) continue; absinfo.minimum = udev->absmin[i]; absinfo.maximum = udev->absmax[i]; absinfo.fuzz = udev->absfuzz[i]; absinfo.flat = udev->absflat[i]; evdev_set_absinfo(state->ucs_evdev, i, &absinfo); } return (0); } static int uinput_poll(struct cdev *dev, int events, struct thread *td) { struct uinput_cdev_state *state; int revents = 0; if (devfs_get_cdevpriv((void **)&state) != 0) return (POLLNVAL); debugf(state, "poll by thread %d", td->td_tid); /* Always allow write */ if (events & (POLLOUT | POLLWRNORM)) revents |= (events & (POLLOUT | POLLWRNORM)); if (events & (POLLIN | POLLRDNORM)) { UINPUT_LOCK(state); if (!UINPUT_EMPTYQ(state)) revents = events & (POLLIN | POLLRDNORM); else { state->ucs_selected = true; selrecord(td, &state->ucs_selp); } UINPUT_UNLOCK(state); } return (revents); } static int uinput_kqfilter(struct cdev *dev, struct knote *kn) { struct uinput_cdev_state *state; int ret; ret = devfs_get_cdevpriv((void **)&state); if (ret != 0) return (ret); switch(kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &uinput_filterops; break; default: return(EINVAL); } kn->kn_hook = (caddr_t)state; knlist_add(&state->ucs_selp.si_note, kn, 0); return (0); } static int uinput_kqread(struct knote *kn, long hint) { struct uinput_cdev_state *state; int ret; state = (struct uinput_cdev_state *)kn->kn_hook; UINPUT_LOCK_ASSERT(state); ret = !UINPUT_EMPTYQ(state); return (ret); } static void uinput_kqdetach(struct knote *kn) { struct uinput_cdev_state *state; state = (struct uinput_cdev_state *)kn->kn_hook; knlist_remove(&state->ucs_selp.si_note, kn, 0); } static void uinput_notify(struct uinput_cdev_state *state) { UINPUT_LOCK_ASSERT(state); if (state->ucs_blocked) { state->ucs_blocked = false; wakeup(state); } if (state->ucs_selected) { state->ucs_selected = false; selwakeup(&state->ucs_selp); } KNOTE_LOCKED(&state->ucs_selp.si_note, 0); } static int uinput_ioctl_sub(struct uinput_cdev_state *state, u_long cmd, caddr_t data) { struct uinput_setup *us; struct uinput_abs_setup *uabs; int ret, len, intdata; char buf[NAMELEN]; UINPUT_LOCK_ASSERT(state); len = IOCPARM_LEN(cmd); if ((cmd & IOC_DIRMASK) == IOC_VOID && len == sizeof(int)) intdata = *(int *)data; switch (IOCBASECMD(cmd)) { case UI_GET_SYSNAME(0): if (state->ucs_state != UINPUT_RUNNING) return (ENOENT); if (len == 0) return (EINVAL); snprintf(data, len, "event%d", state->ucs_evdev->ev_unit); return (0); } switch (cmd) { case UI_DEV_CREATE: if (state->ucs_state != UINPUT_CONFIGURED) return (EINVAL); evdev_set_methods(state->ucs_evdev, state, &uinput_ev_methods); evdev_set_flag(state->ucs_evdev, EVDEV_FLAG_SOFTREPEAT); evdev_set_flag(state->ucs_evdev, EVDEV_FLAG_MT_KEEPID); ret = evdev_register(state->ucs_evdev); if (ret == 0) state->ucs_state = UINPUT_RUNNING; return (ret); case UI_DEV_DESTROY: if (state->ucs_state != UINPUT_RUNNING) return (0); evdev_unregister(state->ucs_evdev); 
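/*
 * The sequence in uinput_notify() above (also used by evdev_notify_event()
 * and gpioc_interrupt_handler() in this patch) is the usual way a character
 * device wakes every kind of waiter once data has been queued: blocking
 * readers, poll()/select(), kqueue and SIGIO.  A minimal standalone sketch
 * of that pattern follows; it is illustrative only and not part of this
 * change, and the mydev_* names are hypothetical.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/selinfo.h>
#include <sys/sigio.h>
#include <sys/signalvar.h>

struct mydev_softc {
	struct mtx	 md_mtx;	/* protects queue state and the flags */
	struct selinfo	 md_selp;	/* poll()/select() and kqueue waiters */
	struct sigio	*md_sigio;	/* set up via FIOSETOWN/FIOASYNC */
	bool		 md_blocked;	/* a thread sleeps in read() */
	bool		 md_selected;	/* a thread was recorded by selrecord() */
	bool		 md_have_data;	/* the driver's queue is non-empty */
};

/*
 * Called with md_mtx held whenever new data has been queued.  For
 * KNOTE_LOCKED() to be legal, md_selp.si_note must have been tied to the
 * same mutex at attach time with knlist_init_mtx().
 */
static void
mydev_notify(struct mydev_softc *sc)
{
	mtx_assert(&sc->md_mtx, MA_OWNED);

	if (sc->md_blocked) {			/* blocking read() */
		sc->md_blocked = false;
		wakeup(sc);
	}
	if (sc->md_selected) {			/* poll()/select() */
		sc->md_selected = false;
		selwakeup(&sc->md_selp);
	}
	KNOTE_LOCKED(&sc->md_selp.si_note, 0);	/* kqueue EVFILT_READ */
	if (sc->md_sigio != NULL)		/* asynchronous SIGIO */
		pgsigio(&sc->md_sigio, SIGIO, 0);
}

/* The matching d_poll handler records the waiter when nothing is ready. */
static int
mydev_poll(struct cdev *dev, int events, struct thread *td)
{
	struct mydev_softc *sc = dev->si_drv1;
	int revents = 0;

	mtx_lock(&sc->md_mtx);
	if (events & (POLLIN | POLLRDNORM)) {
		if (sc->md_have_data)
			revents = events & (POLLIN | POLLRDNORM);
		else {
			sc->md_selected = true;
			selrecord(td, &sc->md_selp);
		}
	}
	mtx_unlock(&sc->md_mtx);
	return (revents);
}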
bzero(state->ucs_evdev, sizeof(struct evdev_dev)); state->ucs_state = UINPUT_NEW; return (0); case UI_DEV_SETUP: if (state->ucs_state == UINPUT_RUNNING) return (EINVAL); us = (struct uinput_setup *)data; return (uinput_setup_dev(state, &us->id, us->name, us->ff_effects_max)); case UI_ABS_SETUP: if (state->ucs_state == UINPUT_RUNNING) return (EINVAL); uabs = (struct uinput_abs_setup *)data; if (uabs->code > ABS_MAX) return (EINVAL); evdev_set_abs_bit(state->ucs_evdev, uabs->code); evdev_set_absinfo(state->ucs_evdev, uabs->code, &uabs->absinfo); return (0); case UI_SET_EVBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > EV_MAX || intdata < 0) return (EINVAL); evdev_support_event(state->ucs_evdev, intdata); return (0); case UI_SET_KEYBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > KEY_MAX || intdata < 0) return (EINVAL); evdev_support_key(state->ucs_evdev, intdata); return (0); case UI_SET_RELBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > REL_MAX || intdata < 0) return (EINVAL); evdev_support_rel(state->ucs_evdev, intdata); return (0); case UI_SET_ABSBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > ABS_MAX || intdata < 0) return (EINVAL); evdev_set_abs_bit(state->ucs_evdev, intdata); return (0); case UI_SET_MSCBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > MSC_MAX || intdata < 0) return (EINVAL); evdev_support_msc(state->ucs_evdev, intdata); return (0); case UI_SET_LEDBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > LED_MAX || intdata < 0) return (EINVAL); evdev_support_led(state->ucs_evdev, intdata); return (0); case UI_SET_SNDBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > SND_MAX || intdata < 0) return (EINVAL); evdev_support_snd(state->ucs_evdev, intdata); return (0); case UI_SET_FFBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > FF_MAX || intdata < 0) return (EINVAL); /* Fake unsupported ioctl */ return (0); case UI_SET_PHYS: if (state->ucs_state == UINPUT_RUNNING) return (EINVAL); ret = copyinstr(*(void **)data, buf, sizeof(buf), NULL); /* Linux returns EINVAL when string does not fit the buffer */ if (ret == ENAMETOOLONG) ret = EINVAL; if (ret != 0) return (ret); evdev_set_phys(state->ucs_evdev, buf); return (0); case UI_SET_BSDUNIQ: if (state->ucs_state == UINPUT_RUNNING) return (EINVAL); ret = copyinstr(*(void **)data, buf, sizeof(buf), NULL); if (ret != 0) return (ret); evdev_set_serial(state->ucs_evdev, buf); return (0); case UI_SET_SWBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > SW_MAX || intdata < 0) return (EINVAL); evdev_support_sw(state->ucs_evdev, intdata); return (0); case UI_SET_PROPBIT: if (state->ucs_state == UINPUT_RUNNING || intdata > INPUT_PROP_MAX || intdata < 0) return (EINVAL); evdev_support_prop(state->ucs_evdev, intdata); return (0); case UI_BEGIN_FF_UPLOAD: case UI_END_FF_UPLOAD: case UI_BEGIN_FF_ERASE: case UI_END_FF_ERASE: if (state->ucs_state == UINPUT_RUNNING) return (EINVAL); /* Fake unsupported ioctl */ return (0); case UI_GET_VERSION: *(unsigned int *)data = UINPUT_VERSION; return (0); } return (EINVAL); } static int uinput_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct uinput_cdev_state *state; int ret; ret = devfs_get_cdevpriv((void **)&state); if (ret != 0) return (ret); debugf(state, "ioctl called: cmd=0x%08lx, data=%p", cmd, data); UINPUT_LOCK(state); ret = uinput_ioctl_sub(state, cmd, data); UINPUT_UNLOCK(state); return (ret); } static int uinput_cdev_create(void) { struct make_dev_args mda; int ret; make_dev_args_init(&mda); 
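/*
 * For reference, the ioctls handled by uinput_ioctl_sub() above are driven
 * from userland roughly as follows.  This is an illustrative sketch only,
 * not part of this change: it assumes the headers installed as
 * <dev/evdev/input.h> and <dev/evdev/uinput.h>, a loaded uinput module,
 * and sufficient privileges to open /dev/uinput.
 */
#include <sys/ioctl.h>
#include <dev/evdev/input.h>
#include <dev/evdev/uinput.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

static void
emit(int fd, uint16_t type, uint16_t code, int32_t value)
{
	struct input_event ie;

	memset(&ie, 0, sizeof(ie));	/* the kernel timestamps on SYN_REPORT */
	ie.type = type;
	ie.code = code;
	ie.value = value;
	if (write(fd, &ie, sizeof(ie)) != sizeof(ie))
		err(1, "write");
}

int
main(void)
{
	struct uinput_setup us;
	int fd;

	fd = open("/dev/uinput", O_RDWR);
	if (fd < 0)
		err(1, "open /dev/uinput");

	/* Declare the event types and codes the virtual device can emit. */
	if (ioctl(fd, UI_SET_EVBIT, EV_KEY) != 0 ||
	    ioctl(fd, UI_SET_KEYBIT, KEY_A) != 0)
		err(1, "UI_SET_*BIT");

	/* Name the device and register it (UINPUT_CONFIGURED, then RUNNING). */
	memset(&us, 0, sizeof(us));
	us.id.bustype = BUS_VIRTUAL;
	strlcpy(us.name, "example virtual keyboard", sizeof(us.name));
	if (ioctl(fd, UI_DEV_SETUP, &us) != 0 || ioctl(fd, UI_DEV_CREATE) != 0)
		err(1, "UI_DEV_SETUP/UI_DEV_CREATE");

	/* Press and release 'a'; each report ends with EV_SYN/SYN_REPORT. */
	emit(fd, EV_KEY, KEY_A, 1);
	emit(fd, EV_SYN, SYN_REPORT, 0);
	emit(fd, EV_KEY, KEY_A, 0);
	emit(fd, EV_SYN, SYN_REPORT, 0);

	ioctl(fd, UI_DEV_DESTROY);
	close(fd);
	return (0);
}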
mda.mda_flags = MAKEDEV_WAITOK | MAKEDEV_CHECKNAME; mda.mda_devsw = &uinput_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; ret = make_dev_s(&mda, &uinput_cdev, "uinput"); return (ret); } static int uinput_cdev_destroy(void) { destroy_dev(uinput_cdev); return (0); } static int uinput_modevent(module_t mod __unused, int cmd, void *data) { int ret = 0; switch (cmd) { case MOD_LOAD: ret = uinput_cdev_create(); break; case MOD_UNLOAD: ret = uinput_cdev_destroy(); break; case MOD_SHUTDOWN: break; default: ret = EINVAL; break; } return (ret); } DEV_MODULE(uinput, uinput_modevent, NULL); MODULE_VERSION(uinput, 1); MODULE_DEPEND(uinput, evdev, 1, 1, 1); diff --git a/sys/dev/gpio/gpioc.c b/sys/dev/gpio/gpioc.c index 6fb79fa8d751..067a43617f11 100644 --- a/sys/dev/gpio/gpioc.c +++ b/sys/dev/gpio/gpioc.c @@ -1,1063 +1,1063 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009 Oleksandr Tymoshenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "gpio_if.h" #include "gpiobus_if.h" #undef GPIOC_DEBUG #ifdef GPIOC_DEBUG #define dprintf printf #define ddevice_printf device_printf #else #define dprintf(x, arg...) #define ddevice_printf(dev, x, arg...) 
#endif struct gpioc_softc { device_t sc_dev; /* gpiocX dev */ device_t sc_pdev; /* gpioX dev */ struct cdev *sc_ctl_dev; /* controller device */ int sc_unit; int sc_npins; struct gpioc_pin_intr *sc_pin_intr; }; struct gpioc_pin_intr { struct gpioc_softc *sc; gpio_pin_t pin; bool config_locked; int intr_rid; struct resource *intr_res; void *intr_cookie; struct mtx mtx; SLIST_HEAD(gpioc_privs_list, gpioc_privs) privs; }; struct gpioc_cdevpriv { struct gpioc_softc *sc; struct selinfo selinfo; bool async; uint8_t report_option; struct sigio *sigio; struct mtx mtx; struct gpioc_pin_event *events; int numevents; int evidx_head; int evidx_tail; SLIST_HEAD(gpioc_pins_list, gpioc_pins) pins; }; struct gpioc_privs { struct gpioc_cdevpriv *priv; SLIST_ENTRY(gpioc_privs) next; }; struct gpioc_pins { struct gpioc_pin_intr *pin; int eventcount; int firstevent; SLIST_ENTRY(gpioc_pins) next; }; struct gpioc_pin_event { struct gpioc_pins *privpin; sbintime_t event_time; bool event_pin_state; }; static MALLOC_DEFINE(M_GPIOC, "gpioc", "gpioc device data"); static int gpioc_allocate_pin_intr(struct gpioc_pin_intr*, uint32_t); static int gpioc_release_pin_intr(struct gpioc_pin_intr*); static int gpioc_attach_priv_pin(struct gpioc_cdevpriv*, struct gpioc_pin_intr*); static int gpioc_detach_priv_pin(struct gpioc_cdevpriv*, struct gpioc_pin_intr*); static bool gpioc_intr_reconfig_allowed(struct gpioc_cdevpriv*, struct gpioc_pin_intr *intr_conf); static uint32_t gpioc_get_intr_config(struct gpioc_softc*, struct gpioc_cdevpriv*, uint32_t pin); static int gpioc_set_intr_config(struct gpioc_softc*, struct gpioc_cdevpriv*, uint32_t, uint32_t); static void gpioc_interrupt_handler(void*); static int gpioc_kqread(struct knote*, long); static void gpioc_kqdetach(struct knote*); static int gpioc_probe(device_t dev); static int gpioc_attach(device_t dev); static int gpioc_detach(device_t dev); static void gpioc_cdevpriv_dtor(void*); static d_open_t gpioc_open; static d_read_t gpioc_read; static d_ioctl_t gpioc_ioctl; static d_poll_t gpioc_poll; static d_kqfilter_t gpioc_kqfilter; static struct cdevsw gpioc_cdevsw = { .d_version = D_VERSION, .d_open = gpioc_open, .d_read = gpioc_read, .d_ioctl = gpioc_ioctl, .d_poll = gpioc_poll, .d_kqfilter = gpioc_kqfilter, .d_name = "gpioc", }; -static struct filterops gpioc_read_filterops = { +static const struct filterops gpioc_read_filterops = { .f_isfd = true, .f_attach = NULL, .f_detach = gpioc_kqdetach, .f_event = gpioc_kqread, .f_touch = NULL }; static struct gpioc_pin_event * next_head_event(struct gpioc_cdevpriv *priv) { struct gpioc_pin_event *rv; rv = &priv->events[priv->evidx_head++]; if (priv->evidx_head == priv->numevents) priv->evidx_head = 0; return (rv); } static struct gpioc_pin_event * next_tail_event(struct gpioc_cdevpriv *priv) { struct gpioc_pin_event *rv; rv = &priv->events[priv->evidx_tail++]; if (priv->evidx_tail == priv->numevents) priv->evidx_tail = 0; return (rv); } static size_t number_of_events(struct gpioc_cdevpriv *priv) { if (priv->evidx_head >= priv->evidx_tail) return (priv->evidx_head - priv->evidx_tail); else return (priv->numevents + priv->evidx_head - priv->evidx_tail); } static int gpioc_allocate_pin_intr(struct gpioc_pin_intr *intr_conf, uint32_t flags) { int err; intr_conf->config_locked = true; mtx_unlock(&intr_conf->mtx); intr_conf->intr_res = gpio_alloc_intr_resource(intr_conf->pin->dev, &intr_conf->intr_rid, RF_ACTIVE, intr_conf->pin, flags); if (intr_conf->intr_res == NULL) { err = ENXIO; goto error_exit; } err = 
bus_setup_intr(intr_conf->pin->dev, intr_conf->intr_res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, gpioc_interrupt_handler, intr_conf, &intr_conf->intr_cookie); if (err != 0) goto error_exit; intr_conf->pin->flags = flags; error_exit: mtx_lock(&intr_conf->mtx); intr_conf->config_locked = false; wakeup(&intr_conf->config_locked); return (err); } static int gpioc_release_pin_intr(struct gpioc_pin_intr *intr_conf) { int err; intr_conf->config_locked = true; mtx_unlock(&intr_conf->mtx); if (intr_conf->intr_cookie != NULL) { err = bus_teardown_intr(intr_conf->pin->dev, intr_conf->intr_res, intr_conf->intr_cookie); if (err != 0) goto error_exit; else intr_conf->intr_cookie = NULL; } if (intr_conf->intr_res != NULL) { err = bus_release_resource(intr_conf->pin->dev, SYS_RES_IRQ, intr_conf->intr_rid, intr_conf->intr_res); if (err != 0) goto error_exit; else { intr_conf->intr_rid = 0; intr_conf->intr_res = NULL; } } intr_conf->pin->flags = 0; err = 0; error_exit: mtx_lock(&intr_conf->mtx); intr_conf->config_locked = false; wakeup(&intr_conf->config_locked); return (err); } static int gpioc_attach_priv_pin(struct gpioc_cdevpriv *priv, struct gpioc_pin_intr *intr_conf) { struct gpioc_privs *priv_link; struct gpioc_pins *pin_link; unsigned int consistency_a __diagused; unsigned int consistency_b __diagused; consistency_a = 0; consistency_b = 0; mtx_assert(&intr_conf->mtx, MA_OWNED); mtx_lock(&priv->mtx); SLIST_FOREACH(priv_link, &intr_conf->privs, next) { if (priv_link->priv == priv) consistency_a++; } KASSERT(consistency_a <= 1, ("inconsistent links between pin config and cdevpriv")); SLIST_FOREACH(pin_link, &priv->pins, next) { if (pin_link->pin == intr_conf) consistency_b++; } KASSERT(consistency_a == consistency_b, ("inconsistent links between pin config and cdevpriv")); if (consistency_a == 1 && consistency_b == 1) { mtx_unlock(&priv->mtx); return (EEXIST); } priv_link = malloc(sizeof(struct gpioc_privs), M_GPIOC, M_NOWAIT | M_ZERO); if (priv_link == NULL) { mtx_unlock(&priv->mtx); return (ENOMEM); } pin_link = malloc(sizeof(struct gpioc_pins), M_GPIOC, M_NOWAIT | M_ZERO); if (pin_link == NULL) { mtx_unlock(&priv->mtx); return (ENOMEM); } priv_link->priv = priv; pin_link->pin = intr_conf; SLIST_INSERT_HEAD(&intr_conf->privs, priv_link, next); SLIST_INSERT_HEAD(&priv->pins, pin_link, next); mtx_unlock(&priv->mtx); return (0); } static int gpioc_detach_priv_pin(struct gpioc_cdevpriv *priv, struct gpioc_pin_intr *intr_conf) { struct gpioc_privs *priv_link, *priv_link_temp; struct gpioc_pins *pin_link, *pin_link_temp; unsigned int consistency_a __diagused; unsigned int consistency_b __diagused; consistency_a = 0; consistency_b = 0; mtx_assert(&intr_conf->mtx, MA_OWNED); mtx_lock(&priv->mtx); SLIST_FOREACH_SAFE(priv_link, &intr_conf->privs, next, priv_link_temp) { if (priv_link->priv == priv) { SLIST_REMOVE(&intr_conf->privs, priv_link, gpioc_privs, next); free(priv_link, M_GPIOC); consistency_a++; } } KASSERT(consistency_a <= 1, ("inconsistent links between pin config and cdevpriv")); SLIST_FOREACH_SAFE(pin_link, &priv->pins, next, pin_link_temp) { if (pin_link->pin == intr_conf) { /* * If the pin we're removing has events in the priv's * event fifo, we can't leave dangling pointers from * those events to the gpioc_pins struct we're about to * free. We also can't remove random items and leave * holes in the events fifo, so just empty it out. 
*/ if (pin_link->eventcount > 0) { priv->evidx_head = priv->evidx_tail = 0; } SLIST_REMOVE(&priv->pins, pin_link, gpioc_pins, next); free(pin_link, M_GPIOC); consistency_b++; } } KASSERT(consistency_a == consistency_b, ("inconsistent links between pin config and cdevpriv")); mtx_unlock(&priv->mtx); return (0); } static bool gpioc_intr_reconfig_allowed(struct gpioc_cdevpriv *priv, struct gpioc_pin_intr *intr_conf) { struct gpioc_privs *priv_link; mtx_assert(&intr_conf->mtx, MA_OWNED); if (SLIST_EMPTY(&intr_conf->privs)) return (true); SLIST_FOREACH(priv_link, &intr_conf->privs, next) { if (priv_link->priv != priv) return (false); } return (true); } static uint32_t gpioc_get_intr_config(struct gpioc_softc *sc, struct gpioc_cdevpriv *priv, uint32_t pin) { struct gpioc_pin_intr *intr_conf = &sc->sc_pin_intr[pin]; struct gpioc_privs *priv_link; uint32_t flags; flags = intr_conf->pin->flags; if (flags == 0) return (0); mtx_lock(&intr_conf->mtx); SLIST_FOREACH(priv_link, &intr_conf->privs, next) { if (priv_link->priv == priv) { flags |= GPIO_INTR_ATTACHED; break; } } mtx_unlock(&intr_conf->mtx); return (flags); } static int gpioc_set_intr_config(struct gpioc_softc *sc, struct gpioc_cdevpriv *priv, uint32_t pin, uint32_t flags) { struct gpioc_pin_intr *intr_conf = &sc->sc_pin_intr[pin]; int res; res = 0; if (intr_conf->pin->flags == 0 && flags == 0) { /* No interrupt configured and none requested: Do nothing. */ return (0); } mtx_lock(&intr_conf->mtx); while (intr_conf->config_locked == true) mtx_sleep(&intr_conf->config_locked, &intr_conf->mtx, 0, "gpicfg", 0); if (intr_conf->pin->flags == 0 && flags != 0) { /* * No interrupt is configured, but one is requested: Allocate * and setup interrupt on the according pin. */ res = gpioc_allocate_pin_intr(intr_conf, flags); if (res == 0) res = gpioc_attach_priv_pin(priv, intr_conf); if (res == EEXIST) res = 0; } else if (intr_conf->pin->flags == flags) { /* * Same interrupt requested as already configured: Attach the * cdevpriv to the corresponding pin. */ res = gpioc_attach_priv_pin(priv, intr_conf); if (res == EEXIST) res = 0; } else if (intr_conf->pin->flags != 0 && flags == 0) { /* * Interrupt configured, but none requested: Teardown and * release the pin when no other cdevpriv is attached. Otherwise * just detach pin and cdevpriv from each other. */ if (gpioc_intr_reconfig_allowed(priv, intr_conf)) { res = gpioc_release_pin_intr(intr_conf); } if (res == 0) res = gpioc_detach_priv_pin(priv, intr_conf); } else { /* * Other flag requested than configured: Reconfigure when no * other cdevpriv is are attached to the pin. */ if (!gpioc_intr_reconfig_allowed(priv, intr_conf)) res = EBUSY; else { res = gpioc_release_pin_intr(intr_conf); if (res == 0) res = gpioc_allocate_pin_intr(intr_conf, flags); if (res == 0) res = gpioc_attach_priv_pin(priv, intr_conf); if (res == EEXIST) res = 0; } } mtx_unlock(&intr_conf->mtx); return (res); } static void gpioc_interrupt_handler(void *arg) { struct gpioc_pin_intr *intr_conf; struct gpioc_privs *privs; struct gpioc_softc *sc; sbintime_t evtime; uint32_t pin_state; intr_conf = arg; sc = intr_conf->sc; /* Capture time and pin state first. */ evtime = sbinuptime(); if (intr_conf->pin->flags & GPIO_INTR_EDGE_BOTH) GPIO_PIN_GET(sc->sc_pdev, intr_conf->pin->pin, &pin_state); else if (intr_conf->pin->flags & GPIO_INTR_EDGE_RISING) pin_state = true; else pin_state = false; mtx_lock(&intr_conf->mtx); if (intr_conf->config_locked == true) { ddevice_printf(sc->sc_dev, "Interrupt configuration in " "progress. 
Discarding interrupt on pin %d.\n", intr_conf->pin->pin); mtx_unlock(&intr_conf->mtx); return; } if (SLIST_EMPTY(&intr_conf->privs)) { ddevice_printf(sc->sc_dev, "No file descriptor associated with " "occurred interrupt on pin %d.\n", intr_conf->pin->pin); mtx_unlock(&intr_conf->mtx); return; } SLIST_FOREACH(privs, &intr_conf->privs, next) { struct gpioc_cdevpriv *priv = privs->priv; struct gpioc_pins *privpin; struct gpioc_pin_event *event; mtx_lock(&priv->mtx); SLIST_FOREACH(privpin, &priv->pins, next) { if (privpin->pin == intr_conf) break; } if (privpin == NULL) { /* Should be impossible. */ ddevice_printf(sc->sc_dev, "Cannot find privpin\n"); mtx_unlock(&priv->mtx); continue; } if (priv->report_option == GPIO_EVENT_REPORT_DETAIL) { event = next_head_event(priv); /* If head is overtaking tail, advance tail. */ if (priv->evidx_head == priv->evidx_tail) next_tail_event(priv); } else { if (privpin->eventcount > 0) event = &priv->events[privpin->firstevent + 1]; else { privpin->firstevent = priv->evidx_head; event = next_head_event(priv); event->privpin = privpin; event->event_time = evtime; event->event_pin_state = pin_state; event = next_head_event(priv); } ++privpin->eventcount; } event->privpin = privpin; event->event_time = evtime; event->event_pin_state = pin_state; wakeup(priv); selwakeup(&priv->selinfo); KNOTE_LOCKED(&priv->selinfo.si_note, 0); if (priv->async == true && priv->sigio != NULL) pgsigio(&priv->sigio, SIGIO, 0); mtx_unlock(&priv->mtx); } mtx_unlock(&intr_conf->mtx); } static int gpioc_probe(device_t dev) { device_set_desc(dev, "GPIO controller"); return (0); } static int gpioc_attach(device_t dev) { int err; struct gpioc_softc *sc; struct make_dev_args devargs; sc = device_get_softc(dev); sc->sc_dev = dev; sc->sc_pdev = device_get_parent(dev); sc->sc_unit = device_get_unit(dev); err = GPIO_PIN_MAX(sc->sc_pdev, &sc->sc_npins); sc->sc_npins++; /* Number of pins is one more than max pin number. 
*/ if (err != 0) return (err); sc->sc_pin_intr = malloc(sizeof(struct gpioc_pin_intr) * sc->sc_npins, M_GPIOC, M_WAITOK | M_ZERO); for (int i = 0; i < sc->sc_npins; i++) { sc->sc_pin_intr[i].pin = malloc(sizeof(struct gpiobus_pin), M_GPIOC, M_WAITOK | M_ZERO); sc->sc_pin_intr[i].sc = sc; sc->sc_pin_intr[i].pin->pin = i; sc->sc_pin_intr[i].pin->dev = sc->sc_pdev; mtx_init(&sc->sc_pin_intr[i].mtx, "gpioc pin", NULL, MTX_DEF); SLIST_INIT(&sc->sc_pin_intr[i].privs); } make_dev_args_init(&devargs); devargs.mda_devsw = &gpioc_cdevsw; devargs.mda_uid = UID_ROOT; devargs.mda_gid = GID_WHEEL; devargs.mda_mode = 0600; devargs.mda_si_drv1 = sc; err = make_dev_s(&devargs, &sc->sc_ctl_dev, "gpioc%d", sc->sc_unit); if (err != 0) { device_printf(dev, "Failed to create gpioc%d", sc->sc_unit); return (ENXIO); } return (0); } static int gpioc_detach(device_t dev) { struct gpioc_softc *sc = device_get_softc(dev); int err; if (sc->sc_ctl_dev) destroy_dev(sc->sc_ctl_dev); for (int i = 0; i < sc->sc_npins; i++) { mtx_destroy(&sc->sc_pin_intr[i].mtx); free(sc->sc_pin_intr[i].pin, M_GPIOC); } free(sc->sc_pin_intr, M_GPIOC); if ((err = bus_generic_detach(dev)) != 0) return (err); return (0); } static void gpioc_cdevpriv_dtor(void *data) { struct gpioc_cdevpriv *priv; struct gpioc_privs *priv_link, *priv_link_temp; struct gpioc_pins *pin_link, *pin_link_temp; unsigned int consistency __diagused; priv = data; SLIST_FOREACH_SAFE(pin_link, &priv->pins, next, pin_link_temp) { consistency = 0; mtx_lock(&pin_link->pin->mtx); while (pin_link->pin->config_locked == true) mtx_sleep(&pin_link->pin->config_locked, &pin_link->pin->mtx, 0, "gpicfg", 0); SLIST_FOREACH_SAFE(priv_link, &pin_link->pin->privs, next, priv_link_temp) { if (priv_link->priv == priv) { SLIST_REMOVE(&pin_link->pin->privs, priv_link, gpioc_privs, next); free(priv_link, M_GPIOC); consistency++; } } KASSERT(consistency == 1, ("inconsistent links between pin config and cdevpriv")); if (gpioc_intr_reconfig_allowed(priv, pin_link->pin)) { gpioc_release_pin_intr(pin_link->pin); } mtx_unlock(&pin_link->pin->mtx); SLIST_REMOVE(&priv->pins, pin_link, gpioc_pins, next); free(pin_link, M_GPIOC); } wakeup(&priv); knlist_clear(&priv->selinfo.si_note, 0); seldrain(&priv->selinfo); knlist_destroy(&priv->selinfo.si_note); funsetown(&priv->sigio); mtx_destroy(&priv->mtx); free(priv->events, M_GPIOC); free(data, M_GPIOC); } static int gpioc_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct gpioc_cdevpriv *priv; int err; priv = malloc(sizeof(*priv), M_GPIOC, M_WAITOK | M_ZERO); priv->sc = dev->si_drv1; priv->report_option = GPIO_EVENT_REPORT_DETAIL; err = devfs_set_cdevpriv(priv, gpioc_cdevpriv_dtor); if (err != 0) { gpioc_cdevpriv_dtor(priv); return (err); } mtx_init(&priv->mtx, "gpioc priv", NULL, MTX_DEF); knlist_init_mtx(&priv->selinfo.si_note, &priv->mtx); /* * Allocate a circular buffer for events. The scheme we use for summary * reporting assumes there will always be a pair of events available to * record the first/last events on any pin, so we allocate 2 * npins. * Even though we actually default to detailed event reporting, 2 * * npins isn't a horrible fifo size for that either. 
*/ priv->numevents = priv->sc->sc_npins * 2; priv->events = malloc(priv->numevents * sizeof(struct gpio_event_detail), M_GPIOC, M_WAITOK | M_ZERO); return (0); } static int gpioc_read(struct cdev *dev, struct uio *uio, int ioflag) { struct gpioc_cdevpriv *priv; struct gpioc_pin_event *event; union { struct gpio_event_summary sum; struct gpio_event_detail evt; uint8_t data[1]; } recbuf; size_t recsize; int err; if ((err = devfs_get_cdevpriv((void **)&priv)) != 0) return (err); if (priv->report_option == GPIO_EVENT_REPORT_SUMMARY) recsize = sizeof(struct gpio_event_summary); else recsize = sizeof(struct gpio_event_detail); if (uio->uio_resid < recsize) return (EINVAL); mtx_lock(&priv->mtx); while (priv->evidx_head == priv->evidx_tail) { if (SLIST_EMPTY(&priv->pins)) { err = ENXIO; break; } else if (ioflag & O_NONBLOCK) { err = EWOULDBLOCK; break; } else { err = mtx_sleep(priv, &priv->mtx, PCATCH, "gpintr", 0); if (err != 0) break; } } while (err == 0 && uio->uio_resid >= recsize && priv->evidx_tail != priv->evidx_head) { event = next_tail_event(priv); if (priv->report_option == GPIO_EVENT_REPORT_SUMMARY) { recbuf.sum.gp_first_time = event->event_time; recbuf.sum.gp_pin = event->privpin->pin->pin->pin; recbuf.sum.gp_count = event->privpin->eventcount; recbuf.sum.gp_first_state = event->event_pin_state; event = next_tail_event(priv); recbuf.sum.gp_last_time = event->event_time; recbuf.sum.gp_last_state = event->event_pin_state; event->privpin->eventcount = 0; event->privpin->firstevent = 0; } else { recbuf.evt.gp_time = event->event_time; recbuf.evt.gp_pin = event->privpin->pin->pin->pin; recbuf.evt.gp_pinstate = event->event_pin_state; } mtx_unlock(&priv->mtx); err = uiomove(recbuf.data, recsize, uio); mtx_lock(&priv->mtx); } mtx_unlock(&priv->mtx); return (err); } static int gpioc_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int fflag, struct thread *td) { device_t bus; int max_pin, res; struct gpioc_softc *sc = cdev->si_drv1; struct gpioc_cdevpriv *priv; struct gpio_pin pin; struct gpio_req req; struct gpio_access_32 *a32; struct gpio_config_32 *c32; struct gpio_event_config *evcfg; uint32_t caps, intrflags; bus = GPIO_GET_BUS(sc->sc_pdev); if (bus == NULL) return (EINVAL); switch (cmd) { case GPIOMAXPIN: max_pin = -1; res = GPIO_PIN_MAX(sc->sc_pdev, &max_pin); bcopy(&max_pin, arg, sizeof(max_pin)); break; case GPIOGETCONFIG: bcopy(arg, &pin, sizeof(pin)); dprintf("get config pin %d\n", pin.gp_pin); res = GPIO_PIN_GETFLAGS(sc->sc_pdev, pin.gp_pin, &pin.gp_flags); /* Fail early */ if (res) break; res = devfs_get_cdevpriv((void **)&priv); if (res) break; pin.gp_flags |= gpioc_get_intr_config(sc, priv, pin.gp_pin); GPIO_PIN_GETCAPS(sc->sc_pdev, pin.gp_pin, &pin.gp_caps); GPIOBUS_PIN_GETNAME(bus, pin.gp_pin, pin.gp_name); bcopy(&pin, arg, sizeof(pin)); break; case GPIOSETCONFIG: bcopy(arg, &pin, sizeof(pin)); dprintf("set config pin %d\n", pin.gp_pin); res = devfs_get_cdevpriv((void **)&priv); if (res != 0) break; res = GPIO_PIN_GETCAPS(sc->sc_pdev, pin.gp_pin, &caps); if (res != 0) break; res = gpio_check_flags(caps, pin.gp_flags); if (res != 0) break; intrflags = pin.gp_flags & GPIO_INTR_MASK; /* * We can do only edge interrupts, and only if the * hardware supports that interrupt type on that pin. 
*/ switch (intrflags) { case GPIO_INTR_NONE: break; case GPIO_INTR_EDGE_RISING: case GPIO_INTR_EDGE_FALLING: case GPIO_INTR_EDGE_BOTH: if ((intrflags & caps) == 0) res = EOPNOTSUPP; break; default: res = EINVAL; break; } if (res != 0) break; res = GPIO_PIN_SETFLAGS(sc->sc_pdev, pin.gp_pin, (pin.gp_flags & ~GPIO_INTR_MASK)); if (res != 0) break; res = gpioc_set_intr_config(sc, priv, pin.gp_pin, intrflags); break; case GPIOGET: bcopy(arg, &req, sizeof(req)); res = GPIO_PIN_GET(sc->sc_pdev, req.gp_pin, &req.gp_value); dprintf("read pin %d -> %d\n", req.gp_pin, req.gp_value); bcopy(&req, arg, sizeof(req)); break; case GPIOSET: bcopy(arg, &req, sizeof(req)); res = GPIO_PIN_SET(sc->sc_pdev, req.gp_pin, req.gp_value); dprintf("write pin %d -> %d\n", req.gp_pin, req.gp_value); break; case GPIOTOGGLE: bcopy(arg, &req, sizeof(req)); dprintf("toggle pin %d\n", req.gp_pin); res = GPIO_PIN_TOGGLE(sc->sc_pdev, req.gp_pin); break; case GPIOSETNAME: bcopy(arg, &pin, sizeof(pin)); dprintf("set name on pin %d\n", pin.gp_pin); res = GPIOBUS_PIN_SETNAME(bus, pin.gp_pin, pin.gp_name); break; case GPIOACCESS32: a32 = (struct gpio_access_32 *)arg; res = GPIO_PIN_ACCESS_32(sc->sc_pdev, a32->first_pin, a32->clear_pins, a32->change_pins, &a32->orig_pins); break; case GPIOCONFIG32: c32 = (struct gpio_config_32 *)arg; res = GPIO_PIN_CONFIG_32(sc->sc_pdev, c32->first_pin, c32->num_pins, c32->pin_flags); break; case GPIOCONFIGEVENTS: evcfg = (struct gpio_event_config *)arg; res = devfs_get_cdevpriv((void **)&priv); if (res != 0) break; /* If any pins have been configured, changes aren't allowed. */ if (!SLIST_EMPTY(&priv->pins)) { res = EINVAL; break; } if (evcfg->gp_report_type != GPIO_EVENT_REPORT_DETAIL && evcfg->gp_report_type != GPIO_EVENT_REPORT_SUMMARY) { res = EINVAL; break; } priv->report_option = evcfg->gp_report_type; /* Reallocate the events buffer if the user wants it bigger. */ if (priv->report_option == GPIO_EVENT_REPORT_DETAIL && priv->numevents < evcfg->gp_fifo_size) { free(priv->events, M_GPIOC); priv->numevents = evcfg->gp_fifo_size; priv->events = malloc(priv->numevents * sizeof(struct gpio_event_detail), M_GPIOC, M_WAITOK | M_ZERO); priv->evidx_head = priv->evidx_tail = 0; } break; case FIONBIO: /* * This dummy handler is necessary to prevent fcntl() * from failing. The actual handling of non-blocking IO * is done using the O_NONBLOCK ioflag passed to the * read() syscall. 
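 *
 * For reference, the pin interrupt machinery above is consumed from
 * userland roughly as follows (an illustrative sketch, not part of this
 * change; it relies on the GPIOSETCONFIG ioctl and gpio_event_detail
 * layout from <sys/gpio.h> used in this file, a controller whose pin 4
 * supports edge interrupts, and the default detail-mode reporting):
 *
 *	#include <sys/types.h>
 *	#include <sys/gpio.h>
 *	#include <sys/ioctl.h>
 *	#include <err.h>
 *	#include <fcntl.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct gpio_pin cfg;
 *		struct gpio_event_detail ev;
 *		int fd;
 *
 *		fd = open("/dev/gpioc0", O_RDWR);
 *		if (fd < 0)
 *			err(1, "open /dev/gpioc0");
 *
 *		// Request both-edge interrupts on the (hypothetical) pin 4.
 *		memset(&cfg, 0, sizeof(cfg));
 *		cfg.gp_pin = 4;
 *		cfg.gp_flags = GPIO_PIN_INPUT | GPIO_INTR_EDGE_BOTH;
 *		if (ioctl(fd, GPIOSETCONFIG, &cfg) != 0)
 *			err(1, "GPIOSETCONFIG");
 *
 *		// Each blocking read() returns one detail record per edge;
 *		// open with O_NONBLOCK to get EWOULDBLOCK instead of sleeping.
 *		for (;;) {
 *			if (read(fd, &ev, sizeof(ev)) != sizeof(ev))
 *				err(1, "read");
 *			printf("pin %d state %d at %jd\n", (int)ev.gp_pin,
 *			    (int)ev.gp_pinstate, (intmax_t)ev.gp_time);
 *		}
 *	}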
*/ res = 0; break; case FIOASYNC: res = devfs_get_cdevpriv((void **)&priv); if (res == 0) { if (*(int *)arg == FASYNC) priv->async = true; else priv->async = false; } break; case FIOGETOWN: res = devfs_get_cdevpriv((void **)&priv); if (res == 0) *(int *)arg = fgetown(&priv->sigio); break; case FIOSETOWN: res = devfs_get_cdevpriv((void **)&priv); if (res == 0) res = fsetown(*(int *)arg, &priv->sigio); break; default: return (ENOTTY); break; } return (res); } static int gpioc_poll(struct cdev *dev, int events, struct thread *td) { struct gpioc_cdevpriv *priv; int err; int revents; revents = 0; err = devfs_get_cdevpriv((void **)&priv); if (err != 0) { revents = POLLERR; return (revents); } if (SLIST_EMPTY(&priv->pins)) { revents = POLLHUP; return (revents); } if (events & (POLLIN | POLLRDNORM)) { if (priv->evidx_head != priv->evidx_tail) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &priv->selinfo); } return (revents); } static int gpioc_kqfilter(struct cdev *dev, struct knote *kn) { struct gpioc_cdevpriv *priv; struct knlist *knlist; int err; err = devfs_get_cdevpriv((void **)&priv); if (err != 0) return err; if (SLIST_EMPTY(&priv->pins)) return (ENXIO); switch(kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &gpioc_read_filterops; kn->kn_hook = (void *)priv; break; default: return (EOPNOTSUPP); } knlist = &priv->selinfo.si_note; knlist_add(knlist, kn, 0); return (0); } static int gpioc_kqread(struct knote *kn, long hint) { struct gpioc_cdevpriv *priv = kn->kn_hook; size_t recsize; if (SLIST_EMPTY(&priv->pins)) { kn->kn_flags |= EV_EOF; return (1); } else { if (priv->evidx_head != priv->evidx_tail) { if (priv->report_option == GPIO_EVENT_REPORT_SUMMARY) recsize = sizeof(struct gpio_event_summary); else recsize = sizeof(struct gpio_event_detail); kn->kn_data = recsize * number_of_events(priv); return (1); } } return (0); } static void gpioc_kqdetach(struct knote *kn) { struct gpioc_cdevpriv *priv = kn->kn_hook; struct knlist *knlist = &priv->selinfo.si_note; knlist_remove(knlist, kn, 0); } static device_method_t gpioc_methods[] = { /* Device interface */ DEVMETHOD(device_probe, gpioc_probe), DEVMETHOD(device_attach, gpioc_attach), DEVMETHOD(device_detach, gpioc_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), DEVMETHOD_END }; driver_t gpioc_driver = { "gpioc", gpioc_methods, sizeof(struct gpioc_softc) }; DRIVER_MODULE(gpioc, gpio, gpioc_driver, 0, 0); MODULE_VERSION(gpioc, 1); diff --git a/sys/dev/hid/hidraw.c b/sys/dev/hid/hidraw.c index 6a05b633cfc8..618a6d2d5c31 100644 --- a/sys/dev/hid/hidraw.c +++ b/sys/dev/hid/hidraw.c @@ -1,1018 +1,1018 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1998 The NetBSD Foundation, Inc. * All rights reserved. * Copyright (c) 2020 Vladimir Kondratyev * * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * HID spec: http://www.usb.org/developers/devclass_docs/HID1_11.pdf */ #include #include "opt_hid.h" #include #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define HID_DEBUG_VAR hidraw_debug #include #include #include #ifdef HID_DEBUG static int hidraw_debug = 0; static SYSCTL_NODE(_hw_hid, OID_AUTO, hidraw, CTLFLAG_RW, 0, "HID raw interface"); SYSCTL_INT(_hw_hid_hidraw, OID_AUTO, debug, CTLFLAG_RWTUN, &hidraw_debug, 0, "Debug level"); #endif #define HIDRAW_INDEX 0xFF /* Arbitrary high value */ #define HIDRAW_LOCAL_BUFSIZE 64 /* Size of on-stack buffer. */ #define HIDRAW_LOCAL_ALLOC(local_buf, size) \ (sizeof(local_buf) > (size) ? (local_buf) : \ malloc((size), M_DEVBUF, M_ZERO | M_WAITOK)) #define HIDRAW_LOCAL_FREE(local_buf, buf) \ if ((local_buf) != (buf)) { \ free((buf), M_DEVBUF); \ } struct hidraw_softc { device_t sc_dev; /* base device */ struct mtx sc_mtx; /* hidbus private mutex */ struct hid_rdesc_info *sc_rdesc; const struct hid_device_info *sc_hw; uint8_t *sc_q; hid_size_t *sc_qlen; int sc_head; int sc_tail; int sc_sleepcnt; struct selinfo sc_rsel; struct proc *sc_async; /* process that wants SIGIO */ struct { /* driver state */ bool open:1; /* device is open */ bool aslp:1; /* waiting for device data in read() */ bool sel:1; /* waiting for device data in poll() */ bool quiet:1; /* Ignore input data */ bool immed:1; /* return read data immediately */ bool uhid:1; /* driver switched in to uhid mode */ bool lock:1; /* input queue sleepable lock */ bool flush:1; /* do not wait for data in read() */ } sc_state; int sc_fflags; /* access mode for open lifetime */ struct cdev *dev; }; #ifdef COMPAT_FREEBSD32 struct hidraw_gen_descriptor32 { uint32_t hgd_data; /* void * */ uint16_t hgd_lang_id; uint16_t hgd_maxlen; uint16_t hgd_actlen; uint16_t hgd_offset; uint8_t hgd_config_index; uint8_t hgd_string_index; uint8_t hgd_iface_index; uint8_t hgd_altif_index; uint8_t hgd_endpt_index; uint8_t hgd_report_type; uint8_t reserved[8]; }; #define HIDRAW_GET_REPORT_DESC32 \ _IOC_NEWTYPE(HIDRAW_GET_REPORT_DESC, struct hidraw_gen_descriptor32) #define HIDRAW_GET_REPORT32 \ _IOC_NEWTYPE(HIDRAW_GET_REPORT, struct hidraw_gen_descriptor32) #define HIDRAW_SET_REPORT_DESC32 \ _IOC_NEWTYPE(HIDRAW_SET_REPORT_DESC, struct hidraw_gen_descriptor32) #define HIDRAW_SET_REPORT32 \ _IOC_NEWTYPE(HIDRAW_SET_REPORT, struct hidraw_gen_descriptor32) #endif static d_open_t hidraw_open; static d_read_t hidraw_read; static d_write_t hidraw_write; static d_ioctl_t hidraw_ioctl; static d_poll_t hidraw_poll; static d_kqfilter_t hidraw_kqfilter; static d_priv_dtor_t hidraw_dtor; static struct cdevsw 
hidraw_cdevsw = { .d_version = D_VERSION, .d_open = hidraw_open, .d_read = hidraw_read, .d_write = hidraw_write, .d_ioctl = hidraw_ioctl, .d_poll = hidraw_poll, .d_kqfilter = hidraw_kqfilter, .d_name = "hidraw", }; static hid_intr_t hidraw_intr; static device_identify_t hidraw_identify; static device_probe_t hidraw_probe; static device_attach_t hidraw_attach; static device_detach_t hidraw_detach; static int hidraw_kqread(struct knote *, long); static void hidraw_kqdetach(struct knote *); static void hidraw_notify(struct hidraw_softc *); -static struct filterops hidraw_filterops_read = { +static const struct filterops hidraw_filterops_read = { .f_isfd = 1, .f_detach = hidraw_kqdetach, .f_event = hidraw_kqread, }; static void hidraw_identify(driver_t *driver, device_t parent) { device_t child; if (device_find_child(parent, "hidraw", -1) == NULL) { child = BUS_ADD_CHILD(parent, 0, "hidraw", device_get_unit(parent)); if (child != NULL) hidbus_set_index(child, HIDRAW_INDEX); } } static int hidraw_probe(device_t self) { if (hidbus_get_index(self) != HIDRAW_INDEX) return (ENXIO); hidbus_set_desc(self, "Raw HID Device"); return (BUS_PROBE_GENERIC); } static int hidraw_attach(device_t self) { struct hidraw_softc *sc = device_get_softc(self); struct make_dev_args mda; int error; sc->sc_dev = self; sc->sc_rdesc = hidbus_get_rdesc_info(self); sc->sc_hw = hid_get_device_info(self); /* Hidraw mode does not require report descriptor to work */ if (sc->sc_rdesc->data == NULL || sc->sc_rdesc->len == 0) device_printf(self, "no report descriptor\n"); mtx_init(&sc->sc_mtx, "hidraw lock", NULL, MTX_DEF); knlist_init_mtx(&sc->sc_rsel.si_note, &sc->sc_mtx); make_dev_args_init(&mda); mda.mda_flags = MAKEDEV_WAITOK; mda.mda_devsw = &hidraw_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_OPERATOR; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; error = make_dev_s(&mda, &sc->dev, "hidraw%d", device_get_unit(self)); if (error) { device_printf(self, "Can not create character device\n"); hidraw_detach(self); return (error); } #ifdef HIDRAW_MAKE_UHID_ALIAS (void)make_dev_alias(sc->dev, "uhid%d", device_get_unit(self)); #endif hidbus_set_lock(self, &sc->sc_mtx); hidbus_set_intr(self, hidraw_intr, sc); return (0); } static int hidraw_detach(device_t self) { struct hidraw_softc *sc = device_get_softc(self); DPRINTF("sc=%p\n", sc); if (sc->dev != NULL) { mtx_lock(&sc->sc_mtx); sc->dev->si_drv1 = NULL; /* Wake everyone */ hidraw_notify(sc); mtx_unlock(&sc->sc_mtx); destroy_dev(sc->dev); } knlist_clear(&sc->sc_rsel.si_note, 0); knlist_destroy(&sc->sc_rsel.si_note); seldrain(&sc->sc_rsel); mtx_destroy(&sc->sc_mtx); return (0); } void hidraw_intr(void *context, void *buf, hid_size_t len) { struct hidraw_softc *sc = context; int next; DPRINTFN(5, "len=%d\n", len); DPRINTFN(5, "data = %*D\n", len, buf, " "); next = (sc->sc_tail + 1) % HIDRAW_BUFFER_SIZE; if (sc->sc_state.quiet || next == sc->sc_head) return; bcopy(buf, sc->sc_q + sc->sc_tail * sc->sc_rdesc->rdsize, len); /* Make sure we don't process old data */ if (len < sc->sc_rdesc->rdsize) bzero(sc->sc_q + sc->sc_tail * sc->sc_rdesc->rdsize + len, sc->sc_rdesc->isize - len); sc->sc_qlen[sc->sc_tail] = len; sc->sc_tail = next; hidraw_notify(sc); } static inline int hidraw_lock_queue(struct hidraw_softc *sc, bool flush) { int error = 0; mtx_assert(&sc->sc_mtx, MA_OWNED); if (flush) sc->sc_state.flush = true; ++sc->sc_sleepcnt; while (sc->sc_state.lock && error == 0) { /* Flush is requested. 
Wakeup all readers and forbid sleeps */ if (flush && sc->sc_state.aslp) { sc->sc_state.aslp = false; DPRINTFN(5, "waking %p\n", &sc->sc_q); wakeup(&sc->sc_q); } error = mtx_sleep(&sc->sc_sleepcnt, &sc->sc_mtx, PZERO | PCATCH, "hidrawio", 0); } --sc->sc_sleepcnt; if (flush) sc->sc_state.flush = false; if (error == 0) sc->sc_state.lock = true; return (error); } static inline void hidraw_unlock_queue(struct hidraw_softc *sc) { mtx_assert(&sc->sc_mtx, MA_OWNED); KASSERT(sc->sc_state.lock, ("input buffer is not locked")); if (sc->sc_sleepcnt != 0) wakeup_one(&sc->sc_sleepcnt); sc->sc_state.lock = false; } static int hidraw_open(struct cdev *dev, int flag, int mode, struct thread *td) { struct hidraw_softc *sc; int error; sc = dev->si_drv1; if (sc == NULL) return (ENXIO); DPRINTF("sc=%p\n", sc); mtx_lock(&sc->sc_mtx); if (sc->sc_state.open) { mtx_unlock(&sc->sc_mtx); return (EBUSY); } sc->sc_state.open = true; mtx_unlock(&sc->sc_mtx); error = devfs_set_cdevpriv(sc, hidraw_dtor); if (error != 0) { mtx_lock(&sc->sc_mtx); sc->sc_state.open = false; mtx_unlock(&sc->sc_mtx); return (error); } sc->sc_q = malloc(sc->sc_rdesc->rdsize * HIDRAW_BUFFER_SIZE, M_DEVBUF, M_ZERO | M_WAITOK); sc->sc_qlen = malloc(sizeof(hid_size_t) * HIDRAW_BUFFER_SIZE, M_DEVBUF, M_ZERO | M_WAITOK); /* Set up interrupt pipe. */ sc->sc_state.immed = false; sc->sc_async = 0; sc->sc_state.uhid = false; /* hidraw mode is default */ sc->sc_state.quiet = false; sc->sc_head = sc->sc_tail = 0; sc->sc_fflags = flag; hid_intr_start(sc->sc_dev); return (0); } static void hidraw_dtor(void *data) { struct hidraw_softc *sc = data; DPRINTF("sc=%p\n", sc); /* Disable interrupts. */ hid_intr_stop(sc->sc_dev); sc->sc_tail = sc->sc_head = 0; sc->sc_async = 0; free(sc->sc_q, M_DEVBUF); free(sc->sc_qlen, M_DEVBUF); sc->sc_q = NULL; mtx_lock(&sc->sc_mtx); sc->sc_state.open = false; mtx_unlock(&sc->sc_mtx); } static int hidraw_read(struct cdev *dev, struct uio *uio, int flag) { struct hidraw_softc *sc; size_t length; int error; DPRINTFN(1, "\n"); sc = dev->si_drv1; if (sc == NULL) return (EIO); mtx_lock(&sc->sc_mtx); error = dev->si_drv1 == NULL ? EIO : hidraw_lock_queue(sc, false); if (error != 0) { mtx_unlock(&sc->sc_mtx); return (error); } if (sc->sc_state.immed) { mtx_unlock(&sc->sc_mtx); DPRINTFN(1, "immed\n"); error = hid_get_report(sc->sc_dev, sc->sc_q, sc->sc_rdesc->isize, NULL, HID_INPUT_REPORT, sc->sc_rdesc->iid); if (error == 0) error = uiomove(sc->sc_q, sc->sc_rdesc->isize, uio); mtx_lock(&sc->sc_mtx); goto exit; } while (sc->sc_tail == sc->sc_head && !sc->sc_state.flush) { if (flag & O_NONBLOCK) { error = EWOULDBLOCK; goto exit; } sc->sc_state.aslp = true; DPRINTFN(5, "sleep on %p\n", &sc->sc_q); error = mtx_sleep(&sc->sc_q, &sc->sc_mtx, PZERO | PCATCH, "hidrawrd", 0); DPRINTFN(5, "woke, error=%d\n", error); if (dev->si_drv1 == NULL) error = EIO; if (error) { sc->sc_state.aslp = false; goto exit; } } while (sc->sc_tail != sc->sc_head && uio->uio_resid > 0) { length = min(uio->uio_resid, sc->sc_state.uhid ? sc->sc_rdesc->isize : sc->sc_qlen[sc->sc_head]); mtx_unlock(&sc->sc_mtx); /* Copy the data to the user process. */ DPRINTFN(5, "got %lu chars\n", (u_long)length); error = uiomove(sc->sc_q + sc->sc_head * sc->sc_rdesc->rdsize, length, uio); mtx_lock(&sc->sc_mtx); if (error != 0) goto exit; /* Remove a small chunk from the input queue. */ sc->sc_head = (sc->sc_head + 1) % HIDRAW_BUFFER_SIZE; /* * In uhid mode transfer as many chunks as possible. Hidraw * packets are transferred one by one due to different length. 
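/*
 * Editorial aside (illustrative only, not part of this patch): a minimal
 * userspace sketch of the path implemented by hidraw_read().  The device
 * path and buffer size are assumptions for the example; in hidraw mode each
 * read() returns a single queued input report of its actual length, and a
 * blocking caller sleeps in "hidrawrd" until hidraw_intr() queues data.
 */
#if 0	/* example only, never compiled into the driver */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	unsigned char report[64];	/* size is an assumption */
	ssize_t n;
	int fd;

	fd = open("/dev/hidraw0", O_RDONLY);	/* path is an assumption */
	if (fd < 0)
		return (1);
	n = read(fd, report, sizeof(report));	/* blocks until a report arrives */
	if (n > 0)
		printf("received %zd byte input report\n", n);
	close(fd);
	return (0);
}
#endif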
*/ if (!sc->sc_state.uhid) goto exit; } exit: hidraw_unlock_queue(sc); mtx_unlock(&sc->sc_mtx); return (error); } static int hidraw_write(struct cdev *dev, struct uio *uio, int flag) { uint8_t local_buf[HIDRAW_LOCAL_BUFSIZE], *buf; struct hidraw_softc *sc; int error; int size; size_t buf_offset; uint8_t id = 0; DPRINTFN(1, "\n"); sc = dev->si_drv1; if (sc == NULL) return (EIO); if (sc->sc_rdesc->osize == 0) return (EOPNOTSUPP); buf_offset = 0; if (sc->sc_state.uhid) { size = sc->sc_rdesc->osize; if (uio->uio_resid != size) return (EINVAL); } else { size = uio->uio_resid; if (size < 2) return (EINVAL); /* Strip leading 0 if the device doesnt use numbered reports */ error = uiomove(&id, 1, uio); if (error) return (error); if (id != 0) buf_offset++; else size--; /* Check if underlying driver could process this request */ if (size > sc->sc_rdesc->wrsize) return (ENOBUFS); } buf = HIDRAW_LOCAL_ALLOC(local_buf, size); buf[0] = id; error = uiomove(buf + buf_offset, uio->uio_resid, uio); if (error == 0) error = hid_write(sc->sc_dev, buf, size); HIDRAW_LOCAL_FREE(local_buf, buf); return (error); } #ifdef COMPAT_FREEBSD32 static void update_hgd32(const struct hidraw_gen_descriptor *hgd, struct hidraw_gen_descriptor32 *hgd32) { /* Don't update hgd_data pointer */ CP(*hgd, *hgd32, hgd_lang_id); CP(*hgd, *hgd32, hgd_maxlen); CP(*hgd, *hgd32, hgd_actlen); CP(*hgd, *hgd32, hgd_offset); CP(*hgd, *hgd32, hgd_config_index); CP(*hgd, *hgd32, hgd_string_index); CP(*hgd, *hgd32, hgd_iface_index); CP(*hgd, *hgd32, hgd_altif_index); CP(*hgd, *hgd32, hgd_endpt_index); CP(*hgd, *hgd32, hgd_report_type); /* Don't update reserved */ } #endif static int hidraw_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { uint8_t local_buf[HIDRAW_LOCAL_BUFSIZE]; #ifdef COMPAT_FREEBSD32 struct hidraw_gen_descriptor local_hgd; struct hidraw_gen_descriptor32 *hgd32 = NULL; #endif void *buf; struct hidraw_softc *sc; struct hidraw_device_info *hdi; struct hidraw_gen_descriptor *hgd; struct hidraw_report_descriptor *hrd; struct hidraw_devinfo *hd; const char *devname; uint32_t size; int id, len; int error = 0; DPRINTFN(2, "cmd=%lx\n", cmd); sc = dev->si_drv1; if (sc == NULL) return (EIO); hgd = (struct hidraw_gen_descriptor *)addr; #ifdef COMPAT_FREEBSD32 switch (cmd) { case HIDRAW_GET_REPORT_DESC32: case HIDRAW_GET_REPORT32: case HIDRAW_SET_REPORT_DESC32: case HIDRAW_SET_REPORT32: cmd = _IOC_NEWTYPE(cmd, struct hidraw_gen_descriptor); hgd32 = (struct hidraw_gen_descriptor32 *)addr; hgd = &local_hgd; PTRIN_CP(*hgd32, *hgd, hgd_data); CP(*hgd32, *hgd, hgd_lang_id); CP(*hgd32, *hgd, hgd_maxlen); CP(*hgd32, *hgd, hgd_actlen); CP(*hgd32, *hgd, hgd_offset); CP(*hgd32, *hgd, hgd_config_index); CP(*hgd32, *hgd, hgd_string_index); CP(*hgd32, *hgd, hgd_iface_index); CP(*hgd32, *hgd, hgd_altif_index); CP(*hgd32, *hgd, hgd_endpt_index); CP(*hgd32, *hgd, hgd_report_type); /* Don't copy reserved */ break; } #endif /* fixed-length ioctls handling */ switch (cmd) { case FIONBIO: /* All handled in the upper FS layer. */ return (0); case FIOASYNC: mtx_lock(&sc->sc_mtx); if (*(int *)addr) { if (sc->sc_async == NULL) { sc->sc_async = td->td_proc; DPRINTF("FIOASYNC %p\n", sc->sc_async); } else error = EBUSY; } else sc->sc_async = NULL; mtx_unlock(&sc->sc_mtx); return (error); /* XXX this is not the most general solution. 
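/*
 * Editorial aside (illustrative only, not part of this patch): the
 * HIDRAW_LOCAL_ALLOC()/HIDRAW_LOCAL_FREE() pair defined earlier in this file
 * avoids a malloc() for small transfers such as the hidraw_write() above: it
 * hands back the caller's stack buffer whenever sizeof(local_buf) is strictly
 * greater than the requested size and falls back to M_DEVBUF otherwise, and
 * the free side only frees when the pointer differs from the stack buffer,
 * so both paths unwind identically:
 *
 *	uint8_t local_buf[HIDRAW_LOCAL_BUFSIZE];
 *	void *buf = HIDRAW_LOCAL_ALLOC(local_buf, len); /- heap iff len >= 64 -/
 *	/- ... use up to len bytes of buf ... -/
 *	HIDRAW_LOCAL_FREE(local_buf, buf);              /- no-op in the stack case -/
 */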
*/ case TIOCSPGRP: mtx_lock(&sc->sc_mtx); if (sc->sc_async == NULL) error = EINVAL; else if (*(int *)addr != sc->sc_async->p_pgid) error = EPERM; mtx_unlock(&sc->sc_mtx); return (error); case HIDRAW_GET_REPORT_DESC: if (sc->sc_rdesc->data == NULL || sc->sc_rdesc->len == 0) return (EOPNOTSUPP); mtx_lock(&sc->sc_mtx); sc->sc_state.uhid = true; mtx_unlock(&sc->sc_mtx); if (sc->sc_rdesc->len > hgd->hgd_maxlen) { size = hgd->hgd_maxlen; } else { size = sc->sc_rdesc->len; } hgd->hgd_actlen = size; #ifdef COMPAT_FREEBSD32 if (hgd32 != NULL) update_hgd32(hgd, hgd32); #endif if (hgd->hgd_data == NULL) return (0); /* descriptor length only */ return (copyout(sc->sc_rdesc->data, hgd->hgd_data, size)); case HIDRAW_SET_REPORT_DESC: if (!(sc->sc_fflags & FWRITE)) return (EPERM); /* check privileges */ error = priv_check(curthread, PRIV_DRIVER); if (error) return (error); /* Stop interrupts and clear input report buffer */ mtx_lock(&sc->sc_mtx); sc->sc_tail = sc->sc_head = 0; error = hidraw_lock_queue(sc, true); if (error == 0) sc->sc_state.quiet = true; mtx_unlock(&sc->sc_mtx); if (error != 0) return (error); buf = HIDRAW_LOCAL_ALLOC(local_buf, hgd->hgd_maxlen); error = copyin(hgd->hgd_data, buf, hgd->hgd_maxlen); if (error == 0) { bus_topo_lock(); error = hid_set_report_descr(sc->sc_dev, buf, hgd->hgd_maxlen); bus_topo_unlock(); } HIDRAW_LOCAL_FREE(local_buf, buf); /* Realloc hidraw input queue */ if (error == 0) sc->sc_q = realloc(sc->sc_q, sc->sc_rdesc->rdsize * HIDRAW_BUFFER_SIZE, M_DEVBUF, M_ZERO | M_WAITOK); /* Start interrupts again */ mtx_lock(&sc->sc_mtx); sc->sc_state.quiet = false; hidraw_unlock_queue(sc); mtx_unlock(&sc->sc_mtx); return (error); case HIDRAW_SET_IMMED: if (!(sc->sc_fflags & FREAD)) return (EPERM); if (*(int *)addr) { /* XXX should read into ibuf, but does it matter? */ size = sc->sc_rdesc->isize; buf = HIDRAW_LOCAL_ALLOC(local_buf, size); error = hid_get_report(sc->sc_dev, buf, size, NULL, HID_INPUT_REPORT, sc->sc_rdesc->iid); HIDRAW_LOCAL_FREE(local_buf, buf); if (error) return (EOPNOTSUPP); mtx_lock(&sc->sc_mtx); sc->sc_state.immed = true; mtx_unlock(&sc->sc_mtx); } else { mtx_lock(&sc->sc_mtx); sc->sc_state.immed = false; mtx_unlock(&sc->sc_mtx); } return (0); case HIDRAW_GET_REPORT: if (!(sc->sc_fflags & FREAD)) return (EPERM); switch (hgd->hgd_report_type) { case HID_INPUT_REPORT: size = sc->sc_rdesc->isize; id = sc->sc_rdesc->iid; break; case HID_OUTPUT_REPORT: size = sc->sc_rdesc->osize; id = sc->sc_rdesc->oid; break; case HID_FEATURE_REPORT: size = sc->sc_rdesc->fsize; id = sc->sc_rdesc->fid; break; default: return (EINVAL); } if (id != 0) { error = copyin(hgd->hgd_data, &id, 1); if (error != 0) return (error); } size = MIN(hgd->hgd_maxlen, size); buf = HIDRAW_LOCAL_ALLOC(local_buf, size); error = hid_get_report(sc->sc_dev, buf, size, NULL, hgd->hgd_report_type, id); if (!error) error = copyout(buf, hgd->hgd_data, size); HIDRAW_LOCAL_FREE(local_buf, buf); #ifdef COMPAT_FREEBSD32 /* * HIDRAW_GET_REPORT is declared _IOWR, but hgd is not written * so we don't call update_hgd32(). 
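/*
 * Editorial aside (illustrative only, not part of this patch): sketch of a
 * userspace HIDRAW_GET_REPORT call against the handler above.  The header
 * paths and the choice of a feature report are assumptions for the example;
 * when the report descriptor defines numbered reports, the handler copies
 * the requested id from the first data byte, as the copyin() above shows.
 */
#if 0	/* example only, never compiled into the driver */
#include <sys/ioctl.h>
#include <dev/hid/hid.h>	/* header paths are assumptions */
#include <dev/hid/hidraw.h>
#include <string.h>

static int
get_feature_report(int fd, unsigned char *buf, size_t len, unsigned char id)
{
	struct hidraw_gen_descriptor hgd;

	memset(&hgd, 0, sizeof(hgd));
	hgd.hgd_report_type = HID_FEATURE_REPORT;
	hgd.hgd_maxlen = (uint16_t)len;
	hgd.hgd_data = buf;
	buf[0] = id;	/* consumed only for numbered reports */
	return (ioctl(fd, HIDRAW_GET_REPORT, &hgd));
}
#endif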
*/ #endif return (error); case HIDRAW_SET_REPORT: if (!(sc->sc_fflags & FWRITE)) return (EPERM); switch (hgd->hgd_report_type) { case HID_INPUT_REPORT: size = sc->sc_rdesc->isize; id = sc->sc_rdesc->iid; break; case HID_OUTPUT_REPORT: size = sc->sc_rdesc->osize; id = sc->sc_rdesc->oid; break; case HID_FEATURE_REPORT: size = sc->sc_rdesc->fsize; id = sc->sc_rdesc->fid; break; default: return (EINVAL); } size = MIN(hgd->hgd_maxlen, size); buf = HIDRAW_LOCAL_ALLOC(local_buf, size); error = copyin(hgd->hgd_data, buf, size); if (error == 0) { if (id != 0) id = *(uint8_t *)buf; error = hid_set_report(sc->sc_dev, buf, size, hgd->hgd_report_type, id); } HIDRAW_LOCAL_FREE(local_buf, buf); return (error); case HIDRAW_GET_REPORT_ID: *(int *)addr = 0; /* XXX: we only support reportid 0? */ return (0); case HIDRAW_GET_DEVICEINFO: hdi = (struct hidraw_device_info *)addr; bzero(hdi, sizeof(struct hidraw_device_info)); hdi->hdi_product = sc->sc_hw->idProduct; hdi->hdi_vendor = sc->sc_hw->idVendor; hdi->hdi_version = sc->sc_hw->idVersion; hdi->hdi_bustype = sc->sc_hw->idBus; strlcpy(hdi->hdi_name, sc->sc_hw->name, sizeof(hdi->hdi_name)); strlcpy(hdi->hdi_phys, device_get_nameunit(sc->sc_dev), sizeof(hdi->hdi_phys)); strlcpy(hdi->hdi_uniq, sc->sc_hw->serial, sizeof(hdi->hdi_uniq)); snprintf(hdi->hdi_release, sizeof(hdi->hdi_release), "%x.%02x", sc->sc_hw->idVersion >> 8, sc->sc_hw->idVersion & 0xff); return(0); case HIDIOCGRDESCSIZE: *(int *)addr = sc->sc_hw->rdescsize; return (0); case HIDIOCGRDESC: hrd = *(struct hidraw_report_descriptor **)addr; error = copyin(&hrd->size, &size, sizeof(uint32_t)); if (error) return (error); /* * HID_MAX_DESCRIPTOR_SIZE-1 is a limit of report descriptor * size in current Linux implementation. */ if (size >= HID_MAX_DESCRIPTOR_SIZE) return (EINVAL); buf = HIDRAW_LOCAL_ALLOC(local_buf, size); error = hid_get_rdesc(sc->sc_dev, buf, size); if (error == 0) { size = MIN(size, sc->sc_rdesc->len); error = copyout(buf, hrd->value, size); } HIDRAW_LOCAL_FREE(local_buf, buf); return (error); case HIDIOCGRAWINFO: hd = (struct hidraw_devinfo *)addr; hd->bustype = sc->sc_hw->idBus; hd->vendor = sc->sc_hw->idVendor; hd->product = sc->sc_hw->idProduct; return (0); } /* variable-length ioctls handling */ len = IOCPARM_LEN(cmd); switch (IOCBASECMD(cmd)) { case HIDIOCGRAWNAME(0): strlcpy(addr, sc->sc_hw->name, len); td->td_retval[0] = min(strlen(sc->sc_hw->name) + 1, len); return (0); case HIDIOCGRAWPHYS(0): devname = device_get_nameunit(sc->sc_dev); strlcpy(addr, devname, len); td->td_retval[0] = min(strlen(devname) + 1, len); return (0); case HIDIOCSFEATURE(0): if (!(sc->sc_fflags & FWRITE)) return (EPERM); if (len < 2) return (EINVAL); id = *(uint8_t *)addr; if (id == 0) { addr = (uint8_t *)addr + 1; len--; } return (hid_set_report(sc->sc_dev, addr, len, HID_FEATURE_REPORT, id)); case HIDIOCGFEATURE(0): if (!(sc->sc_fflags & FREAD)) return (EPERM); if (len < 2) return (EINVAL); id = *(uint8_t *)addr; if (id == 0) { addr = (uint8_t *)addr + 1; len--; } return (hid_get_report(sc->sc_dev, addr, len, NULL, HID_FEATURE_REPORT, id)); case HIDIOCGRAWUNIQ(0): strlcpy(addr, sc->sc_hw->serial, len); td->td_retval[0] = min(strlen(sc->sc_hw->serial) + 1, len); return (0); } return (EINVAL); } static int hidraw_poll(struct cdev *dev, int events, struct thread *td) { struct hidraw_softc *sc; int revents = 0; sc = dev->si_drv1; if (sc == NULL) return (POLLHUP); if (events & (POLLOUT | POLLWRNORM) && (sc->sc_fflags & FWRITE)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLIN | POLLRDNORM) 
&& (sc->sc_fflags & FREAD)) { mtx_lock(&sc->sc_mtx); if (sc->sc_head != sc->sc_tail) revents |= events & (POLLIN | POLLRDNORM); else { sc->sc_state.sel = true; selrecord(td, &sc->sc_rsel); } mtx_unlock(&sc->sc_mtx); } return (revents); } static int hidraw_kqfilter(struct cdev *dev, struct knote *kn) { struct hidraw_softc *sc; sc = dev->si_drv1; if (sc == NULL) return (ENXIO); switch(kn->kn_filter) { case EVFILT_READ: if (sc->sc_fflags & FREAD) { kn->kn_fop = &hidraw_filterops_read; break; } /* FALLTHROUGH */ default: return(EINVAL); } kn->kn_hook = sc; knlist_add(&sc->sc_rsel.si_note, kn, 0); return (0); } static int hidraw_kqread(struct knote *kn, long hint) { struct hidraw_softc *sc; int ret; sc = kn->kn_hook; mtx_assert(&sc->sc_mtx, MA_OWNED); if (sc->dev->si_drv1 == NULL) { kn->kn_flags |= EV_EOF; ret = 1; } else ret = (sc->sc_head != sc->sc_tail) ? 1 : 0; return (ret); } static void hidraw_kqdetach(struct knote *kn) { struct hidraw_softc *sc; sc = kn->kn_hook; knlist_remove(&sc->sc_rsel.si_note, kn, 0); } static void hidraw_notify(struct hidraw_softc *sc) { mtx_assert(&sc->sc_mtx, MA_OWNED); if (sc->sc_state.aslp) { sc->sc_state.aslp = false; DPRINTFN(5, "waking %p\n", &sc->sc_q); wakeup(&sc->sc_q); } if (sc->sc_state.sel) { sc->sc_state.sel = false; selwakeuppri(&sc->sc_rsel, PZERO); } if (sc->sc_async != NULL) { DPRINTFN(3, "sending SIGIO %p\n", sc->sc_async); PROC_LOCK(sc->sc_async); kern_psignal(sc->sc_async, SIGIO); PROC_UNLOCK(sc->sc_async); } KNOTE_LOCKED(&sc->sc_rsel.si_note, 0); } static device_method_t hidraw_methods[] = { /* Device interface */ DEVMETHOD(device_identify, hidraw_identify), DEVMETHOD(device_probe, hidraw_probe), DEVMETHOD(device_attach, hidraw_attach), DEVMETHOD(device_detach, hidraw_detach), DEVMETHOD_END }; static driver_t hidraw_driver = { "hidraw", hidraw_methods, sizeof(struct hidraw_softc) }; DRIVER_MODULE(hidraw, hidbus, hidraw_driver, NULL, NULL); MODULE_DEPEND(hidraw, hidbus, 1, 1, 1); MODULE_DEPEND(hidraw, hid, 1, 1, 1); MODULE_VERSION(hidraw, 1); diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c index 215b1f7bd09e..6448fdc74160 100644 --- a/sys/dev/netmap/netmap_freebsd.c +++ b/sys/dev/netmap/netmap_freebsd.c @@ -1,1585 +1,1585 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include /* POLLIN, POLLOUT */ #include /* types used in module initialization */ #include /* DEV_MODULE_ORDERED */ #include #include /* kern_ioctl() */ #include #include /* vtophys */ #include /* vtophys */ #include #include #include #include #include #include #include /* sockaddrs */ #include #include /* kthread_add() */ #include /* PROC_LOCK() */ #include /* RFNOWAIT */ #include /* sched_bind() */ #include /* mp_maxid */ #include /* taskqueue_enqueue(), taskqueue_create(), ... */ #include #include #include /* IFT_ETHER */ #include /* ether_ifdetach */ #include /* LLADDR */ #include /* bus_dmamap_* */ #include /* in6_cksum_pseudo() */ #include /* in_pseudo(), in_cksum_hdr() */ #include #include #include #include /* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ static void nm_kqueue_notify(void *opaque, int pending) { struct nm_selinfo *si = opaque; /* We use a non-zero hint to distinguish this notification call * from the call done in kqueue_scan(), which uses hint=0. */ KNOTE_UNLOCKED(&si->si.si_note, /*hint=*/0x100); } int nm_os_selinfo_init(NM_SELINFO_T *si, const char *name) { int err; TASK_INIT(&si->ntfytask, 0, nm_kqueue_notify, si); si->ntfytq = taskqueue_create(name, M_NOWAIT, taskqueue_thread_enqueue, &si->ntfytq); if (si->ntfytq == NULL) return -ENOMEM; err = taskqueue_start_threads(&si->ntfytq, 1, PI_NET, "tq %s", name); if (err) { taskqueue_free(si->ntfytq); si->ntfytq = NULL; return err; } snprintf(si->mtxname, sizeof(si->mtxname), "nmkl%s", name); mtx_init(&si->m, si->mtxname, NULL, MTX_DEF); knlist_init_mtx(&si->si.si_note, &si->m); si->kqueue_users = 0; return (0); } void nm_os_selinfo_uninit(NM_SELINFO_T *si) { if (si->ntfytq == NULL) { return; /* si was not initialized */ } taskqueue_drain(si->ntfytq, &si->ntfytask); taskqueue_free(si->ntfytq); si->ntfytq = NULL; knlist_delete(&si->si.si_note, curthread, /*islocked=*/0); knlist_destroy(&si->si.si_note); /* now we don't need the mutex anymore */ mtx_destroy(&si->m); } void * nm_os_malloc(size_t size) { return malloc(size, M_DEVBUF, M_NOWAIT | M_ZERO); } void * nm_os_realloc(void *addr, size_t new_size, size_t old_size __unused) { return realloc(addr, new_size, M_DEVBUF, M_NOWAIT | M_ZERO); } void nm_os_free(void *addr) { free(addr, M_DEVBUF); } void nm_os_ifnet_lock(void) { IFNET_RLOCK(); } void nm_os_ifnet_unlock(void) { IFNET_RUNLOCK(); } static int netmap_use_count = 0; void nm_os_get_module(void) { netmap_use_count++; } void nm_os_put_module(void) { netmap_use_count--; } static void netmap_ifnet_arrival_handler(void *arg __unused, if_t ifp) { netmap_undo_zombie(ifp); } static void netmap_ifnet_departure_handler(void *arg __unused, if_t ifp) { netmap_make_zombie(ifp); } static eventhandler_tag nm_ifnet_ah_tag; static eventhandler_tag nm_ifnet_dh_tag; int nm_os_ifnet_init(void) { nm_ifnet_ah_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, netmap_ifnet_arrival_handler, NULL, 
EVENTHANDLER_PRI_ANY); nm_ifnet_dh_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, netmap_ifnet_departure_handler, NULL, EVENTHANDLER_PRI_ANY); return 0; } void nm_os_ifnet_fini(void) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, nm_ifnet_ah_tag); EVENTHANDLER_DEREGISTER(ifnet_departure_event, nm_ifnet_dh_tag); } unsigned nm_os_ifnet_mtu(if_t ifp) { return if_getmtu(ifp); } rawsum_t nm_os_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ uint16_t *words = (uint16_t *)data; int nw = len / 2; int i; for (i = 0; i < nw; i++) cur_sum += be16toh(words[i]); if (len & 1) cur_sum += (data[len-1] << 8); return cur_sum; } /* Fold a raw checksum: 'cur_sum' is in host byte order, while the * return value is in network byte order. */ uint16_t nm_os_csum_fold(rawsum_t cur_sum) { /* TODO XXX please use the FreeBSD implementation for this. */ while (cur_sum >> 16) cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16); return htobe16((~cur_sum) & 0xFFFF); } uint16_t nm_os_csum_ipv4(struct nm_iphdr *iph) { #if 0 return in_cksum_hdr((void *)iph); #else return nm_os_csum_fold(nm_os_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); #endif } void nm_os_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, size_t datalen, uint16_t *check) { #ifdef INET uint16_t pseudolen = datalen + iph->protocol; /* Compute and insert the pseudo-header checksum. */ *check = in_pseudo(iph->saddr, iph->daddr, htobe16(pseudolen)); /* Compute the checksum on TCP/UDP header + payload * (includes the pseudo-header). */ *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { notsupported = 1; nm_prerr("inet4 segmentation not supported"); } #endif } void nm_os_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, size_t datalen, uint16_t *check) { #ifdef INET6 *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); *check = nm_os_csum_fold(nm_os_csum_raw(data, datalen, 0)); #else static int notsupported = 0; if (!notsupported) { notsupported = 1; nm_prerr("inet6 segmentation not supported"); } #endif } /* on FreeBSD we send up one packet at a time */ void * nm_os_send_up(if_t ifp, struct mbuf *m, struct mbuf *prev) { NA(ifp)->if_input(ifp, m); return NULL; } int nm_os_mbuf_has_csum_offld(struct mbuf *m) { return m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_SCTP | CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6); } int nm_os_mbuf_has_seg_offld(struct mbuf *m) { return m->m_pkthdr.csum_flags & CSUM_TSO; } static void freebsd_generic_rx_handler(if_t ifp, struct mbuf *m) { int stolen; if (unlikely(!NM_NA_VALID(ifp))) { nm_prlim(1, "Warning: RX packet intercepted, but no" " emulated adapter"); return; } do { struct mbuf *n; n = m->m_nextpkt; m->m_nextpkt = NULL; stolen = generic_rx_handler(ifp, m); if (!stolen) { NA(ifp)->if_input(ifp, m); } m = n; } while (m != NULL); } /* * Intercept the rx routine in the standard device driver. * Second argument is non-zero to intercept, 0 to restore */ int nm_os_catch_rx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; if_t ifp = na->ifp; int ret = 0; nm_os_ifnet_lock(); if (intercept) { if_setcapenablebit(ifp, IFCAP_NETMAP, 0); if_setinputfn(ifp, freebsd_generic_rx_handler); } else { if_setcapenablebit(ifp, 0, IFCAP_NETMAP); if_setinputfn(ifp, na->if_input); } nm_os_ifnet_unlock(); return ret; } /* * Intercept the packet steering routine in the tx path, * so that we can decide which queue is used for an mbuf. 
* Second argument is non-zero to intercept, 0 to restore. * On freebsd we just intercept if_transmit. */ int nm_os_catch_tx(struct netmap_generic_adapter *gna, int intercept) { struct netmap_adapter *na = &gna->up.up; if_t ifp = netmap_generic_getifp(gna); nm_os_ifnet_lock(); if (intercept) { na->if_transmit = if_gettransmitfn(ifp); if_settransmitfn(ifp, netmap_transmit); } else { if_settransmitfn(ifp, na->if_transmit); } nm_os_ifnet_unlock(); return 0; } /* * Transmit routine used by generic_netmap_txsync(). Returns 0 on success * and non-zero on error (which may be packet drops or other errors). * addr and len identify the netmap buffer, m is the (preallocated) * mbuf to use for transmissions. * * Zero-copy transmission is possible if netmap is attached directly to a * hardware interface: when cleaning we simply wait for the mbuf cluster * refcount to decrement to 1, indicating that the driver has completed * transmission and is done with the buffer. However, this approach can * lead to queue deadlocks when attaching to software interfaces (e.g., * if_bridge) since we cannot rely on member ports to promptly reclaim * transmitted mbufs. Since there is no easy way to distinguish these * cases, we currently always copy the buffer. * * On multiqueue cards, we can force the queue using * if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) * i = m->m_pkthdr.flowid % adapter->num_queues; * else * i = curcpu % adapter->num_queues; */ int nm_os_generic_xmit_frame(struct nm_os_gen_arg *a) { int ret; u_int len = a->len; if_t ifp = a->ifp; struct mbuf *m = a->m; M_ASSERTPKTHDR(m); KASSERT((m->m_flags & M_EXT) != 0, ("%s: mbuf %p has no cluster", __func__, m)); if (MBUF_REFCNT(m) != 1) { nm_prerr("invalid refcnt %d for %p", MBUF_REFCNT(m), m); panic("in generic_xmit_frame"); } if (unlikely(m->m_ext.ext_size < len)) { nm_prlim(2, "size %d < len %d", m->m_ext.ext_size, len); len = m->m_ext.ext_size; } m_copyback(m, 0, len, a->addr); m->m_len = m->m_pkthdr.len = len; SET_MBUF_REFCNT(m, 2); M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); m->m_pkthdr.flowid = a->ring_nr; m->m_pkthdr.rcvif = ifp; /* used for tx notification */ CURVNET_SET(if_getvnet(ifp)); ret = NA(ifp)->if_transmit(ifp, m); CURVNET_RESTORE(); return ret ? -1 : 0; } struct netmap_adapter * netmap_getna(if_t ifp) { return (NA(ifp)); } /* * The following two functions are empty until we have a generic * way to extract the info from the ifp */ int nm_os_generic_find_num_desc(if_t ifp, unsigned int *tx, unsigned int *rx) { return 0; } void nm_os_generic_find_num_queues(if_t ifp, u_int *txq, u_int *rxq) { unsigned num_rings = netmap_generic_rings ? netmap_generic_rings : 1; *txq = num_rings; *rxq = num_rings; } void nm_os_generic_set_features(struct netmap_generic_adapter *gna) { gna->rxsg = 1; /* Supported through m_copydata. */ gna->txqdisc = 0; /* Not supported. */ } void nm_os_mitigation_init(struct nm_generic_mit *mit, int idx, struct netmap_adapter *na) { mit->mit_pending = 0; mit->mit_ring_idx = idx; mit->mit_na = na; } void nm_os_mitigation_start(struct nm_generic_mit *mit) { } void nm_os_mitigation_restart(struct nm_generic_mit *mit) { } int nm_os_mitigation_active(struct nm_generic_mit *mit) { return 0; } void nm_os_mitigation_cleanup(struct nm_generic_mit *mit) { } static int nm_vi_dummy(if_t ifp, u_long cmd, caddr_t addr) { return EINVAL; } static void nm_vi_start(if_t ifp) { panic("nm_vi_start() must not be called"); } /* * Index manager of persistent virtual interfaces. * It is used to decide the lowest byte of the MAC address. 
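/*
 * Editorial aside (illustrative only, not part of this patch): with the
 * generation scheme in nm_os_vi_persist() below, the station address starts
 * with 00:be, is followed by the in-memory image of the 32-bit 'ticks'
 * value, and then has its last octet overwritten by the persistent VI
 * index.  On a little-endian machine, for example:
 *
 *	unit == 3, ticks == 0x11223344  ->  00:be:44:33:22:03
 */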
* We use the same algorithm with management of bridge port index. */ #define NM_VI_MAX 255 static struct { uint8_t index[NM_VI_MAX]; /* XXX just for a reasonable number */ uint8_t active; struct mtx lock; } nm_vi_indices; void nm_os_vi_init_index(void) { int i; for (i = 0; i < NM_VI_MAX; i++) nm_vi_indices.index[i] = i; nm_vi_indices.active = 0; mtx_init(&nm_vi_indices.lock, "nm_vi_indices_lock", NULL, MTX_DEF); } /* return -1 if no index available */ static int nm_vi_get_index(void) { int ret; mtx_lock(&nm_vi_indices.lock); ret = nm_vi_indices.active == NM_VI_MAX ? -1 : nm_vi_indices.index[nm_vi_indices.active++]; mtx_unlock(&nm_vi_indices.lock); return ret; } static void nm_vi_free_index(uint8_t val) { int i, lim; mtx_lock(&nm_vi_indices.lock); lim = nm_vi_indices.active; for (i = 0; i < lim; i++) { if (nm_vi_indices.index[i] == val) { /* swap index[lim-1] and j */ int tmp = nm_vi_indices.index[lim-1]; nm_vi_indices.index[lim-1] = val; nm_vi_indices.index[i] = tmp; nm_vi_indices.active--; break; } } if (lim == nm_vi_indices.active) nm_prerr("Index %u not found", val); mtx_unlock(&nm_vi_indices.lock); } #undef NM_VI_MAX /* * Implementation of a netmap-capable virtual interface that * registered to the system. * It is based on if_tap.c and ip_fw_log.c in FreeBSD 9. * * Note: Linux sets refcount to 0 on allocation of net_device, * then increments it on registration to the system. * FreeBSD sets refcount to 1 on if_alloc(), and does not * increment this refcount on if_attach(). */ int nm_os_vi_persist(const char *name, if_t *ret) { if_t ifp; u_short macaddr_hi; uint32_t macaddr_mid; u_char eaddr[6]; int unit = nm_vi_get_index(); /* just to decide MAC address */ if (unit < 0) return EBUSY; /* * We use the same MAC address generation method with tap * except for the highest octet is 00:be instead of 00:bd */ macaddr_hi = htons(0x00be); /* XXX tap + 1 */ macaddr_mid = (uint32_t) ticks; bcopy(&macaddr_hi, eaddr, sizeof(short)); bcopy(&macaddr_mid, &eaddr[2], sizeof(uint32_t)); eaddr[5] = (uint8_t)unit; ifp = if_alloc(IFT_ETHER); if_initname(ifp, name, IF_DUNIT_NONE); if_setflags(ifp, IFF_UP | IFF_SIMPLEX | IFF_MULTICAST); if_setinitfn(ifp, (void *)nm_vi_dummy); if_setioctlfn(ifp, nm_vi_dummy); if_setstartfn(ifp, nm_vi_start); if_setmtu(ifp, ETHERMTU); if_setsendqlen(ifp, ifqmaxlen); if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE, 0); if_setcapenablebit(ifp, IFCAP_LINKSTATE, 0); ether_ifattach(ifp, eaddr); *ret = ifp; return 0; } /* unregister from the system and drop the final refcount */ void nm_os_vi_detach(if_t ifp) { nm_vi_free_index(((char *)if_getlladdr(ifp))[5]); ether_ifdetach(ifp); if_free(ifp); } #ifdef WITH_EXTMEM #include #include #include struct nm_os_extmem { vm_object_t obj; vm_offset_t kva; vm_offset_t size; uintptr_t scan; }; void nm_os_extmem_delete(struct nm_os_extmem *e) { nm_prinf("freeing %zx bytes", (size_t)e->size); vm_map_remove(kernel_map, e->kva, e->kva + e->size); nm_os_free(e); } char * nm_os_extmem_nextpage(struct nm_os_extmem *e) { char *rv = NULL; if (e->scan < e->kva + e->size) { rv = (char *)e->scan; e->scan += PAGE_SIZE; } return rv; } int nm_os_extmem_isequal(struct nm_os_extmem *e1, struct nm_os_extmem *e2) { return (e1->obj == e2->obj); } int nm_os_extmem_nr_pages(struct nm_os_extmem *e) { return e->size >> PAGE_SHIFT; } struct nm_os_extmem * nm_os_extmem_create(unsigned long p, struct nmreq_pools_info *pi, int *perror) { vm_map_t map; vm_map_entry_t entry; vm_object_t obj; vm_prot_t prot; vm_pindex_t index; boolean_t wired; struct nm_os_extmem *e = NULL; int 
rv, error = 0; e = nm_os_malloc(sizeof(*e)); if (e == NULL) { error = ENOMEM; goto out; } map = &curthread->td_proc->p_vmspace->vm_map; rv = vm_map_lookup(&map, p, VM_PROT_RW, &entry, &obj, &index, &prot, &wired); if (rv != KERN_SUCCESS) { nm_prerr("address %lx not found", p); error = vm_mmap_to_errno(rv); goto out_free; } vm_object_reference(obj); /* check that we are given the whole vm_object ? */ vm_map_lookup_done(map, entry); e->obj = obj; /* Wire the memory and add the vm_object to the kernel map, * to make sure that it is not freed even if all the processes * that are mmap()ing should munmap() it. */ e->kva = vm_map_min(kernel_map); e->size = obj->size << PAGE_SHIFT; rv = vm_map_find(kernel_map, obj, 0, &e->kva, e->size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv != KERN_SUCCESS) { nm_prerr("vm_map_find(%zx) failed", (size_t)e->size); error = vm_mmap_to_errno(rv); goto out_rel; } rv = vm_map_wire(kernel_map, e->kva, e->kva + e->size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv != KERN_SUCCESS) { nm_prerr("vm_map_wire failed"); error = vm_mmap_to_errno(rv); goto out_rem; } e->scan = e->kva; return e; out_rem: vm_map_remove(kernel_map, e->kva, e->kva + e->size); out_rel: vm_object_deallocate(e->obj); e->obj = NULL; out_free: nm_os_free(e); out: if (perror) *perror = error; return NULL; } #endif /* WITH_EXTMEM */ /* ================== PTNETMAP GUEST SUPPORT ==================== */ #ifdef WITH_PTNETMAP #include #include #include /* bus_dmamap_* */ #include #include #include /* * ptnetmap memory device (memdev) for freebsd guest, * ssed to expose host netmap memory to the guest through a PCI BAR. */ /* * ptnetmap memdev private data structure */ struct ptnetmap_memdev { device_t dev; struct resource *pci_io; struct resource *pci_mem; struct netmap_mem_d *nm_mem; }; static int ptn_memdev_probe(device_t); static int ptn_memdev_attach(device_t); static int ptn_memdev_detach(device_t); static int ptn_memdev_shutdown(device_t); static device_method_t ptn_memdev_methods[] = { DEVMETHOD(device_probe, ptn_memdev_probe), DEVMETHOD(device_attach, ptn_memdev_attach), DEVMETHOD(device_detach, ptn_memdev_detach), DEVMETHOD(device_shutdown, ptn_memdev_shutdown), DEVMETHOD_END }; static driver_t ptn_memdev_driver = { PTNETMAP_MEMDEV_NAME, ptn_memdev_methods, sizeof(struct ptnetmap_memdev), }; /* We use (SI_ORDER_MIDDLE+1) here, see DEV_MODULE_ORDERED() invocation * below. */ DRIVER_MODULE_ORDERED(ptn_memdev, pci, ptn_memdev_driver, NULL, NULL, SI_ORDER_MIDDLE + 1); /* * Map host netmap memory through PCI-BAR in the guest OS, * returning physical (nm_paddr) and virtual (nm_addr) addresses * of the netmap memory mapped in the guest. 
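/*
 * Editorial aside (illustrative only, not part of this patch): the two
 * 32-bit register reads in nm_os_pt_memdev_iomap() below assemble the
 * 64-bit region size as (MEMSIZE_HI << 32) | MEMSIZE_LO.  For example,
 * HI == 0x1 and LO == 0x80000000 yield a mem_size of 0x180000000 (6 GiB).
 */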
*/ int nm_os_pt_memdev_iomap(struct ptnetmap_memdev *ptn_dev, vm_paddr_t *nm_paddr, void **nm_addr, uint64_t *mem_size) { int rid; nm_prinf("ptn_memdev_driver iomap"); rid = PCIR_BAR(PTNETMAP_MEM_PCI_BAR); *mem_size = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_HI); *mem_size = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMSIZE_LO) | (*mem_size << 32); /* map memory allocator */ ptn_dev->pci_mem = bus_alloc_resource(ptn_dev->dev, SYS_RES_MEMORY, &rid, 0, ~0, *mem_size, RF_ACTIVE); if (ptn_dev->pci_mem == NULL) { *nm_paddr = 0; *nm_addr = NULL; return ENOMEM; } *nm_paddr = rman_get_start(ptn_dev->pci_mem); *nm_addr = rman_get_virtual(ptn_dev->pci_mem); nm_prinf("=== BAR %d start %lx len %lx mem_size %lx ===", PTNETMAP_MEM_PCI_BAR, (unsigned long)(*nm_paddr), (unsigned long)rman_get_size(ptn_dev->pci_mem), (unsigned long)*mem_size); return (0); } uint32_t nm_os_pt_memdev_ioread(struct ptnetmap_memdev *ptn_dev, unsigned int reg) { return bus_read_4(ptn_dev->pci_io, reg); } /* Unmap host netmap memory. */ void nm_os_pt_memdev_iounmap(struct ptnetmap_memdev *ptn_dev) { nm_prinf("ptn_memdev_driver iounmap"); if (ptn_dev->pci_mem) { bus_release_resource(ptn_dev->dev, SYS_RES_MEMORY, PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); ptn_dev->pci_mem = NULL; } } /* Device identification routine, return BUS_PROBE_DEFAULT on success, * positive on failure */ static int ptn_memdev_probe(device_t dev) { if (pci_get_vendor(dev) != PTNETMAP_PCI_VENDOR_ID) return (ENXIO); if (pci_get_device(dev) != PTNETMAP_PCI_DEVICE_ID) return (ENXIO); device_set_descf(dev, "%s PCI adapter", PTNETMAP_MEMDEV_NAME); return (BUS_PROBE_DEFAULT); } /* Device initialization routine. */ static int ptn_memdev_attach(device_t dev) { struct ptnetmap_memdev *ptn_dev; int rid; uint16_t mem_id; ptn_dev = device_get_softc(dev); ptn_dev->dev = dev; pci_enable_busmaster(dev); rid = PCIR_BAR(PTNETMAP_IO_PCI_BAR); ptn_dev->pci_io = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid, RF_ACTIVE); if (ptn_dev->pci_io == NULL) { device_printf(dev, "cannot map I/O space\n"); return (ENXIO); } mem_id = bus_read_4(ptn_dev->pci_io, PTNET_MDEV_IO_MEMID); /* create guest allocator */ ptn_dev->nm_mem = netmap_mem_pt_guest_attach(ptn_dev, mem_id); if (ptn_dev->nm_mem == NULL) { ptn_memdev_detach(dev); return (ENOMEM); } netmap_mem_get(ptn_dev->nm_mem); nm_prinf("ptnetmap memdev attached, host memid: %u", mem_id); return (0); } /* Device removal routine. */ static int ptn_memdev_detach(device_t dev) { struct ptnetmap_memdev *ptn_dev; ptn_dev = device_get_softc(dev); if (ptn_dev->nm_mem) { nm_prinf("ptnetmap memdev detached, host memid %u", netmap_mem_get_id(ptn_dev->nm_mem)); netmap_mem_put(ptn_dev->nm_mem); ptn_dev->nm_mem = NULL; } if (ptn_dev->pci_mem) { bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(PTNETMAP_MEM_PCI_BAR), ptn_dev->pci_mem); ptn_dev->pci_mem = NULL; } if (ptn_dev->pci_io) { bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(PTNETMAP_IO_PCI_BAR), ptn_dev->pci_io); ptn_dev->pci_io = NULL; } return (0); } static int ptn_memdev_shutdown(device_t dev) { return bus_generic_shutdown(dev); } #endif /* WITH_PTNETMAP */ /* * In order to track whether pages are still mapped, we hook into * the standard cdev_pager and intercept the constructor and * destructor. 
*/ struct netmap_vm_handle_t { struct cdev *dev; struct netmap_priv_d *priv; }; static int netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { struct netmap_vm_handle_t *vmh = handle; if (netmap_verbose) nm_prinf("handle %p size %jd prot %d foff %jd", handle, (intmax_t)size, prot, (intmax_t)foff); if (color) *color = 0; dev_ref(vmh->dev); return 0; } static void netmap_dev_pager_dtor(void *handle) { struct netmap_vm_handle_t *vmh = handle; struct cdev *dev = vmh->dev; struct netmap_priv_d *priv = vmh->priv; if (netmap_verbose) nm_prinf("handle %p", handle); netmap_dtor(priv); free(vmh, M_DEVBUF); dev_rel(dev); } static int netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot, vm_page_t *mres) { struct netmap_vm_handle_t *vmh = object->handle; struct netmap_priv_d *priv = vmh->priv; struct netmap_adapter *na = priv->np_na; vm_paddr_t paddr; vm_page_t page; vm_memattr_t memattr; nm_prdis("object %p offset %jd prot %d mres %p", object, (intmax_t)offset, prot, mres); memattr = object->memattr; paddr = netmap_mem_ofstophys(na->nm_mem, offset); if (paddr == 0) return VM_PAGER_FAIL; if (((*mres)->flags & PG_FICTITIOUS) != 0) { /* * If the passed in result page is a fake page, update it with * the new physical address. */ page = *mres; vm_page_updatefake(page, paddr, memattr); } else { /* * Replace the passed in reqpage page with our own fake page and * free up the all of the original pages. */ VM_OBJECT_WUNLOCK(object); page = vm_page_getfake(paddr, memattr); VM_OBJECT_WLOCK(object); vm_page_replace(page, object, (*mres)->pindex, *mres); *mres = page; } page->valid = VM_PAGE_BITS_ALL; return (VM_PAGER_OK); } static struct cdev_pager_ops netmap_cdev_pager_ops = { .cdev_pg_ctor = netmap_dev_pager_ctor, .cdev_pg_dtor = netmap_dev_pager_dtor, .cdev_pg_fault = netmap_dev_pager_fault, }; static int netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, vm_size_t objsize, vm_object_t *objp, int prot) { int error; struct netmap_vm_handle_t *vmh; struct netmap_priv_d *priv; vm_object_t obj; if (netmap_verbose) nm_prinf("cdev %p foff %jd size %jd objp %p prot %d", cdev, (intmax_t )*foff, (intmax_t )objsize, objp, prot); vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (vmh == NULL) return ENOMEM; vmh->dev = cdev; NMG_LOCK(); error = devfs_get_cdevpriv((void**)&priv); if (error) goto err_unlock; if (priv->np_nifp == NULL) { error = EINVAL; goto err_unlock; } vmh->priv = priv; priv->np_refs++; NMG_UNLOCK(); obj = cdev_pager_allocate(vmh, OBJT_DEVICE, &netmap_cdev_pager_ops, objsize, prot, *foff, NULL); if (obj == NULL) { nm_prerr("cdev_pager_allocate failed"); error = EINVAL; goto err_deref; } *objp = obj; return 0; err_deref: NMG_LOCK(); priv->np_refs--; err_unlock: NMG_UNLOCK(); // err: free(vmh, M_DEVBUF); return error; } /* * On FreeBSD the close routine is only called on the last close on * the device (/dev/netmap) so we cannot do anything useful. * To track close() on individual file descriptors we pass netmap_dtor() to * devfs_set_cdevpriv() on open(). The FreeBSD kernel will call the destructor * when the last fd pointing to the device is closed. * * Note that FreeBSD does not even munmap() on close() so we also have * to track mmap() ourselves, and postpone the call to * netmap_dtor() is called when the process has no open fds and no active * memory maps on /dev/netmap, as in linux. 
*/ static int netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { if (netmap_verbose) nm_prinf("dev %p fflag 0x%x devtype %d td %p", dev, fflag, devtype, td); return 0; } static int netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct netmap_priv_d *priv; int error; (void)dev; (void)oflags; (void)devtype; (void)td; NMG_LOCK(); priv = netmap_priv_new(); if (priv == NULL) { error = ENOMEM; goto out; } error = devfs_set_cdevpriv(priv, netmap_dtor); if (error) { netmap_priv_delete(priv); } out: NMG_UNLOCK(); return error; } /******************** kthread wrapper ****************/ #include u_int nm_os_ncpus(void) { return mp_maxid + 1; } struct nm_kctx_ctx { /* Userspace thread (kthread creator). */ struct thread *user_td; /* worker function and parameter */ nm_kctx_worker_fn_t worker_fn; void *worker_private; struct nm_kctx *nmk; /* integer to manage multiple worker contexts (e.g., RX or TX on ptnetmap) */ long type; }; struct nm_kctx { struct thread *worker; struct mtx worker_lock; struct nm_kctx_ctx worker_ctx; int run; /* used to stop kthread */ int attach_user; /* kthread attached to user_process */ int affinity; }; static void nm_kctx_worker(void *data) { struct nm_kctx *nmk = data; struct nm_kctx_ctx *ctx = &nmk->worker_ctx; if (nmk->affinity >= 0) { thread_lock(curthread); sched_bind(curthread, nmk->affinity); thread_unlock(curthread); } while (nmk->run) { /* * check if the parent process dies * (when kthread is attached to user process) */ if (ctx->user_td) { PROC_LOCK(curproc); thread_suspend_check(0); PROC_UNLOCK(curproc); } else { kthread_suspend_check(); } /* Continuously execute worker process. */ ctx->worker_fn(ctx->worker_private); /* worker body */ } kthread_exit(); } void nm_os_kctx_worker_setaff(struct nm_kctx *nmk, int affinity) { nmk->affinity = affinity; } struct nm_kctx * nm_os_kctx_create(struct nm_kctx_cfg *cfg, void *opaque) { struct nm_kctx *nmk = NULL; nmk = malloc(sizeof(*nmk), M_DEVBUF, M_NOWAIT | M_ZERO); if (!nmk) return NULL; mtx_init(&nmk->worker_lock, "nm_kthread lock", NULL, MTX_DEF); nmk->worker_ctx.worker_fn = cfg->worker_fn; nmk->worker_ctx.worker_private = cfg->worker_private; nmk->worker_ctx.type = cfg->type; nmk->affinity = -1; /* attach kthread to user process (ptnetmap) */ nmk->attach_user = cfg->attach_user; return nmk; } int nm_os_kctx_worker_start(struct nm_kctx *nmk) { struct proc *p = NULL; int error = 0; /* Temporarily disable this function as it is currently broken * and causes kernel crashes. The failure can be triggered by * the "vale_polling_enable_disable" test in ctrl-api-test.c. 
*/ return EOPNOTSUPP; if (nmk->worker) return EBUSY; /* check if we want to attach kthread to user process */ if (nmk->attach_user) { nmk->worker_ctx.user_td = curthread; p = curthread->td_proc; } /* enable kthread main loop */ nmk->run = 1; /* create kthread */ if((error = kthread_add(nm_kctx_worker, nmk, p, &nmk->worker, RFNOWAIT /* to be checked */, 0, "nm-kthread-%ld", nmk->worker_ctx.type))) { goto err; } nm_prinf("nm_kthread started td %p", nmk->worker); return 0; err: nm_prerr("nm_kthread start failed err %d", error); nmk->worker = NULL; return error; } void nm_os_kctx_worker_stop(struct nm_kctx *nmk) { if (!nmk->worker) return; /* tell to kthread to exit from main loop */ nmk->run = 0; /* wake up kthread if it sleeps */ kthread_resume(nmk->worker); nmk->worker = NULL; } void nm_os_kctx_destroy(struct nm_kctx *nmk) { if (!nmk) return; if (nmk->worker) nm_os_kctx_worker_stop(nmk); free(nmk, M_DEVBUF); } /******************** kqueue support ****************/ /* * In addition to calling selwakeuppri(), nm_os_selwakeup() also * needs to call knote() to wake up kqueue listeners. * This operation is deferred to a taskqueue in order to avoid possible * lock order reversals; these may happen because knote() grabs a * private lock associated to the 'si' (see struct selinfo, * struct nm_selinfo, and nm_os_selinfo_init), and nm_os_selwakeup() * can be called while holding the lock associated to a different * 'si'. * When calling knote() we use a non-zero 'hint' argument to inform * the netmap_knrw() function that it is being called from * 'nm_os_selwakeup'; this is necessary because when netmap_knrw() is * called by the kevent subsystem (i.e. kevent_scan()) we also need to * call netmap_poll(). * * The netmap_kqfilter() function registers one or another f_event * depending on read or write mode. A pointer to the struct * 'netmap_priv_d' is stored into kn->kn_hook, so that it can later * be passed to netmap_poll(). We pass NULL as a third argument to * netmap_poll(), so that the latter only runs the txsync/rxsync * (if necessary), and skips the nm_os_selrecord() calls. */ void nm_os_selwakeup(struct nm_selinfo *si) { selwakeuppri(&si->si, PI_NET); if (si->kqueue_users > 0) { taskqueue_enqueue(si->ntfytq, &si->ntfytask); } } void nm_os_selrecord(struct thread *td, struct nm_selinfo *si) { selrecord(td, &si->si); } static void netmap_knrdetach(struct knote *kn) { struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; struct nm_selinfo *si = priv->np_si[NR_RX]; knlist_remove(&si->si.si_note, kn, /*islocked=*/0); NMG_LOCK(); KASSERT(si->kqueue_users > 0, ("kqueue_user underflow on %s", si->mtxname)); si->kqueue_users--; nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users); NMG_UNLOCK(); } static void netmap_knwdetach(struct knote *kn) { struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; struct nm_selinfo *si = priv->np_si[NR_TX]; knlist_remove(&si->si.si_note, kn, /*islocked=*/0); NMG_LOCK(); si->kqueue_users--; nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users); NMG_UNLOCK(); } /* * Callback triggered by netmap notifications (see netmap_notify()), * and by the application calling kevent(). In the former case we * just return 1 (events ready), since we are not able to do better. * In the latter case we use netmap_poll() to see which events are * ready. 
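/*
 * Editorial aside (illustrative only, not part of this patch): a minimal
 * userspace sketch of the kqueue path described in the comment above.
 * Obtaining and configuring the netmap descriptor (open of /dev/netmap,
 * NIOCREGIF, mmap of the rings) is elided; 'fd' is assumed to be such a
 * descriptor, and netmap_kqfilter() below accepts both EVFILT_READ and
 * EVFILT_WRITE on it.
 */
#if 0	/* example only, never compiled into the kernel */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
wait_for_rx(int fd)
{
	struct kevent kev, ev;
	int kq, n;

	kq = kqueue();
	if (kq < 0)
		return (-1);
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0) {
		close(kq);
		return (-1);
	}
	/* Readiness may be reported via netmap_notify() or kevent_scan(). */
	n = kevent(kq, NULL, 0, &ev, 1, NULL);
	close(kq);
	return (n);
}
#endif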
*/ static int netmap_knrw(struct knote *kn, long hint, int events) { struct netmap_priv_d *priv; int revents; if (hint != 0) { /* Called from netmap_notify(), typically from a * thread different from the one issuing kevent(). * Assume we are ready. */ return 1; } /* Called from kevent(). */ priv = kn->kn_hook; revents = netmap_poll(priv, events, /*thread=*/NULL); return (events & revents) ? 1 : 0; } static int netmap_knread(struct knote *kn, long hint) { return netmap_knrw(kn, hint, POLLIN); } static int netmap_knwrite(struct knote *kn, long hint) { return netmap_knrw(kn, hint, POLLOUT); } -static struct filterops netmap_rfiltops = { +static const struct filterops netmap_rfiltops = { .f_isfd = 1, .f_detach = netmap_knrdetach, .f_event = netmap_knread, }; -static struct filterops netmap_wfiltops = { +static const struct filterops netmap_wfiltops = { .f_isfd = 1, .f_detach = netmap_knwdetach, .f_event = netmap_knwrite, }; /* * This is called when a thread invokes kevent() to record * a change in the configuration of the kqueue(). * The 'priv' is the one associated to the open netmap device. */ static int netmap_kqfilter(struct cdev *dev, struct knote *kn) { struct netmap_priv_d *priv; int error; struct netmap_adapter *na; struct nm_selinfo *si; int ev = kn->kn_filter; if (ev != EVFILT_READ && ev != EVFILT_WRITE) { nm_prerr("bad filter request %d", ev); return 1; } error = devfs_get_cdevpriv((void**)&priv); if (error) { nm_prerr("device not yet setup"); return 1; } na = priv->np_na; if (na == NULL) { nm_prerr("no netmap adapter for this file descriptor"); return 1; } /* the si is indicated in the priv */ si = priv->np_si[(ev == EVFILT_WRITE) ? NR_TX : NR_RX]; kn->kn_fop = (ev == EVFILT_WRITE) ? &netmap_wfiltops : &netmap_rfiltops; kn->kn_hook = priv; NMG_LOCK(); si->kqueue_users++; nm_prinf("kqueue users for %s: %d", si->mtxname, si->kqueue_users); NMG_UNLOCK(); knlist_add(&si->si.si_note, kn, /*islocked=*/0); return 0; } static int freebsd_netmap_poll(struct cdev *cdevi __unused, int events, struct thread *td) { struct netmap_priv_d *priv; if (devfs_get_cdevpriv((void **)&priv)) { return POLLERR; } return netmap_poll(priv, events, td); } static int freebsd_netmap_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, int ffla __unused, struct thread *td) { int error; struct netmap_priv_d *priv; CURVNET_SET(TD_TO_VNET(td)); error = devfs_get_cdevpriv((void **)&priv); if (error) { /* XXX ENOENT should be impossible, since the priv * is now created in the open */ if (error == ENOENT) error = ENXIO; goto out; } error = netmap_ioctl(priv, cmd, data, td, /*nr_body_is_user=*/1); out: CURVNET_RESTORE(); return error; } void nm_os_onattach(if_t ifp) { if_setcapabilitiesbit(ifp, IFCAP_NETMAP, 0); } void nm_os_onenter(if_t ifp) { struct netmap_adapter *na = NA(ifp); na->if_transmit = if_gettransmitfn(ifp); if_settransmitfn(ifp, netmap_transmit); if_setcapenablebit(ifp, IFCAP_NETMAP, 0); } void nm_os_onexit(if_t ifp) { struct netmap_adapter *na = NA(ifp); if_settransmitfn(ifp, na->if_transmit); if_setcapenablebit(ifp, 0, IFCAP_NETMAP); } extern struct cdevsw netmap_cdevsw; /* XXX used in netmap.c, should go elsewhere */ struct cdevsw netmap_cdevsw = { .d_version = D_VERSION, .d_name = "netmap", .d_open = netmap_open, .d_mmap_single = netmap_mmap_single, .d_ioctl = freebsd_netmap_ioctl, .d_poll = freebsd_netmap_poll, .d_kqfilter = netmap_kqfilter, .d_close = netmap_close, }; /*--- end of kqueue support ----*/ /* * Kernel entry point. * * Initialize/finalize the module and return. 
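/*
 * Editorial aside on the two hunks above (and the matching hunks in the
 * other files touched by this diff): the filterops tables are only
 * dereferenced for their callback pointers at runtime, so adding the const
 * qualifier moves them into read-only data with no functional change.  The
 * pattern applied throughout the patch is simply:
 *
 *	-static struct filterops foo_filtops = { ... };
 *	+static const struct filterops foo_filtops = { ... };
 */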
* * Return 0 on success, errno on failure. */ static int netmap_loader(__unused struct module *module, int event, __unused void *arg) { int error = 0; switch (event) { case MOD_LOAD: error = netmap_init(); break; case MOD_UNLOAD: /* * if some one is still using netmap, * then the module can not be unloaded. */ if (netmap_use_count) { nm_prerr("netmap module can not be unloaded - netmap_use_count: %d", netmap_use_count); error = EBUSY; break; } netmap_fini(); break; default: error = EOPNOTSUPP; break; } return (error); } #ifdef DEV_MODULE_ORDERED /* * The netmap module contains three drivers: (i) the netmap character device * driver; (ii) the ptnetmap memdev PCI device driver, (iii) the ptnet PCI * device driver. The attach() routines of both (ii) and (iii) need the * lock of the global allocator, and such lock is initialized in netmap_init(), * which is part of (i). * Therefore, we make sure that (i) is loaded before (ii) and (iii), using * the 'order' parameter of driver declaration macros. For (i), we specify * SI_ORDER_MIDDLE, while higher orders are used with the DRIVER_MODULE_ORDERED * macros for (ii) and (iii). */ DEV_MODULE_ORDERED(netmap, netmap_loader, NULL, SI_ORDER_MIDDLE); #else /* !DEV_MODULE_ORDERED */ DEV_MODULE(netmap, netmap_loader, NULL); #endif /* DEV_MODULE_ORDERED */ MODULE_DEPEND(netmap, pci, 1, 1, 1); MODULE_VERSION(netmap, 1); /* reduce conditional code */ // linux API, use for the knlist in FreeBSD /* use a private mutex for the knlist */ diff --git a/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c b/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c index a70f25d57dcb..661d5bd0f14e 100644 --- a/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c +++ b/sys/dev/qat/qat_common/adf_freebsd_dev_processes.c @@ -1,672 +1,672 @@ /* SPDX-License-Identifier: BSD-3-Clause */ /* Copyright(c) 2007-2022 Intel Corporation */ #include "qat_freebsd.h" #include "adf_cfg.h" #include "adf_common_drv.h" #include "adf_accel_devices.h" #include "icp_qat_uclo.h" #include "icp_qat_fw.h" #include "icp_qat_fw_init_admin.h" #include "adf_cfg_strings.h" #include "adf_uio_control.h" #include "adf_uio_cleanup.h" #include "adf_uio.h" #include "adf_transport_access_macros.h" #include "adf_transport_internal.h" #define ADF_DEV_PROCESSES_NAME "qat_dev_processes" #define ADF_DEV_STATE_NAME "qat_dev_state" #define ADF_STATE_CALLOUT_TIME 10 static const char *mtx_name = "state_mtx"; static const char *mtx_callout_name = "callout_mtx"; static d_open_t adf_processes_open; static void adf_processes_release(void *data); static d_read_t adf_processes_read; static d_write_t adf_processes_write; static d_open_t adf_state_open; static void adf_state_release(void *data); static d_read_t adf_state_read; static int adf_state_kqfilter(struct cdev *dev, struct knote *kn); static int adf_state_kqread_event(struct knote *kn, long hint); static void adf_state_kqread_detach(struct knote *kn); static struct callout callout; static struct mtx mtx; static struct mtx callout_mtx; static struct service_hndl adf_state_hndl; struct entry_proc_events { struct adf_state_priv_data *proc_events; SLIST_ENTRY(entry_proc_events) entries_proc_events; }; struct entry_state { struct adf_state state; STAILQ_ENTRY(entry_state) entries_state; }; SLIST_HEAD(proc_events_head, entry_proc_events); STAILQ_HEAD(state_head, entry_state); static struct proc_events_head proc_events_head; struct adf_processes_priv_data { char name[ADF_CFG_MAX_SECTION_LEN_IN_BYTES]; int read_flag; struct list_head list; }; struct adf_state_priv_data { struct cdev 
*cdev; struct selinfo rsel; struct state_head state_head; }; static struct cdevsw adf_processes_cdevsw = { .d_version = D_VERSION, .d_open = adf_processes_open, .d_read = adf_processes_read, .d_write = adf_processes_write, .d_name = ADF_DEV_PROCESSES_NAME, }; static struct cdevsw adf_state_cdevsw = { .d_version = D_VERSION, .d_open = adf_state_open, .d_read = adf_state_read, .d_kqfilter = adf_state_kqfilter, .d_name = ADF_DEV_STATE_NAME, }; -static struct filterops adf_state_read_filterops = { +static const struct filterops adf_state_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = adf_state_kqread_detach, .f_event = adf_state_kqread_event, }; static struct cdev *adf_processes_dev; static struct cdev *adf_state_dev; static LINUX_LIST_HEAD(processes_list); struct sx processes_list_sema; SX_SYSINIT(processes_list_sema, &processes_list_sema, "adf proc list"); static void adf_chr_drv_destroy(void) { destroy_dev(adf_processes_dev); } static int adf_chr_drv_create(void) { adf_processes_dev = make_dev(&adf_processes_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, ADF_DEV_PROCESSES_NAME); if (adf_processes_dev == NULL) { printf("QAT: failed to create device\n"); goto err_cdev_del; } return 0; err_cdev_del: return EFAULT; } static int adf_processes_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { int i = 0, devices = 0; struct adf_accel_dev *accel_dev = NULL; struct adf_processes_priv_data *prv_data = NULL; int error = 0; for (i = 0; i < ADF_MAX_DEVICES; i++) { accel_dev = adf_devmgr_get_dev_by_id(i); if (!accel_dev) continue; if (!adf_dev_started(accel_dev)) continue; devices++; } if (!devices) { printf("QAT: No active devices found.\n"); return ENXIO; } prv_data = malloc(sizeof(*prv_data), M_QAT, M_WAITOK | M_ZERO); INIT_LIST_HEAD(&prv_data->list); error = devfs_set_cdevpriv(prv_data, adf_processes_release); if (error) { free(prv_data, M_QAT); return error; } return 0; } static int adf_get_first_started_dev(void) { int i = 0; struct adf_accel_dev *accel_dev = NULL; for (i = 0; i < ADF_MAX_DEVICES; i++) { accel_dev = adf_devmgr_get_dev_by_id(i); if (!accel_dev) continue; if (adf_dev_started(accel_dev)) return i; } return -1; } static int adf_processes_write(struct cdev *dev, struct uio *uio, int ioflag) { struct adf_processes_priv_data *prv_data = NULL; struct adf_processes_priv_data *pdata = NULL; int dev_num = 0, pr_num = 0; struct list_head *lpos = NULL; char usr_name[ADF_CFG_MAX_SECTION_LEN_IN_BYTES] = { 0 }; struct adf_accel_dev *accel_dev = NULL; struct adf_cfg_section *section_ptr = NULL; bool pr_name_available = 1; uint32_t num_accel_devs = 0; int error = 0; ssize_t count; int dev_id; error = devfs_get_cdevpriv((void **)&prv_data); if (error) { printf("QAT: invalid file descriptor\n"); return error; } if (prv_data->read_flag == 1) { printf("QAT: can only write once\n"); return EBADF; } count = uio->uio_resid; if ((count <= 0) || (count > ADF_CFG_MAX_SECTION_LEN_IN_BYTES)) { printf("QAT: wrong size %d\n", (int)count); return EIO; } error = uiomove(usr_name, count, uio); if (error) { printf("QAT: can't copy data\n"); return error; } /* Lock other processes and try to find out the process name */ if (sx_xlock_sig(&processes_list_sema)) { printf("QAT: can't aquire process info lock\n"); return EBADF; } dev_id = adf_get_first_started_dev(); if (-1 == dev_id) { pr_err("QAT: could not find started device\n"); sx_xunlock(&processes_list_sema); return -EIO; } accel_dev = adf_devmgr_get_dev_by_id(dev_id); if (!accel_dev) { pr_err("QAT: could not find started device\n"); 
sx_xunlock(&processes_list_sema); return -EIO; } /* If there is nothing there then take the first name and return */ if (list_empty(&processes_list)) { snprintf(prv_data->name, ADF_CFG_MAX_SECTION_LEN_IN_BYTES, "%s" ADF_INTERNAL_USERSPACE_SEC_SUFF "%d", usr_name, 0); list_add(&prv_data->list, &processes_list); sx_xunlock(&processes_list_sema); prv_data->read_flag = 1; return 0; } /* If there are processes running then search for a first free name */ adf_devmgr_get_num_dev(&num_accel_devs); for (dev_num = 0; dev_num < num_accel_devs; dev_num++) { accel_dev = adf_devmgr_get_dev_by_id(dev_num); if (!accel_dev) continue; if (!adf_dev_started(accel_dev)) continue; /* to next device */ for (pr_num = 0; pr_num < GET_MAX_PROCESSES(accel_dev); pr_num++) { snprintf(prv_data->name, ADF_CFG_MAX_SECTION_LEN_IN_BYTES, "%s" ADF_INTERNAL_USERSPACE_SEC_SUFF "%d", usr_name, pr_num); pr_name_available = 1; /* Figure out if section exists in the config table */ section_ptr = adf_cfg_sec_find(accel_dev, prv_data->name); if (NULL == section_ptr) { /* This section name doesn't exist */ pr_name_available = 0; /* As process_num enumerates from 0, once we get * to one which doesn't exist no further ones * will exist. On to next device */ break; } /* Figure out if it's been taken already */ list_for_each(lpos, &processes_list) { pdata = list_entry(lpos, struct adf_processes_priv_data, list); if (!strncmp( pdata->name, prv_data->name, ADF_CFG_MAX_SECTION_LEN_IN_BYTES)) { pr_name_available = 0; break; } } if (pr_name_available) break; } if (pr_name_available) break; } /* * If we have a valid name that is not on * the list take it and add to the list */ if (pr_name_available) { list_add(&prv_data->list, &processes_list); sx_xunlock(&processes_list_sema); prv_data->read_flag = 1; return 0; } /* If not then the process needs to wait */ sx_xunlock(&processes_list_sema); explicit_bzero(prv_data->name, ADF_CFG_MAX_SECTION_LEN_IN_BYTES); prv_data->read_flag = 0; return 1; } static int adf_processes_read(struct cdev *dev, struct uio *uio, int ioflag) { struct adf_processes_priv_data *prv_data = NULL; int error = 0; error = devfs_get_cdevpriv((void **)&prv_data); if (error) { printf("QAT: invalid file descriptor\n"); return error; } /* * If there is a name that the process can use then give it * to the proocess. 
*/ if (prv_data->read_flag) { error = uiomove(prv_data->name, strnlen(prv_data->name, ADF_CFG_MAX_SECTION_LEN_IN_BYTES), uio); if (error) { printf("QAT: failed to copy data to user\n"); return error; } return 0; } return EIO; } static void adf_processes_release(void *data) { struct adf_processes_priv_data *prv_data = NULL; prv_data = (struct adf_processes_priv_data *)data; sx_xlock(&processes_list_sema); list_del(&prv_data->list); sx_xunlock(&processes_list_sema); free(prv_data, M_QAT); } int adf_processes_dev_register(void) { return adf_chr_drv_create(); } void adf_processes_dev_unregister(void) { adf_chr_drv_destroy(); } static void adf_state_callout_notify_ev(void *arg) { int notified = 0; struct adf_state_priv_data *priv = NULL; struct entry_proc_events *proc_events = NULL; SLIST_FOREACH (proc_events, &proc_events_head, entries_proc_events) { if (!STAILQ_EMPTY(&proc_events->proc_events->state_head)) { notified = 1; priv = proc_events->proc_events; wakeup(priv); selwakeup(&priv->rsel); KNOTE_UNLOCKED(&priv->rsel.si_note, 0); } } if (notified) callout_schedule(&callout, ADF_STATE_CALLOUT_TIME); } static void adf_state_set(int dev, enum adf_event event) { struct adf_accel_dev *accel_dev = NULL; struct state_head *head = NULL; struct entry_proc_events *proc_events = NULL; struct entry_state *state = NULL; accel_dev = adf_devmgr_get_dev_by_id(dev); if (!accel_dev) return; mtx_lock(&mtx); SLIST_FOREACH (proc_events, &proc_events_head, entries_proc_events) { state = NULL; head = &proc_events->proc_events->state_head; state = malloc(sizeof(struct entry_state), M_QAT, M_NOWAIT | M_ZERO); if (!state) continue; state->state.dev_state = event; state->state.dev_id = dev; STAILQ_INSERT_TAIL(head, state, entries_state); if (event == ADF_EVENT_STOP) { state = NULL; state = malloc(sizeof(struct entry_state), M_QAT, M_NOWAIT | M_ZERO); if (!state) continue; state->state.dev_state = ADF_EVENT_SHUTDOWN; state->state.dev_id = dev; STAILQ_INSERT_TAIL(head, state, entries_state); } } mtx_unlock(&mtx); callout_schedule(&callout, ADF_STATE_CALLOUT_TIME); } static int adf_state_event_handler(struct adf_accel_dev *accel_dev, enum adf_event event) { int ret = 0; #if defined(QAT_UIO) && defined(QAT_DBG) if (event > ADF_EVENT_DBG_SHUTDOWN) return -EINVAL; #else if (event > ADF_EVENT_ERROR) return -EINVAL; #endif /* defined(QAT_UIO) && defined(QAT_DBG) */ switch (event) { case ADF_EVENT_INIT: return ret; case ADF_EVENT_SHUTDOWN: return ret; case ADF_EVENT_RESTARTING: break; case ADF_EVENT_RESTARTED: break; case ADF_EVENT_START: return ret; case ADF_EVENT_STOP: break; case ADF_EVENT_ERROR: break; #if defined(QAT_UIO) && defined(QAT_DBG) case ADF_EVENT_PROC_CRASH: break; case ADF_EVENT_MANUAL_DUMP: break; case ADF_EVENT_SLICE_HANG: break; case ADF_EVENT_DBG_SHUTDOWN: break; #endif /* defined(QAT_UIO) && defined(QAT_DBG) */ default: return -1; } adf_state_set(accel_dev->accel_id, event); return 0; } static int adf_state_kqfilter(struct cdev *dev, struct knote *kn) { struct adf_state_priv_data *priv; mtx_lock(&mtx); priv = dev->si_drv1; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &adf_state_read_filterops; kn->kn_hook = priv; knlist_add(&priv->rsel.si_note, kn, 1); mtx_unlock(&mtx); return 0; default: mtx_unlock(&mtx); return -EINVAL; } } static int adf_state_kqread_event(struct knote *kn, long hint) { return 1; } static void adf_state_kqread_detach(struct knote *kn) { struct adf_state_priv_data *priv = NULL; mtx_lock(&mtx); if (!kn) { mtx_unlock(&mtx); return; } priv = kn->kn_hook; if (!priv) { 
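The substantive change in this file is the const qualifier on adf_state_read_filterops: a filterops table is never modified at run time, so declaring it const lets the compiler place it in read-only memory. adf_state_kqfilter() above shows the registration side; reduced to a minimal sketch with hypothetical example_* names it looks roughly like:

static int	example_kqread_event(struct knote *kn, long hint);
static void	example_kqread_detach(struct knote *kn);

static const struct filterops example_read_filtops = {
	.f_isfd = 1,
	.f_detach = example_kqread_detach,
	.f_event = example_kqread_event,
};

static int
example_kqfilter(struct cdev *dev, struct knote *kn)
{
	struct example_softc *sc = dev->si_drv1;

	if (kn->kn_filter != EVFILT_READ)
		return (EINVAL);
	kn->kn_fop = &example_read_filtops;	/* shared, read-only table */
	kn->kn_hook = sc;
	/* sc->rsel.si_note must have been set up with knlist_init_mtx() */
	knlist_add(&sc->rsel.si_note, kn, 0);
	return (0);
}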
mtx_unlock(&mtx); return; } knlist_remove(&priv->rsel.si_note, kn, 1); mtx_unlock(&mtx); } void adf_state_init(void) { adf_state_dev = make_dev(&adf_state_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "%s", ADF_DEV_STATE_NAME); SLIST_INIT(&proc_events_head); mtx_init(&mtx, mtx_name, NULL, MTX_DEF); mtx_init(&callout_mtx, mtx_callout_name, NULL, MTX_DEF); callout_init_mtx(&callout, &callout_mtx, 0); explicit_bzero(&adf_state_hndl, sizeof(adf_state_hndl)); adf_state_hndl.event_hld = adf_state_event_handler; adf_state_hndl.name = "adf_state_event_handler"; adf_service_register(&adf_state_hndl); callout_reset(&callout, ADF_STATE_CALLOUT_TIME, adf_state_callout_notify_ev, NULL); } void adf_state_destroy(void) { struct entry_proc_events *proc_events = NULL; adf_service_unregister(&adf_state_hndl); mtx_lock(&callout_mtx); callout_stop(&callout); mtx_unlock(&callout_mtx); mtx_destroy(&callout_mtx); mtx_lock(&mtx); while (!SLIST_EMPTY(&proc_events_head)) { proc_events = SLIST_FIRST(&proc_events_head); SLIST_REMOVE_HEAD(&proc_events_head, entries_proc_events); free(proc_events, M_QAT); } mtx_unlock(&mtx); mtx_destroy(&mtx); destroy_dev(adf_state_dev); } static int adf_state_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct adf_state_priv_data *prv_data = NULL; struct entry_proc_events *entry_proc_events = NULL; int ret = 0; prv_data = malloc(sizeof(*prv_data), M_QAT, M_WAITOK | M_ZERO); entry_proc_events = malloc(sizeof(struct entry_proc_events), M_QAT, M_WAITOK | M_ZERO); mtx_lock(&mtx); prv_data->cdev = dev; prv_data->cdev->si_drv1 = prv_data; knlist_init_mtx(&prv_data->rsel.si_note, &mtx); STAILQ_INIT(&prv_data->state_head); entry_proc_events->proc_events = prv_data; SLIST_INSERT_HEAD(&proc_events_head, entry_proc_events, entries_proc_events); mtx_unlock(&mtx); ret = devfs_set_cdevpriv(prv_data, adf_state_release); if (ret) { SLIST_REMOVE(&proc_events_head, entry_proc_events, entry_proc_events, entries_proc_events); free(entry_proc_events, M_QAT); free(prv_data, M_QAT); } callout_schedule(&callout, ADF_STATE_CALLOUT_TIME); return ret; } static int adf_state_read(struct cdev *dev, struct uio *uio, int ioflag) { int ret = 0; struct adf_state_priv_data *prv_data = NULL; struct state_head *state_head = NULL; struct entry_state *entry_state = NULL; struct adf_state *state = NULL; struct entry_proc_events *proc_events = NULL; mtx_lock(&mtx); ret = devfs_get_cdevpriv((void **)&prv_data); if (ret) { mtx_unlock(&mtx); return 0; } state_head = &prv_data->state_head; if (STAILQ_EMPTY(state_head)) { mtx_unlock(&mtx); return 0; } entry_state = STAILQ_FIRST(state_head); state = &entry_state->state; ret = uiomove(state, sizeof(struct adf_state), uio); if (!ret && !STAILQ_EMPTY(state_head)) { STAILQ_REMOVE_HEAD(state_head, entries_state); free(entry_state, M_QAT); } SLIST_FOREACH (proc_events, &proc_events_head, entries_proc_events) { if (!STAILQ_EMPTY(&proc_events->proc_events->state_head)) { prv_data = proc_events->proc_events; wakeup(prv_data); selwakeup(&prv_data->rsel); KNOTE_UNLOCKED(&prv_data->rsel.si_note, 0); } } mtx_unlock(&mtx); callout_schedule(&callout, ADF_STATE_CALLOUT_TIME); return ret; } static void adf_state_release(void *data) { struct adf_state_priv_data *prv_data = NULL; struct entry_state *entry_state = NULL; struct entry_proc_events *entry_proc_events = NULL; struct entry_proc_events *tmp = NULL; mtx_lock(&mtx); prv_data = (struct adf_state_priv_data *)data; knlist_delete(&prv_data->rsel.si_note, curthread, 1); knlist_destroy(&prv_data->rsel.si_note); 
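adf_state_init() and adf_state_destroy() above bracket a self-rearming callout that is bound to its own mutex, so the tick handler and callout_stop() serialize against each other. A minimal sketch of that arrangement, assuming hypothetical example_* names and a roughly one-second period:

static struct callout	example_callout;
static struct mtx	example_callout_mtx;

static void
example_tick(void *arg)
{
	/* periodic work goes here; the bound mutex is held on entry */
	callout_schedule(&example_callout, hz);	/* re-arm ~1 second out */
}

static void
example_callout_start(void)
{
	mtx_init(&example_callout_mtx, "example callout", NULL, MTX_DEF);
	callout_init_mtx(&example_callout, &example_callout_mtx, 0);
	callout_reset(&example_callout, hz, example_tick, NULL);
}

static void
example_callout_stop(void)
{
	mtx_lock(&example_callout_mtx);
	callout_stop(&example_callout);
	mtx_unlock(&example_callout_mtx);
	mtx_destroy(&example_callout_mtx);
}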
seldrain(&prv_data->rsel); while (!STAILQ_EMPTY(&prv_data->state_head)) { entry_state = STAILQ_FIRST(&prv_data->state_head); STAILQ_REMOVE_HEAD(&prv_data->state_head, entries_state); free(entry_state, M_QAT); } SLIST_FOREACH_SAFE (entry_proc_events, &proc_events_head, entries_proc_events, tmp) { if (entry_proc_events->proc_events == prv_data) { SLIST_REMOVE(&proc_events_head, entry_proc_events, entry_proc_events, entries_proc_events); free(entry_proc_events, M_QAT); } } free(prv_data, M_QAT); mtx_unlock(&mtx); } diff --git a/sys/dev/usb/usb_dev.c b/sys/dev/usb/usb_dev.c index c58c3b5f64d5..a736a12fc4f4 100644 --- a/sys/dev/usb/usb_dev.c +++ b/sys/dev/usb/usb_dev.c @@ -1,2471 +1,2471 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2006-2023 Hans Petter Selasky * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * * usb_dev.c - An abstraction layer for creating devices under /dev/... 
*/ #ifdef USB_GLOBAL_INCLUDE_FILE #include USB_GLOBAL_INCLUDE_FILE #else #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define USB_DEBUG_VAR usb_fifo_debug #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #endif /* USB_GLOBAL_INCLUDE_FILE */ #if USB_HAVE_UGEN #ifdef USB_DEBUG static int usb_fifo_debug = 0; static SYSCTL_NODE(_hw_usb, OID_AUTO, dev, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "USB device"); SYSCTL_INT(_hw_usb_dev, OID_AUTO, debug, CTLFLAG_RWTUN, &usb_fifo_debug, 0, "Debug Level"); #endif #define USB_UCRED struct ucred *ucred, /* prototypes */ static int usb_fifo_open(struct usb_cdev_privdata *, struct usb_fifo *, int); static void usb_fifo_close(struct usb_fifo *, int); static void usb_dev_init(void *); static void usb_dev_init_post(void *); static void usb_dev_uninit(void *); static int usb_fifo_uiomove(struct usb_fifo *, void *, int, struct uio *); static void usb_fifo_check_methods(struct usb_fifo_methods *); static struct usb_fifo *usb_fifo_alloc(struct mtx *); static struct usb_endpoint *usb_dev_get_ep(struct usb_device *, uint8_t, uint8_t); static void usb_loc_fill(struct usb_fs_privdata *, struct usb_cdev_privdata *); static void usb_close(void *); static usb_error_t usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *, int); static usb_error_t usb_usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *); static void usb_unref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *); static d_open_t usb_open; static d_ioctl_t usb_ioctl; static d_read_t usb_read; static d_write_t usb_write; static d_poll_t usb_poll; static d_kqfilter_t usb_kqfilter; static d_ioctl_t usb_static_ioctl; static usb_fifo_open_t usb_fifo_dummy_open; static usb_fifo_close_t usb_fifo_dummy_close; static usb_fifo_ioctl_t usb_fifo_dummy_ioctl; static usb_fifo_cmd_t usb_fifo_dummy_cmd; /* character device structure used for devices (/dev/ugenX.Y and /dev/uXXX) */ struct cdevsw usb_devsw = { .d_version = D_VERSION, .d_open = usb_open, .d_ioctl = usb_ioctl, .d_name = "usbdev", .d_flags = D_TRACKCLOSE, .d_read = usb_read, .d_write = usb_write, .d_poll = usb_poll, .d_kqfilter = usb_kqfilter, }; static struct cdev* usb_dev = NULL; /* character device structure used for /dev/usb */ static struct cdevsw usb_static_devsw = { .d_version = D_VERSION, .d_ioctl = usb_static_ioctl, .d_name = "usb" }; static TAILQ_HEAD(, usb_symlink) usb_sym_head; static struct sx usb_sym_lock; struct mtx usb_ref_lock; /*------------------------------------------------------------------------* * usb_loc_fill * * This is used to fill out a usb_cdev_privdata structure based on the * device's address as contained in usb_fs_privdata. *------------------------------------------------------------------------*/ static void usb_loc_fill(struct usb_fs_privdata* pd, struct usb_cdev_privdata *cpd) { cpd->bus_index = pd->bus_index; cpd->dev_index = pd->dev_index; cpd->ep_addr = pd->ep_addr; cpd->fifo_index = pd->fifo_index; } /*------------------------------------------------------------------------* * usb_ref_device * * This function is used to atomically refer an USB device by its * device location. If this function returns success the USB device * will not disappear until the USB device is unreferenced. 
* * Return values: * 0: Success, refcount incremented on the given USB device. * Else: Failure. *------------------------------------------------------------------------*/ static usb_error_t usb_ref_device(struct usb_cdev_privdata *cpd, struct usb_cdev_refdata *crd, int need_uref) { struct usb_fifo **ppf; struct usb_fifo *f; DPRINTFN(2, "cpd=%p need uref=%d\n", cpd, need_uref); /* clear all refs */ memset(crd, 0, sizeof(*crd)); mtx_lock(&usb_ref_lock); cpd->bus = devclass_get_softc(usb_devclass_ptr, cpd->bus_index); if (cpd->bus == NULL) { DPRINTFN(2, "no bus at %u\n", cpd->bus_index); goto error; } cpd->udev = cpd->bus->devices[cpd->dev_index]; if (cpd->udev == NULL) { DPRINTFN(2, "no device at %u\n", cpd->dev_index); goto error; } if (cpd->udev->state == USB_STATE_DETACHED && (need_uref != 2)) { DPRINTFN(2, "device is detached\n"); goto error; } if (need_uref) { DPRINTFN(2, "ref udev - needed\n"); if (cpd->udev->refcount == USB_DEV_REF_MAX) { DPRINTFN(2, "no dev ref\n"); goto error; } cpd->udev->refcount++; mtx_unlock(&usb_ref_lock); /* * We need to grab the enumeration SX-lock before * grabbing the FIFO refs to avoid deadlock at detach! */ crd->do_unlock = usbd_enum_lock_sig(cpd->udev); mtx_lock(&usb_ref_lock); /* * Set "is_uref" after grabbing the default SX lock */ crd->is_uref = 1; /* check for signal */ if (crd->do_unlock > 1) { crd->do_unlock = 0; goto error; } } /* check if we are doing an open */ if (cpd->fflags == 0) { /* use zero defaults */ } else { /* check for write */ if (cpd->fflags & FWRITE) { ppf = cpd->udev->fifo; f = ppf[cpd->fifo_index + USB_FIFO_TX]; crd->txfifo = f; crd->is_write = 1; /* ref */ if (f == NULL || f->refcount == USB_FIFO_REF_MAX) goto error; if (f->curr_cpd != cpd) goto error; /* check if USB-FS is active */ if (f->fs_ep_max != 0) { crd->is_usbfs = 1; } } /* check for read */ if (cpd->fflags & FREAD) { ppf = cpd->udev->fifo; f = ppf[cpd->fifo_index + USB_FIFO_RX]; crd->rxfifo = f; crd->is_read = 1; /* ref */ if (f == NULL || f->refcount == USB_FIFO_REF_MAX) goto error; if (f->curr_cpd != cpd) goto error; /* check if USB-FS is active */ if (f->fs_ep_max != 0) { crd->is_usbfs = 1; } } } /* when everything is OK we increment the refcounts */ if (crd->is_write) { DPRINTFN(2, "ref write\n"); crd->txfifo->refcount++; } if (crd->is_read) { DPRINTFN(2, "ref read\n"); crd->rxfifo->refcount++; } mtx_unlock(&usb_ref_lock); return (0); error: if (crd->do_unlock) usbd_enum_unlock(cpd->udev); if (crd->is_uref) { if (--(cpd->udev->refcount) == 0) cv_broadcast(&cpd->udev->ref_cv); } mtx_unlock(&usb_ref_lock); DPRINTFN(2, "fail\n"); /* clear all refs */ memset(crd, 0, sizeof(*crd)); return (USB_ERR_INVAL); } /*------------------------------------------------------------------------* * usb_usb_ref_device * * This function is used to upgrade an USB reference to include the * USB device reference on a USB location. * * Return values: * 0: Success, refcount incremented on the given USB device. * Else: Failure. *------------------------------------------------------------------------*/ static usb_error_t usb_usb_ref_device(struct usb_cdev_privdata *cpd, struct usb_cdev_refdata *crd) { /* * Check if we already got an USB reference on this location: */ if (crd->is_uref) return (0); /* success */ /* * To avoid deadlock at detach we need to drop the FIFO ref * and re-acquire a new ref! 
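usb_ref_device() above and usb_unref_device() just below keep the device and FIFO reference counts under usb_ref_lock and use a condition variable so the detach path can wait for the counts to drain. Stripped of the USB specifics, the pattern is roughly the following sketch, where obj, obj_lock, OBJ_REF_MAX and ref_cv are hypothetical names:

static int
obj_ref(struct obj *o)
{
	mtx_lock(&obj_lock);
	if (o->refcount == OBJ_REF_MAX) {
		mtx_unlock(&obj_lock);
		return (EBUSY);			/* too many concurrent users */
	}
	o->refcount++;
	mtx_unlock(&obj_lock);
	return (0);
}

static void
obj_unref(struct obj *o)
{
	mtx_lock(&obj_lock);
	if (--o->refcount == 0)
		cv_broadcast(&o->ref_cv);	/* unblock a waiting detach/drain */
	mtx_unlock(&obj_lock);
}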
*/ usb_unref_device(cpd, crd); return (usb_ref_device(cpd, crd, 1 /* need uref */)); } /*------------------------------------------------------------------------* * usb_unref_device * * This function will release the reference count by one unit for the * given USB device. *------------------------------------------------------------------------*/ static void usb_unref_device(struct usb_cdev_privdata *cpd, struct usb_cdev_refdata *crd) { DPRINTFN(2, "cpd=%p is_uref=%d\n", cpd, crd->is_uref); if (crd->do_unlock) usbd_enum_unlock(cpd->udev); mtx_lock(&usb_ref_lock); if (crd->is_read) { if (--(crd->rxfifo->refcount) == 0) { cv_signal(&crd->rxfifo->cv_drain); } crd->is_read = 0; } if (crd->is_write) { if (--(crd->txfifo->refcount) == 0) { cv_signal(&crd->txfifo->cv_drain); } crd->is_write = 0; } if (crd->is_uref) { crd->is_uref = 0; if (--(cpd->udev->refcount) == 0) cv_broadcast(&cpd->udev->ref_cv); } mtx_unlock(&usb_ref_lock); } static struct usb_fifo * usb_fifo_alloc(struct mtx *mtx) { struct usb_fifo *f; f = malloc(sizeof(*f), M_USBDEV, M_WAITOK | M_ZERO); cv_init(&f->cv_io, "FIFO-IO"); cv_init(&f->cv_drain, "FIFO-DRAIN"); sx_init(&f->fs_fastpath_lock, "FIFO-FP"); f->priv_mtx = mtx; f->refcount = 1; knlist_init_mtx(&f->selinfo.si_note, mtx); return (f); } /*------------------------------------------------------------------------* * usb_fifo_create *------------------------------------------------------------------------*/ static int usb_fifo_create(struct usb_cdev_privdata *cpd, struct usb_cdev_refdata *crd) { struct usb_device *udev = cpd->udev; struct usb_fifo *f; struct usb_endpoint *ep; uint8_t n; uint8_t is_tx; uint8_t is_rx; uint8_t no_null; uint8_t is_busy; int e = cpd->ep_addr; is_tx = (cpd->fflags & FWRITE) ? 1 : 0; is_rx = (cpd->fflags & FREAD) ? 1 : 0; no_null = 1; is_busy = 0; /* Preallocated FIFO */ if (e < 0) { DPRINTFN(5, "Preallocated FIFO\n"); if (is_tx) { f = udev->fifo[cpd->fifo_index + USB_FIFO_TX]; if (f == NULL) return (EINVAL); crd->txfifo = f; } if (is_rx) { f = udev->fifo[cpd->fifo_index + USB_FIFO_RX]; if (f == NULL) return (EINVAL); crd->rxfifo = f; } return (0); } KASSERT(e >= 0 && e <= 15, ("endpoint %d out of range", e)); /* search for a free FIFO slot */ DPRINTFN(5, "Endpoint device, searching for 0x%02x\n", e); for (n = 0;; n += 2) { if (n == USB_FIFO_MAX) { if (no_null) { no_null = 0; n = 0; } else { /* end of FIFOs reached */ DPRINTFN(5, "out of FIFOs\n"); return (ENOMEM); } } /* Check for TX FIFO */ if (is_tx) { f = udev->fifo[n + USB_FIFO_TX]; if (f != NULL) { if (f->dev_ep_index != e) { /* wrong endpoint index */ continue; } if (f->curr_cpd != NULL) { /* FIFO is opened */ is_busy = 1; continue; } } else if (no_null) { continue; } } /* Check for RX FIFO */ if (is_rx) { f = udev->fifo[n + USB_FIFO_RX]; if (f != NULL) { if (f->dev_ep_index != e) { /* wrong endpoint index */ continue; } if (f->curr_cpd != NULL) { /* FIFO is opened */ is_busy = 1; continue; } } else if (no_null) { continue; } } break; } if (no_null == 0) { if (e >= (USB_EP_MAX / 2)) { /* we don't create any endpoints in this range */ DPRINTFN(5, "ep out of range\n"); return (is_busy ? EBUSY : EINVAL); } } if ((e != 0) && is_busy) { /* * Only the default control endpoint is allowed to be * opened multiple times! 
*/ DPRINTFN(5, "busy\n"); return (EBUSY); } /* Check TX FIFO */ if (is_tx && (udev->fifo[n + USB_FIFO_TX] == NULL)) { ep = usb_dev_get_ep(udev, e, USB_FIFO_TX); DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_TX); if (ep == NULL) { DPRINTFN(5, "dev_get_endpoint returned NULL\n"); return (EINVAL); } f = usb_fifo_alloc(&udev->device_mtx); if (f == NULL) { DPRINTFN(5, "could not alloc tx fifo\n"); return (ENOMEM); } /* update some fields */ f->fifo_index = n + USB_FIFO_TX; f->dev_ep_index = e; f->priv_sc0 = ep; f->methods = &usb_ugen_methods; f->iface_index = ep->iface_index; f->udev = udev; mtx_lock(&usb_ref_lock); udev->fifo[n + USB_FIFO_TX] = f; mtx_unlock(&usb_ref_lock); } /* Check RX FIFO */ if (is_rx && (udev->fifo[n + USB_FIFO_RX] == NULL)) { ep = usb_dev_get_ep(udev, e, USB_FIFO_RX); DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_RX); if (ep == NULL) { DPRINTFN(5, "dev_get_endpoint returned NULL\n"); return (EINVAL); } f = usb_fifo_alloc(&udev->device_mtx); if (f == NULL) { DPRINTFN(5, "could not alloc rx fifo\n"); return (ENOMEM); } /* update some fields */ f->fifo_index = n + USB_FIFO_RX; f->dev_ep_index = e; f->priv_sc0 = ep; f->methods = &usb_ugen_methods; f->iface_index = ep->iface_index; f->udev = udev; mtx_lock(&usb_ref_lock); udev->fifo[n + USB_FIFO_RX] = f; mtx_unlock(&usb_ref_lock); } if (is_tx) { crd->txfifo = udev->fifo[n + USB_FIFO_TX]; } if (is_rx) { crd->rxfifo = udev->fifo[n + USB_FIFO_RX]; } /* fill out fifo index */ DPRINTFN(5, "fifo index = %d\n", n); cpd->fifo_index = n; /* complete */ return (0); } void usb_fifo_free(struct usb_fifo *f) { uint8_t n; if (f == NULL) { /* be NULL safe */ return; } /* destroy symlink devices, if any */ for (n = 0; n != 2; n++) { if (f->symlink[n]) { usb_free_symlink(f->symlink[n]); f->symlink[n] = NULL; } } mtx_lock(&usb_ref_lock); /* delink ourselves to stop calls from userland */ if ((f->fifo_index < USB_FIFO_MAX) && (f->udev != NULL) && (f->udev->fifo[f->fifo_index] == f)) { f->udev->fifo[f->fifo_index] = NULL; } else { DPRINTFN(0, "USB FIFO %p has not been linked\n", f); } /* decrease refcount */ f->refcount--; /* need to wait until all callers have exited */ while (f->refcount != 0) { mtx_unlock(&usb_ref_lock); /* avoid LOR */ mtx_lock(f->priv_mtx); /* prevent write flush, if any */ f->flag_iserror = 1; /* get I/O thread out of any sleep state */ if (f->flag_sleeping) { f->flag_sleeping = 0; cv_broadcast(&f->cv_io); } mtx_unlock(f->priv_mtx); mtx_lock(&usb_ref_lock); /* * Check if the "f->refcount" variable reached zero * during the unlocked time before entering wait: */ if (f->refcount == 0) break; /* wait for sync */ cv_wait(&f->cv_drain, &usb_ref_lock); } mtx_unlock(&usb_ref_lock); /* take care of closing the device here, if any */ usb_fifo_close(f, 0); cv_destroy(&f->cv_io); cv_destroy(&f->cv_drain); sx_destroy(&f->fs_fastpath_lock); knlist_clear(&f->selinfo.si_note, 0); seldrain(&f->selinfo); knlist_destroy(&f->selinfo.si_note); free(f, M_USBDEV); } static struct usb_endpoint * usb_dev_get_ep(struct usb_device *udev, uint8_t ep_index, uint8_t dir) { struct usb_endpoint *ep; uint8_t ep_dir; if (ep_index == 0) { ep = &udev->ctrl_ep; } else { if (dir == USB_FIFO_RX) { if (udev->flags.usb_mode == USB_MODE_HOST) { ep_dir = UE_DIR_IN; } else { ep_dir = UE_DIR_OUT; } } else { if (udev->flags.usb_mode == USB_MODE_HOST) { ep_dir = UE_DIR_OUT; } else { ep_dir = UE_DIR_IN; } } ep = usbd_get_ep_by_addr(udev, ep_index | ep_dir); } if (ep == NULL) { /* if the endpoint does not exist then return */ return (NULL); } if 
(ep->edesc == NULL) { /* invalid endpoint */ return (NULL); } return (ep); /* success */ } /*------------------------------------------------------------------------* * usb_fifo_open * * Returns: * 0: Success * Else: Failure *------------------------------------------------------------------------*/ static int usb_fifo_open(struct usb_cdev_privdata *cpd, struct usb_fifo *f, int fflags) { int err; if (f == NULL) { /* no FIFO there */ DPRINTFN(2, "no FIFO\n"); return (ENXIO); } /* remove FWRITE and FREAD flags */ fflags &= ~(FWRITE | FREAD); /* set correct file flags */ if ((f->fifo_index & 1) == USB_FIFO_TX) { fflags |= FWRITE; } else { fflags |= FREAD; } /* check if we are already opened */ /* we don't need any locks when checking this variable */ if (f->curr_cpd != NULL) { err = EBUSY; goto done; } /* reset short flag before open */ f->flag_short = 0; /* call open method */ err = (f->methods->f_open) (f, fflags); if (err) { goto done; } mtx_lock(f->priv_mtx); /* reset sleep flag */ f->flag_sleeping = 0; /* reset error flag */ f->flag_iserror = 0; /* reset complete flag */ f->flag_iscomplete = 0; /* reset select flag */ f->flag_isselect = 0; /* reset flushing flag */ f->flag_flushing = 0; /* reset ASYNC proc flag */ f->async_p = NULL; mtx_lock(&usb_ref_lock); /* flag the fifo as opened to prevent others */ f->curr_cpd = cpd; mtx_unlock(&usb_ref_lock); /* reset queue */ usb_fifo_reset(f); mtx_unlock(f->priv_mtx); done: return (err); } /*------------------------------------------------------------------------* * usb_fifo_reset *------------------------------------------------------------------------*/ void usb_fifo_reset(struct usb_fifo *f) { struct usb_mbuf *m; if (f == NULL) { return; } while (1) { USB_IF_DEQUEUE(&f->used_q, m); if (m) { USB_IF_ENQUEUE(&f->free_q, m); } else { break; } } /* reset have fragment flag */ f->flag_have_fragment = 0; } /*------------------------------------------------------------------------* * usb_fifo_close *------------------------------------------------------------------------*/ static void usb_fifo_close(struct usb_fifo *f, int fflags) { int err; /* check if we are not opened */ if (f->curr_cpd == NULL) { /* nothing to do - already closed */ return; } mtx_lock(f->priv_mtx); /* clear current cdev private data pointer */ mtx_lock(&usb_ref_lock); f->curr_cpd = NULL; mtx_unlock(&usb_ref_lock); /* check if we are watched by kevent */ KNOTE_LOCKED(&f->selinfo.si_note, 0); /* check if we are selected */ if (f->flag_isselect) { selwakeup(&f->selinfo); f->flag_isselect = 0; } /* check if a thread wants SIGIO */ if (f->async_p != NULL) { PROC_LOCK(f->async_p); kern_psignal(f->async_p, SIGIO); PROC_UNLOCK(f->async_p); f->async_p = NULL; } /* remove FWRITE and FREAD flags */ fflags &= ~(FWRITE | FREAD); /* flush written data, if any */ if ((f->fifo_index & 1) == USB_FIFO_TX) { if (!f->flag_iserror) { /* set flushing flag */ f->flag_flushing = 1; /* get the last packet in */ if (f->flag_have_fragment) { struct usb_mbuf *m; f->flag_have_fragment = 0; USB_IF_DEQUEUE(&f->free_q, m); if (m) { USB_IF_ENQUEUE(&f->used_q, m); } } /* start write transfer, if not already started */ (f->methods->f_start_write) (f); /* check if flushed already */ while (f->flag_flushing && (!f->flag_iserror)) { /* wait until all data has been written */ f->flag_sleeping = 1; err = cv_timedwait_sig(&f->cv_io, f->priv_mtx, USB_MS_TO_TICKS(USB_DEFAULT_TIMEOUT)); if (err) { DPRINTF("signal received\n"); break; } } } fflags |= FWRITE; /* stop write transfer, if not already stopped */ 
(f->methods->f_stop_write) (f); } else { fflags |= FREAD; /* stop write transfer, if not already stopped */ (f->methods->f_stop_read) (f); } /* check if we are sleeping */ if (f->flag_sleeping) { DPRINTFN(2, "Sleeping at close!\n"); } mtx_unlock(f->priv_mtx); /* call close method */ (f->methods->f_close) (f, fflags); DPRINTF("closed\n"); } /*------------------------------------------------------------------------* * usb_open - cdev callback *------------------------------------------------------------------------*/ static int usb_open(struct cdev *dev, int fflags, int devtype, struct thread *td) { struct usb_fs_privdata* pd = (struct usb_fs_privdata*)dev->si_drv1; struct usb_cdev_refdata refs; struct usb_cdev_privdata *cpd; int err; DPRINTFN(2, "%s fflags=0x%08x\n", devtoname(dev), fflags); KASSERT(fflags & (FREAD|FWRITE), ("invalid open flags")); if (((fflags & FREAD) && !(pd->mode & FREAD)) || ((fflags & FWRITE) && !(pd->mode & FWRITE))) { DPRINTFN(2, "access mode not supported\n"); return (EPERM); } cpd = malloc(sizeof(*cpd), M_USBDEV, M_WAITOK | M_ZERO); usb_loc_fill(pd, cpd); err = usb_ref_device(cpd, &refs, 1); if (err) { DPRINTFN(2, "cannot ref device\n"); free(cpd, M_USBDEV); return (ENXIO); } cpd->fflags = fflags; /* access mode for open lifetime */ /* create FIFOs, if any */ err = usb_fifo_create(cpd, &refs); /* check for error */ if (err) { DPRINTFN(2, "cannot create fifo\n"); usb_unref_device(cpd, &refs); free(cpd, M_USBDEV); return (err); } if (fflags & FREAD) { err = usb_fifo_open(cpd, refs.rxfifo, fflags); if (err) { DPRINTFN(2, "read open failed\n"); usb_unref_device(cpd, &refs); free(cpd, M_USBDEV); return (err); } } if (fflags & FWRITE) { err = usb_fifo_open(cpd, refs.txfifo, fflags); if (err) { DPRINTFN(2, "write open failed\n"); if (fflags & FREAD) { usb_fifo_close(refs.rxfifo, fflags); } usb_unref_device(cpd, &refs); free(cpd, M_USBDEV); return (err); } } usb_unref_device(cpd, &refs); devfs_set_cdevpriv(cpd, usb_close); return (0); } /*------------------------------------------------------------------------* * usb_close - cdev callback *------------------------------------------------------------------------*/ static void usb_close(void *arg) { struct usb_cdev_refdata refs; struct usb_cdev_privdata *cpd = arg; int err; DPRINTFN(2, "cpd=%p\n", cpd); err = usb_ref_device(cpd, &refs, 2 /* uref and allow detached state */); if (err) { DPRINTFN(2, "Cannot grab USB reference when " "closing USB file handle\n"); goto done; } if (cpd->fflags & FREAD) { usb_fifo_close(refs.rxfifo, cpd->fflags); } if (cpd->fflags & FWRITE) { usb_fifo_close(refs.txfifo, cpd->fflags); } usb_unref_device(cpd, &refs); done: free(cpd, M_USBDEV); } static void usb_dev_init(void *arg) { mtx_init(&usb_ref_lock, "USB ref mutex", NULL, MTX_DEF); sx_init(&usb_sym_lock, "USB sym mutex"); TAILQ_INIT(&usb_sym_head); /* check the UGEN methods */ usb_fifo_check_methods(&usb_ugen_methods); } SYSINIT(usb_dev_init, SI_SUB_KLD, SI_ORDER_FIRST, usb_dev_init, NULL); static void usb_dev_init_post(void *arg) { /* * Create /dev/usb - this is needed for usbconfig(8), which * needs a well-known device name to access. 
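usb_dev_init_post() here creates the well-known /dev/usb node once the scheduler-kick stage is reached. The same create/destroy pairing for a driver-global node looks roughly like the following sketch, where the example_* names, cdevsw and node name are placeholders:

static struct cdev *example_dev;

static void
example_dev_init_post(void *arg)
{
	example_dev = make_dev(&example_devsw, 0, UID_ROOT, GID_OPERATOR,
	    0644, "example");
	if (example_dev == NULL)
		printf("example: could not create device node\n");
}
SYSINIT(example_dev_init_post, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST,
    example_dev_init_post, NULL);

static void
example_dev_uninit(void *arg)
{
	if (example_dev != NULL)
		destroy_dev(example_dev);
}
SYSUNINIT(example_dev_uninit, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY,
    example_dev_uninit, NULL);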
*/ usb_dev = make_dev(&usb_static_devsw, 0, UID_ROOT, GID_OPERATOR, 0644, USB_DEVICE_NAME); if (usb_dev == NULL) { DPRINTFN(0, "Could not create usb bus device\n"); } } SYSINIT(usb_dev_init_post, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, usb_dev_init_post, NULL); static void usb_dev_uninit(void *arg) { if (usb_dev != NULL) { destroy_dev(usb_dev); usb_dev = NULL; } mtx_destroy(&usb_ref_lock); sx_destroy(&usb_sym_lock); } SYSUNINIT(usb_dev_uninit, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, usb_dev_uninit, NULL); static int usb_ioctl_f_sub(struct usb_fifo *f, u_long cmd, void *addr, struct thread *td) { int error = 0; switch (cmd) { case FIODTYPE: *(int *)addr = 0; /* character device */ break; case FIONBIO: /* handled by upper FS layer */ break; case FIOASYNC: if (*(int *)addr) { if (f->async_p != NULL) { error = EBUSY; break; } f->async_p = USB_TD_GET_PROC(td); } else { f->async_p = NULL; } break; /* XXX this is not the most general solution */ case TIOCSPGRP: if (f->async_p == NULL) { error = EINVAL; break; } if (*(int *)addr != USB_PROC_GET_GID(f->async_p)) { error = EPERM; break; } break; default: return (ENOIOCTL); } DPRINTFN(3, "cmd 0x%lx = %d\n", cmd, error); return (error); } /*------------------------------------------------------------------------* * usb_ioctl - cdev callback *------------------------------------------------------------------------*/ static int usb_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int fflag, struct thread* td) { struct usb_cdev_refdata refs; struct usb_cdev_privdata* cpd; struct usb_fifo *f; int fflags; int err; DPRINTFN(2, "cmd=0x%lx\n", cmd); err = devfs_get_cdevpriv((void **)&cpd); if (err != 0) return (err); /* * Performance optimisation: We try to check for IOCTL's that * don't need the USB reference first. Then we grab the USB * reference if we need it! 
*/ err = usb_ref_device(cpd, &refs, 0 /* no uref */ ); if (err) return (ENXIO); fflags = cpd->fflags; f = NULL; /* set default value */ err = ENOIOCTL; /* set default value */ if (fflags & FWRITE) { f = refs.txfifo; err = usb_ioctl_f_sub(f, cmd, addr, td); } if (fflags & FREAD) { f = refs.rxfifo; err = usb_ioctl_f_sub(f, cmd, addr, td); } KASSERT(f != NULL, ("fifo not found")); if (err != ENOIOCTL) goto done; err = (f->methods->f_ioctl) (f, cmd, addr, fflags); DPRINTFN(2, "f_ioctl cmd 0x%lx = %d\n", cmd, err); if (err != ENOIOCTL) goto done; if (usb_usb_ref_device(cpd, &refs)) { /* we lost the reference */ return (ENXIO); } err = (f->methods->f_ioctl_post) (f, cmd, addr, fflags); DPRINTFN(2, "f_ioctl_post cmd 0x%lx = %d\n", cmd, err); if (err == ENOIOCTL) err = ENOTTY; if (err) goto done; /* Wait for re-enumeration, if any */ while (f->udev->re_enumerate_wait != USB_RE_ENUM_DONE) { usb_unref_device(cpd, &refs); usb_pause_mtx(NULL, hz / 128); while (usb_ref_device(cpd, &refs, 1 /* need uref */)) { if (usb_ref_device(cpd, &refs, 0)) { /* device no longer exists */ return (ENXIO); } usb_unref_device(cpd, &refs); usb_pause_mtx(NULL, hz / 128); } } done: usb_unref_device(cpd, &refs); return (err); } static void usb_filter_detach(struct knote *kn) { struct usb_fifo *f = kn->kn_hook; knlist_remove(&f->selinfo.si_note, kn, 0); } static int usb_filter_write(struct knote *kn, long hint) { struct usb_cdev_privdata* cpd; struct usb_fifo *f; struct usb_mbuf *m; DPRINTFN(2, "\n"); f = kn->kn_hook; USB_MTX_ASSERT(f->priv_mtx, MA_OWNED); cpd = f->curr_cpd; if (cpd == NULL) { m = (void *)1; } else if (f->fs_ep_max == 0) { if (f->flag_iserror) { /* we got an error */ m = (void *)1; } else { if (f->queue_data == NULL) { /* * start write transfer, if not * already started */ (f->methods->f_start_write) (f); } /* check if any packets are available */ USB_IF_POLL(&f->free_q, m); } } else { if (f->flag_iscomplete) { m = (void *)1; } else { m = NULL; } } return (m ? 1 : 0); } static int usb_filter_read(struct knote *kn, long hint) { struct usb_cdev_privdata* cpd; struct usb_fifo *f; struct usb_mbuf *m; DPRINTFN(2, "\n"); f = kn->kn_hook; USB_MTX_ASSERT(f->priv_mtx, MA_OWNED); cpd = f->curr_cpd; if (cpd == NULL) { m = (void *)1; } else if (f->fs_ep_max == 0) { if (f->flag_iserror) { /* we have an error */ m = (void *)1; } else { if (f->queue_data == NULL) { /* * start read transfer, if not * already started */ (f->methods->f_start_read) (f); } /* check if any packets are available */ USB_IF_POLL(&f->used_q, m); /* start reading data, if any */ if (m == NULL) (f->methods->f_start_read) (f); } } else { if (f->flag_iscomplete) { m = (void *)1; } else { m = NULL; } } return (m ? 
1 : 0); } -static struct filterops usb_filtops_write = { +static const struct filterops usb_filtops_write = { .f_isfd = 1, .f_detach = usb_filter_detach, .f_event = usb_filter_write, }; -static struct filterops usb_filtops_read = { +static const struct filterops usb_filtops_read = { .f_isfd = 1, .f_detach = usb_filter_detach, .f_event = usb_filter_read, }; /* ARGSUSED */ static int usb_kqfilter(struct cdev* dev, struct knote *kn) { struct usb_cdev_refdata refs; struct usb_cdev_privdata* cpd; struct usb_fifo *f; int fflags; int err = EINVAL; DPRINTFN(2, "\n"); if (devfs_get_cdevpriv((void **)&cpd) != 0 || usb_ref_device(cpd, &refs, 0) != 0) return (ENXIO); fflags = cpd->fflags; /* Figure out who needs service */ switch (kn->kn_filter) { case EVFILT_WRITE: if (fflags & FWRITE) { f = refs.txfifo; kn->kn_fop = &usb_filtops_write; err = 0; } break; case EVFILT_READ: if (fflags & FREAD) { f = refs.rxfifo; kn->kn_fop = &usb_filtops_read; err = 0; } break; default: err = EOPNOTSUPP; break; } if (err == 0) { kn->kn_hook = f; mtx_lock(f->priv_mtx); knlist_add(&f->selinfo.si_note, kn, 1); mtx_unlock(f->priv_mtx); } usb_unref_device(cpd, &refs); return (err); } /* ARGSUSED */ static int usb_poll(struct cdev* dev, int events, struct thread* td) { struct usb_cdev_refdata refs; struct usb_cdev_privdata* cpd; struct usb_fifo *f; struct usb_mbuf *m; int fflags, revents; if (devfs_get_cdevpriv((void **)&cpd) != 0 || usb_ref_device(cpd, &refs, 0) != 0) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); fflags = cpd->fflags; /* Figure out who needs service */ revents = 0; if ((events & (POLLOUT | POLLWRNORM)) && (fflags & FWRITE)) { f = refs.txfifo; mtx_lock(f->priv_mtx); if (!refs.is_usbfs) { if (f->flag_iserror) { /* we got an error */ m = (void *)1; } else { if (f->queue_data == NULL) { /* * start write transfer, if not * already started */ (f->methods->f_start_write) (f); } /* check if any packets are available */ USB_IF_POLL(&f->free_q, m); } } else { if (f->flag_iscomplete) { m = (void *)1; } else { m = NULL; } } if (m) { revents |= events & (POLLOUT | POLLWRNORM); } else { f->flag_isselect = 1; selrecord(td, &f->selinfo); } mtx_unlock(f->priv_mtx); } if ((events & (POLLIN | POLLRDNORM)) && (fflags & FREAD)) { f = refs.rxfifo; mtx_lock(f->priv_mtx); if (!refs.is_usbfs) { if (f->flag_iserror) { /* we have an error */ m = (void *)1; } else { if (f->queue_data == NULL) { /* * start read transfer, if not * already started */ (f->methods->f_start_read) (f); } /* check if any packets are available */ USB_IF_POLL(&f->used_q, m); } } else { if (f->flag_iscomplete) { m = (void *)1; } else { m = NULL; } } if (m) { revents |= events & (POLLIN | POLLRDNORM); } else { f->flag_isselect = 1; selrecord(td, &f->selinfo); if (!refs.is_usbfs) { /* start reading data */ (f->methods->f_start_read) (f); } } mtx_unlock(f->priv_mtx); } usb_unref_device(cpd, &refs); return (revents); } static int usb_read(struct cdev *dev, struct uio *uio, int ioflag) { struct usb_cdev_refdata refs; struct usb_cdev_privdata* cpd; struct usb_fifo *f; struct usb_mbuf *m; int io_len; int err; uint8_t tr_data = 0; err = devfs_get_cdevpriv((void **)&cpd); if (err != 0) return (err); err = usb_ref_device(cpd, &refs, 0 /* no uref */ ); if (err) return (ENXIO); f = refs.rxfifo; if (f == NULL) { /* should not happen */ usb_unref_device(cpd, &refs); return (EPERM); } mtx_lock(f->priv_mtx); /* check for permanent read error */ if (f->flag_iserror) { err = EIO; goto done; } /* check if USB-FS interface is active */ if (refs.is_usbfs) { /* * 
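usb_poll() above follows the usual cdev poll contract: return the readiness bits that are already true, otherwise selrecord() the polling thread so a later selwakeup()/KNOTE on the same selinfo wakes it up. For a read-only queue the skeleton is roughly the sketch below, with hypothetical example_* names (a softc holding a mutex, a have_data flag and a struct selinfo):

static int
example_poll(struct cdev *dev, int events, struct thread *td)
{
	struct example_softc *sc = dev->si_drv1;
	int revents = 0;

	mtx_lock(&sc->mtx);
	if ((events & (POLLIN | POLLRDNORM)) != 0) {
		if (sc->have_data != 0)
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(td, &sc->rsel);	/* wait for selwakeup() */
	}
	mtx_unlock(&sc->mtx);
	return (revents);
}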
The queue is used for events that should be * retrieved using the "USB_FS_COMPLETE" ioctl. */ err = EINVAL; goto done; } while (uio->uio_resid > 0) { USB_IF_DEQUEUE(&f->used_q, m); if (m == NULL) { /* start read transfer, if not already started */ (f->methods->f_start_read) (f); if (ioflag & IO_NDELAY) { if (tr_data) { /* return length before error */ break; } err = EWOULDBLOCK; break; } DPRINTF("sleeping\n"); err = usb_fifo_wait(f); if (err) { break; } continue; } if (f->methods->f_filter_read) { /* * Sometimes it is convenient to process data at the * expense of a userland process instead of a kernel * process. */ (f->methods->f_filter_read) (f, m); } tr_data = 1; io_len = MIN(m->cur_data_len, uio->uio_resid); DPRINTFN(2, "transfer %d bytes from %p\n", io_len, m->cur_data_ptr); err = usb_fifo_uiomove(f, m->cur_data_ptr, io_len, uio); m->cur_data_len -= io_len; m->cur_data_ptr += io_len; if (m->cur_data_len == 0) { uint8_t last_packet; last_packet = m->last_packet; USB_IF_ENQUEUE(&f->free_q, m); if (last_packet) { /* keep framing */ break; } } else { USB_IF_PREPEND(&f->used_q, m); } if (err) { break; } } done: mtx_unlock(f->priv_mtx); usb_unref_device(cpd, &refs); return (err); } static int usb_write(struct cdev *dev, struct uio *uio, int ioflag) { struct usb_cdev_refdata refs; struct usb_cdev_privdata* cpd; struct usb_fifo *f; struct usb_mbuf *m; uint8_t *pdata; int io_len; int err; uint8_t tr_data = 0; DPRINTFN(2, "\n"); err = devfs_get_cdevpriv((void **)&cpd); if (err != 0) return (err); err = usb_ref_device(cpd, &refs, 0 /* no uref */ ); if (err) return (ENXIO); f = refs.txfifo; if (f == NULL) { /* should not happen */ usb_unref_device(cpd, &refs); return (EPERM); } mtx_lock(f->priv_mtx); /* check for permanent write error */ if (f->flag_iserror) { err = EIO; goto done; } /* check if USB-FS interface is active */ if (refs.is_usbfs) { /* * The queue is used for events that should be * retrieved using the "USB_FS_COMPLETE" ioctl. */ err = EINVAL; goto done; } if (f->queue_data == NULL) { /* start write transfer, if not already started */ (f->methods->f_start_write) (f); } /* we allow writing zero length data */ do { USB_IF_DEQUEUE(&f->free_q, m); if (m == NULL) { if (ioflag & IO_NDELAY) { if (tr_data) { /* return length before error */ break; } err = EWOULDBLOCK; break; } DPRINTF("sleeping\n"); err = usb_fifo_wait(f); if (err) { break; } continue; } tr_data = 1; if (f->flag_have_fragment == 0) { USB_MBUF_RESET(m); io_len = m->cur_data_len; pdata = m->cur_data_ptr; if (io_len > uio->uio_resid) io_len = uio->uio_resid; m->cur_data_len = io_len; } else { io_len = m->max_data_len - m->cur_data_len; pdata = m->cur_data_ptr + m->cur_data_len; if (io_len > uio->uio_resid) io_len = uio->uio_resid; m->cur_data_len += io_len; } DPRINTFN(2, "transfer %d bytes to %p\n", io_len, pdata); err = usb_fifo_uiomove(f, pdata, io_len, uio); if (err) { f->flag_have_fragment = 0; USB_IF_ENQUEUE(&f->free_q, m); break; } /* check if the buffer is ready to be transmitted */ if ((f->flag_write_defrag == 0) || (m->cur_data_len == m->max_data_len)) { f->flag_have_fragment = 0; /* * Check for write filter: * * Sometimes it is convenient to process data * at the expense of a userland process * instead of a kernel process. 
*/ if (f->methods->f_filter_write) { (f->methods->f_filter_write) (f, m); } /* Put USB mbuf in the used queue */ USB_IF_ENQUEUE(&f->used_q, m); /* Start writing data, if not already started */ (f->methods->f_start_write) (f); } else { /* Wait for more data or close */ f->flag_have_fragment = 1; USB_IF_PREPEND(&f->free_q, m); } } while (uio->uio_resid > 0); done: mtx_unlock(f->priv_mtx); usb_unref_device(cpd, &refs); return (err); } int usb_static_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { union { struct usb_read_dir *urd; #ifdef COMPAT_FREEBSD32 struct usb_read_dir32 *urd32; #endif void* data; } u; int err; u.data = data; switch (cmd) { case USB_READ_DIR: err = usb_read_symlink(u.urd->urd_data, u.urd->urd_startentry, u.urd->urd_maxlen); break; #ifdef COMPAT_FREEBSD32 case USB_READ_DIR32: err = usb_read_symlink(PTRIN(u.urd32->urd_data), u.urd32->urd_startentry, u.urd32->urd_maxlen); break; #endif case USB_DEV_QUIRK_GET: case USB_QUIRK_NAME_GET: case USB_DEV_QUIRK_ADD: case USB_DEV_QUIRK_REMOVE: err = usb_quirk_ioctl_p(cmd, data, fflag, td); break; case USB_GET_TEMPLATE: *(int *)data = usb_template; err = 0; break; case USB_SET_TEMPLATE: err = priv_check(curthread, PRIV_DRIVER); if (err) break; usb_template = *(int *)data; break; default: err = ENOTTY; break; } return (err); } static int usb_fifo_uiomove(struct usb_fifo *f, void *cp, int n, struct uio *uio) { int error; mtx_unlock(f->priv_mtx); /* * "uiomove()" can sleep so one needs to make a wrapper, * exiting the mutex and checking things: */ error = uiomove(cp, n, uio); mtx_lock(f->priv_mtx); return (error); } int usb_fifo_wait(struct usb_fifo *f) { int err; USB_MTX_ASSERT(f->priv_mtx, MA_OWNED); if (f->flag_iserror) { /* we are gone */ return (EIO); } f->flag_sleeping = 1; err = cv_wait_sig(&f->cv_io, f->priv_mtx); if (f->flag_iserror) { /* we are gone */ err = EIO; } return (err); } void usb_fifo_signal(struct usb_fifo *f) { if (f->flag_sleeping) { f->flag_sleeping = 0; cv_broadcast(&f->cv_io); } } void usb_fifo_wakeup(struct usb_fifo *f) { usb_fifo_signal(f); KNOTE_LOCKED(&f->selinfo.si_note, 0); if (f->flag_isselect) { selwakeup(&f->selinfo); f->flag_isselect = 0; } if (f->async_p != NULL) { PROC_LOCK(f->async_p); kern_psignal(f->async_p, SIGIO); PROC_UNLOCK(f->async_p); } } static int usb_fifo_dummy_open(struct usb_fifo *fifo, int fflags) { return (0); } static void usb_fifo_dummy_close(struct usb_fifo *fifo, int fflags) { return; } static int usb_fifo_dummy_ioctl(struct usb_fifo *fifo, u_long cmd, void *addr, int fflags) { return (ENOIOCTL); } static void usb_fifo_dummy_cmd(struct usb_fifo *fifo) { fifo->flag_flushing = 0; /* not flushing */ } static void usb_fifo_check_methods(struct usb_fifo_methods *pm) { /* check that all callback functions are OK */ if (pm->f_open == NULL) pm->f_open = &usb_fifo_dummy_open; if (pm->f_close == NULL) pm->f_close = &usb_fifo_dummy_close; if (pm->f_ioctl == NULL) pm->f_ioctl = &usb_fifo_dummy_ioctl; if (pm->f_ioctl_post == NULL) pm->f_ioctl_post = &usb_fifo_dummy_ioctl; if (pm->f_start_read == NULL) pm->f_start_read = &usb_fifo_dummy_cmd; if (pm->f_stop_read == NULL) pm->f_stop_read = &usb_fifo_dummy_cmd; if (pm->f_start_write == NULL) pm->f_start_write = &usb_fifo_dummy_cmd; if (pm->f_stop_write == NULL) pm->f_stop_write = &usb_fifo_dummy_cmd; } /*------------------------------------------------------------------------* * usb_fifo_attach * * The following function will create a duplex FIFO. * * Return values: * 0: Success. * Else: Failure. 
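usb_fifo_uiomove() above exists because uiomove(9) may sleep when it faults on user memory, which is not allowed while a regular mutex is held. The conventional fix is a small wrapper that drops the lock around the copy; the caller must then re-check any state it derived before unlocking. A minimal sketch:

static int
example_uiomove_unlocked(struct mtx *m, void *cp, int n, struct uio *uio)
{
	int error;

	mtx_unlock(m);			/* uiomove() may sleep */
	error = uiomove(cp, n, uio);
	mtx_lock(m);			/* caller re-validates its state here */
	return (error);
}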
*------------------------------------------------------------------------*/ int usb_fifo_attach(struct usb_device *udev, void *priv_sc, struct mtx *priv_mtx, struct usb_fifo_methods *pm, struct usb_fifo_sc *f_sc, uint16_t unit, int16_t subunit, uint8_t iface_index, uid_t uid, gid_t gid, int mode) { struct usb_fifo *f_tx; struct usb_fifo *f_rx; char devname[32]; uint8_t n; f_sc->fp[USB_FIFO_TX] = NULL; f_sc->fp[USB_FIFO_RX] = NULL; if (pm == NULL) return (EINVAL); /* check the methods */ usb_fifo_check_methods(pm); if (priv_mtx == NULL) priv_mtx = &Giant; /* search for a free FIFO slot */ for (n = 0;; n += 2) { if (n == USB_FIFO_MAX) { /* end of FIFOs reached */ return (ENOMEM); } /* Check for TX FIFO */ if (udev->fifo[n + USB_FIFO_TX] != NULL) { continue; } /* Check for RX FIFO */ if (udev->fifo[n + USB_FIFO_RX] != NULL) { continue; } break; } f_tx = usb_fifo_alloc(priv_mtx); f_rx = usb_fifo_alloc(priv_mtx); if ((f_tx == NULL) || (f_rx == NULL)) { usb_fifo_free(f_tx); usb_fifo_free(f_rx); return (ENOMEM); } /* initialise FIFO structures */ f_tx->fifo_index = n + USB_FIFO_TX; f_tx->dev_ep_index = -1; f_tx->priv_sc0 = priv_sc; f_tx->methods = pm; f_tx->iface_index = iface_index; f_tx->udev = udev; f_rx->fifo_index = n + USB_FIFO_RX; f_rx->dev_ep_index = -1; f_rx->priv_sc0 = priv_sc; f_rx->methods = pm; f_rx->iface_index = iface_index; f_rx->udev = udev; f_sc->fp[USB_FIFO_TX] = f_tx; f_sc->fp[USB_FIFO_RX] = f_rx; mtx_lock(&usb_ref_lock); udev->fifo[f_tx->fifo_index] = f_tx; udev->fifo[f_rx->fifo_index] = f_rx; mtx_unlock(&usb_ref_lock); for (n = 0; n != 4; n++) { if (pm->basename[n] == NULL) { continue; } if (subunit < 0) { if (snprintf(devname, sizeof(devname), "%s%u%s", pm->basename[n], unit, pm->postfix[n] ? pm->postfix[n] : "")) { /* ignore */ } } else { if (snprintf(devname, sizeof(devname), "%s%u.%d%s", pm->basename[n], unit, subunit, pm->postfix[n] ? pm->postfix[n] : "")) { /* ignore */ } } /* * Distribute the symbolic links into two FIFO structures: */ if (n & 1) { f_rx->symlink[n / 2] = usb_alloc_symlink(devname); } else { f_tx->symlink[n / 2] = usb_alloc_symlink(devname); } /* Create the device */ f_sc->dev = usb_make_dev(udev, devname, -1, f_tx->fifo_index & f_rx->fifo_index, FREAD|FWRITE, uid, gid, mode); } DPRINTFN(2, "attached %p/%p\n", f_tx, f_rx); return (0); } /*------------------------------------------------------------------------* * usb_fifo_alloc_buffer * * Return values: * 0: Success * Else failure *------------------------------------------------------------------------*/ int usb_fifo_alloc_buffer(struct usb_fifo *f, usb_size_t bufsize, uint16_t nbuf) { struct usb_ifqueue temp_q = {}; void *queue_data; usb_fifo_free_buffer(f); temp_q.ifq_maxlen = nbuf; queue_data = usb_alloc_mbufs( M_USBDEV, &temp_q, bufsize, nbuf); if (queue_data == NULL && bufsize != 0 && nbuf != 0) return (ENOMEM); mtx_lock(f->priv_mtx); /* * Setup queues and sizes under lock to avoid early use by * concurrent FIFO access: */ f->free_q = temp_q; f->used_q.ifq_maxlen = nbuf; f->queue_data = queue_data; mtx_unlock(f->priv_mtx); return (0); /* success */ } /*------------------------------------------------------------------------* * usb_fifo_free_buffer * * This function will free the buffers associated with a FIFO. This * function can be called multiple times in a row. *------------------------------------------------------------------------*/ void usb_fifo_free_buffer(struct usb_fifo *f) { void *queue_data; mtx_lock(f->priv_mtx); /* Get and clear pointer to free, if any. 
*/ queue_data = f->queue_data; f->queue_data = NULL; /* * Reset queues under lock to avoid use of freed buffers by * concurrent FIFO activity: */ memset(&f->free_q, 0, sizeof(f->free_q)); memset(&f->used_q, 0, sizeof(f->used_q)); mtx_unlock(f->priv_mtx); /* Free old buffer, if any. */ free(queue_data, M_USBDEV); } void usb_fifo_detach(struct usb_fifo_sc *f_sc) { if (f_sc == NULL) { return; } usb_fifo_free(f_sc->fp[USB_FIFO_TX]); usb_fifo_free(f_sc->fp[USB_FIFO_RX]); f_sc->fp[USB_FIFO_TX] = NULL; f_sc->fp[USB_FIFO_RX] = NULL; usb_destroy_dev(f_sc->dev); f_sc->dev = NULL; DPRINTFN(2, "detached %p\n", f_sc); } usb_size_t usb_fifo_put_bytes_max(struct usb_fifo *f) { struct usb_mbuf *m; usb_size_t len; USB_IF_POLL(&f->free_q, m); if (m) { len = m->max_data_len; } else { len = 0; } return (len); } /*------------------------------------------------------------------------* * usb_fifo_put_data * * what: * 0 - normal operation * 1 - set last packet flag to enforce framing *------------------------------------------------------------------------*/ void usb_fifo_put_data(struct usb_fifo *f, struct usb_page_cache *pc, usb_frlength_t offset, usb_frlength_t len, uint8_t what) { struct usb_mbuf *m; usb_frlength_t io_len; while (len || (what == 1)) { USB_IF_DEQUEUE(&f->free_q, m); if (m) { USB_MBUF_RESET(m); io_len = MIN(len, m->cur_data_len); usbd_copy_out(pc, offset, m->cur_data_ptr, io_len); m->cur_data_len = io_len; offset += io_len; len -= io_len; if ((len == 0) && (what == 1)) { m->last_packet = 1; } USB_IF_ENQUEUE(&f->used_q, m); usb_fifo_wakeup(f); if ((len == 0) || (what == 1)) { break; } } else { break; } } } void usb_fifo_put_data_linear(struct usb_fifo *f, void *ptr, usb_size_t len, uint8_t what) { struct usb_mbuf *m; usb_size_t io_len; while (len || (what == 1)) { USB_IF_DEQUEUE(&f->free_q, m); if (m) { USB_MBUF_RESET(m); io_len = MIN(len, m->cur_data_len); memcpy(m->cur_data_ptr, ptr, io_len); m->cur_data_len = io_len; ptr = USB_ADD_BYTES(ptr, io_len); len -= io_len; if ((len == 0) && (what == 1)) { m->last_packet = 1; } USB_IF_ENQUEUE(&f->used_q, m); usb_fifo_wakeup(f); if ((len == 0) || (what == 1)) { break; } } else { break; } } } uint8_t usb_fifo_put_data_buffer(struct usb_fifo *f, void *ptr, usb_size_t len) { struct usb_mbuf *m; USB_IF_DEQUEUE(&f->free_q, m); if (m) { m->cur_data_len = len; m->cur_data_ptr = ptr; USB_IF_ENQUEUE(&f->used_q, m); usb_fifo_wakeup(f); return (1); } return (0); } void usb_fifo_put_data_error(struct usb_fifo *f) { f->flag_iserror = 1; usb_fifo_wakeup(f); } /*------------------------------------------------------------------------* * usb_fifo_get_data * * what: * 0 - normal operation * 1 - only get one "usb_mbuf" * * returns: * 0 - no more data * 1 - data in buffer *------------------------------------------------------------------------*/ uint8_t usb_fifo_get_data(struct usb_fifo *f, struct usb_page_cache *pc, usb_frlength_t offset, usb_frlength_t len, usb_frlength_t *actlen, uint8_t what) { struct usb_mbuf *m; usb_frlength_t io_len; uint8_t tr_data = 0; actlen[0] = 0; while (1) { USB_IF_DEQUEUE(&f->used_q, m); if (m) { tr_data = 1; io_len = MIN(len, m->cur_data_len); usbd_copy_in(pc, offset, m->cur_data_ptr, io_len); len -= io_len; offset += io_len; actlen[0] += io_len; m->cur_data_ptr += io_len; m->cur_data_len -= io_len; if ((m->cur_data_len == 0) || (what == 1)) { USB_IF_ENQUEUE(&f->free_q, m); usb_fifo_wakeup(f); if (what == 1) { break; } } else { USB_IF_PREPEND(&f->used_q, m); } } else { if (tr_data) { /* wait for data to be written out */ break; } if 
(f->flag_flushing) { /* check if we should send a short packet */ if (f->flag_short != 0) { f->flag_short = 0; tr_data = 1; break; } /* flushing complete */ f->flag_flushing = 0; usb_fifo_wakeup(f); } break; } if (len == 0) { break; } } return (tr_data); } uint8_t usb_fifo_get_data_linear(struct usb_fifo *f, void *ptr, usb_size_t len, usb_size_t *actlen, uint8_t what) { struct usb_mbuf *m; usb_size_t io_len; uint8_t tr_data = 0; actlen[0] = 0; while (1) { USB_IF_DEQUEUE(&f->used_q, m); if (m) { tr_data = 1; io_len = MIN(len, m->cur_data_len); memcpy(ptr, m->cur_data_ptr, io_len); len -= io_len; ptr = USB_ADD_BYTES(ptr, io_len); actlen[0] += io_len; m->cur_data_ptr += io_len; m->cur_data_len -= io_len; if ((m->cur_data_len == 0) || (what == 1)) { USB_IF_ENQUEUE(&f->free_q, m); usb_fifo_wakeup(f); if (what == 1) { break; } } else { USB_IF_PREPEND(&f->used_q, m); } } else { if (tr_data) { /* wait for data to be written out */ break; } if (f->flag_flushing) { /* check if we should send a short packet */ if (f->flag_short != 0) { f->flag_short = 0; tr_data = 1; break; } /* flushing complete */ f->flag_flushing = 0; usb_fifo_wakeup(f); } break; } if (len == 0) { break; } } return (tr_data); } uint8_t usb_fifo_get_data_buffer(struct usb_fifo *f, void **pptr, usb_size_t *plen) { struct usb_mbuf *m; USB_IF_POLL(&f->used_q, m); if (m) { *plen = m->cur_data_len; *pptr = m->cur_data_ptr; return (1); } return (0); } void usb_fifo_get_data_error(struct usb_fifo *f) { f->flag_iserror = 1; usb_fifo_wakeup(f); } /*------------------------------------------------------------------------* * usb_alloc_symlink * * Return values: * NULL: Failure * Else: Pointer to symlink entry *------------------------------------------------------------------------*/ struct usb_symlink * usb_alloc_symlink(const char *target) { struct usb_symlink *ps; ps = malloc(sizeof(*ps), M_USBDEV, M_WAITOK); /* XXX no longer needed */ strlcpy(ps->src_path, target, sizeof(ps->src_path)); ps->src_len = strlen(ps->src_path); strlcpy(ps->dst_path, target, sizeof(ps->dst_path)); ps->dst_len = strlen(ps->dst_path); sx_xlock(&usb_sym_lock); TAILQ_INSERT_TAIL(&usb_sym_head, ps, sym_entry); sx_unlock(&usb_sym_lock); return (ps); } /*------------------------------------------------------------------------* * usb_free_symlink *------------------------------------------------------------------------*/ void usb_free_symlink(struct usb_symlink *ps) { if (ps == NULL) { return; } sx_xlock(&usb_sym_lock); TAILQ_REMOVE(&usb_sym_head, ps, sym_entry); sx_unlock(&usb_sym_lock); free(ps, M_USBDEV); } /*------------------------------------------------------------------------* * usb_read_symlink * * Return value: * 0: Success * Else: Failure *------------------------------------------------------------------------*/ int usb_read_symlink(uint8_t *user_ptr, uint32_t startentry, uint32_t user_len) { struct usb_symlink *ps; uint32_t temp; uint32_t delta = 0; uint8_t len; int error = 0; sx_xlock(&usb_sym_lock); TAILQ_FOREACH(ps, &usb_sym_head, sym_entry) { /* * Compute total length of source and destination symlink * strings pluss one length byte and two NUL bytes: */ temp = ps->src_len + ps->dst_len + 3; if (temp > 255) { /* * Skip entry because this length cannot fit * into one byte: */ continue; } if (startentry != 0) { /* decrement read offset */ startentry--; continue; } if (temp > user_len) { /* out of buffer space */ break; } len = temp; /* copy out total length */ error = copyout(&len, USB_ADD_BYTES(user_ptr, delta), 1); if (error) { break; } delta += 1; /* 
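The usb_read_symlink() loop here serializes every symlink entry as one length byte followed by the NUL-terminated source and destination strings (hence src_len + dst_len + 3), and ends the listing with a zero length byte. A hypothetical userland decoder for a buffer filled in through the USB_READ_DIR ioctl could look like this sketch:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void
example_parse_usb_dir(const uint8_t *buf, size_t buflen)
{
	size_t off = 0;

	while (off < buflen && buf[off] != 0) {
		uint8_t len = buf[off];	/* total record length, incl. this byte */
		const char *src = (const char *)buf + off + 1;
		const char *dst = src + strlen(src) + 1;

		printf("%s -> %s\n", src, dst);
		off += len;		/* the next record starts right after */
	}
	/* a zero length byte (or running out of buffer) ends the listing */
}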
copy out source string */ error = copyout(ps->src_path, USB_ADD_BYTES(user_ptr, delta), ps->src_len); if (error) { break; } len = 0; delta += ps->src_len; error = copyout(&len, USB_ADD_BYTES(user_ptr, delta), 1); if (error) { break; } delta += 1; /* copy out destination string */ error = copyout(ps->dst_path, USB_ADD_BYTES(user_ptr, delta), ps->dst_len); if (error) { break; } len = 0; delta += ps->dst_len; error = copyout(&len, USB_ADD_BYTES(user_ptr, delta), 1); if (error) { break; } delta += 1; user_len -= temp; } /* a zero length entry indicates the end */ if ((user_len != 0) && (error == 0)) { len = 0; error = copyout(&len, USB_ADD_BYTES(user_ptr, delta), 1); } sx_unlock(&usb_sym_lock); return (error); } void usb_fifo_set_close_zlp(struct usb_fifo *f, uint8_t onoff) { if (f == NULL) return; /* send a Zero Length Packet, ZLP, before close */ f->flag_short = onoff; } void usb_fifo_set_write_defrag(struct usb_fifo *f, uint8_t onoff) { if (f == NULL) return; /* defrag written data */ f->flag_write_defrag = onoff; /* reset defrag state */ f->flag_have_fragment = 0; } void * usb_fifo_softc(struct usb_fifo *f) { return (f->priv_sc0); } #endif /* USB_HAVE_UGEN */ diff --git a/sys/fs/cuse/cuse.c b/sys/fs/cuse/cuse.c index 9ef234c35427..e32154654386 100644 --- a/sys/fs/cuse/cuse.c +++ b/sys/fs/cuse/cuse.c @@ -1,2042 +1,2042 @@ /*- * Copyright (c) 2010-2022 Hans Petter Selasky * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* set this define to zero to disable this feature */ #define CUSE_COPY_BUFFER_MAX \ CUSE_BUFFER_MAX #define CUSE_ALLOC_PAGES_MAX \ (CUSE_ALLOC_BYTES_MAX / PAGE_SIZE) #if (CUSE_ALLOC_PAGES_MAX == 0) #error "PAGE_SIZE is too big!" 
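/* This error fires only when CUSE_ALLOC_BYTES_MAX is smaller than PAGE_SIZE, i.e. when the integer division above yields zero pages per allocation. */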
#endif static int cuse_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: return (0); default: return (EOPNOTSUPP); } } static moduledata_t cuse_mod = { .name = "cuse", .evhand = &cuse_modevent, }; DECLARE_MODULE(cuse, cuse_mod, SI_SUB_DEVFS, SI_ORDER_FIRST); MODULE_VERSION(cuse, 1); /* * Prevent cuse4bsd.ko and cuse.ko from loading at the same time by * declaring support for the cuse4bsd interface in cuse.ko: */ MODULE_VERSION(cuse4bsd, 1); #ifdef FEATURE FEATURE(cuse, "Userspace character devices"); #endif struct cuse_command; struct cuse_server; struct cuse_client; struct cuse_client_command { TAILQ_ENTRY(cuse_client_command) entry; struct cuse_command sub; struct sx sx; struct cv cv; struct thread *entered; struct cuse_client *client; struct proc *proc_curr; int proc_refs; int got_signal; int error; int command; }; struct cuse_memory { TAILQ_ENTRY(cuse_memory) entry; vm_object_t object; uint32_t page_count; uint32_t alloc_nr; }; struct cuse_server_dev { TAILQ_ENTRY(cuse_server_dev) entry; struct cuse_server *server; struct cdev *kern_dev; struct cuse_dev *user_dev; }; struct cuse_server { TAILQ_ENTRY(cuse_server) entry; TAILQ_HEAD(, cuse_client_command) head; TAILQ_HEAD(, cuse_server_dev) hdev; TAILQ_HEAD(, cuse_client) hcli; TAILQ_HEAD(, cuse_memory) hmem; struct mtx mtx; struct cv cv; struct selinfo selinfo; pid_t pid; int is_closing; int refs; }; struct cuse_client { TAILQ_ENTRY(cuse_client) entry; TAILQ_ENTRY(cuse_client) entry_ref; struct cuse_client_command cmds[CUSE_CMD_MAX]; struct cuse_server *server; struct cuse_server_dev *server_dev; uintptr_t read_base; uintptr_t write_base; int read_length; int write_length; uint8_t read_buffer[CUSE_COPY_BUFFER_MAX] __aligned(4); uint8_t write_buffer[CUSE_COPY_BUFFER_MAX] __aligned(4); uint8_t ioctl_buffer[CUSE_BUFFER_MAX] __aligned(4); int fflags; /* file flags */ int cflags; /* client flags */ #define CUSE_CLI_IS_CLOSING 0x01 #define CUSE_CLI_KNOTE_NEED_READ 0x02 #define CUSE_CLI_KNOTE_NEED_WRITE 0x04 #define CUSE_CLI_KNOTE_HAS_READ 0x08 #define CUSE_CLI_KNOTE_HAS_WRITE 0x10 }; #define CUSE_CLIENT_CLOSING(pcc) \ ((pcc)->cflags & CUSE_CLI_IS_CLOSING) static MALLOC_DEFINE(M_CUSE, "cuse", "CUSE memory"); static TAILQ_HEAD(, cuse_server) cuse_server_head; static struct mtx cuse_global_mtx; static struct cdev *cuse_dev; static struct cuse_server *cuse_alloc_unit[CUSE_DEVICES_MAX]; static int cuse_alloc_unit_id[CUSE_DEVICES_MAX]; static void cuse_server_wakeup_all_client_locked(struct cuse_server *pcs); static void cuse_client_kqfilter_read_detach(struct knote *kn); static void cuse_client_kqfilter_write_detach(struct knote *kn); static int cuse_client_kqfilter_read_event(struct knote *kn, long hint); static int cuse_client_kqfilter_write_event(struct knote *kn, long hint); -static struct filterops cuse_client_kqfilter_read_ops = { +static const struct filterops cuse_client_kqfilter_read_ops = { .f_isfd = 1, .f_detach = cuse_client_kqfilter_read_detach, .f_event = cuse_client_kqfilter_read_event, }; -static struct filterops cuse_client_kqfilter_write_ops = { +static const struct filterops cuse_client_kqfilter_write_ops = { .f_isfd = 1, .f_detach = cuse_client_kqfilter_write_detach, .f_event = cuse_client_kqfilter_write_event, }; static d_open_t cuse_client_open; static d_close_t cuse_client_close; static d_ioctl_t cuse_client_ioctl; static d_read_t cuse_client_read; static d_write_t cuse_client_write; static d_poll_t cuse_client_poll; static d_mmap_single_t cuse_client_mmap_single; static 
d_kqfilter_t cuse_client_kqfilter; static struct cdevsw cuse_client_devsw = { .d_version = D_VERSION, .d_open = cuse_client_open, .d_close = cuse_client_close, .d_ioctl = cuse_client_ioctl, .d_name = "cuse_client", .d_flags = D_TRACKCLOSE, .d_read = cuse_client_read, .d_write = cuse_client_write, .d_poll = cuse_client_poll, .d_mmap_single = cuse_client_mmap_single, .d_kqfilter = cuse_client_kqfilter, }; static d_open_t cuse_server_open; static d_close_t cuse_server_close; static d_ioctl_t cuse_server_ioctl; static d_read_t cuse_server_read; static d_write_t cuse_server_write; static d_poll_t cuse_server_poll; static d_mmap_single_t cuse_server_mmap_single; static struct cdevsw cuse_server_devsw = { .d_version = D_VERSION, .d_open = cuse_server_open, .d_close = cuse_server_close, .d_ioctl = cuse_server_ioctl, .d_name = "cuse_server", .d_flags = D_TRACKCLOSE, .d_read = cuse_server_read, .d_write = cuse_server_write, .d_poll = cuse_server_poll, .d_mmap_single = cuse_server_mmap_single, }; static void cuse_client_is_closing(struct cuse_client *); static int cuse_free_unit_by_id_locked(struct cuse_server *, int); static void cuse_global_lock(void) { mtx_lock(&cuse_global_mtx); } static void cuse_global_unlock(void) { mtx_unlock(&cuse_global_mtx); } static void cuse_server_lock(struct cuse_server *pcs) { mtx_lock(&pcs->mtx); } static void cuse_server_unlock(struct cuse_server *pcs) { mtx_unlock(&pcs->mtx); } static bool cuse_server_is_locked(struct cuse_server *pcs) { return (mtx_owned(&pcs->mtx)); } static void cuse_cmd_lock(struct cuse_client_command *pccmd) { sx_xlock(&pccmd->sx); } static void cuse_cmd_unlock(struct cuse_client_command *pccmd) { sx_xunlock(&pccmd->sx); } static void cuse_kern_init(void *arg) { TAILQ_INIT(&cuse_server_head); mtx_init(&cuse_global_mtx, "cuse-global-mtx", NULL, MTX_DEF); cuse_dev = make_dev(&cuse_server_devsw, 0, UID_ROOT, GID_OPERATOR, 0600, "cuse"); printf("Cuse v%d.%d.%d @ /dev/cuse\n", (CUSE_VERSION >> 16) & 0xFF, (CUSE_VERSION >> 8) & 0xFF, (CUSE_VERSION >> 0) & 0xFF); } SYSINIT(cuse_kern_init, SI_SUB_DEVFS, SI_ORDER_ANY, cuse_kern_init, NULL); static void cuse_kern_uninit(void *arg) { void *ptr; while (1) { printf("Cuse: Please exit all /dev/cuse instances " "and processes which have used this device.\n"); pause("DRAIN", 2 * hz); cuse_global_lock(); ptr = TAILQ_FIRST(&cuse_server_head); cuse_global_unlock(); if (ptr == NULL) break; } if (cuse_dev != NULL) destroy_dev(cuse_dev); mtx_destroy(&cuse_global_mtx); } SYSUNINIT(cuse_kern_uninit, SI_SUB_DEVFS, SI_ORDER_ANY, cuse_kern_uninit, 0); static int cuse_server_get(struct cuse_server **ppcs) { struct cuse_server *pcs; int error; error = devfs_get_cdevpriv((void **)&pcs); if (error != 0) { *ppcs = NULL; return (error); } if (pcs->is_closing) { *ppcs = NULL; return (EINVAL); } *ppcs = pcs; return (0); } static void cuse_server_is_closing(struct cuse_server *pcs) { struct cuse_client *pcc; if (pcs->is_closing) return; pcs->is_closing = 1; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { cuse_client_is_closing(pcc); } } static struct cuse_client_command * cuse_server_find_command(struct cuse_server *pcs, struct thread *td) { struct cuse_client *pcc; int n; if (pcs->is_closing) goto done; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { if (CUSE_CLIENT_CLOSING(pcc)) continue; for (n = 0; n != CUSE_CMD_MAX; n++) { if (pcc->cmds[n].entered == td) return (&pcc->cmds[n]); } } done: return (NULL); } static void cuse_str_filter(char *ptr) { int c; while (((c = *ptr) != 0)) { if ((c >= 'a') && (c <= 'z')) { ptr++; continue; } if ((c >= 
'A') && (c <= 'Z')) { ptr++; continue; } if ((c >= '0') && (c <= '9')) { ptr++; continue; } if ((c == '.') || (c == '_') || (c == '/')) { ptr++; continue; } *ptr = '_'; ptr++; } } static int cuse_convert_error(int error) { ; /* indent fix */ switch (error) { case CUSE_ERR_NONE: return (0); case CUSE_ERR_BUSY: return (EBUSY); case CUSE_ERR_WOULDBLOCK: return (EWOULDBLOCK); case CUSE_ERR_INVALID: return (EINVAL); case CUSE_ERR_NO_MEMORY: return (ENOMEM); case CUSE_ERR_FAULT: return (EFAULT); case CUSE_ERR_SIGNAL: return (EINTR); case CUSE_ERR_NO_DEVICE: return (ENODEV); default: return (ENXIO); } } static void cuse_vm_memory_free(struct cuse_memory *mem) { /* last user is gone - free */ vm_object_deallocate(mem->object); /* free CUSE memory */ free(mem, M_CUSE); } static int cuse_server_alloc_memory(struct cuse_server *pcs, uint32_t alloc_nr, uint32_t page_count) { struct cuse_memory *temp; struct cuse_memory *mem; vm_object_t object; int error; mem = malloc(sizeof(*mem), M_CUSE, M_WAITOK | M_ZERO); object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * page_count, VM_PROT_DEFAULT, 0, curthread->td_ucred); if (object == NULL) { error = ENOMEM; goto error_0; } cuse_server_lock(pcs); /* check if allocation number already exists */ TAILQ_FOREACH(temp, &pcs->hmem, entry) { if (temp->alloc_nr == alloc_nr) break; } if (temp != NULL) { cuse_server_unlock(pcs); error = EBUSY; goto error_1; } mem->object = object; mem->page_count = page_count; mem->alloc_nr = alloc_nr; TAILQ_INSERT_TAIL(&pcs->hmem, mem, entry); cuse_server_unlock(pcs); return (0); error_1: vm_object_deallocate(object); error_0: free(mem, M_CUSE); return (error); } static int cuse_server_free_memory(struct cuse_server *pcs, uint32_t alloc_nr) { struct cuse_memory *mem; cuse_server_lock(pcs); TAILQ_FOREACH(mem, &pcs->hmem, entry) { if (mem->alloc_nr == alloc_nr) break; } if (mem == NULL) { cuse_server_unlock(pcs); return (EINVAL); } TAILQ_REMOVE(&pcs->hmem, mem, entry); cuse_server_unlock(pcs); cuse_vm_memory_free(mem); return (0); } static int cuse_client_get(struct cuse_client **ppcc) { struct cuse_client *pcc; int error; /* try to get private data */ error = devfs_get_cdevpriv((void **)&pcc); if (error != 0) { *ppcc = NULL; return (error); } if (CUSE_CLIENT_CLOSING(pcc) || pcc->server->is_closing) { *ppcc = NULL; return (EINVAL); } *ppcc = pcc; return (0); } static void cuse_client_is_closing(struct cuse_client *pcc) { struct cuse_client_command *pccmd; uint32_t n; if (CUSE_CLIENT_CLOSING(pcc)) return; pcc->cflags |= CUSE_CLI_IS_CLOSING; pcc->server_dev = NULL; for (n = 0; n != CUSE_CMD_MAX; n++) { pccmd = &pcc->cmds[n]; if (pccmd->entry.tqe_prev != NULL) { TAILQ_REMOVE(&pcc->server->head, pccmd, entry); pccmd->entry.tqe_prev = NULL; } cv_broadcast(&pccmd->cv); } } static void cuse_client_send_command_locked(struct cuse_client_command *pccmd, uintptr_t data_ptr, unsigned long arg, int fflags, int ioflag) { unsigned long cuse_fflags = 0; struct cuse_server *pcs; if (fflags & FREAD) cuse_fflags |= CUSE_FFLAG_READ; if (fflags & FWRITE) cuse_fflags |= CUSE_FFLAG_WRITE; if (ioflag & IO_NDELAY) cuse_fflags |= CUSE_FFLAG_NONBLOCK; #if defined(__LP64__) if (SV_CURPROC_FLAG(SV_ILP32)) cuse_fflags |= CUSE_FFLAG_COMPAT32; #endif pccmd->sub.fflags = cuse_fflags; pccmd->sub.data_pointer = data_ptr; pccmd->sub.argument = arg; pcs = pccmd->client->server; if ((pccmd->entry.tqe_prev == NULL) && (CUSE_CLIENT_CLOSING(pccmd->client) == 0) && (pcs->is_closing == 0)) { TAILQ_INSERT_TAIL(&pcs->head, pccmd, entry); cv_signal(&pcs->cv); } } static void 
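/* Queue a CUSE_CMD_SIGNAL command so the userspace server is told that a signal interrupted the waiting client: */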
cuse_client_got_signal(struct cuse_client_command *pccmd) { struct cuse_server *pcs; pccmd->got_signal = 1; pccmd = &pccmd->client->cmds[CUSE_CMD_SIGNAL]; pcs = pccmd->client->server; if ((pccmd->entry.tqe_prev == NULL) && (CUSE_CLIENT_CLOSING(pccmd->client) == 0) && (pcs->is_closing == 0)) { TAILQ_INSERT_TAIL(&pcs->head, pccmd, entry); cv_signal(&pcs->cv); } } static int cuse_client_receive_command_locked(struct cuse_client_command *pccmd, uint8_t *arg_ptr, uint32_t arg_len) { struct cuse_server *pcs; int error; pcs = pccmd->client->server; error = 0; pccmd->proc_curr = curthread->td_proc; if (CUSE_CLIENT_CLOSING(pccmd->client) || pcs->is_closing) { error = CUSE_ERR_OTHER; goto done; } while (pccmd->command == CUSE_CMD_NONE) { if (error != 0) { cv_wait(&pccmd->cv, &pcs->mtx); } else { error = cv_wait_sig(&pccmd->cv, &pcs->mtx); if (error != 0) cuse_client_got_signal(pccmd); } if (CUSE_CLIENT_CLOSING(pccmd->client) || pcs->is_closing) { error = CUSE_ERR_OTHER; goto done; } } error = pccmd->error; pccmd->command = CUSE_CMD_NONE; cv_signal(&pccmd->cv); done: /* wait until all process references are gone */ pccmd->proc_curr = NULL; while (pccmd->proc_refs != 0) cv_wait(&pccmd->cv, &pcs->mtx); return (error); } /*------------------------------------------------------------------------* * CUSE SERVER PART *------------------------------------------------------------------------*/ static void cuse_server_free_dev(struct cuse_server_dev *pcsd) { struct cuse_server *pcs; struct cuse_client *pcc; /* get server pointer */ pcs = pcsd->server; /* prevent creation of more devices */ cuse_server_lock(pcs); if (pcsd->kern_dev != NULL) pcsd->kern_dev->si_drv1 = NULL; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { if (pcc->server_dev == pcsd) cuse_client_is_closing(pcc); } cuse_server_unlock(pcs); /* destroy device, if any */ if (pcsd->kern_dev != NULL) { /* destroy device synchronously */ destroy_dev(pcsd->kern_dev); } free(pcsd, M_CUSE); } static void cuse_server_unref(struct cuse_server *pcs) { struct cuse_server_dev *pcsd; struct cuse_memory *mem; cuse_server_lock(pcs); if (--(pcs->refs) != 0) { cuse_server_unlock(pcs); return; } cuse_server_is_closing(pcs); /* final client wakeup, if any */ cuse_server_wakeup_all_client_locked(pcs); cuse_global_lock(); TAILQ_REMOVE(&cuse_server_head, pcs, entry); cuse_global_unlock(); while ((pcsd = TAILQ_FIRST(&pcs->hdev)) != NULL) { TAILQ_REMOVE(&pcs->hdev, pcsd, entry); cuse_server_unlock(pcs); cuse_server_free_dev(pcsd); cuse_server_lock(pcs); } cuse_free_unit_by_id_locked(pcs, -1); while ((mem = TAILQ_FIRST(&pcs->hmem)) != NULL) { TAILQ_REMOVE(&pcs->hmem, mem, entry); cuse_server_unlock(pcs); cuse_vm_memory_free(mem); cuse_server_lock(pcs); } knlist_clear(&pcs->selinfo.si_note, 1); knlist_destroy(&pcs->selinfo.si_note); cuse_server_unlock(pcs); seldrain(&pcs->selinfo); cv_destroy(&pcs->cv); mtx_destroy(&pcs->mtx); free(pcs, M_CUSE); } static int cuse_server_do_close(struct cuse_server *pcs) { int retval; cuse_server_lock(pcs); cuse_server_is_closing(pcs); /* final client wakeup, if any */ cuse_server_wakeup_all_client_locked(pcs); knlist_clear(&pcs->selinfo.si_note, 1); retval = pcs->refs; cuse_server_unlock(pcs); return (retval); } static void cuse_server_free(void *arg) { struct cuse_server *pcs = arg; /* * The final server unref should be done by the server thread * to prevent deadlock in the client cdevpriv destructor, * which cannot destroy itself. 
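* The loop below therefore waits until cuse_server_do_close() reports that only one reference is left before dropping it via cuse_server_unref().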
*/ while (cuse_server_do_close(pcs) != 1) pause("W", hz); /* drop final refcount */ cuse_server_unref(pcs); } static int cuse_server_open(struct cdev *dev, int fflags, int devtype, struct thread *td) { struct cuse_server *pcs; pcs = malloc(sizeof(*pcs), M_CUSE, M_WAITOK | M_ZERO); if (devfs_set_cdevpriv(pcs, &cuse_server_free)) { printf("Cuse: Cannot set cdevpriv.\n"); free(pcs, M_CUSE); return (ENOMEM); } /* store current process ID */ pcs->pid = curproc->p_pid; TAILQ_INIT(&pcs->head); TAILQ_INIT(&pcs->hdev); TAILQ_INIT(&pcs->hcli); TAILQ_INIT(&pcs->hmem); cv_init(&pcs->cv, "cuse-server-cv"); mtx_init(&pcs->mtx, "cuse-server-mtx", NULL, MTX_DEF); knlist_init_mtx(&pcs->selinfo.si_note, &pcs->mtx); cuse_global_lock(); pcs->refs++; TAILQ_INSERT_TAIL(&cuse_server_head, pcs, entry); cuse_global_unlock(); return (0); } static int cuse_server_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct cuse_server *pcs; if (cuse_server_get(&pcs) == 0) cuse_server_do_close(pcs); return (0); } static int cuse_server_read(struct cdev *dev, struct uio *uio, int ioflag) { return (ENXIO); } static int cuse_server_write(struct cdev *dev, struct uio *uio, int ioflag) { return (ENXIO); } static int cuse_server_ioctl_copy_locked(struct cuse_server *pcs, struct cuse_client_command *pccmd, struct cuse_data_chunk *pchk, bool isread) { struct proc *p_proc; uint32_t offset; int error; offset = pchk->peer_ptr - CUSE_BUF_MIN_PTR; if (pchk->length > CUSE_BUFFER_MAX) return (EFAULT); if (offset >= CUSE_BUFFER_MAX) return (EFAULT); if ((offset + pchk->length) > CUSE_BUFFER_MAX) return (EFAULT); p_proc = pccmd->proc_curr; if (p_proc == NULL) return (ENXIO); if (pccmd->proc_refs < 0) return (ENOMEM); pccmd->proc_refs++; cuse_server_unlock(pcs); if (!isread) { error = copyin( (void *)pchk->local_ptr, pccmd->client->ioctl_buffer + offset, pchk->length); } else { error = copyout( pccmd->client->ioctl_buffer + offset, (void *)pchk->local_ptr, pchk->length); } cuse_server_lock(pcs); pccmd->proc_refs--; if (pccmd->proc_curr == NULL) cv_signal(&pccmd->cv); return (error); } static int cuse_proc2proc_copy(struct proc *proc_s, vm_offset_t data_s, struct proc *proc_d, vm_offset_t data_d, size_t len) { struct thread *td; struct proc *proc_cur; int error; td = curthread; proc_cur = td->td_proc; if (proc_cur == proc_d) { struct iovec iov = { .iov_base = (caddr_t)data_d, .iov_len = len, }; struct uio uio = { .uio_iov = &iov, .uio_iovcnt = 1, .uio_offset = (off_t)data_s, .uio_resid = len, .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ, .uio_td = td, }; PHOLD(proc_s); error = proc_rwmem(proc_s, &uio); PRELE(proc_s); } else if (proc_cur == proc_s) { struct iovec iov = { .iov_base = (caddr_t)data_s, .iov_len = len, }; struct uio uio = { .uio_iov = &iov, .uio_iovcnt = 1, .uio_offset = (off_t)data_d, .uio_resid = len, .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_WRITE, .uio_td = td, }; PHOLD(proc_d); error = proc_rwmem(proc_d, &uio); PRELE(proc_d); } else { error = EINVAL; } return (error); } static int cuse_server_data_copy_locked(struct cuse_server *pcs, struct cuse_client_command *pccmd, struct cuse_data_chunk *pchk, bool isread) { struct proc *p_proc; int error; p_proc = pccmd->proc_curr; if (p_proc == NULL) return (ENXIO); if (pccmd->proc_refs < 0) return (ENOMEM); pccmd->proc_refs++; cuse_server_unlock(pcs); if (!isread) { error = cuse_proc2proc_copy( curthread->td_proc, pchk->local_ptr, p_proc, pchk->peer_ptr, pchk->length); } else { error = cuse_proc2proc_copy( p_proc, pchk->peer_ptr, curthread->td_proc, 
pchk->local_ptr, pchk->length); } cuse_server_lock(pcs); pccmd->proc_refs--; if (pccmd->proc_curr == NULL) cv_signal(&pccmd->cv); return (error); } static int cuse_server_data_copy_optimized_locked(struct cuse_server *pcs, struct cuse_client_command *pccmd, struct cuse_data_chunk *pchk, bool isread) { uintptr_t offset; int error; /* * Check if data is stored locally to avoid accessing * other process's data space: */ if (isread) { offset = pchk->peer_ptr - pccmd->client->write_base; if (offset < (uintptr_t)pccmd->client->write_length && pchk->length <= (unsigned long)pccmd->client->write_length && offset + pchk->length <= (uintptr_t)pccmd->client->write_length) { cuse_server_unlock(pcs); error = copyout(pccmd->client->write_buffer + offset, (void *)pchk->local_ptr, pchk->length); goto done; } } else { offset = pchk->peer_ptr - pccmd->client->read_base; if (offset < (uintptr_t)pccmd->client->read_length && pchk->length <= (unsigned long)pccmd->client->read_length && offset + pchk->length <= (uintptr_t)pccmd->client->read_length) { cuse_server_unlock(pcs); error = copyin((void *)pchk->local_ptr, pccmd->client->read_buffer + offset, pchk->length); goto done; } } /* use process to process copy function */ error = cuse_server_data_copy_locked(pcs, pccmd, pchk, isread); done: return (error); } static int cuse_alloc_unit_by_id_locked(struct cuse_server *pcs, int id) { int n; int x = 0; int match; do { for (match = n = 0; n != CUSE_DEVICES_MAX; n++) { if (cuse_alloc_unit[n] != NULL) { if ((cuse_alloc_unit_id[n] ^ id) & CUSE_ID_MASK) continue; if ((cuse_alloc_unit_id[n] & ~CUSE_ID_MASK) == x) { x++; match = 1; } } } } while (match); if (x < 256) { for (n = 0; n != CUSE_DEVICES_MAX; n++) { if (cuse_alloc_unit[n] == NULL) { cuse_alloc_unit[n] = pcs; cuse_alloc_unit_id[n] = id | x; return (x); } } } return (-1); } static void cuse_server_wakeup_locked(struct cuse_server *pcs) { selwakeup(&pcs->selinfo); KNOTE_LOCKED(&pcs->selinfo.si_note, 0); } static void cuse_server_wakeup_all_client_locked(struct cuse_server *pcs) { struct cuse_client *pcc; TAILQ_FOREACH(pcc, &pcs->hcli, entry) { pcc->cflags |= (CUSE_CLI_KNOTE_NEED_READ | CUSE_CLI_KNOTE_NEED_WRITE); } cuse_server_wakeup_locked(pcs); } static int cuse_free_unit_by_id_locked(struct cuse_server *pcs, int id) { int n; int found = 0; for (n = 0; n != CUSE_DEVICES_MAX; n++) { if (cuse_alloc_unit[n] == pcs) { if (cuse_alloc_unit_id[n] == id || id == -1) { cuse_alloc_unit[n] = NULL; cuse_alloc_unit_id[n] = 0; found = 1; } } } return (found ? 
0 : EINVAL); } static int cuse_server_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { struct cuse_server *pcs; int error; error = cuse_server_get(&pcs); if (error != 0) return (error); switch (cmd) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_command *pcmd; struct cuse_alloc_info *pai; struct cuse_create_dev *pcd; struct cuse_server_dev *pcsd; struct cuse_data_chunk *pchk; int n; case CUSE_IOCTL_GET_COMMAND: pcmd = (void *)data; cuse_server_lock(pcs); while ((pccmd = TAILQ_FIRST(&pcs->head)) == NULL) { error = cv_wait_sig(&pcs->cv, &pcs->mtx); if (pcs->is_closing) error = ENXIO; if (error) { cuse_server_unlock(pcs); return (error); } } TAILQ_REMOVE(&pcs->head, pccmd, entry); pccmd->entry.tqe_prev = NULL; pccmd->entered = curthread; *pcmd = pccmd->sub; cuse_server_unlock(pcs); break; case CUSE_IOCTL_SYNC_COMMAND: cuse_server_lock(pcs); while ((pccmd = cuse_server_find_command(pcs, curthread)) != NULL) { /* send sync command */ pccmd->entered = NULL; pccmd->error = *(int *)data; pccmd->command = CUSE_CMD_SYNC; /* signal peer, if any */ cv_signal(&pccmd->cv); } cuse_server_unlock(pcs); break; case CUSE_IOCTL_ALLOC_UNIT: cuse_server_lock(pcs); n = cuse_alloc_unit_by_id_locked(pcs, CUSE_ID_DEFAULT(0)); cuse_server_unlock(pcs); if (n < 0) error = ENOMEM; else *(int *)data = n; break; case CUSE_IOCTL_ALLOC_UNIT_BY_ID: n = *(int *)data; n = (n & CUSE_ID_MASK); cuse_server_lock(pcs); n = cuse_alloc_unit_by_id_locked(pcs, n); cuse_server_unlock(pcs); if (n < 0) error = ENOMEM; else *(int *)data = n; break; case CUSE_IOCTL_FREE_UNIT: n = *(int *)data; n = CUSE_ID_DEFAULT(n); cuse_server_lock(pcs); error = cuse_free_unit_by_id_locked(pcs, n); cuse_server_unlock(pcs); break; case CUSE_IOCTL_FREE_UNIT_BY_ID: n = *(int *)data; cuse_server_lock(pcs); error = cuse_free_unit_by_id_locked(pcs, n); cuse_server_unlock(pcs); break; case CUSE_IOCTL_ALLOC_MEMORY: pai = (void *)data; if (pai->alloc_nr >= CUSE_ALLOC_UNIT_MAX) { error = ENOMEM; break; } if (pai->page_count > CUSE_ALLOC_PAGES_MAX) { error = ENOMEM; break; } error = cuse_server_alloc_memory(pcs, pai->alloc_nr, pai->page_count); break; case CUSE_IOCTL_FREE_MEMORY: pai = (void *)data; if (pai->alloc_nr >= CUSE_ALLOC_UNIT_MAX) { error = ENOMEM; break; } error = cuse_server_free_memory(pcs, pai->alloc_nr); break; case CUSE_IOCTL_GET_SIG: cuse_server_lock(pcs); pccmd = cuse_server_find_command(pcs, curthread); if (pccmd != NULL) { n = pccmd->got_signal; pccmd->got_signal = 0; } else { n = 0; } cuse_server_unlock(pcs); *(int *)data = n; break; case CUSE_IOCTL_SET_PFH: cuse_server_lock(pcs); pccmd = cuse_server_find_command(pcs, curthread); if (pccmd != NULL) { pcc = pccmd->client; for (n = 0; n != CUSE_CMD_MAX; n++) { pcc->cmds[n].sub.per_file_handle = *(uintptr_t *)data; } } else { error = ENXIO; } cuse_server_unlock(pcs); break; case CUSE_IOCTL_CREATE_DEV: error = priv_check(curthread, PRIV_DRIVER); if (error) break; pcd = (void *)data; /* filter input */ pcd->devname[sizeof(pcd->devname) - 1] = 0; if (pcd->devname[0] == 0) { error = EINVAL; break; } cuse_str_filter(pcd->devname); pcd->permissions &= 0777; /* try to allocate a character device */ pcsd = malloc(sizeof(*pcsd), M_CUSE, M_WAITOK | M_ZERO); pcsd->server = pcs; pcsd->user_dev = pcd->dev; pcsd->kern_dev = make_dev_credf(MAKEDEV_CHECKNAME, &cuse_client_devsw, 0, NULL, pcd->user_id, pcd->group_id, pcd->permissions, "%s", pcd->devname); if (pcsd->kern_dev == NULL) { free(pcsd, M_CUSE); error = ENOMEM; break; } 
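/* Link the new character device back to its server descriptor so that cuse_client_open() can find the owning server through si_drv1. */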
pcsd->kern_dev->si_drv1 = pcsd; cuse_server_lock(pcs); TAILQ_INSERT_TAIL(&pcs->hdev, pcsd, entry); cuse_server_unlock(pcs); break; case CUSE_IOCTL_DESTROY_DEV: error = priv_check(curthread, PRIV_DRIVER); if (error) break; cuse_server_lock(pcs); error = EINVAL; pcsd = TAILQ_FIRST(&pcs->hdev); while (pcsd != NULL) { if (pcsd->user_dev == *(struct cuse_dev **)data) { TAILQ_REMOVE(&pcs->hdev, pcsd, entry); cuse_server_unlock(pcs); cuse_server_free_dev(pcsd); cuse_server_lock(pcs); error = 0; pcsd = TAILQ_FIRST(&pcs->hdev); } else { pcsd = TAILQ_NEXT(pcsd, entry); } } cuse_server_unlock(pcs); break; case CUSE_IOCTL_WRITE_DATA: case CUSE_IOCTL_READ_DATA: cuse_server_lock(pcs); pchk = (struct cuse_data_chunk *)data; pccmd = cuse_server_find_command(pcs, curthread); if (pccmd == NULL) { error = ENXIO; /* invalid request */ } else if (pchk->peer_ptr < CUSE_BUF_MIN_PTR) { error = EFAULT; /* NULL pointer */ } else if (pchk->length == 0) { /* NOP */ } else if (pchk->peer_ptr < CUSE_BUF_MAX_PTR) { error = cuse_server_ioctl_copy_locked(pcs, pccmd, pchk, cmd == CUSE_IOCTL_READ_DATA); } else { error = cuse_server_data_copy_optimized_locked( pcs, pccmd, pchk, cmd == CUSE_IOCTL_READ_DATA); } /* * Sometimes the functions above drop the server lock * early as an optimization: */ if (cuse_server_is_locked(pcs)) cuse_server_unlock(pcs); break; case CUSE_IOCTL_SELWAKEUP: cuse_server_lock(pcs); /* * We don't know which direction caused the event. * Wakeup both! */ cuse_server_wakeup_all_client_locked(pcs); cuse_server_unlock(pcs); break; default: error = ENXIO; break; } return (error); } static int cuse_server_poll(struct cdev *dev, int events, struct thread *td) { return (events & (POLLHUP | POLLPRI | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)); } static int cuse_common_mmap_single(struct cuse_server *pcs, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object) { struct cuse_memory *mem; int error; /* verify size */ if ((size % PAGE_SIZE) != 0 || (size < PAGE_SIZE)) return (EINVAL); cuse_server_lock(pcs); error = ENOMEM; /* lookup memory structure, if any */ TAILQ_FOREACH(mem, &pcs->hmem, entry) { vm_ooffset_t min_off; vm_ooffset_t max_off; min_off = (mem->alloc_nr << CUSE_ALLOC_UNIT_SHIFT); max_off = min_off + (PAGE_SIZE * mem->page_count); if (*offset >= min_off && *offset < max_off) { /* range check size */ if (size > (max_off - *offset)) { error = EINVAL; } else { /* get new VM object offset to use */ *offset -= min_off; vm_object_reference(mem->object); *object = mem->object; error = 0; } break; } } cuse_server_unlock(pcs); return (error); } static int cuse_server_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { struct cuse_server *pcs; int error; error = cuse_server_get(&pcs); if (error != 0) return (error); return (cuse_common_mmap_single(pcs, offset, size, object)); } /*------------------------------------------------------------------------* * CUSE CLIENT PART *------------------------------------------------------------------------*/ static void cuse_client_free(void *arg) { struct cuse_client *pcc = arg; struct cuse_client_command *pccmd; struct cuse_server *pcs; int n; pcs = pcc->server; cuse_server_lock(pcs); cuse_client_is_closing(pcc); TAILQ_REMOVE(&pcs->hcli, pcc, entry); cuse_server_unlock(pcs); for (n = 0; n != CUSE_CMD_MAX; n++) { pccmd = &pcc->cmds[n]; sx_destroy(&pccmd->sx); cv_destroy(&pccmd->cv); } free(pcc, M_CUSE); /* drop reference on server */ cuse_server_unref(pcs); } static int cuse_client_open(struct cdev *dev, int 
fflags, int devtype, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_server_dev *pcsd; struct cuse_client *pcc; struct cuse_server *pcs; struct cuse_dev *pcd; int error; int n; pcsd = dev->si_drv1; if (pcsd != NULL) { pcs = pcsd->server; pcd = pcsd->user_dev; cuse_server_lock(pcs); /* * Check that the refcount didn't wrap and that the * same process is not both client and server. This * can easily lead to deadlocks when destroying the * CUSE character device nodes: */ pcs->refs++; if (pcs->refs < 0 || pcs->pid == curproc->p_pid) { /* overflow or wrong PID */ pcs->refs--; cuse_server_unlock(pcs); return (EINVAL); } cuse_server_unlock(pcs); } else { return (EINVAL); } pcc = malloc(sizeof(*pcc), M_CUSE, M_WAITOK | M_ZERO); if (devfs_set_cdevpriv(pcc, &cuse_client_free)) { printf("Cuse: Cannot set cdevpriv.\n"); /* drop reference on server */ cuse_server_unref(pcs); free(pcc, M_CUSE); return (ENOMEM); } pcc->fflags = fflags; pcc->server_dev = pcsd; pcc->server = pcs; for (n = 0; n != CUSE_CMD_MAX; n++) { pccmd = &pcc->cmds[n]; pccmd->sub.dev = pcd; pccmd->sub.command = n; pccmd->client = pcc; sx_init(&pccmd->sx, "cuse-client-sx"); cv_init(&pccmd->cv, "cuse-client-cv"); } cuse_server_lock(pcs); /* cuse_client_free() assumes that the client is listed somewhere! */ /* always enqueue */ TAILQ_INSERT_TAIL(&pcs->hcli, pcc, entry); /* check if server is closing */ if ((pcs->is_closing != 0) || (dev->si_drv1 == NULL)) { error = EINVAL; } else { error = 0; } cuse_server_unlock(pcs); if (error) { devfs_clear_cdevpriv(); /* XXX bugfix */ return (error); } pccmd = &pcc->cmds[CUSE_CMD_OPEN]; cuse_cmd_lock(pccmd); cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, 0, 0, pcc->fflags, 0); error = cuse_client_receive_command_locked(pccmd, 0, 0); cuse_server_unlock(pcs); if (error < 0) { error = cuse_convert_error(error); } else { error = 0; } cuse_cmd_unlock(pccmd); if (error) devfs_clear_cdevpriv(); /* XXX bugfix */ return (error); } static int cuse_client_close(struct cdev *dev, int fflag, int devtype, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; error = cuse_client_get(&pcc); if (error != 0) return (0); pccmd = &pcc->cmds[CUSE_CMD_CLOSE]; pcs = pcc->server; cuse_cmd_lock(pccmd); cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, 0, 0, pcc->fflags, 0); error = cuse_client_receive_command_locked(pccmd, 0, 0); cuse_cmd_unlock(pccmd); cuse_client_is_closing(pcc); cuse_server_unlock(pcs); return (0); } static void cuse_client_kqfilter_poll(struct cdev *dev, struct cuse_client *pcc) { struct cuse_server *pcs = pcc->server; int temp; cuse_server_lock(pcs); temp = (pcc->cflags & (CUSE_CLI_KNOTE_HAS_READ | CUSE_CLI_KNOTE_HAS_WRITE)); pcc->cflags &= ~(CUSE_CLI_KNOTE_NEED_READ | CUSE_CLI_KNOTE_NEED_WRITE); cuse_server_unlock(pcs); if (temp != 0) { /* get the latest polling state from the server */ temp = cuse_client_poll(dev, POLLIN | POLLOUT, NULL); if (temp & (POLLIN | POLLOUT)) { cuse_server_lock(pcs); if (temp & POLLIN) pcc->cflags |= CUSE_CLI_KNOTE_NEED_READ; if (temp & POLLOUT) pcc->cflags |= CUSE_CLI_KNOTE_NEED_WRITE; /* make sure the "knote" gets woken up */ cuse_server_wakeup_locked(pcc->server); cuse_server_unlock(pcs); } } } static int cuse_client_read(struct cdev *dev, struct uio *uio, int ioflag) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; int temp; int len; error = cuse_client_get(&pcc); if (error != 0) return (error); pccmd = 
&pcc->cmds[CUSE_CMD_READ]; pcs = pcc->server; if (uio->uio_segflg != UIO_USERSPACE) { return (EINVAL); } uio->uio_segflg = UIO_NOCOPY; cuse_cmd_lock(pccmd); while (uio->uio_resid != 0) { if (uio->uio_iov->iov_len > CUSE_LENGTH_MAX) { error = ENOMEM; break; } len = uio->uio_iov->iov_len; cuse_server_lock(pcs); if (len <= CUSE_COPY_BUFFER_MAX) { /* set read buffer region for small reads */ pcc->read_base = (uintptr_t)uio->uio_iov->iov_base; pcc->read_length = len; } cuse_client_send_command_locked(pccmd, (uintptr_t)uio->uio_iov->iov_base, (unsigned long)(unsigned int)len, pcc->fflags, ioflag); error = cuse_client_receive_command_locked(pccmd, 0, 0); /* * After finishing reading data, disable the read * region for the cuse_server_data_copy_optimized_locked() * function: */ pcc->read_base = 0; pcc->read_length = 0; cuse_server_unlock(pcs); /* * The return value indicates the read length, when * not negative. Range check it just in case to avoid * passing invalid length values to uiomove(). */ if (error > len) { error = ERANGE; break; } else if (error > 0 && len <= CUSE_COPY_BUFFER_MAX) { temp = copyout(pcc->read_buffer, uio->uio_iov->iov_base, error); if (temp != 0) { error = temp; break; } } if (error < 0) { error = cuse_convert_error(error); break; } else if (error == len) { error = uiomove(NULL, error, uio); if (error) break; } else { error = uiomove(NULL, error, uio); break; } } cuse_cmd_unlock(pccmd); uio->uio_segflg = UIO_USERSPACE;/* restore segment flag */ if (error == EWOULDBLOCK) cuse_client_kqfilter_poll(dev, pcc); return (error); } static int cuse_client_write(struct cdev *dev, struct uio *uio, int ioflag) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; int len; error = cuse_client_get(&pcc); if (error != 0) return (error); pccmd = &pcc->cmds[CUSE_CMD_WRITE]; pcs = pcc->server; if (uio->uio_segflg != UIO_USERSPACE) { return (EINVAL); } uio->uio_segflg = UIO_NOCOPY; cuse_cmd_lock(pccmd); while (uio->uio_resid != 0) { if (uio->uio_iov->iov_len > CUSE_LENGTH_MAX) { error = ENOMEM; break; } len = uio->uio_iov->iov_len; if (len <= CUSE_COPY_BUFFER_MAX) { error = copyin(uio->uio_iov->iov_base, pcc->write_buffer, len); if (error != 0) break; } cuse_server_lock(pcs); if (len <= CUSE_COPY_BUFFER_MAX) { /* set write buffer region for small writes */ pcc->write_base = (uintptr_t)uio->uio_iov->iov_base; pcc->write_length = len; } cuse_client_send_command_locked(pccmd, (uintptr_t)uio->uio_iov->iov_base, (unsigned long)(unsigned int)len, pcc->fflags, ioflag); error = cuse_client_receive_command_locked(pccmd, 0, 0); /* * After finishing writing data, disable the write * region for the cuse_server_data_copy_optimized_locked() * function: */ pcc->write_base = 0; pcc->write_length = 0; cuse_server_unlock(pcs); /* * The return value indicates the write length, when * not negative. Range check it just in case to avoid * passing invalid length values to uiomove(). 
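* A negative return value is a CUSE error code and is converted below; only a full-length transfer keeps the copy loop going.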
*/ if (error > len) { error = ERANGE; break; } else if (error < 0) { error = cuse_convert_error(error); break; } else if (error == len) { error = uiomove(NULL, error, uio); if (error) break; } else { error = uiomove(NULL, error, uio); break; } } cuse_cmd_unlock(pccmd); /* restore segment flag */ uio->uio_segflg = UIO_USERSPACE; if (error == EWOULDBLOCK) cuse_client_kqfilter_poll(dev, pcc); return (error); } int cuse_client_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; int error; int len; error = cuse_client_get(&pcc); if (error != 0) return (error); len = IOCPARM_LEN(cmd); if (len > CUSE_BUFFER_MAX) return (ENOMEM); pccmd = &pcc->cmds[CUSE_CMD_IOCTL]; pcs = pcc->server; cuse_cmd_lock(pccmd); if (cmd & (IOC_IN | IOC_VOID)) memcpy(pcc->ioctl_buffer, data, len); /* * When the ioctl-length is zero drivers can pass information * through the data pointer of the ioctl. Make sure this information * is forwarded to the driver. */ cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, (len == 0) ? *(long *)data : CUSE_BUF_MIN_PTR, (unsigned long)cmd, pcc->fflags, (fflag & O_NONBLOCK) ? IO_NDELAY : 0); error = cuse_client_receive_command_locked(pccmd, data, len); cuse_server_unlock(pcs); if (error < 0) { error = cuse_convert_error(error); } else { error = 0; } if (cmd & IOC_OUT) memcpy(data, pcc->ioctl_buffer, len); cuse_cmd_unlock(pccmd); if (error == EWOULDBLOCK) cuse_client_kqfilter_poll(dev, pcc); return (error); } static int cuse_client_poll(struct cdev *dev, int events, struct thread *td) { struct cuse_client_command *pccmd; struct cuse_client *pcc; struct cuse_server *pcs; unsigned long temp; int error; int revents; error = cuse_client_get(&pcc); if (error != 0) goto pollnval; temp = 0; pcs = pcc->server; if (events & (POLLPRI | POLLIN | POLLRDNORM)) temp |= CUSE_POLL_READ; if (events & (POLLOUT | POLLWRNORM)) temp |= CUSE_POLL_WRITE; if (events & POLLHUP) temp |= CUSE_POLL_ERROR; pccmd = &pcc->cmds[CUSE_CMD_POLL]; cuse_cmd_lock(pccmd); /* Need to selrecord() first to not loose any events. 
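* (register with the server's selinfo before asking the userspace driver, so a wakeup arriving in between is not missed)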
*/ if (temp != 0 && td != NULL) selrecord(td, &pcs->selinfo); cuse_server_lock(pcs); cuse_client_send_command_locked(pccmd, 0, temp, pcc->fflags, IO_NDELAY); error = cuse_client_receive_command_locked(pccmd, 0, 0); cuse_server_unlock(pcs); cuse_cmd_unlock(pccmd); if (error < 0) { goto pollnval; } else { revents = 0; if (error & CUSE_POLL_READ) revents |= (events & (POLLPRI | POLLIN | POLLRDNORM)); if (error & CUSE_POLL_WRITE) revents |= (events & (POLLOUT | POLLWRNORM)); if (error & CUSE_POLL_ERROR) revents |= (events & POLLHUP); } return (revents); pollnval: /* XXX many clients don't understand POLLNVAL */ return (events & (POLLHUP | POLLPRI | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)); } static int cuse_client_mmap_single(struct cdev *dev, vm_ooffset_t *offset, vm_size_t size, struct vm_object **object, int nprot) { struct cuse_client *pcc; int error; error = cuse_client_get(&pcc); if (error != 0) return (error); return (cuse_common_mmap_single(pcc->server, offset, size, object)); } static void cuse_client_kqfilter_read_detach(struct knote *kn) { struct cuse_client *pcc; struct cuse_server *pcs; pcc = kn->kn_hook; pcs = pcc->server; cuse_server_lock(pcs); knlist_remove(&pcs->selinfo.si_note, kn, 1); cuse_server_unlock(pcs); } static void cuse_client_kqfilter_write_detach(struct knote *kn) { struct cuse_client *pcc; struct cuse_server *pcs; pcc = kn->kn_hook; pcs = pcc->server; cuse_server_lock(pcs); knlist_remove(&pcs->selinfo.si_note, kn, 1); cuse_server_unlock(pcs); } static int cuse_client_kqfilter_read_event(struct knote *kn, long hint) { struct cuse_client *pcc; pcc = kn->kn_hook; mtx_assert(&pcc->server->mtx, MA_OWNED); return ((pcc->cflags & CUSE_CLI_KNOTE_NEED_READ) ? 1 : 0); } static int cuse_client_kqfilter_write_event(struct knote *kn, long hint) { struct cuse_client *pcc; pcc = kn->kn_hook; mtx_assert(&pcc->server->mtx, MA_OWNED); return ((pcc->cflags & CUSE_CLI_KNOTE_NEED_WRITE) ? 1 : 0); } static int cuse_client_kqfilter(struct cdev *dev, struct knote *kn) { struct cuse_client *pcc; struct cuse_server *pcs; int error; error = cuse_client_get(&pcc); if (error != 0) return (error); pcs = pcc->server; cuse_server_lock(pcs); switch (kn->kn_filter) { case EVFILT_READ: pcc->cflags |= CUSE_CLI_KNOTE_HAS_READ; kn->kn_hook = pcc; kn->kn_fop = &cuse_client_kqfilter_read_ops; knlist_add(&pcs->selinfo.si_note, kn, 1); break; case EVFILT_WRITE: pcc->cflags |= CUSE_CLI_KNOTE_HAS_WRITE; kn->kn_hook = pcc; kn->kn_fop = &cuse_client_kqfilter_write_ops; knlist_add(&pcs->selinfo.si_note, kn, 1); break; default: error = EINVAL; break; } cuse_server_unlock(pcs); if (error == 0) cuse_client_kqfilter_poll(dev, pcc); return (error); } diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index 9dcf3b235feb..7d17362df05e 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -1,2143 +1,2143 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2000-2004 * Poul-Henning Kamp. All rights reserved. * Copyright (c) 1989, 1992-1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernfs_vnops.c 8.15 (Berkeley) 5/21/95 * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43 */ /* * TODO: * mkdir: want it ? */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct vop_vector devfs_vnodeops; static struct vop_vector devfs_specops; -static struct fileops devfs_ops_f; +static const struct fileops devfs_ops_f; #include #include #include #include #include #include static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data"); struct mtx devfs_de_interlock; MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF); struct mtx cdevpriv_mtx; MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF); SYSCTL_DECL(_vfs_devfs); static int devfs_dotimes; SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW, &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision"); /* * Update devfs node timestamp. Note that updates are unlocked and * stat(2) could see partially updated times. 
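* With vfs.devfs.dotimes left at 0 the timestamp is only rewritten when the cached second changes, which keeps the common case cheap.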
*/ static void devfs_timestamp(struct timespec *tsp) { time_t ts; if (devfs_dotimes) { vfs_timestamp(tsp); } else { ts = time_second; if (tsp->tv_sec != ts) { tsp->tv_sec = ts; tsp->tv_nsec = 0; } } } static int devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp, int *ref) { *dswp = devvn_refthread(fp->f_vnode, devp, ref); if (*dswp == NULL || *devp != fp->f_data) { if (*dswp != NULL) dev_relthread(*devp, *ref); return (ENXIO); } KASSERT((*devp)->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp))); if (*dswp == NULL) return (ENXIO); curthread->td_fpop = fp; return (0); } int devfs_get_cdevpriv(void **datap) { struct file *fp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (EBADF); p = fp->f_cdevpriv; if (p != NULL) { error = 0; *datap = p->cdpd_data; } else error = ENOENT; return (error); } int devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr) { struct file *fp; struct cdev_priv *cdp; struct cdev_privdata *p; int error; fp = curthread->td_fpop; if (fp == NULL) return (ENOENT); cdp = cdev2priv((struct cdev *)fp->f_data); p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK); p->cdpd_data = priv; p->cdpd_dtr = priv_dtr; p->cdpd_fp = fp; mtx_lock(&cdevpriv_mtx); if (fp->f_cdevpriv == NULL) { LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list); fp->f_cdevpriv = p; mtx_unlock(&cdevpriv_mtx); error = 0; } else { mtx_unlock(&cdevpriv_mtx); free(p, M_CDEVPDATA); error = EBUSY; } return (error); } int devfs_foreach_cdevpriv(struct cdev *dev, int (*cb)(void *data, void *arg), void *arg) { struct cdev_priv *cdp; struct cdev_privdata *p; int error; cdp = cdev2priv(dev); error = 0; mtx_lock(&cdevpriv_mtx); LIST_FOREACH(p, &cdp->cdp_fdpriv, cdpd_list) { error = cb(p->cdpd_data, arg); if (error != 0) break; } mtx_unlock(&cdevpriv_mtx); return (error); } void devfs_destroy_cdevpriv(struct cdev_privdata *p) { mtx_assert(&cdevpriv_mtx, MA_OWNED); KASSERT(p->cdpd_fp->f_cdevpriv == p, ("devfs_destoy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p)); p->cdpd_fp->f_cdevpriv = NULL; LIST_REMOVE(p, cdpd_list); mtx_unlock(&cdevpriv_mtx); (p->cdpd_dtr)(p->cdpd_data); free(p, M_CDEVPDATA); } static void devfs_fpdrop(struct file *fp) { struct cdev_privdata *p; mtx_lock(&cdevpriv_mtx); if ((p = fp->f_cdevpriv) == NULL) { mtx_unlock(&cdevpriv_mtx); return; } devfs_destroy_cdevpriv(p); } void devfs_clear_cdevpriv(void) { struct file *fp; fp = curthread->td_fpop; if (fp == NULL) return; devfs_fpdrop(fp); } static void devfs_usecount_add(struct vnode *vp) { struct devfs_dirent *de; struct cdev *dev; mtx_lock(&devfs_de_interlock); VI_LOCK(vp); VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp); if (VN_IS_DOOMED(vp)) { goto out_unlock; } de = vp->v_data; dev = vp->v_rdev; MPASS(de != NULL); MPASS(dev != NULL); dev->si_usecount++; de->de_usecount++; out_unlock: VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); } static void devfs_usecount_subl(struct vnode *vp) { struct devfs_dirent *de; struct cdev *dev; mtx_assert(&devfs_de_interlock, MA_OWNED); ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp); de = vp->v_data; dev = vp->v_rdev; if (de == NULL) return; if (dev == NULL) { MPASS(de->de_usecount == 0); return; } if (dev->si_usecount < de->de_usecount) panic("%s: si_usecount underflow for dev %p " "(has %ld, dirent has %d)\n", __func__, dev, dev->si_usecount, de->de_usecount); if (VN_IS_DOOMED(vp)) { dev->si_usecount -= de->de_usecount; de->de_usecount = 0; } else { if (de->de_usecount == 0) 
panic("%s: de_usecount underflow for dev %p\n", __func__, dev); dev->si_usecount--; de->de_usecount--; } } static void devfs_usecount_sub(struct vnode *vp) { mtx_lock(&devfs_de_interlock); VI_LOCK(vp); devfs_usecount_subl(vp); VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); } static int devfs_usecountl(struct vnode *vp) { VNPASS(vp->v_type == VCHR, vp); mtx_assert(&devfs_de_interlock, MA_OWNED); ASSERT_VI_LOCKED(vp, __func__); return (vp->v_rdev->si_usecount); } int devfs_usecount(struct vnode *vp) { int count; VNPASS(vp->v_type == VCHR, vp); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); count = devfs_usecountl(vp); VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); return (count); } void devfs_ctty_ref(struct vnode *vp) { vrefact(vp); devfs_usecount_add(vp); } void devfs_ctty_unref(struct vnode *vp) { devfs_usecount_sub(vp); vrele(vp); } /* * On success devfs_populate_vp() returns with dmp->dm_lock held. */ static int devfs_populate_vp(struct vnode *vp) { struct devfs_dirent *de; struct devfs_mount *dmp; int locked; ASSERT_VOP_LOCKED(vp, "devfs_populate_vp"); dmp = VFSTODEVFS(vp->v_mount); if (!devfs_populate_needed(dmp)) { sx_xlock(&dmp->dm_lock); goto out_nopopulate; } locked = VOP_ISLOCKED(vp); sx_xlock(&dmp->dm_lock); DEVFS_DMP_HOLD(dmp); /* Can't call devfs_populate() with the vnode lock held. */ VOP_UNLOCK(vp); devfs_populate(dmp); sx_xunlock(&dmp->dm_lock); vn_lock(vp, locked | LK_RETRY); sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ERESTART); } out_nopopulate: if (VN_IS_DOOMED(vp)) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } de = vp->v_data; KASSERT(de != NULL, ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed")); if ((de->de_flags & DE_DOOMED) != 0) { sx_xunlock(&dmp->dm_lock); return (ERESTART); } return (0); } static int devfs_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct devfs_mount *dmp; char *buf = ap->a_buf; size_t *buflen = ap->a_buflen; struct devfs_dirent *dd, *de; int i, error; dmp = VFSTODEVFS(vp->v_mount); error = devfs_populate_vp(vp); if (error != 0) return (error); if (vp->v_type != VCHR && vp->v_type != VDIR) { error = ENOENT; goto finished; } dd = vp->v_data; if (vp->v_type == VDIR && dd == dmp->dm_rootdir) { *dvp = vp; vref(*dvp); goto finished; } i = *buflen; i -= dd->de_dirent->d_namlen; if (i < 0) { error = ENOMEM; goto finished; } bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen); *buflen = i; de = devfs_parent_dirent(dd); if (de == NULL) { error = ENOENT; goto finished; } mtx_lock(&devfs_de_interlock); *dvp = de->de_vnode; if (*dvp != NULL) { VI_LOCK(*dvp); mtx_unlock(&devfs_de_interlock); vholdl(*dvp); VI_UNLOCK(*dvp); vref(*dvp); vdrop(*dvp); } else { mtx_unlock(&devfs_de_interlock); error = ENOENT; } finished: sx_xunlock(&dmp->dm_lock); return (error); } /* * Construct the fully qualified path name relative to the mountpoint. * If a NULL cnp is provided, no '/' is appended to the resulting path. 
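* The name is assembled backwards from the end of the caller-supplied buffer; NULL is returned when it does not fit.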
*/ char * devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd, struct componentname *cnp) { int i; struct devfs_dirent *de; sx_assert(&dmp->dm_lock, SA_LOCKED); i = SPECNAMELEN; buf[i] = '\0'; if (cnp != NULL) i -= cnp->cn_namelen; if (i < 0) return (NULL); if (cnp != NULL) bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen); de = dd; while (de != dmp->dm_rootdir) { if (cnp != NULL || i < SPECNAMELEN) { i--; if (i < 0) return (NULL); buf[i] = '/'; } i -= de->de_dirent->d_namlen; if (i < 0) return (NULL); bcopy(de->de_dirent->d_name, buf + i, de->de_dirent->d_namlen); de = devfs_parent_dirent(de); if (de == NULL) return (NULL); } return (buf + i); } static int devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp, struct devfs_dirent *de) { int not_found; not_found = 0; if (de->de_flags & DE_DOOMED) not_found = 1; if (DEVFS_DE_DROP(de)) { KASSERT(not_found == 1, ("DEVFS de dropped but not doomed")); devfs_dirent_free(de); } if (DEVFS_DMP_DROP(dmp)) { KASSERT(not_found == 1, ("DEVFS mount struct freed before dirent")); not_found = 2; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } if (not_found == 1 || (drop_dm_lock && not_found != 2)) sx_unlock(&dmp->dm_lock); return (not_found); } /* * devfs_allocv shall be entered with dmp->dm_lock held, and it drops * it on return. */ int devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode, struct vnode **vpp) { int error; struct vnode *vp; struct cdev *dev; struct devfs_mount *dmp; struct cdevsw *dsw; enum vgetstate vs; dmp = VFSTODEVFS(mp); if (de->de_flags & DE_DOOMED) { sx_xunlock(&dmp->dm_lock); return (ENOENT); } loop: DEVFS_DE_HOLD(de); DEVFS_DMP_HOLD(dmp); mtx_lock(&devfs_de_interlock); vp = de->de_vnode; if (vp != NULL) { vs = vget_prep(vp); mtx_unlock(&devfs_de_interlock); sx_xunlock(&dmp->dm_lock); vget_finish(vp, lockmode | LK_RETRY, vs); sx_xlock(&dmp->dm_lock); if (devfs_allocv_drop_refs(0, dmp, de)) { vput(vp); return (ENOENT); } else if (VN_IS_DOOMED(vp)) { mtx_lock(&devfs_de_interlock); if (de->de_vnode == vp) { de->de_vnode = NULL; vp->v_data = NULL; } mtx_unlock(&devfs_de_interlock); vput(vp); goto loop; } sx_xunlock(&dmp->dm_lock); *vpp = vp; return (0); } mtx_unlock(&devfs_de_interlock); if (de->de_dirent->d_type == DT_CHR) { if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) { devfs_allocv_drop_refs(1, dmp, de); return (ENOENT); } dev = &de->de_cdp->cdp_c; } else { dev = NULL; } error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp); if (error != 0) { devfs_allocv_drop_refs(1, dmp, de); printf("devfs_allocv: failed to allocate new vnode\n"); return (error); } if (de->de_dirent->d_type == DT_CHR) { vp->v_type = VCHR; VI_LOCK(vp); dev_lock(); dev_refl(dev); /* XXX: v_rdev should be protect by vnode lock */ vp->v_rdev = dev; VNPASS(vp->v_usecount == 1, vp); /* Special casing of ttys for deadfs. Probably redundant. 
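* (the check below merely sets VV_ISTTY when the driver advertises D_TTY)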
*/ dsw = dev->si_devsw; if (dsw != NULL && (dsw->d_flags & D_TTY) != 0) vp->v_vflag |= VV_ISTTY; dev_unlock(); VI_UNLOCK(vp); if ((dev->si_flags & SI_ETERNAL) != 0) vp->v_vflag |= VV_ETERNALDEV; vp->v_op = &devfs_specops; } else if (de->de_dirent->d_type == DT_DIR) { vp->v_type = VDIR; } else if (de->de_dirent->d_type == DT_LNK) { vp->v_type = VLNK; } else { vp->v_type = VBAD; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS); VN_LOCK_ASHARE(vp); mtx_lock(&devfs_de_interlock); vp->v_data = de; de->de_vnode = vp; mtx_unlock(&devfs_de_interlock); error = insmntque1(vp, mp); if (error != 0) { mtx_lock(&devfs_de_interlock); vp->v_data = NULL; de->de_vnode = NULL; mtx_unlock(&devfs_de_interlock); vgone(vp); vput(vp); (void) devfs_allocv_drop_refs(1, dmp, de); return (error); } if (devfs_allocv_drop_refs(0, dmp, de)) { vgone(vp); vput(vp); return (ENOENT); } #ifdef MAC mac_devfs_vnode_associate(mp, de, vp); #endif sx_xunlock(&dmp->dm_lock); vn_set_state(vp, VSTATE_CONSTRUCTED); *vpp = vp; return (0); } static int devfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct devfs_dirent *de; struct proc *p; int error; de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid, ap->a_accmode, ap->a_cred); if (error == 0) return (0); if (error != EACCES) return (error); p = ap->a_td->td_proc; /* We do, however, allow access to the controlling terminal */ PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == de->de_cdp) error = 0; PROC_UNLOCK(p); return (error); } _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0, "devfs-only flag reuse failed"); static int devfs_close(struct vop_close_args *ap) { struct vnode *vp = ap->a_vp, *oldvp; struct thread *td = ap->a_td; struct proc *p; struct cdev *dev = vp->v_rdev; struct cdevsw *dsw; struct devfs_dirent *de = vp->v_data; int dflags, error, ref, vp_locked; /* * XXX: Don't call d_close() if we were called because of * XXX: insmntque() failure. */ if (vp->v_data == NULL) return (0); /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. * We cannot easily tell that a character device is * a controlling terminal, unless it is the closing * process' controlling terminal. In that case, * if the reference count is 2 (this last descriptor * plus the session), release the reference from the session. */ if (de->de_usecount == 2 && td != NULL) { p = td->td_proc; PROC_LOCK(p); if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; sx_xlock(&proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); if (devfs_usecountl(vp) == 2 && !VN_IS_DOOMED(vp)) { p->p_session->s_ttyvp = NULL; p->p_session->s_ttydp = NULL; oldvp = vp; } VI_UNLOCK(vp); mtx_unlock(&devfs_de_interlock); SESS_UNLOCK(p->p_session); } sx_xunlock(&proctree_lock); if (oldvp != NULL) devfs_ctty_unref(oldvp); } else PROC_UNLOCK(p); } /* * We do not want to really close the device if it * is still in use unless we are trying to close it * forcibly. Since every use (buffer, vnode, swap, cmap) * holds a reference to the vnode, and because we mark * any other vnodes that alias this device, when the * sum of the reference counts on all the aliased * vnodes descends to one, we are on last close. 
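* The FLASTCLOSE and FREVOKE flags computed below tell d_close() whether this is the last, respectively a forced, close.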
*/ dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); dflags = 0; mtx_lock(&devfs_de_interlock); VI_LOCK(vp); if (devfs_usecountl(vp) == 1) dflags |= FLASTCLOSE; devfs_usecount_subl(vp); mtx_unlock(&devfs_de_interlock); if (VN_IS_DOOMED(vp)) { /* Forced close. */ dflags |= FREVOKE | FNONBLOCK; } else if (dsw->d_flags & D_TRACKCLOSE) { /* Keep device updated on status. */ } else if ((dflags & FLASTCLOSE) == 0) { VI_UNLOCK(vp); dev_relthread(dev, ref); return (0); } vholdnz(vp); VI_UNLOCK(vp); vp_locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td); dev_relthread(dev, ref); vn_lock(vp, vp_locked | LK_RETRY); vdrop(vp); return (error); } static int devfs_close_f(struct file *fp, struct thread *td) { int error; struct file *fpop; /* * NB: td may be NULL if this descriptor is closed due to * garbage collection from a closed UNIX domain socket. */ fpop = curthread->td_fpop; curthread->td_fpop = fp; error = vnops.fo_close(fp, td); curthread->td_fpop = fpop; /* * The f_cdevpriv cannot be assigned non-NULL value while we * are destroying the file. */ if (fp->f_cdevpriv != NULL) devfs_fpdrop(fp); return (error); } static int devfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct devfs_dirent *de; struct devfs_mount *dmp; struct cdev *dev; struct timeval boottime; int error; error = devfs_populate_vp(vp); if (error != 0) return (error); dmp = VFSTODEVFS(vp->v_mount); sx_xunlock(&dmp->dm_lock); de = vp->v_data; KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp)); if (vp->v_type == VDIR) { de = de->de_dir; KASSERT(de != NULL, ("Null dir dirent in devfs_getattr vp=%p", vp)); } vap->va_uid = de->de_uid; vap->va_gid = de->de_gid; vap->va_mode = de->de_mode; if (vp->v_type == VLNK) vap->va_size = strlen(de->de_symlink); else if (vp->v_type == VDIR) vap->va_size = vap->va_bytes = DEV_BSIZE; else vap->va_size = 0; if (vp->v_type != VDIR) vap->va_bytes = 0; vap->va_blocksize = DEV_BSIZE; vap->va_type = vp->v_type; getboottime(&boottime); #define fix(aa) \ do { \ if ((aa).tv_sec <= 3600) { \ (aa).tv_sec = boottime.tv_sec; \ (aa).tv_nsec = boottime.tv_usec * 1000; \ } \ } while (0) if (vp->v_type != VCHR) { fix(de->de_atime); vap->va_atime = de->de_atime; fix(de->de_mtime); vap->va_mtime = de->de_mtime; fix(de->de_ctime); vap->va_ctime = de->de_ctime; } else { dev = vp->v_rdev; fix(dev->si_atime); vap->va_atime = dev->si_atime; fix(dev->si_mtime); vap->va_mtime = dev->si_mtime; fix(dev->si_ctime); vap->va_ctime = dev->si_ctime; vap->va_rdev = cdev2priv(dev)->cdp_inode; } vap->va_gen = 0; vap->va_flags = 0; vap->va_filerev = 0; vap->va_nlink = de->de_links; vap->va_fileid = de->de_inode; return (error); } /* ARGSUSED */ static int devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td) { struct file *fpop; int error; fpop = td->td_fpop; td->td_fpop = fp; error = vnops.fo_ioctl(fp, com, data, cred, td); td->td_fpop = fpop; return (error); } void * fiodgname_buf_get_ptr(void *fgnp, u_long com) { union { struct fiodgname_arg fgn; #ifdef COMPAT_FREEBSD32 struct fiodgname_arg32 fgn32; #endif } *fgnup; fgnup = fgnp; switch (com) { case FIODGNAME: return (fgnup->fgn.buf); #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: return ((void *)(uintptr_t)fgnup->fgn32.buf); #endif default: panic("Unhandled ioctl command %ld", com); } } static int devfs_ioctl(struct 
vop_ioctl_args *ap) { struct fiodgname_arg *fgn; struct vnode *vpold, *vp; struct cdevsw *dsw; struct thread *td; struct session *sess; struct cdev *dev; int error, ref, i; const char *p; u_long com; vp = ap->a_vp; com = ap->a_command; td = ap->a_td; dsw = devvn_refthread(vp, &dev, &ref); if (dsw == NULL) return (ENXIO); KASSERT(dev->si_refcount > 0, ("devfs: un-referenced struct cdev *(%s)", devtoname(dev))); switch (com) { case FIODTYPE: *(int *)ap->a_data = dsw->d_flags & D_TYPEMASK; error = 0; break; case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif fgn = ap->a_data; p = devtoname(dev); i = strlen(p) + 1; if (i > fgn->len) error = EINVAL; else error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i); break; default: error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td); } dev_relthread(dev, ref); if (error == ENOIOCTL) error = ENOTTY; if (error == 0 && com == TIOCSCTTY) { /* * Do nothing if reassigning same control tty, or if the * control tty has already disappeared. If it disappeared, * it's because we were racing with TIOCNOTTY. TIOCNOTTY * already took care of releasing the old vnode and we have * nothing left to do. */ sx_slock(&proctree_lock); sess = td->td_proc->p_session; if (sess->s_ttyvp == vp || sess->s_ttyp == NULL) { sx_sunlock(&proctree_lock); return (0); } devfs_ctty_ref(vp); SESS_LOCK(sess); vpold = sess->s_ttyvp; sess->s_ttyvp = vp; sess->s_ttydp = cdev2priv(dev); SESS_UNLOCK(sess); sx_sunlock(&proctree_lock); /* Get rid of reference to old control tty */ if (vpold) devfs_ctty_unref(vpold); } return (error); } /* ARGSUSED */ static int devfs_kqfilter_f(struct file *fp, struct knote *kn) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; struct thread *td; td = curthread; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error) return (error); error = dsw->d_kqfilter(dev, kn); td->td_fpop = fpop; dev_relthread(dev, ref); return (error); } static inline int devfs_prison_check(struct devfs_dirent *de, struct thread *td) { struct cdev_priv *cdp; struct ucred *dcr; struct proc *p; int error; cdp = de->de_cdp; if (cdp == NULL) return (0); dcr = cdp->cdp_c.si_cred; if (dcr == NULL) return (0); error = prison_check(td->td_ucred, dcr); if (error == 0) return (0); /* We do, however, allow access to the controlling terminal */ p = td->td_proc; PROC_LOCK(p); if (!(p->p_flag & P_CONTROLT)) { PROC_UNLOCK(p); return (error); } if (p->p_session->s_ttydp == cdp) error = 0; PROC_UNLOCK(p); return (error); } static int devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) { struct componentname *cnp; struct vnode *dvp, **vpp; struct thread *td; struct devfs_dirent *de, *dd; struct devfs_dirent **dde; struct devfs_mount *dmp; struct mount *mp; struct cdev *cdev; int error, flags, nameiop, dvplocked; char specname[SPECNAMELEN + 1], *pname; td = curthread; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; mp = dvp->v_mount; dmp = VFSTODEVFS(mp); dd = dvp->v_data; *vpp = NULLVP; if ((flags & ISLASTCN) && nameiop == RENAME) return (EOPNOTSUPP); if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT)) return (EIO); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); if (cnp->cn_namelen == 1 && *pname == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); *vpp = dvp; VREF(dvp); return (0); } if (flags & ISDOTDOT) { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); de = 
devfs_parent_dirent(dd); if (de == NULL) return (ENOENT); dvplocked = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; vn_lock(dvp, dvplocked | LK_RETRY); return (error); } dd = dvp->v_data; de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0); while (de == NULL) { /* While(...) so we can use break */ if (nameiop == DELETE) return (ENOENT); /* * OK, we didn't have an entry for the name we were asked for * so we try to see if anybody can create it on demand. */ pname = devfs_fqpn(specname, dmp, dd, cnp); if (pname == NULL) break; cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); if (cdev == NULL) sx_xlock(&dmp->dm_lock); else if (devfs_populate_vp(dvp) != 0) { *dm_unlock = 0; sx_xlock(&dmp->dm_lock); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); } else sx_xunlock(&dmp->dm_lock); dev_rel(cdev); return (ENOENT); } if (DEVFS_DMP_DROP(dmp)) { *dm_unlock = 0; sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); if (cdev != NULL) dev_rel(cdev); return (ENOENT); } if (cdev == NULL) break; dev_lock(); dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx]; if (dde != NULL && *dde != NULL) de = *dde; dev_unlock(); dev_rel(cdev); break; } if (de == NULL || de->de_flags & DE_WHITEOUT) { if ((nameiop == CREATE || nameiop == RENAME) && (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) { return (EJUSTRETURN); } return (ENOENT); } if (devfs_prison_check(de, td)) return (ENOENT); if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); if (*vpp == dvp) { VREF(dvp); *vpp = dvp; return (0); } } error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp); *dm_unlock = 0; return (error); } static int devfs_lookup(struct vop_lookup_args *ap) { int j; struct devfs_mount *dmp; int dm_unlock; if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOTDIR); dmp = VFSTODEVFS(ap->a_dvp->v_mount); dm_unlock = 1; j = devfs_lookupx(ap, &dm_unlock); if (dm_unlock == 1) sx_xunlock(&dmp->dm_lock); return (j); } static int devfs_mknod(struct vop_mknod_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct devfs_dirent *dd, *de; struct devfs_mount *dmp; int error; /* * The only type of node we should be creating here is a * character device, for anything else return EOPNOTSUPP. 
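 * Even for VCHR the only thing we can do is resurrect an existing
 * entry that was hidden with DE_WHITEOUT by an earlier remove; devfs
 * never creates new device nodes on behalf of userland.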
*/ if (ap->a_vap->va_type != VCHR) return (EOPNOTSUPP); dvp = ap->a_dvp; dmp = VFSTODEVFS(dvp->v_mount); cnp = ap->a_cnp; vpp = ap->a_vpp; dd = dvp->v_data; error = ENOENT; sx_xlock(&dmp->dm_lock); TAILQ_FOREACH(de, &dd->de_dlist, de_list) { if (cnp->cn_namelen != de->de_dirent->d_namlen) continue; if (de->de_dirent->d_type == DT_CHR && (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0) continue; if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name, de->de_dirent->d_namlen) != 0) continue; if (de->de_flags & DE_WHITEOUT) break; goto notfound; } if (de == NULL) goto notfound; de->de_flags &= ~DE_WHITEOUT; error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp); return (error); notfound: sx_xunlock(&dmp->dm_lock); return (error); } /* ARGSUSED */ static int devfs_open(struct vop_open_args *ap) { struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; struct cdev *dev = vp->v_rdev; struct file *fp = ap->a_fp; int error, ref, vlocked; struct cdevsw *dsw; struct file *fpop; if (vp->v_type == VBLK) return (ENXIO); if (dev == NULL) return (ENXIO); /* Make this field valid before any I/O in d_open. */ if (dev->si_iosize_max == 0) dev->si_iosize_max = DFLTPHYS; dsw = dev_refthread(dev, &ref); if (dsw == NULL) return (ENXIO); if (fp == NULL && dsw->d_fdopen != NULL) { dev_relthread(dev, ref); return (ENXIO); } if (vp->v_type == VCHR) devfs_usecount_add(vp); vlocked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); fpop = td->td_fpop; td->td_fpop = fp; if (fp != NULL) { fp->f_data = dev; fp->f_vnode = vp; } if (dsw->d_fdopen != NULL) error = dsw->d_fdopen(dev, ap->a_mode, td, fp); else error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td); /* Clean up any cdevpriv upon error. */ if (error != 0) devfs_clear_cdevpriv(); td->td_fpop = fpop; vn_lock(vp, vlocked | LK_RETRY); if (error != 0 && vp->v_type == VCHR) devfs_usecount_sub(vp); dev_relthread(dev, ref); if (error != 0) { if (error == ERESTART) error = EINTR; return (error); } #if 0 /* /dev/console */ KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp")); #else if (fp == NULL) return (error); #endif if (fp->f_ops == &badfileops) finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f); return (error); } static int devfs_pathconf(struct vop_pathconf_args *ap) { switch (ap->a_name) { case _PC_FILESIZEBITS: *ap->a_retval = 64; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: *ap->a_retval = INT_MAX; return (0); case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; return (0); case _PC_MAX_CANON: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_CANON; return (0); } return (EINVAL); case _PC_MAX_INPUT: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = MAX_INPUT; return (0); } return (EINVAL); case _PC_VDISABLE: if (ap->a_vp->v_vflag & VV_ISTTY) { *ap->a_retval = _POSIX_VDISABLE; return (0); } return (EINVAL); case _PC_MAC_PRESENT: #ifdef MAC /* * If MAC is enabled, devfs automatically supports * trivial non-persistent label storage. 
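 * (The label is kept in the in-memory dirent only, so it does not
 * survive unmount or reboot.)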
*/ *ap->a_retval = 1; #else *ap->a_retval = 0; #endif return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (vop_stdpathconf(ap)); } /* NOTREACHED */ } /* ARGSUSED */ static int devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; int error, ref; struct file *fpop; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_poll(fp, events, cred, td); return (error); } error = dsw->d_poll(dev, events, td); td->td_fpop = fpop; dev_relthread(dev, ref); return(error); } /* * Print out the contents of a special device vnode. */ static int devfs_print(struct vop_print_args *ap) { printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev)); return (0); } static int devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int ioflag, error, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_read(fp, uio, cred, flags, td); return (error); } resid = uio->uio_resid; ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); error = dsw->d_read(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) devfs_timestamp(&dev->si_atime); td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_R); return (error); } static int devfs_readdir(struct vop_readdir_args *ap) { int error; struct uio *uio; struct dirent *dp; struct devfs_dirent *dd; struct devfs_dirent *de; struct devfs_mount *dmp; off_t off; int *tmp_ncookies = NULL; if (ap->a_vp->v_type != VDIR) return (ENOTDIR); uio = ap->a_uio; if (uio->uio_offset < 0) return (EINVAL); /* * XXX: This is a temporary hack to get around this filesystem not * supporting cookies. We store the location of the ncookies pointer * in a temporary variable before calling vfs_subr.c:vfs_read_dirent() * and set the number of cookies to 0. We then set the pointer to * NULL so that vfs_read_dirent doesn't try to call realloc() on * ap->a_cookies. Later in this function, we restore the ap->a_ncookies * pointer to its original location before returning to the caller. */ if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } dmp = VFSTODEVFS(ap->a_vp->v_mount); if (devfs_populate_vp(ap->a_vp) != 0) { if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (EIO); } error = 0; de = ap->a_vp->v_data; off = 0; TAILQ_FOREACH(dd, &de->de_dlist, de_list) { KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__)); if (dd->de_flags & (DE_COVERED | DE_WHITEOUT)) continue; if (devfs_prison_check(dd, uio->uio_td)) continue; if (dd->de_dirent->d_type == DT_DIR) de = dd->de_dir; else de = dd; dp = dd->de_dirent; MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp)); if (dp->d_reclen > uio->uio_resid) break; dp->d_fileno = de->de_inode; /* NOTE: d_off is the offset for the *next* entry. */ dp->d_off = off + dp->d_reclen; if (off >= uio->uio_offset) { error = vfs_read_dirent(ap, dp, off); if (error) break; } off += dp->d_reclen; } sx_xunlock(&dmp->dm_lock); uio->uio_offset = off; /* * Restore ap->a_ncookies if it wasn't originally NULL in the first * place. 
*/ if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } static int devfs_readlink(struct vop_readlink_args *ap) { struct devfs_dirent *de; de = ap->a_vp->v_data; return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio)); } static void devfs_reclaiml(struct vnode *vp) { struct devfs_dirent *de; mtx_assert(&devfs_de_interlock, MA_OWNED); de = vp->v_data; if (de != NULL) { MPASS(de->de_usecount == 0); de->de_vnode = NULL; vp->v_data = NULL; } } static int devfs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; vp = ap->a_vp; mtx_lock(&devfs_de_interlock); devfs_reclaiml(vp); mtx_unlock(&devfs_de_interlock); return (0); } static int devfs_reclaim_vchr(struct vop_reclaim_args *ap) { struct vnode *vp; struct cdev *dev; vp = ap->a_vp; MPASS(vp->v_type == VCHR); mtx_lock(&devfs_de_interlock); VI_LOCK(vp); devfs_usecount_subl(vp); devfs_reclaiml(vp); mtx_unlock(&devfs_de_interlock); dev_lock(); dev = vp->v_rdev; vp->v_rdev = NULL; dev_unlock(); VI_UNLOCK(vp); if (dev != NULL) dev_rel(dev); return (0); } static int devfs_remove(struct vop_remove_args *ap) { struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered; struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount); ASSERT_VOP_ELOCKED(dvp, "devfs_remove"); ASSERT_VOP_ELOCKED(vp, "devfs_remove"); sx_xlock(&dmp->dm_lock); dd = ap->a_dvp->v_data; de = vp->v_data; if (de->de_cdp == NULL) { TAILQ_REMOVE(&dd->de_dlist, de, de_list); if (de->de_dirent->d_type == DT_LNK) { de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) de_covered->de_flags &= ~DE_COVERED; } /* We need to unlock dvp because devfs_delete() may lock it. */ VOP_UNLOCK(vp); if (dvp != vp) VOP_UNLOCK(dvp); devfs_delete(dmp, de, 0); sx_xunlock(&dmp->dm_lock); if (dvp != vp) vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } else { de->de_flags |= DE_WHITEOUT; sx_xunlock(&dmp->dm_lock); } return (0); } /* * Revoke is called on a tty when a terminal session ends. The vnode * is orphaned by setting v_op to deadfs so we need to let go of it * as well so that we create a new one next time around. 
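 * Revocation applies to every alias of the underlying cdev: the loop
 * below walks cdp_dirents[] and vgone()s each vnode still attached to
 * one of them.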
* */ static int devfs_revoke(struct vop_revoke_args *ap) { struct vnode *vp = ap->a_vp, *vp2; struct cdev *dev; struct cdev_priv *cdp; struct devfs_dirent *de; enum vgetstate vs; u_int i; KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL")); dev = vp->v_rdev; cdp = cdev2priv(dev); dev_lock(); cdp->cdp_inuse++; dev_unlock(); vhold(vp); vgone(vp); vdrop(vp); VOP_UNLOCK(vp); loop: for (;;) { mtx_lock(&devfs_de_interlock); dev_lock(); vp2 = NULL; for (i = 0; i <= cdp->cdp_maxdirent; i++) { de = cdp->cdp_dirents[i]; if (de == NULL) continue; vp2 = de->de_vnode; if (vp2 != NULL) { dev_unlock(); vs = vget_prep(vp2); mtx_unlock(&devfs_de_interlock); if (vget_finish(vp2, LK_EXCLUSIVE, vs) != 0) goto loop; vhold(vp2); vgone(vp2); vdrop(vp2); vput(vp2); break; } } if (vp2 != NULL) { continue; } dev_unlock(); mtx_unlock(&devfs_de_interlock); break; } dev_lock(); cdp->cdp_inuse--; if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) { KASSERT((cdp->cdp_flags & CDP_ON_ACTIVE_LIST) != 0, ("%s: cdp %p (%s) not on active list", __func__, cdp, dev->si_name)); cdp->cdp_flags &= ~CDP_ON_ACTIVE_LIST; TAILQ_REMOVE(&cdevp_list, cdp, cdp_list); dev_unlock(); dev_rel(&cdp->cdp_c); } else dev_unlock(); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); return (0); } static int devfs_rioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct devfs_mount *dmp; int error; vp = ap->a_vp; vn_lock(vp, LK_SHARED | LK_RETRY); if (VN_IS_DOOMED(vp)) { VOP_UNLOCK(vp); return (EBADF); } dmp = VFSTODEVFS(vp->v_mount); sx_xlock(&dmp->dm_lock); VOP_UNLOCK(vp); DEVFS_DMP_HOLD(dmp); devfs_populate(dmp); if (DEVFS_DMP_DROP(dmp)) { sx_xunlock(&dmp->dm_lock); devfs_unmount_final(dmp); return (ENOENT); } error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td); sx_xunlock(&dmp->dm_lock); return (error); } static int devfs_rread(struct vop_read_args *ap) { if (ap->a_vp->v_type != VDIR) return (EINVAL); return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL)); } static int devfs_setattr(struct vop_setattr_args *ap) { struct devfs_dirent *de; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; vap = ap->a_vap; vp = ap->a_vp; td = curthread; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } error = devfs_populate_vp(vp); if (error != 0) return (error); de = vp->v_data; if (vp->v_type == VDIR) de = de->de_dir; c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = de->de_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = de->de_gid; else gid = vap->va_gid; if (uid != de->de_uid || gid != de->de_gid) { if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid || (gid != de->de_gid && !groupmember(gid, ap->a_cred))) { error = priv_check(td, PRIV_VFS_CHOWN); if (error != 0) goto ret; } de->de_uid = uid; de->de_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != de->de_uid) { error = priv_check(td, PRIV_VFS_ADMIN); if (error != 0) goto ret; } de->de_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { error = vn_utimes_perm(vp, vap, ap->a_cred, td); if (error != 0) goto ret; if (vap->va_atime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_atime = vap->va_atime; else de->de_atime = vap->va_atime; } if 
(vap->va_mtime.tv_sec != VNOVAL) { if (vp->v_type == VCHR) vp->v_rdev->si_mtime = vap->va_mtime; else de->de_mtime = vap->va_mtime; } c = 1; } if (c) { if (vp->v_type == VCHR) vfs_timestamp(&vp->v_rdev->si_ctime); else vfs_timestamp(&de->de_mtime); } ret: sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock); return (error); } #ifdef MAC static int devfs_setlabel(struct vop_setlabel_args *ap) { struct vnode *vp; struct devfs_dirent *de; vp = ap->a_vp; de = vp->v_data; mac_vnode_relabel(ap->a_cred, vp, ap->a_label); mac_devfs_update(vp->v_mount, de, vp); return (0); } #endif static int devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred) { return (vnops.fo_stat(fp, sb, cred)); } static int devfs_symlink(struct vop_symlink_args *ap) { int i, error; struct devfs_dirent *dd; struct devfs_dirent *de, *de_covered, *de_dotdot; struct devfs_mount *dmp; error = priv_check(curthread, PRIV_DEVFS_SYMLINK); if (error) return(error); dmp = VFSTODEVFS(ap->a_dvp->v_mount); if (devfs_populate_vp(ap->a_dvp) != 0) return (ENOENT); dd = ap->a_dvp->v_data; de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen); de->de_flags = DE_USER; de->de_uid = 0; de->de_gid = 0; de->de_mode = 0755; de->de_inode = alloc_unr(devfs_inos); de->de_dir = dd; de->de_dirent->d_type = DT_LNK; i = strlen(ap->a_target) + 1; de->de_symlink = malloc(i, M_DEVFS, M_WAITOK); bcopy(ap->a_target, de->de_symlink, i); #ifdef MAC mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de); #endif de_covered = devfs_find(dd, de->de_dirent->d_name, de->de_dirent->d_namlen, 0); if (de_covered != NULL) { if ((de_covered->de_flags & DE_USER) != 0) { devfs_delete(dmp, de, DEVFS_DEL_NORECURSE); sx_xunlock(&dmp->dm_lock); return (EEXIST); } KASSERT((de_covered->de_flags & DE_COVERED) == 0, ("devfs_symlink: entry %p already covered", de_covered)); de_covered->de_flags |= DE_COVERED; } de_dotdot = TAILQ_FIRST(&dd->de_dlist); /* "." */ de_dotdot = TAILQ_NEXT(de_dotdot, de_list); /* ".." 
*/ TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list); devfs_dir_ref_de(dmp, dd); devfs_rules_apply(dmp, de); return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp)); } static int devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td) { return (vnops.fo_truncate(fp, length, cred, td)); } static int devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { struct cdev *dev; int error, ioflag, ref; ssize_t resid; struct cdevsw *dsw; struct file *fpop; if (uio->uio_resid > DEVFS_IOSIZE_MAX) return (EINVAL); fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) { error = vnops.fo_write(fp, uio, cred, flags, td); return (error); } KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC); if (ioflag & O_DIRECT) ioflag |= IO_DIRECT; foffset_lock_uio(fp, uio, flags | FOF_NOLOCK); resid = uio->uio_resid; error = dsw->d_write(dev, uio, ioflag); if (uio->uio_resid != resid || (error == 0 && resid != 0)) { devfs_timestamp(&dev->si_ctime); dev->si_mtime = dev->si_ctime; } td->td_fpop = fpop; dev_relthread(dev, ref); foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_W); return (error); } static int devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct cdev *dev; struct cdevsw *dsw; struct mount *mp; struct vnode *vp; struct file *fpop; vm_object_t object; vm_prot_t maxprot; int error, ref; vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. * * Note that most character devices always share mappings. * The one exception is that D_MMAP_ANON devices * (i.e. /dev/zero) permit private writable mappings. * * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests * as well as updating maxprot to permit writing for * D_MMAP_ANON devices rather than doing that here. 
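 * (vm_mmap_cdev() rejects MAP_PRIVATE for ordinary devices, since
 * device memory is shared and cannot be made copy-on-write.)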
*/ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } maxprot &= cap_maxprot; fpop = td->td_fpop; error = devfs_fp_check(fp, &dev, &dsw, &ref); if (error != 0) return (error); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff, &object); td->td_fpop = fpop; dev_relthread(dev, ref); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, FALSE, td); if (error != 0) vm_object_deallocate(object); return (error); } dev_t dev2udev(struct cdev *x) { if (x == NULL) return (NODEV); return (cdev2priv(x)->cdp_inode); } static int devfs_cmp_f(struct file *fp1, struct file *fp2, struct thread *td) { if (fp2->f_type != DTYPE_VNODE || fp2->f_ops != &devfs_ops_f) return (3); return (kcmp_cmp((uintptr_t)fp1->f_data, (uintptr_t)fp2->f_data)); } -static struct fileops devfs_ops_f = { +static const struct fileops devfs_ops_f = { .fo_read = devfs_read_f, .fo_write = devfs_write_f, .fo_truncate = devfs_truncate_f, .fo_ioctl = devfs_ioctl_f, .fo_poll = devfs_poll_f, .fo_kqfilter = devfs_kqfilter_f, .fo_stat = devfs_stat_f, .fo_close = devfs_close_f, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = devfs_mmap_f, .fo_cmp = devfs_cmp_f, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; /* Vops for non-CHR vnodes in /dev. */ static struct vop_vector devfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_rioctl, .vop_lookup = devfs_lookup, .vop_mknod = devfs_mknod, .vop_pathconf = devfs_pathconf, .vop_read = devfs_rread, .vop_readdir = devfs_readdir, .vop_readlink = devfs_readlink, .vop_reclaim = devfs_reclaim, .vop_remove = devfs_remove, .vop_revoke = devfs_revoke, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_symlink = devfs_symlink, .vop_vptocnp = devfs_vptocnp, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(devfs_vnodeops); /* Vops for VCHR vnodes in /dev. 
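 * Namespace and block-I/O operations that make no sense on a
 * character device node are wired to VOP_PANIC; they should be
 * unreachable for a VCHR vnode.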
*/ static struct vop_vector devfs_specops = { .vop_default = &default_vnodeops, .vop_access = devfs_access, .vop_bmap = VOP_PANIC, .vop_close = devfs_close, .vop_create = VOP_PANIC, .vop_fsync = vop_stdfsync, .vop_getattr = devfs_getattr, .vop_ioctl = devfs_ioctl, .vop_link = VOP_PANIC, .vop_mkdir = VOP_PANIC, .vop_mknod = VOP_PANIC, .vop_open = devfs_open, .vop_pathconf = devfs_pathconf, .vop_poll = dead_poll, .vop_print = devfs_print, .vop_read = dead_read, .vop_readdir = VOP_PANIC, .vop_readlink = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_reclaim = devfs_reclaim_vchr, .vop_remove = devfs_remove, .vop_rename = VOP_PANIC, .vop_revoke = devfs_revoke, .vop_rmdir = VOP_PANIC, .vop_setattr = devfs_setattr, #ifdef MAC .vop_setlabel = devfs_setlabel, #endif .vop_strategy = VOP_PANIC, .vop_symlink = VOP_PANIC, .vop_vptocnp = devfs_vptocnp, .vop_write = dead_write, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, .vop_add_writecount = vop_stdadd_writecount_nomsync, }; VFS_VOP_VECTOR_REGISTER(devfs_specops); /* * Our calling convention to the device drivers used to be that we passed * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ * flags instead since that's what open(), close() and ioctl() takes and * we don't really want vnode.h in device drivers. * We solved the source compatibility by redefining some vnode flags to * be the same as the fcntl ones and by sending down the bitwise OR of * the respective fcntl/vnode flags. These CTASSERTS make sure nobody * pulls the rug out under this. */ CTASSERT(O_NONBLOCK == IO_NDELAY); CTASSERT(O_FSYNC == IO_SYNC); diff --git a/sys/fs/fuse/fuse_device.c b/sys/fs/fuse/fuse_device.c index 88ebe702ec0a..5df9be59ce36 100644 --- a/sys/fs/fuse/fuse_device.c +++ b/sys/fs/fuse/fuse_device.c @@ -1,610 +1,610 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. 
* * Copyright (c) 2019 The FreeBSD Foundation * * Portions of this software were developed by BFF Storage Systems, LLC under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include #include SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fusefs, , device, trace, "int", "char*"); static struct cdev *fuse_dev; static d_kqfilter_t fuse_device_filter; static d_open_t fuse_device_open; static d_poll_t fuse_device_poll; static d_read_t fuse_device_read; static d_write_t fuse_device_write; static struct cdevsw fuse_device_cdevsw = { .d_kqfilter = fuse_device_filter, .d_open = fuse_device_open, .d_name = "fuse", .d_poll = fuse_device_poll, .d_read = fuse_device_read, .d_write = fuse_device_write, .d_version = D_VERSION, }; static int fuse_device_filt_read(struct knote *kn, long hint); static int fuse_device_filt_write(struct knote *kn, long hint); static void fuse_device_filt_detach(struct knote *kn); -struct filterops fuse_device_rfiltops = { +static const struct filterops fuse_device_rfiltops = { .f_isfd = 1, .f_detach = fuse_device_filt_detach, .f_event = fuse_device_filt_read, }; -struct filterops fuse_device_wfiltops = { +static const struct filterops fuse_device_wfiltops = { .f_isfd = 1, .f_event = fuse_device_filt_write, }; /**************************** * * >>> Fuse device op defs * ****************************/ static void fdata_dtor(void *arg) { struct fuse_data *fdata; struct fuse_ticket *tick; fdata = arg; if (fdata == NULL) return; fdata_set_dead(fdata); FUSE_LOCK(); fuse_lck_mtx_lock(fdata->aw_mtx); /* wakup poll()ers */ selwakeuppri(&fdata->ks_rsel, PZERO + 1); /* Don't let syscall handlers wait in vain */ while ((tick = fuse_aw_pop(fdata))) { fuse_lck_mtx_lock(tick->tk_aw_mtx); fticket_set_answered(tick); tick->tk_aw_errno = ENOTCONN; wakeup(tick); fuse_lck_mtx_unlock(tick->tk_aw_mtx); FUSE_ASSERT_AW_DONE(tick); fuse_ticket_drop(tick); } fuse_lck_mtx_unlock(fdata->aw_mtx); /* Cleanup unsent operations */ fuse_lck_mtx_lock(fdata->ms_mtx); while ((tick = fuse_ms_pop(fdata))) { fuse_ticket_drop(tick); } fuse_lck_mtx_unlock(fdata->ms_mtx); FUSE_UNLOCK(); fdata_trydestroy(fdata); } static int fuse_device_filter(struct cdev *dev, struct knote *kn) { struct fuse_data *data; int error; error = devfs_get_cdevpriv((void **)&data); if (error == 0 && kn->kn_filter == EVFILT_READ) { kn->kn_fop = &fuse_device_rfiltops; kn->kn_hook = data; knlist_add(&data->ks_rsel.si_note, kn, 0); error = 0; } else if (error == 0 && kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &fuse_device_wfiltops; error = 0; } else if (error == 0) { error = EINVAL; kn->kn_data = error; } return (error); } static void fuse_device_filt_detach(struct knote *kn) { struct fuse_data *data; data = (struct fuse_data*)kn->kn_hook; MPASS(data != NULL); knlist_remove(&data->ks_rsel.si_note, kn, 0); kn->kn_hook = NULL; } static int fuse_device_filt_read(struct knote *kn, long hint) { struct fuse_data *data; int ready; data = (struct fuse_data*)kn->kn_hook; MPASS(data != NULL); mtx_assert(&data->ms_mtx, MA_OWNED); if (fdata_get_dead(data)) { kn->kn_flags |= EV_EOF; kn->kn_fflags = ENODEV; kn->kn_data = 1; ready = 1; } else if (STAILQ_FIRST(&data->ms_head)) { MPASS(data->ms_count >= 1); kn->kn_data = data->ms_count; ready = 1; } else { ready = 0; } return (ready); } static int fuse_device_filt_write(struct knote *kn, long hint) { kn->kn_data = 0; /* The device is always ready to write, so we return 1*/ return (1); } /* * Resources are set up on a per-open basis */ static int fuse_device_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct fuse_data *fdata; int error; SDT_PROBE2(fusefs, , device, trace, 1, "device open"); fdata = fdata_alloc(dev, td->td_ucred); error 
= devfs_set_cdevpriv(fdata, fdata_dtor); if (error != 0) fdata_trydestroy(fdata); else SDT_PROBE2(fusefs, , device, trace, 1, "device open success"); return (error); } int fuse_device_poll(struct cdev *dev, int events, struct thread *td) { struct fuse_data *data; int error, revents = 0; error = devfs_get_cdevpriv((void **)&data); if (error != 0) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); if (events & (POLLIN | POLLRDNORM)) { fuse_lck_mtx_lock(data->ms_mtx); if (fdata_get_dead(data) || STAILQ_FIRST(&data->ms_head)) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &data->ks_rsel); fuse_lck_mtx_unlock(data->ms_mtx); } if (events & (POLLOUT | POLLWRNORM)) { revents |= events & (POLLOUT | POLLWRNORM); } return (revents); } /* * fuse_device_read hangs on the queue of VFS messages. * When it's notified that there is a new one, it picks that and * passes up to the daemon */ int fuse_device_read(struct cdev *dev, struct uio *uio, int ioflag) { int err; struct fuse_data *data; struct fuse_ticket *tick; void *buf; int buflen; SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read"); err = devfs_get_cdevpriv((void **)&data); if (err != 0) return (err); fuse_lck_mtx_lock(data->ms_mtx); again: if (fdata_get_dead(data)) { SDT_PROBE2(fusefs, , device, trace, 2, "we know early on that reader should be kicked so we " "don't wait for news"); fuse_lck_mtx_unlock(data->ms_mtx); return (ENODEV); } if (!(tick = fuse_ms_pop(data))) { /* check if we may block */ if (ioflag & O_NONBLOCK) { /* get outa here soon */ fuse_lck_mtx_unlock(data->ms_mtx); return (EAGAIN); } else { err = msleep(data, &data->ms_mtx, PCATCH, "fu_msg", 0); if (err != 0) { fuse_lck_mtx_unlock(data->ms_mtx); return (fdata_get_dead(data) ? ENODEV : err); } tick = fuse_ms_pop(data); } } if (!tick) { /* * We can get here if fuse daemon suddenly terminates, * eg, by being hit by a SIGKILL * -- and some other cases, too, tho not totally clear, when * (cv_signal/wakeup_one signals the whole process ?) */ SDT_PROBE2(fusefs, , device, trace, 1, "no message on thread"); goto again; } fuse_lck_mtx_unlock(data->ms_mtx); if (fdata_get_dead(data)) { /* * somebody somewhere -- eg., umount routine -- * wants this liaison finished off */ SDT_PROBE2(fusefs, , device, trace, 2, "reader is to be sacked"); if (tick) { SDT_PROBE2(fusefs, , device, trace, 2, "weird -- " "\"kick\" is set tho there is message"); FUSE_ASSERT_MS_DONE(tick); fuse_ticket_drop(tick); } return (ENODEV); /* This should make the daemon get off * of us */ } SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read message successfully"); buf = tick->tk_ms_fiov.base; buflen = tick->tk_ms_fiov.len; /* * Why not ban mercilessly stupid daemons who can't keep up * with us? (There is no much use of a partial read here...) */ /* * XXX note that in such cases Linux FUSE throws EIO at the * syscall invoker and stands back to the message queue. The * rationale should be made clear (and possibly adopt that * behaviour). Keeping the current scheme at least makes * fallacy as loud as possible... 
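 * (A well-behaved daemon always reads with a buffer at least as large
 * as the negotiated maximum request size, so the short-read case below
 * should not occur in practice.)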
*/ if (uio->uio_resid < buflen) { fdata_set_dead(data); SDT_PROBE2(fusefs, , device, trace, 2, "daemon is stupid, kick it off..."); err = ENODEV; } else { err = uiomove(buf, buflen, uio); } FUSE_ASSERT_MS_DONE(tick); fuse_ticket_drop(tick); return (err); } static inline int fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio) { if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) { SDT_PROBE2(fusefs, , device, trace, 1, "Format error: body size " "differs from size claimed by header"); return (EINVAL); } if (uio->uio_resid && ohead->unique != 0 && ohead->error) { SDT_PROBE2(fusefs, , device, trace, 1, "Format error: non zero error but message had a body"); return (EINVAL); } return (0); } SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_notify, "struct fuse_out_header*"); SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_missing_ticket, "uint64_t"); SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_found, "struct fuse_ticket*"); /* * fuse_device_write first reads the header sent by the daemon. * If that's OK, looks up ticket/callback node by the unique id seen in header. * If the callback node contains a handler function, the uio is passed over * that. */ static int fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) { struct fuse_out_header ohead; int err = 0; struct fuse_data *data; struct mount *mp; struct fuse_ticket *tick, *itick, *x_tick; int found = 0; err = devfs_get_cdevpriv((void **)&data); if (err != 0) return (err); mp = data->mp; if (uio->uio_resid < sizeof(struct fuse_out_header)) { SDT_PROBE2(fusefs, , device, trace, 1, "fuse_device_write got less than a header!"); fdata_set_dead(data); return (EINVAL); } if ((err = uiomove(&ohead, sizeof(struct fuse_out_header), uio)) != 0) return (err); if (data->linux_errnos != 0 && ohead.error != 0) { err = -ohead.error; if (err < 0 || err >= nitems(linux_to_bsd_errtbl)) return (EINVAL); /* '-', because it will get flipped again below */ ohead.error = -linux_to_bsd_errtbl[err]; } /* * We check header information (which is redundant) and compare it * with what we see. If we see some inconsistency we discard the * whole answer and proceed on as if it had never existed. In * particular, no pretender will be woken up, regardless the * "unique" value in the header. */ if ((err = fuse_ohead_audit(&ohead, uio))) { fdata_set_dead(data); return (err); } /* Pass stuff over to callback if there is one installed */ /* Looking for ticket with the unique id of header */ fuse_lck_mtx_lock(data->aw_mtx); TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link, x_tick) { if (tick->tk_unique == ohead.unique) { SDT_PROBE1(fusefs, , device, fuse_device_write_found, tick); found = 1; fuse_aw_remove(tick); break; } } if (found && tick->irq_unique > 0) { /* * Discard the FUSE_INTERRUPT ticket that tried to interrupt * this operation */ TAILQ_FOREACH_SAFE(itick, &data->aw_head, tk_aw_link, x_tick) { if (itick->tk_unique == tick->irq_unique) { fuse_aw_remove(itick); fuse_ticket_drop(itick); break; } } tick->irq_unique = 0; } fuse_lck_mtx_unlock(data->aw_mtx); if (found) { if (tick->tk_aw_handler) { /* * We found a callback with proper handler. In this * case the out header will be 0wnd by the callback, * so the fun of freeing that is left for her. * (Then, by all chance, she'll just get that's done * via ticket_drop(), so no manual mucking * around...) 
*/ SDT_PROBE2(fusefs, , device, trace, 1, "pass ticket to a callback"); /* Sanitize the linuxism of negative errnos */ ohead.error *= -1; if (ohead.error < 0 || ohead.error > ELAST) { /* Illegal error code */ ohead.error = EIO; memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead)); tick->tk_aw_handler(tick, uio); err = EINVAL; } else { memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead)); err = tick->tk_aw_handler(tick, uio); } } else { /* pretender doesn't wanna do anything with answer */ SDT_PROBE2(fusefs, , device, trace, 1, "stuff devalidated, so we drop it"); } /* * As aw_mtx was not held during the callback execution the * ticket may have been inserted again. However, this is safe * because fuse_ticket_drop() will deal with refcount anyway. */ fuse_ticket_drop(tick); } else if (ohead.unique == 0){ /* unique == 0 means asynchronous notification */ SDT_PROBE1(fusefs, , device, fuse_device_write_notify, &ohead); switch (ohead.error) { case FUSE_NOTIFY_INVAL_ENTRY: err = fuse_internal_invalidate_entry(mp, uio); break; case FUSE_NOTIFY_INVAL_INODE: err = fuse_internal_invalidate_inode(mp, uio); break; case FUSE_NOTIFY_RETRIEVE: case FUSE_NOTIFY_STORE: /* * Unimplemented. I don't know of any file systems * that use them, and the protocol isn't sound anyway, * since the notification messages don't include the * inode's generation number. Without that, it's * possible to manipulate the cache of the wrong vnode. * Finally, it's not defined what this message should * do for a file with dirty cache. */ case FUSE_NOTIFY_POLL: /* Unimplemented. See comments in fuse_vnops */ default: /* Not implemented */ err = ENOSYS; } } else { /* no callback at all! */ SDT_PROBE1(fusefs, , device, fuse_device_write_missing_ticket, ohead.unique); if (ohead.error == -EAGAIN) { /* * This was probably a response to a FUSE_INTERRUPT * operation whose original operation is already * complete. We can't store FUSE_INTERRUPT tickets * indefinitely because their responses are optional. * So we delete them when the original operation * completes. And sadly the fuse_header_out doesn't * identify the opcode, so we have to guess. */ err = 0; } else { err = EINVAL; } } return (err); } int fuse_device_init(void) { fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, "fuse"); if (fuse_dev == NULL) return (ENOMEM); return (0); } void fuse_device_destroy(void) { MPASS(fuse_dev != NULL); destroy_dev(fuse_dev); } diff --git a/sys/geom/geom_dev.c b/sys/geom/geom_dev.c index f12236264d19..0b5cfdf77149 100644 --- a/sys/geom/geom_dev.c +++ b/sys/geom/geom_dev.c @@ -1,911 +1,911 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Poul-Henning Kamp * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Poul-Henning Kamp * and NAI Labs, the Security Research Division of Network Associates, Inc. * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct g_dev_softc { struct mtx sc_mtx; struct cdev *sc_dev; struct cdev *sc_alias; int sc_open; u_int sc_active; struct selinfo sc_selinfo; #define SC_A_DESTROY (1 << 31) #define SC_A_OPEN (1 << 30) #define SC_A_ACTIVE (SC_A_OPEN - 1) }; static d_open_t g_dev_open; static d_close_t g_dev_close; static d_strategy_t g_dev_strategy; static d_ioctl_t g_dev_ioctl; static d_kqfilter_t g_dev_kqfilter; static void gdev_filter_detach(struct knote *kn); static int gdev_filter_vnode(struct knote *kn, long hint); -static struct filterops gdev_filterops_vnode = { +static const struct filterops gdev_filterops_vnode = { .f_isfd = 1, .f_detach = gdev_filter_detach, .f_event = gdev_filter_vnode, }; static struct cdevsw g_dev_cdevsw = { .d_version = D_VERSION, .d_open = g_dev_open, .d_close = g_dev_close, .d_read = physread, .d_write = physwrite, .d_ioctl = g_dev_ioctl, .d_strategy = g_dev_strategy, .d_name = "g_dev", .d_flags = D_DISK | D_TRACKCLOSE, .d_kqfilter = g_dev_kqfilter, }; static g_init_t g_dev_init; static g_fini_t g_dev_fini; static g_taste_t g_dev_taste; static g_orphan_t g_dev_orphan; static g_attrchanged_t g_dev_attrchanged; static g_resize_t g_dev_resize; static struct g_class g_dev_class = { .name = "DEV", .version = G_VERSION, .init = g_dev_init, .fini = g_dev_fini, .taste = g_dev_taste, .orphan = g_dev_orphan, .attrchanged = g_dev_attrchanged, .resize = g_dev_resize }; /* * We target 262144 (8 x 32768) sectors by default as this significantly * increases the throughput on commonly used SSD's with a marginal * increase in non-interruptible request latency. */ static uint64_t g_dev_del_max_sectors = 262144; SYSCTL_DECL(_kern_geom); SYSCTL_NODE(_kern_geom, OID_AUTO, dev, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "GEOM_DEV stuff"); SYSCTL_QUAD(_kern_geom_dev, OID_AUTO, delete_max_sectors, CTLFLAG_RW, &g_dev_del_max_sectors, 0, "Maximum number of sectors in a single " "delete request sent to the provider. Larger requests are chunked " "so they can be interrupted. 
(0 = disable chunking)"); static char *dumpdev = NULL; static void g_dev_init(struct g_class *mp) { dumpdev = kern_getenv("dumpdev"); } static void g_dev_fini(struct g_class *mp) { freeenv(dumpdev); dumpdev = NULL; } static int g_dev_setdumpdev(struct cdev *dev, struct diocskerneldump_arg *kda) { struct g_kerneldump kd; struct g_consumer *cp; int error, len; MPASS(dev != NULL && kda != NULL); MPASS(kda->kda_index != KDA_REMOVE); cp = dev->si_drv2; len = sizeof(kd); memset(&kd, 0, len); kd.offset = 0; kd.length = OFF_MAX; error = g_io_getattr("GEOM::kerneldump", cp, &len, &kd); if (error != 0) return (error); error = dumper_insert(&kd.di, devtoname(dev), kda); if (error == 0) dev->si_flags |= SI_DUMPDEV; return (error); } static int init_dumpdev(struct cdev *dev) { struct diocskerneldump_arg kda; struct g_consumer *cp; const char *devprefix = _PATH_DEV, *devname; int error; size_t len; bzero(&kda, sizeof(kda)); kda.kda_index = KDA_APPEND; if (dumpdev == NULL) return (0); len = strlen(devprefix); devname = devtoname(dev); if (strcmp(devname, dumpdev) != 0 && (strncmp(dumpdev, devprefix, len) != 0 || strcmp(devname, dumpdev + len) != 0)) return (0); cp = (struct g_consumer *)dev->si_drv2; error = g_access(cp, 1, 0, 0); if (error != 0) return (error); error = g_dev_setdumpdev(dev, &kda); if (error == 0) { freeenv(dumpdev); dumpdev = NULL; } (void)g_access(cp, -1, 0, 0); return (error); } static void g_dev_destroy(void *arg, int flags __unused) { struct g_consumer *cp; struct g_geom *gp; struct g_dev_softc *sc; char buf[SPECNAMELEN + 6]; g_topology_assert(); cp = arg; gp = cp->geom; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_destroy(%p(%s))", cp, gp->name); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify("GEOM", "DEV", "DESTROY", buf); knlist_clear(&sc->sc_selinfo.si_note, 0); knlist_destroy(&sc->sc_selinfo.si_note); if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); } void g_dev_print(void) { struct g_geom *gp; char const *p = ""; LIST_FOREACH(gp, &g_dev_class.geom, geom) { printf("%s%s", p, gp->name); p = " "; } printf("\n"); } static void g_dev_set_physpath(struct g_consumer *cp) { struct g_dev_softc *sc; char *physpath; int error, physpath_len; if (g_access(cp, 1, 0, 0) != 0) return; sc = cp->private; physpath_len = MAXPATHLEN; physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); g_access(cp, -1, 0, 0); if (error == 0 && strlen(physpath) != 0) { struct cdev *dev, *old_alias_dev; struct cdev **alias_devp; dev = sc->sc_dev; old_alias_dev = sc->sc_alias; alias_devp = (struct cdev **)&sc->sc_alias; make_dev_physpath_alias(MAKEDEV_WAITOK | MAKEDEV_CHECKNAME, alias_devp, dev, old_alias_dev, physpath); } else if (sc->sc_alias) { destroy_dev((struct cdev *)sc->sc_alias); sc->sc_alias = NULL; } g_free(physpath); } static void g_dev_set_media(struct g_consumer *cp) { struct g_dev_softc *sc; struct cdev *dev; char buf[SPECNAMELEN + 6]; sc = cp->private; dev = sc->sc_dev; snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify("DEVFS", "CDEV", "MEDIACHANGE", buf); devctl_notify("GEOM", "DEV", "MEDIACHANGE", buf); dev = sc->sc_alias; if (dev != NULL) { snprintf(buf, sizeof(buf), "cdev=%s", dev->si_name); devctl_notify("DEVFS", "CDEV", "MEDIACHANGE", buf); devctl_notify("GEOM", "DEV", "MEDIACHANGE", buf); } } static void g_dev_attrchanged(struct g_consumer *cp, const char 
*attr) { if (strcmp(attr, "GEOM::media") == 0) { g_dev_set_media(cp); return; } if (strcmp(attr, "GEOM::physpath") == 0) { g_dev_set_physpath(cp); return; } } static void g_dev_resize(struct g_consumer *cp) { struct g_dev_softc *sc; char buf[SPECNAMELEN + 6]; sc = cp->private; KNOTE_UNLOCKED(&sc->sc_selinfo.si_note, NOTE_ATTRIB); snprintf(buf, sizeof(buf), "cdev=%s", cp->provider->name); devctl_notify("GEOM", "DEV", "SIZECHANGE", buf); } struct g_provider * g_dev_getprovider(struct cdev *dev) { struct g_consumer *cp; g_topology_assert(); if (dev == NULL) return (NULL); if (dev->si_devsw != &g_dev_cdevsw) return (NULL); cp = dev->si_drv2; return (cp->provider); } static struct g_geom * g_dev_taste(struct g_class *mp, struct g_provider *pp, int insist __unused) { struct g_geom *gp; struct g_geom_alias *gap; struct g_consumer *cp; struct g_dev_softc *sc; int error; struct cdev *dev, *adev; char buf[SPECNAMELEN + 6]; struct make_dev_args args; g_trace(G_T_TOPOLOGY, "dev_taste(%s,%s)", mp->name, pp->name); g_topology_assert(); gp = g_new_geomf(mp, "%s", pp->name); sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO); mtx_init(&sc->sc_mtx, "g_dev", NULL, MTX_DEF); cp = g_new_consumer(gp); cp->private = sc; cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; error = g_attach(cp, pp); if (error != 0) { printf("%s: g_dev_taste(%s) failed to g_attach, error=%d\n", __func__, pp->name, error); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); return (NULL); } make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &g_dev_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv1 = sc; args.mda_si_drv2 = cp; error = make_dev_s(&args, &sc->sc_dev, "%s", gp->name); if (error != 0) { printf("%s: make_dev_p() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_detach(cp); g_destroy_consumer(cp); g_destroy_geom(gp); mtx_destroy(&sc->sc_mtx); g_free(sc); return (NULL); } dev = sc->sc_dev; dev->si_flags |= SI_UNMAPPED; dev->si_iosize_max = maxphys; knlist_init_mtx(&sc->sc_selinfo.si_note, &sc->sc_mtx); error = init_dumpdev(dev); if (error != 0) printf("%s: init_dumpdev() failed (gp->name=%s, error=%d)\n", __func__, gp->name, error); g_dev_attrchanged(cp, "GEOM::physpath"); snprintf(buf, sizeof(buf), "cdev=%s", gp->name); devctl_notify("GEOM", "DEV", "CREATE", buf); /* * Now add all the aliases for this drive */ LIST_FOREACH(gap, &pp->aliases, ga_next) { error = make_dev_alias_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &adev, dev, "%s", gap->ga_alias); if (error) { printf("%s: make_dev_alias_p() failed (name=%s, error=%d)\n", __func__, gap->ga_alias, error); continue; } snprintf(buf, sizeof(buf), "cdev=%s", gap->ga_alias); devctl_notify("GEOM", "DEV", "CREATE", buf); } return (gp); } static int g_dev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_consumer *cp; struct g_dev_softc *sc; int error, r, w, e; cp = dev->si_drv2; g_trace(G_T_ACCESS, "g_dev_open(%s, %d, %d, %p)", cp->geom->name, flags, fmt, td); r = flags & FREAD ? 1 : 0; w = flags & FWRITE ? 1 : 0; #ifdef notyet e = flags & O_EXCL ? 1 : 0; #else e = 0; #endif /* * This happens on attempt to open a device node with O_EXEC. */ if (r + w + e == 0) return (EINVAL); if (w) { /* * When running in very secure mode, do not allow * opens for writing of any disks. 
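 * ("Very secure" here means securelevel >= 2, as checked with
 * securelevel_ge() below.)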
*/ error = securelevel_ge(td->td_ucred, 2); if (error) return (error); } g_topology_lock(); error = g_access(cp, r, w, e); g_topology_unlock(); if (error == 0) { sc = dev->si_drv1; mtx_lock(&sc->sc_mtx); if (sc->sc_open == 0 && (sc->sc_active & SC_A_ACTIVE) != 0) wakeup(&sc->sc_active); sc->sc_open += r + w + e; if (sc->sc_open == 0) atomic_clear_int(&sc->sc_active, SC_A_OPEN); else atomic_set_int(&sc->sc_active, SC_A_OPEN); KNOTE_LOCKED(&sc->sc_selinfo.si_note, NOTE_OPEN); mtx_unlock(&sc->sc_mtx); } return (error); } static int g_dev_close(struct cdev *dev, int flags, int fmt, struct thread *td) { struct g_consumer *cp; struct g_dev_softc *sc; int error, r, w, e; cp = dev->si_drv2; g_trace(G_T_ACCESS, "g_dev_close(%s, %d, %d, %p)", cp->geom->name, flags, fmt, td); r = flags & FREAD ? -1 : 0; w = flags & FWRITE ? -1 : 0; #ifdef notyet e = flags & O_EXCL ? -1 : 0; #else e = 0; #endif /* * The vgonel(9) - caused by eg. forced unmount of devfs - calls * VOP_CLOSE(9) on devfs vnode without any FREAD or FWRITE flags, * which would result in zero deltas, which in turn would cause * panic in g_access(9). * * Note that we cannot zero the counters (ie. do "r = cp->acr" * etc) instead, because the consumer might be opened in another * devfs instance. */ if (r + w + e == 0) return (EINVAL); sc = dev->si_drv1; mtx_lock(&sc->sc_mtx); sc->sc_open += r + w + e; if (sc->sc_open == 0) atomic_clear_int(&sc->sc_active, SC_A_OPEN); else atomic_set_int(&sc->sc_active, SC_A_OPEN); while (sc->sc_open == 0 && (sc->sc_active & SC_A_ACTIVE) != 0) msleep(&sc->sc_active, &sc->sc_mtx, 0, "g_dev_close", hz / 10); KNOTE_LOCKED(&sc->sc_selinfo.si_note, NOTE_CLOSE | (w ? NOTE_CLOSE_WRITE : 0)); mtx_unlock(&sc->sc_mtx); g_topology_lock(); error = g_access(cp, r, w, e); g_topology_unlock(); return (error); } static int g_dev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct g_consumer *cp; struct g_provider *pp; off_t offset, length, chunk, odd; int i, error; cp = dev->si_drv2; pp = cp->provider; /* If consumer or provider is dying, don't disturb. 
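 * (An orphaned consumer yields ENXIO and a provider error is propagated
 * unchanged by the checks that follow.)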
*/ if (cp->flags & G_CF_ORPHAN) return (ENXIO); if (pp->error) return (pp->error); error = 0; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_ioctl")); i = IOCPARM_LEN(cmd); switch (cmd) { case DIOCGSECTORSIZE: *(u_int *)data = pp->sectorsize; if (*(u_int *)data == 0) error = ENOENT; break; case DIOCGMEDIASIZE: *(off_t *)data = pp->mediasize; if (*(off_t *)data == 0) error = ENOENT; break; case DIOCGFWSECTORS: error = g_io_getattr("GEOM::fwsectors", cp, &i, data); if (error == 0 && *(u_int *)data == 0) error = ENOENT; break; case DIOCGFWHEADS: error = g_io_getattr("GEOM::fwheads", cp, &i, data); if (error == 0 && *(u_int *)data == 0) error = ENOENT; break; case DIOCSKERNELDUMP: { struct diocskerneldump_arg *kda; uint8_t *encryptedkey; kda = (struct diocskerneldump_arg *)data; if (kda->kda_index == KDA_REMOVE_ALL || kda->kda_index == KDA_REMOVE_DEV || kda->kda_index == KDA_REMOVE) { error = dumper_remove(devtoname(dev), kda); explicit_bzero(kda, sizeof(*kda)); break; } if (kda->kda_encryption != KERNELDUMP_ENC_NONE) { if (kda->kda_encryptedkeysize == 0 || kda->kda_encryptedkeysize > KERNELDUMP_ENCKEY_MAX_SIZE) { explicit_bzero(kda, sizeof(*kda)); return (EINVAL); } encryptedkey = malloc(kda->kda_encryptedkeysize, M_TEMP, M_WAITOK); error = copyin(kda->kda_encryptedkey, encryptedkey, kda->kda_encryptedkeysize); } else { encryptedkey = NULL; } if (error == 0) { kda->kda_encryptedkey = encryptedkey; error = g_dev_setdumpdev(dev, kda); } zfree(encryptedkey, M_TEMP); explicit_bzero(kda, sizeof(*kda)); break; } case DIOCGFLUSH: error = g_io_flush(cp); break; case DIOCGDELETE: offset = ((off_t *)data)[0]; length = ((off_t *)data)[1]; if ((offset % pp->sectorsize) != 0 || (length % pp->sectorsize) != 0 || length <= 0) { printf("%s: offset=%jd length=%jd\n", __func__, offset, length); error = EINVAL; break; } while (length > 0) { chunk = length; if (g_dev_del_max_sectors != 0 && chunk > g_dev_del_max_sectors * pp->sectorsize) { chunk = g_dev_del_max_sectors * pp->sectorsize; if (pp->stripesize > 0) { odd = (offset + chunk + pp->stripeoffset) % pp->stripesize; if (chunk > odd) chunk -= odd; } } error = g_delete_data(cp, offset, chunk); length -= chunk; offset += chunk; if (error) break; /* * Since the request size can be large, the service * time can likewise be large. We make this ioctl * interruptible by checking for signals for each bio.
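 * Illustrative numbers (hypothetical, not taken from this file): with a
 * 512-byte sector size and g_dev_del_max_sectors set to 262144, each pass
 * of the loop issues at most a 128 MB BIO_DELETE, trimmed further when
 * stripesize > 0 so the chunk ends on a stripe boundary.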
*/ if (SIGPENDING(td)) break; } break; case DIOCGIDENT: error = g_io_getattr("GEOM::ident", cp, &i, data); break; case DIOCGPROVIDERNAME: strlcpy(data, pp->name, i); break; case DIOCGSTRIPESIZE: *(off_t *)data = pp->stripesize; break; case DIOCGSTRIPEOFFSET: *(off_t *)data = pp->stripeoffset; break; case DIOCGPHYSPATH: error = g_io_getattr("GEOM::physpath", cp, &i, data); if (error == 0 && *(char *)data == '\0') error = ENOENT; break; case DIOCGATTR: { struct diocgattr_arg *arg = (struct diocgattr_arg *)data; if (arg->len > sizeof(arg->value)) { error = EINVAL; break; } error = g_io_getattr(arg->name, cp, &arg->len, &arg->value); break; } case DIOCZONECMD: { struct disk_zone_args *zone_args =(struct disk_zone_args *)data; struct disk_zone_rep_entry *new_entries, *old_entries; struct disk_zone_report *rep; size_t alloc_size; old_entries = NULL; new_entries = NULL; rep = NULL; alloc_size = 0; if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES) { rep = &zone_args->zone_params.report; #define MAXENTRIES (maxphys / sizeof(struct disk_zone_rep_entry)) if (rep->entries_allocated > MAXENTRIES) rep->entries_allocated = MAXENTRIES; alloc_size = rep->entries_allocated * sizeof(struct disk_zone_rep_entry); if (alloc_size != 0) new_entries = g_malloc(alloc_size, M_WAITOK | M_ZERO); old_entries = rep->entries; rep->entries = new_entries; } error = g_io_zonecmd(zone_args, cp); if (zone_args->zone_cmd == DISK_ZONE_REPORT_ZONES && alloc_size != 0 && error == 0) error = copyout(new_entries, old_entries, alloc_size); if (old_entries != NULL && rep != NULL) rep->entries = old_entries; g_free(new_entries); break; } default: if (pp->geom->ioctl != NULL) { error = pp->geom->ioctl(pp, cmd, data, fflag, td); } else { error = ENOIOCTL; } } return (error); } static void g_dev_done(struct bio *bp2) { struct g_consumer *cp; struct g_dev_softc *sc; struct bio *bp; int active; cp = bp2->bio_from; sc = cp->private; bp = bp2->bio_parent; bp->bio_error = bp2->bio_error; bp->bio_completed = bp2->bio_completed; bp->bio_resid = bp->bio_length - bp2->bio_completed; if (bp2->bio_cmd == BIO_ZONE) bcopy(&bp2->bio_zone, &bp->bio_zone, sizeof(bp->bio_zone)); if (bp2->bio_error != 0) { g_trace(G_T_BIO, "g_dev_done(%p) had error %d", bp2, bp2->bio_error); bp->bio_flags |= BIO_ERROR; } else { if (bp->bio_cmd == BIO_READ) KNOTE_UNLOCKED(&sc->sc_selinfo.si_note, NOTE_READ); if (bp->bio_cmd == BIO_WRITE) KNOTE_UNLOCKED(&sc->sc_selinfo.si_note, NOTE_WRITE); g_trace(G_T_BIO, "g_dev_done(%p/%p) resid %ld completed %jd", bp2, bp, bp2->bio_resid, (intmax_t)bp2->bio_completed); } g_destroy_bio(bp2); active = atomic_fetchadd_int(&sc->sc_active, -1) - 1; if ((active & SC_A_ACTIVE) == 0) { if ((active & SC_A_OPEN) == 0) wakeup(&sc->sc_active); if (active & SC_A_DESTROY) g_post_event(g_dev_destroy, cp, M_NOWAIT, NULL); } biodone(bp); } static void g_dev_strategy(struct bio *bp) { struct g_consumer *cp; struct bio *bp2; struct cdev *dev; struct g_dev_softc *sc; KASSERT(bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE || bp->bio_cmd == BIO_FLUSH || bp->bio_cmd == BIO_ZONE, ("Wrong bio_cmd bio=%p cmd=%d", bp, bp->bio_cmd)); dev = bp->bio_dev; cp = dev->si_drv2; KASSERT(cp->acr || cp->acw, ("Consumer with zero access count in g_dev_strategy")); biotrack(bp, __func__); #ifdef INVARIANTS if ((bp->bio_offset % cp->provider->sectorsize) != 0 || (bp->bio_bcount % cp->provider->sectorsize) != 0) { bp->bio_resid = bp->bio_bcount; biofinish(bp, NULL, EINVAL); return; } #endif sc = dev->si_drv1; KASSERT(sc->sc_open > 0, ("Closed 
device in g_dev_strategy")); atomic_add_int(&sc->sc_active, 1); for (;;) { /* * XXX: This is not an ideal solution, but I believe it to * XXX: deadlock safely, all things considered. */ bp2 = g_clone_bio(bp); if (bp2 != NULL) break; pause("gdstrat", hz / 10); } KASSERT(bp2 != NULL, ("XXX: ENOMEM in a bad place")); bp2->bio_done = g_dev_done; g_trace(G_T_BIO, "g_dev_strategy(%p/%p) offset %jd length %jd data %p cmd %d", bp, bp2, (intmax_t)bp->bio_offset, (intmax_t)bp2->bio_length, bp2->bio_data, bp2->bio_cmd); g_io_request(bp2, cp); KASSERT(cp->acr || cp->acw, ("g_dev_strategy raced with g_dev_close and lost")); } /* * g_dev_callback() * * Called by devfs when asynchronous device destruction is completed. * - Mark that we have no attached device any more. * - If there are no outstanding requests, schedule geom destruction. * Otherwise destruction will be scheduled later by g_dev_done(). */ static void g_dev_callback(void *arg) { struct g_consumer *cp; struct g_dev_softc *sc; int active; cp = arg; sc = cp->private; g_trace(G_T_TOPOLOGY, "g_dev_callback(%p(%s))", cp, cp->geom->name); sc->sc_dev = NULL; sc->sc_alias = NULL; active = atomic_fetchadd_int(&sc->sc_active, SC_A_DESTROY); if ((active & SC_A_ACTIVE) == 0) g_post_event(g_dev_destroy, cp, M_WAITOK, NULL); } /* * g_dev_orphan() * * Called from below when the provider orphaned us. * - Clear any dump settings. * - Request asynchronous device destruction to prevent any more requests * from coming in. The provider is already marked with an error, so * anything which comes in the interim will be returned immediately. */ static void g_dev_orphan(struct g_consumer *cp) { struct cdev *dev; struct g_dev_softc *sc; g_topology_assert(); sc = cp->private; dev = sc->sc_dev; g_trace(G_T_TOPOLOGY, "g_dev_orphan(%p(%s))", cp, cp->geom->name); /* Reset any dump-area set on this device */ if (dev->si_flags & SI_DUMPDEV) { struct diocskerneldump_arg kda; bzero(&kda, sizeof(kda)); kda.kda_index = KDA_REMOVE_DEV; (void)dumper_remove(devtoname(dev), &kda); } /* Destroy the struct cdev *so we get no more requests */ delist_dev(dev); destroy_dev_sched_cb(dev, g_dev_callback, cp); } static void gdev_filter_detach(struct knote *kn) { struct g_dev_softc *sc; sc = kn->kn_hook; knlist_remove(&sc->sc_selinfo.si_note, kn, 0); } static int gdev_filter_vnode(struct knote *kn, long hint) { kn->kn_fflags |= kn->kn_sfflags & hint; return (kn->kn_fflags != 0); } static int g_dev_kqfilter(struct cdev *dev, struct knote *kn) { struct g_dev_softc *sc; sc = dev->si_drv1; if (kn->kn_filter != EVFILT_VNODE) return (EINVAL); #define SUPPORTED_EVENTS (NOTE_ATTRIB | NOTE_OPEN | NOTE_CLOSE | \ NOTE_CLOSE_WRITE | NOTE_READ | NOTE_WRITE) if (kn->kn_sfflags & ~SUPPORTED_EVENTS) return (EOPNOTSUPP); kn->kn_fop = &gdev_filterops_vnode; kn->kn_hook = sc; knlist_add(&sc->sc_selinfo.si_note, kn, 0); return (0); } DECLARE_GEOM_CLASS(g_dev_class, g_dev); diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 9036e3a25ab8..61cf7fc845a2 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -1,5434 +1,5435 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 */ #include #include "opt_capsicum.h" #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table"); static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes"); static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors"); static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader", "file desc to leader structures"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities"); MALLOC_DECLARE(M_FADVISE); static __read_mostly uma_zone_t file_zone; static __read_mostly uma_zone_t filedesc0_zone; __read_mostly uma_zone_t pwd_zone; VFS_SMR_DECLARE; static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit); static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp, int flags); static int fd_first_free(struct filedesc *fdp, int low, int size); static void fdgrowtable(struct filedesc *fdp, int nfd); static void fdgrowtable_exp(struct filedesc *fdp, int nfd); static void fdunused(struct filedesc *fdp, int fd); static void fdused(struct filedesc *fdp, int fd); static int fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp); static int getmaxfd(struct thread *td); static u_long *filecaps_copy_prep(const struct filecaps *src); static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls); static u_long *filecaps_free_prep(struct filecaps *fcaps); static void filecaps_free_finish(u_long *ioctls); static struct pwd *pwd_alloc(void); /* * 
Each process has: * * - An array of open file descriptors (fd_ofiles) * - An array of file flags (fd_ofileflags) * - A bitmap recording which descriptors are in use (fd_map) * * A process starts out with NDFILE descriptors. The value of NDFILE has * been selected based the historical limit of 20 open files, and an * assumption that the majority of processes, especially short-lived * processes like shells, will never need more. * * If this initial allocation is exhausted, a larger descriptor table and * map are allocated dynamically, and the pointers in the process's struct * filedesc are updated to point to those. This is repeated every time * the process runs out of file descriptors (provided it hasn't hit its * resource limit). * * Since threads may hold references to individual descriptor table * entries, the tables are never freed. Instead, they are placed on a * linked list and freed only when the struct filedesc is released. */ #define NDFILE 20 #define NDSLOTSIZE sizeof(NDSLOTTYPE) #define NDENTRIES (NDSLOTSIZE * __CHAR_BIT) #define NDSLOT(x) ((x) / NDENTRIES) #define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES)) #define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES) #define FILEDESC_FOREACH_FDE(fdp, _iterator, _fde) \ struct filedesc *_fdp = (fdp); \ int _lastfile = fdlastfile_single(_fdp); \ for (_iterator = 0; _iterator <= _lastfile; _iterator++) \ if ((_fde = &_fdp->fd_ofiles[_iterator])->fde_file != NULL) #define FILEDESC_FOREACH_FP(fdp, _iterator, _fp) \ struct filedesc *_fdp = (fdp); \ int _lastfile = fdlastfile_single(_fdp); \ for (_iterator = 0; _iterator <= _lastfile; _iterator++) \ if ((_fp = _fdp->fd_ofiles[_iterator].fde_file) != NULL) /* * SLIST entry used to keep track of ofiles which must be reclaimed when * the process exits. */ struct freetable { struct fdescenttbl *ft_table; SLIST_ENTRY(freetable) ft_next; }; /* * Initial allocation: a filedesc structure + the head of SLIST used to * keep track of old ofiles + enough space for NDFILE descriptors. */ struct fdescenttbl0 { int fdt_nfiles; struct filedescent fdt_ofiles[NDFILE]; }; struct filedesc0 { struct filedesc fd_fd; SLIST_HEAD(, freetable) fd_free; struct fdescenttbl0 fd_dfiles; NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)]; }; /* * Descriptor management. */ static int __exclusive_cache_line openfiles; /* actual number of open files */ struct mtx sigio_lock; /* mtx to protect pointers to sigio */ void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp); /* * If low >= size, just return low. Otherwise find the first zero bit in the * given bitmap, starting at low and not exceeding size - 1. Return size if * not found. */ static int fd_first_free(struct filedesc *fdp, int low, int size) { NDSLOTTYPE *map = fdp->fd_map; NDSLOTTYPE mask; int off, maxoff; if (low >= size) return (low); off = NDSLOT(low); if (low % NDENTRIES) { mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES))); if ((mask &= ~map[off]) != 0UL) return (off * NDENTRIES + ffsl(mask) - 1); ++off; } for (maxoff = NDSLOTS(size); off < maxoff; ++off) if (map[off] != ~0UL) return (off * NDENTRIES + ffsl(~map[off]) - 1); return (size); } /* * Find the last used fd. * * Call this variant if fdp can't be modified by anyone else (e.g, during exec). * Otherwise use fdlastfile. 
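 * Illustrative example (assumes a 64-bit NDSLOTTYPE, i.e. NDENTRIES == 64):
 * with descriptors 0-2 and 70 in use, fd_map[1] == 0x40 and the backward
 * scan returns 1 * NDENTRIES + flsl(0x40) - 1 == 70.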
*/ int fdlastfile_single(struct filedesc *fdp) { NDSLOTTYPE *map = fdp->fd_map; int off, minoff; off = NDSLOT(fdp->fd_nfiles - 1); for (minoff = NDSLOT(0); off >= minoff; --off) if (map[off] != 0) return (off * NDENTRIES + flsl(map[off]) - 1); return (-1); } int fdlastfile(struct filedesc *fdp) { FILEDESC_LOCK_ASSERT(fdp); return (fdlastfile_single(fdp)); } static int fdisused(struct filedesc *fdp, int fd) { KASSERT(fd >= 0 && fd < fdp->fd_nfiles, ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles)); return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0); } /* * Mark a file descriptor as used. */ static void fdused_init(struct filedesc *fdp, int fd) { KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd)); fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd); } static void fdused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); fdused_init(fdp, fd); if (fd == fdp->fd_freefile) fdp->fd_freefile++; } /* * Mark a file descriptor as unused. */ static void fdunused(struct filedesc *fdp, int fd) { FILEDESC_XLOCK_ASSERT(fdp); KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd)); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("fd=%d is still in use", fd)); fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd); if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; } /* * Free a file descriptor. * * Avoid some work if fdp is about to be destroyed. */ static inline void fdefree_last(struct filedescent *fde) { filecaps_free(&fde->fde_caps); } static inline void fdfree(struct filedesc *fdp, int fd) { struct filedescent *fde; FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = NULL; #ifdef CAPABILITIES seqc_write_end(&fde->fde_seqc); #endif fdefree_last(fde); fdunused(fdp, fd); } /* * System calls on descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* ARGSUSED */ int sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) { #ifdef RACCT uint64_t lim; #endif td->td_retval[0] = getmaxfd(td); #ifdef RACCT PROC_LOCK(td->td_proc); lim = racct_get_limit(td->td_proc, RACCT_NOFILE); PROC_UNLOCK(td->td_proc); if (lim < td->td_retval[0]) td->td_retval[0] = lim; #endif return (0); } /* * Duplicate a file descriptor to a particular value. * * Note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* ARGSUSED */ int sys_dup2(struct thread *td, struct dup2_args *uap) { return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to)); } /* * Duplicate a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* ARGSUSED */ int sys_dup(struct thread *td, struct dup_args *uap) { return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0)); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* ARGSUSED */ int sys_fcntl(struct thread *td, struct fcntl_args *uap) { return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg)); } int kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg) { struct flock fl; struct __oflock ofl; intptr_t arg1; int error, newcmd; error = 0; newcmd = cmd; switch (cmd) { case F_OGETLK: case F_OSETLK: case F_OSETLKW: /* * Convert old flock structure to new. 
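 * (struct __oflock has no l_sysid, so it is zeroed below; for F_OGETLK the
 * result is converted back and copied out once kern_fcntl() returns.)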
*/ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl)); fl.l_start = ofl.l_start; fl.l_len = ofl.l_len; fl.l_pid = ofl.l_pid; fl.l_type = ofl.l_type; fl.l_whence = ofl.l_whence; fl.l_sysid = 0; switch (cmd) { case F_OGETLK: newcmd = F_GETLK; break; case F_OSETLK: newcmd = F_SETLK; break; case F_OSETLKW: newcmd = F_SETLKW; break; } arg1 = (intptr_t)&fl; break; case F_GETLK: case F_SETLK: case F_SETLKW: case F_SETLK_REMOTE: error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl)); arg1 = (intptr_t)&fl; break; default: arg1 = arg; break; } if (error) return (error); error = kern_fcntl(td, fd, newcmd, arg1); if (error) return (error); if (cmd == F_OGETLK) { ofl.l_start = fl.l_start; ofl.l_len = fl.l_len; ofl.l_pid = fl.l_pid; ofl.l_type = fl.l_type; ofl.l_whence = fl.l_whence; error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl)); } else if (cmd == F_GETLK) { error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl)); } return (error); } int kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) { struct filedesc *fdp; struct flock *flp; struct file *fp, *fp2; struct filedescent *fde; struct proc *p; struct vnode *vp; struct mount *mp; struct kinfo_file *kif; int error, flg, kif_sz, seals, tmp, got_set, got_cleared; uint64_t bsize; off_t foffset; error = 0; flg = F_POSIX; p = td->td_proc; fdp = p->p_fd; AUDIT_ARG_FD(cmd); AUDIT_ARG_CMD(cmd); switch (cmd) { case F_DUPFD: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp); break; case F_DUPFD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_DUP2FD: tmp = arg; error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp); break; case F_DUP2FD_CLOEXEC: tmp = arg; error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp); break; case F_GETFD: error = EBADF; FILEDESC_SLOCK(fdp); fde = fdeget_noref(fdp, fd); if (fde != NULL) { td->td_retval[0] = (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0; error = 0; } FILEDESC_SUNLOCK(fdp); break; case F_SETFD: error = EBADF; FILEDESC_XLOCK(fdp); fde = fdeget_noref(fdp, fd); if (fde != NULL) { fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) | (arg & FD_CLOEXEC ? 
UF_EXCLOSE : 0); error = 0; } FILEDESC_XUNLOCK(fdp); break; case F_GETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp); if (error != 0) break; td->td_retval[0] = OFLAGS(fp->f_flag); fdrop(fp, td); break; case F_SETFL: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp); if (error != 0) break; if (fp->f_ops == &path_fileops) { fdrop(fp, td); error = EBADF; break; } do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS; } while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); got_set = tmp & ~flg; got_cleared = flg & ~tmp; tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); if (error != 0) goto revert_f_setfl; tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td); if (error == 0) { fdrop(fp, td); break; } atomic_clear_int(&fp->f_flag, FNONBLOCK); tmp = 0; (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td); revert_f_setfl: do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; tmp |= got_cleared; tmp &= ~got_set; } while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0); fdrop(fp, td); break; case F_GETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp); if (error != 0) break; error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td); if (error == 0) td->td_retval[0] = tmp; fdrop(fp, td); break; case F_SETOWN: error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp); if (error != 0) break; tmp = arg; error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td); fdrop(fp, td); break; case F_SETLK_REMOTE: error = priv_check(td, PRIV_NFS_LOCKD); if (error != 0) return (error); flg = F_REMOTE; goto do_setlk; case F_SETLKW: flg |= F_WAIT; /* FALLTHROUGH F_SETLK */ case F_SETLK: do_setlk: flp = (struct flock *)arg; if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) { error = EINVAL; break; } error = fget_unlocked(td, fd, &cap_flock_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if (foffset < 0 || (flp->l_start > 0 && foffset > OFF_MAX - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; switch (flp->l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } if ((p->p_leader->p_flag & P_ADVLOCK) == 0) { PROC_LOCK(p->p_leader); p->p_leader->p_flag |= P_ADVLOCK; PROC_UNLOCK(p->p_leader); } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, flp, flg); break; case F_UNLCK: error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, flg); break; case F_UNLCKSYS: if (flg != F_REMOTE) { error = EINVAL; break; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCKSYS, flp, flg); break; default: error = EINVAL; break; } if (error != 0 || flp->l_type == F_UNLCK || flp->l_type == F_UNLCKSYS) { fdrop(fp, td); break; } /* * Check for a race with close. * * The vnode is now advisory locked (or unlocked, but this case * is not really important) as the caller requested. 
* We had to drop the filedesc lock, so we need to recheck if * the descriptor is still valid, because if it was closed * in the meantime we need to remove advisory lock from the * vnode - close on any descriptor leading to an advisory * locked vnode, removes that lock. * We will return 0 on purpose in that case, as the result of * successful advisory lock might have been externally visible * already. This is fine - effectively we pretend to the caller * that the closing thread was a bit slower and that the * advisory lock succeeded before the close. */ error = fget_unlocked(td, fd, &cap_no_rights, &fp2); if (error != 0) { fdrop(fp, td); break; } if (fp != fp2) { flp->l_whence = SEEK_SET; flp->l_start = 0; flp->l_len = 0; flp->l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, flp, F_POSIX); } fdrop(fp, td); fdrop(fp2, td); break; case F_GETLK: error = fget_unlocked(td, fd, &cap_flock_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { error = EBADF; fdrop(fp, td); break; } flp = (struct flock *)arg; if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK && flp->l_type != F_UNLCK) { error = EINVAL; fdrop(fp, td); break; } if (flp->l_whence == SEEK_CUR) { foffset = foffset_get(fp); if ((flp->l_start > 0 && foffset > OFF_MAX - flp->l_start) || (flp->l_start < 0 && foffset < OFF_MIN - flp->l_start)) { error = EOVERFLOW; fdrop(fp, td); break; } flp->l_start += foffset; } vp = fp->f_vnode; error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp, F_POSIX); fdrop(fp, td); break; case F_ADD_SEALS: error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; error = fo_add_seals(fp, arg); fdrop(fp, td); break; case F_GET_SEALS: error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; if (fo_get_seals(fp, &seals) == 0) td->td_retval[0] = seals; else error = EINVAL; fdrop(fp, td); break; case F_RDAHEAD: arg = arg ? 128 * 1024: 0; /* FALLTHROUGH */ case F_READAHEAD: error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; if (vp->v_type != VREG) { fdrop(fp, td); error = ENOTTY; break; } /* * Exclusive lock synchronizes against f_seqcount reads and * writes in sequential_heuristic(). */ error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { fdrop(fp, td); break; } if (arg >= 0) { bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize; arg = MIN(arg, INT_MAX - bsize + 1); fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX, (arg + bsize - 1) / bsize); atomic_set_int(&fp->f_flag, FRDAHEAD); } else { atomic_clear_int(&fp->f_flag, FRDAHEAD); } VOP_UNLOCK(vp); fdrop(fp, td); break; case F_ISUNIONSTACK: /* * Check if the vnode is part of a union stack (either the * "union" flag from mount(2) or unionfs). * * Prior to introduction of this op libc's readdir would call * fstatfs(2), in effect unnecessarily copying kilobytes of * data just to check fs name and a mount flag. * * Fixing the code to handle everything in the kernel instead * is a non-trivial endeavor and has low priority, thus this * horrible kludge facilitates the current behavior in a much * cheaper manner until someone(tm) sorts this out. */ error = fget_unlocked(td, fd, &cap_no_rights, &fp); if (error != 0) break; if (fp->f_type != DTYPE_VNODE) { fdrop(fp, td); error = EBADF; break; } vp = fp->f_vnode; /* * Since we don't prevent dooming the vnode even non-null mp * found can become immediately stale. 
This is tolerable since * mount points are type-stable (providing safe memory access) * and any vfs op on this vnode going forward will return an * error (meaning return value in this case is meaningless). */ mp = atomic_load_ptr(&vp->v_mount); if (__predict_false(mp == NULL)) { fdrop(fp, td); error = EBADF; break; } td->td_retval[0] = 0; if (mp->mnt_kern_flag & MNTK_UNIONFS || mp->mnt_flag & MNT_UNION) td->td_retval[0] = 1; fdrop(fp, td); break; case F_KINFO: #ifdef CAPABILITY_MODE if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_SYSCALL, &cmd); if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; break; } #endif error = copyin((void *)arg, &kif_sz, sizeof(kif_sz)); if (error != 0) break; if (kif_sz != sizeof(*kif)) { error = EINVAL; break; } kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK | M_ZERO); FILEDESC_SLOCK(fdp); error = fget_cap_noref(fdp, fd, &cap_fcntl_rights, &fp, NULL); if (error == 0 && fhold(fp)) { export_file_to_kinfo(fp, fd, NULL, kif, fdp, 0); FILEDESC_SUNLOCK(fdp); fdrop(fp, td); if ((kif->kf_status & KF_ATTR_VALID) != 0) { kif->kf_structsize = sizeof(*kif); error = copyout(kif, (void *)arg, sizeof(*kif)); } else { error = EBADF; } } else { FILEDESC_SUNLOCK(fdp); if (error == 0) error = EBADF; } free(kif, M_TEMP); break; default: error = EINVAL; break; } return (error); } static int getmaxfd(struct thread *td) { return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc)); } /* * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD). */ int kern_dup(struct thread *td, u_int mode, int flags, int old, int new) { struct filedesc *fdp; struct filedescent *oldfde, *newfde; struct proc *p; struct file *delfp, *oldfp; u_long *oioctls, *nioctls; int error, maxfd; p = td->td_proc; fdp = p->p_fd; oioctls = NULL; MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0); MPASS(mode < FDDUP_LASTMODE); AUDIT_ARG_FD(old); /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */ /* * Verify we have a valid descriptor to dup from and possibly to * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should * return EINVAL when the new descriptor is out of bounds. */ if (old < 0) return (EBADF); if (new < 0) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); maxfd = getmaxfd(td); if (new >= maxfd) return (mode == FDDUP_FCNTL ? EINVAL : EBADF); error = EBADF; FILEDESC_XLOCK(fdp); if (fget_noref(fdp, old) == NULL) goto unlock; if (mode == FDDUP_FIXED && old == new) { td->td_retval[0] = new; if (flags & FDDUP_FLAG_CLOEXEC) fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE; error = 0; goto unlock; } oldfde = &fdp->fd_ofiles[old]; oldfp = oldfde->fde_file; if (!fhold(oldfp)) goto unlock; /* * If the caller specified a file descriptor, make sure the file * table is large enough to hold it, and grab it. Otherwise, just * allocate a new descriptor the usual way. */ switch (mode) { case FDDUP_NORMAL: case FDDUP_FCNTL: if ((error = fdalloc(td, new, &new)) != 0) { fdrop(oldfp, td); goto unlock; } break; case FDDUP_FIXED: if (new >= fdp->fd_nfiles) { /* * The resource limits are here instead of e.g. * fdalloc(), because the file descriptor table may be * shared between processes, so we can't really use * racct_add()/racct_sub(). Instead of counting the * number of actually allocated descriptors, just put * the limit on the size of the file descriptor table. 
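 * Illustrative consequence: dup2(0, 1000) on a fresh table grows it to at
 * least 1001 slots via fdgrowtable_exp(), and with RACCT enabled it is that
 * table size, not the number of live descriptors, that is charged against
 * RACCT_NOFILE.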
*/ #ifdef RACCT if (RACCT_ENABLED()) { error = racct_set_unlocked(p, RACCT_NOFILE, new + 1); if (error != 0) { error = EMFILE; fdrop(oldfp, td); goto unlock; } } #endif fdgrowtable_exp(fdp, new + 1); } if (!fdisused(fdp, new)) fdused(fdp, new); break; default: KASSERT(0, ("%s unsupported mode %d", __func__, mode)); } KASSERT(old != new, ("new fd is same as old")); /* Refetch oldfde because the table may have grown and old one freed. */ oldfde = &fdp->fd_ofiles[old]; KASSERT(oldfp == oldfde->fde_file, ("fdt_ofiles shift from growth observed at fd %d", old)); newfde = &fdp->fd_ofiles[new]; delfp = newfde->fde_file; nioctls = filecaps_copy_prep(&oldfde->fde_caps); /* * Duplicate the source descriptor. */ #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif oioctls = filecaps_free_prep(&newfde->fde_caps); fde_copy(oldfde, newfde); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, nioctls); if ((flags & FDDUP_FLAG_CLOEXEC) != 0) newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE; else newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE; #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif td->td_retval[0] = new; error = 0; if (delfp != NULL) { (void) closefp(fdp, new, delfp, td, true, false); FILEDESC_UNLOCK_ASSERT(fdp); } else { unlock: FILEDESC_XUNLOCK(fdp); } filecaps_free_finish(oioctls); return (error); } static void sigiofree(struct sigio *sigio) { crfree(sigio->sio_ucred); free(sigio, M_SIGIO); } static struct sigio * funsetown_locked(struct sigio *sigio) { struct proc *p; struct pgrp *pg; SIGIO_ASSERT_LOCKED(); if (sigio == NULL) return (NULL); *sigio->sio_myref = NULL; if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; PGRP_LOCK(pg); SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio); PGRP_UNLOCK(pg); } else { p = sigio->sio_proc; PROC_LOCK(p); SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio); PROC_UNLOCK(p); } return (sigio); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(struct sigio **sigiop) { struct sigio *sigio; /* Racy check, consumers must provide synchronization. */ if (*sigiop == NULL) return; SIGIO_LOCK(); sigio = funsetown_locked(*sigiop); SIGIO_UNLOCK(); if (sigio != NULL) sigiofree(sigio); } /* * Free a list of sigio structures. The caller must ensure that new sigio * structures cannot be added after this point. For process groups this is * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves * as an interlock. */ void funsetownlst(struct sigiolst *sigiolst) { struct proc *p; struct pgrp *pg; struct sigio *sigio, *tmp; /* Racy check. */ sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) return; p = NULL; pg = NULL; SIGIO_LOCK(); sigio = SLIST_FIRST(sigiolst); if (sigio == NULL) { SIGIO_UNLOCK(); return; } /* * Every entry of the list should belong to a single proc or pgrp. 
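 * (That is why taking a single PGRP_LOCK() or PROC_LOCK() below is enough
 * to walk the entire list before the entries are freed.)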
*/ if (sigio->sio_pgid < 0) { pg = sigio->sio_pgrp; sx_assert(&proctree_lock, SX_XLOCKED); PGRP_LOCK(pg); } else /* if (sigio->sio_pgid > 0) */ { p = sigio->sio_proc; PROC_LOCK(p); KASSERT((p->p_flag & P_WEXIT) != 0, ("%s: process %p is not exiting", __func__, p)); } SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) { *sigio->sio_myref = NULL; if (pg != NULL) { KASSERT(sigio->sio_pgid < 0, ("Proc sigio in pgrp sigio list")); KASSERT(sigio->sio_pgrp == pg, ("Bogus pgrp in sigio list")); } else /* if (p != NULL) */ { KASSERT(sigio->sio_pgid > 0, ("Pgrp sigio in proc sigio list")); KASSERT(sigio->sio_proc == p, ("Bogus proc in sigio list")); } } if (pg != NULL) PGRP_UNLOCK(pg); else PROC_UNLOCK(p); SIGIO_UNLOCK(); SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp) sigiofree(sigio); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pid_t pgid, struct sigio **sigiop) { struct proc *proc; struct pgrp *pgrp; struct sigio *osigio, *sigio; int ret; if (pgid == 0) { funsetown(sigiop); return (0); } sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK); sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_ucred); sigio->sio_myref = sigiop; ret = 0; if (pgid > 0) { ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc); SIGIO_LOCK(); osigio = funsetown_locked(*sigiop); if (ret == 0) { PROC_LOCK(proc); _PRELE(proc); if ((proc->p_flag & P_WEXIT) != 0) { ret = ESRCH; } else if (proc->p_session != curthread->td_proc->p_session) { /* * Policy - Don't allow a process to FSETOWN a * process in another session. * * Remove this test to allow maximum flexibility * or restrict FSETOWN to the current process or * process group for maximum safety. */ ret = EPERM; } else { sigio->sio_proc = proc; SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); } PROC_UNLOCK(proc); } } else /* if (pgid < 0) */ { sx_slock(&proctree_lock); SIGIO_LOCK(); osigio = funsetown_locked(*sigiop); pgrp = pgfind(-pgid); if (pgrp == NULL) { ret = ESRCH; } else { if (pgrp->pg_session != curthread->td_proc->p_session) { /* * Policy - Don't allow a process to FSETOWN a * process in another session. * * Remove this test to allow maximum flexibility * or restrict FSETOWN to the current process or * process group for maximum safety. */ ret = EPERM; } else { sigio->sio_pgrp = pgrp; SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); } PGRP_UNLOCK(pgrp); } sx_sunlock(&proctree_lock); } if (ret == 0) *sigiop = sigio; SIGIO_UNLOCK(); if (osigio != NULL) sigiofree(osigio); return (ret); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(struct sigio **sigiop) { pid_t pgid; SIGIO_LOCK(); pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0; SIGIO_UNLOCK(); return (pgid); } static int closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool audit) { int error; FILEDESC_XLOCK_ASSERT(fdp); /* * We now hold the fp reference that used to be owned by the * descriptor array. We have to unlock the FILEDESC *AFTER* * knote_fdclose to prevent a race of the fd getting opened, a knote * added, and deleteing a knote for the new fd. */ if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist))) knote_fdclose(td, fd); /* * We need to notify mqueue if the object is of type mqueue. 
*/ if (__predict_false(fp->f_type == DTYPE_MQUEUE)) mq_fdclose(td, fd, fp); FILEDESC_XUNLOCK(fdp); #ifdef AUDIT if (AUDITING_TD(td) && audit) audit_sysclose(td, fd, fp); #endif error = closef(fp, td); /* * All paths leading up to closefp() will have already removed or * replaced the fd in the filedesc table, so a restart would not * operate on the same file. */ if (error == ERESTART) error = EINTR; return (error); } static int closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit) { int error; FILEDESC_XLOCK_ASSERT(fdp); if (holdleaders) { if (td->td_proc->p_fdtol != NULL) { /* * Ask fdfree() to sleep to ensure that all relevant * process leaders can be traversed in closef(). */ fdp->fd_holdleaderscount++; } else { holdleaders = false; } } error = closefp_impl(fdp, fd, fp, td, audit); if (holdleaders) { FILEDESC_XLOCK(fdp); fdp->fd_holdleaderscount--; if (fdp->fd_holdleaderscount == 0 && fdp->fd_holdleaderswakeup != 0) { fdp->fd_holdleaderswakeup = 0; wakeup(&fdp->fd_holdleaderscount); } FILEDESC_XUNLOCK(fdp); } return (error); } static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, bool holdleaders, bool audit) { FILEDESC_XLOCK_ASSERT(fdp); if (__predict_false(td->td_proc->p_fdtol != NULL)) { return (closefp_hl(fdp, fd, fp, td, holdleaders, audit)); } else { return (closefp_impl(fdp, fd, fp, td, audit)); } } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* ARGSUSED */ int sys_close(struct thread *td, struct close_args *uap) { return (kern_close(td, uap->fd)); } int kern_close(struct thread *td, int fd) { struct filedesc *fdp; struct file *fp; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if ((fp = fget_noref(fdp, fd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } fdfree(fdp, fd); /* closefp() drops the FILEDESC lock for us. */ return (closefp(fdp, fd, fp, td, true, true)); } static int close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd) { struct filedesc *fdp; struct fdescenttbl *fdt; struct filedescent *fde; int fd; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); highfd = MIN(highfd, fdt->fdt_nfiles - 1); fd = lowfd; if (__predict_false(fd > highfd)) { goto out_locked; } for (; fd <= highfd; fd++) { fde = &fdt->fdt_ofiles[fd]; if (fde->fde_file != NULL) fde->fde_flags |= UF_EXCLOSE; } out_locked: FILEDESC_XUNLOCK(fdp); return (0); } static int close_range_impl(struct thread *td, u_int lowfd, u_int highfd) { struct filedesc *fdp; const struct fdescenttbl *fdt; struct file *fp; int fd; fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); highfd = MIN(highfd, fdt->fdt_nfiles - 1); fd = lowfd; if (__predict_false(fd > highfd)) { goto out_locked; } for (;;) { fp = fdt->fdt_ofiles[fd].fde_file; if (fp == NULL) { if (fd == highfd) goto out_locked; } else { fdfree(fdp, fd); (void) closefp(fdp, fd, fp, td, true, true); if (fd == highfd) goto out_unlocked; FILEDESC_XLOCK(fdp); fdt = atomic_load_ptr(&fdp->fd_files); } fd++; } out_locked: FILEDESC_XUNLOCK(fdp); out_unlocked: return (0); } int kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd) { /* * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2 * open should not be a usage error. From a close_range() perspective, * close_range(3, ~0U, 0) in the same scenario should also likely not * be a usage error as all fd above 3 are in-fact already closed. 
*/ if (highfd < lowfd) { return (EINVAL); } if ((flags & CLOSE_RANGE_CLOEXEC) != 0) return (close_range_cloexec(td, lowfd, highfd)); return (close_range_impl(td, lowfd, highfd)); } #ifndef _SYS_SYSPROTO_H_ struct close_range_args { u_int lowfd; u_int highfd; int flags; }; #endif int sys_close_range(struct thread *td, struct close_range_args *uap) { AUDIT_ARG_FD(uap->lowfd); AUDIT_ARG_CMD(uap->highfd); AUDIT_ARG_FFLAGS(uap->flags); if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC)) != 0) return (EINVAL); return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd)); } #ifdef COMPAT_FREEBSD12 /* * Close open file descriptors. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd12_closefrom_args { int lowfd; }; #endif /* ARGSUSED */ int freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap) { u_int lowfd; AUDIT_ARG_FD(uap->lowfd); /* * Treat negative starting file descriptor values identical to * closefrom(0) which closes all files. */ lowfd = MAX(0, uap->lowfd); return (kern_close_range(td, 0, lowfd, ~0U)); } #endif /* COMPAT_FREEBSD12 */ #if defined(COMPAT_43) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* ARGSUSED */ int ofstat(struct thread *td, struct ofstat_args *uap) { struct ostat oub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) { cvtstat(&ub, &oub); error = copyout(&oub, uap->sb, sizeof(oub)); } return (error); } #endif /* COMPAT_43 */ #if defined(COMPAT_FREEBSD11) int freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap) { struct stat sb; struct freebsd11_stat osb; int error; error = kern_fstat(td, uap->fd, &sb); if (error != 0) return (error); error = freebsd11_cvtstat(&sb, &osb); if (error == 0) error = copyout(&osb, uap->sb, sizeof(osb)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* ARGSUSED */ int sys_fstat(struct thread *td, struct fstat_args *uap) { struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error == 0) error = copyout(&ub, uap->sb, sizeof(ub)); return (error); } int kern_fstat(struct thread *td, int fd, struct stat *sbp) { struct file *fp; int error; AUDIT_ARG_FD(fd); error = fget(td, fd, &cap_fstat_rights, &fp); if (__predict_false(error != 0)) return (error); AUDIT_ARG_FILE(td->td_proc, fp); error = fo_stat(fp, sbp, td->td_ucred); fdrop(fp, td); #ifdef __STAT_TIME_T_EXT sbp->st_atim_ext = 0; sbp->st_mtim_ext = 0; sbp->st_ctim_ext = 0; sbp->st_btim_ext = 0; #endif #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrstat_error(sbp, error); #endif return (error); } #if defined(COMPAT_FREEBSD11) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct freebsd11_nfstat_args { int fd; struct nstat *sb; }; #endif /* ARGSUSED */ int freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap) { struct nstat nub; struct stat ub; int error; error = kern_fstat(td, uap->fd, &ub); if (error != 0) return (error); error = freebsd11_cvtnstat(&ub, &nub); if (error != 0) error = copyout(&nub, uap->sb, sizeof(nub)); return (error); } #endif /* COMPAT_FREEBSD11 */ /* * Return pathconf information about a file descriptor. 
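 * For descriptors without a vnode, only pipes and sockets are handled, and
 * only for _PC_PIPE_BUF (answered with PIPE_BUF); other names yield EINVAL
 * and other file types EOPNOTSUPP.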
*/ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* ARGSUSED */ int sys_fpathconf(struct thread *td, struct fpathconf_args *uap) { long value; int error; error = kern_fpathconf(td, uap->fd, uap->name, &value); if (error == 0) td->td_retval[0] = value; return (error); } int kern_fpathconf(struct thread *td, int fd, int name, long *valuep) { struct file *fp; struct vnode *vp; int error; error = fget(td, fd, &cap_fpathconf_rights, &fp); if (error != 0) return (error); if (name == _PC_ASYNC_IO) { *valuep = _POSIX_ASYNCHRONOUS_IO; goto out; } vp = fp->f_vnode; if (vp != NULL) { vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_PATHCONF(vp, name, valuep); VOP_UNLOCK(vp); } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) { if (name != _PC_PIPE_BUF) { error = EINVAL; } else { *valuep = PIPE_BUF; error = 0; } } else { error = EOPNOTSUPP; } out: fdrop(fp, td); return (error); } /* * Copy filecaps structure allocating memory for ioctls array if needed. * * The last parameter indicates whether the fdtable is locked. If it is not and * ioctls are encountered, copying fails and the caller must lock the table. * * Note that if the table was not locked, the caller has to check the relevant * sequence counter to determine whether the operation was successful. */ bool filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked) { size_t size; if (src->fc_ioctls != NULL && !locked) return (false); memcpy(dst, src, sizeof(*src)); if (src->fc_ioctls == NULL) return (true); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK); memcpy(dst->fc_ioctls, src->fc_ioctls, size); return (true); } static u_long * filecaps_copy_prep(const struct filecaps *src) { u_long *ioctls; size_t size; if (__predict_true(src->fc_ioctls == NULL)) return (NULL); KASSERT(src->fc_nioctls > 0, ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls)); size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; ioctls = malloc(size, M_FILECAPS, M_WAITOK); return (ioctls); } static void filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst, u_long *ioctls) { size_t size; *dst = *src; if (__predict_true(src->fc_ioctls == NULL)) { MPASS(ioctls == NULL); return; } size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls; dst->fc_ioctls = ioctls; bcopy(src->fc_ioctls, dst->fc_ioctls, size); } /* * Move filecaps structure to the new place and clear the old place. */ void filecaps_move(struct filecaps *src, struct filecaps *dst) { *dst = *src; bzero(src, sizeof(*src)); } /* * Fill the given filecaps structure with full rights. */ static void filecaps_fill(struct filecaps *fcaps) { CAP_ALL(&fcaps->fc_rights); fcaps->fc_ioctls = NULL; fcaps->fc_nioctls = -1; fcaps->fc_fcntls = CAP_FCNTL_ALL; } /* * Free memory allocated within filecaps structure. */ static void filecaps_free_ioctl(struct filecaps *fcaps) { free(fcaps->fc_ioctls, M_FILECAPS); fcaps->fc_ioctls = NULL; } void filecaps_free(struct filecaps *fcaps) { filecaps_free_ioctl(fcaps); bzero(fcaps, sizeof(*fcaps)); } static u_long * filecaps_free_prep(struct filecaps *fcaps) { u_long *ioctls; ioctls = fcaps->fc_ioctls; bzero(fcaps, sizeof(*fcaps)); return (ioctls); } static void filecaps_free_finish(u_long *ioctls) { free(ioctls, M_FILECAPS); } /* * Validate the given filecaps structure. 
*/ static void filecaps_validate(const struct filecaps *fcaps, const char *func) { KASSERT(cap_rights_is_valid(&fcaps->fc_rights), ("%s: invalid rights", func)); KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0, ("%s: invalid fcntls", func)); KASSERT(fcaps->fc_fcntls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL), ("%s: fcntls without CAP_FCNTL", func)); /* * open calls without WANTIOCTLCAPS free caps but leave the counter */ #if 0 KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 : (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0), ("%s: invalid ioctls", func)); #endif KASSERT(fcaps->fc_nioctls == 0 || cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL), ("%s: ioctls without CAP_IOCTL", func)); } static void fdgrowtable_exp(struct filedesc *fdp, int nfd) { int nfd1; FILEDESC_XLOCK_ASSERT(fdp); nfd1 = fdp->fd_nfiles * 2; if (nfd1 < nfd) nfd1 = nfd; fdgrowtable(fdp, nfd1); } /* * Grow the file table to accommodate (at least) nfd descriptors. */ static void fdgrowtable(struct filedesc *fdp, int nfd) { struct filedesc0 *fdp0; struct freetable *ft; struct fdescenttbl *ntable; struct fdescenttbl *otable; int nnfiles, onfiles; NDSLOTTYPE *nmap, *omap; KASSERT(fdp->fd_nfiles > 0, ("zero-length file table")); /* save old values */ onfiles = fdp->fd_nfiles; otable = fdp->fd_files; omap = fdp->fd_map; /* compute the size of the new table */ nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */ if (nnfiles <= onfiles) /* the table is already large enough */ return; /* * Allocate a new table. We need enough space for the number of * entries, file entries themselves and the struct freetable we will use * when we decommission the table and place it on the freelist. * We place the struct freetable in the middle so we don't have * to worry about padding. */ ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) + nnfiles * sizeof(ntable->fdt_ofiles[0]) + sizeof(struct freetable), M_FILEDESC, M_ZERO | M_WAITOK); /* copy the old data */ ntable->fdt_nfiles = nnfiles; memcpy(ntable->fdt_ofiles, otable->fdt_ofiles, onfiles * sizeof(ntable->fdt_ofiles[0])); /* * Allocate a new map only if the old is not large enough. It will * grow at a slower rate than the table as it can map more * entries than the table can hold. */ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) { nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC, M_ZERO | M_WAITOK); /* copy over the old data and update the pointer */ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap)); fdp->fd_map = nmap; } /* * Make sure that ntable is correctly initialized before we replace * fd_files poiner. Otherwise fget_unlocked() may see inconsistent * data. */ atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable); /* * Free the old file table when not shared by other threads or processes. * The old file table is considered to be shared when either are true: * - The process has more than one thread. * - The file descriptor table has been shared via fdshare(). * * When shared, the old file table will be placed on a freelist * which will be processed when the struct filedesc is released. * * Note that if onfiles == NDFILE, we're dealing with the original * static allocation contained within (struct filedesc0 *)fdp, * which must not be freed. */ if (onfiles > NDFILE) { /* * Note we may be called here from fdinit while allocating a * table for a new process in which case ->p_fd points * elsewhere. 
*/ if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) { free(otable, M_FILEDESC); } else { ft = (struct freetable *)&otable->fdt_ofiles[onfiles]; fdp0 = (struct filedesc0 *)fdp; ft->ft_table = otable; SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next); } } /* * The map does not have the same possibility of threads still * holding references to it. So always free it as long as it * does not reference the original static allocation. */ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE)) free(omap, M_FILEDESC); } /* * Allocate a file descriptor for the process. */ int fdalloc(struct thread *td, int minfd, int *result) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int fd, maxfd, allocfd; #ifdef RACCT int error; #endif FILEDESC_XLOCK_ASSERT(fdp); if (fdp->fd_freefile > minfd) minfd = fdp->fd_freefile; maxfd = getmaxfd(td); /* * Search the bitmap for a free descriptor starting at minfd. * If none is found, grow the file table. */ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles); if (__predict_false(fd >= maxfd)) return (EMFILE); if (__predict_false(fd >= fdp->fd_nfiles)) { allocfd = min(fd * 2, maxfd); #ifdef RACCT if (RACCT_ENABLED()) { error = racct_set_unlocked(p, RACCT_NOFILE, allocfd); if (error != 0) return (EMFILE); } #endif /* * fd is already equal to first free descriptor >= minfd, so * we only need to grow the table and we are done. */ fdgrowtable_exp(fdp, allocfd); } /* * Perform some sanity checks, then mark the file descriptor as * used and return it to the caller. */ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles), ("invalid descriptor %d", fd)); KASSERT(!fdisused(fdp, fd), ("fd_first_free() returned non-free descriptor")); KASSERT(fdp->fd_ofiles[fd].fde_file == NULL, ("file descriptor isn't free")); fdused(fdp, fd); *result = fd; return (0); } /* * Allocate n file descriptors for the process. */ int fdallocn(struct thread *td, int minfd, int *fds, int n) { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; int i; FILEDESC_XLOCK_ASSERT(fdp); for (i = 0; i < n; i++) if (fdalloc(td, 0, &fds[i]) != 0) break; if (i < n) { for (i--; i >= 0; i--) fdunused(fdp, fds[i]); return (EMFILE); } return (0); } /* * Create a new open file structure and allocate a file descriptor for the * process that refers to it. We add one reference to the file for the * descriptor table and one reference for resultfp. This is to prevent us * being preempted and the entry in the descriptor table closed after we * release the FILEDESC lock. */ int falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { struct file *fp; int error, fd; MPASS(resultfp != NULL); MPASS(resultfd != NULL); error = _falloc_noinstall(td, &fp, 2); if (__predict_false(error != 0)) { return (error); } error = finstall_refed(td, fp, &fd, flags, fcaps); if (__predict_false(error != 0)) { falloc_abort(td, fp); return (error); } *resultfp = fp; *resultfd = fd; return (0); } /* * Create a new open file structure without allocating a file descriptor. 
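 * The caller supplies n, the initial f_count; falloc_caps() passes 2 so the
 * new file survives even if another thread closes the freshly installed
 * descriptor before the caller has looked at *resultfp.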
*/ int _falloc_noinstall(struct thread *td, struct file **resultfp, u_int n) { struct file *fp; int maxuserfiles = maxfiles - (maxfiles / 20); int openfiles_new; static struct timeval lastfail; static int curfail; KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__)); MPASS(n > 0); openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1; if ((openfiles_new >= maxuserfiles && priv_check(td, PRIV_MAXFILES) != 0) || openfiles_new >= maxfiles) { atomic_subtract_int(&openfiles, 1); if (ppsratecheck(&lastfail, &curfail, 1)) { printf("kern.maxfiles limit exceeded by uid %i, (%s) " "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm); } return (ENFILE); } fp = uma_zalloc(file_zone, M_WAITOK); bzero(fp, sizeof(*fp)); refcount_init(&fp->f_count, n); fp->f_cred = crhold(td->td_ucred); fp->f_ops = &badfileops; *resultfp = fp; return (0); } void falloc_abort(struct thread *td, struct file *fp) { /* * For assertion purposes. */ refcount_init(&fp->f_count, 0); _fdrop(fp, td); } /* * Install a file in a file descriptor table. */ void _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags, struct filecaps *fcaps) { struct filedescent *fde; MPASS(fp != NULL); if (fcaps != NULL) filecaps_validate(fcaps, __func__); FILEDESC_XLOCK_ASSERT(fdp); fde = &fdp->fd_ofiles[fd]; #ifdef CAPABILITIES seqc_write_begin(&fde->fde_seqc); #endif fde->fde_file = fp; fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0; if (fcaps != NULL) filecaps_move(fcaps, &fde->fde_caps); else filecaps_fill(&fde->fde_caps); #ifdef CAPABILITIES seqc_write_end(&fde->fde_seqc); #endif } int finstall_refed(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { struct filedesc *fdp = td->td_proc->p_fd; int error; MPASS(fd != NULL); FILEDESC_XLOCK(fdp); error = fdalloc(td, 0, fd); if (__predict_true(error == 0)) { _finstall(fdp, fp, *fd, flags, fcaps); } FILEDESC_XUNLOCK(fdp); return (error); } int finstall(struct thread *td, struct file *fp, int *fd, int flags, struct filecaps *fcaps) { int error; MPASS(fd != NULL); if (!fhold(fp)) return (EBADF); error = finstall_refed(td, fp, fd, flags, fcaps); if (__predict_false(error != 0)) { fdrop(fp, td); } return (error); } /* * Build a new filedesc structure from another. * * If fdp is not NULL, return with it shared locked. */ struct filedesc * fdinit(void) { struct filedesc0 *newfdp0; struct filedesc *newfdp; newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO); newfdp = &newfdp0->fd_fd; /* Create the file descriptor table. */ FILEDESC_LOCK_INIT(newfdp); refcount_init(&newfdp->fd_refcnt, 1); refcount_init(&newfdp->fd_holdcnt, 1); newfdp->fd_map = newfdp0->fd_dmap; newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles; newfdp->fd_files->fdt_nfiles = NDFILE; return (newfdp); } /* * Build a pwddesc structure from another. * Copy the current, root, and jail root vnode references. * * If pdp is not NULL, return with it shared locked. */ struct pwddesc * pdinit(struct pwddesc *pdp, bool keeplock) { struct pwddesc *newpdp; struct pwd *newpwd; newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO); PWDDESC_LOCK_INIT(newpdp); refcount_init(&newpdp->pd_refcount, 1); newpdp->pd_cmask = CMASK; if (pdp == NULL) { newpwd = pwd_alloc(); smr_serialized_store(&newpdp->pd_pwd, newpwd, true); return (newpdp); } PWDDESC_XLOCK(pdp); newpwd = pwd_hold_pwddesc(pdp); smr_serialized_store(&newpdp->pd_pwd, newpwd, true); if (!keeplock) PWDDESC_XUNLOCK(pdp); return (newpdp); } /* * Hold either filedesc or pwddesc of the passed process. 
* * The process lock is used to synchronize against the target exiting and * freeing the data. * * Clearing can be ilustrated in 3 steps: * 1. set the pointer to NULL. Either routine can race against it, hence * atomic_load_ptr. * 2. observe the process lock as not taken. Until then fdhold/pdhold can * race to either still see the pointer or find NULL. It is still safe to * grab a reference as clearing is stalled. * 3. after the lock is observed as not taken, any fdhold/pdhold calls are * guaranteed to see NULL, making it safe to finish clearing */ static struct filedesc * fdhold(struct proc *p) { struct filedesc *fdp; PROC_LOCK_ASSERT(p, MA_OWNED); fdp = atomic_load_ptr(&p->p_fd); if (fdp != NULL) refcount_acquire(&fdp->fd_holdcnt); return (fdp); } static struct pwddesc * pdhold(struct proc *p) { struct pwddesc *pdp; PROC_LOCK_ASSERT(p, MA_OWNED); pdp = atomic_load_ptr(&p->p_pd); if (pdp != NULL) refcount_acquire(&pdp->pd_refcount); return (pdp); } static void fddrop(struct filedesc *fdp) { if (refcount_load(&fdp->fd_holdcnt) > 1) { if (refcount_release(&fdp->fd_holdcnt) == 0) return; } FILEDESC_LOCK_DESTROY(fdp); uma_zfree(filedesc0_zone, fdp); } static void pddrop(struct pwddesc *pdp) { struct pwd *pwd; if (refcount_release_if_not_last(&pdp->pd_refcount)) return; PWDDESC_XLOCK(pdp); if (refcount_release(&pdp->pd_refcount) == 0) { PWDDESC_XUNLOCK(pdp); return; } pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); pwd_set(pdp, NULL); PWDDESC_XUNLOCK(pdp); pwd_drop(pwd); PWDDESC_LOCK_DESTROY(pdp); free(pdp, M_PWDDESC); } /* * Share a filedesc structure. */ struct filedesc * fdshare(struct filedesc *fdp) { refcount_acquire(&fdp->fd_refcnt); return (fdp); } /* * Share a pwddesc structure. */ struct pwddesc * pdshare(struct pwddesc *pdp) { refcount_acquire(&pdp->pd_refcount); return (pdp); } /* * Unshare a filedesc structure, if necessary by making a copy */ void fdunshare(struct thread *td) { struct filedesc *tmp; struct proc *p = td->td_proc; if (refcount_load(&p->p_fd->fd_refcnt) == 1) return; tmp = fdcopy(p->p_fd); fdescfree(td); p->p_fd = tmp; } /* * Unshare a pwddesc structure. */ void pdunshare(struct thread *td) { struct pwddesc *pdp; struct proc *p; p = td->td_proc; /* Not shared. */ if (refcount_load(&p->p_pd->pd_refcount) == 1) return; pdp = pdcopy(p->p_pd); pdescfree(td); p->p_pd = pdp; } /* * Copy a filedesc structure. A NULL pointer in returns a NULL reference, * this is to ease callers, not catch errors. */ struct filedesc * fdcopy(struct filedesc *fdp) { struct filedesc *newfdp; struct filedescent *nfde, *ofde; int i, lastfile; MPASS(fdp != NULL); newfdp = fdinit(); FILEDESC_SLOCK(fdp); for (;;) { lastfile = fdlastfile(fdp); if (lastfile < newfdp->fd_nfiles) break; FILEDESC_SUNLOCK(fdp); fdgrowtable(newfdp, lastfile + 1); FILEDESC_SLOCK(fdp); } /* copy all passable descriptors (i.e. not kqueue) */ newfdp->fd_freefile = fdp->fd_freefile; FILEDESC_FOREACH_FDE(fdp, i, ofde) { if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 || !fhold(ofde->fde_file)) { if (newfdp->fd_freefile == fdp->fd_freefile) newfdp->fd_freefile = i; continue; } nfde = &newfdp->fd_ofiles[i]; *nfde = *ofde; filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true); fdused_init(newfdp, i); } MPASS(newfdp->fd_freefile != -1); FILEDESC_SUNLOCK(fdp); return (newfdp); } /* * Copy a pwddesc structure. */ struct pwddesc * pdcopy(struct pwddesc *pdp) { struct pwddesc *newpdp; MPASS(pdp != NULL); newpdp = pdinit(pdp, true); newpdp->pd_cmask = pdp->pd_cmask; PWDDESC_XUNLOCK(pdp); return (newpdp); } /* * Clear POSIX style locks. 
This is only used when fdp looses a reference (i.e. * one of processes using it exits) and the table used to be shared. */ static void fdclearlocks(struct thread *td) { struct filedesc *fdp; struct filedesc_to_leader *fdtol; struct flock lf; struct file *fp; struct proc *p; struct vnode *vp; int i; p = td->td_proc; fdp = p->p_fd; fdtol = p->p_fdtol; MPASS(fdtol != NULL); FILEDESC_XLOCK(fdp); KASSERT(fdtol->fdl_refcount > 0, ("filedesc_to_refcount botch: fdl_refcount=%d", fdtol->fdl_refcount)); if (fdtol->fdl_refcount == 1 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { FILEDESC_FOREACH_FP(fdp, i, fp) { if (fp->f_type != DTYPE_VNODE || !fhold(fp)) continue; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdrop(fp, td); } } retry: if (fdtol->fdl_refcount == 1) { if (fdp->fd_holdleaderscount > 0 && (p->p_leader->p_flag & P_ADVLOCK) != 0) { /* * close() or kern_dup() has cleared a reference * in a shared file descriptor table. */ fdp->fd_holdleaderswakeup = 1; sx_sleep(&fdp->fd_holdleaderscount, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } if (fdtol->fdl_holdcount > 0) { /* * Ensure that fdtol->fdl_leader remains * valid in closef(). */ fdtol->fdl_wakeup = 1; sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0); goto retry; } } fdtol->fdl_refcount--; if (fdtol->fdl_refcount == 0 && fdtol->fdl_holdcount == 0) { fdtol->fdl_next->fdl_prev = fdtol->fdl_prev; fdtol->fdl_prev->fdl_next = fdtol->fdl_next; } else fdtol = NULL; p->p_fdtol = NULL; FILEDESC_XUNLOCK(fdp); if (fdtol != NULL) free(fdtol, M_FILEDESC_TO_LEADER); } /* * Release a filedesc structure. */ static void fdescfree_fds(struct thread *td, struct filedesc *fdp) { struct filedesc0 *fdp0; struct freetable *ft, *tft; struct filedescent *fde; struct file *fp; int i; KASSERT(refcount_load(&fdp->fd_refcnt) == 0, ("%s: fd table %p carries references", __func__, fdp)); /* * Serialize with threads iterating over the table, if any. */ if (refcount_load(&fdp->fd_holdcnt) > 1) { FILEDESC_XLOCK(fdp); FILEDESC_XUNLOCK(fdp); } FILEDESC_FOREACH_FDE(fdp, i, fde) { fp = fde->fde_file; fdefree_last(fde); (void) closef(fp, td); } if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE)) free(fdp->fd_map, M_FILEDESC); if (fdp->fd_nfiles > NDFILE) free(fdp->fd_files, M_FILEDESC); fdp0 = (struct filedesc0 *)fdp; SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft) free(ft->ft_table, M_FILEDESC); fddrop(fdp); } void fdescfree(struct thread *td) { struct proc *p; struct filedesc *fdp; p = td->td_proc; fdp = p->p_fd; MPASS(fdp != NULL); #ifdef RACCT if (RACCT_ENABLED()) racct_set_unlocked(p, RACCT_NOFILE, 0); #endif if (p->p_fdtol != NULL) fdclearlocks(td); /* * Check fdhold for an explanation. */ atomic_store_ptr(&p->p_fd, NULL); atomic_thread_fence_seq_cst(); PROC_WAIT_UNLOCKED(p); if (refcount_release(&fdp->fd_refcnt) == 0) return; fdescfree_fds(td, fdp); } void pdescfree(struct thread *td) { struct proc *p; struct pwddesc *pdp; p = td->td_proc; pdp = p->p_pd; MPASS(pdp != NULL); /* * Check pdhold for an explanation. */ atomic_store_ptr(&p->p_pd, NULL); atomic_thread_fence_seq_cst(); PROC_WAIT_UNLOCKED(p); pddrop(pdp); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. 
We check for filesystems where * the vnode can change out from under us after execve (like [lin]procfs). * * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't check for setugidness since we know we are. */ static bool is_unsafe(struct file *fp) { struct vnode *vp; if (fp->f_type != DTYPE_VNODE) return (false); vp = fp->f_vnode; return ((vp->v_vflag & VV_PROCDEP) != 0); } /* * Make this setguid thing safe, if at all possible. */ void fdsetugidsafety(struct thread *td) { struct filedesc *fdp; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); for (i = 0; i <= 2; i++) { fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { FILEDESC_XLOCK(fdp); knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); } } } /* * If a specific file object occupies a specific file descriptor, close the * file descriptor entry and drop a reference on the file object. This is a * convenience function to handle a subsequent error in a function that calls * falloc() that handles the race that another thread might have closed the * file descriptor out from under the thread creating the file object. */ void fdclose(struct thread *td, struct file *fp, int idx) { struct filedesc *fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); if (fdp->fd_ofiles[idx].fde_file == fp) { fdfree(fdp, idx); FILEDESC_XUNLOCK(fdp); fdrop(fp, td); } else FILEDESC_XUNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(struct thread *td) { struct filedesc *fdp; struct filedescent *fde; struct file *fp; int i; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); FILEDESC_FOREACH_FDE(fdp, i, fde) { fp = fde->fde_file; if (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE)) { FILEDESC_XLOCK(fdp); fdfree(fdp, i); (void) closefp(fdp, i, fp, td, false, false); FILEDESC_UNLOCK_ASSERT(fdp); } } } /* * It is unsafe for set[ug]id processes to be started with file * descriptors 0..2 closed, as these descriptors are given implicit * significance in the Standard C library. fdcheckstd() will create a * descriptor referencing /dev/null for each of stdin, stdout, and * stderr that is not already open. */ int fdcheckstd(struct thread *td) { struct filedesc *fdp; register_t save; int i, error, devnull; fdp = td->td_proc->p_fd; KASSERT(refcount_load(&fdp->fd_refcnt) == 1, ("the fdtable should not be shared")); MPASS(fdp->fd_nfiles >= 3); devnull = -1; for (i = 0; i <= 2; i++) { if (fdp->fd_ofiles[i].fde_file != NULL) continue; save = td->td_retval[0]; if (devnull != -1) { error = kern_dup(td, FDDUP_FIXED, 0, devnull, i); } else { error = kern_openat(td, AT_FDCWD, "/dev/null", UIO_SYSSPACE, O_RDWR, 0); if (error == 0) { devnull = td->td_retval[0]; KASSERT(devnull == i, ("we didn't get our fd")); } } td->td_retval[0] = save; if (error != 0) return (error); } return (0); } /* * Internal form of close. Decrement reference count on file structure. * Note: td may be NULL when closing a file that was being passed in a * message. */ int closef(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; struct filedesc_to_leader *fdtol; struct filedesc *fdp; MPASS(td != NULL); /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. 
This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor, and the thread pointer * will be NULL. Callers should be careful only to pass a * NULL thread pointer when there really is no owning * context that might have locks, or the locks will be * leaked. */ if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } fdtol = td->td_proc->p_fdtol; if (fdtol != NULL) { /* * Handle special case where file descriptor table is * shared between multiple process leaders. */ fdp = td->td_proc->p_fd; FILEDESC_XLOCK(fdp); for (fdtol = fdtol->fdl_next; fdtol != td->td_proc->p_fdtol; fdtol = fdtol->fdl_next) { if ((fdtol->fdl_leader->p_flag & P_ADVLOCK) == 0) continue; fdtol->fdl_holdcount++; FILEDESC_XUNLOCK(fdp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = fp->f_vnode; (void) VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf, F_POSIX); FILEDESC_XLOCK(fdp); fdtol->fdl_holdcount--; if (fdtol->fdl_holdcount == 0 && fdtol->fdl_wakeup != 0) { fdtol->fdl_wakeup = 0; wakeup(fdtol); } } FILEDESC_XUNLOCK(fdp); } } return (fdrop_close(fp, td)); } /* * Hack for file descriptor passing code. */ void closef_nothread(struct file *fp) { fdrop(fp, NULL); } /* * Initialize the file pointer with the specified properties. * * The ops are set with release semantics to be certain that the flags, type, * and data are visible when ops is. This is to prevent ops methods from being * called with bad data. 
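 *
 * The change below also constifies the ops argument, so a file type can
 * keep its method table in read-only memory.  A hypothetical consumer
 * (the my_* names are placeholders, not real symbols):
 *
 *   static fo_rdwr_t   my_read;
 *   static fo_close_t  my_close;
 *
 *   static const struct fileops my_fileops = {
 *       .fo_read  = my_read,
 *       .fo_close = my_close,
 *       .fo_flags = DFLAG_PASSABLE,
 *       // remaining methods omitted for brevity; a real file type fills
 *       // them in or points them at the invfo_* stubs
 *   };
 *
 *   finit(fp, FREAD, DTYPE_DEV, my_softc, &my_fileops);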
*/ void -finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops) +finit(struct file *fp, u_int flag, short type, void *data, + const struct fileops *ops) { fp->f_data = data; fp->f_flag = flag; fp->f_type = type; atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops); } void -finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops) +finit_vnode(struct file *fp, u_int flag, void *data, const struct fileops *ops) { fp->f_seqcount[UIO_READ] = 1; fp->f_seqcount[UIO_WRITE] = 1; finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, data, ops); } int fget_cap_noref(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedescent *fde; int error; FILEDESC_LOCK_ASSERT(fdp); *fpp = NULL; fde = fdeget_noref(fdp, fd); if (fde == NULL) { error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights_fde_inline(fde), needrightsp); if (error != 0) goto out; #endif if (havecapsp != NULL) filecaps_copy(&fde->fde_caps, havecapsp, true); *fpp = fde->fde_file; error = 0; out: return (error); } #ifdef CAPABILITIES int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { struct filedesc *fdp = td->td_proc->p_fd; int error; struct file *fp; seqc_t seq; *fpp = NULL; for (;;) { error = fget_unlocked_seq(td, fd, needrightsp, &fp, &seq); if (error != 0) return (error); if (havecapsp != NULL) { if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecapsp, false)) { fdrop(fp, td); goto get_locked; } } if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } *fpp = fp; return (0); get_locked: FILEDESC_SLOCK(fdp); error = fget_cap_noref(fdp, fd, needrightsp, fpp, havecapsp); if (error == 0 && !fhold(*fpp)) error = EBADF; FILEDESC_SUNLOCK(fdp); return (error); } #else int fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, struct filecaps *havecapsp) { int error; error = fget_unlocked(td, fd, needrightsp, fpp); if (havecapsp != NULL && error == 0) filecaps_fill(havecapsp); return (error); } #endif int fget_remote(struct thread *td, struct proc *p, int fd, struct file **fpp) { struct filedesc *fdp; struct file *fp; int error; if (p == td->td_proc) /* curproc */ return (fget_unlocked(td, fd, &cap_no_rights, fpp)); PROC_LOCK(p); fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) return (ENOENT); FILEDESC_SLOCK(fdp); if (refcount_load(&fdp->fd_refcnt) != 0) { fp = fget_noref(fdp, fd); if (fp != NULL && fhold(fp)) { *fpp = fp; error = 0; } else { error = EBADF; } } else { error = ENOENT; } FILEDESC_SUNLOCK(fdp); fddrop(fdp); return (error); } #ifdef CAPABILITIES int fgetvp_lookup_smr(struct nameidata *ndp, struct vnode **vpp, bool *fsearch) { const struct filedescent *fde; const struct fdescenttbl *fdt; struct filedesc *fdp; struct file *fp; struct vnode *vp; const cap_rights_t *haverights; cap_rights_t rights; seqc_t seq; int fd; VFS_SMR_ASSERT_ENTERED(); fd = ndp->ni_dirfd; rights = *ndp->ni_rightsneeded; cap_rights_set_one(&rights, CAP_LOOKUP); fdp = curproc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = cap_rights_fde_inline(fde); fp = fde->fde_file; if (__predict_false(fp == NULL)) return (EAGAIN); if (__predict_false(cap_check_inline_transient(haverights, &rights))) return (EAGAIN); *fsearch = ((fp->f_flag & FSEARCH) != 0); vp = fp->f_vnode; if (__predict_false(vp == 
NULL)) { return (EAGAIN); } if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) { return (EAGAIN); } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) return (EAGAIN); /* * If file descriptor doesn't have all rights, * all lookups relative to it must also be * strictly relative. * * Not yet supported by fast path. */ CAP_ALL(&rights); if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || ndp->ni_filecaps.fc_nioctls != -1) { #ifdef notyet ndp->ni_lcf |= NI_LCF_STRICTREL; #else return (EAGAIN); #endif } *vpp = vp; return (0); } #else int fgetvp_lookup_smr(struct nameidata *ndp, struct vnode **vpp, bool *fsearch) { const struct fdescenttbl *fdt; struct filedesc *fdp; struct file *fp; struct vnode *vp; int fd; VFS_SMR_ASSERT_ENTERED(); fd = ndp->ni_dirfd; fdp = curproc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); fp = fdt->fdt_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EAGAIN); *fsearch = ((fp->f_flag & FSEARCH) != 0); vp = fp->f_vnode; if (__predict_false(vp == NULL || vp->v_type != VDIR)) { return (EAGAIN); } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) return (EAGAIN); filecaps_fill(&ndp->ni_filecaps); *vpp = vp; return (0); } #endif int fgetvp_lookup(struct nameidata *ndp, struct vnode **vpp) { struct thread *td; struct file *fp; struct vnode *vp; struct componentname *cnp; cap_rights_t rights; int error; td = curthread; rights = *ndp->ni_rightsneeded; cap_rights_set_one(&rights, CAP_LOOKUP); cnp = &ndp->ni_cnd; error = fget_cap(td, ndp->ni_dirfd, &rights, &fp, &ndp->ni_filecaps); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { error = EBADF; goto out_free; } vp = fp->f_vnode; if (__predict_false(vp == NULL)) { error = ENOTDIR; goto out_free; } vrefact(vp); /* * XXX does not check for VDIR, handled by namei_setup */ if ((fp->f_flag & FSEARCH) != 0) cnp->cn_flags |= NOEXECCHECK; fdrop(fp, td); #ifdef CAPABILITIES /* * If file descriptor doesn't have all rights, * all lookups relative to it must also be * strictly relative. */ CAP_ALL(&rights); if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || ndp->ni_filecaps.fc_nioctls != -1) { ndp->ni_lcf |= NI_LCF_STRICTREL; ndp->ni_resflags |= NIRES_STRICTREL; } #endif /* * TODO: avoid copying ioctl caps if it can be helped to begin with */ if ((cnp->cn_flags & WANTIOCTLCAPS) == 0) filecaps_free_ioctl(&ndp->ni_filecaps); *vpp = vp; return (0); out_free: filecaps_free(&ndp->ni_filecaps); fdrop(fp, td); return (error); } /* * Fetch the descriptor locklessly. * * We avoid fdrop() races by never raising a refcount above 0. To accomplish * this we have to use a cmpset loop rather than an atomic_add. The descriptor * must be re-verified once we acquire a reference to be certain that the * identity is still correct and we did not lose a race due to preemption. * * Force a reload of fdt when looping. Another thread could reallocate * the table before this fd was closed, so it is possible that there is * a stale fp pointer in cached version. 
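 *
 * Stripped of the capability checks, the retry skeleton shared by both
 * variants below boils down to (sketch):
 *
 *   for (;;) {
 *       fp = fdt->fdt_ofiles[fd].fde_file;
 *       if (fp == NULL)
 *           return (EBADF);
 *       if (!refcount_acquire_if_not_zero(&fp->f_count)) {
 *           fdt = atomic_load_ptr(&fdp->fd_files);  // table may have been
 *           continue;                               // reallocated, reload
 *       }
 *       atomic_thread_fence_acq();
 *       fdt = fdp->fd_files;                        // re-read for check
 *       if (fp == fdt->fdt_ofiles[fd].fde_file)     // still the same file?
 *           break;
 *       fdrop(fp, td);                              // lost a race, retry
 *   }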
*/ #ifdef CAPABILITIES static int fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp) { struct filedesc *fdp; const struct filedescent *fde; const struct fdescenttbl *fdt; struct file *fp; seqc_t seq; cap_rights_t haverights; int error; fdp = td->td_proc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); for (;;) { seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = *cap_rights_fde_inline(fde); fp = fde->fde_file; if (__predict_false(fp == NULL)) { if (seqc_consistent(fd_seqc(fdt, fd), seq)) return (EBADF); fdt = atomic_load_ptr(&fdp->fd_files); continue; } error = cap_check_inline(&haverights, needrightsp); if (__predict_false(error != 0)) { if (seqc_consistent(fd_seqc(fdt, fd), seq)) return (error); fdt = atomic_load_ptr(&fdp->fd_files); continue; } if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { fdt = atomic_load_ptr(&fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (seqc_consistent_no_fence(fd_seqc(fdt, fd), seq)) break; fdrop(fp, td); } *fpp = fp; if (seqp != NULL) { *seqp = seq; } return (0); } #else static int fget_unlocked_seq(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp __unused) { struct filedesc *fdp; const struct fdescenttbl *fdt; struct file *fp; fdp = td->td_proc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) return (EBADF); for (;;) { fp = fdt->fdt_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EBADF); if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) { fdt = atomic_load_ptr(&fdp->fd_files); continue; } /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. */ atomic_thread_fence_acq(); fdt = fdp->fd_files; if (__predict_true(fp == fdt->fdt_ofiles[fd].fde_file)) break; fdrop(fp, td); } *fpp = fp; return (0); } #endif /* * See the comments in fget_unlocked_seq for an explanation of how this works. * * This is a simplified variant which bails out to the aforementioned routine * if anything goes wrong. In practice this only happens when userspace is * racing with itself. */ int fget_unlocked(struct thread *td, int fd, cap_rights_t *needrightsp, struct file **fpp) { struct filedesc *fdp; #ifdef CAPABILITIES const struct filedescent *fde; #endif const struct fdescenttbl *fdt; struct file *fp; #ifdef CAPABILITIES seqc_t seq; const cap_rights_t *haverights; #endif fdp = td->td_proc->p_fd; fdt = fdp->fd_files; if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) { *fpp = NULL; return (EBADF); } #ifdef CAPABILITIES seq = seqc_read_notmodify(fd_seqc(fdt, fd)); fde = &fdt->fdt_ofiles[fd]; haverights = cap_rights_fde_inline(fde); fp = fde->fde_file; #else fp = fdt->fdt_ofiles[fd].fde_file; #endif if (__predict_false(fp == NULL)) goto out_fallback; #ifdef CAPABILITIES if (__predict_false(cap_check_inline_transient(haverights, needrightsp))) goto out_fallback; #endif if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) goto out_fallback; /* * Use an acquire barrier to force re-reading of fdt so it is * refreshed for verification. 
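 *
 * The check below pairs with the seqc_write_begin()/seqc_write_end()
 * bracketing in _finstall() and dupfdopen(): if the slot was modified
 * between the seqc_read_notmodify() snapshot taken at the top of the loop
 * and this point, the reference is dropped and the lookup retried, e.g.
 * (outline only):
 *
 *   seq = seqc_read_notmodify(fd_seqc(fdt, fd));    // snapshot
 *   // ... read fde_caps and fde_file ...
 *   if (!seqc_consistent(fd_seqc(fdt, fd), seq)) {  // writer interfered?
 *       fdrop(fp, td);
 *       continue;                                   // go around again
 *   }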
*/ atomic_thread_fence_acq(); fdt = fdp->fd_files; #ifdef CAPABILITIES if (__predict_false(!seqc_consistent_no_fence(fd_seqc(fdt, fd), seq))) #else if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file)) #endif goto out_fdrop; *fpp = fp; return (0); out_fdrop: fdrop(fp, td); out_fallback: *fpp = NULL; return (fget_unlocked_seq(td, fd, needrightsp, fpp, NULL)); } /* * Translate fd -> file when the caller guarantees the file descriptor table * can't be changed by others. * * Note this does not mean the file object itself is only visible to the caller, * merely that it wont disappear without having to be referenced. * * Must be paired with fput_only_user. */ #ifdef CAPABILITIES int fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { const struct filedescent *fde; const struct fdescenttbl *fdt; const cap_rights_t *haverights; struct file *fp; int error; MPASS(FILEDESC_IS_ONLY_USER(fdp)); *fpp = NULL; if (__predict_false(fd >= fdp->fd_nfiles)) return (EBADF); fdt = fdp->fd_files; fde = &fdt->fdt_ofiles[fd]; fp = fde->fde_file; if (__predict_false(fp == NULL)) return (EBADF); MPASS(refcount_load(&fp->f_count) > 0); haverights = cap_rights_fde_inline(fde); error = cap_check_inline(haverights, needrightsp); if (__predict_false(error != 0)) return (error); *fpp = fp; return (0); } #else int fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp, struct file **fpp) { struct file *fp; MPASS(FILEDESC_IS_ONLY_USER(fdp)); *fpp = NULL; if (__predict_false(fd >= fdp->fd_nfiles)) return (EBADF); fp = fdp->fd_ofiles[fd].fde_file; if (__predict_false(fp == NULL)) return (EBADF); MPASS(refcount_load(&fp->f_count) > 0); *fpp = fp; return (0); } #endif /* * Extract the file pointer associated with the specified descriptor for the * current user process. * * If the descriptor doesn't exist or doesn't match 'flags', EBADF is * returned. * * File's rights will be checked against the capability rights mask. * * If an error occurred the non-zero error is returned and *fpp is set to * NULL. Otherwise *fpp is held and set and zero is returned. Caller is * responsible for fdrop(). */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags, cap_rights_t *needrightsp) { struct file *fp; int error; *fpp = NULL; error = fget_unlocked(td, fd, needrightsp, &fp); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { fdrop(fp, td); return (EBADF); } /* * FREAD and FWRITE failure return EBADF as per POSIX. 
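 *
 * The userspace-visible consequence (illustrative only):
 *
 *   int fd = open("/tmp/f", O_WRONLY | O_CREAT, 0600);
 *   char c;
 *   if (read(fd, &c, 1) == -1)
 *       assert(errno == EBADF);   // FREAD was never set on this file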
*/ error = 0; switch (flags) { case FREAD: case FWRITE: if ((fp->f_flag & flags) == 0) error = EBADF; break; case FEXEC: if (fp->f_ops != &path_fileops && ((fp->f_flag & (FREAD | FEXEC)) == 0 || (fp->f_flag & FWRITE) != 0)) error = EBADF; break; case 0: break; default: KASSERT(0, ("wrong flags")); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); } int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, 0, rightsp)); } int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, struct file **fpp) { int error; #ifndef CAPABILITIES error = _fget(td, fd, fpp, 0, rightsp); if (maxprotp != NULL) *maxprotp = VM_PROT_ALL; return (error); #else cap_rights_t fdrights; struct filedesc *fdp; struct file *fp; seqc_t seq; *fpp = NULL; fdp = td->td_proc->p_fd; MPASS(cap_rights_is_set(rightsp, CAP_MMAP)); for (;;) { error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); if (__predict_false(error != 0)) return (error); if (__predict_false(fp->f_ops == &badfileops)) { fdrop(fp, td); return (EBADF); } if (maxprotp != NULL) fdrights = *cap_rights(fdp, fd); if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } /* * If requested, convert capability rights to access flags. */ if (maxprotp != NULL) *maxprotp = cap_rights_to_vmprot(&fdrights); *fpp = fp; return (0); #endif } int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FREAD, rightsp)); } int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp) { return (_fget(td, fd, fpp, FWRITE, rightsp)); } int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp) { #ifndef CAPABILITIES return (fget_unlocked(td, fd, rightsp, fpp)); #else struct filedesc *fdp = td->td_proc->p_fd; struct file *fp; int error; seqc_t seq; *fpp = NULL; MPASS(cap_rights_is_set(rightsp, CAP_FCNTL)); for (;;) { error = fget_unlocked_seq(td, fd, rightsp, &fp, &seq); if (error != 0) return (error); error = cap_fcntl_check(fdp, fd, needfcntl); if (!fd_modified(fdp, fd, seq)) break; fdrop(fp, td); } if (error != 0) { fdrop(fp, td); return (error); } *fpp = fp; return (0); #endif } /* * Like fget() but loads the underlying vnode, or returns an error if the * descriptor does not represent a vnode. Note that pipes use vnodes but * never have VM objects. The returned vnode will be vref()'d. * * XXX: what about the unused flags ? 
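 *
 * A typical caller therefore ends up owning a vnode reference but no file
 * reference, e.g. (sketch):
 *
 *   struct vnode *vp;
 *   int error;
 *
 *   error = fgetvp_read(td, fd, &cap_read_rights, &vp);
 *   if (error != 0)
 *       return (error);
 *   // ... use vp; it is referenced but not locked ...
 *   vrele(vp);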
*/ static __inline int _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp, struct vnode **vpp) { struct file *fp; int error; *vpp = NULL; error = _fget(td, fd, &fp, flags, needrightsp); if (error != 0) return (error); if (fp->f_vnode == NULL) { error = EINVAL; } else { *vpp = fp->f_vnode; vrefact(*vpp); } fdrop(fp, td); return (error); } int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, 0, rightsp, vpp)); } int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp) { struct filecaps caps; struct file *fp; int error; error = fget_cap(td, fd, needrightsp, &fp, &caps); if (error != 0) return (error); if (fp->f_ops == &badfileops) { error = EBADF; goto out; } if (fp->f_vnode == NULL) { error = EINVAL; goto out; } *havecaps = caps; *vpp = fp->f_vnode; vrefact(*vpp); fdrop(fp, td); return (0); out: filecaps_free(&caps); fdrop(fp, td); return (error); } int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FREAD, rightsp, vpp)); } int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); } #ifdef notyet int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) { return (_fgetvp(td, fd, FWRITE, rightsp, vpp)); } #endif /* * Handle the last reference to a file being closed. * * Without the noinline attribute clang keeps inlining the func thorough this * file when fdrop is used. */ int __noinline _fdrop(struct file *fp, struct thread *td) { int error; #ifdef INVARIANTS int count; count = refcount_load(&fp->f_count); if (count != 0) panic("fdrop: fp %p count %d", fp, count); #endif error = fo_close(fp, td); atomic_subtract_int(&openfiles, 1); crfree(fp->f_cred); free(fp->f_advice, M_FADVISE); uma_zfree(file_zone, fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on the entire file * (l_whence = SEEK_SET, l_start = 0, l_len = 0). */ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* ARGSUSED */ int sys_flock(struct thread *td, struct flock_args *uap) { struct file *fp; struct vnode *vp; struct flock lf; int error; error = fget(td, uap->fd, &cap_flock_rights, &fp); if (error != 0) return (error); error = EOPNOTSUPP; if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { goto done; } if (fp->f_ops == &path_fileops) { goto done; } error = 0; vp = fp->f_vnode; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; atomic_clear_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done; } atomic_set_int(&fp->f_flag, FHASLOCK); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done: fdrop(fp, td); return (error); } /* * Duplicate the specified descriptor to a free descriptor. 
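 *
 * This is reached from the open(2) path when a lower layer asks for an
 * already-open descriptor to be duplicated (ENODEV) or handed over
 * (ENXIO); fdescfs' /dev/fd/N entries are the usual trigger.  For the dup
 * case the requested access must be a subset of what the descriptor
 * already has; the test in the body,
 *
 *   ((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag
 *
 * is just a compact form of (equivalent sketch):
 *
 *   if (((mode & (FREAD | FWRITE)) & ~fp->f_flag) != 0)
 *       return (EACCES);   // asking for access the original fd lacks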
*/ int dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode, int openerror, int *indxp) { struct filedescent *newfde, *oldfde; struct file *fp; u_long *ioctls; int error, indx; KASSERT(openerror == ENODEV || openerror == ENXIO, ("unexpected error %d in %s", openerror, __func__)); /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ FILEDESC_XLOCK(fdp); if ((fp = fget_noref(fdp, dfd)) == NULL) { FILEDESC_XUNLOCK(fdp); return (EBADF); } error = fdalloc(td, 0, &indx); if (error != 0) { FILEDESC_XUNLOCK(fdp); return (error); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor (indx) and return. * * For ENXIO steal away the file structure from (dfd) and store it in * (indx). (dfd) is effectively closed by this operation. */ switch (openerror) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EACCES); } if (!fhold(fp)) { fdunused(fdp, indx); FILEDESC_XUNLOCK(fdp); return (EBADF); } newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; ioctls = filecaps_copy_prep(&oldfde->fde_caps); #ifdef CAPABILITIES seqc_write_begin(&newfde->fde_seqc); #endif fde_copy(oldfde, newfde); filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps, ioctls); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); #endif break; case ENXIO: /* * Steal away the file pointer from dfd and stuff it into indx. */ newfde = &fdp->fd_ofiles[indx]; oldfde = &fdp->fd_ofiles[dfd]; #ifdef CAPABILITIES seqc_write_begin(&oldfde->fde_seqc); seqc_write_begin(&newfde->fde_seqc); #endif fde_copy(oldfde, newfde); oldfde->fde_file = NULL; fdunused(fdp, dfd); #ifdef CAPABILITIES seqc_write_end(&newfde->fde_seqc); seqc_write_end(&oldfde->fde_seqc); #endif break; } FILEDESC_XUNLOCK(fdp); *indxp = indx; return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, "Allow a process to chroot(2) if it has a directory open"); /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. 
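 *
 * The knob above is exported as "kern.chroot_allow_open_directories";
 * e.g. reading or tightening it from userspace (illustrative):
 *
 *   #include <sys/types.h>
 *   #include <sys/sysctl.h>
 *
 *   int val = 0;   // 0: never allow chroot(2) with directories open
 *   sysctlbyname("kern.chroot_allow_open_directories",
 *       NULL, NULL, &val, sizeof(val));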
*/ static int chroot_refuse_vdir_fds(struct filedesc *fdp) { struct vnode *vp; struct file *fp; int i; FILEDESC_LOCK_ASSERT(fdp); FILEDESC_FOREACH_FP(fdp, i, fp) { if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VDIR) return (EPERM); } } return (0); } static void pwd_fill(struct pwd *oldpwd, struct pwd *newpwd) { if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) { vrefact(oldpwd->pwd_cdir); newpwd->pwd_cdir = oldpwd->pwd_cdir; } if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) { vrefact(oldpwd->pwd_rdir); newpwd->pwd_rdir = oldpwd->pwd_rdir; } if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) { vrefact(oldpwd->pwd_jdir); newpwd->pwd_jdir = oldpwd->pwd_jdir; } if (newpwd->pwd_adir == NULL && oldpwd->pwd_adir != NULL) { vrefact(oldpwd->pwd_adir); newpwd->pwd_adir = oldpwd->pwd_adir; } } struct pwd * pwd_hold_pwddesc(struct pwddesc *pdp) { struct pwd *pwd; PWDDESC_ASSERT_XLOCKED(pdp); pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (pwd != NULL) refcount_acquire(&pwd->pwd_refcount); return (pwd); } bool pwd_hold_smr(struct pwd *pwd) { MPASS(pwd != NULL); if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) { return (true); } return (false); } struct pwd * pwd_hold(struct thread *td) { struct pwddesc *pdp; struct pwd *pwd; pdp = td->td_proc->p_pd; vfs_smr_enter(); pwd = vfs_smr_entered_load(&pdp->pd_pwd); if (pwd_hold_smr(pwd)) { vfs_smr_exit(); return (pwd); } vfs_smr_exit(); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); MPASS(pwd != NULL); PWDDESC_XUNLOCK(pdp); return (pwd); } struct pwd * pwd_hold_proc(struct proc *p) { struct pwddesc *pdp; struct pwd *pwd; PROC_ASSERT_HELD(p); PROC_LOCK(p); pdp = pdhold(p); MPASS(pdp != NULL); PROC_UNLOCK(p); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); MPASS(pwd != NULL); PWDDESC_XUNLOCK(pdp); pddrop(pdp); return (pwd); } static struct pwd * pwd_alloc(void) { struct pwd *pwd; pwd = uma_zalloc_smr(pwd_zone, M_WAITOK); bzero(pwd, sizeof(*pwd)); refcount_init(&pwd->pwd_refcount, 1); return (pwd); } void pwd_drop(struct pwd *pwd) { if (!refcount_release(&pwd->pwd_refcount)) return; if (pwd->pwd_cdir != NULL) vrele(pwd->pwd_cdir); if (pwd->pwd_rdir != NULL) vrele(pwd->pwd_rdir); if (pwd->pwd_jdir != NULL) vrele(pwd->pwd_jdir); if (pwd->pwd_adir != NULL) vrele(pwd->pwd_adir); uma_zfree_smr(pwd_zone, pwd); } /* * The caller is responsible for invoking priv_check() and * mac_vnode_check_chroot() to authorize this operation. 
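 *
 * I.e. a caller is expected to have done roughly the following before
 * getting here (sketch only; see sys_chroot() for the real sequence and
 * exact prototypes):
 *
 *   error = priv_check(td, PRIV_VFS_CHROOT);
 *   if (error == 0)
 *       error = mac_vnode_check_chroot(td->td_ucred, vp);  // MAC kernels
 *   if (error == 0)
 *       error = pwd_chroot(td, vp);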
*/ int pwd_chroot(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct filedesc *fdp; struct pwd *newpwd, *oldpwd; int error; fdp = td->td_proc->p_fd; pdp = td->td_proc->p_pd; newpwd = pwd_alloc(); FILEDESC_SLOCK(fdp); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (chroot_allow_open_directories == 0 || (chroot_allow_open_directories == 1 && oldpwd->pwd_rdir != rootvnode)) { error = chroot_refuse_vdir_fds(fdp); FILEDESC_SUNLOCK(fdp); if (error != 0) { PWDDESC_XUNLOCK(pdp); pwd_drop(newpwd); return (error); } } else { FILEDESC_SUNLOCK(fdp); } vrefact(vp); newpwd->pwd_rdir = vp; vrefact(vp); newpwd->pwd_adir = vp; if (oldpwd->pwd_jdir == NULL) { vrefact(vp); newpwd->pwd_jdir = vp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); return (0); } void pwd_chdir(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; VNPASS(vp->v_usecount > 0, vp); newpwd = pwd_alloc(); pdp = td->td_proc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); newpwd->pwd_cdir = vp; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * Process is transitioning to/from a non-native ABI. */ void pwd_altroot(struct thread *td, struct vnode *altroot_vp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; newpwd = pwd_alloc(); pdp = td->td_proc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (altroot_vp != NULL) { /* * Native process to a non-native ABI. */ vrefact(altroot_vp); newpwd->pwd_adir = altroot_vp; } else { /* * Non-native process to the native ABI. */ vrefact(oldpwd->pwd_rdir); newpwd->pwd_adir = oldpwd->pwd_rdir; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * jail_attach(2) changes both root and working directories. 
*/ int pwd_chroot_chdir(struct thread *td, struct vnode *vp) { struct pwddesc *pdp; struct filedesc *fdp; struct pwd *newpwd, *oldpwd; int error; fdp = td->td_proc->p_fd; pdp = td->td_proc->p_pd; newpwd = pwd_alloc(); FILEDESC_SLOCK(fdp); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); error = chroot_refuse_vdir_fds(fdp); FILEDESC_SUNLOCK(fdp); if (error != 0) { PWDDESC_XUNLOCK(pdp); pwd_drop(newpwd); return (error); } vrefact(vp); newpwd->pwd_rdir = vp; vrefact(vp); newpwd->pwd_cdir = vp; if (oldpwd->pwd_jdir == NULL) { vrefact(vp); newpwd->pwd_jdir = vp; } vrefact(vp); newpwd->pwd_adir = vp; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); return (0); } void pwd_ensure_dirs(void) { struct pwddesc *pdp; struct pwd *oldpwd, *newpwd; pdp = curproc->p_pd; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL && oldpwd->pwd_adir != NULL) { PWDDESC_XUNLOCK(pdp); return; } PWDDESC_XUNLOCK(pdp); newpwd = pwd_alloc(); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); pwd_fill(oldpwd, newpwd); if (newpwd->pwd_cdir == NULL) { vrefact(rootvnode); newpwd->pwd_cdir = rootvnode; } if (newpwd->pwd_rdir == NULL) { vrefact(rootvnode); newpwd->pwd_rdir = rootvnode; } if (newpwd->pwd_adir == NULL) { vrefact(rootvnode); newpwd->pwd_adir = rootvnode; } pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } void pwd_set_rootvnode(void) { struct pwddesc *pdp; struct pwd *oldpwd, *newpwd; pdp = curproc->p_pd; newpwd = pwd_alloc(); PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); vrefact(rootvnode); newpwd->pwd_cdir = rootvnode; vrefact(rootvnode); newpwd->pwd_rdir = rootvnode; vrefact(rootvnode); newpwd->pwd_adir = rootvnode; pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); } /* * Scan all active processes and prisons to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new mount point. 
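 *
 * Like pwd_chdir() and pwd_chroot() above, the update is done by building
 * a fresh struct pwd and swapping it in rather than mutating the shared
 * one, so lockless readers (pwd_hold()) always see a consistent snapshot.
 * The per-process step is essentially (sketch):
 *
 *   newpwd = pwd_alloc();
 *   PWDDESC_XLOCK(pdp);
 *   oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 *   // ... point the affected directories at newdp ...
 *   pwd_fill(oldpwd, newpwd);    // copy the untouched ones
 *   pwd_set(pdp, newpwd);
 *   PWDDESC_XUNLOCK(pdp);
 *   pwd_drop(oldpwd);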
*/ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct pwddesc *pdp; struct pwd *newpwd, *oldpwd; struct prison *pr; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; nrele = 0; newpwd = pwd_alloc(); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); pdp = pdhold(p); PROC_UNLOCK(p); if (pdp == NULL) continue; PWDDESC_XLOCK(pdp); oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); if (oldpwd == NULL || (oldpwd->pwd_cdir != olddp && oldpwd->pwd_rdir != olddp && oldpwd->pwd_jdir != olddp && oldpwd->pwd_adir != olddp)) { PWDDESC_XUNLOCK(pdp); pddrop(pdp); continue; } if (oldpwd->pwd_cdir == olddp) { vrefact(newdp); newpwd->pwd_cdir = newdp; } if (oldpwd->pwd_rdir == olddp) { vrefact(newdp); newpwd->pwd_rdir = newdp; } if (oldpwd->pwd_jdir == olddp) { vrefact(newdp); newpwd->pwd_jdir = newdp; } if (oldpwd->pwd_adir == olddp) { vrefact(newdp); newpwd->pwd_adir = newdp; } pwd_fill(oldpwd, newpwd); pwd_set(pdp, newpwd); PWDDESC_XUNLOCK(pdp); pwd_drop(oldpwd); pddrop(pdp); newpwd = pwd_alloc(); } sx_sunlock(&allproc_lock); pwd_drop(newpwd); if (rootvnode == olddp) { vrefact(newdp); rootvnode = newdp; nrele++; } mtx_lock(&prison0.pr_mtx); if (prison0.pr_root == olddp) { vrefact(newdp); prison0.pr_root = newdp; nrele++; } mtx_unlock(&prison0.pr_mtx); sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { mtx_lock(&pr->pr_mtx); if (pr->pr_root == olddp) { vrefact(newdp); pr->pr_root = newdp; nrele++; } mtx_unlock(&pr->pr_mtx); } sx_sunlock(&allprison_lock); while (nrele--) vrele(olddp); } int descrip_check_write_mp(struct filedesc *fdp, struct mount *mp) { struct file *fp; struct vnode *vp; int error, i; error = 0; FILEDESC_SLOCK(fdp); FILEDESC_FOREACH_FP(fdp, i, fp) { if (fp->f_type != DTYPE_VNODE || (atomic_load_int(&fp->f_flag) & FWRITE) == 0) continue; vp = fp->f_vnode; if (vp->v_mount == mp) { error = EDEADLK; break; } } FILEDESC_SUNLOCK(fdp); return (error); } struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader) { struct filedesc_to_leader *fdtol; fdtol = malloc(sizeof(struct filedesc_to_leader), M_FILEDESC_TO_LEADER, M_WAITOK); fdtol->fdl_refcount = 1; fdtol->fdl_holdcount = 0; fdtol->fdl_wakeup = 0; fdtol->fdl_leader = leader; if (old != NULL) { FILEDESC_XLOCK(fdp); fdtol->fdl_next = old->fdl_next; fdtol->fdl_prev = old; old->fdl_next = fdtol; fdtol->fdl_next->fdl_prev = fdtol; FILEDESC_XUNLOCK(fdp); } else { fdtol->fdl_next = fdtol; fdtol->fdl_prev = fdtol; } return (fdtol); } struct filedesc_to_leader * filedesc_to_leader_share(struct filedesc_to_leader *fdtol, struct filedesc *fdp) { FILEDESC_XLOCK(fdp); fdtol->fdl_refcount++; FILEDESC_XUNLOCK(fdp); return (fdtol); } static int filedesc_nfiles(struct filedesc *fdp) { NDSLOTTYPE *map; int count, off, minoff; if (fdp == NULL) return (0); count = 0; FILEDESC_SLOCK(fdp); map = fdp->fd_map; off = NDSLOT(fdp->fd_nfiles - 1); for (minoff = NDSLOT(0); off >= minoff; --off) count += bitcountl(map[off]); FILEDESC_SUNLOCK(fdp); return (count); } int proc_nfiles(struct proc *p) { struct filedesc *fdp; int res; PROC_LOCK(p); fdp = fdhold(p); PROC_UNLOCK(p); res = filedesc_nfiles(fdp); fddrop(fdp); return (res); } static int sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS) { u_int namelen; int count; namelen = arg2; if (namelen != 1) return (EINVAL); if (*(int *)arg1 != 0) return (EINVAL); count = filedesc_nfiles(curproc->p_fd); return (SYSCTL_OUT(req, &count, sizeof(count))); } static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds, 
CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds, "Number of open file descriptors"); /* * Get file structures globally. */ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { struct xfile xf; struct filedesc *fdp; struct file *fp; struct proc *p; int error, n; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); if (req->oldptr == NULL) { n = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; /* overestimates sparse tables. */ n += fdp->fd_nfiles; fddrop(fdp); } sx_sunlock(&allproc_lock); return (SYSCTL_OUT(req, 0, n * sizeof(xf))); } error = 0; bzero(&xf, sizeof(xf)); xf.xf_size = sizeof(xf); sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } if (p_cansee(req->td, p) != 0) { PROC_UNLOCK(p); continue; } xf.xf_pid = p->p_pid; xf.xf_uid = p->p_ucred->cr_uid; fdp = fdhold(p); PROC_UNLOCK(p); if (fdp == NULL) continue; FILEDESC_SLOCK(fdp); if (refcount_load(&fdp->fd_refcnt) == 0) goto nextproc; FILEDESC_FOREACH_FP(fdp, n, fp) { xf.xf_fd = n; xf.xf_file = (uintptr_t)fp; xf.xf_data = (uintptr_t)fp->f_data; xf.xf_vnode = (uintptr_t)fp->f_vnode; xf.xf_type = (uintptr_t)fp->f_type; xf.xf_count = refcount_load(&fp->f_count); xf.xf_msgcount = 0; xf.xf_offset = foffset_get(fp); xf.xf_flag = fp->f_flag; error = SYSCTL_OUT(req, &xf, sizeof(xf)); /* * There is no need to re-check the fdtable refcount * here since the filedesc lock is not dropped in the * loop body. */ if (error != 0) break; } nextproc: FILEDESC_SUNLOCK(fdp); fddrop(fdp); if (error) break; } sx_sunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_file, "S,xfile", "Entire file table"); #ifdef KINFO_FILE_SIZE CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE); #endif static int xlate_fflags(int fflags) { static const struct { int fflag; int kf_fflag; } fflags_table[] = { { FAPPEND, KF_FLAG_APPEND }, { FASYNC, KF_FLAG_ASYNC }, { FFSYNC, KF_FLAG_FSYNC }, { FHASLOCK, KF_FLAG_HASLOCK }, { FNONBLOCK, KF_FLAG_NONBLOCK }, { FREAD, KF_FLAG_READ }, { FWRITE, KF_FLAG_WRITE }, { O_CREAT, KF_FLAG_CREAT }, { O_DIRECT, KF_FLAG_DIRECT }, { O_EXCL, KF_FLAG_EXCL }, { O_EXEC, KF_FLAG_EXEC }, { O_EXLOCK, KF_FLAG_EXLOCK }, { O_NOFOLLOW, KF_FLAG_NOFOLLOW }, { O_SHLOCK, KF_FLAG_SHLOCK }, { O_TRUNC, KF_FLAG_TRUNC } }; unsigned int i; int kflags; kflags = 0; for (i = 0; i < nitems(fflags_table); i++) if (fflags & fflags_table[i].fflag) kflags |= fflags_table[i].kf_fflag; return (kflags); } /* Trim unused data from kf_path by truncating the structure size. */ void pack_kinfo(struct kinfo_file *kif) { kif->kf_structsize = offsetof(struct kinfo_file, kf_path) + strlen(kif->kf_path) + 1; kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t)); } static void export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp, struct kinfo_file *kif, struct filedesc *fdp, int flags) { int error; bzero(kif, sizeof(*kif)); /* Set a default type to allow for empty fill_kinfo() methods. 
*/ kif->kf_type = KF_TYPE_UNKNOWN; kif->kf_flags = xlate_fflags(fp->f_flag); if (rightsp != NULL) kif->kf_cap_rights = *rightsp; else cap_rights_init_zero(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = refcount_load(&fp->f_count); kif->kf_offset = foffset_get(fp); /* * This may drop the filedesc lock, so the 'fp' cannot be * accessed after this call. */ error = fo_fill_kinfo(fp, kif, fdp); if (error == 0) kif->kf_status |= KF_ATTR_VALID; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); } static void export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags, struct kinfo_file *kif, int flags) { int error; bzero(kif, sizeof(*kif)); kif->kf_type = KF_TYPE_VNODE; error = vn_fill_kinfo_vnode(vp, kif); if (error == 0) kif->kf_status |= KF_ATTR_VALID; kif->kf_flags = xlate_fflags(fflags); cap_rights_init_zero(&kif->kf_cap_rights); kif->kf_fd = fd; kif->kf_ref_count = -1; kif->kf_offset = -1; if ((flags & KERN_FILEDESC_PACK_KINFO) != 0) pack_kinfo(kif); else kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t)); vrele(vp); } struct export_fd_buf { struct filedesc *fdp; struct pwddesc *pdp; struct sbuf *sb; ssize_t remainder; struct kinfo_file kif; int flags; }; static int export_kinfo_to_sb(struct export_fd_buf *efbuf) { struct kinfo_file *kif; kif = &efbuf->kif; if (efbuf->remainder != -1) { if (efbuf->remainder < kif->kf_structsize) return (ENOMEM); efbuf->remainder -= kif->kf_structsize; } if (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) != 0) return (sbuf_error(efbuf->sb)); return (0); } static int export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (ENOMEM); export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp, efbuf->flags); FILEDESC_SUNLOCK(efbuf->fdp); error = export_kinfo_to_sb(efbuf); FILEDESC_SLOCK(efbuf->fdp); return (error); } static int export_vnode_to_sb(struct vnode *vp, int fd, int fflags, struct export_fd_buf *efbuf) { int error; if (efbuf->remainder == 0) return (ENOMEM); if (efbuf->pdp != NULL) PWDDESC_XUNLOCK(efbuf->pdp); export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags); error = export_kinfo_to_sb(efbuf); if (efbuf->pdp != NULL) PWDDESC_XLOCK(efbuf->pdp); return (error); } /* * Store a process file descriptor information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags) { struct file *fp; struct filedesc *fdp; struct pwddesc *pdp; struct export_fd_buf *efbuf; struct vnode *cttyvp, *textvp, *tracevp; struct pwd *pwd; int error, i; cap_rights_t rights; PROC_LOCK_ASSERT(p, MA_OWNED); /* ktrace vnode */ tracevp = ktr_get_tracevp(p, true); /* text vnode */ textvp = p->p_textvp; if (textvp != NULL) vrefact(textvp); /* Controlling tty. 
*/ cttyvp = NULL; if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) { cttyvp = p->p_pgrp->pg_session->s_ttyvp; if (cttyvp != NULL) vrefact(cttyvp); } fdp = fdhold(p); pdp = pdhold(p); PROC_UNLOCK(p); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->pdp = NULL; efbuf->sb = sb; efbuf->remainder = maxlen; efbuf->flags = flags; error = 0; if (tracevp != NULL) error = export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE, efbuf); if (error == 0 && textvp != NULL) error = export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf); if (error == 0 && cttyvp != NULL) error = export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE, efbuf); if (error != 0 || pdp == NULL || fdp == NULL) goto fail; efbuf->fdp = fdp; efbuf->pdp = pdp; PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); if (pwd != NULL) { /* working directory */ if (pwd->pwd_cdir != NULL) { vrefact(pwd->pwd_cdir); error = export_vnode_to_sb(pwd->pwd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } /* root directory */ if (error == 0 && pwd->pwd_rdir != NULL) { vrefact(pwd->pwd_rdir); error = export_vnode_to_sb(pwd->pwd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf); } /* jail directory */ if (error == 0 && pwd->pwd_jdir != NULL) { vrefact(pwd->pwd_jdir); error = export_vnode_to_sb(pwd->pwd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf); } } PWDDESC_XUNLOCK(pdp); if (error != 0) goto fail; if (pwd != NULL) pwd_drop(pwd); FILEDESC_SLOCK(fdp); if (refcount_load(&fdp->fd_refcnt) == 0) goto skip; FILEDESC_FOREACH_FP(fdp, i, fp) { #ifdef CAPABILITIES rights = *cap_rights(fdp, i); #else /* !CAPABILITIES */ rights = cap_no_rights; #endif /* * Create sysctl entry. It is OK to drop the filedesc * lock inside of export_file_to_sb() as we will * re-validate and re-evaluate its properties when the * loop continues. */ error = export_file_to_sb(fp, i, &rights, efbuf); if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0) break; } skip: FILEDESC_SUNLOCK(fdp); fail: if (fdp != NULL) fddrop(fdp); if (pdp != NULL) pddrop(pdp); free(efbuf, M_TEMP); return (error); } #define FILEDESC_SBUF_SIZE (sizeof(struct kinfo_file) * 5) /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; u_int namelen; int error, error2, *name; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_filedesc_out(p, &sb, maxlen, KERN_FILEDESC_PACK_KINFO); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? 
error : error2); } #ifdef COMPAT_FREEBSD7 #ifdef KINFO_OFILE_SIZE CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE); #endif static void kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif) { okif->kf_structsize = sizeof(*okif); okif->kf_type = kif->kf_type; okif->kf_fd = kif->kf_fd; okif->kf_ref_count = kif->kf_ref_count; okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE | KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | KF_FLAG_DIRECT | KF_FLAG_HASLOCK); okif->kf_offset = kif->kf_offset; if (kif->kf_type == KF_TYPE_VNODE) okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type; else okif->kf_vnode_type = KF_VTYPE_VNON; strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path)); if (kif->kf_type == KF_TYPE_SOCKET) { okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0; okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0; okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0; okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local; okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer; } else { okif->kf_sa_local.ss_family = AF_UNSPEC; okif->kf_sa_peer.ss_family = AF_UNSPEC; } } static int export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif, struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req) { int error; vrefact(vp); PWDDESC_XUNLOCK(pdp); export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); PWDDESC_XLOCK(pdp); return (error); } /* * Get per-process file descriptors for use by procstat(1), et al. */ static int sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS) { struct kinfo_ofile *okif; struct kinfo_file *kif; struct filedesc *fdp; struct pwddesc *pdp; struct pwd *pwd; u_int namelen; int error, i, *name; struct file *fp; struct proc *p; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) return (error); fdp = fdhold(p); if (fdp != NULL) pdp = pdhold(p); PROC_UNLOCK(p); if (fdp == NULL || pdp == NULL) { if (fdp != NULL) fddrop(fdp); return (ENOENT); } kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK); okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK); PWDDESC_XLOCK(pdp); pwd = pwd_hold_pwddesc(pdp); if (pwd != NULL) { if (pwd->pwd_cdir != NULL) export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif, okif, pdp, req); if (pwd->pwd_rdir != NULL) export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif, okif, pdp, req); if (pwd->pwd_jdir != NULL) export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif, okif, pdp, req); } PWDDESC_XUNLOCK(pdp); if (pwd != NULL) pwd_drop(pwd); FILEDESC_SLOCK(fdp); if (refcount_load(&fdp->fd_refcnt) == 0) goto skip; FILEDESC_FOREACH_FP(fdp, i, fp) { export_file_to_kinfo(fp, i, NULL, kif, fdp, KERN_FILEDESC_PACK_KINFO); FILEDESC_SUNLOCK(fdp); kinfo_to_okinfo(kif, okif); error = SYSCTL_OUT(req, okif, sizeof(*okif)); FILEDESC_SLOCK(fdp); if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0) break; } skip: FILEDESC_SUNLOCK(fdp); fddrop(fdp); pddrop(pdp); free(kif, M_TEMP); free(okif, M_TEMP); return (0); } static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc, "Process ofiledesc entries"); #endif /* COMPAT_FREEBSD7 */ int vntype_to_kinfo(int vtype) { struct { int vtype; int kf_vtype; } vtypes_table[] = { { VBAD, KF_VTYPE_VBAD }, { VBLK, KF_VTYPE_VBLK }, { VCHR, KF_VTYPE_VCHR }, { VDIR, KF_VTYPE_VDIR }, { VFIFO, 
KF_VTYPE_VFIFO }, { VLNK, KF_VTYPE_VLNK }, { VNON, KF_VTYPE_VNON }, { VREG, KF_VTYPE_VREG }, { VSOCK, KF_VTYPE_VSOCK } }; unsigned int i; /* * Perform vtype translation. */ for (i = 0; i < nitems(vtypes_table); i++) if (vtypes_table[i].vtype == vtype) return (vtypes_table[i].kf_vtype); return (KF_VTYPE_UNKNOWN); } static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc, "Process filedesc entries"); /* * Store a process current working directory information to sbuf. * * Takes a locked proc as argument, and returns with the proc unlocked. */ int kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen) { struct pwddesc *pdp; struct pwd *pwd; struct export_fd_buf *efbuf; struct vnode *cdir; int error; PROC_LOCK_ASSERT(p, MA_OWNED); pdp = pdhold(p); PROC_UNLOCK(p); if (pdp == NULL) return (EINVAL); efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK); efbuf->fdp = NULL; efbuf->pdp = pdp; efbuf->sb = sb; efbuf->remainder = maxlen; efbuf->flags = 0; PWDDESC_XLOCK(pdp); pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp); cdir = pwd->pwd_cdir; if (cdir == NULL) { error = EINVAL; } else { vrefact(cdir); error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf); } PWDDESC_XUNLOCK(pdp); pddrop(pdp); free(efbuf, M_TEMP); return (error); } /* * Get per-process current working directory. */ static int sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct proc *p; ssize_t maxlen; u_int namelen; int error, error2, *name; namelen = arg2; if (namelen != 1) return (EINVAL); name = (int *)arg1; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p); if (error != 0) { sbuf_delete(&sb); return (error); } maxlen = req->oldptr != NULL ? req->oldlen : -1; error = kern_proc_cwd_out(p, &sb, maxlen); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_cwd, "Process current working directory"); #ifdef DDB /* * For the purposes of debugging, generate a human-readable string for the * file type. */ static const char * file_type_to_name(short type) { switch (type) { case 0: return ("zero"); case DTYPE_VNODE: return ("vnode"); case DTYPE_SOCKET: return ("socket"); case DTYPE_PIPE: return ("pipe"); case DTYPE_FIFO: return ("fifo"); case DTYPE_KQUEUE: return ("kqueue"); case DTYPE_CRYPTO: return ("crypto"); case DTYPE_MQUEUE: return ("mqueue"); case DTYPE_SHM: return ("shm"); case DTYPE_SEM: return ("ksem"); case DTYPE_PTS: return ("pts"); case DTYPE_DEV: return ("dev"); case DTYPE_PROCDESC: return ("proc"); case DTYPE_EVENTFD: return ("eventfd"); case DTYPE_TIMERFD: return ("timerfd"); default: return ("unkn"); } } /* * For the purposes of debugging, identify a process (if any, perhaps one of * many) that references the passed file in its file descriptor array. Return * NULL if none. 
*/ static struct proc * file_to_first_proc(struct file *fp) { struct filedesc *fdp; struct proc *p; int n; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; fdp = p->p_fd; if (fdp == NULL) continue; for (n = 0; n < fdp->fd_nfiles; n++) { if (fp == fdp->fd_ofiles[n].fde_file) return (p); } } return (NULL); } static void db_print_file(struct file *fp, int header) { #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4)) struct proc *p; if (header) db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n", XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag", "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID", "FCmd"); p = file_to_first_proc(fp); db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH, fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data, fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode, p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-"); #undef XPTRWIDTH } DB_SHOW_COMMAND(file, db_show_file) { struct file *fp; if (!have_addr) { db_printf("usage: show file \n"); return; } fp = (struct file *)addr; db_print_file(fp, 1); } DB_SHOW_COMMAND_FLAGS(files, db_show_files, DB_CMD_MEMSAFE) { struct filedesc *fdp; struct file *fp; struct proc *p; int header; int n; header = 1; FOREACH_PROC_IN_SYSTEM(p) { if (p->p_state == PRS_NEW) continue; if ((fdp = p->p_fd) == NULL) continue; for (n = 0; n < fdp->fd_nfiles; ++n) { if ((fp = fdp->fd_ofiles[n].fde_file) == NULL) continue; db_print_file(fp, header); header = 0; } } } #endif SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RWTUN | CTLFLAG_NOFETCH, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, &openfiles, 0, "System-wide number of open files"); /* ARGSUSED*/ static void filelistinit(void *dummy) { file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR); /* * XXXMJG this is a temporary hack due to boot ordering issues against * the vnode zone. 
*/ vfs_smr = uma_zone_get_smr(pwd_zone); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); /*-------------------------------------------------------------------*/ static int badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EBADF); } static int badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } static int badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (0); } static int badfo_kqfilter(struct file *fp, struct knote *kn) { return (EBADF); } static int badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { return (EBADF); } static int badfo_close(struct file *fp, struct thread *td) { return (0); } static int badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EBADF); } static int badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EBADF); } static int badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return (0); } -struct fileops badfileops = { +const struct fileops badfileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = badfo_poll, .fo_kqfilter = badfo_kqfilter, .fo_stat = badfo_stat, .fo_close = badfo_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = badfo_fill_kinfo, }; static int path_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (POLLNVAL); } static int path_close(struct file *fp, struct thread *td) { MPASS(fp->f_type == DTYPE_VNODE); fp->f_ops = &badfileops; vrele(fp->f_vnode); return (0); } -struct fileops path_fileops = { +const struct fileops path_fileops = { .fo_read = badfo_readwrite, .fo_write = badfo_readwrite, .fo_truncate = badfo_truncate, .fo_ioctl = badfo_ioctl, .fo_poll = path_poll, .fo_kqfilter = vn_kqfilter_opath, .fo_stat = vn_statfile, .fo_close = path_close, .fo_chmod = badfo_chmod, .fo_chown = badfo_chown, .fo_sendfile = badfo_sendfile, .fo_fill_kinfo = vn_fill_kinfo, .fo_cmp = vn_cmp, .fo_flags = DFLAG_PASSABLE, }; int invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return (EOPNOTSUPP); } int invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return (ENOTTY); } int invfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return (poll_no_poll(events)); } int invfo_kqfilter(struct file *fp, struct knote *kn) { return (EINVAL); } int invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return (EINVAL); } int invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio 
*trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return (EINVAL); } /*-------------------------------------------------------------------*/ /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. * * XXX: we could give this one a cloning event handler if necessary. */ /* ARGSUSED */ static int fdopen(struct cdev *dev, int mode, int type, struct thread *td) { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } static struct cdevsw fildesc_cdevsw = { .d_version = D_VERSION, .d_open = fdopen, .d_name = "FD", }; static void fildesc_drvinit(void *unused) { struct cdev *dev; dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL, UID_ROOT, GID_WHEEL, 0666, "fd/2"); make_dev_alias(dev, "stderr"); } SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL); diff --git a/sys/kern/kern_devctl.c b/sys/kern/kern_devctl.c index 602b82105525..d83bc380c2fe 100644 --- a/sys/kern/kern_devctl.c +++ b/sys/kern/kern_devctl.c @@ -1,610 +1,610 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002-2020 M. Warner Losh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_bus.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include STAILQ_HEAD(devq, dev_event_info); static struct dev_softc { int inuse; int nonblock; int queued; int async; struct mtx mtx; struct cv cv; struct selinfo sel; struct devq devq; struct sigio *sigio; uma_zone_t zone; } devsoftc; /* * This design allows only one reader for /dev/devctl. This is not desirable * in the long run, but will get a lot of hair out of this implementation. * Maybe we should make this device a clonable device. * * Also note: we specifically do not attach a device to the device_t tree * to avoid potential chicken and egg problems. One could argue that all * of this belongs to the root node. */ #define DEVCTL_DEFAULT_QUEUE_LEN 1000 static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS); static int devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN; SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_queue, "I", "devctl queue length"); static bool nomatch_enabled = true; SYSCTL_BOOL(_hw_bus, OID_AUTO, devctl_nomatch_enabled, CTLFLAG_RWTUN, &nomatch_enabled, 0, "enable nomatch events"); static void devctl_attach_handler(void *arg __unused, device_t dev); static void devctl_detach_handler(void *arg __unused, device_t dev, enum evhdev_detach state); static void devctl_nomatch_handler(void *arg __unused, device_t dev); static d_open_t devopen; static d_close_t devclose; static d_read_t devread; static d_ioctl_t devioctl; static d_poll_t devpoll; static d_kqfilter_t devkqfilter; #define DEVCTL_BUFFER (1024 - sizeof(void *)) struct dev_event_info { STAILQ_ENTRY(dev_event_info) dei_link; char dei_data[DEVCTL_BUFFER]; }; static struct cdevsw dev_cdevsw = { .d_version = D_VERSION, .d_open = devopen, .d_close = devclose, .d_read = devread, .d_ioctl = devioctl, .d_poll = devpoll, .d_kqfilter = devkqfilter, .d_name = "devctl", }; static void filt_devctl_detach(struct knote *kn); static int filt_devctl_read(struct knote *kn, long hint); -static struct filterops devctl_rfiltops = { +static const struct filterops devctl_rfiltops = { .f_isfd = 1, .f_detach = filt_devctl_detach, .f_event = filt_devctl_read, }; static struct cdev *devctl_dev; static void devaddq(const char *type, const char *what, device_t dev); static struct devctlbridge { send_event_f *send_f; } devctl_notify_hook = { .send_f = NULL }; static void devctl_init(void) { int reserve; uma_zone_t z; devctl_dev = make_dev_credf(MAKEDEV_ETERNAL, &dev_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "devctl"); mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF); cv_init(&devsoftc.cv, "dev cv"); STAILQ_INIT(&devsoftc.devq); knlist_init_mtx(&devsoftc.sel.si_note, &devsoftc.mtx); if (devctl_queue_length > 0) { /* * Allocate a zone for the messages. Preallocate 2% of these for * a reserve. Allow only devctl_queue_length slabs to cap memory * usage. The reserve usually allows coverage of surges of * events during memory shortages. Normally we won't have to * re-use events from the queue, but will in extreme shortages. 
*/ z = devsoftc.zone = uma_zcreate("DEVCTL", sizeof(struct dev_event_info), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); reserve = max(devctl_queue_length / 50, 100); /* 2% reserve */ uma_zone_set_max(z, devctl_queue_length); uma_zone_set_maxcache(z, 0); uma_zone_reserve(z, reserve); uma_prealloc(z, reserve); } EVENTHANDLER_REGISTER(device_attach, devctl_attach_handler, NULL, EVENTHANDLER_PRI_LAST); EVENTHANDLER_REGISTER(device_detach, devctl_detach_handler, NULL, EVENTHANDLER_PRI_LAST); EVENTHANDLER_REGISTER(device_nomatch, devctl_nomatch_handler, NULL, EVENTHANDLER_PRI_LAST); } SYSINIT(devctl_init, SI_SUB_DRIVERS, SI_ORDER_SECOND, devctl_init, NULL); /* * A device was added to the tree. We are called just after it successfully * attaches (that is, probe and attach success for this device). No call * is made if a device is merely parented into the tree. See devnomatch * if probe fails. If attach fails, no notification is sent (but maybe * we should have a different message for this). */ static void devctl_attach_handler(void *arg __unused, device_t dev) { devaddq("+", device_get_nameunit(dev), dev); } /* * A device was removed from the tree. We are called just before this * happens. */ static void devctl_detach_handler(void *arg __unused, device_t dev, enum evhdev_detach state) { if (state == EVHDEV_DETACH_COMPLETE) devaddq("-", device_get_nameunit(dev), dev); } /* * Called when there's no match for this device. This is only called * the first time that no match happens, so we don't keep getting this * message. Should that prove to be undesirable, we can change it. * This is called when all drivers that can attach to a given bus * decline to accept this device. Other errors may not be detected. */ static void devctl_nomatch_handler(void *arg __unused, device_t dev) { if (nomatch_enabled) devaddq("?", "", dev); } static int devopen(struct cdev *dev, int oflags, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); if (devsoftc.inuse) { mtx_unlock(&devsoftc.mtx); return (EBUSY); } /* move to init */ devsoftc.inuse = 1; mtx_unlock(&devsoftc.mtx); return (0); } static int devclose(struct cdev *dev, int fflag, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); devsoftc.inuse = 0; devsoftc.nonblock = 0; devsoftc.async = 0; cv_broadcast(&devsoftc.cv); funsetown(&devsoftc.sigio); mtx_unlock(&devsoftc.mtx); return (0); } /* * The read channel for this device is used to report changes to * userland in realtime. We are required to free the data as well as * the n1 object because we allocate them separately. Also note that * we return one record at a time. If you try to read this device a * character at a time, you will lose the rest of the data. Listening * programs are expected to cope. */ static int devread(struct cdev *dev, struct uio *uio, int ioflag) { struct dev_event_info *n1; int rv; mtx_lock(&devsoftc.mtx); while (STAILQ_EMPTY(&devsoftc.devq)) { if (devsoftc.nonblock) { mtx_unlock(&devsoftc.mtx); return (EAGAIN); } rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx); if (rv) { /* * Need to translate ERESTART to EINTR here? 
-- jake */ mtx_unlock(&devsoftc.mtx); return (rv); } } n1 = STAILQ_FIRST(&devsoftc.devq); STAILQ_REMOVE_HEAD(&devsoftc.devq, dei_link); devsoftc.queued--; mtx_unlock(&devsoftc.mtx); rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio); uma_zfree(devsoftc.zone, n1); return (rv); } static int devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { switch (cmd) { case FIONBIO: if (*(int*)data) devsoftc.nonblock = 1; else devsoftc.nonblock = 0; return (0); case FIOASYNC: if (*(int*)data) devsoftc.async = 1; else devsoftc.async = 0; return (0); case FIOSETOWN: return fsetown(*(int *)data, &devsoftc.sigio); case FIOGETOWN: *(int *)data = fgetown(&devsoftc.sigio); return (0); /* (un)Support for other fcntl() calls. */ case FIOCLEX: case FIONCLEX: case FIONREAD: default: break; } return (ENOTTY); } static int devpoll(struct cdev *dev, int events, struct thread *td) { int revents = 0; mtx_lock(&devsoftc.mtx); if (events & (POLLIN | POLLRDNORM)) { if (!STAILQ_EMPTY(&devsoftc.devq)) revents = events & (POLLIN | POLLRDNORM); else selrecord(td, &devsoftc.sel); } mtx_unlock(&devsoftc.mtx); return (revents); } static int devkqfilter(struct cdev *dev, struct knote *kn) { int error; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &devctl_rfiltops; knlist_add(&devsoftc.sel.si_note, kn, 0); error = 0; } else error = EINVAL; return (error); } static void filt_devctl_detach(struct knote *kn) { knlist_remove(&devsoftc.sel.si_note, kn, 0); } static int filt_devctl_read(struct knote *kn, long hint) { kn->kn_data = devsoftc.queued; return (kn->kn_data != 0); } /** * @brief Return whether the userland process is running */ bool devctl_process_running(void) { return (devsoftc.inuse == 1); } static struct dev_event_info * devctl_alloc_dei(void) { struct dev_event_info *dei = NULL; mtx_lock(&devsoftc.mtx); if (devctl_queue_length == 0) goto out; dei = uma_zalloc(devsoftc.zone, M_NOWAIT); if (dei == NULL) dei = uma_zalloc(devsoftc.zone, M_NOWAIT | M_USE_RESERVE); if (dei == NULL) { /* * Guard against no items in the queue. Normally, this won't * happen, but if lots of events happen all at once and there's * a chance we're out of allocated space but none have yet been * queued when we get here, leaving nothing to steal. This can * also happen with error injection. Fail safe by returning * NULL in that case.. 
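	 * Otherwise recycle the oldest queued message: it is removed
	 * from the queue unread and its buffer is reused for the new
	 * event.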
*/ if (devsoftc.queued == 0) goto out; dei = STAILQ_FIRST(&devsoftc.devq); STAILQ_REMOVE_HEAD(&devsoftc.devq, dei_link); devsoftc.queued--; } MPASS(dei != NULL); *dei->dei_data = '\0'; out: mtx_unlock(&devsoftc.mtx); return (dei); } static struct dev_event_info * devctl_alloc_dei_sb(struct sbuf *sb) { struct dev_event_info *dei; dei = devctl_alloc_dei(); if (dei != NULL) sbuf_new(sb, dei->dei_data, sizeof(dei->dei_data), SBUF_FIXEDLEN); return (dei); } static void devctl_free_dei(struct dev_event_info *dei) { uma_zfree(devsoftc.zone, dei); } static void devctl_queue(struct dev_event_info *dei) { mtx_lock(&devsoftc.mtx); STAILQ_INSERT_TAIL(&devsoftc.devq, dei, dei_link); devsoftc.queued++; cv_broadcast(&devsoftc.cv); KNOTE_LOCKED(&devsoftc.sel.si_note, 0); mtx_unlock(&devsoftc.mtx); selwakeup(&devsoftc.sel); if (devsoftc.async && devsoftc.sigio != NULL) pgsigio(&devsoftc.sigio, SIGIO, 0); } /** * @brief Send a 'notification' to userland, using standard ways */ void devctl_notify(const char *system, const char *subsystem, const char *type, const char *data) { struct dev_event_info *dei; struct sbuf sb; if (system == NULL || subsystem == NULL || type == NULL) return; if (devctl_notify_hook.send_f != NULL) devctl_notify_hook.send_f(system, subsystem, type, data); dei = devctl_alloc_dei_sb(&sb); if (dei == NULL) return; sbuf_cpy(&sb, "!system="); sbuf_cat(&sb, system); sbuf_cat(&sb, " subsystem="); sbuf_cat(&sb, subsystem); sbuf_cat(&sb, " type="); sbuf_cat(&sb, type); if (data != NULL) { sbuf_putc(&sb, ' '); sbuf_cat(&sb, data); } sbuf_putc(&sb, '\n'); if (sbuf_finish(&sb) != 0) devctl_free_dei(dei); /* overflow -> drop it */ else devctl_queue(dei); } /* * Common routine that tries to make sending messages as easy as possible. * We allocate memory for the data, copy strings into that, but do not * free it unless there's an error. The dequeue part of the driver should * free the data. We don't send data when the device is disabled. We do * send data, even when we have no listeners, because we wish to avoid * races relating to startup and restart of listening applications. * * devaddq is designed to string together the type of event, with the * object of that event, plus the plug and play info and location info * for that event. This is likely most useful for devices, but less * useful for other consumers of this interface. Those should use * the devctl_notify() interface instead. * * Output: * ${type}${what} at $(location dev) $(pnp-info dev) on $(parent dev) */ static void devaddq(const char *type, const char *what, device_t dev) { struct dev_event_info *dei; const char *parstr; struct sbuf sb; size_t beginlen; dei = devctl_alloc_dei_sb(&sb); if (dei == NULL) return; sbuf_cpy(&sb, type); sbuf_cat(&sb, what); sbuf_cat(&sb, " at "); beginlen = sbuf_len(&sb); /* Add in the location */ bus_child_location(dev, &sb); sbuf_putc(&sb, ' '); /* Add in pnpinfo */ bus_child_pnpinfo(dev, &sb); /* Get the parent of this device, or / if high enough in the tree. */ if (device_get_parent(dev) == NULL) parstr = "."; /* Or '/' ? 
*/ else parstr = device_get_nameunit(device_get_parent(dev)); sbuf_cat(&sb, " on "); sbuf_cat(&sb, parstr); sbuf_putc(&sb, '\n'); if (sbuf_finish(&sb) != 0) goto bad; if (devctl_notify_hook.send_f != NULL) { const char *t; switch (*type) { case '+': t = "ATTACH"; break; case '-': t = "DETACH"; break; default: t = "NOMATCH"; break; } devctl_notify_hook.send_f("device", what, t, sbuf_data(&sb) + beginlen); } devctl_queue(dei); return; bad: devctl_free_dei(dei); } static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS) { int q, error; q = devctl_queue_length; error = sysctl_handle_int(oidp, &q, 0, req); if (error || !req->newptr) return (error); if (q < 0) return (EINVAL); /* * When set as a tunable, we've not yet initialized the mutex. * It is safe to just assign to devctl_queue_length and return * as we're racing no one. We'll use whatever value set in * devinit. */ if (!mtx_initialized(&devsoftc.mtx)) { devctl_queue_length = q; return (0); } /* * XXX It's hard to grow or shrink the UMA zone. Only allow * disabling the queue size for the moment until underlying * UMA issues can be sorted out. */ if (q != 0) return (EINVAL); if (q == devctl_queue_length) return (0); mtx_lock(&devsoftc.mtx); devctl_queue_length = 0; uma_zdestroy(devsoftc.zone); devsoftc.zone = 0; mtx_unlock(&devsoftc.mtx); return (0); } /** * @brief safely quotes strings that might have double quotes in them. * * The devctl protocol relies on quoted strings having matching quotes. * This routine quotes any internal quotes so the resulting string * is safe to pass to snprintf to construct, for example pnp info strings. * * @param sb sbuf to place the characters into * @param src Original buffer. */ void devctl_safe_quote_sb(struct sbuf *sb, const char *src) { while (*src != '\0') { if (*src == '"' || *src == '\\') sbuf_putc(sb, '\\'); sbuf_putc(sb, *src++); } } void devctl_set_notify_hook(send_event_f *hook) { devctl_notify_hook.send_f = hook; } void devctl_unset_notify_hook(void) { devctl_notify_hook.send_f = NULL; } diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index fa96fbad20ce..dcb2c10ee1f5 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -1,2831 +1,2831 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1999,2000,2001 Jonathan Lemon * Copyright 2004 John-Mark Gurney * Copyright (c) 2009 Apple, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ktrace.h" #include "opt_kqueue.h" #ifdef COMPAT_FREEBSD11 #define _WANT_FREEBSD11_KEVENT #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); /* * This lock is used if multiple kq locks are required. This possibly * should be made into a per proc lock. */ static struct mtx kq_global; MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF); #define KQ_GLOBAL_LOCK(lck, haslck) do { \ if (!haslck) \ mtx_lock(lck); \ haslck = 1; \ } while (0) #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \ if (haslck) \ mtx_unlock(lck); \ haslck = 0; \ } while (0) TASKQUEUE_DEFINE_THREAD(kqueue_ctx); static int kevent_copyout(void *arg, struct kevent *kevp, int count); static int kevent_copyin(void *arg, struct kevent *kevp, int count); static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag); static int kqueue_acquire(struct file *fp, struct kqueue **kqp); static void kqueue_release(struct kqueue *kq, int locked); static void kqueue_destroy(struct kqueue *kq); static void kqueue_drain(struct kqueue *kq, struct thread *td); static int kqueue_expand(struct kqueue *kq, const struct filterops *fops, uintptr_t ident, int mflag); static void kqueue_task(void *arg, int pending); static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *timeout, struct kevent *keva, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static const struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); struct g_kevent_args; static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name); static fo_ioctl_t kqueue_ioctl; static fo_poll_t kqueue_poll; static fo_kqfilter_t kqueue_kqfilter; static fo_stat_t kqueue_stat; static fo_close_t kqueue_close; static fo_fill_kinfo_t kqueue_fill_kinfo; -static struct fileops kqueueops = { +static const struct fileops kqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = kqueue_ioctl, .fo_poll = kqueue_poll, .fo_kqfilter = kqueue_kqfilter, .fo_stat = kqueue_stat, .fo_close = kqueue_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_cmp = file_kcmp_generic, .fo_fill_kinfo = kqueue_fill_kinfo, }; static int knote_attach(struct knote *kn, struct kqueue *kq); static void knote_drop(struct knote *kn, struct thread *td); static void knote_drop_detached(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct 
knote *knote_alloc(int mflag); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static void filt_timerexpire_l(struct knote *kn, bool proc_locked); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static void filt_timerstart(struct knote *kn, sbintime_t to); static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type); static int filt_timervalidate(struct knote *kn, sbintime_t *to); static int filt_timer(struct knote *kn, long hint); static int filt_userattach(struct knote *kn); static void filt_userdetach(struct knote *kn); static int filt_user(struct knote *kn, long hint); static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type); -static struct filterops file_filtops = { +static const struct filterops file_filtops = { .f_isfd = 1, .f_attach = filt_fileattach, }; -static struct filterops kqread_filtops = { +static const struct filterops kqread_filtops = { .f_isfd = 1, .f_detach = filt_kqdetach, .f_event = filt_kqueue, }; /* XXX - move to kern_proc.c? */ -static struct filterops proc_filtops = { +static const struct filterops proc_filtops = { .f_isfd = 0, .f_attach = filt_procattach, .f_detach = filt_procdetach, .f_event = filt_proc, }; -static struct filterops timer_filtops = { +static const struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, .f_detach = filt_timerdetach, .f_event = filt_timer, .f_touch = filt_timertouch, }; -static struct filterops user_filtops = { +static const struct filterops user_filtops = { .f_attach = filt_userattach, .f_detach = filt_userdetach, .f_event = filt_user, .f_touch = filt_usertouch, }; static uma_zone_t knote_zone; static unsigned int __exclusive_cache_line kq_ncallouts; static unsigned int kq_calloutmax = 4 * 1024; SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); /* XXX - ensure not influx ? 
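 *
 * KNOTE_ACTIVATE() marks a knote KN_ACTIVE and, unless it is already
 * queued or disabled, enqueues it on its kqueue; islock says whether
 * the caller already holds the kq lock.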
*/ #define KNOTE_ACTIVATE(kn, islock) do { \ if ((islock)) \ mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \ else \ KQ_LOCK((kn)->kn_kq); \ (kn)->kn_status |= KN_ACTIVE; \ if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue((kn)); \ if (!(islock)) \ KQ_UNLOCK((kn)->kn_kq); \ } while (0) #define KQ_LOCK(kq) do { \ mtx_lock(&(kq)->kq_lock); \ } while (0) #define KQ_FLUX_WAKEUP(kq) do { \ if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \ (kq)->kq_state &= ~KQ_FLUXWAIT; \ wakeup((kq)); \ } \ } while (0) #define KQ_UNLOCK_FLUX(kq) do { \ KQ_FLUX_WAKEUP(kq); \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_UNLOCK(kq) do { \ mtx_unlock(&(kq)->kq_lock); \ } while (0) #define KQ_OWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_OWNED); \ } while (0) #define KQ_NOTOWNED(kq) do { \ mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \ } while (0) static struct knlist * kn_list_lock(struct knote *kn) { struct knlist *knl; knl = kn->kn_knlist; if (knl != NULL) knl->kl_lock(knl->kl_lockarg); return (knl); } static void kn_list_unlock(struct knlist *knl) { bool do_free; if (knl == NULL) return; do_free = knl->kl_autodestroy && knlist_empty(knl); knl->kl_unlock(knl->kl_lockarg); if (do_free) { knlist_destroy(knl); free(knl, M_KQUEUE); } } static bool kn_in_flux(struct knote *kn) { return (kn->kn_influx > 0); } static void kn_enter_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx < INT_MAX); kn->kn_influx++; } static bool kn_leave_flux(struct knote *kn) { KQ_OWNED(kn->kn_kq); MPASS(kn->kn_influx > 0); kn->kn_influx--; return (kn->kn_influx == 0); } #define KNL_ASSERT_LOCK(knl, islocked) do { \ if (islocked) \ KNL_ASSERT_LOCKED(knl); \ else \ KNL_ASSERT_UNLOCKED(knl); \ } while (0) #ifdef INVARIANTS #define KNL_ASSERT_LOCKED(knl) do { \ knl->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED); \ } while (0) #define KNL_ASSERT_UNLOCKED(knl) do { \ knl->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED); \ } while (0) #else /* !INVARIANTS */ #define KNL_ASSERT_LOCKED(knl) do {} while (0) #define KNL_ASSERT_UNLOCKED(knl) do {} while (0) #endif /* INVARIANTS */ #ifndef KN_HASHSIZE #define KN_HASHSIZE 64 /* XXX should be tunable */ #endif #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; -struct filterops null_filtops = { +static const struct filterops null_filtops = { .f_isfd = 0, .f_attach = filt_nullattach, }; /* XXX - make SYSINIT to add these, and move into respective modules. */ -extern struct filterops sig_filtops; -extern struct filterops fs_filtops; +extern const struct filterops sig_filtops; +extern const struct filterops fs_filtops; /* * Table for all system-defined filters. */ static struct mtx filterops_lock; MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF); static struct { const struct filterops *for_fop; int for_nolock; int for_refcnt; } sysfilt_ops[EVFILT_SYSCOUNT] = { [~EVFILT_READ] = { &file_filtops, 1 }, [~EVFILT_WRITE] = { &file_filtops, 1 }, [~EVFILT_AIO] = { &null_filtops }, [~EVFILT_VNODE] = { &file_filtops, 1 }, [~EVFILT_PROC] = { &proc_filtops, 1 }, [~EVFILT_SIGNAL] = { &sig_filtops, 1 }, [~EVFILT_TIMER] = { &timer_filtops, 1 }, [~EVFILT_PROCDESC] = { &file_filtops, 1 }, [~EVFILT_FS] = { &fs_filtops, 1 }, [~EVFILT_LIO] = { &null_filtops }, [~EVFILT_USER] = { &user_filtops, 1 }, [~EVFILT_SENDFILE] = { &null_filtops }, [~EVFILT_EMPTY] = { &file_filtops, 1 }, }; /* * Simple redirection for all cdevsw style objects to call their fo_kqfilter * method. 
*/ static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_status |= KN_KQUEUE; kn->kn_fop = &kqread_filtops; knlist_add(&kq->kq_sel.si_note, kn, 0); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; knlist_remove(&kq->kq_sel.si_note, kn, 0); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } /* XXX - move to kern_proc.c? */ static int filt_procattach(struct knote *kn) { struct proc *p; int error; bool exiting, immediate; exiting = immediate = false; if (kn->kn_sfflags & NOTE_EXIT) p = pfind_any(kn->kn_id); else p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if (p->p_flag & P_WEXIT) exiting = true; if ((error = p_cansee(curthread, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * Internal flag indicating registration done by kernel for the * purposes of getting a NOTE_CHILD notification. */ if (kn->kn_flags & EV_FLAG2) { kn->kn_flags &= ~EV_FLAG2; kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK); immediate = true; /* Force immediate activation of child note. */ } /* * Internal flag indicating registration done by kernel (for other than * NOTE_CHILD). */ if (kn->kn_flags & EV_FLAG1) { kn->kn_flags &= ~EV_FLAG1; } knlist_add(p->p_klist, kn, 1); /* * Immediately activate any child notes or, in the case of a zombie * target process, exit notes. The latter is necessary to handle the * case where the target process, e.g. a child, dies before the kevent * is registered. */ if (immediate || (exiting && filt_proc(kn, NOTE_EXIT))) KNOTE_ACTIVATE(kn, 0); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. */ /* XXX - move to kern_proc.c? */ static void filt_procdetach(struct knote *kn) { knlist_remove(kn->kn_knlist, kn, 0); kn->kn_ptr.p_proc = NULL; } /* XXX - move to kern_proc.c? */ static int filt_proc(struct knote *kn, long hint) { struct proc *p; u_int event; p = kn->kn_ptr.p_proc; if (p == NULL) /* already activated, from attach filter */ return (0); /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; kn->kn_ptr.p_proc = NULL; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig); if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } /* * Called when the process forked. It mostly does the same as the * knote(), activating all knotes registered to be activated when the * process forked. Additionally, for each knote attached to the * parent, check whether user wants to track the new process. 
If so * attach a new knote to it, and immediately report an event with the * child's pid. */ void knote_fork(struct knlist *list, int pid) { struct kqueue *kq; struct knote *kn; struct kevent kev; int error; MPASS(list != NULL); KNL_ASSERT_LOCKED(list); if (SLIST_EMPTY(&list->kl_list)) return; memset(&kev, 0, sizeof(kev)); SLIST_FOREACH(kn, &list->kl_list, kn_selnext) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) { KQ_UNLOCK(kq); continue; } /* * The same as knote(), activate the event. */ if ((kn->kn_sfflags & NOTE_TRACK) == 0) { if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 1); KQ_UNLOCK(kq); continue; } /* * The NOTE_TRACK case. In addition to the activation * of the event, we need to register new events to * track the child. Drop the locks in preparation for * the call to kqueue_register(). */ kn_enter_flux(kn); KQ_UNLOCK(kq); list->kl_unlock(list->kl_lockarg); /* * Activate existing knote and register tracking knotes with * new process. * * First register a knote to get just the child notice. This * must be a separate note from a potential NOTE_EXIT * notification since both NOTE_CHILD and NOTE_EXIT are defined * to use the data field (in conflicting ways). */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; /* * Then register another knote to track other potential events * from the new process. */ kev.ident = pid; kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata;/* preserve udata */ error = kqueue_register(kq, &kev, NULL, M_NOWAIT); if (error) kn->kn_fflags |= NOTE_TRACKERR; if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 0); list->kl_lock(list->kl_lockarg); KQ_LOCK(kq); kn_leave_flux(kn); KQ_UNLOCK_FLUX(kq); } } /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. */ #define NOTE_TIMER_PRECMASK \ (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS) static sbintime_t timer2sbintime(int64_t data, int flags) { int64_t secs; /* * Macros for converting to the fractional second portion of an * sbintime_t using 64bit multiplication to improve precision. 
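 *
 * The fraction of a second in an sbintime_t is kept in units of
 * 2^-32 s, so e.g. nanoseconds scale by 2^32 / 10^9.  Writing the
 * factor as ((1 << 63) / (10^9 / 2)) evaluates 2^64 / 10^9 without
 * needing a 2^64 constant; the trailing >> 32 then converts the
 * 2^-64 product back to the 2^-32 fixed-point fraction.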
*/ #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32) #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32) #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32) switch (flags & NOTE_TIMER_PRECMASK) { case NOTE_SECONDS: #ifdef __LP64__ if (data > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return ((sbintime_t)data << 32); case NOTE_MSECONDS: /* FALLTHROUGH */ case 0: if (data >= 1000) { secs = data / 1000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | MS_TO_SBT(data % 1000)); } return (MS_TO_SBT(data)); case NOTE_USECONDS: if (data >= 1000000) { secs = data / 1000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | US_TO_SBT(data % 1000000)); } return (US_TO_SBT(data)); case NOTE_NSECONDS: if (data >= 1000000000) { secs = data / 1000000000; #ifdef __LP64__ if (secs > (SBT_MAX / SBT_1S)) return (SBT_MAX); #endif return (secs << 32 | NS_TO_SBT(data % 1000000000)); } return (NS_TO_SBT(data)); default: break; } return (-1); } struct kq_timer_cb_data { struct callout c; struct proc *p; struct knote *kn; int cpuid; int flags; TAILQ_ENTRY(kq_timer_cb_data) link; sbintime_t next; /* next timer event fires at */ sbintime_t to; /* precalculated timer period, 0 for abs */ }; #define KQ_TIMER_CB_ENQUEUED 0x01 static void kqtimer_sched_callout(struct kq_timer_cb_data *kc) { callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kc->kn, kc->cpuid, C_ABSOLUTE); } void kqtimer_proc_continue(struct proc *p) { struct kq_timer_cb_data *kc, *kc1; struct bintime bt; sbintime_t now; PROC_LOCK_ASSERT(p, MA_OWNED); getboottimebin(&bt); now = bttosbt(bt); TAILQ_FOREACH_SAFE(kc, &p->p_kqtim_stop, link, kc1) { TAILQ_REMOVE(&p->p_kqtim_stop, kc, link); kc->flags &= ~KQ_TIMER_CB_ENQUEUED; if (kc->next <= now) filt_timerexpire_l(kc->kn, true); else kqtimer_sched_callout(kc); } } static void filt_timerexpire_l(struct knote *kn, bool proc_locked) { struct kq_timer_cb_data *kc; struct proc *p; uint64_t delta; sbintime_t now; kc = kn->kn_ptr.p_v; if ((kn->kn_flags & EV_ONESHOT) != 0 || kc->to == 0) { kn->kn_data++; KNOTE_ACTIVATE(kn, 0); return; } now = sbinuptime(); if (now >= kc->next) { delta = (now - kc->next) / kc->to; if (delta == 0) delta = 1; kn->kn_data += delta; kc->next += delta * kc->to; if (now >= kc->next) /* overflow */ kc->next = now + kc->to; KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */ } /* * Initial check for stopped kc->p is racy. It is fine to * miss the set of the stop flags, at worst we would schedule * one more callout. On the other hand, it is not fine to not * schedule when we we missed clearing of the flags, we * recheck them under the lock and observe consistent state. 
*/ p = kc->p; if (P_SHOULDSTOP(p) || P_KILLED(p)) { if (!proc_locked) PROC_LOCK(p); if (P_SHOULDSTOP(p) || P_KILLED(p)) { if ((kc->flags & KQ_TIMER_CB_ENQUEUED) == 0) { kc->flags |= KQ_TIMER_CB_ENQUEUED; TAILQ_INSERT_TAIL(&p->p_kqtim_stop, kc, link); } if (!proc_locked) PROC_UNLOCK(p); return; } if (!proc_locked) PROC_UNLOCK(p); } kqtimer_sched_callout(kc); } static void filt_timerexpire(void *knx) { filt_timerexpire_l(knx, false); } /* * data contains amount of time to sleep */ static int filt_timervalidate(struct knote *kn, sbintime_t *to) { struct bintime bt; sbintime_t sbt; if (kn->kn_sdata < 0) return (EINVAL); if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0) kn->kn_sdata = 1; /* * The only fflags values supported are the timer unit * (precision) and the absolute time indicator. */ if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0) return (EINVAL); *to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags); if (*to < 0) return (EINVAL); if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { getboottimebin(&bt); sbt = bttosbt(bt); *to = MAX(0, *to - sbt); } return (0); } static int filt_timerattach(struct knote *kn) { struct kq_timer_cb_data *kc; sbintime_t to; int error; to = -1; error = filt_timervalidate(kn, &to); if (error != 0) return (error); KASSERT(to > 0 || (kn->kn_flags & EV_ONESHOT) != 0 || (kn->kn_sfflags & NOTE_ABSTIME) != 0, ("%s: periodic timer has a calculated zero timeout", __func__)); KASSERT(to >= 0, ("%s: timer has a calculated negative timeout", __func__)); if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) { atomic_subtract_int(&kq_ncallouts, 1); return (ENOMEM); } if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) kn->kn_flags |= EV_CLEAR; /* automatically set */ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK); kc->kn = kn; kc->p = curproc; kc->cpuid = PCPU_GET(cpuid); kc->flags = 0; callout_init(&kc->c, 1); filt_timerstart(kn, to); return (0); } static void filt_timerstart(struct knote *kn, sbintime_t to) { struct kq_timer_cb_data *kc; kc = kn->kn_ptr.p_v; if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) { kc->next = to; kc->to = 0; } else { kc->next = to + sbinuptime(); kc->to = to; } kqtimer_sched_callout(kc); } static void filt_timerdetach(struct knote *kn) { struct kq_timer_cb_data *kc; unsigned int old __unused; bool pending; kc = kn->kn_ptr.p_v; do { callout_drain(&kc->c); /* * kqtimer_proc_continue() might have rescheduled this callout. * Double-check, using the process mutex as an interlock. */ PROC_LOCK(kc->p); if ((kc->flags & KQ_TIMER_CB_ENQUEUED) != 0) { kc->flags &= ~KQ_TIMER_CB_ENQUEUED; TAILQ_REMOVE(&kc->p->p_kqtim_stop, kc, link); } pending = callout_pending(&kc->c); PROC_UNLOCK(kc->p); } while (pending); free(kc, M_KQUEUE); old = atomic_fetchadd_int(&kq_ncallouts, -1); KASSERT(old > 0, ("Number of callouts cannot become negative")); kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */ } static void filt_timertouch(struct knote *kn, struct kevent *kev, u_long type) { struct kq_timer_cb_data *kc; struct kqueue *kq; sbintime_t to; int error; switch (type) { case EVENT_REGISTER: /* Handle re-added timers that update data/fflags */ if (kev->flags & EV_ADD) { kc = kn->kn_ptr.p_v; /* Drain any existing callout. */ callout_drain(&kc->c); /* Throw away any existing undelivered record * of the timer expiration. 
This is done under * the presumption that if a process is * re-adding this timer with new parameters, * it is no longer interested in what may have * happened under the old parameters. If it is * interested, it can wait for the expiration, * delete the old timer definition, and then * add the new one. * * This has to be done while the kq is locked: * - if enqueued, dequeue * - make it no longer active * - clear the count of expiration events */ kq = kn->kn_kq; KQ_LOCK(kq); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); kn->kn_status &= ~KN_ACTIVE; kn->kn_data = 0; KQ_UNLOCK(kq); /* Reschedule timer based on new data/fflags */ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; error = filt_timervalidate(kn, &to); if (error != 0) { kn->kn_flags |= EV_ERROR; kn->kn_data = error; } else filt_timerstart(kn, to); } break; case EVENT_PROCESS: *kev = kn->kn_kevent; if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_timertouch() - invalid type (%ld)", type); break; } } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } static int filt_userattach(struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ kn->kn_hook = NULL; if (kn->kn_fflags & NOTE_TRIGGER) kn->kn_hookid = 1; else kn->kn_hookid = 0; return (0); } static void filt_userdetach(__unused struct knote *kn) { /* * EVFILT_USER knotes are not attached to anything in the kernel. */ } static int filt_user(struct knote *kn, __unused long hint) { return (kn->kn_hookid); } static void filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) { u_int ffctrl; switch (type) { case EVENT_REGISTER: if (kev->fflags & NOTE_TRIGGER) kn->kn_hookid = 1; ffctrl = kev->fflags & NOTE_FFCTRLMASK; kev->fflags &= NOTE_FFLAGSMASK; switch (ffctrl) { case NOTE_FFNOP: break; case NOTE_FFAND: kn->kn_sfflags &= kev->fflags; break; case NOTE_FFOR: kn->kn_sfflags |= kev->fflags; break; case NOTE_FFCOPY: kn->kn_sfflags = kev->fflags; break; default: /* XXX Return error? */ break; } kn->kn_sdata = kev->data; if (kev->flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; case EVENT_PROCESS: *kev = kn->kn_kevent; kev->fflags = kn->kn_sfflags; kev->data = kn->kn_sdata; if (kn->kn_flags & EV_CLEAR) { kn->kn_hookid = 0; kn->kn_data = 0; kn->kn_fflags = 0; } break; default: panic("filt_usertouch() - invalid type (%ld)", type); break; } } int sys_kqueue(struct thread *td, struct kqueue_args *uap) { return (kern_kqueue(td, 0, NULL)); } int sys_kqueuex(struct thread *td, struct kqueuex_args *uap) { int flags; if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0) return (EINVAL); flags = 0; if ((uap->flags & KQUEUE_CLOEXEC) != 0) flags |= O_CLOEXEC; return (kern_kqueue(td, flags, NULL)); } static void kqueue_init(struct kqueue *kq) { mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK); TAILQ_INIT(&kq->kq_head); knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock); TASK_INIT(&kq->kq_task, 0, kqueue_task, kq); } int kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; struct ucred *cred; int fd, error; fdp = td->td_proc->p_fd; cred = td->td_ucred; if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES))) return (ENOMEM); error = falloc_caps(td, &fp, &fd, flags, fcaps); if (error != 0) { chgkqcnt(cred->cr_ruidinfo, -1, 0); return (error); } /* An extra reference on `fp' has been held for us by falloc(). 
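	 * It is dropped with fdrop() once finit() has installed the new
	 * kqueue as the file's private data.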
*/ kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO); kqueue_init(kq); kq->kq_fdp = fdp; kq->kq_cred = crhold(cred); FILEDESC_XLOCK(fdp); TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list); FILEDESC_XUNLOCK(fdp); finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops); fdrop(fp, td); td->td_retval[0] = fd; return (0); } struct g_kevent_args { int fd; const void *changelist; int nchanges; void *eventlist; int nevents; const struct timespec *timeout; }; int sys_kevent(struct thread *td, struct kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent_copyout, .k_copyin = kevent_copyin, .kevent_size = sizeof(struct kevent), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent")); } static int kern_kevent_generic(struct thread *td, struct g_kevent_args *uap, struct kevent_copyops *k_ops, const char *struct_name) { struct timespec ts, *tsp; #ifdef KTRACE struct kevent *eventlist = uap->eventlist; #endif int error; if (uap->timeout != NULL) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist, uap->nchanges, k_ops->kevent_size); #endif error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents, k_ops, tsp); #ifdef KTRACE if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY)) ktrstructarray(struct_name, UIO_USERSPACE, eventlist, td->td_retval[0], k_ops->kevent_size); #endif return (error); } /* * Copy 'count' items into the destination list pointed to by uap->eventlist. */ static int kevent_copyout(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyout(kevp, uap->eventlist, count * sizeof *kevp); if (error == 0) uap->eventlist += count; return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. */ static int kevent_copyin(void *arg, struct kevent *kevp, int count) { struct kevent_args *uap; int error; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct kevent_args *)arg; error = copyin(uap->changelist, kevp, count * sizeof *kevp); if (error == 0) uap->changelist += count; return (error); } #ifdef COMPAT_FREEBSD11 static int kevent11_copyout(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct freebsd11_kevent kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { kev11.ident = kevp->ident; kev11.filter = kevp->filter; kev11.flags = kevp->flags; kev11.fflags = kevp->fflags; kev11.data = kevp->data; kev11.udata = kevp->udata; error = copyout(&kev11, uap->eventlist, sizeof(kev11)); if (error != 0) break; uap->eventlist++; kevp++; } return (error); } /* * Copy 'count' items from the list pointed to by uap->changelist. 
*/ static int kevent11_copyin(void *arg, struct kevent *kevp, int count) { struct freebsd11_kevent_args *uap; struct freebsd11_kevent kev11; int error, i; KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count)); uap = (struct freebsd11_kevent_args *)arg; for (i = 0; i < count; i++) { error = copyin(uap->changelist, &kev11, sizeof(kev11)); if (error != 0) break; kevp->ident = kev11.ident; kevp->filter = kev11.filter; kevp->flags = kev11.flags; kevp->fflags = kev11.fflags; kevp->data = (uintptr_t)kev11.data; kevp->udata = kev11.udata; bzero(&kevp->ext, sizeof(kevp->ext)); uap->changelist++; kevp++; } return (error); } int freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap) { struct kevent_copyops k_ops = { .arg = uap, .k_copyout = kevent11_copyout, .k_copyin = kevent11_copyin, .kevent_size = sizeof(struct freebsd11_kevent), }; struct g_kevent_args gk_args = { .fd = uap->fd, .changelist = uap->changelist, .nchanges = uap->nchanges, .eventlist = uap->eventlist, .nevents = uap->nevents, .timeout = uap->timeout, }; return (kern_kevent_generic(td, &gk_args, &k_ops, "freebsd11_kevent")); } #endif int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { cap_rights_t rights; struct file *fp; int error; cap_rights_init_zero(&rights); if (nchanges > 0) cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE); if (nevents > 0) cap_rights_set_one(&rights, CAP_KQUEUE_EVENT); error = fget(td, fd, &rights, &fp); if (error != 0) return (error); error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout); fdrop(fp, td); return (error); } static int kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kevent keva[KQ_NEVENTS]; struct kevent *kevp, *changes; int i, n, nerrors, error; if (nchanges < 0) return (EINVAL); nerrors = 0; while (nchanges > 0) { n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges; error = k_ops->k_copyin(k_ops->arg, keva, n); if (error) return (error); changes = keva; for (i = 0; i < n; i++) { kevp = &changes[i]; if (!kevp->filter) continue; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td, M_WAITOK); if (error || (kevp->flags & EV_RECEIPT)) { if (nevents == 0) return (error); kevp->flags = EV_ERROR; kevp->data = error; (void)k_ops->k_copyout(k_ops->arg, kevp, 1); nevents--; nerrors++; } } nchanges -= n; } if (nerrors) { td->td_retval[0] = nerrors; return (0); } return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td)); } int kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout) { struct kqueue *kq; int error; error = kqueue_acquire(fp, &kq); if (error != 0) return (error); error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout); kqueue_release(kq, 0); return (error); } /* * Performs a kevent() call on a temporarily created kqueue. This can be * used to perform one-shot polling, similar to poll() and select(). 
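 *
 * The kqueue is a stack-allocated temporary that is never associated
 * with a file descriptor; it is drained and destroyed again before
 * this function returns.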
*/ int kern_kevent_anonymous(struct thread *td, int nevents, struct kevent_copyops *k_ops) { struct kqueue kq = {}; int error; kqueue_init(&kq); kq.kq_refcnt = 1; error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL); kqueue_drain(&kq, td); kqueue_destroy(&kq); return (error); } int kqueue_add_filteropts(int filt, const struct filterops *filtops) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) { printf( "trying to add a filterop that is out of range: %d is beyond %d\n", ~filt, EVFILT_SYSCOUNT); return EINVAL; } mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop != &null_filtops && sysfilt_ops[~filt].for_fop != NULL) error = EEXIST; else { sysfilt_ops[~filt].for_fop = filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return (error); } int kqueue_del_filteropts(int filt) { int error; error = 0; if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return EINVAL; mtx_lock(&filterops_lock); if (sysfilt_ops[~filt].for_fop == &null_filtops || sysfilt_ops[~filt].for_fop == NULL) error = EINVAL; else if (sysfilt_ops[~filt].for_refcnt != 0) error = EBUSY; else { sysfilt_ops[~filt].for_fop = &null_filtops; sysfilt_ops[~filt].for_refcnt = 0; } mtx_unlock(&filterops_lock); return error; } static const struct filterops * kqueue_fo_find(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return NULL; if (sysfilt_ops[~filt].for_nolock) return sysfilt_ops[~filt].for_fop; mtx_lock(&filterops_lock); sysfilt_ops[~filt].for_refcnt++; if (sysfilt_ops[~filt].for_fop == NULL) sysfilt_ops[~filt].for_fop = &null_filtops; mtx_unlock(&filterops_lock); return sysfilt_ops[~filt].for_fop; } static void kqueue_fo_release(int filt) { if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) return; if (sysfilt_ops[~filt].for_nolock) return; mtx_lock(&filterops_lock); KASSERT(sysfilt_ops[~filt].for_refcnt > 0, ("filter object refcount not valid on release")); sysfilt_ops[~filt].for_refcnt--; mtx_unlock(&filterops_lock); } /* * A ref to kq (obtained via kqueue_acquire) must be held. */ static int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int mflag) { const struct filterops *fops; struct file *fp; struct knote *kn, *tkn; struct knlist *knl; int error, filt, event; int haskqglobal, filedesc_unlock; if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE)) return (EINVAL); fp = NULL; kn = NULL; knl = NULL; error = 0; haskqglobal = 0; filedesc_unlock = 0; filt = kev->filter; fops = kqueue_fo_find(filt); if (fops == NULL) return EINVAL; if (kev->flags & EV_ADD) { /* Reject an invalid flag pair early */ if (kev->flags & EV_KEEPUDATA) { tkn = NULL; error = EINVAL; goto done; } /* * Prevent waiting with locks. Non-sleepable * allocation failures are handled in the loop, only * if the spare knote appears to be actually required. */ tkn = knote_alloc(mflag); } else { tkn = NULL; } findkn: if (fops->f_isfd) { KASSERT(td != NULL, ("td is NULL")); if (kev->ident > INT_MAX) error = EBADF; else error = fget(td, kev->ident, &cap_event_rights, &fp); if (error) goto done; if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops, kev->ident, M_NOWAIT) != 0) { /* try again */ fdrop(fp, td); fp = NULL; error = kqueue_expand(kq, fops, kev->ident, mflag); if (error) goto done; goto findkn; } if (fp->f_type == DTYPE_KQUEUE) { /* * If we add some intelligence about what we are doing, * we should be able to support events on ourselves. 
* We need to know when we are doing this to prevent * getting both the knlist lock and the kq lock since * they are the same thing. */ if (fp->f_data == kq) { error = EINVAL; goto done; } /* * Pre-lock the filedesc before the global * lock mutex, see the comment in * kqueue_close(). */ FILEDESC_XLOCK(td->td_proc->p_fd); filedesc_unlock = 1; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); } KQ_LOCK(kq); if (kev->ident < kq->kq_knlistsize) { SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link) if (kev->filter == kn->kn_filter) break; } } else { if ((kev->flags & EV_ADD) == EV_ADD) { error = kqueue_expand(kq, fops, kev->ident, mflag); if (error != 0) goto done; } KQ_LOCK(kq); /* * If possible, find an existing knote to use for this kevent. */ if (kev->filter == EVFILT_PROC && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { /* This is an internal creation of a process tracking * note. Don't attempt to coalesce this with an * existing note. */ ; } else if (kq->kq_knhashmask != 0) { struct klist *list; list = &kq->kq_knhash[ KN_HASH((u_long)kev->ident, kq->kq_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kev->filter == kn->kn_filter) break; } } /* knote is in the process of changing, wait for it to stabilize. */ if (kn != NULL && kn_in_flux(kn)) { KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) { FILEDESC_XUNLOCK(td->td_proc->p_fd); filedesc_unlock = 0; } kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0); if (fp != NULL) { fdrop(fp, td); fp = NULL; } goto findkn; } /* * kn now contains the matching knote, or NULL if no match */ if (kn == NULL) { if (kev->flags & EV_ADD) { kn = tkn; tkn = NULL; if (kn == NULL) { KQ_UNLOCK(kq); error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference counts to knote structure, and * do not release it at the end of this routine. */ fops = NULL; fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE | EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT); kn->kn_status = KN_DETACHED; if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; kn_enter_flux(kn); error = knote_attach(kn, kq); KQ_UNLOCK(kq); if (error != 0) { tkn = kn; goto done; } if ((error = kn->kn_fop->f_attach(kn)) != 0) { knote_drop_detached(kn, td); goto done; } knl = kn_list_lock(kn); goto done_ev_add; } else { /* No matching knote and the EV_ADD flag is not set. */ KQ_UNLOCK(kq); error = ENOENT; goto done; } } if (kev->flags & EV_DELETE) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); goto done; } if (kev->flags & EV_FORCEONESHOT) { kn->kn_flags |= EV_ONESHOT; KNOTE_ACTIVATE(kn, 1); } if ((kev->flags & EV_ENABLE) != 0) kn->kn_status &= ~KN_DISABLED; else if ((kev->flags & EV_DISABLE) != 0) kn->kn_status |= KN_DISABLED; /* * The user may change some filter values after the initial EV_ADD, * but doing so will not reset any filter which has already been * triggered. */ kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); knl = kn_list_lock(kn); if ((kev->flags & EV_KEEPUDATA) == 0) kn->kn_kevent.udata = kev->udata; if (!fops->f_isfd && fops->f_touch != NULL) { fops->f_touch(kn, kev, EVENT_REGISTER); } else { kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; } done_ev_add: /* * We can get here with kn->kn_knlist == NULL. 
This can happen when * the initial attach event decides that the event is "completed" * already, e.g., filt_procattach() is called on a zombie process. It * will call filt_proc() which will remove it from the list, and NULL * kn_knlist. * * KN_DISABLED will be stable while the knote is in flux, so the * unlocked read will not race with an update. */ if ((kn->kn_status & KN_DISABLED) == 0) event = kn->kn_fop->f_event(kn, 0); else event = 0; KQ_LOCK(kq); if (event) kn->kn_status |= KN_ACTIVE; if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) == KN_ACTIVE) knote_enqueue(kn); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); KQ_UNLOCK_FLUX(kq); done: KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (filedesc_unlock) FILEDESC_XUNLOCK(td->td_proc->p_fd); if (fp != NULL) fdrop(fp, td); knote_free(tkn); if (fops != NULL) kqueue_fo_release(filt); return (error); } static int kqueue_acquire(struct file *fp, struct kqueue **kqp) { int error; struct kqueue *kq; error = 0; kq = fp->f_data; if (fp->f_type != DTYPE_KQUEUE || kq == NULL) return (EBADF); *kqp = kq; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) { KQ_UNLOCK(kq); return (EBADF); } kq->kq_refcnt++; KQ_UNLOCK(kq); return error; } static void kqueue_release(struct kqueue *kq, int locked) { if (locked) KQ_OWNED(kq); else KQ_LOCK(kq); kq->kq_refcnt--; if (kq->kq_refcnt == 1) wakeup(&kq->kq_refcnt); if (!locked) KQ_UNLOCK(kq); } static void ast_kqueue(struct thread *td, int tda __unused) { taskqueue_quiesce(taskqueue_kqueue_ctx); } static void kqueue_schedtask(struct kqueue *kq) { KQ_OWNED(kq); KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN), ("scheduling kqueue task while draining")); if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) { taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task); kq->kq_state |= KQ_TASKSCHED; ast_sched(curthread, TDA_KQUEUE); } } /* * Expand the kq to make sure we have storage for fops/ident pair. * * Return 0 on success (or no work necessary), return errno on failure. */ static int kqueue_expand(struct kqueue *kq, const struct filterops *fops, uintptr_t ident, int mflag) { struct klist *list, *tmp_knhash, *to_free; u_long tmp_knhashmask; int error, fd, size; KQ_NOTOWNED(kq); error = 0; to_free = NULL; if (fops->f_isfd) { fd = ident; if (kq->kq_knlistsize <= fd) { size = kq->kq_knlistsize; while (size <= fd) size += KQEXTENT; list = malloc(size * sizeof(*list), M_KQUEUE, mflag); if (list == NULL) return ENOMEM; KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = list; error = EBADF; } else if (kq->kq_knlistsize > fd) { to_free = list; } else { if (kq->kq_knlist != NULL) { bcopy(kq->kq_knlist, list, kq->kq_knlistsize * sizeof(*list)); to_free = kq->kq_knlist; kq->kq_knlist = NULL; } bzero((caddr_t)list + kq->kq_knlistsize * sizeof(*list), (size - kq->kq_knlistsize) * sizeof(*list)); kq->kq_knlistsize = size; kq->kq_knlist = list; } KQ_UNLOCK(kq); } } else { if (kq->kq_knhashmask == 0) { tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE, &tmp_knhashmask, (mflag & M_WAITOK) != 0 ? 
HASH_WAITOK : HASH_NOWAIT); if (tmp_knhash == NULL) return (ENOMEM); KQ_LOCK(kq); if ((kq->kq_state & KQ_CLOSING) != 0) { to_free = tmp_knhash; error = EBADF; } else if (kq->kq_knhashmask == 0) { kq->kq_knhash = tmp_knhash; kq->kq_knhashmask = tmp_knhashmask; } else { to_free = tmp_knhash; } KQ_UNLOCK(kq); } } free(to_free, M_KQUEUE); KQ_NOTOWNED(kq); return (error); } static void kqueue_task(void *arg, int pending) { struct kqueue *kq; int haskqglobal; haskqglobal = 0; kq = arg; KQ_GLOBAL_LOCK(&kq_global, haskqglobal); KQ_LOCK(kq); KNOTE_LOCKED(&kq->kq_sel.si_note, 0); kq->kq_state &= ~KQ_TASKSCHED; if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) { wakeup(&kq->kq_state); } KQ_UNLOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); } /* * Scan, update kn_data (if not ONESHOT), and copyout triggered events. * We treat KN_MARKER knotes as if they are in flux. */ static int kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops, const struct timespec *tsp, struct kevent *keva, struct thread *td) { struct kevent *kevp; struct knote *kn, *marker; struct knlist *knl; sbintime_t asbt, rsbt; int count, error, haskqglobal, influx, nkev, touch; count = maxevents; nkev = 0; error = 0; haskqglobal = 0; if (maxevents == 0) goto done_nl; if (maxevents < 0) { error = EINVAL; goto done_nl; } rsbt = 0; if (tsp != NULL) { if (!timespecvalid_interval(tsp)) { error = EINVAL; goto done_nl; } if (timespecisset(tsp)) { if (tsp->tv_sec <= INT32_MAX) { rsbt = tstosbt(*tsp); if (TIMESEL(&asbt, rsbt)) asbt += tc_tick_sbt; if (asbt <= SBT_MAX - rsbt) asbt += rsbt; else asbt = 0; rsbt >>= tc_precexp; } else asbt = 0; } else asbt = -1; } else asbt = 0; marker = knote_alloc(M_WAITOK); marker->kn_status = KN_MARKER; KQ_LOCK(kq); retry: kevp = keva; if (kq->kq_count == 0) { if (asbt == -1) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH, "kqread", asbt, rsbt, C_ABSOLUTE); } if (error == 0) goto retry; /* don't restart after signals... */ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe); influx = 0; while (count) { KQ_OWNED(kq); kn = TAILQ_FIRST(&kq->kq_head); if ((kn->kn_status == KN_MARKER && kn != marker) || kn_in_flux(kn)) { if (influx) { influx = 0; KQ_FLUX_WAKEUP(kq); } kq->kq_state |= KQ_FLUXWAIT; error = msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); continue; } TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if (kn == marker) { KQ_FLUX_WAKEUP(kq); if (count == maxevents) goto retry; goto done; } KASSERT(!kn_in_flux(kn), ("knote %p is unexpectedly in flux", kn)); if ((kn->kn_flags & EV_DROP) == EV_DROP) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked it as in flux. */ knote_drop(kn, td); KQ_LOCK(kq); continue; } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kn_enter_flux(kn); kq->kq_count--; KQ_UNLOCK(kq); /* * We don't need to lock the list since we've * marked the knote as being in flux. 
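			 * The event is copied out before the knote is
			 * dropped, because EV_ONESHOT knotes are deleted
			 * once they have been delivered.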
*/ *kevp = kn->kn_kevent; knote_drop(kn, td); KQ_LOCK(kq); kn = NULL; } else { kn->kn_status |= KN_SCAN; kn_enter_flux(kn); KQ_UNLOCK(kq); if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE) KQ_GLOBAL_LOCK(&kq_global, haskqglobal); knl = kn_list_lock(kn); if (kn->kn_fop->f_event(kn, 0) == 0) { KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE | KN_SCAN); kn_leave_flux(kn); kq->kq_count--; kn_list_unlock(knl); influx = 1; continue; } touch = (!kn->kn_fop->f_isfd && kn->kn_fop->f_touch != NULL); if (touch) kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS); else *kevp = kn->kn_kevent; KQ_LOCK(kq); KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal); if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) { /* * Manually clear knotes who weren't * 'touch'ed. */ if (touch == 0 && kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; } if (kn->kn_flags & EV_DISPATCH) kn->kn_status |= KN_DISABLED; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_SCAN; kn_leave_flux(kn); kn_list_unlock(knl); influx = 1; } /* we are returning a copy to the user */ kevp++; nkev++; count--; if (nkev == KQ_NEVENTS) { influx = 0; KQ_UNLOCK_FLUX(kq); error = k_ops->k_copyout(k_ops->arg, keva, nkev); nkev = 0; kevp = keva; KQ_LOCK(kq); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe); done: KQ_OWNED(kq); KQ_UNLOCK_FLUX(kq); knote_free(marker); done_nl: KQ_NOTOWNED(kq); if (nkev != 0) error = k_ops->k_copyout(k_ops->arg, keva, nkev); td->td_retval[0] = maxevents - count; return (error); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { /* * Enabling sigio causes two major problems: * 1) infinite recursion: * Synopsys: kevent is being used to track signals and have FIOASYNC * set. On receipt of a signal this will cause a kqueue to recurse * into itself over and over. Sending the sigio causes the kqueue * to become ready, which in turn posts sigio again, forever. * Solution: this can be solved by setting a flag in the kqueue that * we have a SIGIO in progress. * 2) locking problems: * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts * us above the proc and pgrp locks. * Solution: Post a signal using an async mechanism, being sure to * record a generation count in the delivery so that we do not deliver * a signal to the wrong process. * * Note, these two mechanisms are somewhat mutually exclusive! */ #if 0 struct kqueue *kq; kq = fp->f_data; switch (cmd) { case FIOASYNC: if (*(int *)data) { kq->kq_state |= KQ_ASYNC; } else { kq->kq_state &= ~KQ_ASYNC; } return (0); case FIOSETOWN: return (fsetown(*(int *)data, &kq->kq_sigio)); case FIOGETOWN: *(int *)data = fgetown(&kq->kq_sigio); return (0); } #endif return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct kqueue *kq; int revents = 0; int error; if ((error = kqueue_acquire(fp, &kq))) return POLLERR; KQ_LOCK(kq); if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); if (SEL_WAITING(&kq->kq_sel)) kq->kq_state |= KQ_SEL; } } kqueue_release(kq, 1); KQ_UNLOCK(kq); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { bzero((void *)st, sizeof *st); /* * We no longer return kq_count because the unlocked value is useless. 
* If you spent all this time getting the count, why not spend your * syscall better by calling kevent? * * XXX - This is needed for libc_r. */ st->st_mode = S_IFIFO; return (0); } static void kqueue_drain(struct kqueue *kq, struct thread *td) { struct knote *kn; int i; KQ_LOCK(kq); KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING, ("kqueue already closing")); kq->kq_state |= KQ_CLOSING; if (kq->kq_refcnt > 1) msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0); KASSERT(kq->kq_refcnt == 1, ("other refs are out there!")); KASSERT(knlist_empty(&kq->kq_sel.si_note), ("kqueue's knlist not empty")); for (i = 0; i < kq->kq_knlistsize; i++) { while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } if (kq->kq_knhashmask != 0) { for (i = 0; i <= kq->kq_knhashmask; i++) { while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) { if (kn_in_flux(kn)) { kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqclo2", 0); continue; } kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop(kn, td); KQ_LOCK(kq); } } } if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) { kq->kq_state |= KQ_TASKDRAIN; msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } KQ_UNLOCK(kq); } static void kqueue_destroy(struct kqueue *kq) { KASSERT(kq->kq_fdp == NULL, ("kqueue still attached to a file descriptor")); seldrain(&kq->kq_sel); knlist_destroy(&kq->kq_sel.si_note); mtx_destroy(&kq->kq_lock); if (kq->kq_knhash != NULL) free(kq->kq_knhash, M_KQUEUE); if (kq->kq_knlist != NULL) free(kq->kq_knlist, M_KQUEUE); funsetown(&kq->kq_sigio); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; struct filedesc *fdp; int error; int filedesc_unlock; if ((error = kqueue_acquire(fp, &kq))) return error; kqueue_drain(kq, td); /* * We could be called due to the knote_drop() doing fdrop(), * called from kqueue_register(). In this case the global * lock is owned, and filedesc sx is locked before, to not * take the sleepable lock after non-sleepable. */ fdp = kq->kq_fdp; kq->kq_fdp = NULL; if (!sx_xlocked(FILEDESC_LOCK(fdp))) { FILEDESC_XLOCK(fdp); filedesc_unlock = 1; } else filedesc_unlock = 0; TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list); if (filedesc_unlock) FILEDESC_XUNLOCK(fdp); kqueue_destroy(kq); chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0); crfree(kq->kq_cred); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static int kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct kqueue *kq = fp->f_data; kif->kf_type = KF_TYPE_KQUEUE; kif->kf_un.kf_kqueue.kf_kqueue_addr = (uintptr_t)kq; kif->kf_un.kf_kqueue.kf_kqueue_count = kq->kq_count; kif->kf_un.kf_kqueue.kf_kqueue_state = kq->kq_state; return (0); } static void kqueue_wakeup(struct kqueue *kq) { KQ_OWNED(kq); if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if ((kq->kq_state & KQ_SEL) == KQ_SEL) { selwakeuppri(&kq->kq_sel, PSOCK); if (!SEL_WAITING(&kq->kq_sel)) kq->kq_state &= ~KQ_SEL; } if (!knlist_empty(&kq->kq_sel.si_note)) kqueue_schedtask(kq); if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) { pgsigio(&kq->kq_sigio, SIGIO, 0); } } /* * Walk down a list of knotes, activating them if their event has triggered. 
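 * Called via the KNOTE(), KNOTE_LOCKED() and KNOTE_UNLOCKED() macros whenever
 * an event source fires.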
 *
 * There is a possibility to optimize in the case of one kq watching
 * another.  Instead of scheduling a task to wake it up, you could pass
 * enough state down the chain to make up the parent kqueue.  Make this
 * code functional first.
 */
void
knote(struct knlist *list, long hint, int lockflags)
{
	struct kqueue *kq;
	struct knote *kn, *tkn;
	int error;

	if (list == NULL)
		return;

	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);

	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_lock(list->kl_lockarg);

	/*
	 * If we unlock the list lock (and enter influx), we can
	 * eliminate the kqueue scheduling, but this will introduce
	 * four lock/unlock's for each knote to test.  Also, marker
	 * would be needed to keep iteration position, since filters
	 * or other threads could remove events.
	 */
	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
			/*
			 * Do not process the influx notes, except for
			 * the influx coming from the kq unlock in the
			 * kqueue_scan().  In the latter case, we do
			 * not interfere with the scan, since the code
			 * fragment in kqueue_scan() locks the knlist,
			 * and cannot proceed until we have finished.
			 */
			KQ_UNLOCK(kq);
		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
			kn_enter_flux(kn);
			KQ_UNLOCK(kq);
			error = kn->kn_fop->f_event(kn, hint);
			KQ_LOCK(kq);
			kn_leave_flux(kn);
			if (error)
				KNOTE_ACTIVATE(kn, 1);
			KQ_UNLOCK_FLUX(kq);
		} else {
			if (kn->kn_fop->f_event(kn, hint))
				KNOTE_ACTIVATE(kn, 1);
			KQ_UNLOCK(kq);
		}
	}
	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_unlock(list->kl_lockarg);
}

/*
 * add a knote to a knlist
 */
void
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
{
	KNL_ASSERT_LOCK(knl, islocked);
	KQ_NOTOWNED(kn->kn_kq);
	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
	KASSERT((kn->kn_status & KN_DETACHED) != 0,
	    ("knote %p was not detached", kn));
	if (!islocked)
		knl->kl_lock(knl->kl_lockarg);
	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
	if (!islocked)
		knl->kl_unlock(knl->kl_lockarg);
	KQ_LOCK(kn->kn_kq);
	kn->kn_knlist = knl;
	kn->kn_status &= ~KN_DETACHED;
	KQ_UNLOCK(kn->kn_kq);
}

static void
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
    int kqislocked)
{
	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
	KNL_ASSERT_LOCK(knl, knlislocked);
	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ?
MA_OWNED : MA_NOTOWNED); KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn)); KASSERT((kn->kn_status & KN_DETACHED) == 0, ("knote %p was already detached", kn)); if (!knlislocked) knl->kl_lock(knl->kl_lockarg); SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext); kn->kn_knlist = NULL; if (!knlislocked) kn_list_unlock(knl); if (!kqislocked) KQ_LOCK(kn->kn_kq); kn->kn_status |= KN_DETACHED; if (!kqislocked) KQ_UNLOCK(kn->kn_kq); } /* * remove knote from the specified knlist */ void knlist_remove(struct knlist *knl, struct knote *kn, int islocked) { knlist_remove_kq(knl, kn, islocked, 0); } int knlist_empty(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); return (SLIST_EMPTY(&knl->kl_list)); } static struct mtx knlist_lock; MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects", MTX_DEF); static void knlist_mtx_lock(void *arg); static void knlist_mtx_unlock(void *arg); static void knlist_mtx_lock(void *arg) { mtx_lock((struct mtx *)arg); } static void knlist_mtx_unlock(void *arg) { mtx_unlock((struct mtx *)arg); } static void knlist_mtx_assert_lock(void *arg, int what) { if (what == LA_LOCKED) mtx_assert((struct mtx *)arg, MA_OWNED); else mtx_assert((struct mtx *)arg, MA_NOTOWNED); } void knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *), void (*kl_unlock)(void *), void (*kl_assert_lock)(void *, int)) { if (lock == NULL) knl->kl_lockarg = &knlist_lock; else knl->kl_lockarg = lock; if (kl_lock == NULL) knl->kl_lock = knlist_mtx_lock; else knl->kl_lock = kl_lock; if (kl_unlock == NULL) knl->kl_unlock = knlist_mtx_unlock; else knl->kl_unlock = kl_unlock; if (kl_assert_lock == NULL) knl->kl_assert_lock = knlist_mtx_assert_lock; else knl->kl_assert_lock = kl_assert_lock; knl->kl_autodestroy = 0; SLIST_INIT(&knl->kl_list); } void knlist_init_mtx(struct knlist *knl, struct mtx *lock) { knlist_init(knl, lock, NULL, NULL, NULL); } struct knlist * knlist_alloc(struct mtx *lock) { struct knlist *knl; knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK); knlist_init_mtx(knl, lock); return (knl); } void knlist_destroy(struct knlist *knl) { KASSERT(KNLIST_EMPTY(knl), ("destroying knlist %p with knotes on it", knl)); } void knlist_detach(struct knlist *knl) { KNL_ASSERT_LOCKED(knl); knl->kl_autodestroy = 1; if (knlist_empty(knl)) { knlist_destroy(knl); free(knl, M_KQUEUE); } } /* * Even if we are locked, we may need to drop the lock to allow any influx * knotes time to "settle". 
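 * If killkn is true the knotes are dropped outright; otherwise they are left
 * attached but marked EV_EOF | EV_ONESHOT so that they are reaped on the
 * next scan.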
*/ void knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn) { struct knote *kn, *kn2; struct kqueue *kq; KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl)); if (islocked) KNL_ASSERT_LOCKED(knl); else { KNL_ASSERT_UNLOCKED(knl); again: /* need to reacquire lock since we have dropped it */ knl->kl_lock(knl->kl_lockarg); } SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) { kq = kn->kn_kq; KQ_LOCK(kq); if (kn_in_flux(kn)) { KQ_UNLOCK(kq); continue; } knlist_remove_kq(knl, kn, 1, 1); if (killkn) { kn_enter_flux(kn); KQ_UNLOCK(kq); knote_drop_detached(kn, td); } else { /* Make sure cleared knotes disappear soon */ kn->kn_flags |= EV_EOF | EV_ONESHOT; KQ_UNLOCK(kq); } kq = NULL; } if (!SLIST_EMPTY(&knl->kl_list)) { /* there are still in flux knotes remaining */ kn = SLIST_FIRST(&knl->kl_list); kq = kn->kn_kq; KQ_LOCK(kq); KASSERT(kn_in_flux(kn), ("knote removed w/o list lock")); knl->kl_unlock(knl->kl_lockarg); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0); kq = NULL; goto again; } if (islocked) KNL_ASSERT_LOCKED(knl); else { knl->kl_unlock(knl->kl_lockarg); KNL_ASSERT_UNLOCKED(knl); } } /* * Remove all knotes referencing a specified fd must be called with FILEDESC * lock. This prevents a race where a new fd comes along and occupies the * entry and we attach a knote to the fd. */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; struct kqueue *kq; struct knote *kn; int influx; FILEDESC_XLOCK_ASSERT(fdp); /* * We shouldn't have to worry about new kevents appearing on fd * since filedesc is locked. */ TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) { KQ_LOCK(kq); again: influx = 0; while (kq->kq_knlistsize > fd && (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) { if (kn_in_flux(kn)) { /* someone else might be waiting on our knote */ if (influx) wakeup(kq); kq->kq_state |= KQ_FLUXWAIT; msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0); goto again; } kn_enter_flux(kn); KQ_UNLOCK(kq); influx = 1; knote_drop(kn, td); KQ_LOCK(kq); } KQ_UNLOCK_FLUX(kq); } } static int knote_attach(struct knote *kn, struct kqueue *kq) { struct klist *list; KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn)); KQ_OWNED(kq); if ((kq->kq_state & KQ_CLOSING) != 0) return (EBADF); if (kn->kn_fop->f_isfd) { if (kn->kn_id >= kq->kq_knlistsize) return (ENOMEM); list = &kq->kq_knlist[kn->kn_id]; } else { if (kq->kq_knhash == NULL) return (ENOMEM); list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; } SLIST_INSERT_HEAD(list, kn, kn_link); return (0); } static void knote_drop(struct knote *kn, struct thread *td) { if ((kn->kn_status & KN_DETACHED) == 0) kn->kn_fop->f_detach(kn); knote_drop_detached(kn, td); } static void knote_drop_detached(struct knote *kn, struct thread *td) { struct kqueue *kq; struct klist *list; kq = kn->kn_kq; KASSERT((kn->kn_status & KN_DETACHED) != 0, ("knote %p still attached", kn)); KQ_NOTOWNED(kq); KQ_LOCK(kq); KASSERT(kn->kn_influx == 1, ("knote_drop called on %p with influx %d", kn, kn->kn_influx)); if (kn->kn_fop->f_isfd) list = &kq->kq_knlist[kn->kn_id]; else list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)]; if (!SLIST_EMPTY(list)) SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); KQ_UNLOCK_FLUX(kq); if (kn->kn_fop->f_isfd) { fdrop(kn->kn_fp, td); kn->kn_fp = NULL; } kqueue_fo_release(kn->kn_kevent.filter); kn->kn_fop = NULL; knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = 
kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; KQ_OWNED(kn->kn_kq); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; } static void knote_init(void) { knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); static struct knote * knote_alloc(int mflag) { return (uma_zalloc(knote_zone, mflag | M_ZERO)); } static void knote_free(struct knote *kn) { uma_zfree(knote_zone, kn); } /* * Register the kev w/ the kq specified by fd. */ int kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag) { struct kqueue *kq; struct file *fp; cap_rights_t rights; int error; error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &fp); if (error != 0) return (error); if ((error = kqueue_acquire(fp, &kq)) != 0) goto noacquire; error = kqueue_register(kq, kev, td, mflag); kqueue_release(kq, 0); noacquire: fdrop(fp, td); return (error); } diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 802231767762..6661f4cd6187 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -1,4618 +1,4618 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , signal__send, "struct thread *", "struct proc *", "int"); SDT_PROBE_DEFINE2(proc, , , signal__clear, "int", "ksiginfo_t *"); SDT_PROBE_DEFINE3(proc, , , signal__discard, "struct thread *", "struct proc *", "int"); static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); static void reschedule_signals(struct proc *p, sigset_t block, int flags); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static int sig_suspend_threads(struct thread *, struct proc *); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, bool fast_sigblock); static void sigqueue_start(void); static void sigfastblock_setpend(struct thread *td, bool resched); static uma_zone_t ksiginfo_zone = NULL; -struct filterops sig_filtops = { +const struct filterops sig_filtops = { .f_isfd = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); static int kern_lognosys = 0; SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0, "Log invalid syscalls"); static int kern_signosys = 1; SYSCTL_INT(_kern, OID_AUTO, signosys, CTLFLAG_RWTUN, &kern_signosys, 0, "Send SIGSYS on return from invalid syscall"); __read_frequently bool sigfastblock_fetch_always = false; SYSCTL_BOOL(_kern, OID_AUTO, sigfastblock_fetch_always, CTLFLAG_RWTUN, &sigfastblock_fetch_always, 0, "Fetch sigfastblock word on each syscall entry for proper " "blocking semantic"); static bool kern_sig_discard_ign = true; SYSCTL_BOOL(_kern, OID_AUTO, sig_discard_ign, CTLFLAG_RWTUN, &kern_sig_discard_ign, 0, "Discard ignored signals on delivery, otherwise queue them 
to " "the target queue"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) static int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); static int capmode_coredump; SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, &capmode_coredump, 0, "Allow processes in capability mode to dump core"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); static int coredump_devctl = 0; SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, 0, "Generate a devctl notification when processes coredump"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SIGPROP_KILL 0x01 /* terminates process by default */ #define SIGPROP_CORE 0x02 /* ditto and coredumps */ #define SIGPROP_STOP 0x04 /* suspend process */ #define SIGPROP_TTYSTOP 0x08 /* ditto, from tty */ #define SIGPROP_IGNORE 0x10 /* ignore by default */ #define SIGPROP_CONT 0x20 /* continue if suspended */ static const int sigproptbl[NSIG] = { [SIGHUP] = SIGPROP_KILL, [SIGINT] = SIGPROP_KILL, [SIGQUIT] = SIGPROP_KILL | SIGPROP_CORE, [SIGILL] = SIGPROP_KILL | SIGPROP_CORE, [SIGTRAP] = SIGPROP_KILL | SIGPROP_CORE, [SIGABRT] = SIGPROP_KILL | SIGPROP_CORE, [SIGEMT] = SIGPROP_KILL | SIGPROP_CORE, [SIGFPE] = SIGPROP_KILL | SIGPROP_CORE, [SIGKILL] = SIGPROP_KILL, [SIGBUS] = SIGPROP_KILL | SIGPROP_CORE, [SIGSEGV] = SIGPROP_KILL | SIGPROP_CORE, [SIGSYS] = SIGPROP_KILL | SIGPROP_CORE, [SIGPIPE] = SIGPROP_KILL, [SIGALRM] = SIGPROP_KILL, [SIGTERM] = SIGPROP_KILL, [SIGURG] = SIGPROP_IGNORE, [SIGSTOP] = SIGPROP_STOP, [SIGTSTP] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGCONT] = SIGPROP_IGNORE | SIGPROP_CONT, [SIGCHLD] = SIGPROP_IGNORE, [SIGTTIN] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGTTOU] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGIO] = SIGPROP_IGNORE, [SIGXCPU] = SIGPROP_KILL, [SIGXFSZ] = SIGPROP_KILL, [SIGVTALRM] = SIGPROP_KILL, [SIGPROF] = SIGPROP_KILL, [SIGWINCH] = SIGPROP_IGNORE, [SIGINFO] = SIGPROP_IGNORE, [SIGUSR1] = SIGPROP_KILL, [SIGUSR2] = SIGPROP_KILL, }; #define _SIG_FOREACH_ADVANCE(i, set) ({ \ int __found; \ for (;;) { \ if (__bits != 0) { \ int __sig = ffs(__bits); \ __bits &= ~(1u << (__sig - 1)); \ sig = __i * sizeof((set)->__bits[0]) * NBBY + __sig; \ __found = 1; \ break; \ } \ if (++__i == _SIG_WORDS) { \ __found = 0; \ break; \ } \ __bits = (set)->__bits[__i]; \ } \ __found != 0; \ }) #define SIG_FOREACH(i, set) \ for (int32_t __i = -1, __bits = 0; \ _SIG_FOREACH_ADVANCE(i, set); ) \ static sigset_t fastblock_mask; static void ast_sig(struct thread *td, int tda) { struct proc *p; int old_boundary, sig; bool resched_sigs; p = td->td_proc; #ifdef DIAGNOSTIC if (p->p_numthreads == 1 && (tda & (TDAI(TDA_SIG) | TDAI(TDA_AST))) == 0) { PROC_LOCK(p); thread_lock(td); /* * Note that TDA_SIG should be re-read from * td_ast, 
since signal might have been delivered * after we cleared td_flags above. This is one of * the reason for looping check for AST condition. * See comment in userret() about P_PPWAIT. */ if ((p->p_flag & P_PPWAIT) == 0 && (td->td_pflags & TDP_SIGFASTBLOCK) == 0) { if (SIGPENDING(td) && ((tda | td->td_ast) & (TDAI(TDA_SIG) | TDAI(TDA_AST))) == 0) { thread_unlock(td); /* fix dumps */ panic( "failed2 to set signal flags for ast p %p " "td %p tda %#x td_ast %#x fl %#x", p, td, tda, td->td_ast, td->td_flags); } } thread_unlock(td); PROC_UNLOCK(p); } #endif /* * Check for signals. Unlocked reads of p_pendingcnt or * p_siglist might cause process-directed signal to be handled * later. */ if ((tda & TDAI(TDA_SIG)) != 0 || p->p_pendingcnt > 0 || !SIGISEMPTY(p->p_siglist)) { sigfastblock_fetch(td); PROC_LOCK(p); old_boundary = ~TDB_BOUNDARY | (td->td_dbgflags & TDB_BOUNDARY); td->td_dbgflags |= TDB_BOUNDARY; mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); td->td_dbgflags &= old_boundary; PROC_UNLOCK(p); resched_sigs = true; } else { resched_sigs = false; } /* * Handle deferred update of the fast sigblock value, after * the postsig() loop was performed. */ sigfastblock_setpend(td, resched_sigs); } static void ast_sigsuspend(struct thread *td, int tda __unused) { MPASS((td->td_pflags & TDP_OLDMASK) != 0); td->td_pflags &= ~TDP_OLDMASK; kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0); } static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); SIGFILLSET(fastblock_mask); SIG_CANTMASK(fastblock_mask); ast_register(TDA_SIG, ASTR_UNCOND, 0, ast_sig); ast_register(TDA_SIGSUSPEND, ASTR_ASTF_REQUIRED | ASTR_TDP, TDP_OLDMASK, ast_sigsuspend); } ksiginfo_t * ksiginfo_alloc(int mwait) { MPASS(mwait == M_WAITOK || mwait == M_NOWAIT); if (ksiginfo_zone == NULL) return (NULL); return (uma_zalloc(ksiginfo_zone, mwait | M_ZERO)); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline bool ksiginfo_tryfree(ksiginfo_t *ksi) { if ((ksi->ksi_flags & KSI_EXT) == 0) { uma_zfree(ksiginfo_zone, ksi); return (true); } return (false); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); SIGEMPTYSET(list->sq_ptrace); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. 
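 * A queued ksiginfo for signo, if any, is dequeued and copied into *si;
 * signals posted without siginfo (the sq_kill and sq_ptrace bitmaps) only
 * report the signal number.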
* Return: * 0 - signal not found * others - signal number */ static int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_ptrace, signo)) { count++; SIGDELSET(sq->sq_ptrace, signo); si->ksi_flags |= KSI_PTRACE; } if (SIGISMEMBER(sq->sq_kill, signo)) { count++; if (count == 1) SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo) && !SIGISMEMBER(sq->sq_ptrace, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } static int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); /* * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path * for these signals. */ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(M_NOWAIT)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if (ret != 0) { if ((si->ksi_flags & KSI_PTRACE) != 0) { SIGADDSET(sq->sq_ptrace, signo); ret = 0; goto out_set_bit; } else if ((si->ksi_flags & KSI_TRAP) != 0 || (si->ksi_flags & KSI_SIGQ) == 0) { SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } return (ret); } out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); SIGEMPTYSET(sq->sq_ptrace); } static void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set) { sigset_t tmp; struct proc *p1, *p2; 
ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_ptrace; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_ptrace, tmp); SIGSETNAND(src->sq_ptrace, tmp); tmp = src->sq_signals; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); } #if 0 static void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } #endif static void sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_ptrace, *set); SIGSETNAND(sq->sq_signals, *set); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ static void sigqueue_delete_set_proc(struct proc *p, const sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } static void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to thread td, the current * thread, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (SIGPENDING(td)) ast_sched(td, TDA_SIG); } /* * Returns 1 (true) if altstack is configured for the thread, and the * passed stack bottom address falls into the altstack range. Handles * the 43 compat special case where the alt stack size is zero. 
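 * In that case the SS_ONSTACK flag alone decides the result.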
*/ int sigonstack(size_t sp) { struct thread *td; td = curthread; if ((td->td_pflags & TDP_ALTSTACK) == 0) return (0); #if defined(COMPAT_43) if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && td->td_sigstk.ss_size == 0) return ((td->td_sigstk.ss_flags & SS_ONSTACK) != 0); #endif return (sp >= (size_t)td->td_sigstk.ss_sp && sp < td->td_sigstk.ss_size + (size_t)td->td_sigstk.ss_sp); } static __inline int sigprop(int sig) { if (sig > 0 && sig < nitems(sigproptbl)) return (sigproptbl[sig]); return (0); } static bool sigact_flag_test(const struct sigaction *act, int flag) { /* * SA_SIGINFO is reset when signal disposition is set to * ignore or default. Other flags are kept according to user * settings. */ return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO || ((__sighandler_t *)act->sa_sigaction != SIG_IGN && (__sighandler_t *)act->sa_sigaction != SIG_DFL))); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags) { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); if (act != NULL && act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK | SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER | SA_NOCLDWAIT | SA_SIGINFO)) != 0) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { memset(oact, 0, sizeof(*oact)); oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (sigact_flag_test(act, SA_SIGINFO)) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!sigact_flag_test(act, SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (sigact_flag_test(act, SA_ONSTACK)) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (sigact_flag_test(act, SA_RESETHAND)) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (sigact_flag_test(act, SA_NODEFER)) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. 
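					 * (init is the reaper of last
					 * resort and must not opt out of
					 * reaping.)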
*/ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SIGPROP_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sys_sigaction(struct thread *td, struct sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(struct thread *td, struct osigaction_args *uap) { struct osigaction sa; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? 
&osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(struct thread *td, struct osigreturn_args *uap) { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(struct proc *p) { int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SIGPROP_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset specified signal to the default disposition. */ static void sigdflt(struct sigacts *ps, int sig) { mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { struct sigacts *ps; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig_drop_caught(p); /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td = curthread; MPASS(td->td_proc == p); td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset, int flags) { sigset_t new_block, oset1; struct proc *p; int error; p = td->td_proc; if ((flags & SIGPROCMASK_PROC_LOCKED) != 0) PROC_LOCK_ASSERT(p, MA_OWNED); else PROC_LOCK(p); mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; SIGSETOR(td->td_sigmask, *set); new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); goto out; case SIG_SETMASK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; if (flags & SIGPROCMASK_OLD) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); signotify(td); break; default: error = EINVAL; goto out; } /* * The new_block set contains signals that were not previously * blocked, but are blocked now. * * In case we block any signal that was not previously blocked * for td, and process has the signal pending, try to schedule * signal delivery to some thread that does not block the * signal, possibly waking it up. 
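		 * reschedule_signals() below does this; it is skipped for
		 * single-threaded processes, where there is no other thread
		 * to hand the signal to.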
*/ if (p->p_numthreads != 1) reschedule_signals(p, new_block, flags); } out: if (!(flags & SIGPROCMASK_PROC_LOCKED)) PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap) { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(struct thread *td, struct osigprocmask_args *uap) { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sys_sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { /* * sigwait() function shall not return EINTR, but * the syscall does. Non-ancient libc provides the * wrapper which hides EINTR. Otherwise, EINTR return * is used by libthr to handle required cancellation * point in the sigwait(). */ if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT) return (ERESTART); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } static void proc_td_siginfo_capture(struct thread *td, siginfo_t *si) { struct thread *thr; FOREACH_THREAD_IN_PROC(td->td_proc, thr) { if (thr == td) thr->td_si = *si; else thr->td_si.si_signo = 0; } } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t saved_mask, new_block; struct proc *p; int error, sig, timevalid = 0; sbintime_t sbt, precision, tsbt; struct timespec ts; bool traced; p = td->td_proc; error = 0; traced = false; /* Ensure the sigfastblock value is up to date. 
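 * (sigfastblock_fetch() re-reads the userspace fast-block word, so the
 * pending-signal checks below see a current td_sigblock_val).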
*/ sigfastblock_fetch(td); if (timeout != NULL) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; ts = *timeout; if (ts.tv_sec < INT32_MAX / 2) { tsbt = tstosbt(ts); precision = tsbt; precision >>= tc_precexp; if (TIMESEL(&sbt, tsbt)) sbt += tc_tick_sbt; sbt += tsbt; } else precision = sbt = 0; } } else precision = sbt = 0; ksiginfo_init(ksi); /* Some signals can not be waited for. */ SIG_CANTMASK(waitset); ps = p->p_sigacts; PROC_LOCK(p); saved_mask = td->td_sigmask; SIGSETNAND(td->td_sigmask, waitset); if ((p->p_sysent->sv_flags & SV_SIG_DISCIGN) != 0 || !kern_sig_discard_ign) { thread_lock(td); td->td_flags |= TDF_SIGWAIT; thread_unlock(td); } for (;;) { mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); KASSERT(sig >= 0, ("sig %d", sig)); if (sig != 0 && SIGISMEMBER(waitset, sig)) { if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 || sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) { error = 0; break; } } if (error != 0) break; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout != NULL && !timevalid) { error = EINVAL; break; } if (traced) { error = EINTR; break; } error = msleep_sbt(&p->p_sigacts, &p->p_mtx, PPAUSE | PCATCH, "sigwait", sbt, precision, C_ABSOLUTE); /* The syscalls can not be restarted. */ if (error == ERESTART) error = EINTR; /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR after wait was done. Only do this as last * resort after rechecking for possible queued signals * and expired timeouts. */ if (error == 0 && (p->p_ptevents & PTRACE_SYSCALL) != 0) traced = true; } thread_lock(td); td->td_flags &= ~TDF_SIGWAIT; thread_unlock(td); new_block = saved_mask; SIGSETNAND(new_block, td->td_sigmask); td->td_sigmask = saved_mask; /* * Fewer signals can be delivered to us, reschedule signal * notification. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, 0); if (error == 0) { SDT_PROBE2(proc, , , signal__clear, sig, ksi); if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code); } #endif if (sig == SIGKILL) { proc_td_siginfo_capture(td, &ksi->ksi_info); sigexit(td, sig); } } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sys_sigpending(struct thread *td, struct sigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(struct thread *td, struct osigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. 
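 * The old struct sigvec is translated to and from struct sigaction:
 * sv_handler <-> sa_handler, sv_mask <-> sa_mask via OSIG2SIG()/SIG2OSIG(),
 * and sv_flags <-> sa_flags with SV_INTERRUPT inverted into SA_RESTART.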
*/ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(struct thread *td, struct osigvec_args *uap) { struct sigvec vec; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(struct thread *td, struct osigblock_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(struct thread *td, struct osigsetmask_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap) { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; int has_sig, sig; /* Ensure the sigfastblock value is up to date. */ sigfastblock_fetch(td); /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask, SIGPROCMASK_PROC_LOCKED); td->td_pflags |= TDP_OLDMASK; ast_sched(td, TDA_SIGSUSPEND); /* * Process signals now. Otherwise, we can get spurious wakeup * due to signal entered process queue, but delivered to other * thread. But sigsuspend should return only on signal * delivery. */ (p->p_sysent->sv_set_syscall_retval)(td, EINTR); for (has_sig = 0; !has_sig;) { while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; thread_suspend_check(0); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); has_sig += postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); /* * If PTRACE_SCE or PTRACE_SCX were set after * userspace entered the syscall, return spurious * EINTR. */ if ((p->p_ptevents & PTRACE_SYSCALL) != 0) has_sig += 1; } PROC_UNLOCK(p); td->td_errno = EINTR; td->td_pflags |= TDP_NERRNO; return (EJUSTRETURN); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. 
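 * The osigset_t argument is widened to a full sigset_t with OSIG2SIG()
 * before being handed to kern_sigsuspend().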
*/ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(struct thread *td, struct osigsuspend_args *uap) { sigset_t mask; OSIG2SIG(uap->mask, mask); return (kern_sigsuspend(td, mask)); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(struct thread *td, struct osigstack_args *uap) { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap) { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } struct killpg1_ctx { struct thread *td; ksiginfo_t *ksi; int sig; bool sent; bool found; int ret; }; static void killpg1_sendsig_locked(struct proc *p, struct killpg1_ctx *arg) { int err; err = p_cansignal(arg->td, p, arg->sig); if (err == 0 && arg->sig != 0) pksignal(p, arg->sig, arg->ksi); if (err != ESRCH) arg->found = true; if (err == 0) arg->sent = true; else if (arg->ret == 0 && err != ESRCH && err != EPERM) arg->ret = err; } static void killpg1_sendsig(struct proc *p, bool notself, struct killpg1_ctx *arg) { if (p->p_pid <= 1 || (p->p_flag & P_SYSTEM) != 0 || (notself && p == arg->td->td_proc) || p->p_state == PRS_NEW) return; PROC_LOCK(p); killpg1_sendsig_locked(p, arg); PROC_UNLOCK(p); } static void kill_processes_prison_cb(struct proc *p, void *arg) { struct killpg1_ctx *ctx = arg; if (p->p_pid <= 1 || (p->p_flag & P_SYSTEM) != 0 || (p == ctx->td->td_proc) || p->p_state == PRS_NEW) return; killpg1_sendsig_locked(p, ctx); } /* * Common code for kill process group/broadcast kill. * td is the calling thread, as usual. 
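 * A pgid of 0 selects the caller's own process group; a non-zero "all"
 * requests a broadcast restricted to the caller's prison via
 * prison_proc_iterate().  System processes and pid 1 are always skipped,
 * as is the caller itself in the broadcast case.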
*/ static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi) { struct proc *p; struct pgrp *pgrp; struct killpg1_ctx arg; arg.td = td; arg.ksi = ksi; arg.sig = sig; arg.sent = false; arg.found = false; arg.ret = 0; if (all) { /* * broadcast */ prison_proc_iterate(td->td_ucred->cr_prison, kill_processes_prison_cb, &arg); } else { again: sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); if (!sx_try_xlock(&pgrp->pg_killsx)) { PGRP_UNLOCK(pgrp); sx_xlock(&pgrp->pg_killsx); sx_xunlock(&pgrp->pg_killsx); goto again; } LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { killpg1_sendsig(p, false, &arg); } PGRP_UNLOCK(pgrp); sx_xunlock(&pgrp->pg_killsx); } MPASS(arg.ret != 0 || arg.found || !arg.sent); if (arg.ret == 0 && !arg.sent) arg.ret = arg.found ? EPERM : ESRCH; return (arg.ret); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int sys_kill(struct thread *td, struct kill_args *uap) { return (kern_kill(td, uap->pid, uap->signum)); } int kern_kill(struct thread *td, pid_t pid, int signum) { ksiginfo_t ksi; struct proc *p; int error; /* * A process in capability mode can send signals only to himself. * The main rationale behind this is that abort(3) is implemented as * kill(getpid(), SIGABRT). */ if (pid != td->td_proc->p_pid) { if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_SIGNAL, &signum); if (IN_CAPABILITY_MODE(td)) return (ECAPMODE); } AUDIT_ARG_SIGNUM(signum); AUDIT_ARG_PID(pid); if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (pid > 0) { /* kill single process */ if ((p = pfind_any(pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, signum); if (error == 0 && signum) pksignal(p, signum, &ksi); PROC_UNLOCK(p); return (error); } switch (pid) { case -1: /* broadcast signal */ return (killpg1(td, signum, 0, 1, &ksi)); case 0: /* signal own process group */ return (killpg1(td, signum, 0, 0, &ksi)); default: /* negative explicit process group */ return (killpg1(td, signum, -pid, 0, &ksi)); } /* NOTREACHED */ } int sys_pdkill(struct thread *td, struct pdkill_args *uap) { struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_FD(uap->fd); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); error = procdesc_find(td, uap->fd, &cap_pdkill_rights, &p); if (error) return (error); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) kern_psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(struct thread *td, struct okillpg_args *uap) { ksiginfo_t ksi; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; return (killpg1(td, uap->signum, uap->pgid, 0, &ksi)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sys_sigqueue(struct thread *td, struct sigqueue_args *uap) { union sigval sv; 
sv.sival_ptr = uap->value; return (kern_sigqueue(td, uap->pid, uap->signum, &sv)); } int kern_sigqueue(struct thread *td, pid_t pid, int signumf, union sigval *value) { ksiginfo_t ksi; struct proc *p; struct thread *td2; u_int signum; int error; signum = signumf & ~__SIGQUEUE_TID; if (signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (pid <= 0) return (EINVAL); if ((signumf & __SIGQUEUE_TID) == 0) { if ((p = pfind_any(pid)) == NULL) return (ESRCH); td2 = NULL; } else { p = td->td_proc; td2 = tdfind((lwpid_t)pid, p->p_pid); if (td2 == NULL) return (ESRCH); } error = p_cansignal(td, p, signum); if (error == 0 && signum != 0) { ksiginfo_init(&ksi); ksi.ksi_flags = KSI_SIGQ; ksi.ksi_signo = signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value = *value; error = tdsendsignal(p, td2, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi) { struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && (checkctty == 0 || p->p_flag & P_CONTROLT)) pksignal(p, sig, ksi); PROC_UNLOCK(p); } } } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. Should be called after * mach-specific routine, because sysent->sv_sendsig() needs correct * ps_siginfo and signal mask. */ static void postsig_done(int sig, struct thread *td, struct sigacts *ps) { sigset_t mask; mtx_assert(&ps->ps_mtx, MA_OWNED); td->td_ru.ru_nsignals++; mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(mask, sig); kern_sigprocmask(td, SIG_BLOCK, &mask, NULL, SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED); if (SIGISMEMBER(ps->ps_sigreset, sig)) sigdflt(ps, sig); } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; sigset_t sigmask; int sig; p = td->td_proc; sig = ksi->ksi_signo; KASSERT(_SIG_VALID(sig), ("invalid signal")); sigfastblock_fetch(td); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sigmask = td->td_sigmask; if (td->td_sigblock_val != 0) SIGSETOR(sigmask, fastblock_mask); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(sigmask, sig)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, ksi->ksi_code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. 
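 * With kern_forcesigexit enabled, the signal is unblocked, reset to
 * SIG_DFL and removed from the catch/ignore sets (and any fast-block
 * state is cleared) before being re-sent, so the trap-generated signal
 * terminates the process instead of recurring.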
*/ if (kern_forcesigexit && (SIGISMEMBER(sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; td->td_pflags &= ~TDP_SIGFASTBLOCK; td->td_sigblock_val = 0; } mtx_unlock(&ps->ps_mtx); p->p_sig = sig; /* XXX to verify code */ tdsendsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, bool fast_sigblock) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS(!fast_sigblock || p == curproc); /* * Check if current thread can handle the signal without * switching context to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig) && (!fast_sigblock || curthread->td_sigblock_val == 0)) return (curthread); /* Find a non-stopped thread that does not mask the signal. */ signal_td = NULL; FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig) && (!fast_sigblock || td != curthread || td->td_sigblock_val == 0) && (td->td_flags & TDF_BOUNDARY) == 0) { signal_td = td; break; } } /* Select random (first) thread if no better match was found. */ if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void kern_psignal(struct proc *p, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(p, NULL, sig, &ksi); } int pksignal(struct proc *p, int sig, ksiginfo_t *ksi) { return (tdsendsignal(p, NULL, sig, ksi)); } /* Utility function for finding a thread to send signal event to. 
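 * For SIGEV_THREAD_ID the sigev_notify_thread_id is resolved with
 * tdfind(), failing with ESRCH if no such thread exists in this process;
 * for all other notify types *ttd is set to NULL and the process is
 * locked here.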
*/ int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **ttd) { struct thread *td; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = tdfind(sigev->sigev_notify_thread_id, p->p_pid); if (td == NULL) return (ESRCH); *ttd = td; } else { *ttd = NULL; PROC_LOCK(p); } return (0); } void tdsignal(struct thread *td, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(td->td_proc, td, sig, &ksi); } void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi) { (void) tdsendsignal(td->td_proc, td, sig, ksi); } static int sig_sleepq_abort(struct thread *td, int intrval) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (intrval == 0 && (td->td_flags & TDF_SIGWAIT) == 0) { thread_unlock(td); return (0); } return (sleepq_abort(td, intrval)); } int tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; int wakeup_swapper; MPASS(td == NULL || p == td->td_proc); PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) panic("%s(): invalid signal %d", __func__, sig); KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__)); /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { td = sigtd(p, sig, false); sigqueue = &p->p_sigqueue; } else sigqueue = &td->td_sigqueue; SDT_PROBE3(proc, , , signal__send, td, p, sig); /* * If the signal is being ignored, then we forget about it * immediately, except when the target process executes * sigwait(). (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { if (kern_sig_discard_ign && (p->p_sysent->sv_flags & SV_SIG_DISCIGN) == 0) { SDT_PROBE3(proc, , , signal__discard, td, p, sig); mtx_unlock(&ps->ps_mtx); if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } else { action = SIG_CATCH; intrval = 0; } } else { if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; } mtx_unlock(&ps->ps_mtx); if (prop & SIGPROP_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SIGPROP_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SIGPROP_TTYSTOP) != 0 && (p->p_pgrp->pg_flags & PGRP_ORPHANED) != 0 && action == SIG_DFL) { if (ksi != NULL && (ksi->ksi_flags & KSI_INS) != 0) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. 
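 * (A held SIGCONT-class signal still falls through, so that a process
 * stopped by job control is resumed below.)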
*/ if (action == SIG_HOLD && !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); wakeup_swapper = 0; /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { KASSERT(!(p->p_flag & P_WEXIT), ("signal to stopped but exiting process")); if (sig == SIGKILL) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SIGPROP_CONT) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; PROC_SLOCK(p); if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xsig = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out_cont; } if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ PROC_SUNLOCK(p); goto runfast; } /* * The signal is not ignored or caught. */ thread_unsuspend(p); PROC_SUNLOCK(p); goto out_cont; } if (prop & SIGPROP_STOP) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ PROC_SLOCK(p); thread_lock(td); if (TD_CAN_ABORT(td)) wakeup_swapper = sig_sleepq_abort(td, intrval); else thread_unlock(td); PROC_SUNLOCK(p); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { tdsigwakeup(td, sig, action, intrval); goto out; } MPASS(action == SIG_DFL); if (prop & SIGPROP_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); wakeup_swapper = sig_suspend_threads(td, p); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. 
*/ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xsig); } else PROC_SUNLOCK(p); goto out; } } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: tdsigwakeup(td, sig, action, intrval); PROC_SLOCK(p); thread_unsuspend(p); PROC_SUNLOCK(p); out_cont: itimer_proc_continue(p); kqtimer_proc_continue(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); if (wakeup_swapper) kick_proc0(); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; int prop, wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); PROC_SLOCK(p); thread_lock(td); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. Be careful to avoid bumping the * priority of the idle thread, since we still allow to signal * kernel processes. */ if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) goto out; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SIGPROP_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); return; } /* * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) goto out; /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); wakeup_swapper = sig_sleepq_abort(td, intrval); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); return; } /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. 
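 * On SMP, forward_signal() interrupts the CPU that is currently running
 * the target thread so it notices the pending signal without waiting for
 * its next trip through the kernel.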
*/ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif out: PROC_SUNLOCK(p); thread_unlock(td); } static void ptrace_coredumpreq(struct thread *td, struct proc *p, struct thr_coredump_req *tcq) { void *rl_cookie; if (p->p_sysent->sv_coredump == NULL) { tcq->tc_error = ENOSYS; return; } rl_cookie = vn_rangelock_wlock(tcq->tc_vp, 0, OFF_MAX); tcq->tc_error = p->p_sysent->sv_coredump(td, tcq->tc_vp, tcq->tc_limit, tcq->tc_flags); vn_rangelock_unlock(tcq->tc_vp, rl_cookie); } static void ptrace_syscallreq(struct thread *td, struct proc *p, struct thr_syscall_req *tsr) { struct sysentvec *sv; struct sysent *se; register_t rv_saved[2]; int error, nerror; int sc; bool audited, sy_thr_static; sv = p->p_sysent; if (sv->sv_table == NULL || sv->sv_size < tsr->ts_sa.code) { tsr->ts_ret.sr_error = ENOSYS; return; } sc = tsr->ts_sa.code; if (sc == SYS_syscall || sc == SYS___syscall) { sc = tsr->ts_sa.args[0]; memmove(&tsr->ts_sa.args[0], &tsr->ts_sa.args[1], sizeof(register_t) * (tsr->ts_nargs - 1)); } tsr->ts_sa.callp = se = &sv->sv_table[sc]; VM_CNT_INC(v_syscall); td->td_pticks = 0; if (__predict_false(td->td_cowgen != atomic_load_int( &td->td_proc->p_cowgen))) thread_cow_update(td); td->td_sa = tsr->ts_sa; #ifdef CAPABILITY_MODE if ((se->sy_flags & SYF_CAPENABLED) == 0) { if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_SYSCALL, NULL); if (IN_CAPABILITY_MODE(td)) { tsr->ts_ret.sr_error = ECAPMODE; return; } } #endif sy_thr_static = (se->sy_thrcnt & SY_THR_STATIC) != 0; audited = AUDIT_SYSCALL_ENTER(sc, td) != 0; if (!sy_thr_static) { error = syscall_thread_enter(td, &se); sy_thr_static = (se->sy_thrcnt & SY_THR_STATIC) != 0; if (error != 0) { tsr->ts_ret.sr_error = error; return; } } rv_saved[0] = td->td_retval[0]; rv_saved[1] = td->td_retval[1]; nerror = td->td_errno; td->td_retval[0] = 0; td->td_retval[1] = 0; #ifdef KDTRACE_HOOKS if (se->sy_entry != 0) (*systrace_probe_func)(&tsr->ts_sa, SYSTRACE_ENTRY, 0); #endif tsr->ts_ret.sr_error = se->sy_call(td, tsr->ts_sa.args); #ifdef KDTRACE_HOOKS if (se->sy_return != 0) (*systrace_probe_func)(&tsr->ts_sa, SYSTRACE_RETURN, tsr->ts_ret.sr_error != 0 ? 
-1 : td->td_retval[0]); #endif tsr->ts_ret.sr_retval[0] = td->td_retval[0]; tsr->ts_ret.sr_retval[1] = td->td_retval[1]; td->td_retval[0] = rv_saved[0]; td->td_retval[1] = rv_saved[1]; td->td_errno = nerror; if (audited) AUDIT_SYSCALL_EXIT(error, td); if (!sy_thr_static) syscall_thread_exit(td, se); } static void ptrace_remotereq(struct thread *td, int flag) { struct proc *p; MPASS(td == curthread); p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if ((td->td_dbgflags & flag) == 0) return; KASSERT((p->p_flag & P_STOPPED_TRACE) != 0, ("not stopped")); KASSERT(td->td_remotereq != NULL, ("td_remotereq is NULL")); PROC_UNLOCK(p); switch (flag) { case TDB_COREDUMPREQ: ptrace_coredumpreq(td, p, td->td_remotereq); break; case TDB_SCREMOTEREQ: ptrace_syscallreq(td, p, td->td_remotereq); break; default: __unreachable(); } PROC_LOCK(p); MPASS((td->td_dbgflags & flag) != 0); td->td_dbgflags &= ~flag; td->td_remotereq = NULL; wakeup(p); } static int sig_suspend_threads(struct thread *td, struct proc *p) { struct thread *td2; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); ast_sched_locked(td2, TDA_SUSPEND); if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_SBDRY) { /* * Once a thread is asleep with * TDF_SBDRY and without TDF_SERESTART * or TDF_SEINTR set, it should never * become suspended due to this check. */ KASSERT(!TD_IS_SUSPENDED(td2), ("thread with deferred stops suspended")); if (TD_SBDRY_INTR(td2)) { wakeup_swapper |= sleepq_abort(td2, TD_SBDRY_ERRNO(td2)); continue; } } else if (!TD_IS_SUSPENDED(td2)) thread_suspend_one(td2); } else if (!TD_IS_SUSPENDED(td2)) { #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } return (wakeup_swapper); } /* * Stop the process for an event deemed interesting to the debugger. If si is * non-NULL, this is a signal exchange; the new signal requested by the * debugger will be returned for handling. If si is NULL, this is some other * type of interesting event. The debugger may request a signal be delivered in * that case as well, however it will be deferred until it can be handled. */ int ptracestop(struct thread *td, int sig, ksiginfo_t *si) { struct proc *p = td->td_proc; struct thread *td2; ksiginfo_t ksi; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); td->td_xsig = sig; if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) { td->td_dbgflags |= TDB_XSIG; CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d", td->td_tid, p->p_pid, td->td_dbgflags, sig); PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) { if (P_KILLED(p)) { /* * Ensure that, if we've been PT_KILLed, the * exit status reflects that. Another thread * may also be in ptracestop(), having just * received the SIGKILL, but this thread was * unsuspended first. */ td->td_dbgflags &= ~TDB_XSIG; td->td_xsig = SIGKILL; p->p_ptevents = 0; break; } if (p->p_flag & P_SINGLE_EXIT && !(td->td_dbgflags & TDB_EXIT)) { /* * Ignore ptrace stops except for thread exit * events when the process exits. */ td->td_dbgflags &= ~TDB_XSIG; PROC_SUNLOCK(p); return (0); } /* * Make wait(2) work. Ensure that right after the * attach, the thread which was decided to become the * leader of attach gets reported to the waiter. 
* Otherwise, just avoid overwriting another thread's * assignment to p_xthread. If another thread has * already set p_xthread, the current thread will get * a chance to report itself upon the next iteration. */ if ((td->td_dbgflags & TDB_FSTP) != 0 || ((p->p_flag2 & P2_PTRACE_FSTP) == 0 && p->p_xthread == NULL)) { p->p_xsig = sig; p->p_xthread = td; /* * If we are on sleepqueue already, * let sleepqueue code decide if it * needs to go sleep after attach. */ if (td->td_wchan == NULL) td->td_dbgflags &= ~TDB_FSTP; p->p_flag2 &= ~P2_PTRACE_FSTP; p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE; sig_suspend_threads(td, p); } if ((td->td_dbgflags & TDB_STOPATFORK) != 0) { td->td_dbgflags &= ~TDB_STOPATFORK; } stopme: td->td_dbgflags |= TDB_SSWITCH; thread_suspend_switch(td, p); td->td_dbgflags &= ~TDB_SSWITCH; if ((td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)) != 0) { MPASS((td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)) != (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)); PROC_SUNLOCK(p); ptrace_remotereq(td, td->td_dbgflags & (TDB_COREDUMPREQ | TDB_SCREMOTEREQ)); PROC_SLOCK(p); goto stopme; } if (p->p_xthread == td) p->p_xthread = NULL; if (!(p->p_flag & P_TRACED)) break; if (td->td_dbgflags & TDB_SUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); } if (si != NULL && sig == td->td_xsig) { /* Parent wants us to take the original signal unchanged. */ si->ksi_flags |= KSI_HEAD; if (sigqueue_add(&td->td_sigqueue, sig, si) != 0) si->ksi_signo = 0; } else if (td->td_xsig != 0) { /* * If parent wants us to take a new signal, then it will leave * it in td->td_xsig; otherwise we just look for signals again. */ ksiginfo_init(&ksi); ksi.ksi_signo = td->td_xsig; ksi.ksi_flags |= KSI_PTRACE; td2 = sigtd(p, td->td_xsig, false); tdsendsignal(p, td2, td->td_xsig, &ksi); if (td != td2) return (0); } return (td->td_xsig); } static void reschedule_signals(struct proc *p, sigset_t block, int flags) { struct sigacts *ps; struct thread *td; int sig; bool fastblk, pslocked; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; pslocked = (flags & SIGPROCMASK_PS_LOCKED) != 0; mtx_assert(&ps->ps_mtx, pslocked ? MA_OWNED : MA_NOTOWNED); if (SIGISEMPTY(p->p_siglist)) return; SIGSETAND(block, p->p_siglist); fastblk = (flags & SIGPROCMASK_FASTBLK) != 0; SIG_FOREACH(sig, &block) { td = sigtd(p, sig, fastblk); /* * If sigtd() selected us despite sigfastblock is * blocking, do not activate AST or wake us, to avoid * loop in AST handler. */ if (fastblk && td == curthread) continue; signotify(td); if (!pslocked) mtx_lock(&ps->ps_mtx); if (p->p_flag & P_TRACED || (SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig))) { tdsigwakeup(td, sig, SIG_CATCH, (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART)); } if (!pslocked) mtx_unlock(&ps->ps_mtx); } } void tdsigcleanup(struct thread *td) { struct proc *p; sigset_t unblocked; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_flush(&td->td_sigqueue); if (p->p_numthreads == 1) return; /* * Since we cannot handle signals, notify signal post code * about this by filling the sigmask. * * Also, if needed, wake up thread(s) that do not block the * same signals as the exiting thread, since the thread might * have been selected for delivery and woken up. 
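 * The set passed to reschedule_signals() is exactly the set of signals
 * the exiting thread used to accept, i.e. the complement of its old
 * signal mask.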
*/ SIGFILLSET(unblocked); SIGSETNAND(unblocked, td->td_sigmask); SIGFILLSET(td->td_sigmask); reschedule_signals(p, unblocked, 0); } static int sigdeferstop_curr_flags(int cflags) { MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 || (cflags & TDF_SBDRY) != 0); return (cflags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)); } /* * Defer the delivery of SIGSTOP for the current thread, according to * the requested mode. Returns previous flags, which must be restored * by sigallowstop(). * * TDF_SBDRY, TDF_SEINTR, and TDF_SERESTART flags are only set and * cleared by the current thread, which allow the lock-less read-only * accesses below. */ int sigdeferstop_impl(int mode) { struct thread *td; int cflags, nflags; td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); switch (mode) { case SIGDEFERSTOP_NOP: nflags = cflags; break; case SIGDEFERSTOP_OFF: nflags = 0; break; case SIGDEFERSTOP_SILENT: nflags = (cflags | TDF_SBDRY) & ~(TDF_SEINTR | TDF_SERESTART); break; case SIGDEFERSTOP_EINTR: nflags = (cflags | TDF_SBDRY | TDF_SEINTR) & ~TDF_SERESTART; break; case SIGDEFERSTOP_ERESTART: nflags = (cflags | TDF_SBDRY | TDF_SERESTART) & ~TDF_SEINTR; break; default: panic("sigdeferstop: invalid mode %x", mode); break; } if (cflags == nflags) return (SIGDEFERSTOP_VAL_NCHG); thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | nflags; thread_unlock(td); return (cflags); } /* * Restores the STOP handling mode, typically permitting the delivery * of SIGSTOP for the current thread. This does not immediately * suspend if a stop was posted. Instead, the thread will suspend * either via ast() or a subsequent interruptible sleep. */ void sigallowstop_impl(int prev) { struct thread *td; int cflags; KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop")); KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("sigallowstop: incorrect previous mode %x", prev)); td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); if (cflags != prev) { thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | prev; thread_unlock(td); } } enum sigstatus { SIGSTATUS_HANDLE, SIGSTATUS_HANDLED, SIGSTATUS_IGNORE, SIGSTATUS_SBDRY_STOP, }; /* * The thread has signal "sig" pending. Figure out what to do with it: * * _HANDLE -> the caller should handle the signal * _HANDLED -> handled internally, reload pending signal set * _IGNORE -> ignored, remove from the set of pending signals and try the * next pending signal * _SBDRY_STOP -> the signal should stop the thread but this is not * permitted in the current context */ static enum sigstatus sigprocess(struct thread *td, int sig) { struct proc *p; struct sigacts *ps; struct sigqueue *queue; ksiginfo_t ksi; int prop; KASSERT(_SIG_VALID(sig), ("%s: invalid signal %d", __func__, sig)); p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); /* * We should allow pending but ignored signals below * if there is sigwait() active, or P_TRACED was * on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (p->p_flag & P_TRACED) == 0 && (td->td_flags & TDF_SIGWAIT) == 0) { return (SIGSTATUS_IGNORE); } /* * If the process is going to single-thread mode to prepare * for exit, there is no sense in delivering any signal * to usermode. Another important consequence is that * msleep(..., PCATCH, ...) now is only interruptible by a * suspend request. */ if ((p->p_flag2 & P2_WEXIT) != 0) return (SIGSTATUS_IGNORE); if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) { /* * If traced, always stop. 
* Remove old signal from queue before the stop. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ queue = &td->td_sigqueue; ksiginfo_init(&ksi); if (sigqueue_get(queue, sig, &ksi) == 0) { queue = &p->p_sigqueue; sigqueue_get(queue, sig, &ksi); } td->td_si = ksi.ksi_info; mtx_unlock(&ps->ps_mtx); sig = ptracestop(td, sig, &ksi); mtx_lock(&ps->ps_mtx); td->td_si.si_signo = 0; /* * Keep looking if the debugger discarded or * replaced the signal. */ if (sig == 0) return (SIGSTATUS_HANDLED); /* * If the signal became masked, re-queue it. */ if (SIGISMEMBER(td->td_sigmask, sig)) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(&p->p_sigqueue, sig, &ksi); return (SIGSTATUS_HANDLED); } /* * If the traced bit got turned off, requeue the signal and * reload the set of pending signals. This ensures that p_sig* * and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) { if ((ksi.ksi_flags & KSI_PTRACE) == 0) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(queue, sig, &ksi); } return (SIGSTATUS_HANDLED); } } /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif return (SIGSTATUS_IGNORE); } /* * If there is a pending stop signal to process with * default action, stop here, then clear the signal. * Traced or exiting processes should ignore stops. * Additionally, a member of an orphaned process group * should ignore tty stops. */ prop = sigprop(sig); if (prop & SIGPROP_STOP) { mtx_unlock(&ps->ps_mtx); if ((p->p_flag & (P_TRACED | P_WEXIT | P_SINGLE_EXIT)) != 0 || ((p->p_pgrp-> pg_flags & PGRP_ORPHANED) != 0 && (prop & SIGPROP_TTYSTOP) != 0)) { mtx_lock(&ps->ps_mtx); return (SIGSTATUS_IGNORE); } if (TD_SBDRY_INTR(td)) { KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); mtx_lock(&ps->ps_mtx); return (SIGSTATUS_SBDRY_STOP); } WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p); thread_suspend_switch(td, p); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); return (SIGSTATUS_HANDLED); } else if ((prop & SIGPROP_IGNORE) != 0 && (td->td_flags & TDF_SIGWAIT) == 0) { /* * Default action is to ignore; drop it if * not in kern_sigtimedwait(). */ return (SIGSTATUS_IGNORE); } else { return (SIGSTATUS_HANDLE); } case (intptr_t)SIG_IGN: if ((td->td_flags & TDF_SIGWAIT) == 0) return (SIGSTATUS_IGNORE); else return (SIGSTATUS_HANDLE); default: /* * This signal has an action, let postsig() process it. */ return (SIGSTATUS_HANDLE); } } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling * issignal by checking the pending signal masks in cursig.) 
The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(struct thread *td) { struct proc *p; sigset_t sigpending; int sig; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { sigpending = td->td_sigqueue.sq_signals; SIGSETOR(sigpending, p->p_sigqueue.sq_signals); SIGSETNAND(sigpending, td->td_sigmask); if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); /* * Do fast sigblock if requested by usermode. Since * we do know that there was a signal pending at this * point, set the FAST_SIGBLOCK_PEND as indicator for * usermode to perform a dummy call to * FAST_SIGBLOCK_UNBLOCK, which causes immediate * delivery of postponed pending signal. */ if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { if (td->td_sigblock_val != 0) SIGSETNAND(sigpending, fastblock_mask); if (SIGISEMPTY(sigpending)) { td->td_pflags |= TDP_SIGFASTPENDING; return (0); } } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED && (p->p_flag2 & P2_PTRACE_FSTP) != 0 && SIGISMEMBER(sigpending, SIGSTOP)) { /* * If debugger just attached, always consume * SIGSTOP from ptrace(PT_ATTACH) first, to * execute the debugger attach ritual in * order. */ td->td_dbgflags |= TDB_FSTP; SIGEMPTYSET(sigpending); SIGADDSET(sigpending, SIGSTOP); } SIG_FOREACH(sig, &sigpending) { switch (sigprocess(td, sig)) { case SIGSTATUS_HANDLE: return (sig); case SIGSTATUS_HANDLED: goto next; case SIGSTATUS_IGNORE: sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); break; case SIGSTATUS_SBDRY_STOP: return (-1); } } next:; } } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ int postsig(int sig) { struct thread *td; struct proc *p; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; KASSERT(sig != 0, ("postsig")); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 && sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0) return (0); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code); #endif if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); proc_td_siginfo_capture(td, &ksi.ksi_info); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN, ("postsig action %p", action)); KASSERT(!SIGISMEMBER(td->td_sigmask, sig), ("postsig action: blocked sig %d", sig)); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. 
Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; if (p->p_sig == sig) { p->p_sig = 0; } (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } return (1); } int sig_ast_checksusp(struct thread *td) { struct proc *p __diagused; int ret; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (!td_ast_pending(td, TDA_SUSPEND)) return (0); ret = thread_suspend_check(1); MPASS(ret == 0 || ret == EINTR || ret == ERESTART); return (ret); } int sig_ast_needsigchk(struct thread *td) { struct proc *p; struct sigacts *ps; int ret, sig; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); if (!td_ast_pending(td, TDA_SIG)) return (0); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sig = cursig(td); if (sig == -1) { mtx_unlock(&ps->ps_mtx); KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); KASSERT(TD_SBDRY_INTR(td), ("lost TDF_SERESTART of TDF_SEINTR")); KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) != (TDF_SEINTR | TDF_SERESTART), ("both TDF_SEINTR and TDF_SERESTART")); ret = TD_SBDRY_ERRNO(td); } else if (sig != 0) { ret = SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART; mtx_unlock(&ps->ps_mtx); } else { mtx_unlock(&ps->ps_mtx); ret = 0; } /* * Do not go into sleep if this thread was the ptrace(2) * attach leader. cursig() consumed SIGSTOP from PT_ATTACH, * but we usually act on the signal by interrupting sleep, and * should do that here as well. */ if ((td->td_dbgflags & TDB_FSTP) != 0) { if (ret == 0) ret = EINTR; td->td_dbgflags &= ~TDB_FSTP; } return (ret); } int sig_intr(void) { struct thread *td; struct proc *p; int ret; td = curthread; if (!td_ast_pending(td, TDA_SIG) && !td_ast_pending(td, TDA_SUSPEND)) return (0); p = td->td_proc; PROC_LOCK(p); ret = sig_ast_checksusp(td); if (ret == 0) ret = sig_ast_needsigchk(td); PROC_UNLOCK(p); return (ret); } bool curproc_sigkilled(void) { struct thread *td; struct proc *p; struct sigacts *ps; bool res; td = curthread; if (!td_ast_pending(td, TDA_SIG)) return (false); p = td->td_proc; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); res = SIGISMEMBER(td->td_sigqueue.sq_signals, SIGKILL) || SIGISMEMBER(p->p_sigqueue.sq_signals, SIGKILL); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (res); } void proc_wkilled(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_WKILLED) == 0) { p->p_flag |= P_WKILLED; /* * Notify swapper that there is a process to swap in. * The notification is racy, at worst it would take 10 * seconds for the swapper process to notice. */ if ((p->p_flag & (P_INMEM | P_SWAPPINGIN)) == 0) wakeup(&proc0); } } /* * Kill the current process for stated reason. */ void killproc(struct proc *p, const char *why) { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), jid %d, uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, p->p_ucred->cr_uid, why); proc_wkilled(p); kern_psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. 
* If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(struct thread *td, int sig) { struct proc *p = td->td_proc; const char *coreinfo; int rv; PROC_LOCK_ASSERT(p, MA_OWNED); proc_set_p2_wexit(p); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SIGPROP_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ rv = coredump(td); switch (rv) { case 0: sig |= WCOREFLAG; coreinfo = " (core dumped)"; break; case EFAULT: coreinfo = " (no core dump - bad address)"; break; case EINVAL: coreinfo = " (no core dump - invalid argument)"; break; case EFBIG: coreinfo = " (no core dump - too large)"; break; default: coreinfo = " (no core dump - other error)"; break; } if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), jid %d, uid %d: exited on " "signal %d%s\n", p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id, td->td_ucred->cr_uid, sig &~ WCOREFLAG, coreinfo); } else PROC_UNLOCK(p); exit1(td, 0, sig); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } pksignal(p->p_pptr, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int sig) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, sig); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xsig); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason, status; if (WCOREDUMP(p->p_xsig)) { reason = CLD_DUMPED; status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { reason = CLD_KILLED; status = WTERMSIG(p->p_xsig); } else { reason = CLD_EXITED; status = p->p_xexit; } /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). 
*/ sigparent(p, reason, status); } #define MAX_NUM_CORE_FILES 100000 #ifndef NUM_CORE_FILES #define NUM_CORE_FILES 5 #endif CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES); static int num_cores = NUM_CORE_FILES; static int sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) { int error; int new_val; new_val = num_cores; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val > MAX_NUM_CORE_FILES) new_val = MAX_NUM_CORE_FILES; if (new_val < 0) new_val = 0; num_cores = new_val; return (0); } SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_debug_num_cores_check, "I", "Maximum number of generated process corefiles while using index format"); #define GZIP_SUFFIX ".gz" #define ZSTD_SUFFIX ".zst" int compress_user_cores = 0; static int sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) { int error, val; val = compress_user_cores; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && !compressor_avail(val)) return (EINVAL); compress_user_cores = val; return (error); } SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_compress_user_cores, "I", "Enable compression of user corefiles (" __XSTRING(COMPRESS_GZIP) " = gzip, " __XSTRING(COMPRESS_ZSTD) " = zstd)"); int compress_user_cores_level = 6; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, &compress_user_cores_level, 0, "Corefile compression level"); /* * Protect the access to corefilename[] by allproc_lock. */ #define corefilename_lock allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); static int sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) { int error; sx_xlock(&corefilename_lock); error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), req); sx_xunlock(&corefilename_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", "Process corefile name format string"); static void vnode_close_locked(struct thread *td, struct vnode *vp) { VOP_UNLOCK(vp); vn_close(vp, FWRITE, td->td_ucred, td); } /* * If the core format has a %I in it, then we need to check * for existing corefiles before defining a name. * To do this we iterate over 0..ncores to find a * non-existing core file name to use. If all core files are * already used we choose the oldest one. */ static int corefile_open_last(struct thread *td, char *name, int indexpos, int indexlen, int ncores, struct vnode **vpp) { struct vnode *oldvp, *nextvp, *vp; struct vattr vattr; struct nameidata nd; int error, i, flags, oflags, cmode; char ch; struct timespec lasttime; nextvp = oldvp = NULL; cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? 
VN_OPEN_NOCAPCHECK : 0); for (i = 0; i < ncores; i++) { flags = O_CREAT | FWRITE | O_NOFOLLOW; ch = name[indexpos + indexlen]; (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen, i); name[indexpos + indexlen] = ch; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error != 0) break; vp = nd.ni_vp; NDFREE_PNBUF(&nd); if ((flags & O_CREAT) == O_CREAT) { nextvp = vp; break; } error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) { vnode_close_locked(td, vp); break; } if (oldvp == NULL || lasttime.tv_sec > vattr.va_mtime.tv_sec || (lasttime.tv_sec == vattr.va_mtime.tv_sec && lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) { if (oldvp != NULL) vn_close(oldvp, FWRITE, td->td_ucred, td); oldvp = vp; VOP_UNLOCK(oldvp); lasttime = vattr.va_mtime; } else { vnode_close_locked(td, vp); } } if (oldvp != NULL) { if (nextvp == NULL) { if ((td->td_proc->p_flag & P_SUGID) != 0) { error = EFAULT; vn_close(oldvp, FWRITE, td->td_ucred, td); } else { nextvp = oldvp; error = vn_lock(nextvp, LK_EXCLUSIVE); if (error != 0) { vn_close(nextvp, FWRITE, td->td_ucred, td); nextvp = NULL; } } } else { vn_close(oldvp, FWRITE, td->td_ucred, td); } } if (error != 0) { if (nextvp != NULL) vnode_close_locked(td, oldvp); } else { *vpp = nextvp; } return (error); } /* * corefile_open(comm, uid, pid, td, compress, vpp, namep) * Expand the name described in corefilename, using name, uid, and pid * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). 
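 *
 * As a purely illustrative example (hypothetical values): with kern.corefile
 * set to "/var/coredumps/%U/%N.%P.core", a crash of pid 1234 of a process
 * named "mydaemon" running as uid 1001 would be written to
 * "/var/coredumps/1001/mydaemon.1234.core".  The expansion code below also
 * accepts %H (hostname), %I (auto-incrementing index, bounded by the
 * debug.ncores sysctl), %S (signal number) and %% (a literal '%').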
*/ static int corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, int compress, int signum, struct vnode **vpp, char **namep) { struct sbuf sb; struct nameidata nd; const char *format; char *hostname, *name; int cmode, error, flags, i, indexpos, indexlen, oflags, ncores; hostname = NULL; format = corefilename; name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexlen = 0; indexpos = -1; ncores = num_cores; (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); sx_slock(&corefilename_lock); for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': sbuf_putc(&sb, '%'); break; case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, M_TEMP, M_WAITOK); } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); sbuf_printf(&sb, "%s", hostname); break; case 'I': /* autoincrementing index */ if (indexpos != -1) { sbuf_printf(&sb, "%%I"); break; } indexpos = sbuf_len(&sb); sbuf_printf(&sb, "%u", ncores - 1); indexlen = sbuf_len(&sb) - indexpos; break; case 'N': /* process name */ sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); break; case 'S': /* signal number */ sbuf_printf(&sb, "%i", signum); break; case 'U': /* user id */ sbuf_printf(&sb, "%u", uid); break; default: log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); break; } break; default: sbuf_putc(&sb, format[i]); break; } } sx_sunlock(&corefilename_lock); free(hostname, M_TEMP); if (compress == COMPRESS_GZIP) sbuf_printf(&sb, GZIP_SUFFIX); else if (compress == COMPRESS_ZSTD) sbuf_printf(&sb, ZSTD_SUFFIX); if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); free(name, M_TEMP); return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); if (indexpos != -1) { error = corefile_open_last(td, name, indexpos, indexlen, ncores, vpp); if (error != 0) { log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " "on initial open test, error = %d\n", pid, comm, uid, name, error); } } else { cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); flags = O_CREAT | FWRITE | O_NOFOLLOW; if ((td->td_proc->p_flag & P_SUGID) != 0) flags |= O_EXCL; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error == 0) { *vpp = nd.ni_vp; NDFREE_PNBUF(&nd); } } if (error != 0) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } *namep = name; return (0); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. 
*/ static int coredump(struct thread *td) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vnode *vp; struct flock lf; struct vattr vattr; size_t fullpathsize; int error, error1, locked; char *name; /* name of corefile */ void *rl_cookie; off_t limit; char *fullpath, *freepath = NULL; struct sbuf *sb; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || (p->p_flag2 & P2_NOTRACE) != 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(td, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress_user_cores, p->p_sig, &vp, &name); if (error != 0) return (error); /* * Don't dump to non-regular files or files with links. * Do not dump into system files. Effective user must own the corefile. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 || vattr.va_uid != cred->cr_uid) { VOP_UNLOCK(vp); error = EFAULT; goto out; } VOP_UNLOCK(vp); /* Postpone other writers, including core dumps of other processes. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); if (p->p_sysent->sv_coredump != NULL) { error = p->p_sysent->sv_coredump(td, vp, limit, 0); } else { error = ENOSYS; } if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } vn_rangelock_unlock(vp, rl_cookie); /* * Notify the userland helper that a process triggered a core dump. * This allows the helper to run an automated debugging session. */ if (error != 0 || coredump_devctl == 0) goto out; sb = sbuf_new_auto(); if (vn_fullpath_global(p->p_textvp, &fullpath, &freepath) != 0) goto out2; sbuf_printf(sb, "comm=\""); devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_printf(sb, "\" core=\""); /* * We can't lookup core file vp directly. When we're replacing a core, and * other random times, we flush the name cache, so it will fail. Instead, * if the path of the core is relative, add the current dir in front of it.
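 *
 * A sketch of the resulting notification (paths are hypothetical): it is
 * posted on the devctl system "kernel", subsystem "signal", type "coredump",
 * with data resembling
 *	comm="/usr/local/bin/mydaemon" core="/var/coredumps/mydaemon.1234.core"
 * which a devd(8) rule can match to kick off automated post-mortem handling.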
*/ if (name[0] != '/') { fullpathsize = MAXPATHLEN; freepath = malloc(fullpathsize, M_TEMP, M_WAITOK); if (vn_getcwd(freepath, &fullpath, &fullpathsize) != 0) { free(freepath, M_TEMP); goto out2; } devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_putc(sb, '/'); } devctl_safe_quote_sb(sb, name); sbuf_printf(sb, "\""); if (sbuf_finish(sb) == 0) devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); out2: sbuf_delete(sb); out: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(struct thread *td, struct nosys_args *args) { struct proc *p; p = td->td_proc; if (SV_PROC_FLAG(p, SV_SIGSYS) != 0 && kern_signosys) { PROC_LOCK(p); tdsignal(td, SIGSYS); PROC_UNLOCK(p); } if (kern_lognosys == 1 || kern_lognosys == 3) { uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } if (kern_lognosys == 2 || kern_lognosys == 3 || (p->p_pid == 1 && (kern_lognosys & 3) == 0)) { printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(struct sigio **sigiop, int sig, int checkctty) { ksiginfo_t ksi; struct sigio *sigio; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) kern_psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) kern_psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { knlist_remove(kn->kn_knlist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. 
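 *
 * For reference, a minimal userland sketch of a consumer of this filter
 * (hypothetical example; error handling and the <sys/event.h>/<signal.h>
 * includes are omitted) that counts SIGUSR1 deliveries through kqueue:
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	signal(SIGUSR1, SIG_IGN);
 *	EV_SET(&kev, SIGUSR1, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *	kevent(kq, NULL, 0, &kev, 1, NULL);
 *
 * After the second kevent() returns, kev.data holds the number of times the
 * signal was delivered since the last retrieval, matching the kn_data
 * accounting done in filt_signal() below.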
*/ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); refcount_init(&ps->ps_refcnt, 1); mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { if (refcount_release(&ps->ps_refcnt) == 0) return; mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } struct sigacts * sigacts_hold(struct sigacts *ps) { refcount_acquire(&ps->ps_refcnt); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { return (ps->ps_refcnt > 1); } void sig_drop_caught(struct proc *p) { int sig; struct sigacts *ps; ps = p->p_sigacts; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&ps->ps_mtx, MA_OWNED); SIG_FOREACH(sig, &ps->ps_sigcatch) { sigdflt(ps, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0) sigqueue_delete_proc(p, sig); } } static void sigfastblock_failed(struct thread *td, bool sendsig, bool write) { ksiginfo_t ksi; /* * Prevent further fetches and SIGSEGVs, allowing thread to * issue syscalls despite corruption. */ sigfastblock_clear(td); if (!sendsig) return; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGSEGV; ksi.ksi_code = write ? SEGV_ACCERR : SEGV_MAPERR; ksi.ksi_addr = td->td_sigblock_ptr; trapsignal(td, &ksi); } static bool sigfastblock_fetch_sig(struct thread *td, bool sendsig, uint32_t *valp) { uint32_t res; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return (true); if (fueword32((void *)td->td_sigblock_ptr, &res) == -1) { sigfastblock_failed(td, sendsig, false); return (false); } *valp = res; td->td_sigblock_val = res & ~SIGFASTBLOCK_FLAGS; return (true); } static void sigfastblock_resched(struct thread *td, bool resched) { struct proc *p; if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, td->td_sigmask, 0); PROC_UNLOCK(p); } ast_sched(td, TDA_SIG); } int sys_sigfastblock(struct thread *td, struct sigfastblock_args *uap) { struct proc *p; int error, res; uint32_t oldval; error = 0; p = td->td_proc; switch (uap->cmd) { case SIGFASTBLOCK_SETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) != 0) { error = EBUSY; break; } if (((uintptr_t)(uap->ptr) & (sizeof(uint32_t) - 1)) != 0) { error = EINVAL; break; } td->td_pflags |= TDP_SIGFASTBLOCK; td->td_sigblock_ptr = uap->ptr; break; case SIGFASTBLOCK_UNBLOCK: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } for (;;) { res = casueword32(td->td_sigblock_ptr, SIGFASTBLOCK_PEND, &oldval, 0); if (res == -1) { error = EFAULT; sigfastblock_failed(td, false, true); break; } if (res == 0) break; MPASS(res == 1); if (oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } error = thread_check_susp(td, false); if (error != 0) break; } if (error != 0) break; /* * td_sigblock_val is cleared there, but not on a * syscall exit. The end effect is that a single * interruptible sleep, while user sigblock word is * set, might return EINTR or ERESTART to usermode * without delivering signal. All further sleeps, * until userspace clears the word and does * sigfastblock(UNBLOCK), observe current word and no * longer get interrupted. It is slight * non-conformance, with alternative to have read the * sigblock word on each syscall entry. 
*/ td->td_sigblock_val = 0; /* * Rely on normal ast mechanism to deliver pending * signals to current thread. But notify others about * fake unblock. */ sigfastblock_resched(td, error == 0 && p->p_numthreads != 1); break; case SIGFASTBLOCK_UNSETPTR: if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) { error = EINVAL; break; } if (!sigfastblock_fetch_sig(td, false, &oldval)) { error = EFAULT; break; } if (oldval != 0 && oldval != SIGFASTBLOCK_PEND) { error = EBUSY; break; } sigfastblock_clear(td); break; default: error = EINVAL; break; } return (error); } void sigfastblock_clear(struct thread *td) { bool resched; if ((td->td_pflags & TDP_SIGFASTBLOCK) == 0) return; td->td_sigblock_val = 0; resched = (td->td_pflags & TDP_SIGFASTPENDING) != 0 || SIGPENDING(td); td->td_pflags &= ~(TDP_SIGFASTBLOCK | TDP_SIGFASTPENDING); sigfastblock_resched(td, resched); } void sigfastblock_fetch(struct thread *td) { uint32_t val; (void)sigfastblock_fetch_sig(td, true, &val); } static void sigfastblock_setpend1(struct thread *td) { int res; uint32_t oldval; if ((td->td_pflags & TDP_SIGFASTPENDING) == 0) return; res = fueword32((void *)td->td_sigblock_ptr, &oldval); if (res == -1) { sigfastblock_failed(td, true, false); return; } for (;;) { res = casueword32(td->td_sigblock_ptr, oldval, &oldval, oldval | SIGFASTBLOCK_PEND); if (res == -1) { sigfastblock_failed(td, true, true); return; } if (res == 0) { td->td_sigblock_val = oldval & ~SIGFASTBLOCK_FLAGS; td->td_pflags &= ~TDP_SIGFASTPENDING; break; } MPASS(res == 1); if (thread_check_susp(td, false) != 0) break; } } static void sigfastblock_setpend(struct thread *td, bool resched) { struct proc *p; sigfastblock_setpend1(td); if (resched) { p = td->td_proc; PROC_LOCK(p); reschedule_signals(p, fastblock_mask, SIGPROCMASK_FASTBLK); PROC_UNLOCK(p); } } diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c index bb78e4a35451..5a6ebacb780c 100644 --- a/sys/kern/subr_log.c +++ b/sys/kern/subr_log.c @@ -1,309 +1,309 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)subr_log.c 8.1 (Berkeley) 6/10/93 */ /* * Error log buffer for kernel printf's. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define LOG_RDPRI (PZERO + 1) #define LOG_ASYNC 0x04 static d_open_t logopen; static d_close_t logclose; static d_read_t logread; static d_ioctl_t logioctl; static d_poll_t logpoll; static d_kqfilter_t logkqfilter; static void logtimeout(void *arg); static struct cdevsw log_cdevsw = { .d_version = D_VERSION, .d_open = logopen, .d_close = logclose, .d_read = logread, .d_ioctl = logioctl, .d_poll = logpoll, .d_kqfilter = logkqfilter, .d_name = "log", }; static int logkqread(struct knote *note, long hint); static void logkqdetach(struct knote *note); -static struct filterops log_read_filterops = { +static const struct filterops log_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = logkqdetach, .f_event = logkqread, }; static struct logsoftc { int sc_state; /* see above for possibilities */ struct selinfo sc_selp; /* process waiting on select call */ struct sigio *sc_sigio; /* information for async I/O */ struct callout sc_callout; /* callout to wakeup syslog */ } logsoftc; int log_open; /* also used in log() */ static struct cv log_wakeup; struct mtx msgbuf_lock; MTX_SYSINIT(msgbuf_lock, &msgbuf_lock, "msgbuf lock", MTX_DEF); static int log_wakeups_per_second = 5; SYSCTL_INT(_kern, OID_AUTO, log_wakeups_per_second, CTLFLAG_RW, &log_wakeups_per_second, 0, "How often (times per second) to check for /dev/log waiters."); /*ARGSUSED*/ static int logopen(struct cdev *dev, int flags, int mode, struct thread *td) { if (log_wakeups_per_second < 1) { printf("syslog wakeup is less than one. 
Adjusting to 1.\n"); log_wakeups_per_second = 1; } mtx_lock(&msgbuf_lock); if (log_open) { mtx_unlock(&msgbuf_lock); return (EBUSY); } log_open = 1; callout_reset_sbt(&logsoftc.sc_callout, SBT_1S / log_wakeups_per_second, 0, logtimeout, NULL, C_PREL(1)); mtx_unlock(&msgbuf_lock); fsetown(td->td_proc->p_pid, &logsoftc.sc_sigio); /* signal process only */ return (0); } /*ARGSUSED*/ static int logclose(struct cdev *dev, int flag, int mode, struct thread *td) { funsetown(&logsoftc.sc_sigio); mtx_lock(&msgbuf_lock); callout_stop(&logsoftc.sc_callout); logsoftc.sc_state = 0; log_open = 0; mtx_unlock(&msgbuf_lock); return (0); } /*ARGSUSED*/ static int logread(struct cdev *dev, struct uio *uio, int flag) { char buf[128]; struct msgbuf *mbp = msgbufp; int error = 0, l; mtx_lock(&msgbuf_lock); while (msgbuf_getcount(mbp) == 0) { if (flag & IO_NDELAY) { mtx_unlock(&msgbuf_lock); return (EWOULDBLOCK); } if ((error = cv_wait_sig(&log_wakeup, &msgbuf_lock)) != 0) { mtx_unlock(&msgbuf_lock); return (error); } } while (uio->uio_resid > 0) { l = imin(sizeof(buf), uio->uio_resid); l = msgbuf_getbytes(mbp, buf, l); if (l == 0) break; mtx_unlock(&msgbuf_lock); error = uiomove(buf, l, uio); if (error || uio->uio_resid == 0) return (error); mtx_lock(&msgbuf_lock); } mtx_unlock(&msgbuf_lock); return (error); } /*ARGSUSED*/ static int logpoll(struct cdev *dev, int events, struct thread *td) { int revents = 0; if (events & (POLLIN | POLLRDNORM)) { mtx_lock(&msgbuf_lock); if (msgbuf_getcount(msgbufp) > 0) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &logsoftc.sc_selp); mtx_unlock(&msgbuf_lock); } return (revents); } static int logkqfilter(struct cdev *dev, struct knote *kn) { if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_fop = &log_read_filterops; kn->kn_hook = NULL; mtx_lock(&msgbuf_lock); knlist_add(&logsoftc.sc_selp.si_note, kn, 1); mtx_unlock(&msgbuf_lock); return (0); } static int logkqread(struct knote *kn, long hint) { mtx_assert(&msgbuf_lock, MA_OWNED); kn->kn_data = msgbuf_getcount(msgbufp); return (kn->kn_data != 0); } static void logkqdetach(struct knote *kn) { mtx_lock(&msgbuf_lock); knlist_remove(&logsoftc.sc_selp.si_note, kn, 1); mtx_unlock(&msgbuf_lock); } static void logtimeout(void *arg) { if (!log_open) return; if (msgbuftrigger == 0) goto done; msgbuftrigger = 0; selwakeuppri(&logsoftc.sc_selp, LOG_RDPRI); KNOTE_LOCKED(&logsoftc.sc_selp.si_note, 0); if ((logsoftc.sc_state & LOG_ASYNC) && logsoftc.sc_sigio != NULL) pgsigio(&logsoftc.sc_sigio, SIGIO, 0); cv_broadcastpri(&log_wakeup, LOG_RDPRI); done: if (log_wakeups_per_second < 1) { printf("syslog wakeup is less than one. Adjusting to 1.\n"); log_wakeups_per_second = 1; } callout_reset_sbt(&logsoftc.sc_callout, SBT_1S / log_wakeups_per_second, 0, logtimeout, NULL, C_PREL(1)); } /*ARGSUSED*/ static int logioctl(struct cdev *dev, u_long com, caddr_t data, int flag, struct thread *td) { switch (com) { /* return number of characters immediately available */ case FIONREAD: *(int *)data = msgbuf_getcount(msgbufp); break; case FIONBIO: break; case FIOASYNC: mtx_lock(&msgbuf_lock); if (*(int *)data) logsoftc.sc_state |= LOG_ASYNC; else logsoftc.sc_state &= ~LOG_ASYNC; mtx_unlock(&msgbuf_lock); break; case FIOSETOWN: return (fsetown(*(int *)data, &logsoftc.sc_sigio)); case FIOGETOWN: *(int *)data = fgetown(&logsoftc.sc_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. 
*/ case TIOCSPGRP: return (fsetown(-(*(int *)data), &logsoftc.sc_sigio)); /* This is deprecated, FIOGETOWN should be used instead */ case TIOCGPGRP: *(int *)data = -fgetown(&logsoftc.sc_sigio); break; default: return (ENOTTY); } return (0); } static void log_drvinit(void *unused) { cv_init(&log_wakeup, "klog"); callout_init_mtx(&logsoftc.sc_callout, &msgbuf_lock, 0); knlist_init_mtx(&logsoftc.sc_selp.si_note, &msgbuf_lock); make_dev_credf(MAKEDEV_ETERNAL, &log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "klog"); } SYSINIT(logdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,log_drvinit,NULL); diff --git a/sys/kern/sys_eventfd.c b/sys/kern/sys_eventfd.c index 739dbf75b01e..20e73f9c6b1b 100644 --- a/sys/kern/sys_eventfd.c +++ b/sys/kern/sys_eventfd.c @@ -1,346 +1,346 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include _Static_assert(EFD_CLOEXEC == O_CLOEXEC, "Mismatched EFD_CLOEXEC"); _Static_assert(EFD_NONBLOCK == O_NONBLOCK, "Mismatched EFD_NONBLOCK"); MALLOC_DEFINE(M_EVENTFD, "eventfd", "eventfd structures"); static fo_rdwr_t eventfd_read; static fo_rdwr_t eventfd_write; static fo_ioctl_t eventfd_ioctl; static fo_poll_t eventfd_poll; static fo_kqfilter_t eventfd_kqfilter; static fo_stat_t eventfd_stat; static fo_close_t eventfd_close; static fo_fill_kinfo_t eventfd_fill_kinfo; -static struct fileops eventfdops = { +static const struct fileops eventfdops = { .fo_read = eventfd_read, .fo_write = eventfd_write, .fo_truncate = invfo_truncate, .fo_ioctl = eventfd_ioctl, .fo_poll = eventfd_poll, .fo_kqfilter = eventfd_kqfilter, .fo_stat = eventfd_stat, .fo_close = eventfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = eventfd_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE }; static void filt_eventfddetach(struct knote *kn); static int filt_eventfdread(struct knote *kn, long hint); static int filt_eventfdwrite(struct knote *kn, long hint); -static struct filterops eventfd_rfiltops = { +static const struct filterops eventfd_rfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdread }; -static struct filterops eventfd_wfiltops = { +static const struct filterops eventfd_wfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdwrite }; struct eventfd { eventfd_t efd_count; uint32_t efd_flags; struct selinfo efd_sel; struct mtx efd_lock; }; int eventfd_create_file(struct thread *td, struct file *fp, uint32_t initval, int flags) { struct eventfd *efd; int fflags; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_VALUE(initval); efd = malloc(sizeof(*efd), M_EVENTFD, M_WAITOK | M_ZERO); efd->efd_flags = flags; efd->efd_count = initval; mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); fflags = FREAD | FWRITE; if ((flags & EFD_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_EVENTFD, efd, &eventfdops); return (0); } static int eventfd_close(struct file *fp, struct thread *td) { struct eventfd *efd; efd = fp->f_data; seldrain(&efd->efd_sel); knlist_destroy(&efd->efd_sel.si_note); mtx_destroy(&efd->efd_lock); free(efd, M_EVENTFD); return (0); } static int eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = 0; efd = fp->f_data; mtx_lock(&efd->efd_lock); while (error == 0 && efd->efd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "efdrd", 0); } if (error == 0) { MPASS(efd->efd_count > 0); if ((efd->efd_flags & EFD_SEMAPHORE) != 0) { count = 1; --efd->efd_count; } else { count = efd->efd_count; efd->efd_count = 0; } KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); mtx_unlock(&efd->efd_lock); error = uiomove(&count, sizeof(eventfd_t), uio); } else mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct 
eventfd *efd; eventfd_t count; int error; if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = uiomove(&count, sizeof(eventfd_t), uio); if (error != 0) return (error); if (count == UINT64_MAX) return (EINVAL); efd = fp->f_data; mtx_lock(&efd->efd_lock); retry: if (UINT64_MAX - efd->efd_count <= count) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); /* Do not return the number of bytes written */ uio->uio_resid += sizeof(eventfd_t); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "efdwr", 0); if (error == 0) goto retry; } if (error == 0) { MPASS(UINT64_MAX - efd->efd_count > count); efd->efd_count += count; KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); } mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct eventfd *efd; int revents; efd = fp->f_data; revents = 0; mtx_lock(&efd->efd_lock); if ((events & (POLLIN | POLLRDNORM)) != 0 && efd->efd_count > 0) revents |= events & (POLLIN | POLLRDNORM); if ((events & (POLLOUT | POLLWRNORM)) != 0 && UINT64_MAX - 1 > efd->efd_count) revents |= events & (POLLOUT | POLLWRNORM); if (revents == 0) selrecord(td, &efd->efd_sel); mtx_unlock(&efd->efd_lock); return (revents); } static int eventfd_kqfilter(struct file *fp, struct knote *kn) { struct eventfd *efd = fp->f_data; mtx_lock(&efd->efd_lock); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &eventfd_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &eventfd_wfiltops; break; default: mtx_unlock(&efd->efd_lock); return (EINVAL); } kn->kn_hook = efd; knlist_add(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); return (0); } static void filt_eventfddetach(struct knote *kn) { struct eventfd *efd = kn->kn_hook; mtx_lock(&efd->efd_lock); knlist_remove(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); } static int filt_eventfdread(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); kn->kn_data = (int64_t)efd->efd_count; ret = efd->efd_count > 0; return (ret); } static int filt_eventfdwrite(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); kn->kn_data = (int64_t)(UINT64_MAX - 1 - efd->efd_count); ret = UINT64_MAX - 1 > efd->efd_count; return (ret); } static int eventfd_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { switch (cmd) { case FIONBIO: case FIOASYNC: return (0); } return (ENOTTY); } static int eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { bzero((void *)st, sizeof *st); st->st_mode = S_IFIFO; return (0); } static int eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct eventfd *efd = fp->f_data; kif->kf_type = KF_TYPE_EVENTFD; mtx_lock(&efd->efd_lock); kif->kf_un.kf_eventfd.kf_eventfd_value = efd->efd_count; kif->kf_un.kf_eventfd.kf_eventfd_flags = efd->efd_flags; kif->kf_un.kf_eventfd.kf_eventfd_addr = (uintptr_t)efd; mtx_unlock(&efd->efd_lock); return (0); } diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c index c34c7b24a269..f2f1a42adf2b 100644 --- a/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -1,1903 +1,1903 @@ /*- * Copyright (c) 1996 John S. Dyson * Copyright (c) 2012 Giovanni Trematerra * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, the sending process pins the underlying pages in * memory, and the receiving process copies directly from these pinned pages * in the sending process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. * * In order to limit the resource use of pipes, two sysctls exist: * * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable * address space available to us in pipe_map. This value is normally * autotuned, but may also be loader tuned. * * kern.ipc.pipekva - This read-only sysctl tracks the current amount of * memory in use by pipes. * * Based on how large pipekva is relative to maxpipekva, the following * will happen: * * 0% - 50%: * New pipes are given 16K of memory backing, pipes may dynamically * grow to as large as 64K where needed. * 50% - 75%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes may NOT grow. * 75% - 100%: * New pipes are given 4K (or PAGE_SIZE) of memory backing, * existing pipes will be shrunk down to 4K whenever possible. * * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE * resize which MUST occur for reverse-direction pipes when they are * first used. * * Additional information about the current state of pipes may be obtained * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail, * and kern.ipc.piperesizefail. * * Locking rules: There are two locks present here: A mutex, used via * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via * the flag, as mutexes can not persist over uiomove. 
The mutex * exists only to guard access to the flag, and is not in itself a * locking mechanism. Also note that there is only a single mutex for * both directions of a pipe. * * As pipelock() may have to sleep before it can acquire the flag, it * is important to reread all data after a call to pipelock(); everything * in the structure may have changed. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. */ /* #define PIPE_NODIRECT */ #define PIPE_PEER(pipe) \ (((pipe)->pipe_type & PIPE_TYPE_NAMED) ? (pipe) : ((pipe)->pipe_peer)) /* * interfaces to the outside world */ static fo_rdwr_t pipe_read; static fo_rdwr_t pipe_write; static fo_truncate_t pipe_truncate; static fo_ioctl_t pipe_ioctl; static fo_poll_t pipe_poll; static fo_kqfilter_t pipe_kqfilter; static fo_stat_t pipe_stat; static fo_close_t pipe_close; static fo_chmod_t pipe_chmod; static fo_chown_t pipe_chown; static fo_fill_kinfo_t pipe_fill_kinfo; -struct fileops pipeops = { +const struct fileops pipeops = { .fo_read = pipe_read, .fo_write = pipe_write, .fo_truncate = pipe_truncate, .fo_ioctl = pipe_ioctl, .fo_poll = pipe_poll, .fo_kqfilter = pipe_kqfilter, .fo_stat = pipe_stat, .fo_close = pipe_close, .fo_chmod = pipe_chmod, .fo_chown = pipe_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = pipe_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE }; static void filt_pipedetach(struct knote *kn); static void filt_pipedetach_notsup(struct knote *kn); static int filt_pipenotsup(struct knote *kn, long hint); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); -static struct filterops pipe_nfiltops = { +static const struct filterops pipe_nfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach_notsup, .f_event = filt_pipenotsup }; -static struct filterops pipe_rfiltops = { +static const struct filterops pipe_rfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_piperead }; -static struct filterops pipe_wfiltops = { +static const struct filterops pipe_wfiltops = { .f_isfd = 1, .f_detach = filt_pipedetach, .f_event = filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. 
*/ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) static long amountpipekva; static int pipefragretry; static int pipeallocfail; static int piperesizefail; static int piperesizeallowed = 1; static long pipe_mindirect = PIPE_MINDIRECT; static int pipebuf_reserv = 2; SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxpipekva, 0, "Pipe KVA limit"); SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, &amountpipekva, 0, "Pipe KVA usage"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD, &pipefragretry, 0, "Pipe allocation retries due to fragmentation"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD, &pipeallocfail, 0, "Pipe allocation failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD, &piperesizefail, 0, "Pipe resize failures"); SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW, &piperesizeallowed, 0, "Pipe resizing allowed"); SYSCTL_INT(_kern_ipc, OID_AUTO, pipebuf_reserv, CTLFLAG_RW, &pipebuf_reserv, 0, "Superuser-reserved percentage of the pipe buffers space"); static void pipeinit(void *dummy __unused); static void pipeclose(struct pipe *cpipe); static void pipe_free_kmem(struct pipe *cpipe); static int pipe_create(struct pipe *pipe, bool backing); static int pipe_paircreate(struct thread *td, struct pipepair **p_pp); static __inline int pipelock(struct pipe *cpipe, int catch); static __inline void pipeunlock(struct pipe *cpipe); static void pipe_timestamp(struct timespec *tsp); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio); static void pipe_destroy_write_buffer(struct pipe *wpipe); static int pipe_direct_write(struct pipe *wpipe, struct uio *uio); static void pipe_clone_write_buffer(struct pipe *wpipe); #endif static int pipespace(struct pipe *cpipe, int size); static int pipespace_new(struct pipe *cpipe, int size); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags); static int pipe_zone_init(void *mem, int size, int flags); static void pipe_zone_fini(void *mem, int size); static uma_zone_t pipe_zone; static struct unrhdr64 pipeino_unr; static dev_t pipedev_ino; SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL); static void pipeinit(void *dummy __unused) { pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair), pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini, UMA_ALIGN_PTR, 0); KASSERT(pipe_zone != NULL, ("pipe_zone not initialized")); new_unrhdr64(&pipeino_unr, 1); pipedev_ino = devfs_alloc_cdp_inode(); KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized")); } static int sysctl_handle_pipe_mindirect(SYSCTL_HANDLER_ARGS) { int error = 0; long tmp_pipe_mindirect = pipe_mindirect; error = sysctl_handle_long(oidp, &tmp_pipe_mindirect, arg2, req); if (error != 0 || req->newptr == NULL) return (error); /* * Don't allow pipe_mindirect to be set so low that we violate * atomicity requirements. */ if (tmp_pipe_mindirect <= PIPE_BUF) return (EINVAL); pipe_mindirect = tmp_pipe_mindirect; return (0); } SYSCTL_OID(_kern_ipc, OID_AUTO, pipe_mindirect, CTLTYPE_LONG | CTLFLAG_RW, &pipe_mindirect, 0, sysctl_handle_pipe_mindirect, "L", "Minimum write size triggering VM optimization"); static int pipe_zone_ctor(void *mem, int size, void *arg, int flags) { struct pipepair *pp; struct pipe *rpipe, *wpipe; KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size")); pp = (struct pipepair *)mem; /* * We zero both pipe endpoints to make sure all the kmem pointers * are NULL, flag fields are zero'd, etc. 
We timestamp both * endpoints with the same time. */ rpipe = &pp->pp_rpipe; bzero(rpipe, sizeof(*rpipe)); pipe_timestamp(&rpipe->pipe_ctime); rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime; wpipe = &pp->pp_wpipe; bzero(wpipe, sizeof(*wpipe)); wpipe->pipe_ctime = rpipe->pipe_ctime; wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime; rpipe->pipe_peer = wpipe; rpipe->pipe_pair = pp; wpipe->pipe_peer = rpipe; wpipe->pipe_pair = pp; /* * Mark both endpoints as present; they will later get free'd * one at a time. When both are free'd, then the whole pair * is released. */ rpipe->pipe_present = PIPE_ACTIVE; wpipe->pipe_present = PIPE_ACTIVE; /* * Eventually, the MAC Framework may initialize the label * in ctor or init, but for now we do it elsewhere to avoid * blocking in ctor or init. */ pp->pp_label = NULL; return (0); } static int pipe_zone_init(void *mem, int size, int flags) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size")); pp = (struct pipepair *)mem; mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW); return (0); } static void pipe_zone_fini(void *mem, int size) { struct pipepair *pp; KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size")); pp = (struct pipepair *)mem; mtx_destroy(&pp->pp_mtx); } static int pipe_paircreate(struct thread *td, struct pipepair **p_pp) { struct pipepair *pp; struct pipe *rpipe, *wpipe; int error; *p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK); #ifdef MAC /* * The MAC label is shared between the connected endpoints. As a * result mac_pipe_init() and mac_pipe_create() are called once * for the pair, and not on the endpoints. */ mac_pipe_init(pp); mac_pipe_create(td->td_ucred, pp); #endif rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; pp->pp_owner = crhold(td->td_ucred); knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe)); knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe)); /* * Only the forward direction pipe is backed by big buffer by * default. */ error = pipe_create(rpipe, true); if (error != 0) goto fail; error = pipe_create(wpipe, false); if (error != 0) { /* * This cleanup leaves the pipe inode number for rpipe * still allocated, but never used. We do not free * inode numbers for opened pipes, which is required * for correctness because numbers must be unique. * But also it avoids any memory use by the unr * allocator, so stashing away the transient inode * number is reasonable. */ pipe_free_kmem(rpipe); goto fail; } rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; return (0); fail: knlist_destroy(&rpipe->pipe_sel.si_note); knlist_destroy(&wpipe->pipe_sel.si_note); crfree(pp->pp_owner); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, pp); return (error); } int pipe_named_ctor(struct pipe **ppipe, struct thread *td) { struct pipepair *pp; int error; error = pipe_paircreate(td, &pp); if (error != 0) return (error); pp->pp_rpipe.pipe_type |= PIPE_TYPE_NAMED; *ppipe = &pp->pp_rpipe; return (0); } void pipe_dtor(struct pipe *dpipe) { struct pipe *peer; peer = (dpipe->pipe_type & PIPE_TYPE_NAMED) != 0 ? dpipe->pipe_peer : NULL; funsetown(&dpipe->pipe_sigio); pipeclose(dpipe); if (peer != NULL) { funsetown(&peer->pipe_sigio); pipeclose(peer); } } /* * Get a timestamp. * * This used to be vfs_timestamp but the higher precision is unnecessary and * can very negatively affect performance in virtualized environments (e.g., on * vms running on amd64 when using the rdtscp instruction).
*/ static void pipe_timestamp(struct timespec *tsp) { getnanotime(tsp); } /* * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let * the zone pick up the pieces via pipeclose(). */ int kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1, struct filecaps *fcaps2) { struct file *rf, *wf; struct pipe *rpipe, *wpipe; struct pipepair *pp; int fd, fflags, error; error = pipe_paircreate(td, &pp); if (error != 0) return (error); rpipe = &pp->pp_rpipe; wpipe = &pp->pp_wpipe; error = falloc_caps(td, &rf, &fd, flags, fcaps1); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } /* An extra reference on `rf' has been held for us by falloc_caps(). */ fildes[0] = fd; fflags = FREAD | FWRITE; if ((flags & O_NONBLOCK) != 0) fflags |= FNONBLOCK; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. */ finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops); error = falloc_caps(td, &wf, &fd, flags, fcaps2); if (error) { fdclose(td, rf, fildes[0]); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } /* An extra reference on `wf' has been held for us by falloc_caps(). */ finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops); fdrop(wf, td); fildes[1] = fd; fdrop(rf, td); return (0); } #ifdef COMPAT_FREEBSD10 /* ARGSUSED */ int freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused) { int error; int fildes[2]; error = kern_pipe(td, fildes, 0, NULL, NULL); if (error) return (error); td->td_retval[0] = fildes[0]; td->td_retval[1] = fildes[1]; return (0); } #endif int sys_pipe2(struct thread *td, struct pipe2_args *uap) { int error, fildes[2]; if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK)) return (EINVAL); error = kern_pipe(td, fildes, uap->flags, NULL, NULL); if (error) return (error); error = copyout(fildes, uap->fildes, 2 * sizeof(int)); if (error) { (void)kern_close(td, fildes[0]); (void)kern_close(td, fildes[1]); } return (error); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. 
*/ static int pipespace_new(struct pipe *cpipe, int size) { caddr_t buffer; int error, cnt, firstseg; static int curfail = 0; static struct timeval lastfail; KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked")); KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW), ("pipespace: resize of direct writes not allowed")); retry: cnt = cpipe->pipe_buffer.cnt; if (cnt > size) size = cnt; size = round_page(size); buffer = (caddr_t) vm_map_min(pipe_map); if (!chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, size, lim_cur(curthread, RLIMIT_PIPEBUF))) { if (cpipe->pipe_buffer.buffer == NULL && size > SMALL_PIPE_SIZE) { size = SMALL_PIPE_SIZE; goto retry; } return (ENOMEM); } vm_map_lock(pipe_map); if (priv_check(curthread, PRIV_PIPEBUF) != 0 && maxpipekva / 100 * (100 - pipebuf_reserv) < amountpipekva + size) { vm_map_unlock(pipe_map); chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, -size, 0); if (cpipe->pipe_buffer.buffer == NULL && size > SMALL_PIPE_SIZE) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } return (ENOMEM); } error = vm_map_find_locked(pipe_map, NULL, 0, (vm_offset_t *)&buffer, size, 0, VMFS_ANY_SPACE, VM_PROT_RW, VM_PROT_RW, 0); vm_map_unlock(pipe_map); if (error != KERN_SUCCESS) { chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, -size, 0); if (cpipe->pipe_buffer.buffer == NULL && size > SMALL_PIPE_SIZE) { size = SMALL_PIPE_SIZE; pipefragretry++; goto retry; } if (cpipe->pipe_buffer.buffer == NULL) { pipeallocfail++; if (ppsratecheck(&lastfail, &curfail, 1)) printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n"); } else { piperesizefail++; } return (ENOMEM); } /* copy data, then free old resources if we're resizing */ if (cnt > 0) { if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) { firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out; bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, firstseg); if ((cnt - firstseg) > 0) bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg], cpipe->pipe_buffer.in); } else { bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out], buffer, cnt); } } pipe_free_kmem(cpipe); cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = cnt; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = cnt; atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size); return (0); } /* * Wrapper for pipespace_new() that performs locking assertions. 
*/ static int pipespace(struct pipe *cpipe, int size) { KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipespace")); return (pipespace_new(cpipe, size)); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(struct pipe *cpipe, int catch) { int error, prio; PIPE_LOCK_ASSERT(cpipe, MA_OWNED); prio = PRIBIO; if (catch) prio |= PCATCH; while (cpipe->pipe_state & PIPE_LOCKFL) { KASSERT(cpipe->pipe_waiters >= 0, ("%s: bad waiter count %d", __func__, cpipe->pipe_waiters)); cpipe->pipe_waiters++; error = msleep(&cpipe->pipe_waiters, PIPE_MTX(cpipe), prio, "pipelk", 0); cpipe->pipe_waiters--; if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCKFL; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(struct pipe *cpipe) { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); KASSERT(cpipe->pipe_state & PIPE_LOCKFL, ("Unlocked pipe passed to pipeunlock")); KASSERT(cpipe->pipe_waiters >= 0, ("%s: bad waiter count %d", __func__, cpipe->pipe_waiters)); cpipe->pipe_state &= ~PIPE_LOCKFL; if (cpipe->pipe_waiters > 0) wakeup_one(&cpipe->pipe_waiters); } void pipeselwakeup(struct pipe *cpipe) { PIPE_LOCK_ASSERT(cpipe, MA_OWNED); if (cpipe->pipe_state & PIPE_SEL) { selwakeuppri(&cpipe->pipe_sel, PSOCK); if (!SEL_WAITING(&cpipe->pipe_sel)) cpipe->pipe_state &= ~PIPE_SEL; } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(&cpipe->pipe_sigio, SIGIO, 0); KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0); } /* * Initialize and allocate VM and memory for pipe. The structure * will start out zero'd from the ctor, so we just manage the kmem. */ static int pipe_create(struct pipe *pipe, bool large_backing) { int error; error = pipespace_new(pipe, !large_backing || amountpipekva > maxpipekva / 2 ? SMALL_PIPE_SIZE : PIPE_SIZE); if (error == 0) pipe->pipe_ino = alloc_unr64(&pipeino_unr); return (error); } /* ARGSUSED */ static int pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct pipe *rpipe; int error; int nread = 0; int size; rpipe = fp->f_data; /* * Try to avoid locking the pipe if we have nothing to do. * * There are programs which share one pipe amongst multiple processes * and perform non-blocking reads in parallel, even if the pipe is * empty. This in particular is the case with BSD make, which when * spawned with a high -j number can find itself with over half of the * calls failing to find anything. 
*/ if ((fp->f_flag & FNONBLOCK) != 0 && !mac_pipe_check_read_enabled()) { if (__predict_false(uio->uio_resid == 0)) return (0); if ((atomic_load_short(&rpipe->pipe_state) & PIPE_EOF) == 0 && atomic_load_int(&rpipe->pipe_buffer.cnt) == 0 && atomic_load_int(&rpipe->pipe_pages.cnt) == 0) return (EAGAIN); } PIPE_LOCK(rpipe); ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; #ifdef MAC error = mac_pipe_check_read(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (amountpipekva > (3 * maxpipekva) / 4) { if ((rpipe->pipe_state & PIPE_DIRECTW) == 0 && rpipe->pipe_buffer.size > SMALL_PIPE_SIZE && rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE && piperesizeallowed == 1) { PIPE_UNLOCK(rpipe); pipespace(rpipe, SMALL_PIPE_SIZE); PIPE_LOCK(rpipe); } } while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > uio->uio_resid) size = uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); PIPE_LOCK(rpipe); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. */ } else if ((size = rpipe->pipe_pages.cnt) != 0) { if (size > uio->uio_resid) size = (u_int) uio->uio_resid; PIPE_UNLOCK(rpipe); error = uiomove_fromphys(rpipe->pipe_pages.ms, rpipe->pipe_pages.pos, size, uio); PIPE_LOCK(rpipe); if (error) break; nread += size; rpipe->pipe_pages.pos += size; rpipe->pipe_pages.cnt -= size; if (rpipe->pipe_pages.cnt == 0) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. * We will either break out with an error or we will * sleep and relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; } else { rpipe->pipe_state |= PIPE_WANTR; if ((error = msleep(rpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } #ifdef MAC locked_error: #endif pipeunlock(rpipe); /* XXX: should probably do this before getting any locks. */ if (error == 0) pipe_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } /* * Only wake up writers if there was actually something read. 
* Otherwise, when calling read(2) at EOF, a spurious wakeup occurs. */ if (nread > 0 && rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF) pipeselwakeup(rpipe); PIPE_UNLOCK(rpipe); if (nread > 0) td->td_ru.ru_msgrcv++; return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. * This is similar to a physical write operation. */ static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio) { u_int size; int i; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, ("%s: PIPE_DIRECTW set on %p", __func__, wpipe)); KASSERT(wpipe->pipe_pages.cnt == 0, ("%s: pipe map for %p contains residual data", __func__, wpipe)); if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; else size = uio->uio_iov->iov_len; wpipe->pipe_state |= PIPE_DIRECTW; PIPE_UNLOCK(wpipe); i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ, wpipe->pipe_pages.ms, PIPENPAGES); PIPE_LOCK(wpipe); if (i < 0) { wpipe->pipe_state &= ~PIPE_DIRECTW; return (EFAULT); } wpipe->pipe_pages.npages = i; wpipe->pipe_pages.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_pages.cnt = size; uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size; if (uio->uio_iov->iov_len == 0) { uio->uio_iov++; uio->uio_iovcnt--; } uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * Unwire the process buffer. */ static void pipe_destroy_write_buffer(struct pipe *wpipe) { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); KASSERT(wpipe->pipe_pages.cnt == 0, ("%s: pipe map for %p contains residual data", __func__, wpipe)); wpipe->pipe_state &= ~PIPE_DIRECTW; vm_page_unhold_pages(wpipe->pipe_pages.ms, wpipe->pipe_pages.npages); wpipe->pipe_pages.npages = 0; } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(struct pipe *wpipe) { struct uio uio; struct iovec iov; int size; int pos; PIPE_LOCK_ASSERT(wpipe, MA_OWNED); KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0, ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe)); size = wpipe->pipe_pages.cnt; pos = wpipe->pipe_pages.pos; wpipe->pipe_pages.cnt = 0; wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; PIPE_UNLOCK(wpipe); iov.iov_base = wpipe->pipe_buffer.buffer; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = size; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; uiomove_fromphys(wpipe->pipe_pages.ms, pos, size, &uio); PIPE_LOCK(wpipe); pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. 
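 *
 * Roughly, the sequence implemented below is:
 *
 *	pipe_build_write_buffer()	wire the user pages, set PIPE_DIRECTW
 *	msleep "pipdwt"			wait for the reader to drain pipe_pages
 *	pipe_destroy_write_buffer()	unwire the pages once the count is 0
 *
 * and if the writer is interrupted by a signal first,
 * pipe_clone_write_buffer() copies whatever is left into the ordinary
 * buffer so the wired pages can be released without losing data.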
*/ static int pipe_direct_write(struct pipe *wpipe, struct uio *uio) { int error; retry: PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if ((wpipe->pipe_state & PIPE_EOF) != 0) { error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdww", 0); pipelock(wpipe, 0); if (error != 0) goto error1; goto retry; } if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwc", 0); pipelock(wpipe, 0); if (error != 0) goto error1; goto retry; } error = pipe_build_write_buffer(wpipe, uio); if (error) { goto error1; } while (wpipe->pipe_pages.cnt != 0 && (wpipe->pipe_state & PIPE_EOF) == 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH, "pipdwt", 0); pipelock(wpipe, 0); if (error != 0) break; } if ((wpipe->pipe_state & PIPE_EOF) != 0) { wpipe->pipe_pages.cnt = 0; pipe_destroy_write_buffer(wpipe); pipeselwakeup(wpipe); error = EPIPE; } else if (error == EINTR || error == ERESTART) { pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0, ("pipe %p leaked PIPE_DIRECTW", wpipe)); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct pipe *wpipe, *rpipe; ssize_t orig_resid; int desiredsize, error; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); error = pipelock(wpipe, 1); if (error) { PIPE_UNLOCK(rpipe); return (error); } /* * detect loss of pipe read side, issue SIGPIPE if lost. */ if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (EPIPE); } #ifdef MAC error = mac_pipe_check_write(active_cred, wpipe->pipe_pair); if (error) { pipeunlock(wpipe); PIPE_UNLOCK(rpipe); return (error); } #endif ++wpipe->pipe_busy; /* Choose a larger size if it's advantageous */ desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size); while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) { if (piperesizeallowed != 1) break; if (amountpipekva > maxpipekva / 2) break; if (desiredsize == BIG_PIPE_SIZE) break; desiredsize = desiredsize * 2; } /* Choose a smaller size if we're in a OOM situation */ if (amountpipekva > (3 * maxpipekva) / 4 && wpipe->pipe_buffer.size > SMALL_PIPE_SIZE && wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE && piperesizeallowed == 1) desiredsize = SMALL_PIPE_SIZE; /* Resize if the above determined that a new size was necessary */ if (desiredsize != wpipe->pipe_buffer.size && (wpipe->pipe_state & PIPE_DIRECTW) == 0) { PIPE_UNLOCK(wpipe); pipespace(wpipe, desiredsize); PIPE_LOCK(wpipe); } MPASS(wpipe->pipe_buffer.size != 0); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. 
* If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ if (uio->uio_segflg == UIO_USERSPACE && uio->uio_iov->iov_len >= pipe_mindirect && wpipe->pipe_buffer.size >= pipe_mindirect && (fp->f_flag & FNONBLOCK) == 0) { error = pipe_direct_write(wpipe, uio); if (error != 0) break; continue; } #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. */ if (wpipe->pipe_pages.cnt != 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipbww", 0); pipelock(wpipe, 0); if (error != 0) break; continue; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. */ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ PIPE_UNLOCK(rpipe); error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); PIPE_LOCK(rpipe); if (error == 0 && segsize < size) { KASSERT(wpipe->pipe_buffer.in + segsize == wpipe->pipe_buffer.size, ("Pipe buffer wraparound disappeared")); /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. */ PIPE_UNLOCK(rpipe); error = uiomove( &wpipe->pipe_buffer.buffer[0], size - segsize, uio); PIPE_LOCK(rpipe); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { KASSERT(wpipe->pipe_buffer.in == size - segsize + wpipe->pipe_buffer.size, ("Expected wraparound bad")); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; KASSERT(wpipe->pipe_buffer.cnt <= wpipe->pipe_buffer.size, ("Pipe buffer overflow")); } if (error != 0) break; continue; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ if (fp->f_flag & FNONBLOCK) { error = EAGAIN; break; } /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; pipeunlock(wpipe); error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH, "pipewr", 0); pipelock(wpipe, 0); if (error != 0) break; continue; } } --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if any byte was written. 
* EINTR and other interrupts are handled by generic I/O layer. * Do not pretend that I/O succeeded for obvious user error * like EFAULT. */ if (uio->uio_resid != orig_resid && error == EPIPE) error = 0; if (error == 0) pipe_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. */ if (wpipe->pipe_buffer.cnt) pipeselwakeup(wpipe); pipeunlock(wpipe); PIPE_UNLOCK(rpipe); if (uio->uio_resid != orig_resid) td->td_ru.ru_msgsnd++; return (error); } /* ARGSUSED */ static int pipe_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vnops.fo_truncate(fp, length, active_cred, td); else error = invfo_truncate(fp, length, active_cred, td); return (error); } /* * we implement a very minimal set of ioctls for compatibility with sockets. */ static int pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct pipe *mpipe = fp->f_data; int error; PIPE_LOCK(mpipe); #ifdef MAC error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data); if (error) { PIPE_UNLOCK(mpipe); return (error); } #endif error = 0; switch (cmd) { case FIONBIO: break; case FIOASYNC: if (*(int *)data) { mpipe->pipe_state |= PIPE_ASYNC; } else { mpipe->pipe_state &= ~PIPE_ASYNC; } break; case FIONREAD: if (!(fp->f_flag & FREAD)) { *(int *)data = 0; PIPE_UNLOCK(mpipe); return (0); } if (mpipe->pipe_pages.cnt != 0) *(int *)data = mpipe->pipe_pages.cnt; else *(int *)data = mpipe->pipe_buffer.cnt; break; case FIOSETOWN: PIPE_UNLOCK(mpipe); error = fsetown(*(int *)data, &mpipe->pipe_sigio); goto out_unlocked; case FIOGETOWN: *(int *)data = fgetown(&mpipe->pipe_sigio); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: PIPE_UNLOCK(mpipe); error = fsetown(-(*(int *)data), &mpipe->pipe_sigio); goto out_unlocked; /* This is deprecated, FIOGETOWN should be used instead. 
*/ case TIOCGPGRP: *(int *)data = -fgetown(&mpipe->pipe_sigio); break; default: error = ENOTTY; break; } PIPE_UNLOCK(mpipe); out_unlocked: return (error); } static int pipe_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct pipe *rpipe; struct pipe *wpipe; int levents, revents; #ifdef MAC int error; #endif revents = 0; rpipe = fp->f_data; wpipe = PIPE_PEER(rpipe); PIPE_LOCK(rpipe); #ifdef MAC error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair); if (error) goto locked_error; #endif if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) if (rpipe->pipe_pages.cnt > 0 || rpipe->pipe_buffer.cnt > 0) revents |= events & (POLLIN | POLLRDNORM); if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF) || ((wpipe->pipe_state & PIPE_DIRECTW) == 0 && ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF || wpipe->pipe_buffer.size == 0))) revents |= events & (POLLOUT | POLLWRNORM); levents = events & (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND); if (rpipe->pipe_type & PIPE_TYPE_NAMED && fp->f_flag & FREAD && levents && fp->f_pipegen == rpipe->pipe_wgen) events |= POLLINIGNEOF; if ((events & POLLINIGNEOF) == 0) { if (rpipe->pipe_state & PIPE_EOF) { if (fp->f_flag & FREAD) revents |= (events & (POLLIN | POLLRDNORM)); if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) revents |= POLLHUP; } } if (revents == 0) { /* * Add ourselves regardless of eventmask as we have to return * POLLHUP even if it was not asked for. */ if ((fp->f_flag & FREAD) != 0) { selrecord(td, &rpipe->pipe_sel); if (SEL_WAITING(&rpipe->pipe_sel)) rpipe->pipe_state |= PIPE_SEL; } if ((fp->f_flag & FWRITE) != 0 && wpipe->pipe_present == PIPE_ACTIVE) { selrecord(td, &wpipe->pipe_sel); if (SEL_WAITING(&wpipe->pipe_sel)) wpipe->pipe_state |= PIPE_SEL; } } #ifdef MAC locked_error: #endif PIPE_UNLOCK(rpipe); return (revents); } /* * We shouldn't need locks here as we're doing a read and this should * be a natural race. */ static int pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred) { struct pipe *pipe; #ifdef MAC int error; #endif pipe = fp->f_data; #ifdef MAC if (mac_pipe_check_stat_enabled()) { PIPE_LOCK(pipe); error = mac_pipe_check_stat(active_cred, pipe->pipe_pair); PIPE_UNLOCK(pipe); if (error) { return (error); } } #endif /* For named pipes ask the underlying filesystem. */ if (pipe->pipe_type & PIPE_TYPE_NAMED) { return (vnops.fo_stat(fp, ub, active_cred)); } bzero(ub, sizeof(*ub)); ub->st_mode = S_IFIFO; ub->st_blksize = PAGE_SIZE; if (pipe->pipe_pages.cnt != 0) ub->st_size = pipe->pipe_pages.cnt; else ub->st_size = pipe->pipe_buffer.cnt; ub->st_blocks = howmany(ub->st_size, ub->st_blksize); ub->st_atim = pipe->pipe_atime; ub->st_mtim = pipe->pipe_mtime; ub->st_ctim = pipe->pipe_ctime; ub->st_uid = fp->f_cred->cr_uid; ub->st_gid = fp->f_cred->cr_gid; ub->st_dev = pipedev_ino; ub->st_ino = pipe->pipe_ino; /* * Left as 0: st_nlink, st_rdev, st_flags, st_gen. 
*/ return (0); } /* ARGSUSED */ static int pipe_close(struct file *fp, struct thread *td) { if (fp->f_vnode != NULL) return vnops.fo_close(fp, td); fp->f_ops = &badfileops; pipe_dtor(fp->f_data); fp->f_data = NULL; return (0); } static int pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vn_chmod(fp, mode, active_cred, td); else error = invfo_chmod(fp, mode, active_cred, td); return (error); } static int pipe_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct pipe *cpipe; int error; cpipe = fp->f_data; if (cpipe->pipe_type & PIPE_TYPE_NAMED) error = vn_chown(fp, uid, gid, active_cred, td); else error = invfo_chown(fp, uid, gid, active_cred, td); return (error); } static int pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct pipe *pi; if (fp->f_type == DTYPE_FIFO) return (vn_fill_kinfo(fp, kif, fdp)); kif->kf_type = KF_TYPE_PIPE; pi = fp->f_data; kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi; kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer; kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt; kif->kf_un.kf_pipe.kf_pipe_buffer_in = pi->pipe_buffer.in; kif->kf_un.kf_pipe.kf_pipe_buffer_out = pi->pipe_buffer.out; kif->kf_un.kf_pipe.kf_pipe_buffer_size = pi->pipe_buffer.size; return (0); } static void pipe_free_kmem(struct pipe *cpipe) { KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipe_free_kmem: pipe mutex locked")); if (cpipe->pipe_buffer.buffer != NULL) { atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size); chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, -cpipe->pipe_buffer.size, 0); vm_map_remove(pipe_map, (vm_offset_t)cpipe->pipe_buffer.buffer, (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size); cpipe->pipe_buffer.buffer = NULL; } #ifndef PIPE_NODIRECT { cpipe->pipe_pages.cnt = 0; cpipe->pipe_pages.pos = 0; cpipe->pipe_pages.npages = 0; } #endif } /* * shutdown the pipe */ static void pipeclose(struct pipe *cpipe) { #ifdef MAC struct pipepair *pp; #endif struct pipe *ppipe; KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL")); PIPE_LOCK(cpipe); pipelock(cpipe, 0); #ifdef MAC pp = cpipe->pipe_pair; #endif /* * If the other side is blocked, wake it up saying that * we want to close it down. */ cpipe->pipe_state |= PIPE_EOF; while (cpipe->pipe_busy) { wakeup(cpipe); cpipe->pipe_state |= PIPE_WANT; pipeunlock(cpipe); msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0); pipelock(cpipe, 0); } pipeselwakeup(cpipe); /* * Disconnect from peer, if any. */ ppipe = cpipe->pipe_peer; if (ppipe->pipe_present == PIPE_ACTIVE) { ppipe->pipe_state |= PIPE_EOF; wakeup(ppipe); pipeselwakeup(ppipe); } /* * Mark this endpoint as free. Release kmem resources. We * don't mark this endpoint as unused until we've finished * doing that, or the pipe might disappear out from under * us. */ PIPE_UNLOCK(cpipe); pipe_free_kmem(cpipe); PIPE_LOCK(cpipe); cpipe->pipe_present = PIPE_CLOSING; pipeunlock(cpipe); /* * knlist_clear() may sleep dropping the PIPE_MTX. Set the * PIPE_FINALIZED, that allows other end to free the * pipe_pair, only after the knotes are completely dismantled. */ knlist_clear(&cpipe->pipe_sel.si_note, 1); cpipe->pipe_present = PIPE_FINALIZED; seldrain(&cpipe->pipe_sel); knlist_destroy(&cpipe->pipe_sel.si_note); /* * If both endpoints are now closed, release the memory for the * pipe pair. If not, unlock. 
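 *
 * Whichever endpoint reaches PIPE_FINALIZED second does the final
 * teardown: it drops the pp_owner credential reference, destroys the
 * MAC label (if compiled in) and returns the pipepair to pipe_zone.
 * The first endpoint to get here simply unlocks and leaves the pair
 * in place for its peer.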
*/ if (ppipe->pipe_present == PIPE_FINALIZED) { PIPE_UNLOCK(cpipe); crfree(cpipe->pipe_pair->pp_owner); #ifdef MAC mac_pipe_destroy(pp); #endif uma_zfree(pipe_zone, cpipe->pipe_pair); } else PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int pipe_kqfilter(struct file *fp, struct knote *kn) { struct pipe *cpipe; /* * If a filter is requested that is not supported by this file * descriptor, don't return an error, but also don't ever generate an * event. */ if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) { kn->kn_fop = &pipe_nfiltops; return (0); } if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) { kn->kn_fop = &pipe_nfiltops; return (0); } cpipe = fp->f_data; PIPE_LOCK(cpipe); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pipe_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &pipe_wfiltops; if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) { /* other end of pipe has been closed */ PIPE_UNLOCK(cpipe); return (EPIPE); } cpipe = PIPE_PEER(cpipe); break; default: if ((cpipe->pipe_type & PIPE_TYPE_NAMED) != 0) { PIPE_UNLOCK(cpipe); return (vnops.fo_kqfilter(fp, kn)); } PIPE_UNLOCK(cpipe); return (EINVAL); } kn->kn_hook = cpipe; knlist_add(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); return (0); } static void filt_pipedetach(struct knote *kn) { struct pipe *cpipe = kn->kn_hook; PIPE_LOCK(cpipe); knlist_remove(&cpipe->pipe_sel.si_note, kn, 1); PIPE_UNLOCK(cpipe); } /*ARGSUSED*/ static int filt_piperead(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct pipe *rpipe = kn->kn_hook; PIPE_LOCK_ASSERT(rpipe, MA_OWNED); kn->kn_data = rpipe->pipe_buffer.cnt; if (kn->kn_data == 0) kn->kn_data = rpipe->pipe_pages.cnt; if ((rpipe->pipe_state & PIPE_EOF) != 0 && ((rpipe->pipe_type & PIPE_TYPE_NAMED) == 0 || fp->f_pipegen != rpipe->pipe_wgen)) { kn->kn_flags |= EV_EOF; return (1); } kn->kn_flags &= ~EV_EOF; return (kn->kn_data > 0); } /*ARGSUSED*/ static int filt_pipewrite(struct knote *kn, long hint) { struct pipe *wpipe = kn->kn_hook; /* * If this end of the pipe is closed, the knote was removed from the * knlist and the list lock (i.e., the pipe lock) is therefore not held. */ if (wpipe->pipe_present == PIPE_ACTIVE || (wpipe->pipe_type & PIPE_TYPE_NAMED) != 0) { PIPE_LOCK_ASSERT(wpipe, MA_OWNED); if (wpipe->pipe_state & PIPE_DIRECTW) { kn->kn_data = 0; } else if (wpipe->pipe_buffer.size > 0) { kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; } else { kn->kn_data = PIPE_BUF; } } if (wpipe->pipe_present != PIPE_ACTIVE || (wpipe->pipe_state & PIPE_EOF)) { kn->kn_flags |= EV_EOF; return (1); } kn->kn_flags &= ~EV_EOF; return (kn->kn_data >= PIPE_BUF); } static void filt_pipedetach_notsup(struct knote *kn) { } static int filt_pipenotsup(struct knote *kn, long hint) { return (0); } diff --git a/sys/kern/sys_procdesc.c b/sys/kern/sys_procdesc.c index e8e0efd5bb00..dbf8e579530f 100644 --- a/sys/kern/sys_procdesc.c +++ b/sys/kern/sys_procdesc.c @@ -1,571 +1,571 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009, 2016 Robert N. M. Watson * All rights reserved. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * FreeBSD process descriptor facility. * * Some processes are represented by a file descriptor, which will be used in * preference to signaling and pids for the purposes of process management, * and is, in effect, a form of capability. When a process descriptor is * used with a process, it ceases to be visible to certain traditional UNIX * process facilities, such as waitpid(2). * * Some semantics: * * - At most one process descriptor will exist for any process, although * references to that descriptor may be held from many processes (or even * be in flight between processes over a local domain socket). * - Last close on the process descriptor will terminate the process using * SIGKILL and reparent it to init so that there's a process to reap it * when it's done exiting. * - If the process exits before the descriptor is closed, it will not * generate SIGCHLD on termination, or be picked up by waitpid(). * - The pdkill(2) system call may be used to deliver a signal to the process * using its process descriptor. * - The pdwait4(2) system call may be used to block (or not) on a process * descriptor to collect termination information. * * Open questions: * * - Will we want to add a pidtoprocdesc(2) system call to allow process * descriptors to be created for processes without pdfork(2)? 
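 *
 * A rough usage sketch of the semantics above (illustrative only):
 *
 *	int pd;
 *	pid_t pid = pdfork(&pd, 0);
 *	if (pid == 0) {
 *		...			child runs here
 *		_exit(0);
 *	}
 *	pdkill(pd, SIGTERM);		deliver a signal via the descriptor
 *	pdgetpid(pd, &pid);		recover the pid if it is needed
 *	close(pd);			last close kills and detaches the
 *					child unless PD_DAEMON was given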
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(process_descriptors, "Process Descriptors"); MALLOC_DEFINE(M_PROCDESC, "procdesc", "process descriptors"); static fo_poll_t procdesc_poll; static fo_kqfilter_t procdesc_kqfilter; static fo_stat_t procdesc_stat; static fo_close_t procdesc_close; static fo_fill_kinfo_t procdesc_fill_kinfo; static fo_cmp_t procdesc_cmp; -static struct fileops procdesc_ops = { +static const struct fileops procdesc_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = procdesc_poll, .fo_kqfilter = procdesc_kqfilter, .fo_stat = procdesc_stat, .fo_close = procdesc_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = procdesc_fill_kinfo, .fo_cmp = procdesc_cmp, .fo_flags = DFLAG_PASSABLE, }; /* * Return a locked process given a process descriptor, or ESRCH if it has * died. */ int procdesc_find(struct thread *td, int fd, cap_rights_t *rightsp, struct proc **p) { struct procdesc *pd; struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { *p = pd->pd_proc; PROC_LOCK(*p); } else error = ESRCH; sx_sunlock(&proctree_lock); out: fdrop(fp, td); return (error); } /* * Function to be used by procstat(1) sysctls when returning procdesc * information. */ pid_t procdesc_pid(struct file *fp_procdesc) { struct procdesc *pd; KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC, ("procdesc_pid: !procdesc")); pd = fp_procdesc->f_data; return (pd->pd_pid); } /* * Retrieve the PID associated with a process descriptor. */ int kern_pdgetpid(struct thread *td, int fd, cap_rights_t *rightsp, pid_t *pidp) { struct file *fp; int error; error = fget(td, fd, rightsp, &fp); if (error) return (error); if (fp->f_type != DTYPE_PROCDESC) { error = EBADF; goto out; } *pidp = procdesc_pid(fp); out: fdrop(fp, td); return (error); } /* * System call to return the pid of a process given its process descriptor. */ int sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap) { pid_t pid; int error; AUDIT_ARG_FD(uap->fd); error = kern_pdgetpid(td, uap->fd, &cap_pdgetpid_rights, &pid); if (error == 0) error = copyout(&pid, uap->pidp, sizeof(pid)); return (error); } /* * When a new process is forked by pdfork(), a file descriptor is allocated * by the fork code first, then the process is forked, and then we get a * chance to set up the process descriptor. Failure is not permitted at this * point, so procdesc_new() must succeed. */ void procdesc_new(struct proc *p, int flags) { struct procdesc *pd; pd = malloc(sizeof(*pd), M_PROCDESC, M_WAITOK | M_ZERO); pd->pd_proc = p; pd->pd_pid = p->p_pid; p->p_procdesc = pd; pd->pd_flags = 0; if (flags & PD_DAEMON) pd->pd_flags |= PDF_DAEMON; PROCDESC_LOCK_INIT(pd); knlist_init_mtx(&pd->pd_selinfo.si_note, &pd->pd_lock); /* * Process descriptors start out with two references: one from their * struct file, and the other from their struct proc. */ refcount_init(&pd->pd_refcount, 2); } /* * Create a new process decriptor for the process that refers to it. 
*/ int procdesc_falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags, struct filecaps *fcaps) { int fflags; fflags = 0; if (flags & PD_CLOEXEC) fflags = O_CLOEXEC; return (falloc_caps(td, resultfp, resultfd, fflags, fcaps)); } /* * Initialize a file with a process descriptor. */ void procdesc_finit(struct procdesc *pdp, struct file *fp) { finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops); } static void procdesc_free(struct procdesc *pd) { /* * When the last reference is released, we assert that the descriptor * has been closed, but not that the process has exited, as we will * detach the descriptor before the process dies if the descript is * closed, as we can't wait synchronously. */ if (refcount_release(&pd->pd_refcount)) { KASSERT(pd->pd_proc == NULL, ("procdesc_free: pd_proc != NULL")); KASSERT((pd->pd_flags & PDF_CLOSED), ("procdesc_free: !PDF_CLOSED")); knlist_destroy(&pd->pd_selinfo.si_note); PROCDESC_LOCK_DESTROY(pd); free(pd, M_PROCDESC); } } /* * procdesc_exit() - notify a process descriptor that its process is exiting. * We use the proctree_lock to ensure that process exit either happens * strictly before or strictly after a concurrent call to procdesc_close(). */ int procdesc_exit(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL")); pd = p->p_procdesc; PROCDESC_LOCK(pd); KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == p->p_reaper, ("procdesc_exit: closed && parent not reaper")); pd->pd_flags |= PDF_EXITED; pd->pd_xstat = KW_EXITCODE(p->p_xexit, p->p_xsig); /* * If the process descriptor has been closed, then we have nothing * to do; return 1 so that init will get SIGCHLD and do the reaping. * Clean up the procdesc now rather than letting it happen during * that reap. */ if (pd->pd_flags & PDF_CLOSED) { PROCDESC_UNLOCK(pd); pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); return (1); } if (pd->pd_flags & PDF_SELECTED) { pd->pd_flags &= ~PDF_SELECTED; selwakeup(&pd->pd_selinfo); } KNOTE_LOCKED(&pd->pd_selinfo.si_note, NOTE_EXIT); PROCDESC_UNLOCK(pd); return (0); } /* * When a process descriptor is reaped, perhaps as a result of close() or * pdwait4(), release the process's reference on the process descriptor. */ void procdesc_reap(struct proc *p) { struct procdesc *pd; sx_assert(&proctree_lock, SA_XLOCKED); KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL")); pd = p->p_procdesc; pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); } /* * procdesc_close() - last close on a process descriptor. If the process is * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let * its reaper clean up the mess; if not, we have to clean up the zombie * ourselves. */ static int procdesc_close(struct file *fp, struct thread *td) { struct procdesc *pd; struct proc *p; KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc")); pd = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; sx_xlock(&proctree_lock); PROCDESC_LOCK(pd); pd->pd_flags |= PDF_CLOSED; PROCDESC_UNLOCK(pd); p = pd->pd_proc; if (p == NULL) { /* * This is the case where process' exit status was already * collected and procdesc_reap() was already called. */ sx_xunlock(&proctree_lock); } else { PROC_LOCK(p); AUDIT_ARG_PROCESS(p); if (p->p_state == PRS_ZOMBIE) { /* * If the process is already dead and just awaiting * reaping, do that now. 
This will release the * process's reference to the process descriptor when it * calls back into procdesc_reap(). */ proc_reap(curthread, p, NULL, 0); } else { /* * If the process is not yet dead, we need to kill it, * but we can't wait around synchronously for it to go * away, as that path leads to madness (and deadlocks). * First, detach the process from its descriptor so that * its exit status will be reported normally. */ pd->pd_proc = NULL; p->p_procdesc = NULL; procdesc_free(pd); /* * Next, reparent it to its reaper (usually init(8)) so * that there's someone to pick up the pieces; finally, * terminate with prejudice. */ p->p_sigparent = SIGCHLD; if ((p->p_flag & P_TRACED) == 0) { proc_reparent(p, p->p_reaper, true); } else { proc_clear_orphan(p); p->p_oppid = p->p_reaper->p_pid; proc_add_orphan(p, p->p_reaper); } if ((pd->pd_flags & PDF_DAEMON) == 0) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); sx_xunlock(&proctree_lock); } } /* * Release the file descriptor's reference on the process descriptor. */ procdesc_free(pd); return (0); } static int procdesc_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct procdesc *pd; int revents; revents = 0; pd = fp->f_data; PROCDESC_LOCK(pd); if (pd->pd_flags & PDF_EXITED) revents |= POLLHUP; if (revents == 0) { selrecord(td, &pd->pd_selinfo); pd->pd_flags |= PDF_SELECTED; } PROCDESC_UNLOCK(pd); return (revents); } static void procdesc_kqops_detach(struct knote *kn) { struct procdesc *pd; pd = kn->kn_fp->f_data; knlist_remove(&pd->pd_selinfo.si_note, kn, 0); } static int procdesc_kqops_event(struct knote *kn, long hint) { struct procdesc *pd; u_int event; pd = kn->kn_fp->f_data; if (hint == 0) { /* * Initial test after registration. Generate a NOTE_EXIT in * case the process already terminated before registration. */ event = pd->pd_flags & PDF_EXITED ? NOTE_EXIT : 0; } else { /* Mask off extra data. */ event = (u_int)hint & NOTE_PCTRLMASK; } /* If the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* Process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_flags |= EV_EOF | EV_ONESHOT; if (kn->kn_fflags & NOTE_EXIT) kn->kn_data = pd->pd_xstat; if (kn->kn_fflags == 0) kn->kn_flags |= EV_DROP; return (1); } return (kn->kn_fflags != 0); } -static struct filterops procdesc_kqops = { +static const struct filterops procdesc_kqops = { .f_isfd = 1, .f_detach = procdesc_kqops_detach, .f_event = procdesc_kqops_event, }; static int procdesc_kqfilter(struct file *fp, struct knote *kn) { struct procdesc *pd; pd = fp->f_data; switch (kn->kn_filter) { case EVFILT_PROCDESC: kn->kn_fop = &procdesc_kqops; kn->kn_flags |= EV_CLEAR; knlist_add(&pd->pd_selinfo.si_note, kn, 0); return (0); default: return (EINVAL); } } static int procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct procdesc *pd; struct timeval pstart, boottime; /* * XXXRW: Perhaps we should cache some more information from the * process so that we can return it reliably here even after it has * died. For example, caching its credential data. */ bzero(sb, sizeof(*sb)); pd = fp->f_data; sx_slock(&proctree_lock); if (pd->pd_proc != NULL) { PROC_LOCK(pd->pd_proc); AUDIT_ARG_PROCESS(pd->pd_proc); /* Set birth and [acm] times to process start time. 
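 *
 * p_start is recorded relative to boot, hence the boottime
 * addition below before converting it to a timespec.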
*/ pstart = pd->pd_proc->p_stats->p_start; getboottime(&boottime); timevaladd(&pstart, &boottime); TIMEVAL_TO_TIMESPEC(&pstart, &sb->st_birthtim); sb->st_atim = sb->st_birthtim; sb->st_ctim = sb->st_birthtim; sb->st_mtim = sb->st_birthtim; if (pd->pd_proc->p_state != PRS_ZOMBIE) sb->st_mode = S_IFREG | S_IRWXU; else sb->st_mode = S_IFREG; sb->st_uid = pd->pd_proc->p_ucred->cr_ruid; sb->st_gid = pd->pd_proc->p_ucred->cr_rgid; PROC_UNLOCK(pd->pd_proc); } else sb->st_mode = S_IFREG; sx_sunlock(&proctree_lock); return (0); } static int procdesc_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct procdesc *pdp; kif->kf_type = KF_TYPE_PROCDESC; pdp = fp->f_data; kif->kf_un.kf_proc.kf_pid = pdp->pd_pid; return (0); } static int procdesc_cmp(struct file *fp1, struct file *fp2, struct thread *td) { struct procdesc *pdp1, *pdp2; if (fp2->f_type != DTYPE_PROCDESC) return (3); pdp1 = fp1->f_data; pdp2 = fp2->f_data; return (kcmp_cmp((uintptr_t)pdp1->pd_pid, (uintptr_t)pdp2->pd_pid)); } diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c index 58891b0de000..ca7ead961e68 100644 --- a/sys/kern/sys_socket.c +++ b/sys/kern/sys_socket.c @@ -1,855 +1,855 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern_ipc, OID_AUTO, aio, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "socket AIO stats"); static int empty_results; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_results, CTLFLAG_RD, &empty_results, 0, "socket operation returned EAGAIN"); static int empty_retries; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_retries, CTLFLAG_RD, &empty_retries, 0, "socket operation retries"); static fo_rdwr_t soo_read; static fo_rdwr_t soo_write; static fo_ioctl_t soo_ioctl; static fo_poll_t soo_poll; extern fo_kqfilter_t soo_kqfilter; static fo_stat_t soo_stat; static fo_close_t soo_close; static fo_chmod_t soo_chmod; static fo_fill_kinfo_t soo_fill_kinfo; static fo_aio_queue_t soo_aio_queue; static void soo_aio_cancel(struct kaiocb *job); -struct fileops socketops = { +const struct fileops socketops = { .fo_read = soo_read, .fo_write = soo_write, .fo_truncate = invfo_truncate, .fo_ioctl = soo_ioctl, .fo_poll = soo_poll, .fo_kqfilter = soo_kqfilter, .fo_stat = soo_stat, .fo_close = soo_close, .fo_chmod = soo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = soo_fill_kinfo, .fo_aio_queue = soo_aio_queue, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE }; static int soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_receive(active_cred, so); if (error) return (error); #endif error = soreceive(so, 0, uio, 0, 0, 0); return (error); } static int soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_send(active_cred, so); if (error) return (error); #endif error = sousrsend(so, NULL, uio, NULL, 0, NULL); return (error); } static int soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; int error = 0; switch (cmd) { case FIONBIO: SOCK_LOCK(so); if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; SOCK_UNLOCK(so); break; case FIOASYNC: if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; if (SOLISTENING(so)) { so->sol_sbrcv_flags |= SB_ASYNC; so->sol_sbsnd_flags |= SB_ASYNC; } else { SOCK_RECVBUF_LOCK(so); so->so_rcv.sb_flags |= SB_ASYNC; SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_LOCK(so); so->so_snd.sb_flags |= SB_ASYNC; SOCK_SENDBUF_UNLOCK(so); } SOCK_UNLOCK(so); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; if (SOLISTENING(so)) { so->sol_sbrcv_flags &= ~SB_ASYNC; so->sol_sbsnd_flags &= ~SB_ASYNC; } else { SOCK_RECVBUF_LOCK(so); so->so_rcv.sb_flags &= ~SB_ASYNC; SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_LOCK(so); so->so_snd.sb_flags &= ~SB_ASYNC; SOCK_SENDBUF_UNLOCK(so); } SOCK_UNLOCK(so); } break; case FIONREAD: SOCK_RECVBUF_LOCK(so); if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; } SOCK_RECVBUF_UNLOCK(so); break; case FIONWRITE: /* Unlocked read. 
*/ if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = sbavail(&so->so_snd); } break; case FIONSPACE: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) || (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) { *(int *)data = 0; } else { *(int *)data = sbspace(&so->so_snd); } } break; case FIOSETOWN: error = fsetown(*(int *)data, &so->so_sigio); break; case FIOGETOWN: *(int *)data = fgetown(&so->so_sigio); break; case SIOCSPGRP: error = fsetown(-(*(int *)data), &so->so_sigio); break; case SIOCGPGRP: *(int *)data = -fgetown(&so->so_sigio); break; case SIOCATMARK: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0; } break; default: /* * Interface/routing/protocol specific ioctls: interface and * routing ioctls should have a different entry since a * socket is unnecessary. */ if (IOCGROUP(cmd) == 'i') error = ifioctl(so, cmd, data, td); else if (IOCGROUP(cmd) == 'r') { CURVNET_SET(so->so_vnet); error = rtioctl_fib(cmd, data, so->so_fibnum); CURVNET_RESTORE(); } else { CURVNET_SET(so->so_vnet); error = so->so_proto->pr_control(so, cmd, data, 0, td); CURVNET_RESTORE(); } break; } return (error); } static int soo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; #ifdef MAC int error; error = mac_socket_check_poll(active_cred, so); if (error) return (error); #endif return (sopoll(so, events, fp->f_cred, td)); } static int soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred) { struct socket *so = fp->f_data; int error = 0; bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFSOCK; #ifdef MAC error = mac_socket_check_stat(active_cred, so); if (error) return (error); #endif SOCK_LOCK(so); if (!SOLISTENING(so)) { struct sockbuf *sb; /* * If SBS_CANTRCVMORE is set, but there's still data left * in the receive buffer, the socket is still readable. */ sb = &so->so_rcv; SOCK_RECVBUF_LOCK(so); if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb)) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; ub->st_size = sbavail(sb) - sb->sb_ctl; SOCK_RECVBUF_UNLOCK(so); sb = &so->so_snd; SOCK_SENDBUF_LOCK(so); if ((sb->sb_state & SBS_CANTSENDMORE) == 0) ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; SOCK_SENDBUF_UNLOCK(so); } ub->st_uid = so->so_cred->cr_uid; ub->st_gid = so->so_cred->cr_gid; if (so->so_proto->pr_sense) error = so->so_proto->pr_sense(so, ub); SOCK_UNLOCK(so); return (error); } /* * API socket close on file pointer. We call soclose() to close the socket * (including initiating closing protocols). soclose() will sorele() the * file reference but the actual socket will not go away until the socket's * ref count hits 0. 
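 *
 * (Pending socket AIO, for example, takes its own reference via
 * soref() in sowakeup_aio() and only releases it from
 * soaio_process_sb(), so the socket can outlive its struct file.)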
*/ static int soo_close(struct file *fp, struct thread *td) { int error = 0; struct socket *so; so = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; if (so) error = soclose(so); return (error); } static int soo_chmod(struct file *fp, mode_t mode, struct ucred *cred, struct thread *td) { struct socket *so; int error; so = fp->f_data; if (so->so_proto->pr_chmod != NULL) error = so->so_proto->pr_chmod(so, mode, cred, td); else error = EINVAL; return (error); } static int soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct sockaddr *sa; struct inpcb *inpcb; struct unpcb *unpcb; struct socket *so; int error; kif->kf_type = KF_TYPE_SOCKET; so = fp->f_data; CURVNET_SET(so->so_vnet); kif->kf_un.kf_sock.kf_sock_domain0 = so->so_proto->pr_domain->dom_family; kif->kf_un.kf_sock.kf_sock_type0 = so->so_type; kif->kf_un.kf_sock.kf_sock_protocol0 = so->so_proto->pr_protocol; kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb; switch (kif->kf_un.kf_sock.kf_sock_domain0) { case AF_INET: case AF_INET6: if (so->so_pcb != NULL) { inpcb = (struct inpcb *)(so->so_pcb); kif->kf_un.kf_sock.kf_sock_inpcb = (uintptr_t)inpcb->inp_ppcb; } kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); break; case AF_UNIX: if (so->so_pcb != NULL) { unpcb = (struct unpcb *)(so->so_pcb); if (unpcb->unp_conn) { kif->kf_un.kf_sock.kf_sock_unpconn = (uintptr_t)unpcb->unp_conn; kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); } } break; } error = so->so_proto->pr_sockaddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_local)) { bcopy(sa, &kif->kf_un.kf_sock.kf_sa_local, sa->sa_len); free(sa, M_SONAME); } error = so->so_proto->pr_peeraddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_peer)) { bcopy(sa, &kif->kf_un.kf_sock.kf_sa_peer, sa->sa_len); free(sa, M_SONAME); } strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name, sizeof(kif->kf_path)); CURVNET_RESTORE(); return (0); } /* * Use the 'backend3' field in AIO jobs to store the amount of data * completed by the AIO job so far. 
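 *
 * soaio_process_job() adds the bytes moved by each soreceive() or
 * sousrsend() attempt to this counter, so a request that is retried
 * after a partial transfer finally completes with the total amount
 * moved rather than just the last attempt's.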
*/ #define aio_done backend3 static STAILQ_HEAD(, task) soaio_jobs; static struct mtx soaio_jobs_lock; static struct task soaio_kproc_task; static int soaio_starting, soaio_idle, soaio_queued; static struct unrhdr *soaio_kproc_unr; static int soaio_max_procs = MAX_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, max_procs, CTLFLAG_RW, &soaio_max_procs, 0, "Maximum number of kernel processes to use for async socket IO"); static int soaio_num_procs; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, num_procs, CTLFLAG_RD, &soaio_num_procs, 0, "Number of active kernel processes for async socket IO"); static int soaio_target_procs = TARGET_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, target_procs, CTLFLAG_RD, &soaio_target_procs, 0, "Preferred number of ready kernel processes for async socket IO"); static int soaio_lifetime; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, lifetime, CTLFLAG_RW, &soaio_lifetime, 0, "Maximum lifetime for idle aiod"); static void soaio_kproc_loop(void *arg) { struct proc *p; struct vmspace *myvm; struct task *task; int error, id, pending; id = (intptr_t)arg; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = curproc; myvm = vmspace_acquire_ref(p); mtx_lock(&soaio_jobs_lock); MPASS(soaio_starting > 0); soaio_starting--; for (;;) { while (!STAILQ_EMPTY(&soaio_jobs)) { task = STAILQ_FIRST(&soaio_jobs); STAILQ_REMOVE_HEAD(&soaio_jobs, ta_link); soaio_queued--; pending = task->ta_pending; task->ta_pending = 0; mtx_unlock(&soaio_jobs_lock); task->ta_func(task->ta_context, pending); mtx_lock(&soaio_jobs_lock); } MPASS(soaio_queued == 0); if (p->p_vmspace != myvm) { mtx_unlock(&soaio_jobs_lock); vmspace_switch_aio(myvm); mtx_lock(&soaio_jobs_lock); continue; } soaio_idle++; error = mtx_sleep(&soaio_idle, &soaio_jobs_lock, 0, "-", soaio_lifetime); soaio_idle--; if (error == EWOULDBLOCK && STAILQ_EMPTY(&soaio_jobs) && soaio_num_procs > soaio_target_procs) break; } soaio_num_procs--; mtx_unlock(&soaio_jobs_lock); free_unr(soaio_kproc_unr, id); kproc_exit(0); } static void soaio_kproc_create(void *context, int pending) { struct proc *p; int error, id; mtx_lock(&soaio_jobs_lock); for (;;) { if (soaio_num_procs < soaio_target_procs) { /* Must create */ } else if (soaio_num_procs >= soaio_max_procs) { /* * Hit the limit on kernel processes, don't * create another one. */ break; } else if (soaio_queued <= soaio_idle + soaio_starting) { /* * No more AIO jobs waiting for a process to be * created, so stop. 
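 *
 * Taken together, the checks above grow the pool unconditionally up
 * to soaio_target_procs, never beyond soaio_max_procs, and in
 * between only while there are more queued jobs than idle or
 * starting processes available to run them.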
*/ break; } soaio_starting++; mtx_unlock(&soaio_jobs_lock); id = alloc_unr(soaio_kproc_unr); error = kproc_create(soaio_kproc_loop, (void *)(intptr_t)id, &p, 0, 0, "soaiod%d", id); if (error != 0) { free_unr(soaio_kproc_unr, id); mtx_lock(&soaio_jobs_lock); soaio_starting--; break; } mtx_lock(&soaio_jobs_lock); soaio_num_procs++; } mtx_unlock(&soaio_jobs_lock); } void soaio_enqueue(struct task *task) { mtx_lock(&soaio_jobs_lock); MPASS(task->ta_pending == 0); task->ta_pending++; STAILQ_INSERT_TAIL(&soaio_jobs, task, ta_link); soaio_queued++; if (soaio_queued <= soaio_idle) wakeup_one(&soaio_idle); else if (soaio_num_procs < soaio_max_procs) taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task); mtx_unlock(&soaio_jobs_lock); } static void soaio_init(void) { soaio_lifetime = AIOD_LIFETIME_DEFAULT; STAILQ_INIT(&soaio_jobs); mtx_init(&soaio_jobs_lock, "soaio jobs", NULL, MTX_DEF); soaio_kproc_unr = new_unrhdr(1, INT_MAX, NULL); TASK_INIT(&soaio_kproc_task, 0, soaio_kproc_create, NULL); } SYSINIT(soaio, SI_SUB_VFS, SI_ORDER_ANY, soaio_init, NULL); static __inline int soaio_ready(struct socket *so, struct sockbuf *sb) { return (sb == &so->so_rcv ? soreadable(so) : sowriteable(so)); } static void soaio_process_job(struct socket *so, sb_which which, struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct sockbuf *sb = sobuf(so, which); #ifdef MAC struct file *fp = job->fd_file; #endif size_t cnt, done, job_total_nbytes __diagused; long ru_before; int error, flags; SOCK_BUF_UNLOCK(so, which); aio_switch_vmspace(job); td = curthread; retry: td_savedcred = td->td_ucred; td->td_ucred = job->cred; job_total_nbytes = job->uiop->uio_resid + job->aio_done; done = job->aio_done; cnt = job->uiop->uio_resid; job->uiop->uio_offset = 0; job->uiop->uio_td = td; flags = MSG_NBIO; /* * For resource usage accounting, only count a completed request * as a single message to avoid counting multiple calls to * sosend/soreceive on a blocking socket. */ if (sb == &so->so_rcv) { ru_before = td->td_ru.ru_msgrcv; #ifdef MAC error = mac_socket_check_receive(fp->f_cred, so); if (error == 0) #endif error = soreceive(so, NULL, job->uiop, NULL, NULL, &flags); if (td->td_ru.ru_msgrcv != ru_before) job->msgrcv = 1; } else { if (!TAILQ_EMPTY(&sb->sb_aiojobq)) flags |= MSG_MORETOCOME; ru_before = td->td_ru.ru_msgsnd; #ifdef MAC error = mac_socket_check_send(fp->f_cred, so); if (error == 0) #endif error = sousrsend(so, NULL, job->uiop, NULL, flags, job->userproc); if (td->td_ru.ru_msgsnd != ru_before) job->msgsnd = 1; } done += cnt - job->uiop->uio_resid; job->aio_done = done; td->td_ucred = td_savedcred; if (error == EWOULDBLOCK) { /* * The request was either partially completed or not * completed at all due to racing with a read() or * write() on the socket. If the socket is * non-blocking, return with any partial completion. * If the socket is blocking or if no progress has * been made, requeue this request at the head of the * queue to try again when the socket is ready. 
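 *
 * Summarised as a decision table:
 *
 *	partial progress, non-blocking socket	finish with the partial count
 *	otherwise, socket ready again		retry the transfer at once
 *	otherwise				requeue at the head and wait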
*/ MPASS(done != job_total_nbytes); SOCK_BUF_LOCK(so, which); if (done == 0 || !(so->so_state & SS_NBIO)) { empty_results++; if (soaio_ready(so, sb)) { empty_retries++; SOCK_BUF_UNLOCK(so, which); goto retry; } if (!aio_set_cancel_function(job, soo_aio_cancel)) { SOCK_BUF_UNLOCK(so, which); if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); SOCK_BUF_LOCK(so, which); } else { TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list); } return; } SOCK_BUF_UNLOCK(so, which); } if (done != 0 && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error) aio_complete(job, -1, error); else aio_complete(job, done, 0); SOCK_BUF_LOCK(so, which); } static void soaio_process_sb(struct socket *so, sb_which which) { struct kaiocb *job; struct sockbuf *sb = sobuf(so, which); CURVNET_SET(so->so_vnet); SOCK_BUF_LOCK(so, which); while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) { job = TAILQ_FIRST(&sb->sb_aiojobq); TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (!aio_clear_cancel_function(job)) continue; soaio_process_job(so, which, job); } /* * If there are still pending requests, the socket must not be * ready so set SB_AIO to request a wakeup when the socket * becomes ready. */ if (!TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags |= SB_AIO; sb->sb_flags &= ~SB_AIO_RUNNING; SOCK_BUF_UNLOCK(so, which); sorele(so); CURVNET_RESTORE(); } void soaio_rcv(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, SO_RCV); } void soaio_snd(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, SO_SND); } void sowakeup_aio(struct socket *so, sb_which which) { struct sockbuf *sb = sobuf(so, which); SOCK_BUF_LOCK_ASSERT(so, which); sb->sb_flags &= ~SB_AIO; if (sb->sb_flags & SB_AIO_RUNNING) return; sb->sb_flags |= SB_AIO_RUNNING; soref(so); soaio_enqueue(&sb->sb_aiotask); } static void soo_aio_cancel(struct kaiocb *job) { struct socket *so; struct sockbuf *sb; long done; int opcode; sb_which which; so = job->fd_file->f_data; opcode = job->uaiocb.aio_lio_opcode; if (opcode & LIO_READ) { sb = &so->so_rcv; which = SO_RCV; } else { MPASS(opcode & LIO_WRITE); sb = &so->so_snd; which = SO_SND; } SOCK_BUF_LOCK(so, which); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags &= ~SB_AIO; SOCK_BUF_UNLOCK(so, which); done = job->aio_done; if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); } static int soo_aio_queue(struct file *fp, struct kaiocb *job) { struct socket *so; struct sockbuf *sb; sb_which which; int error; so = fp->f_data; error = so->so_proto->pr_aio_queue(so, job); if (error == 0) return (0); /* Lock through the socket, since this may be a listening socket. 
*/ switch (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) { case LIO_READ: SOCK_RECVBUF_LOCK(so); sb = &so->so_rcv; which = SO_RCV; break; case LIO_WRITE: SOCK_SENDBUF_LOCK(so); sb = &so->so_snd; which = SO_SND; break; default: return (EINVAL); } if (SOLISTENING(so)) { SOCK_BUF_UNLOCK(so, which); return (EINVAL); } if (!aio_set_cancel_function(job, soo_aio_cancel)) panic("new job was cancelled"); TAILQ_INSERT_TAIL(&sb->sb_aiojobq, job, list); if (!(sb->sb_flags & SB_AIO_RUNNING)) { if (soaio_ready(so, sb)) sowakeup_aio(so, which); else sb->sb_flags |= SB_AIO; } SOCK_BUF_UNLOCK(so, which); return (0); } diff --git a/sys/kern/sys_timerfd.c b/sys/kern/sys_timerfd.c index 8ac5b845f7ac..ab7e048a2ab1 100644 --- a/sys/kern/sys_timerfd.c +++ b/sys/kern/sys_timerfd.c @@ -1,609 +1,609 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2014 Dmitry Chagin * Copyright (c) 2023 Jake Freeland * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_TIMERFD, "timerfd", "timerfd structures"); static struct mtx timerfd_list_lock; static LIST_HEAD(, timerfd) timerfd_list; MTX_SYSINIT(timerfd, &timerfd_list_lock, "timerfd_list_lock", MTX_DEF); static struct unrhdr64 tfdino_unr; #define TFD_NOJUMP 0 /* Realtime clock has not jumped. */ #define TFD_READ 1 /* Jumped, tfd has been read since. */ #define TFD_ZREAD 2 /* Jumped backwards, CANCEL_ON_SET=false. */ #define TFD_CANCELED 4 /* Jumped, CANCEL_ON_SET=true. */ #define TFD_JUMPED (TFD_ZREAD | TFD_CANCELED) /* * One structure allocated per timerfd descriptor. * * Locking semantics: * (t) locked by tfd_lock mtx * (l) locked by timerfd_list_lock sx * (c) const until freeing */ struct timerfd { /* User specified. */ struct itimerspec tfd_time; /* (t) tfd timer */ clockid_t tfd_clockid; /* (c) timing base */ int tfd_flags; /* (c) creation flags */ int tfd_timflags; /* (t) timer flags */ /* Used internally. 
	 */
	timerfd_t	tfd_count;	/* (t) expiration count since read */
	bool		tfd_expired;	/* (t) true upon initial expiration */
	struct mtx	tfd_lock;	/* tfd mtx lock */
	struct callout	tfd_callout;	/* (t) expiration notification */
	struct selinfo	tfd_sel;	/* (t) I/O alerts */
	struct timespec	tfd_boottim;	/* (t) cached boottime */
	int		tfd_jumped;	/* (t) timer jump status */
	LIST_ENTRY(timerfd) entry;	/* (l) entry in list */

	/* For stat(2). */
	ino_t		tfd_ino;	/* (c) inode number */
	struct timespec	tfd_atim;	/* (t) time of last read */
	struct timespec	tfd_mtim;	/* (t) time of last settime */
	struct timespec	tfd_birthtim;	/* (c) creation time */
};

static void
timerfd_init(void *data)
{
	new_unrhdr64(&tfdino_unr, 1);
}
SYSINIT(timerfd, SI_SUB_VFS, SI_ORDER_ANY, timerfd_init, NULL);

static inline void
timerfd_getboottime(struct timespec *ts)
{
	struct timeval tv;

	getboottime(&tv);
	TIMEVAL_TO_TIMESPEC(&tv, ts);
}

/*
 * Call when a discontinuous jump has occurred in CLOCK_REALTIME and
 * update timerfd's cached boottime. A jump can be triggered using
 * functions like clock_settime(2) or settimeofday(2).
 *
 * Timer is marked TFD_CANCELED if TFD_TIMER_CANCEL_ON_SET is set
 * and the realtime clock jumps.
 * Timer is marked TFD_ZREAD if TFD_TIMER_CANCEL_ON_SET is not set,
 * but the realtime clock jumps backwards.
 */
void
timerfd_jumped(void)
{
	struct timerfd *tfd;
	struct timespec boottime, diff;

	if (LIST_EMPTY(&timerfd_list))
		return;

	timerfd_getboottime(&boottime);
	mtx_lock(&timerfd_list_lock);
	LIST_FOREACH(tfd, &timerfd_list, entry) {
		mtx_lock(&tfd->tfd_lock);
		if (tfd->tfd_clockid != CLOCK_REALTIME ||
		    (tfd->tfd_timflags & TFD_TIMER_ABSTIME) == 0 ||
		    timespeccmp(&boottime, &tfd->tfd_boottim, ==)) {
			mtx_unlock(&tfd->tfd_lock);
			continue;
		}

		if (callout_active(&tfd->tfd_callout)) {
			if ((tfd->tfd_timflags & TFD_TIMER_CANCEL_ON_SET) != 0)
				tfd->tfd_jumped = TFD_CANCELED;
			else if (timespeccmp(&boottime, &tfd->tfd_boottim, <))
				tfd->tfd_jumped = TFD_ZREAD;

			/*
			 * Do not reschedule callout when
			 * inside interval time loop.
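As a hedged illustration of the TFD_CANCELED path above (an editorial aside; it assumes the timerfd interface declared in <sys/timerfd.h>, and the function name is illustrative): an absolute CLOCK_REALTIME timer armed with TFD_TIMER_CANCEL_ON_SET turns a step of the wall clock into an ECANCELED error on the pending read.

/*
 * Hedged sketch: if clock_settime(2) or settimeofday(2) steps
 * CLOCK_REALTIME before this absolute timer fires, read(2) fails with
 * ECANCELED (the TFD_CANCELED case handled by timerfd_jumped()).
 */
#include <sys/timerfd.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int
wait_until(time_t when)
{
	struct itimerspec its = { .it_value.tv_sec = when };
	uint64_t expirations;
	int fd;

	fd = timerfd_create(CLOCK_REALTIME, 0);
	if (fd == -1)
		return (-1);
	if (timerfd_settime(fd,
	    TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET, &its, NULL) == -1)
		return (-1);
	if (read(fd, &expirations, sizeof(expirations)) == -1 &&
	    errno == ECANCELED)
		fprintf(stderr, "CLOCK_REALTIME was stepped; timer canceled\n");
	return (close(fd));
}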
*/ if (!tfd->tfd_expired) { timespecsub(&boottime, &tfd->tfd_boottim, &diff); timespecsub(&tfd->tfd_time.it_value, &diff, &tfd->tfd_time.it_value); if (callout_stop(&tfd->tfd_callout) == 1) { callout_schedule_sbt(&tfd->tfd_callout, tstosbt(tfd->tfd_time.it_value), 0, C_ABSOLUTE); } } } tfd->tfd_boottim = boottime; mtx_unlock(&tfd->tfd_lock); } mtx_unlock(&timerfd_list_lock); } static int timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct timerfd *tfd = fp->f_data; timerfd_t count; int error = 0; if (uio->uio_resid < sizeof(timerfd_t)) return (EINVAL); mtx_lock(&tfd->tfd_lock); retry: getnanotime(&tfd->tfd_atim); if ((tfd->tfd_jumped & TFD_JUMPED) != 0) { if (tfd->tfd_jumped == TFD_CANCELED) error = ECANCELED; tfd->tfd_jumped = TFD_READ; tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); return (error); } else { tfd->tfd_jumped = TFD_NOJUMP; } if (tfd->tfd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&tfd->tfd_lock); return (EAGAIN); } td->td_rtcgen = atomic_load_acq_int(&rtc_generation); error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "tfdrd", 0); if (error == 0) { goto retry; } else { mtx_unlock(&tfd->tfd_lock); return (error); } } count = tfd->tfd_count; tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); error = uiomove(&count, sizeof(timerfd_t), uio); return (error); } static int timerfd_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { switch (cmd) { case FIOASYNC: if (*(int *)data != 0) atomic_set_int(&fp->f_flag, FASYNC); else atomic_clear_int(&fp->f_flag, FASYNC); return (0); case FIONBIO: if (*(int *)data != 0) atomic_set_int(&fp->f_flag, FNONBLOCK); else atomic_clear_int(&fp->f_flag, FNONBLOCK); return (0); } return (ENOTTY); } static int timerfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct timerfd *tfd = fp->f_data; int revents = 0; mtx_lock(&tfd->tfd_lock); if ((events & (POLLIN | POLLRDNORM)) != 0 && tfd->tfd_count > 0 && tfd->tfd_jumped != TFD_READ) revents |= events & (POLLIN | POLLRDNORM); if (revents == 0) selrecord(td, &tfd->tfd_sel); mtx_unlock(&tfd->tfd_lock); return (revents); } static void filt_timerfddetach(struct knote *kn) { struct timerfd *tfd = kn->kn_hook; mtx_lock(&tfd->tfd_lock); knlist_remove(&tfd->tfd_sel.si_note, kn, 1); mtx_unlock(&tfd->tfd_lock); } static int filt_timerfdread(struct knote *kn, long hint) { struct timerfd *tfd = kn->kn_hook; mtx_assert(&tfd->tfd_lock, MA_OWNED); kn->kn_data = (int64_t)tfd->tfd_count; return (tfd->tfd_count > 0); } -static struct filterops timerfd_rfiltops = { +static const struct filterops timerfd_rfiltops = { .f_isfd = 1, .f_detach = filt_timerfddetach, .f_event = filt_timerfdread, }; static int timerfd_kqfilter(struct file *fp, struct knote *kn) { struct timerfd *tfd = fp->f_data; if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_fop = &timerfd_rfiltops; kn->kn_hook = tfd; knlist_add(&tfd->tfd_sel.si_note, kn, 0); return (0); } static int timerfd_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct timerfd *tfd = fp->f_data; bzero(sb, sizeof(*sb)); sb->st_nlink = fp->f_count - 1; sb->st_uid = fp->f_cred->cr_uid; sb->st_gid = fp->f_cred->cr_gid; sb->st_blksize = PAGE_SIZE; mtx_lock(&tfd->tfd_lock); sb->st_atim = tfd->tfd_atim; sb->st_mtim = tfd->tfd_mtim; mtx_unlock(&tfd->tfd_lock); sb->st_ctim = sb->st_mtim; sb->st_ino = tfd->tfd_ino; sb->st_birthtim = tfd->tfd_birthtim; return (0); } static int timerfd_close(struct file 
*fp, struct thread *td) { struct timerfd *tfd = fp->f_data; mtx_lock(&timerfd_list_lock); LIST_REMOVE(tfd, entry); mtx_unlock(&timerfd_list_lock); callout_drain(&tfd->tfd_callout); seldrain(&tfd->tfd_sel); knlist_destroy(&tfd->tfd_sel.si_note); mtx_destroy(&tfd->tfd_lock); free(tfd, M_TIMERFD); fp->f_ops = &badfileops; return (0); } static int timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct timerfd *tfd = fp->f_data; kif->kf_type = KF_TYPE_TIMERFD; kif->kf_un.kf_timerfd.kf_timerfd_clockid = tfd->tfd_clockid; kif->kf_un.kf_timerfd.kf_timerfd_flags = tfd->tfd_flags; kif->kf_un.kf_timerfd.kf_timerfd_addr = (uintptr_t)tfd; return (0); } -static struct fileops timerfdops = { +static const struct fileops timerfdops = { .fo_read = timerfd_read, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = timerfd_ioctl, .fo_poll = timerfd_poll, .fo_kqfilter = timerfd_kqfilter, .fo_stat = timerfd_stat, .fo_close = timerfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = timerfd_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE, }; static void timerfd_curval(struct timerfd *tfd, struct itimerspec *old_value) { struct timespec curr_value; mtx_assert(&tfd->tfd_lock, MA_OWNED); *old_value = tfd->tfd_time; if (timespecisset(&tfd->tfd_time.it_value)) { nanouptime(&curr_value); timespecsub(&tfd->tfd_time.it_value, &curr_value, &old_value->it_value); } } static void timerfd_expire(void *arg) { struct timerfd *tfd = (struct timerfd *)arg; struct timespec uptime; ++tfd->tfd_count; tfd->tfd_expired = true; if (timespecisset(&tfd->tfd_time.it_interval)) { /* Count missed events. */ nanouptime(&uptime); if (timespeccmp(&uptime, &tfd->tfd_time.it_value, >)) { timespecsub(&uptime, &tfd->tfd_time.it_value, &uptime); tfd->tfd_count += tstosbt(uptime) / tstosbt(tfd->tfd_time.it_interval); } timespecadd(&tfd->tfd_time.it_value, &tfd->tfd_time.it_interval, &tfd->tfd_time.it_value); callout_schedule_sbt(&tfd->tfd_callout, tstosbt(tfd->tfd_time.it_value), 0, C_ABSOLUTE); } else { /* Single shot timer. 
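A worked example of the missed-expiration accounting in timerfd_expire() above (a hedged reading of the code, not a statement made by the change itself): with it_interval = 10 ms, if the callout runs 35 ms after the scheduled it_value, the handler adds 1 for the expiration itself plus 35 / 10 = 3 catch-up periods from the integer tstosbt() division, so tfd_count grows by 4 before it_value is advanced by a single interval and the callout is rescheduled.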
*/ callout_deactivate(&tfd->tfd_callout); timespecclear(&tfd->tfd_time.it_value); } wakeup(&tfd->tfd_count); selwakeup(&tfd->tfd_sel); KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); } int kern_timerfd_create(struct thread *td, int clockid, int flags) { struct file *fp; struct timerfd *tfd; int error, fd, fflags; AUDIT_ARG_VALUE(clockid); AUDIT_ARG_FFLAGS(flags); switch (clockid) { case CLOCK_REALTIME: /* FALLTHROUGH */ case CLOCK_MONOTONIC: /* FALLTHROUGH */ case CLOCK_UPTIME: /* * CLOCK_BOOTTIME should be added once different from * CLOCK_UPTIME */ break; default: return (EINVAL); } if ((flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) != 0) return (EINVAL); fflags = FREAD; if ((flags & TFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; if ((flags & TFD_NONBLOCK) != 0) fflags |= FNONBLOCK; error = falloc(td, &fp, &fd, fflags); if (error != 0) return (error); tfd = malloc(sizeof(*tfd), M_TIMERFD, M_WAITOK | M_ZERO); tfd->tfd_clockid = (clockid_t)clockid; tfd->tfd_flags = flags; tfd->tfd_ino = alloc_unr64(&tfdino_unr); mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); timerfd_getboottime(&tfd->tfd_boottim); getnanotime(&tfd->tfd_birthtim); mtx_lock(&timerfd_list_lock); LIST_INSERT_HEAD(&timerfd_list, tfd, entry); mtx_unlock(&timerfd_list_lock); finit(fp, fflags, DTYPE_TIMERFD, tfd, &timerfdops); fdrop(fp, td); td->td_retval[0] = fd; return (0); } int kern_timerfd_gettime(struct thread *td, int fd, struct itimerspec *curr_value) { struct file *fp; struct timerfd *tfd; int error; error = fget(td, fd, &cap_write_rights, &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_TIMERFD) { fdrop(fp, td); return (EINVAL); } tfd = fp->f_data; mtx_lock(&tfd->tfd_lock); timerfd_curval(tfd, curr_value); mtx_unlock(&tfd->tfd_lock); fdrop(fp, td); return (0); } int kern_timerfd_settime(struct thread *td, int fd, int flags, const struct itimerspec *new_value, struct itimerspec *old_value) { struct file *fp; struct timerfd *tfd; struct timespec ts; int error = 0; if ((flags & ~(TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)) != 0) return (EINVAL); if (!timespecvalid_interval(&new_value->it_value) || !timespecvalid_interval(&new_value->it_interval)) return (EINVAL); error = fget(td, fd, &cap_write_rights, &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_TIMERFD) { fdrop(fp, td); return (EINVAL); } tfd = fp->f_data; mtx_lock(&tfd->tfd_lock); getnanotime(&tfd->tfd_mtim); tfd->tfd_timflags = flags; /* Store old itimerspec, if applicable. */ if (old_value != NULL) timerfd_curval(tfd, old_value); /* Set new expiration. */ tfd->tfd_time = *new_value; if (timespecisset(&tfd->tfd_time.it_value)) { if ((flags & TFD_TIMER_ABSTIME) == 0) { nanouptime(&ts); timespecadd(&tfd->tfd_time.it_value, &ts, &tfd->tfd_time.it_value); } else if (tfd->tfd_clockid == CLOCK_REALTIME) { /* ECANCELED if unread jump is pending. */ if (tfd->tfd_jumped == TFD_CANCELED) error = ECANCELED; /* Convert from CLOCK_REALTIME to CLOCK_BOOTTIME. 
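The syscall-facing side of the code above is easiest to see from userspace. A hedged sketch of a periodic timer whose read(2) reports the number of expirations accumulated since the previous read (the tfd_count handling above); the function name is illustrative:

/*
 * Hedged sketch: a 10 ms periodic timer on CLOCK_MONOTONIC.  Each read(2)
 * blocks until at least one expiration is pending and returns the count.
 */
#include <sys/timerfd.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
tick_loop(void)
{
	struct itimerspec its = {
		.it_value.tv_nsec = 10000000,		/* first fire in 10 ms */
		.it_interval.tv_nsec = 10000000,	/* then every 10 ms */
	};
	uint64_t expirations;
	int fd, i;

	fd = timerfd_create(CLOCK_MONOTONIC, 0);
	if (fd == -1)
		return (-1);
	if (timerfd_settime(fd, 0, &its, NULL) == -1)
		return (-1);
	for (i = 0; i < 100; i++) {
		if (read(fd, &expirations, sizeof(expirations)) !=
		    sizeof(expirations))
			return (-1);
		printf("%ju expiration(s)\n", (uintmax_t)expirations);
	}
	return (close(fd));
}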
*/ timespecsub(&tfd->tfd_time.it_value, &tfd->tfd_boottim, &tfd->tfd_time.it_value); } callout_reset_sbt(&tfd->tfd_callout, tstosbt(tfd->tfd_time.it_value), 0, timerfd_expire, tfd, C_ABSOLUTE); } else { callout_stop(&tfd->tfd_callout); } tfd->tfd_count = 0; tfd->tfd_expired = false; tfd->tfd_jumped = TFD_NOJUMP; mtx_unlock(&tfd->tfd_lock); fdrop(fp, td); return (error); } int sys_timerfd_create(struct thread *td, struct timerfd_create_args *uap) { return (kern_timerfd_create(td, uap->clockid, uap->flags)); } int sys_timerfd_gettime(struct thread *td, struct timerfd_gettime_args *uap) { struct itimerspec curr_value; int error; error = kern_timerfd_gettime(td, uap->fd, &curr_value); if (error == 0) error = copyout(&curr_value, uap->curr_value, sizeof(curr_value)); return (error); } int sys_timerfd_settime(struct thread *td, struct timerfd_settime_args *uap) { struct itimerspec new_value, old_value; int error; error = copyin(uap->new_value, &new_value, sizeof(new_value)); if (error != 0) return (error); if (uap->old_value == NULL) { error = kern_timerfd_settime(td, uap->fd, uap->flags, &new_value, NULL); } else { error = kern_timerfd_settime(td, uap->fd, uap->flags, &new_value, &old_value); if (error == 0) error = copyout(&old_value, uap->old_value, sizeof(old_value)); } return (error); } diff --git a/sys/kern/tty.c b/sys/kern/tty.c index b6e300321e9c..b1b3b268d0e9 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -1,2487 +1,2487 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_capsicum.h" #include "opt_printf.h" #include #include #include #include #include #include #include #include #ifdef COMPAT_43TTY #include #endif /* COMPAT_43TTY */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TTYDEFCHARS #include #undef TTYDEFCHARS #include #include #include #include static MALLOC_DEFINE(M_TTY, "tty", "tty device"); static void tty_rel_free(struct tty *tp); static TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list); static struct sx tty_list_sx; SX_SYSINIT(tty_list, &tty_list_sx, "tty list"); static unsigned int tty_list_count = 0; /* Character device of /dev/console. */ static struct cdev *dev_console; static const char *dev_console_filename; /* * Flags that are supported and stored by this implementation. */ #define TTYSUP_IFLAG (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK|ISTRIP|INLCR|\ IGNCR|ICRNL|IXON|IXOFF|IXANY|IMAXBEL|IUTF8) #define TTYSUP_OFLAG (OPOST|ONLCR|TAB3|ONOEOT|OCRNL|ONOCR|ONLRET) #define TTYSUP_LFLAG (ECHOKE|ECHOE|ECHOK|ECHO|ECHONL|ECHOPRT|\ ECHOCTL|ISIG|ICANON|ALTWERASE|IEXTEN|TOSTOP|\ FLUSHO|NOKERNINFO|NOFLSH) #define TTYSUP_CFLAG (CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\ HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\ CDSR_OFLOW|CCAR_OFLOW|CNO_RTSDTR) #define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT) static int tty_drainwait = 5 * 60; SYSCTL_INT(_kern, OID_AUTO, tty_drainwait, CTLFLAG_RWTUN, &tty_drainwait, 0, "Default output drain timeout in seconds"); /* * Set TTY buffer sizes. */ #define TTYBUF_MAX 65536 #ifdef PRINTF_BUFR_SIZE #define TTY_PRBUF_SIZE PRINTF_BUFR_SIZE #else #define TTY_PRBUF_SIZE 256 #endif /* * Allocate buffer space if necessary, and set low watermarks, based on speed. * Note that the ttyxxxq_setsize() functions may drop and then reacquire the tty * lock during memory allocation. They will return ENXIO if the tty disappears * while unlocked. */ static int tty_watermarks(struct tty *tp) { size_t bs = 0; int error; /* Provide an input buffer for 2 seconds of data. */ if (tp->t_termios.c_cflag & CREAD) bs = MIN(tp->t_termios.c_ispeed / 5, TTYBUF_MAX); error = ttyinq_setsize(&tp->t_inq, tp, bs); if (error != 0) return (error); /* Set low watermark at 10% (when 90% is available). */ tp->t_inlow = (ttyinq_getallocatedsize(&tp->t_inq) * 9) / 10; /* Provide an output buffer for 2 seconds of data. */ bs = MIN(tp->t_termios.c_ospeed / 5, TTYBUF_MAX); error = ttyoutq_setsize(&tp->t_outq, tp, bs); if (error != 0) return (error); /* Set low watermark at 10% (when 90% is available). */ tp->t_outlow = (ttyoutq_getallocatedsize(&tp->t_outq) * 9) / 10; return (0); } static int tty_drain(struct tty *tp, int leaving) { sbintime_t timeout_at; size_t bytes; int error; if (ttyhook_hashook(tp, getc_inject)) /* buffer is inaccessible */ return (0); /* * For close(), use the recent historic timeout of "1 second without * making progress". For tcdrain(), use t_drainwait as the timeout, * with zero meaning "no timeout" which gives POSIX behavior. */ if (leaving) timeout_at = getsbinuptime() + SBT_1S; else if (tp->t_drainwait != 0) timeout_at = getsbinuptime() + SBT_1S * tp->t_drainwait; else timeout_at = 0; /* * Poll the output buffer and the hardware for completion, at 10 Hz. * Polling is required for devices which are not able to signal an * interrupt when the transmitter becomes idle (most USB serial devs). 
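To put numbers on the sizing in tty_watermarks() above (hedged: the ten-bits-per-character figure is the usual asynchronous-serial framing assumption, not something the code itself states): with c_ispeed = 115200 the line can deliver roughly 11520 characters per second, so bs = 115200 / 5 = 23040 bytes holds about two seconds of input and stays below the TTYBUF_MAX cap of 65536; t_inlow is then set to nine tenths of the allocated size, about 20736 bytes in this example.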
* The unusual structure of this loop ensures we check for busy one more * time after tty_timedwait() returns EWOULDBLOCK, so that success has * higher priority than timeout if the IO completed in the last 100mS. */ error = 0; bytes = ttyoutq_bytesused(&tp->t_outq); for (;;) { if (ttyoutq_bytesused(&tp->t_outq) == 0 && !ttydevsw_busy(tp)) return (0); if (error != 0) return (error); ttydevsw_outwakeup(tp); error = tty_timedwait(tp, &tp->t_outwait, hz / 10); if (error != 0 && error != EWOULDBLOCK) return (error); else if (timeout_at == 0 || getsbinuptime() < timeout_at) error = 0; else if (leaving && ttyoutq_bytesused(&tp->t_outq) < bytes) { /* In close, making progress, grant an extra second. */ error = 0; timeout_at += SBT_1S; bytes = ttyoutq_bytesused(&tp->t_outq); } } } /* * Though ttydev_enter() and ttydev_leave() seem to be related, they * don't have to be used together. ttydev_enter() is used by the cdev * operations to prevent an actual operation from being processed when * the TTY has been abandoned. ttydev_leave() is used by ttydev_open() * and ttydev_close() to determine whether per-TTY data should be * deallocated. */ static __inline int ttydev_enter(struct tty *tp) { tty_lock(tp); if (tty_gone(tp) || !tty_opened(tp)) { /* Device is already gone. */ tty_unlock(tp); return (ENXIO); } return (0); } static void ttydev_leave(struct tty *tp) { tty_assert_locked(tp); if (tty_opened(tp) || tp->t_flags & TF_OPENCLOSE) { /* Device is still opened somewhere. */ tty_unlock(tp); return; } tp->t_flags |= TF_OPENCLOSE; /* Remove console TTY. */ constty_clear(tp); /* Drain any output. */ if (!tty_gone(tp)) tty_drain(tp, 1); ttydisc_close(tp); /* Free i/o queues now since they might be large. */ ttyinq_free(&tp->t_inq); tp->t_inlow = 0; ttyoutq_free(&tp->t_outq); tp->t_outlow = 0; if (!tty_gone(tp)) ttydevsw_close(tp); tp->t_flags &= ~TF_OPENCLOSE; cv_broadcast(&tp->t_dcdwait); tty_rel_free(tp); } /* * Operations that are exposed through the character device in /dev. */ static int ttydev_open(struct cdev *dev, int oflags, int devtype __unused, struct thread *td) { struct tty *tp; int error; tp = dev->si_drv1; error = 0; tty_lock(tp); if (tty_gone(tp)) { /* Device is already gone. */ tty_unlock(tp); return (ENXIO); } /* * Block when other processes are currently opening or closing * the TTY. */ while (tp->t_flags & TF_OPENCLOSE) { error = tty_wait(tp, &tp->t_dcdwait); if (error != 0) { tty_unlock(tp); return (error); } } tp->t_flags |= TF_OPENCLOSE; /* * Make sure the "tty" and "cua" device cannot be opened at the * same time. The console is a "tty" device. */ if (TTY_CALLOUT(tp, dev)) { if (tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) { error = EBUSY; goto done; } } else { if (tp->t_flags & TF_OPENED_OUT) { error = EBUSY; goto done; } } if (tp->t_flags & TF_EXCLUDE && priv_check(td, PRIV_TTY_EXCLUSIVE)) { error = EBUSY; goto done; } if (!tty_opened(tp)) { /* Set proper termios flags. */ if (TTY_CALLOUT(tp, dev)) tp->t_termios = tp->t_termios_init_out; else tp->t_termios = tp->t_termios_init_in; ttydevsw_param(tp, &tp->t_termios); /* Prevent modem control on callout devices and /dev/console. */ if (TTY_CALLOUT(tp, dev) || dev == dev_console) tp->t_termios.c_cflag |= CLOCAL; if ((tp->t_termios.c_cflag & CNO_RTSDTR) == 0) ttydevsw_modem(tp, SER_DTR|SER_RTS, 0); error = ttydevsw_open(tp); if (error != 0) goto done; ttydisc_open(tp); error = tty_watermarks(tp); if (error != 0) goto done; } /* Wait for Carrier Detect. 
*/ if ((oflags & O_NONBLOCK) == 0 && (tp->t_termios.c_cflag & CLOCAL) == 0) { while ((ttydevsw_modem(tp, 0, 0) & SER_DCD) == 0) { error = tty_wait(tp, &tp->t_dcdwait); if (error != 0) goto done; } } if (dev == dev_console) tp->t_flags |= TF_OPENED_CONS; else if (TTY_CALLOUT(tp, dev)) tp->t_flags |= TF_OPENED_OUT; else tp->t_flags |= TF_OPENED_IN; MPASS((tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) == 0 || (tp->t_flags & TF_OPENED_OUT) == 0); done: tp->t_flags &= ~TF_OPENCLOSE; cv_broadcast(&tp->t_dcdwait); ttydev_leave(tp); return (error); } static int ttydev_close(struct cdev *dev, int fflag, int devtype __unused, struct thread *td) { struct tty *tp = dev->si_drv1; tty_lock(tp); /* * Don't actually close the device if it is being used as the * console. */ MPASS((tp->t_flags & (TF_OPENED_CONS | TF_OPENED_IN)) == 0 || (tp->t_flags & TF_OPENED_OUT) == 0); if (dev == dev_console) tp->t_flags &= ~TF_OPENED_CONS; else tp->t_flags &= ~(TF_OPENED_IN|TF_OPENED_OUT); if (tp->t_flags & TF_OPENED) { tty_unlock(tp); return (0); } /* If revoking, flush output now to avoid draining it later. */ if ((fflag & FREVOKE) != 0) { tty_flush(tp, FWRITE); knlist_delete(&tp->t_inpoll.si_note, td, 1); knlist_delete(&tp->t_outpoll.si_note, td, 1); } tp->t_flags &= ~TF_EXCLUDE; /* Properly wake up threads that are stuck - revoke(). */ tp->t_revokecnt++; tty_wakeup(tp, FREAD|FWRITE); cv_broadcast(&tp->t_bgwait); cv_broadcast(&tp->t_dcdwait); ttydev_leave(tp); return (0); } static __inline int tty_is_ctty(struct tty *tp, struct proc *p) { tty_assert_locked(tp); return (p->p_session == tp->t_session && p->p_flag & P_CONTROLT); } int tty_wait_background(struct tty *tp, struct thread *td, int sig) { struct proc *p; struct pgrp *pg; ksiginfo_t ksi; int error; MPASS(sig == SIGTTIN || sig == SIGTTOU); tty_assert_locked(tp); p = td->td_proc; for (;;) { pg = p->p_pgrp; PGRP_LOCK(pg); PROC_LOCK(p); /* * pg may no longer be our process group. * Re-check after locking. */ if (p->p_pgrp != pg) { PROC_UNLOCK(p); PGRP_UNLOCK(pg); continue; } /* * The process should only sleep, when: * - This terminal is the controlling terminal * - Its process group is not the foreground process * group * - The parent process isn't waiting for the child to * exit * - the signal to send to the process isn't masked */ if (!tty_is_ctty(tp, p) || p->p_pgrp == tp->t_pgrp) { /* Allow the action to happen. */ PROC_UNLOCK(p); PGRP_UNLOCK(pg); return (0); } if (SIGISMEMBER(p->p_sigacts->ps_sigignore, sig) || SIGISMEMBER(td->td_sigmask, sig)) { /* Only allow them in write()/ioctl(). */ PROC_UNLOCK(p); PGRP_UNLOCK(pg); return (sig == SIGTTOU ? 0 : EIO); } if ((p->p_flag & P_PPWAIT) != 0 || (pg->pg_flags & PGRP_ORPHANED) != 0) { /* Don't allow the action to happen. */ PROC_UNLOCK(p); PGRP_UNLOCK(pg); return (EIO); } PROC_UNLOCK(p); /* * Send the signal and sleep until we're the new * foreground process group. */ if (sig != 0) { ksiginfo_init(&ksi); ksi.ksi_code = SI_KERNEL; ksi.ksi_signo = sig; sig = 0; } pgsignal(pg, ksi.ksi_signo, 1, &ksi); PGRP_UNLOCK(pg); error = tty_wait(tp, &tp->t_bgwait); if (error) return (error); } } static int ttydev_read(struct cdev *dev, struct uio *uio, int ioflag) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) goto done; error = ttydisc_read(tp, uio, ioflag); tty_unlock(tp); /* * The read() call should not throw an error when the device is * being destroyed. Silently convert it to an EOF. 
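A small hedged illustration of the job-control rule enforced through tty_wait_background() above: writes from a background process only raise SIGTTOU when TOSTOP is set in the termios local flags, which a program can enable as follows (function name illustrative):

/*
 * Hedged sketch: enable TOSTOP so that background writers to this tty are
 * stopped with SIGTTOU (the tty_wait_background() path).
 */
#include <termios.h>

int
enable_tostop(int fd)
{
	struct termios t;

	if (tcgetattr(fd, &t) == -1)
		return (-1);
	t.c_lflag |= TOSTOP;
	return (tcsetattr(fd, TCSANOW, &t));
}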
*/ done: if (error == ENXIO) error = 0; return (error); } static int ttydev_write(struct cdev *dev, struct uio *uio, int ioflag) { struct tty *tp = dev->si_drv1; int defer, error; error = ttydev_enter(tp); if (error) return (error); if (tp->t_termios.c_lflag & TOSTOP) { error = tty_wait_background(tp, curthread, SIGTTOU); if (error) goto done; } if (ioflag & IO_NDELAY && tp->t_flags & TF_BUSY_OUT) { /* Allow non-blocking writes to bypass serialization. */ error = ttydisc_write(tp, uio, ioflag); } else { /* Serialize write() calls. */ while (tp->t_flags & TF_BUSY_OUT) { error = tty_wait(tp, &tp->t_outserwait); if (error) goto done; } tp->t_flags |= TF_BUSY_OUT; defer = sigdeferstop(SIGDEFERSTOP_ERESTART); error = ttydisc_write(tp, uio, ioflag); sigallowstop(defer); tp->t_flags &= ~TF_BUSY_OUT; cv_signal(&tp->t_outserwait); } done: tty_unlock(tp); return (error); } static int ttydev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); switch (cmd) { case TIOCCBRK: case TIOCCONS: case TIOCDRAIN: case TIOCEXCL: case TIOCFLUSH: case TIOCNXCL: case TIOCSBRK: case TIOCSCTTY: case TIOCSETA: case TIOCSETAF: case TIOCSETAW: case TIOCSPGRP: case TIOCSTART: case TIOCSTAT: case TIOCSTI: case TIOCSTOP: case TIOCSWINSZ: #if 0 case TIOCSDRAINWAIT: case TIOCSETD: #endif #ifdef COMPAT_43TTY case TIOCLBIC: case TIOCLBIS: case TIOCLSET: case TIOCSETC: case OTIOCSETD: case TIOCSETN: case TIOCSETP: case TIOCSLTC: #endif /* COMPAT_43TTY */ /* * If the ioctl() causes the TTY to be modified, let it * wait in the background. */ error = tty_wait_background(tp, curthread, SIGTTOU); if (error) goto done; } if (cmd == TIOCSETA || cmd == TIOCSETAW || cmd == TIOCSETAF) { struct termios *old = &tp->t_termios; struct termios *new = (struct termios *)data; struct termios *lock = TTY_CALLOUT(tp, dev) ? &tp->t_termios_lock_out : &tp->t_termios_lock_in; int cc; /* * Lock state devices. Just overwrite the values of the * commands that are currently in use. */ new->c_iflag = (old->c_iflag & lock->c_iflag) | (new->c_iflag & ~lock->c_iflag); new->c_oflag = (old->c_oflag & lock->c_oflag) | (new->c_oflag & ~lock->c_oflag); new->c_cflag = (old->c_cflag & lock->c_cflag) | (new->c_cflag & ~lock->c_cflag); new->c_lflag = (old->c_lflag & lock->c_lflag) | (new->c_lflag & ~lock->c_lflag); for (cc = 0; cc < NCCS; ++cc) if (lock->c_cc[cc]) new->c_cc[cc] = old->c_cc[cc]; if (lock->c_ispeed) new->c_ispeed = old->c_ispeed; if (lock->c_ospeed) new->c_ospeed = old->c_ospeed; } error = tty_ioctl(tp, cmd, data, fflag, td); done: tty_unlock(tp); return (error); } static int ttydev_poll(struct cdev *dev, int events, struct thread *td) { struct tty *tp = dev->si_drv1; int error, revents = 0; error = ttydev_enter(tp); if (error) return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); if (events & (POLLIN|POLLRDNORM)) { /* See if we can read something. */ if (ttydisc_read_poll(tp) > 0) revents |= events & (POLLIN|POLLRDNORM); } if (tp->t_flags & TF_ZOMBIE) { /* Hangup flag on zombie state. */ revents |= POLLHUP; } else if (events & (POLLOUT|POLLWRNORM)) { /* See if we can write something. 
*/ if (ttydisc_write_poll(tp) > 0) revents |= events & (POLLOUT|POLLWRNORM); } if (revents == 0) { if (events & (POLLIN|POLLRDNORM)) selrecord(td, &tp->t_inpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &tp->t_outpoll); } tty_unlock(tp); return (revents); } static int ttydev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr) { struct tty *tp = dev->si_drv1; int error; /* Handle mmap() through the driver. */ error = ttydev_enter(tp); if (error) return (-1); error = ttydevsw_mmap(tp, offset, paddr, nprot, memattr); tty_unlock(tp); return (error); } /* * kqueue support. */ static void tty_kqops_read_detach(struct knote *kn) { struct tty *tp = kn->kn_hook; knlist_remove(&tp->t_inpoll.si_note, kn, 0); } static int tty_kqops_read_event(struct knote *kn, long hint __unused) { struct tty *tp = kn->kn_hook; tty_assert_locked(tp); if (tty_gone(tp) || tp->t_flags & TF_ZOMBIE) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_read_poll(tp); return (kn->kn_data > 0); } } static void tty_kqops_write_detach(struct knote *kn) { struct tty *tp = kn->kn_hook; knlist_remove(&tp->t_outpoll.si_note, kn, 0); } static int tty_kqops_write_event(struct knote *kn, long hint __unused) { struct tty *tp = kn->kn_hook; tty_assert_locked(tp); if (tty_gone(tp)) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_write_poll(tp); return (kn->kn_data > 0); } } -static struct filterops tty_kqops_read = { +static const struct filterops tty_kqops_read = { .f_isfd = 1, .f_detach = tty_kqops_read_detach, .f_event = tty_kqops_read_event, }; -static struct filterops tty_kqops_write = { +static const struct filterops tty_kqops_write = { .f_isfd = 1, .f_detach = tty_kqops_write_detach, .f_event = tty_kqops_write_event, }; static int ttydev_kqfilter(struct cdev *dev, struct knote *kn) { struct tty *tp = dev->si_drv1; int error; error = ttydev_enter(tp); if (error) return (error); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_hook = tp; kn->kn_fop = &tty_kqops_read; knlist_add(&tp->t_inpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_hook = tp; kn->kn_fop = &tty_kqops_write; knlist_add(&tp->t_outpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static struct cdevsw ttydev_cdevsw = { .d_version = D_VERSION, .d_open = ttydev_open, .d_close = ttydev_close, .d_read = ttydev_read, .d_write = ttydev_write, .d_ioctl = ttydev_ioctl, .d_kqfilter = ttydev_kqfilter, .d_poll = ttydev_poll, .d_mmap = ttydev_mmap, .d_name = "ttydev", .d_flags = D_TTY, }; /* * Init/lock-state devices */ static int ttyil_open(struct cdev *dev, int oflags __unused, int devtype __unused, struct thread *td) { struct tty *tp; int error; tp = dev->si_drv1; error = 0; tty_lock(tp); if (tty_gone(tp)) error = ENODEV; tty_unlock(tp); return (error); } static int ttyil_close(struct cdev *dev __unused, int flag __unused, int mode __unused, struct thread *td __unused) { return (0); } static int ttyil_rdwr(struct cdev *dev __unused, struct uio *uio __unused, int ioflag __unused) { return (ENODEV); } static int ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { struct tty *tp = dev->si_drv1; int error; tty_lock(tp); if (tty_gone(tp)) { error = ENODEV; goto done; } error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td); if (error != ENOIOCTL) goto done; error = 0; switch (cmd) { case TIOCGETA: /* Obtain terminal flags through tcgetattr(). 
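The filterops above are what a userspace kevent(2) registration on a tty ends up using. A hedged sketch of waiting for input via EVFILT_READ, which reports the readable byte count computed by ttydisc_read_poll() in the event's data field (the function name and any device path are illustrative):

/*
 * Hedged sketch: register EVFILT_READ on a tty and block until the line
 * discipline has data; ev.data carries the byte count from
 * ttydisc_read_poll().
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>

int
wait_tty_readable(const char *path)
{
	struct kevent ev;
	int fd, kq;

	/* O_NONBLOCK: do not wait for carrier detect in ttydev_open(). */
	fd = open(path, O_RDONLY | O_NONBLOCK);
	if (fd == -1)
		return (-1);
	kq = kqueue();
	if (kq == -1)
		return (-1);
	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		return (-1);
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)	/* block for input */
		return (-1);
	return ((int)ev.data);		/* readable bytes */
}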
*/ *(struct termios*)data = *(struct termios*)dev->si_drv2; break; case TIOCSETA: /* Set terminal flags through tcsetattr(). */ error = priv_check(td, PRIV_TTY_SETA); if (error) break; *(struct termios*)dev->si_drv2 = *(struct termios*)data; break; case TIOCGETD: *(int *)data = TTYDISC; break; case TIOCGWINSZ: bzero(data, sizeof(struct winsize)); break; default: error = ENOTTY; } done: tty_unlock(tp); return (error); } static struct cdevsw ttyil_cdevsw = { .d_version = D_VERSION, .d_open = ttyil_open, .d_close = ttyil_close, .d_read = ttyil_rdwr, .d_write = ttyil_rdwr, .d_ioctl = ttyil_ioctl, .d_name = "ttyil", .d_flags = D_TTY, }; static void tty_init_termios(struct tty *tp) { struct termios *t = &tp->t_termios_init_in; t->c_cflag = TTYDEF_CFLAG; t->c_iflag = TTYDEF_IFLAG; t->c_lflag = TTYDEF_LFLAG; t->c_oflag = TTYDEF_OFLAG; t->c_ispeed = TTYDEF_SPEED; t->c_ospeed = TTYDEF_SPEED; memcpy(&t->c_cc, ttydefchars, sizeof ttydefchars); tp->t_termios_init_out = *t; } void tty_init_console(struct tty *tp, speed_t s) { struct termios *ti = &tp->t_termios_init_in; struct termios *to = &tp->t_termios_init_out; if (s != 0) { ti->c_ispeed = ti->c_ospeed = s; to->c_ispeed = to->c_ospeed = s; } ti->c_cflag |= CLOCAL; to->c_cflag |= CLOCAL; } /* * Standard device routine implementations, mostly meant for * pseudo-terminal device drivers. When a driver creates a new terminal * device class, missing routines are patched. */ static int ttydevsw_defopen(struct tty *tp __unused) { return (0); } static void ttydevsw_defclose(struct tty *tp __unused) { } static void ttydevsw_defoutwakeup(struct tty *tp __unused) { panic("Terminal device has output, while not implemented"); } static void ttydevsw_definwakeup(struct tty *tp __unused) { } static int ttydevsw_defioctl(struct tty *tp __unused, u_long cmd __unused, caddr_t data __unused, struct thread *td __unused) { return (ENOIOCTL); } static int ttydevsw_defcioctl(struct tty *tp __unused, int unit __unused, u_long cmd __unused, caddr_t data __unused, struct thread *td __unused) { return (ENOIOCTL); } static int ttydevsw_defparam(struct tty *tp __unused, struct termios *t) { /* * Allow the baud rate to be adjusted for pseudo-devices, but at * least restrict it to 115200 to prevent excessive buffer * usage. Also disallow 0, to prevent foot shooting. */ if (t->c_ispeed < B50) t->c_ispeed = B50; else if (t->c_ispeed > B115200) t->c_ispeed = B115200; if (t->c_ospeed < B50) t->c_ospeed = B50; else if (t->c_ospeed > B115200) t->c_ospeed = B115200; t->c_cflag |= CREAD; return (0); } static int ttydevsw_defmodem(struct tty *tp __unused, int sigon __unused, int sigoff __unused) { /* Simulate a carrier to make the TTY layer happy. */ return (SER_DCD); } static int ttydevsw_defmmap(struct tty *tp __unused, vm_ooffset_t offset __unused, vm_paddr_t *paddr __unused, int nprot __unused, vm_memattr_t *memattr __unused) { return (-1); } static void ttydevsw_defpktnotify(struct tty *tp __unused, char event __unused) { } static void ttydevsw_deffree(void *softc __unused) { panic("Terminal device freed without a free-handler"); } static bool ttydevsw_defbusy(struct tty *tp __unused) { return (FALSE); } /* * TTY allocation and deallocation. TTY devices can be deallocated when * the driver doesn't use it anymore, when the TTY isn't a session's * controlling TTY and when the device node isn't opened through devfs. 
*/ struct tty * tty_alloc(struct ttydevsw *tsw, void *sc) { return (tty_alloc_mutex(tsw, sc, NULL)); } struct tty * tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex) { struct tty *tp; /* Make sure the driver defines all routines. */ #define PATCH_FUNC(x) do { \ if (tsw->tsw_ ## x == NULL) \ tsw->tsw_ ## x = ttydevsw_def ## x; \ } while (0) PATCH_FUNC(open); PATCH_FUNC(close); PATCH_FUNC(outwakeup); PATCH_FUNC(inwakeup); PATCH_FUNC(ioctl); PATCH_FUNC(cioctl); PATCH_FUNC(param); PATCH_FUNC(modem); PATCH_FUNC(mmap); PATCH_FUNC(pktnotify); PATCH_FUNC(free); PATCH_FUNC(busy); #undef PATCH_FUNC tp = malloc(sizeof(struct tty) + TTY_PRBUF_SIZE, M_TTY, M_WAITOK | M_ZERO); tp->t_prbufsz = TTY_PRBUF_SIZE; tp->t_devsw = tsw; tp->t_devswsoftc = sc; tp->t_flags = tsw->tsw_flags; tp->t_drainwait = tty_drainwait; tty_init_termios(tp); cv_init(&tp->t_inwait, "ttyin"); cv_init(&tp->t_outwait, "ttyout"); cv_init(&tp->t_outserwait, "ttyosr"); cv_init(&tp->t_bgwait, "ttybg"); cv_init(&tp->t_dcdwait, "ttydcd"); /* Allow drivers to use a custom mutex to lock the TTY. */ if (mutex != NULL) { tp->t_mtx = mutex; } else { tp->t_mtx = &tp->t_mtxobj; mtx_init(&tp->t_mtxobj, "ttymtx", NULL, MTX_DEF); } knlist_init_mtx(&tp->t_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&tp->t_outpoll.si_note, tp->t_mtx); return (tp); } static void tty_dealloc(void *arg) { struct tty *tp = arg; /* * ttyydev_leave() usually frees the i/o queues earlier, but it is * not always called between queue allocation and here. The queues * may be allocated by ioctls on a pty control device without the * corresponding pty slave device ever being open, or after it is * closed. */ ttyinq_free(&tp->t_inq); ttyoutq_free(&tp->t_outq); seldrain(&tp->t_inpoll); seldrain(&tp->t_outpoll); knlist_clear(&tp->t_inpoll.si_note, 0); knlist_clear(&tp->t_outpoll.si_note, 0); knlist_destroy(&tp->t_inpoll.si_note); knlist_destroy(&tp->t_outpoll.si_note); cv_destroy(&tp->t_inwait); cv_destroy(&tp->t_outwait); cv_destroy(&tp->t_bgwait); cv_destroy(&tp->t_dcdwait); cv_destroy(&tp->t_outserwait); if (tp->t_mtx == &tp->t_mtxobj) mtx_destroy(&tp->t_mtxobj); ttydevsw_free(tp); free(tp, M_TTY); } static void tty_rel_free(struct tty *tp) { struct cdev *dev; tty_assert_locked(tp); #define TF_ACTIVITY (TF_GONE|TF_OPENED|TF_HOOK|TF_OPENCLOSE) if (tp->t_sessioncnt != 0 || (tp->t_flags & TF_ACTIVITY) != TF_GONE) { /* TTY is still in use. */ tty_unlock(tp); return; } /* Stop asynchronous I/O. */ funsetown(&tp->t_sigio); /* TTY can be deallocated. */ dev = tp->t_dev; tp->t_dev = NULL; tty_unlock(tp); if (dev != NULL) { sx_xlock(&tty_list_sx); TAILQ_REMOVE(&tty_list, tp, t_list); tty_list_count--; sx_xunlock(&tty_list_sx); destroy_dev_sched_cb(dev, tty_dealloc, tp); } } void tty_rel_pgrp(struct tty *tp, struct pgrp *pg) { MPASS(tp->t_sessioncnt > 0); tty_assert_locked(tp); if (tp->t_pgrp == pg) tp->t_pgrp = NULL; tty_unlock(tp); } void tty_rel_sess(struct tty *tp, struct session *sess) { MPASS(tp->t_sessioncnt > 0); /* Current session has left. */ if (tp->t_session == sess) { tp->t_session = NULL; MPASS(tp->t_pgrp == NULL); } tp->t_sessioncnt--; tty_rel_free(tp); } void tty_rel_gone(struct tty *tp) { tty_assert_locked(tp); MPASS(!tty_gone(tp)); /* Simulate carrier removal. */ ttydisc_modem(tp, 0); /* Wake up all blocked threads. 
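For the driver-facing side of tty_alloc() and the device-node creation below, here is a hedged sketch of a minimal pseudo-driver; the mydev_* names are hypothetical, and the fields follow the struct ttydevsw usage visible in this file and in tty(9). It is a sketch under those assumptions, not a reference implementation.

/*
 * Hedged kernel-side sketch: allocate a tty, provide the two handlers
 * whose defaults panic (outwakeup and free), and expose it in /dev.
 */
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/tty.h>
#include <sys/ttydisc.h>

static void
mydev_outwakeup(struct tty *tp)
{
	char c;

	/* A real driver would hand each byte to its hardware FIFO. */
	while (ttydisc_getc(tp, &c, 1) == 1)
		;
}

static void
mydev_free(void *softc)
{
	free(softc, M_DEVBUF);	/* assumes softc was malloc(9)'d with M_DEVBUF */
}

static struct ttydevsw mydev_ttydevsw = {
	.tsw_flags	= TF_NOPREFIX,
	.tsw_outwakeup	= mydev_outwakeup,
	.tsw_free	= mydev_free,
};

static void
mydev_create(void *softc, int unit)
{
	struct tty *tp;

	tp = tty_alloc(&mydev_ttydevsw, softc);
	tty_makedev(tp, NULL, "mydev%d", unit);
}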
*/ tty_wakeup(tp, FREAD|FWRITE); cv_broadcast(&tp->t_bgwait); cv_broadcast(&tp->t_dcdwait); tp->t_flags |= TF_GONE; tty_rel_free(tp); } static int tty_drop_ctty(struct tty *tp, struct proc *p) { struct session *session; struct vnode *vp; /* * This looks terrible, but it's generally safe as long as the tty * hasn't gone away while we had the lock dropped. All of our sanity * checking that this operation is OK happens after we've picked it back * up, so other state changes are generally not fatal and the potential * for this particular operation to happen out-of-order in a * multithreaded scenario is likely a non-issue. */ tty_unlock(tp); sx_xlock(&proctree_lock); tty_lock(tp); if (tty_gone(tp)) { sx_xunlock(&proctree_lock); return (ENODEV); } /* * If the session doesn't have a controlling TTY, or if we weren't * invoked on the controlling TTY, we'll return ENOIOCTL as we've * historically done. */ session = p->p_session; if (session->s_ttyp == NULL || session->s_ttyp != tp) { sx_xunlock(&proctree_lock); return (ENOTTY); } if (!SESS_LEADER(p)) { sx_xunlock(&proctree_lock); return (EPERM); } PROC_LOCK(p); SESS_LOCK(session); vp = session->s_ttyvp; session->s_ttyp = NULL; session->s_ttyvp = NULL; session->s_ttydp = NULL; SESS_UNLOCK(session); tp->t_sessioncnt--; p->p_flag &= ~P_CONTROLT; PROC_UNLOCK(p); sx_xunlock(&proctree_lock); /* * If we did have a vnode, release our reference. Ordinarily we manage * these at the devfs layer, but we can't necessarily know that we were * invoked on the vnode referenced in the session (i.e. the vnode we * hold a reference to). We explicitly don't check VBAD/VIRF_DOOMED here * to avoid a vnode leak -- in circumstances elsewhere where we'd hit a * VIRF_DOOMED vnode, release has been deferred until the controlling TTY * is either changed or released. */ if (vp != NULL) devfs_ctty_unref(vp); return (0); } /* * Exposing information about current TTY's through sysctl */ static void tty_to_xtty(struct tty *tp, struct xtty *xt) { tty_assert_locked(tp); memset(xt, 0, sizeof(*xt)); xt->xt_size = sizeof(struct xtty); xt->xt_insize = ttyinq_getsize(&tp->t_inq); xt->xt_incc = ttyinq_bytescanonicalized(&tp->t_inq); xt->xt_inlc = ttyinq_bytesline(&tp->t_inq); xt->xt_inlow = tp->t_inlow; xt->xt_outsize = ttyoutq_getsize(&tp->t_outq); xt->xt_outcc = ttyoutq_bytesused(&tp->t_outq); xt->xt_outlow = tp->t_outlow; xt->xt_column = tp->t_column; xt->xt_pgid = tp->t_pgrp ? tp->t_pgrp->pg_id : 0; xt->xt_sid = tp->t_session ? tp->t_session->s_sid : 0; xt->xt_flags = tp->t_flags; xt->xt_dev = tp->t_dev ? 
dev2udev(tp->t_dev) : (uint32_t)NODEV; } static int sysctl_kern_ttys(SYSCTL_HANDLER_ARGS) { unsigned long lsize; struct thread *td = curthread; struct xtty *xtlist, *xt; struct tty *tp; struct proc *p; int error; bool cansee; sx_slock(&tty_list_sx); lsize = tty_list_count * sizeof(struct xtty); if (lsize == 0) { sx_sunlock(&tty_list_sx); return (0); } xtlist = xt = malloc(lsize, M_TTY, M_WAITOK); TAILQ_FOREACH(tp, &tty_list, t_list) { tty_lock(tp); if (tp->t_session != NULL && (p = atomic_load_ptr(&tp->t_session->s_leader)) != NULL) { PROC_LOCK(p); cansee = (p_cansee(td, p) == 0); PROC_UNLOCK(p); } else { cansee = !jailed(td->td_ucred); } if (cansee) { tty_to_xtty(tp, xt); xt++; } tty_unlock(tp); } sx_sunlock(&tty_list_sx); lsize = (xt - xtlist) * sizeof(struct xtty); if (lsize > 0) { error = SYSCTL_OUT(req, xtlist, lsize); } else { error = 0; } free(xtlist, M_TTY); return (error); } SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_ttys, "S,xtty", "List of TTYs"); /* * Device node creation. Device has been set up, now we can expose it to * the user. */ int tty_makedevf(struct tty *tp, struct ucred *cred, int flags, const char *fmt, ...) { va_list ap; struct make_dev_args args; struct cdev *dev, *init, *lock, *cua, *cinit, *clock; const char *prefix = "tty"; char name[SPECNAMELEN - 3]; /* for "tty" and "cua". */ uid_t uid; gid_t gid; mode_t mode; int error; /* Remove "tty" prefix from devices like PTY's. */ if (tp->t_flags & TF_NOPREFIX) prefix = ""; va_start(ap, fmt); vsnrprintf(name, sizeof name, 32, fmt, ap); va_end(ap); if (cred == NULL) { /* System device. */ uid = UID_ROOT; gid = GID_WHEEL; mode = S_IRUSR|S_IWUSR; } else { /* User device. */ uid = cred->cr_ruid; gid = GID_TTY; mode = S_IRUSR|S_IWUSR|S_IWGRP; } flags = flags & TTYMK_CLONING ? MAKEDEV_REF : 0; flags |= MAKEDEV_CHECKNAME; /* Master call-in device. */ make_dev_args_init(&args); args.mda_flags = flags; args.mda_devsw = &ttydev_cdevsw; args.mda_cr = cred; args.mda_uid = uid; args.mda_gid = gid; args.mda_mode = mode; args.mda_si_drv1 = tp; error = make_dev_s(&args, &dev, "%s%s", prefix, name); if (error != 0) return (error); tp->t_dev = dev; init = lock = cua = cinit = clock = NULL; /* Slave call-in devices. */ if (tp->t_flags & TF_INITLOCK) { args.mda_devsw = &ttyil_cdevsw; args.mda_unit = TTYUNIT_INIT; args.mda_si_drv1 = tp; args.mda_si_drv2 = &tp->t_termios_init_in; error = make_dev_s(&args, &init, "%s%s.init", prefix, name); if (error != 0) goto fail; dev_depends(dev, init); args.mda_unit = TTYUNIT_LOCK; args.mda_si_drv2 = &tp->t_termios_lock_in; error = make_dev_s(&args, &lock, "%s%s.lock", prefix, name); if (error != 0) goto fail; dev_depends(dev, lock); } /* Call-out devices. */ if (tp->t_flags & TF_CALLOUT) { make_dev_args_init(&args); args.mda_flags = flags; args.mda_devsw = &ttydev_cdevsw; args.mda_cr = cred; args.mda_uid = UID_UUCP; args.mda_gid = GID_DIALER; args.mda_mode = 0660; args.mda_unit = TTYUNIT_CALLOUT; args.mda_si_drv1 = tp; error = make_dev_s(&args, &cua, "cua%s", name); if (error != 0) goto fail; dev_depends(dev, cua); /* Slave call-out devices. 
*/ if (tp->t_flags & TF_INITLOCK) { args.mda_devsw = &ttyil_cdevsw; args.mda_unit = TTYUNIT_CALLOUT | TTYUNIT_INIT; args.mda_si_drv2 = &tp->t_termios_init_out; error = make_dev_s(&args, &cinit, "cua%s.init", name); if (error != 0) goto fail; dev_depends(dev, cinit); args.mda_unit = TTYUNIT_CALLOUT | TTYUNIT_LOCK; args.mda_si_drv2 = &tp->t_termios_lock_out; error = make_dev_s(&args, &clock, "cua%s.lock", name); if (error != 0) goto fail; dev_depends(dev, clock); } } sx_xlock(&tty_list_sx); TAILQ_INSERT_TAIL(&tty_list, tp, t_list); tty_list_count++; sx_xunlock(&tty_list_sx); return (0); fail: destroy_dev(dev); if (init) destroy_dev(init); if (lock) destroy_dev(lock); if (cinit) destroy_dev(cinit); if (clock) destroy_dev(clock); return (error); } /* * Signalling processes. */ void tty_signal_sessleader(struct tty *tp, int sig) { struct proc *p; struct session *s; tty_assert_locked(tp); MPASS(sig >= 1 && sig < NSIG); /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; tp->t_termios.c_lflag &= ~FLUSHO; /* * Load s_leader exactly once to avoid race where s_leader is * set to NULL by a concurrent invocation of killjobc() by the * session leader. Note that we are not holding t_session's * lock for the read. */ if ((s = tp->t_session) != NULL && (p = atomic_load_ptr(&s->s_leader)) != NULL) { PROC_LOCK(p); kern_psignal(p, sig); PROC_UNLOCK(p); } } void tty_signal_pgrp(struct tty *tp, int sig) { ksiginfo_t ksi; tty_assert_locked(tp); MPASS(sig >= 1 && sig < NSIG); /* Make signals start output again. */ tp->t_flags &= ~TF_STOPPED; tp->t_termios.c_lflag &= ~FLUSHO; if (sig == SIGINFO && !(tp->t_termios.c_lflag & NOKERNINFO)) tty_info(tp); if (tp->t_pgrp != NULL) { ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, sig, 1, &ksi); PGRP_UNLOCK(tp->t_pgrp); } } void tty_wakeup(struct tty *tp, int flags) { if (tp->t_flags & TF_ASYNC && tp->t_sigio != NULL) pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); if (flags & FWRITE) { cv_broadcast(&tp->t_outwait); selwakeup(&tp->t_outpoll); KNOTE_LOCKED(&tp->t_outpoll.si_note, 0); } if (flags & FREAD) { cv_broadcast(&tp->t_inwait); selwakeup(&tp->t_inpoll); KNOTE_LOCKED(&tp->t_inpoll.si_note, 0); } } int tty_wait(struct tty *tp, struct cv *cv) { int error; int revokecnt = tp->t_revokecnt; tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED); MPASS(!tty_gone(tp)); error = cv_wait_sig(cv, tp->t_mtx); /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); /* Restart the system call when we may have been revoked. */ if (tp->t_revokecnt != revokecnt) return (ERESTART); return (error); } int tty_timedwait(struct tty *tp, struct cv *cv, int hz) { int error; int revokecnt = tp->t_revokecnt; tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED); MPASS(!tty_gone(tp)); error = cv_timedwait_sig(cv, tp->t_mtx, hz); /* Bail out when the device slipped away. */ if (tty_gone(tp)) return (ENXIO); /* Restart the system call when we may have been revoked. 
*/ if (tp->t_revokecnt != revokecnt) return (ERESTART); return (error); } void tty_flush(struct tty *tp, int flags) { if (flags & FWRITE) { tp->t_flags &= ~TF_HIWAT_OUT; ttyoutq_flush(&tp->t_outq); tty_wakeup(tp, FWRITE); if (!tty_gone(tp)) { ttydevsw_outwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_FLUSHWRITE); } } if (flags & FREAD) { tty_hiwat_in_unblock(tp); ttyinq_flush(&tp->t_inq); tty_wakeup(tp, FREAD); if (!tty_gone(tp)) { ttydevsw_inwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_FLUSHREAD); } } } void tty_set_winsize(struct tty *tp, const struct winsize *wsz) { if (memcmp(&tp->t_winsize, wsz, sizeof(*wsz)) == 0) return; tp->t_winsize = *wsz; tty_signal_pgrp(tp, SIGWINCH); } static int tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) { int error; switch (cmd) { /* * Modem commands. * The SER_* and TIOCM_* flags are the same, but one bit * shifted. I don't know why. */ case TIOCSDTR: ttydevsw_modem(tp, SER_DTR, 0); return (0); case TIOCCDTR: ttydevsw_modem(tp, 0, SER_DTR); return (0); case TIOCMSET: { int bits = *(int *)data; ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, ((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1); return (0); } case TIOCMBIS: { int bits = *(int *)data; ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, 0); return (0); } case TIOCMBIC: { int bits = *(int *)data; ttydevsw_modem(tp, 0, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1); return (0); } case TIOCMGET: *(int *)data = TIOCM_LE + (ttydevsw_modem(tp, 0, 0) << 1); return (0); case FIOASYNC: if (*(int *)data) tp->t_flags |= TF_ASYNC; else tp->t_flags &= ~TF_ASYNC; return (0); case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: *(int *)data = ttydisc_bytesavail(tp); return (0); case FIONWRITE: case TIOCOUTQ: *(int *)data = ttyoutq_bytesused(&tp->t_outq); return (0); case FIOSETOWN: if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc)) /* Not allowed to set ownership. */ return (ENOTTY); /* Temporarily unlock the TTY to set ownership. */ tty_unlock(tp); error = fsetown(*(int *)data, &tp->t_sigio); tty_lock(tp); return (error); case FIOGETOWN: if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc)) /* Not allowed to set ownership. */ return (ENOTTY); /* Get ownership. */ *(int *)data = fgetown(&tp->t_sigio); return (0); case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ *(struct termios*)data = tp->t_termios; return (0); case TIOCSETA: case TIOCSETAW: case TIOCSETAF: { struct termios *t = data; bool canonicalize = false; /* * Who makes up these funny rules? According to POSIX, * input baud rate is set equal to the output baud rate * when zero. */ if (t->c_ispeed == 0) t->c_ispeed = t->c_ospeed; /* Discard any unsupported bits. */ t->c_iflag &= TTYSUP_IFLAG; t->c_oflag &= TTYSUP_OFLAG; t->c_lflag &= TTYSUP_LFLAG; t->c_cflag &= TTYSUP_CFLAG; /* Set terminal flags through tcsetattr(). */ if (cmd == TIOCSETAW || cmd == TIOCSETAF) { error = tty_drain(tp, 0); if (error) return (error); if (cmd == TIOCSETAF) tty_flush(tp, FREAD); } /* * Only call param() when the flags really change. */ if ((t->c_cflag & CIGNORE) == 0 && (tp->t_termios.c_cflag != t->c_cflag || ((tp->t_termios.c_iflag ^ t->c_iflag) & (IXON|IXOFF|IXANY)) || tp->t_termios.c_ispeed != t->c_ispeed || tp->t_termios.c_ospeed != t->c_ospeed)) { error = ttydevsw_param(tp, t); if (error) return (error); /* XXX: CLOCAL? 
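Relating the TIOCSETA/TIOCSETAW/TIOCSETAF handling above back to the libc interface (a hedged aside): tcsetattr(3)'s optional actions map onto these ioctls, TCSANOW onto TIOCSETA, TCSADRAIN onto TIOCSETAW (drain output first) and TCSAFLUSH onto TIOCSETAF (drain, then discard pending input). A short sketch, with an illustrative function name:

/*
 * Hedged sketch: switch a serial tty to raw mode at 115200 baud, letting
 * queued output drain first (TCSADRAIN -> TIOCSETAW above).
 */
#include <termios.h>

int
set_raw_115200(int fd)
{
	struct termios t;

	if (tcgetattr(fd, &t) == -1)
		return (-1);
	cfmakeraw(&t);				/* clear ICANON, ECHO, OPOST, ... */
	if (cfsetspeed(&t, B115200) == -1)	/* sets c_ispeed and c_ospeed */
		return (-1);
	return (tcsetattr(fd, TCSADRAIN, &t));
}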
*/ tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE; tp->t_termios.c_ispeed = t->c_ispeed; tp->t_termios.c_ospeed = t->c_ospeed; /* Baud rate has changed - update watermarks. */ error = tty_watermarks(tp); if (error) return (error); } /* * We'll canonicalize any partial input if we're transitioning * ICANON one way or the other. If we're going from -ICANON -> * ICANON, then in the worst case scenario we're in the middle * of a line but both ttydisc_read() and FIONREAD will search * for one of our line terminals. */ if ((t->c_lflag & ICANON) != (tp->t_termios.c_lflag & ICANON)) canonicalize = true; else if (tp->t_termios.c_cc[VEOF] != t->c_cc[VEOF] || tp->t_termios.c_cc[VEOL] != t->c_cc[VEOL]) canonicalize = true; /* Copy new non-device driver parameters. */ tp->t_termios.c_iflag = t->c_iflag; tp->t_termios.c_oflag = t->c_oflag; tp->t_termios.c_lflag = t->c_lflag; memcpy(&tp->t_termios.c_cc, t->c_cc, sizeof t->c_cc); ttydisc_optimize(tp); if (canonicalize) ttydisc_canonicalize(tp); if ((t->c_lflag & ICANON) == 0) { /* * When in non-canonical mode, wake up all * readers. Any partial input has already been * canonicalized above if we were in canonical mode. * VMIN and VTIME could also be adjusted. */ tty_wakeup(tp, FREAD); } /* * For packet mode: notify the PTY consumer that VSTOP * and VSTART may have been changed. */ if (tp->t_termios.c_iflag & IXON && tp->t_termios.c_cc[VSTOP] == CTRL('S') && tp->t_termios.c_cc[VSTART] == CTRL('Q')) ttydevsw_pktnotify(tp, TIOCPKT_DOSTOP); else ttydevsw_pktnotify(tp, TIOCPKT_NOSTOP); return (0); } case TIOCGETD: /* For compatibility - we only support TTYDISC. */ *(int *)data = TTYDISC; return (0); case TIOCGPGRP: if (!tty_is_ctty(tp, td->td_proc)) return (ENOTTY); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; return (0); case TIOCGSID: if (!tty_is_ctty(tp, td->td_proc)) return (ENOTTY); MPASS(tp->t_session); *(int *)data = tp->t_session->s_sid; return (0); case TIOCNOTTY: return (tty_drop_ctty(tp, td->td_proc)); case TIOCSCTTY: { struct proc *p = td->td_proc; /* XXX: This looks awful. */ tty_unlock(tp); sx_xlock(&proctree_lock); tty_lock(tp); if (!SESS_LEADER(p)) { /* Only the session leader may do this. */ sx_xunlock(&proctree_lock); return (EPERM); } if (tp->t_session != NULL && tp->t_session == p->p_session) { /* This is already our controlling TTY. */ sx_xunlock(&proctree_lock); return (0); } if (p->p_session->s_ttyp != NULL || (tp->t_session != NULL && tp->t_session->s_ttyvp != NULL && tp->t_session->s_ttyvp->v_type != VBAD)) { /* * There is already a relation between a TTY and * a session, or the caller is not the session * leader. * * Allow the TTY to be stolen when the vnode is * invalid, but the reference to the TTY is * still active. This allows immediate reuse of * TTYs of which the session leader has been * killed or the TTY revoked. */ sx_xunlock(&proctree_lock); return (EPERM); } /* Connect the session to the TTY. */ tp->t_session = p->p_session; tp->t_session->s_ttyp = tp; tp->t_sessioncnt++; /* Assign foreground process group. */ tp->t_pgrp = p->p_pgrp; PROC_LOCK(p); p->p_flag |= P_CONTROLT; PROC_UNLOCK(p); sx_xunlock(&proctree_lock); return (0); } case TIOCSPGRP: { struct pgrp *pg; /* * XXX: Temporarily unlock the TTY to locate the process * group. This code would be lot nicer if we would ever * decompose proctree_lock. 
*/ tty_unlock(tp); sx_slock(&proctree_lock); pg = pgfind(*(int *)data); if (pg != NULL) PGRP_UNLOCK(pg); if (pg == NULL || pg->pg_session != td->td_proc->p_session) { sx_sunlock(&proctree_lock); tty_lock(tp); return (EPERM); } tty_lock(tp); /* * Determine if this TTY is the controlling TTY after * relocking the TTY. */ if (!tty_is_ctty(tp, td->td_proc)) { sx_sunlock(&proctree_lock); return (ENOTTY); } tp->t_pgrp = pg; sx_sunlock(&proctree_lock); /* Wake up the background process groups. */ cv_broadcast(&tp->t_bgwait); return (0); } case TIOCFLUSH: { int flags = *(int *)data; if (flags == 0) flags = (FREAD|FWRITE); else flags &= (FREAD|FWRITE); tty_flush(tp, flags); return (0); } case TIOCDRAIN: /* Drain TTY output. */ return tty_drain(tp, 0); case TIOCGDRAINWAIT: *(int *)data = tp->t_drainwait; return (0); case TIOCSDRAINWAIT: error = priv_check(td, PRIV_TTY_DRAINWAIT); if (error == 0) tp->t_drainwait = *(int *)data; return (error); case TIOCCONS: /* Set terminal as console TTY. */ if (*(int *)data) { error = priv_check(td, PRIV_TTY_CONSOLE); if (error) return (error); error = constty_set(tp); } else { error = constty_clear(tp); } return (error); case TIOCGWINSZ: /* Obtain window size. */ *(struct winsize*)data = tp->t_winsize; return (0); case TIOCSWINSZ: /* Set window size. */ tty_set_winsize(tp, data); return (0); case TIOCEXCL: tp->t_flags |= TF_EXCLUDE; return (0); case TIOCNXCL: tp->t_flags &= ~TF_EXCLUDE; return (0); case TIOCSTOP: tp->t_flags |= TF_STOPPED; ttydevsw_pktnotify(tp, TIOCPKT_STOP); return (0); case TIOCSTART: tp->t_flags &= ~TF_STOPPED; tp->t_termios.c_lflag &= ~FLUSHO; ttydevsw_outwakeup(tp); ttydevsw_pktnotify(tp, TIOCPKT_START); return (0); case TIOCSTAT: tty_info(tp); return (0); case TIOCSTI: if ((fflag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI)) return (EPERM); if (!tty_is_ctty(tp, td->td_proc) && priv_check(td, PRIV_TTY_STI)) return (EACCES); ttydisc_rint(tp, *(char *)data, 0); ttydisc_rint_done(tp); return (0); } #ifdef COMPAT_43TTY return tty_ioctl_compat(tp, cmd, data, fflag, td); #else /* !COMPAT_43TTY */ return (ENOIOCTL); #endif /* COMPAT_43TTY */ } int tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td) { int error; tty_assert_locked(tp); if (tty_gone(tp)) return (ENXIO); error = ttydevsw_ioctl(tp, cmd, data, td); if (error == ENOIOCTL) error = tty_generic_ioctl(tp, cmd, data, fflag, td); return (error); } dev_t tty_udev(struct tty *tp) { if (tp->t_dev) return (dev2udev(tp->t_dev)); else return (NODEV); } int tty_checkoutq(struct tty *tp) { /* 256 bytes should be enough to print a log message. */ return (ttyoutq_bytesleft(&tp->t_outq) >= 256); } void tty_hiwat_in_block(struct tty *tp) { if ((tp->t_flags & TF_HIWAT_IN) == 0 && tp->t_termios.c_iflag & IXOFF && tp->t_termios.c_cc[VSTOP] != _POSIX_VDISABLE) { /* * Input flow control. Only enter the high watermark when we * can successfully store the VSTOP character. */ if (ttyoutq_write_nofrag(&tp->t_outq, &tp->t_termios.c_cc[VSTOP], 1) == 0) tp->t_flags |= TF_HIWAT_IN; } else { /* No input flow control. */ tp->t_flags |= TF_HIWAT_IN; } } void tty_hiwat_in_unblock(struct tty *tp) { if (tp->t_flags & TF_HIWAT_IN && tp->t_termios.c_iflag & IXOFF && tp->t_termios.c_cc[VSTART] != _POSIX_VDISABLE) { /* * Input flow control. Only leave the high watermark when we * can successfully store the VSTART character. */ if (ttyoutq_write_nofrag(&tp->t_outq, &tp->t_termios.c_cc[VSTART], 1) == 0) tp->t_flags &= ~TF_HIWAT_IN; } else { /* No input flow control. 
*/ tp->t_flags &= ~TF_HIWAT_IN; } if (!tty_gone(tp)) ttydevsw_inwakeup(tp); } /* * TTY hooks interface. */ static int ttyhook_defrint(struct tty *tp, char c, int flags) { if (ttyhook_rint_bypass(tp, &c, 1) != 1) return (-1); return (0); } int ttyhook_register(struct tty **rtp, struct proc *p, int fd, struct ttyhook *th, void *softc) { struct tty *tp; struct file *fp; struct cdev *dev; struct cdevsw *cdp; struct filedesc *fdp; cap_rights_t rights; int error, ref; /* Validate the file descriptor. */ /* * XXX this code inspects a file descriptor from a different process, * but there is no dedicated routine to do it in fd code, making the * ordeal highly questionable. */ fdp = p->p_fd; FILEDESC_SLOCK(fdp); error = fget_cap_noref(fdp, fd, cap_rights_init_one(&rights, CAP_TTYHOOK), &fp, NULL); if (error == 0 && !fhold(fp)) error = EBADF; FILEDESC_SUNLOCK(fdp); if (error != 0) return (error); if (fp->f_ops == &badfileops) { error = EBADF; goto done1; } /* * Make sure the vnode is bound to a character device. * Unlocked check for the vnode type is ok there, because we * only shall prevent calling devvn_refthread on the file that * never has been opened over a character device. */ if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VCHR) { error = EINVAL; goto done1; } /* Make sure it is a TTY. */ cdp = devvn_refthread(fp->f_vnode, &dev, &ref); if (cdp == NULL) { error = ENXIO; goto done1; } if (dev != fp->f_data) { error = ENXIO; goto done2; } if (cdp != &ttydev_cdevsw) { error = ENOTTY; goto done2; } tp = dev->si_drv1; /* Try to attach the hook to the TTY. */ error = EBUSY; tty_lock(tp); MPASS((tp->t_hook == NULL) == ((tp->t_flags & TF_HOOK) == 0)); if (tp->t_flags & TF_HOOK) goto done3; tp->t_flags |= TF_HOOK; tp->t_hook = th; tp->t_hooksoftc = softc; *rtp = tp; error = 0; /* Maybe we can switch into bypass mode now. */ ttydisc_optimize(tp); /* Silently convert rint() calls to rint_bypass() when possible. */ if (!ttyhook_hashook(tp, rint) && ttyhook_hashook(tp, rint_bypass)) th->th_rint = ttyhook_defrint; done3: tty_unlock(tp); done2: dev_relthread(dev, ref); done1: fdrop(fp, curthread); return (error); } void ttyhook_unregister(struct tty *tp) { tty_assert_locked(tp); MPASS(tp->t_flags & TF_HOOK); /* Disconnect the hook. */ tp->t_flags &= ~TF_HOOK; tp->t_hook = NULL; /* Maybe we need to leave bypass mode. */ ttydisc_optimize(tp); /* Maybe deallocate the TTY as well. */ tty_rel_free(tp); } /* * /dev/console handling. */ static int ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct tty *tp; /* System has no console device. */ if (dev_console_filename == NULL) return (ENXIO); /* Look up corresponding TTY by device name. */ sx_slock(&tty_list_sx); TAILQ_FOREACH(tp, &tty_list, t_list) { if (strcmp(dev_console_filename, tty_devname(tp)) == 0) { dev_console->si_drv1 = tp; break; } } sx_sunlock(&tty_list_sx); /* System console has no TTY associated. */ if (dev_console->si_drv1 == NULL) return (ENXIO); return (ttydev_open(dev, oflags, devtype, td)); } static int ttyconsdev_write(struct cdev *dev, struct uio *uio, int ioflag) { log_console(uio); return (ttydev_write(dev, uio, ioflag)); } /* * /dev/console is a little different than normal TTY's. When opened, * it determines which TTY to use. When data gets written to it, it * will be logged in the kernel message buffer. 
*/ static struct cdevsw ttyconsdev_cdevsw = { .d_version = D_VERSION, .d_open = ttyconsdev_open, .d_close = ttydev_close, .d_read = ttydev_read, .d_write = ttyconsdev_write, .d_ioctl = ttydev_ioctl, .d_kqfilter = ttydev_kqfilter, .d_poll = ttydev_poll, .d_mmap = ttydev_mmap, .d_name = "ttyconsdev", .d_flags = D_TTY, }; static void ttyconsdev_init(void *unused __unused) { dev_console = make_dev_credf(MAKEDEV_ETERNAL, &ttyconsdev_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "console"); } SYSINIT(tty, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyconsdev_init, NULL); void ttyconsdev_select(const char *name) { dev_console_filename = name; } /* * Debugging routines. */ #include "opt_ddb.h" #ifdef DDB #include #include static const struct { int flag; char val; } ttystates[] = { #if 0 { TF_NOPREFIX, 'N' }, #endif { TF_INITLOCK, 'I' }, { TF_CALLOUT, 'C' }, /* Keep these together -> 'Oi' and 'Oo'. */ { TF_OPENED, 'O' }, { TF_OPENED_IN, 'i' }, { TF_OPENED_OUT, 'o' }, { TF_OPENED_CONS, 'c' }, { TF_GONE, 'G' }, { TF_OPENCLOSE, 'B' }, { TF_ASYNC, 'Y' }, { TF_LITERAL, 'L' }, /* Keep these together -> 'Hi' and 'Ho'. */ { TF_HIWAT, 'H' }, { TF_HIWAT_IN, 'i' }, { TF_HIWAT_OUT, 'o' }, { TF_STOPPED, 'S' }, { TF_EXCLUDE, 'X' }, { TF_BYPASS, 'l' }, { TF_ZOMBIE, 'Z' }, { TF_HOOK, 's' }, /* Keep these together -> 'bi' and 'bo'. */ { TF_BUSY, 'b' }, { TF_BUSY_IN, 'i' }, { TF_BUSY_OUT, 'o' }, { 0, '\0'}, }; #define TTY_FLAG_BITS \ "\20\1NOPREFIX\2INITLOCK\3CALLOUT\4OPENED_IN" \ "\5OPENED_OUT\6OPENED_CONS\7GONE\10OPENCLOSE" \ "\11ASYNC\12LITERAL\13HIWAT_IN\14HIWAT_OUT" \ "\15STOPPED\16EXCLUDE\17BYPASS\20ZOMBIE" \ "\21HOOK\22BUSY_IN\23BUSY_OUT" #define DB_PRINTSYM(name, addr) \ db_printf("%s " #name ": ", sep); \ db_printsym((db_addr_t) addr, DB_STGY_ANY); \ db_printf("\n"); static void _db_show_devsw(const char *sep, const struct ttydevsw *tsw) { db_printf("%sdevsw: ", sep); db_printsym((db_addr_t)tsw, DB_STGY_ANY); db_printf(" (%p)\n", tsw); DB_PRINTSYM(open, tsw->tsw_open); DB_PRINTSYM(close, tsw->tsw_close); DB_PRINTSYM(outwakeup, tsw->tsw_outwakeup); DB_PRINTSYM(inwakeup, tsw->tsw_inwakeup); DB_PRINTSYM(ioctl, tsw->tsw_ioctl); DB_PRINTSYM(param, tsw->tsw_param); DB_PRINTSYM(modem, tsw->tsw_modem); DB_PRINTSYM(mmap, tsw->tsw_mmap); DB_PRINTSYM(pktnotify, tsw->tsw_pktnotify); DB_PRINTSYM(free, tsw->tsw_free); } static void _db_show_hooks(const char *sep, const struct ttyhook *th) { db_printf("%shook: ", sep); db_printsym((db_addr_t)th, DB_STGY_ANY); db_printf(" (%p)\n", th); if (th == NULL) return; DB_PRINTSYM(rint, th->th_rint); DB_PRINTSYM(rint_bypass, th->th_rint_bypass); DB_PRINTSYM(rint_done, th->th_rint_done); DB_PRINTSYM(rint_poll, th->th_rint_poll); DB_PRINTSYM(getc_inject, th->th_getc_inject); DB_PRINTSYM(getc_capture, th->th_getc_capture); DB_PRINTSYM(getc_poll, th->th_getc_poll); DB_PRINTSYM(close, th->th_close); } static void _db_show_termios(const char *name, const struct termios *t) { db_printf("%s: iflag 0x%x oflag 0x%x cflag 0x%x " "lflag 0x%x ispeed %u ospeed %u\n", name, t->c_iflag, t->c_oflag, t->c_cflag, t->c_lflag, t->c_ispeed, t->c_ospeed); } /* DDB command to show TTY statistics. */ DB_SHOW_COMMAND(tty, db_show_tty) { struct tty *tp; if (!have_addr) { db_printf("usage: show tty \n"); return; } tp = (struct tty *)addr; db_printf("%p: %s\n", tp, tty_devname(tp)); db_printf("\tmtx: %p\n", tp->t_mtx); db_printf("\tflags: 0x%b\n", tp->t_flags, TTY_FLAG_BITS); db_printf("\trevokecnt: %u\n", tp->t_revokecnt); /* Buffering mechanisms. 
*/ db_printf("\tinq: %p begin %u linestart %u reprint %u end %u " "nblocks %u quota %u\n", &tp->t_inq, tp->t_inq.ti_begin, tp->t_inq.ti_linestart, tp->t_inq.ti_reprint, tp->t_inq.ti_end, tp->t_inq.ti_nblocks, tp->t_inq.ti_quota); db_printf("\toutq: %p begin %u end %u nblocks %u quota %u\n", &tp->t_outq, tp->t_outq.to_begin, tp->t_outq.to_end, tp->t_outq.to_nblocks, tp->t_outq.to_quota); db_printf("\tinlow: %zu\n", tp->t_inlow); db_printf("\toutlow: %zu\n", tp->t_outlow); _db_show_termios("\ttermios", &tp->t_termios); db_printf("\twinsize: row %u col %u xpixel %u ypixel %u\n", tp->t_winsize.ws_row, tp->t_winsize.ws_col, tp->t_winsize.ws_xpixel, tp->t_winsize.ws_ypixel); db_printf("\tcolumn: %u\n", tp->t_column); db_printf("\twritepos: %u\n", tp->t_writepos); db_printf("\tcompatflags: 0x%x\n", tp->t_compatflags); /* Init/lock-state devices. */ _db_show_termios("\ttermios_init_in", &tp->t_termios_init_in); _db_show_termios("\ttermios_init_out", &tp->t_termios_init_out); _db_show_termios("\ttermios_lock_in", &tp->t_termios_lock_in); _db_show_termios("\ttermios_lock_out", &tp->t_termios_lock_out); /* Hooks */ _db_show_devsw("\t", tp->t_devsw); _db_show_hooks("\t", tp->t_hook); /* Process info. */ db_printf("\tpgrp: %p gid %d\n", tp->t_pgrp, tp->t_pgrp ? tp->t_pgrp->pg_id : 0); db_printf("\tsession: %p", tp->t_session); if (tp->t_session != NULL) db_printf(" count %u leader %p tty %p sid %d login %s", tp->t_session->s_count, tp->t_session->s_leader, tp->t_session->s_ttyp, tp->t_session->s_sid, tp->t_session->s_login); db_printf("\n"); db_printf("\tsessioncnt: %u\n", tp->t_sessioncnt); db_printf("\tdevswsoftc: %p\n", tp->t_devswsoftc); db_printf("\thooksoftc: %p\n", tp->t_hooksoftc); db_printf("\tdev: %p\n", tp->t_dev); } /* DDB command to list TTYs. */ DB_SHOW_ALL_COMMAND(ttys, db_show_all_ttys) { struct tty *tp; size_t isiz, osiz; int i, j; /* Make the output look like `pstat -t'. */ db_printf("PTR "); #if defined(__LP64__) db_printf(" "); #endif db_printf(" LINE INQ CAN LIN LOW OUTQ USE LOW " "COL SESS PGID STATE\n"); TAILQ_FOREACH(tp, &tty_list, t_list) { isiz = tp->t_inq.ti_nblocks * TTYINQ_DATASIZE; osiz = tp->t_outq.to_nblocks * TTYOUTQ_DATASIZE; db_printf("%p %10s %5zu %4u %4u %4zu %5zu %4u %4zu %5u %5d " "%5d ", tp, tty_devname(tp), isiz, tp->t_inq.ti_linestart - tp->t_inq.ti_begin, tp->t_inq.ti_end - tp->t_inq.ti_linestart, isiz - tp->t_inlow, osiz, tp->t_outq.to_end - tp->t_outq.to_begin, osiz - tp->t_outlow, MIN(tp->t_column, 99999), tp->t_session ? tp->t_session->s_sid : 0, tp->t_pgrp ? tp->t_pgrp->pg_id : 0); /* Flag bits. */ for (i = j = 0; ttystates[i].flag; i++) if (tp->t_flags & ttystates[i].flag) { db_printf("%c", ttystates[i].val); j++; } if (j == 0) db_printf("-"); db_printf("\n"); } } #endif /* DDB */ diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c index 4a3b3d77c89e..d629fa0e7593 100644 --- a/sys/kern/tty_pts.c +++ b/sys/kern/tty_pts.c @@ -1,869 +1,869 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2008 Ed Schouten * All rights reserved. * * Portions of this software were developed under sponsorship from Snow * B.V., the Netherlands. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include /* Add compatibility bits for FreeBSD. */ #define PTS_COMPAT /* Add pty(4) compat bits. */ #define PTS_EXTERNAL /* Add bits to make Linux binaries work. */ #define PTS_LINUX #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Our utmp(5) format is limited to 8-byte TTY line names. This means * we can at most allocate 1000 pseudo-terminals ("pts/999"). Allow * users to increase this number, assuming they have manually increased * UT_LINESIZE. */ static struct unrhdr *pts_pool; static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device"); /* * Per-PTS structure. * * List of locks * (t) locked by tty_lock() * (c) const until freeing */ struct pts_softc { int pts_unit; /* (c) Device unit number. */ unsigned int pts_flags; /* (t) Device flags. */ #define PTS_PKT 0x1 /* Packet mode. */ #define PTS_FINISHED 0x2 /* Return errors on read()/write(). */ char pts_pkt; /* (t) Unread packet mode data. */ struct cv pts_inwait; /* (t) Blocking write() on master. */ struct selinfo pts_inpoll; /* (t) Select queue for write(). */ struct cv pts_outwait; /* (t) Blocking read() on master. */ struct selinfo pts_outpoll; /* (t) Select queue for read(). */ #ifdef PTS_EXTERNAL struct cdev *pts_cdev; /* (c) Master device node. */ #endif /* PTS_EXTERNAL */ struct ucred *pts_cred; /* (c) Resource limit. */ }; /* * Controller-side file operations. */ static int ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; char pkt; if (uio->uio_resid == 0) return (0); tty_lock(tp); for (;;) { /* * Implement packet mode. When packet mode is turned on, * the first byte contains a bitmask of events that * occurred (start, stop, flush, window size, etc). */ if (psc->pts_flags & PTS_PKT && psc->pts_pkt) { pkt = psc->pts_pkt; psc->pts_pkt = 0; tty_unlock(tp); error = ureadc(pkt, uio); return (error); } /* * Transmit regular data. * * XXX: We shouldn't use ttydisc_getc_poll()! Even * though in this implementation, there is likely going * to be data, we should just call ttydisc_getc_uio() * and use its return value to sleep. */ if (ttydisc_getc_poll(tp)) { if (psc->pts_flags & PTS_PKT) { /* * XXX: Small race. Fortunately PTY * consumers aren't multithreaded. 
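 * The tty lock is dropped to copy the TIOCPKT_DATA header byte out to
 * userspace; another reader could drain the queue before the lock is
 * retaken, so the header might not be followed by the data it announced.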
*/ tty_unlock(tp); error = ureadc(TIOCPKT_DATA, uio); if (error) return (error); tty_lock(tp); } error = ttydisc_getc_uio(tp, uio); break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) break; /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; break; } error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx); if (error != 0) break; } tty_unlock(tp); return (error); } static int ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); char ib[256], *ibstart; size_t iblen, rintlen; int error = 0; if (uio->uio_resid == 0) return (0); for (;;) { ibstart = ib; iblen = MIN(uio->uio_resid, sizeof ib); error = uiomove(ib, iblen, uio); tty_lock(tp); if (error != 0) { iblen = 0; goto done; } /* * When possible, avoid the slow path. rint_bypass() * copies all input to the input queue at once. */ MPASS(iblen > 0); do { rintlen = ttydisc_rint_simple(tp, ibstart, iblen); ibstart += rintlen; iblen -= rintlen; if (iblen == 0) { /* All data written. */ break; } /* Maybe the device isn't used anyway. */ if (psc->pts_flags & PTS_FINISHED) { error = EIO; goto done; } /* Wait for more data. */ if (fp->f_flag & O_NONBLOCK) { error = EWOULDBLOCK; goto done; } /* Wake up users on the slave side. */ ttydisc_rint_done(tp); error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx); if (error != 0) goto done; } while (iblen > 0); if (uio->uio_resid == 0) break; tty_unlock(tp); } done: ttydisc_rint_done(tp); tty_unlock(tp); /* * Don't account for the part of the buffer that we couldn't * pass to the TTY. */ uio->uio_resid += iblen; return (error); } static int ptsdev_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0, sig; switch (cmd) { case FIODTYPE: *(int *)data = D_TTY; return (0); case FIONBIO: /* This device supports non-blocking operation. */ return (0); case FIONREAD: tty_lock(tp); *(int *)data = ttydisc_getc_poll(tp); tty_unlock(tp); return (0); case FIODGNAME: #ifdef COMPAT_FREEBSD32 case FIODGNAME_32: #endif { struct fiodgname_arg *fgn; const char *p; int i; /* Reverse device name lookups, for ptsname() and ttyname(). */ fgn = data; p = tty_devname(tp); i = strlen(p) + 1; if (i > fgn->len) return (EINVAL); return (copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i)); } /* * We need to implement TIOCGPGRP and TIOCGSID here again. When * called on the pseudo-terminal master, it should not check if * the terminal is the foreground terminal of the calling * process. * * TIOCGETA is also implemented here. Various Linux PTY routines * often call isatty(), which is implemented by tcgetattr(). */ #ifdef PTS_LINUX case TIOCGETA: /* Obtain terminal flags through tcgetattr(). */ tty_lock(tp); *(struct termios*)data = tp->t_termios; tty_unlock(tp); return (0); #endif /* PTS_LINUX */ case TIOCSETAF: case TIOCSETAW: /* * We must make sure we turn tcsetattr() calls of TCSAFLUSH and * TCSADRAIN into something different. If an application would * call TCSAFLUSH or TCSADRAIN on the master descriptor, it may * deadlock waiting for all data to be read. */ cmd = TIOCSETA; break; #if defined(PTS_COMPAT) || defined(PTS_LINUX) case TIOCGPTN: /* * Get the device unit number. 
*/ if (psc->pts_unit < 0) return (ENOTTY); *(unsigned int *)data = psc->pts_unit; return (0); #endif /* PTS_COMPAT || PTS_LINUX */ case TIOCGPGRP: /* Get the foreground process group ID. */ tty_lock(tp); if (tp->t_pgrp != NULL) *(int *)data = tp->t_pgrp->pg_id; else *(int *)data = NO_PID; tty_unlock(tp); return (0); case TIOCGSID: /* Get the session leader process ID. */ tty_lock(tp); if (tp->t_session == NULL) error = ENOTTY; else *(int *)data = tp->t_session->s_sid; tty_unlock(tp); return (error); case TIOCPTMASTER: /* Yes, we are a pseudo-terminal master. */ return (0); case TIOCSIG: /* Signal the foreground process group. */ sig = *(int *)data; if (sig < 1 || sig >= NSIG) return (EINVAL); tty_lock(tp); tty_signal_pgrp(tp, sig); tty_unlock(tp); return (0); case TIOCPKT: /* Enable/disable packet mode. */ tty_lock(tp); if (*(int *)data) psc->pts_flags |= PTS_PKT; else psc->pts_flags &= ~PTS_PKT; tty_unlock(tp); return (0); } /* Just redirect this ioctl to the slave device. */ tty_lock(tp); error = tty_ioctl(tp, cmd, data, fp->f_flag, td); tty_unlock(tp); if (error == ENOIOCTL) error = ENOTTY; return (error); } static int ptsdev_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int revents = 0; tty_lock(tp); if (psc->pts_flags & PTS_FINISHED) { /* Slave device is not opened. */ tty_unlock(tp); return ((events & (POLLIN|POLLRDNORM)) | POLLHUP); } if (events & (POLLIN|POLLRDNORM)) { /* See if we can getc something. */ if (ttydisc_getc_poll(tp) || (psc->pts_flags & PTS_PKT && psc->pts_pkt)) revents |= events & (POLLIN|POLLRDNORM); } if (events & (POLLOUT|POLLWRNORM)) { /* See if we can rint something. */ if (ttydisc_rint_poll(tp)) revents |= events & (POLLOUT|POLLWRNORM); } /* * No need to check for POLLHUP here. This device cannot be used * as a callout device, which means we always have a carrier, * because the master is. */ if (revents == 0) { /* * This code might look misleading, but the naming of * poll events on this side is the opposite of the slave * device. */ if (events & (POLLIN|POLLRDNORM)) selrecord(td, &psc->pts_outpoll); if (events & (POLLOUT|POLLWRNORM)) selrecord(td, &psc->pts_inpoll); } tty_unlock(tp); return (revents); } /* * kqueue support. 
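 * As with ptsdev_poll() above, the naming is inverted with respect to the
 * slave side: EVFILT_READ on the master is backed by pts_outpoll (data the
 * slave produced), EVFILT_WRITE by pts_inpoll (room for more input).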
*/ static void pts_kqops_read_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_outpoll.si_note, kn, 0); } static int pts_kqops_read_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_getc_poll(tp); return (kn->kn_data > 0); } } static void pts_kqops_write_detach(struct knote *kn) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); knlist_remove(&psc->pts_inpoll.si_note, kn, 0); } static int pts_kqops_write_event(struct knote *kn, long hint) { struct file *fp = kn->kn_fp; struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); if (psc->pts_flags & PTS_FINISHED) { kn->kn_flags |= EV_EOF; return (1); } else { kn->kn_data = ttydisc_rint_poll(tp); return (kn->kn_data > 0); } } -static struct filterops pts_kqops_read = { +static const struct filterops pts_kqops_read = { .f_isfd = 1, .f_detach = pts_kqops_read_detach, .f_event = pts_kqops_read_event, }; -static struct filterops pts_kqops_write = { +static const struct filterops pts_kqops_write = { .f_isfd = 1, .f_detach = pts_kqops_write_detach, .f_event = pts_kqops_write_event, }; static int ptsdev_kqfilter(struct file *fp, struct knote *kn) { struct tty *tp = fp->f_data; struct pts_softc *psc = tty_softc(tp); int error = 0; tty_lock(tp); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &pts_kqops_read; knlist_add(&psc->pts_outpoll.si_note, kn, 1); break; case EVFILT_WRITE: kn->kn_fop = &pts_kqops_write; knlist_add(&psc->pts_inpoll.si_note, kn, 1); break; default: error = EINVAL; break; } tty_unlock(tp); return (error); } static int ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct tty *tp = fp->f_data; #ifdef PTS_EXTERNAL struct pts_softc *psc = tty_softc(tp); #endif /* PTS_EXTERNAL */ struct cdev *dev = tp->t_dev; /* * According to POSIX, we must implement an fstat(). This also * makes this implementation compatible with Linux binaries, * because Linux calls fstat() on the pseudo-terminal master to * obtain st_rdev. * * XXX: POSIX also mentions we must fill in st_dev, but how? */ bzero(sb, sizeof *sb); #ifdef PTS_EXTERNAL if (psc->pts_cdev != NULL) sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev); else #endif /* PTS_EXTERNAL */ sb->st_ino = sb->st_rdev = tty_udev(tp); sb->st_atim = dev->si_atime; sb->st_ctim = dev->si_ctime; sb->st_mtim = dev->si_mtime; sb->st_uid = dev->si_uid; sb->st_gid = dev->si_gid; sb->st_mode = dev->si_mode | S_IFCHR; return (0); } static int ptsdev_close(struct file *fp, struct thread *td) { struct tty *tp = fp->f_data; /* Deallocate TTY device. */ tty_lock(tp); tty_rel_gone(tp); /* * Open of /dev/ptmx or /dev/ptyXX changes the type of file * from DTYPE_VNODE to DTYPE_PTS. vn_open() increases vnode * use count, we need to decrement it, and possibly do other * required cleanup. 
*/ if (fp->f_vnode != NULL) return (vnops.fo_close(fp, td)); return (0); } static int ptsdev_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct tty *tp; kif->kf_type = KF_TYPE_PTS; tp = fp->f_data; kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp); kif->kf_un.kf_pts.kf_pts_dev_freebsd11 = kif->kf_un.kf_pts.kf_pts_dev; /* truncate */ strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path)); return (0); } -static struct fileops ptsdev_ops = { +static const struct fileops ptsdev_ops = { .fo_read = ptsdev_read, .fo_write = ptsdev_write, .fo_truncate = invfo_truncate, .fo_ioctl = ptsdev_ioctl, .fo_poll = ptsdev_poll, .fo_kqfilter = ptsdev_kqfilter, .fo_stat = ptsdev_stat, .fo_close = ptsdev_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ptsdev_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE, }; /* * Driver-side hooks. */ static void ptsdrv_outwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_outwait); selwakeup(&psc->pts_outpoll); KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0); } static void ptsdrv_inwakeup(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); cv_broadcast(&psc->pts_inwait); selwakeup(&psc->pts_inpoll); KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0); } static int ptsdrv_open(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); psc->pts_flags &= ~PTS_FINISHED; return (0); } static void ptsdrv_close(struct tty *tp) { struct pts_softc *psc = tty_softc(tp); /* Wake up any blocked readers/writers. */ psc->pts_flags |= PTS_FINISHED; ptsdrv_outwakeup(tp); ptsdrv_inwakeup(tp); } static void ptsdrv_pktnotify(struct tty *tp, char event) { struct pts_softc *psc = tty_softc(tp); /* * Clear conflicting flags. */ switch (event) { case TIOCPKT_STOP: psc->pts_pkt &= ~TIOCPKT_START; break; case TIOCPKT_START: psc->pts_pkt &= ~TIOCPKT_STOP; break; case TIOCPKT_NOSTOP: psc->pts_pkt &= ~TIOCPKT_DOSTOP; break; case TIOCPKT_DOSTOP: psc->pts_pkt &= ~TIOCPKT_NOSTOP; break; } psc->pts_pkt |= event; ptsdrv_outwakeup(tp); } static void ptsdrv_free(void *softc) { struct pts_softc *psc = softc; /* Make device number available again. */ if (psc->pts_unit >= 0) free_unr(pts_pool, psc->pts_unit); chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0); racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1); crfree(psc->pts_cred); seldrain(&psc->pts_inpoll); seldrain(&psc->pts_outpoll); knlist_destroy(&psc->pts_inpoll.si_note); knlist_destroy(&psc->pts_outpoll.si_note); #ifdef PTS_EXTERNAL /* Destroy master device as well. */ if (psc->pts_cdev != NULL) destroy_dev_sched(psc->pts_cdev); #endif /* PTS_EXTERNAL */ free(psc, M_PTS); } static struct ttydevsw pts_class = { .tsw_flags = TF_NOPREFIX, .tsw_outwakeup = ptsdrv_outwakeup, .tsw_inwakeup = ptsdrv_inwakeup, .tsw_open = ptsdrv_open, .tsw_close = ptsdrv_close, .tsw_pktnotify = ptsdrv_pktnotify, .tsw_free = ptsdrv_free, }; #ifndef PTS_EXTERNAL static #endif /* !PTS_EXTERNAL */ int pts_alloc(int fflags, struct thread *td, struct file *fp) { int unit, ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Try to allocate a new pts unit number. 
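 * If the pool is exhausted, the racct and per-uid pts accounting charged
 * above is rolled back before returning EAGAIN.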
*/ unit = alloc_unr(pts_pool); if (unit < 0) { racct_sub(p, RACCT_NPTS, 1); chgptscnt(cred->cr_ruidinfo, -1, 0); return (EAGAIN); } /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = unit; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #ifdef PTS_EXTERNAL int pts_alloc_external(int fflags, struct thread *td, struct file *fp, struct cdev *dev, const char *name) { int ok, error; struct tty *tp; struct pts_softc *psc; struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; /* Resource limiting. */ PROC_LOCK(p); error = racct_add(p, RACCT_NPTS, 1); if (error != 0) { PROC_UNLOCK(p); return (EAGAIN); } ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS)); if (!ok) { racct_sub(p, RACCT_NPTS, 1); PROC_UNLOCK(p); return (EAGAIN); } PROC_UNLOCK(p); /* Allocate TTY and softc. */ psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO); cv_init(&psc->pts_inwait, "ptsin"); cv_init(&psc->pts_outwait, "ptsout"); psc->pts_unit = -1; psc->pts_cdev = dev; psc->pts_cred = crhold(cred); tp = tty_alloc(&pts_class, psc); knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx); knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx); /* Expose the slave device as well. */ tty_makedev(tp, td->td_ucred, "%s", name); finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops); return (0); } #endif /* PTS_EXTERNAL */ int sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap) { int error, fd; struct file *fp; /* * POSIX states it's unspecified when other flags are passed. We * don't allow this. */ if (uap->flags & ~(O_RDWR|O_NOCTTY|O_CLOEXEC)) return (EINVAL); error = falloc(td, &fp, &fd, uap->flags); if (error) return (error); /* Allocate the actual pseudo-TTY. */ error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp); if (error != 0) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } /* Pass it back to userspace. */ td->td_retval[0] = fd; fdrop(fp, td); return (0); } static void pts_init(void *unused) { pts_pool = new_unrhdr(0, INT_MAX, NULL); } SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL); diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c index 5fcabbac7923..7dd0f9796682 100644 --- a/sys/kern/uipc_mqueue.c +++ b/sys/kern/uipc_mqueue.c @@ -1,2938 +1,2938 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2005 David Xu * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include "opt_capsicum.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support"); /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; const void *mn_pr_root; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; uint32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; 
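/*
 * Illustrative only: a minimal userland sketch of the mq_*() API that this
 * file implements, kept under "#if 0" like the other non-built fragments in
 * this file.  The queue name, sizes and (lack of) error handling are
 * arbitrary; on FreeBSD the program links against librt and the mqueuefs
 * support must be available.
 */
#if 0
#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct mq_attr attr = { .mq_maxmsg = 10, .mq_msgsize = 128 };
	char buf[128];		/* must be at least mq_msgsize bytes */
	unsigned int prio;
	mqd_t mqd;

	mqd = mq_open("/demo", O_CREAT | O_RDWR, 0600, &attr);
	if (mqd == (mqd_t)-1)
		return (1);
	(void)mq_send(mqd, "hello", strlen("hello") + 1, 1);
	(void)mq_receive(mqd, buf, sizeof(buf), &prio);
	printf("%s (prio %u)\n", buf, prio);
	mq_close(mqd);
	mq_unlink("/demo");
	return (0);
}
#endif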
int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; -static struct fileops mqueueops; +static const struct fileops mqueueops; static unsigned mqfs_osd_jail_slot; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); static int mqfs_prison_remove(void *obj, void *data); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); -struct filterops mq_rfiltops = { +static const struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, }; -struct filterops mq_wfiltops = { +static const struct filterops mq_wfiltops = { .f_isfd = 
1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return (uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO)); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_add_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. 
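 * entries: besides its own initial reference a directory is referenced by
 * its '.' and '..' children, so once the count falls back to those implicit
 * references the node can be destroyed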
*/ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_pr_root = cred->cr_prison->pr_root; node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! 
LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = mqfs_prison_remove, }; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); mqfs_osd_jail_slot = osd_jail_register(NULL, methods); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); osd_jail_deregister(mqfs_osd_jail_slot); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp); VOP_UNLOCK(vp); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; 
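	/*
	 * Look for an existing vnode for this node on the same mount and
	 * vget() it if found; otherwise allocate a fresh vnode, attach it to
	 * the mount and re-check under mi_lock in case another thread
	 * created one while we were blocked.
	 */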
sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked. */ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); vn_set_state(*vpp, VSTATE_CONSTRUCTED); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred) { struct mqfs_node *pn; const void *pr_root; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); pr_root = cred->cr_prison->pr_root; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { /* Only match names within the same prison root directory */ if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) && strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. 
*/ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; int nameiop, flags, error, namelen; char *pname; struct thread *td; td = curthread; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen, cnp->cn_cred); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? 
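 * For the last component of a CREATE or RENAME lookup with LOCKPARENT set,
 * verify write access on the directory and return EJUSTRETURN so the caller
 * may create the entry; otherwise report ENOENT.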
*/ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); uma_zfree(mvdata_zone, vd); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; struct file *a_fp; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { 
struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if (vap->va_type != VNON || vap->va_nlink != VNOVAL || vap->va_fsid != VNOVAL || vap->va_fileid != VNOVAL || vap->va_blocksize != VNOVAL || (vap->va_flags != VNOVAL && vap->va_flags != 0) || vap->va_rdev != VNOVAL || (int)vap->va_bytes != VNOVAL || vap->va_gen != VNOVAL) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if ((ap->a_cred->cr_uid != pn->mn_uid || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if (ap->a_cred->cr_uid != pn->mn_uid && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). 
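 * Setting timestamps normally requires VADMIN (owner), but a utimes()-style
 * call with a NULL times pointer (VA_UTIMES_NULL) is also permitted with
 * mere write access.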
*/ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; uint64_t **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; const void *pr_root; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; pr_root = ap->a_cred->cr_prison->pr_root; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); /* * Only show names within the same prison root directory * (or not associated with a prison, e.g. "." and ".."). */ if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root) continue; if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; entry.d_off = offset + entry.d_reclen; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } dirent_terminate(&entry); if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. 
*/ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * See if this prison root is obsolete, and clean up associated queues if it is. */ static int mqfs_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; struct prison *tpr; struct mqfs_node *pn, *tpn; struct vnode *pr_root; pr_root = pr->pr_root; if (pr->pr_parent->pr_root == pr_root) return (0); TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr != pr && tpr->pr_root == pr_root) return (0); } /* * No jails are rooted in this directory anymore, * so no queues should be either. 
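 * Walk the root directory and unlink every queue whose node was created
 * under this prison's root vnode.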
*/ sx_xlock(&mqfs_data.mi_lock); LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, mn_sibling, tpn) { if (pn->mn_pr_root == pr_root) (void)do_unlink(pn, curthread->td_ucred); } sx_xunlock(&mqfs_data.mi_lock); return (0); } /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); seldrain(&mq->mq_rsel); seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. If waitok is false, the thread will not be * blocked if the queue is full; otherwise, the absolute * timeout will be checked.
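 *
 * Illustrative sketch (hedged: the mq_timedsend() wrapper and the variable
 * names below are assumptions for the example, not taken from this file).
 * Userland normally reaches this routine through mq_timedsend(), which ends
 * up in the kmq_timedsend() syscall handled later in this file, supplying
 * the absolute deadline that the loop below keeps re-checking:
 *
 *	struct timespec abs;
 *	clock_gettime(CLOCK_REALTIME, &abs);	-- absolute deadline, not a delta
 *	abs.tv_sec += 5;			-- give up five seconds from now
 *	if (mq_timedsend(mqd, buf, len, prio, &abs) == -1 &&
 *	    errno == ETIMEDOUT)
 *		warn("queue stayed full for five seconds");
 *
 * The deadline is re-evaluated with getnanotime() after every wakeup, so a
 * spurious wakeup merely shortens the remaining sleep.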
*/ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send a realtime signal to the process which registered itself * successfully with mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct thread *td; struct proc *p; int error; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; error = sigev_findtd(p, &nt->nt_sigev, &td); if (error) { mq->mq_notifier = NULL; return; } if (!KSI_ONQ(&nt->nt_ksi)) { ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev); tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi); } PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. If waitok is false, the thread will not be * blocked if there is no data in the queue; otherwise, the absolute * timeout will be checked.
*/ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } static int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char path[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct pwddesc *pdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); pdp = td->td_proc->p_pd; cmode = ((mode & 
~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT; mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); /* * "." and ".." are magic directories, populated on the fly, and cannot * be opened as queues. */ if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(td, fp, fd); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int sys_kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. 
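 *
 * Illustrative note (hedged; the userland calls and names below are
 * assumptions for the example only): as with POSIX mq_unlink(), removing
 * the name does not destroy a queue that is still open.  The node is only
 * released once the last descriptor obtained through kmq_open() is closed,
 * so a sequence such as
 *
 *	mqd_t q = mq_open("/log", O_RDONLY);
 *	mq_unlink("/log");		-- name disappears from mqueuefs
 *	mq_receive(q, buf, sz, NULL);	-- still works until mq_close(q)
 *
 * keeps operating on the already-open descriptor.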
*/ int sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, rightsp, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_event_rights, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_read_rights, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_write_rights, fget_write, fpp, ppn, pmq); } static int kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr, struct mq_attr *oattr) { struct mqueue *mq; struct file *fp; u_int oflag, flag; int error; AUDIT_ARG_FD(mqd); if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0) return (EINVAL); error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); oattr->mq_maxmsg = mq->mq_maxmsg; oattr->mq_msgsize = mq->mq_msgsize; oattr->mq_curmsgs = mq->mq_curmsgs; if (attr != NULL) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr->mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr->mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); return (error); } int sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mq_attr attr, oattr; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error != 0) return (error); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? 
&attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { bzero(oattr.__reserved, sizeof(oattr.__reserved)); error = copyout(&oattr, uap->oattr, sizeof(oattr)); } return (error); } int sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) goto out; abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) goto out; abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } static int kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev) { struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp, *fp2; struct mqueue_notifier *nt, *newnt = NULL; int error; AUDIT_ARG_FD(mqd); if (sigev != NULL) { if (sigev->sigev_notify != SIGEV_SIGNAL && sigev->sigev_notify != SIGEV_THREAD_ID && sigev->sigev_notify != SIGEV_NONE) return (EINVAL); if ((sigev->sigev_notify == SIGEV_SIGNAL || sigev->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(sigev->sigev_signo)) return (EINVAL); } p = td->td_proc; fdp = td->td_proc->p_fd; error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); fp2 = fget_noref(fdp, mqd); if (fp2 == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights(fdp, mqd), &cap_event_rights); if (error) { FILEDESC_SUNLOCK(fdp); goto out; } #endif if (fp2 != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = mqd; notifier_insert(p, nt); } nt->nt_sigev = *sigev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. 
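 *
 * For context, a hedged userland sketch of how such a notification is
 * typically armed (SIGUSR1 and the descriptor name are arbitrary
 * assumptions for the example):
 *
 *	struct sigevent ev = { 0 };
 *	ev.sigev_notify = SIGEV_SIGNAL;
 *	ev.sigev_signo = SIGUSR1;
 *	if (mq_notify(mqd, &ev) == -1)
 *		err(1, "mq_notify");
 *
 * If messages are already queued and no thread is blocked in receive,
 * the branch below delivers the signal right away instead of waiting
 * for the next send.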
*/ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } int sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev, *evp; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct mqueue *mq; #ifdef INVARIANTS struct filedesc *fdp; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); #endif if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_noref(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); sx_xlock(&mqfs_data.mi_lock); st->st_atim = pn->mn_atime; st->st_mtim = pn->mn_mtime; st->st_ctim = pn->mn_ctime; st->st_birthtim = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN, active_cred); if (error != 0) goto out; pn->mn_mode = mode & ACCESSPERMS; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); if (uid == (uid_t)-1) uid = pn->mn_uid; if (gid == (gid_t)-1) gid = pn->mn_gid; if (((uid != pn->mn_uid && uid != active_cred->cr_uid) || (gid != pn->mn_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, 
PRIV_VFS_CHOWN))) goto out; pn->mn_uid = uid; pn->mn_gid = gid; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static int mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_MQUEUE; return (0); } -static struct fileops mqueueops = { +static const struct fileops mqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close, .fo_chmod = mqf_chmod, .fo_chown = mqf_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = mqf_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE, }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; VFS_VOP_VECTOR_REGISTER(mqfs_vnodeops); static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; static struct vfsconf mqueuefs_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "mqueuefs", .vfc_vfsops = &mqfs_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_SYNTHETIC }; static struct syscall_helper_data mq_syscalls[] = { SYSCALL_INIT_HELPER(kmq_open), SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED), SYSCALL_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include static void mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } static void mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } int freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap) { struct 
mq_attr attr; struct mq_attr32 attr32; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error) return (error); mq_attr_from32(&attr32, &attr); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } int freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap) { struct mq_attr attr, oattr; struct mq_attr32 attr32, oattr32; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error != 0) return (error); mq_attr_from32(&attr32, &attr); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { mq_attr_to32(&oattr, &oattr32); bzero(oattr32.__reserved, sizeof(oattr32.__reserved)); error = copyout(&oattr32, uap->oattr, sizeof(oattr32)); } return (error); } int freebsd32_kmq_timedsend(struct thread *td, struct freebsd32_kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) goto out; CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int freebsd32_kmq_timedreceive(struct thread *td, struct freebsd32_kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) goto out; CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap) { struct sigevent ev, *evp; struct sigevent32 ev32; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev32, sizeof(ev32)); if (error != 0) return (error); error = convert_sigevent32(&ev32, &ev); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static struct syscall_helper_data mq32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_kmq_open), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink), SYSCALL_INIT_LAST }; #endif static int mqinit(void) { int error; error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int mqunload(void) { #ifdef 
COMPAT_FREEBSD32 syscall32_helper_unregister(mq32_syscalls); #endif syscall_helper_unregister(mq_syscalls); return (0); } static int mq_modload(struct module *module, int cmd, void *arg) { int error = 0; error = vfs_modevent(module, cmd, arg); if (error != 0) return (error); switch (cmd) { case MOD_LOAD: error = mqinit(); if (error != 0) mqunload(); break; case MOD_UNLOAD: error = mqunload(); break; default: break; } return (error); } static moduledata_t mqueuefs_mod = { "mqueuefs", mq_modload, &mqueuefs_vfsconf }; DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(mqueuefs, 1); diff --git a/sys/kern/uipc_sem.c b/sys/kern/uipc_sem.c index b4652e9106ac..35ca9a9fb06e 100644 --- a/sys/kern/uipc_sem.c +++ b/sys/kern/uipc_sem.c @@ -1,1109 +1,1109 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002 Alfred Perlstein * Copyright (c) 2003-2005 SPARTA, Inc. * Copyright (c) 2005, 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support"); /* * TODO * * - Resource limits? * - Replace global sem_lock with mtx_pool locks? * - Add a MAC check_create() hook for creating new named semaphores. 
*/ #ifndef SEM_MAX #define SEM_MAX 30 #endif #ifdef SEM_DEBUG #define DP(x) printf x #else #define DP(x) #endif struct ksem_mapping { char *km_path; Fnv32_t km_fnv; struct ksem *km_ksem; LIST_ENTRY(ksem_mapping) km_link; }; static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor"); static LIST_HEAD(, ksem_mapping) *ksem_dictionary; static struct sx ksem_dict_lock; static struct mtx ksem_count_lock; static struct mtx sem_lock; static u_long ksem_hash; static int ksem_dead; #define KSEM_HASH(fnv) (&ksem_dictionary[(fnv) & ksem_hash]) static int nsems = 0; SYSCTL_DECL(_p1003_1b); SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0, "Number of active kernel POSIX semaphores"); static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime); static int ksem_access(struct ksem *ks, struct ucred *ucred); static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value); static int ksem_create(struct thread *td, const char *path, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32); static void ksem_drop(struct ksem *ks); static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp); static struct ksem *ksem_hold(struct ksem *ks); static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks); static struct ksem *ksem_lookup(char *path, Fnv32_t fnv); static void ksem_module_destroy(void); static int ksem_module_init(void); static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static int sem_modload(struct module *module, int cmd, void *arg); static fo_stat_t ksem_stat; static fo_close_t ksem_closef; static fo_chmod_t ksem_chmod; static fo_chown_t ksem_chown; static fo_fill_kinfo_t ksem_fill_kinfo; /* File descriptor operations. */ -static struct fileops ksem_ops = { +static const struct fileops ksem_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = ksem_stat, .fo_close = ksem_closef, .fo_chmod = ksem_chmod, .fo_chown = ksem_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ksem_fill_kinfo, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE }; FEATURE(posix_sem, "POSIX semaphores"); static int ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct ksem *ks; #ifdef MAC int error; #endif ks = fp->f_data; #ifdef MAC error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a semaphore * file descriptor. 
*/ bzero(sb, sizeof(*sb)); mtx_lock(&sem_lock); sb->st_atim = ks->ks_atime; sb->st_ctim = ks->ks_ctime; sb->st_mtim = ks->ks_mtime; sb->st_birthtim = ks->ks_birthtime; sb->st_uid = ks->ks_uid; sb->st_gid = ks->ks_gid; sb->st_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); return (0); } static int ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setmode(active_cred, ks, mode); if (error != 0) goto out; #endif error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN, active_cred); if (error != 0) goto out; ks->ks_mode = mode & ACCESSPERMS; out: mtx_unlock(&sem_lock); return (error); } static int ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setowner(active_cred, ks, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = ks->ks_uid; if (gid == (gid_t)-1) gid = ks->ks_gid; if (((uid != ks->ks_uid && uid != active_cred->cr_uid) || (gid != ks->ks_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; ks->ks_uid = uid; ks->ks_gid = gid; out: mtx_unlock(&sem_lock); return (error); } static int ksem_closef(struct file *fp, struct thread *td) { struct ksem *ks; ks = fp->f_data; fp->f_data = NULL; ksem_drop(ks); return (0); } static int ksem_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { const char *path, *pr_path; struct ksem *ks; size_t pr_pathlen; kif->kf_type = KF_TYPE_SEM; ks = fp->f_data; mtx_lock(&sem_lock); kif->kf_un.kf_sem.kf_sem_value = ks->ks_value; kif->kf_un.kf_sem.kf_sem_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); if (ks->ks_path != NULL) { sx_slock(&ksem_dict_lock); if (ks->ks_path != NULL) { path = ks->ks_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); if (strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/') path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } sx_sunlock(&ksem_dict_lock); } return (0); } /* * ksem object management including creation and reference counting * routines. */ static struct ksem * ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value) { struct ksem *ks; mtx_lock(&ksem_count_lock); if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) { mtx_unlock(&ksem_count_lock); return (NULL); } nsems++; mtx_unlock(&ksem_count_lock); ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO); ks->ks_uid = ucred->cr_uid; ks->ks_gid = ucred->cr_gid; ks->ks_mode = mode; ks->ks_value = value; cv_init(&ks->ks_cv, "ksem"); vfs_timestamp(&ks->ks_birthtime); ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime; refcount_init(&ks->ks_ref, 1); #ifdef MAC mac_posixsem_init(ks); mac_posixsem_create(ucred, ks); #endif return (ks); } static struct ksem * ksem_hold(struct ksem *ks) { refcount_acquire(&ks->ks_ref); return (ks); } static void ksem_drop(struct ksem *ks) { if (refcount_release(&ks->ks_ref)) { #ifdef MAC mac_posixsem_destroy(ks); #endif cv_destroy(&ks->ks_cv); free(ks, M_KSEM); mtx_lock(&ksem_count_lock); nsems--; mtx_unlock(&ksem_count_lock); } } /* * Determine if the credentials have sufficient permissions for read * and write access. 
*/ static int ksem_access(struct ksem *ks, struct ucred *ucred) { int error; error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VREAD | VWRITE, ucred); if (error) error = priv_check_cred(ucred, PRIV_SEM_WRITE); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to semaphore objects. We use the FNV hash on the path to * store the mappings in a hash table. */ static struct ksem * ksem_lookup(char *path, Fnv32_t fnv) { struct ksem_mapping *map; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) return (map->km_ksem); } return (NULL); } static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks) { struct ksem_mapping *map; map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK); map->km_path = path; map->km_fnv = fnv; map->km_ksem = ksem_hold(ks); ks->ks_path = path; LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link); } static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct ksem_mapping *map; int error; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) { #ifdef MAC error = mac_posixsem_check_unlink(ucred, map->km_ksem); if (error) return (error); #endif error = ksem_access(map->km_ksem, ucred); if (error) return (error); map->km_ksem->ks_path = NULL; LIST_REMOVE(map, km_link); ksem_drop(map->km_ksem); free(map->km_path, M_KSEM); free(map, M_KSEM); return (0); } } return (ENOENT); } static int ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd, int compat32) { semid_t semid; #ifdef COMPAT_FREEBSD32 int32_t semid32; #endif void *ptr; size_t ptrs; #ifdef COMPAT_FREEBSD32 if (compat32) { semid32 = fd; ptr = &semid32; ptrs = sizeof(semid32); } else { #endif semid = fd; ptr = &semid; ptrs = sizeof(semid); compat32 = 0; /* silence gcc */ #ifdef COMPAT_FREEBSD32 } #endif return (copyout(ptr, semidp, ptrs)); } /* Other helper routines. */ static int ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32) { struct pwddesc *pdp; struct ksem *ks; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error, fd; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); AUDIT_ARG_VALUE(value); if (value > SEM_VALUE_MAX) return (EINVAL); pdp = td->td_proc->p_pd; mode = (mode & ~pdp->pd_cmask) & ACCESSPERMS; error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) { if (name == NULL) error = ENOSPC; return (error); } /* * Go ahead and copyout the file descriptor now. This is a bit * premature, but it is a lot easier to handle errors as opposed * to later when we've possibly created a new semaphore, etc. */ error = ksem_create_copyout_semid(td, semidp, fd, compat32); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } if (name == NULL) { /* Create an anonymous semaphore. */ ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENOSPC; else ks->ks_flags |= KS_ANONYMOUS; } else { path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); /* Require paths to start with a '/' character. 
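 *
 * (Hedged example, not taken from this file: a name such as "/mysem"
 * passes this check, while "mysem" is rejected with EINVAL below.  Past
 * the leading slash the name is only a key into the in-kernel dictionary,
 * not a filesystem path, and for jailed callers the prison path has
 * already been prepended above, so the check applies to the first
 * character the caller actually supplied.)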
*/ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_KSEM); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); ks = ksem_lookup(path, fnv); if (ks == NULL) { /* Object does not exist, create it if requested. */ if (flags & O_CREAT) { ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENFILE; else { ksem_insert(path, fnv, ks); path = NULL; } } else error = ENOENT; } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixsem_check_open(td->td_ucred, ks); if (error == 0) #endif error = ksem_access(ks, td->td_ucred); } if (error == 0) ksem_hold(ks); #ifdef INVARIANTS else ks = NULL; #endif } sx_xunlock(&ksem_dict_lock); if (path) free(path, M_KSEM); } if (error) { KASSERT(ks == NULL, ("ksem_create error with a ksem")); fdclose(td, fp, fd); fdrop(fp, td); return (error); } KASSERT(ks != NULL, ("ksem_create w/o a ksem")); finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops); fdrop(fp, td); return (0); } static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp) { struct ksem *ks; struct file *fp; int error; error = fget(td, id, rightsp, &fp); if (error) return (EINVAL); if (fp->f_type != DTYPE_SEM) { fdrop(fp, td); return (EINVAL); } ks = fp->f_data; if (ks->ks_flags & KS_DEAD) { fdrop(fp, td); return (EINVAL); } *fpp = fp; return (0); } /* System calls. */ #ifndef _SYS_SYSPROTO_H_ struct ksem_init_args { unsigned int value; semid_t *idp; }; #endif int sys_ksem_init(struct thread *td, struct ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_open_args { char *name; int oflag; mode_t mode; unsigned int value; semid_t *idp; }; #endif int sys_ksem_open(struct thread *td, struct ksem_open_args *uap) { DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid)); if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_unlink_args { char *name; }; #endif int sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); error = ksem_remove(path, fnv, td->td_ucred); sx_xunlock(&ksem_dict_lock); free(path, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_close_args { semid_t id; }; #endif int sys_ksem_close(struct thread *td, struct ksem_close_args *uap) { struct ksem *ks; struct file *fp; int error; /* No capability rights required to close a semaphore. 
*/ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, &cap_no_rights, &fp); if (error) return (error); ks = fp->f_data; if (ks->ks_flags & KS_ANONYMOUS) { fdrop(fp, td); return (EINVAL); } error = kern_close(td, uap->id); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_post_args { semid_t id; }; #endif int sys_ksem_post(struct thread *td, struct ksem_post_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init_one(&rights, CAP_SEM_POST), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks); if (error) goto err; #endif if (ks->ks_value == SEM_VALUE_MAX) { error = EOVERFLOW; goto err; } ++ks->ks_value; if (ks->ks_waiters > 0) cv_signal(&ks->ks_cv); error = 0; vfs_timestamp(&ks->ks_ctime); err: mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_wait_args { semid_t id; }; #endif int sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap) { return (kern_sem_wait(td, uap->id, 0, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_timedwait_args { semid_t id; const struct timespec *abstime; }; #endif int sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap) { struct timespec abstime; struct timespec *ts; int error; /* * We allow a null timespec (wait forever). */ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime, sizeof(abstime)); if (error != 0) return (error); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_trywait_args { semid_t id; }; #endif int sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap) { return (kern_sem_wait(td, uap->id, 1, NULL)); } static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime) { struct timespec ts1, ts2; struct timeval tv; cap_rights_t rights; struct file *fp; struct ksem *ks; int error; DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid)); AUDIT_ARG_FD(id); error = ksem_get(td, id, cap_rights_init_one(&rights, CAP_SEM_WAIT), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); DP((">>> kern_sem_wait critical section entered! 
pid=%d\n", (int)td->td_proc->p_pid)); #ifdef MAC error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks); if (error) { DP(("kern_sem_wait mac failed\n")); goto err; } #endif DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag)); vfs_timestamp(&ks->ks_atime); while (ks->ks_value == 0) { ks->ks_waiters++; if (tryflag != 0) error = EAGAIN; else if (abstime == NULL) error = cv_wait_sig(&ks->ks_cv, &sem_lock); else { for (;;) { ts1 = *abstime; getnanotime(&ts2); timespecsub(&ts1, &ts2, &ts1); TIMESPEC_TO_TIMEVAL(&tv, &ts1); if (tv.tv_sec < 0) { error = ETIMEDOUT; break; } error = cv_timedwait_sig(&ks->ks_cv, &sem_lock, tvtohz(&tv)); if (error != EWOULDBLOCK) break; } } ks->ks_waiters--; if (error) goto err; } ks->ks_value--; DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value)); error = 0; err: mtx_unlock(&sem_lock); fdrop(fp, td); DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n", (int)td->td_proc->p_pid, error)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_getvalue_args { semid_t id; int *val; }; #endif int sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error, val; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init_one(&rights, CAP_SEM_GETVALUE), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks); if (error) { mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #endif val = ks->ks_value; vfs_timestamp(&ks->ks_atime); mtx_unlock(&sem_lock); fdrop(fp, td); error = copyout(&val, uap->val, sizeof(val)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_destroy_args { semid_t id; }; #endif int sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap) { struct file *fp; struct ksem *ks; int error; /* No capability rights required to close a semaphore. */ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, &cap_no_rights, &fp); if (error) return (error); ks = fp->f_data; if (!(ks->ks_flags & KS_ANONYMOUS)) { fdrop(fp, td); return (EINVAL); } mtx_lock(&sem_lock); if (ks->ks_waiters != 0) { mtx_unlock(&sem_lock); error = EBUSY; goto err; } ks->ks_flags |= KS_DEAD; mtx_unlock(&sem_lock); error = kern_close(td, uap->id); err: fdrop(fp, td); return (error); } static struct syscall_helper_data ksem_syscalls[] = { SYSCALL_INIT_HELPER(ksem_init), SYSCALL_INIT_HELPER(ksem_open), SYSCALL_INIT_HELPER(ksem_unlink), SYSCALL_INIT_HELPER(ksem_close), SYSCALL_INIT_HELPER(ksem_post), SYSCALL_INIT_HELPER(ksem_wait), SYSCALL_INIT_HELPER(ksem_timedwait), SYSCALL_INIT_HELPER(ksem_trywait), SYSCALL_INIT_HELPER(ksem_getvalue), SYSCALL_INIT_HELPER(ksem_destroy), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include int freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap) { return (ksem_create(td, NULL, (semid_t *)uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 1)); } int freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap) { if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, (semid_t *)uap->idp, uap->mode, uap->value, uap->oflag, 1)); } int freebsd32_ksem_timedwait(struct thread *td, struct freebsd32_ksem_timedwait_args *uap) { struct timespec32 abstime32; struct timespec *ts, abstime; int error; /* * We allow a null timespec (wait forever). 
*/ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime32, sizeof(abstime32)); if (error != 0) return (error); CP(abstime32, abstime, tv_sec); CP(abstime32, abstime, tv_nsec); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } static struct syscall_helper_data ksem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_ksem_init), SYSCALL32_INIT_HELPER(freebsd32_ksem_open), SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink), SYSCALL32_INIT_HELPER_COMPAT(ksem_close), SYSCALL32_INIT_HELPER_COMPAT(ksem_post), SYSCALL32_INIT_HELPER_COMPAT(ksem_wait), SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait), SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait), SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue), SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy), SYSCALL_INIT_LAST }; #endif static int ksem_module_init(void) { int error; mtx_init(&sem_lock, "sem", NULL, MTX_DEF); mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF); sx_init(&ksem_dict_lock, "ksem dictionary"); ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L); p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX); p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX); error = syscall_helper_register(ksem_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(ksem32_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #endif return (0); } static void ksem_module_destroy(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(ksem32_syscalls); #endif syscall_helper_unregister(ksem_syscalls); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0); hashdestroy(ksem_dictionary, M_KSEM, ksem_hash); sx_destroy(&ksem_dict_lock); mtx_destroy(&ksem_count_lock); mtx_destroy(&sem_lock); p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX); p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX); } static int sem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = ksem_module_init(); if (error) ksem_module_destroy(); break; case MOD_UNLOAD: mtx_lock(&ksem_count_lock); if (nsems != 0) { error = EOPNOTSUPP; mtx_unlock(&ksem_count_lock); break; } ksem_dead = 1; mtx_unlock(&ksem_count_lock); ksem_module_destroy(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sem_mod = { "sem", &sem_modload, NULL }; DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sem, 1); diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c index f51998d0ed00..dad9fb23250f 100644 --- a/sys/kern/uipc_shm.c +++ b/sys/kern/uipc_shm.c @@ -1,2225 +1,2225 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson * Copyright 2020 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2), shm_rename(2), and shm_unlink(2). * While most of the implementation is here, vm_mmap.c contains * mapping logic changes. * * posixshmcontrol(1) allows users to inspect the state of the memory * objects. Per-uid swap resource limit controls total amount of * memory that user can consume for anonymous objects, including * shared. */ #include #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr64 shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) static void shm_init(void *arg); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static void shm_doremove(struct shm_mapping *map); static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie); static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie); static int shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out); static int shm_deallocate(struct shmfd *shmfd, off_t *offset, off_t *length, int flags); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_ioctl_t shm_ioctl; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; static fo_get_seals_t shm_get_seals; static fo_add_seals_t shm_add_seals; static fo_fallocate_t shm_fallocate; static fo_fspacectl_t shm_fspacectl; /* File descriptor operations. 
*/ -struct fileops shm_ops = { +const struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = shm_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, .fo_get_seals = shm_get_seals, .fo_add_seals = shm_add_seals, .fo_fallocate = shm_fallocate, .fo_fspacectl = shm_fspacectl, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE, }; FEATURE(posix_shm, "POSIX shared memory"); static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); static int largepage_reclaim_tries = 1; SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries, CTLFLAG_RWTUN, &largepage_reclaim_tries, 0, "Number of contig reclaims before giving up for default alloc policy"); #define shm_rangelock_unlock(shmfd, cookie) \ rangelock_unlock(&(shmfd)->shm_rl, (cookie), &(shmfd)->shm_mtx) #define shm_rangelock_rlock(shmfd, start, end) \ rangelock_rlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx) #define shm_rangelock_tryrlock(shmfd, start, end) \ rangelock_tryrlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx) #define shm_rangelock_wlock(shmfd, start, end) \ rangelock_wlock(&(shmfd)->shm_rl, (start), (end), &(shmfd)->shm_mtx) static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); rv = vm_page_grab_valid_unlocked(&m, obj, idx, VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT); if (rv == VM_PAGER_OK) goto found; /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ VM_OBJECT_WLOCK(obj); m = vm_page_lookup(obj, idx); if (uio->uio_rw == UIO_READ && m == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. */ rv = vm_page_grab_valid(&m, obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY); if (rv != VM_PAGER_OK) { VM_OBJECT_WUNLOCK(obj); if (bootverbose) { printf("uiomove_object: vm_obj %p idx %jd " "pager error %d\n", obj, idx, rv); } return (rv == VM_PAGER_AGAIN ? 
ENOSPC : EIO); } VM_OBJECT_WUNLOCK(obj); found: error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) vm_page_set_dirty(m); vm_page_activate(m); vm_page_sunbusy(m); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static u_long count_largepages[MAXPAGESIZES]; static int shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { vm_page_t m __diagused; int psind; psind = object->un_pager.phys.data_val; if (psind == 0 || pidx >= object->size) return (VM_PAGER_FAIL); *first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE); /* * We only busy the first page in the superpage run. It is * useless to busy whole run since we only remove full * superpage, and it takes too long to busy e.g. 512 * 512 == * 262144 pages constituing 1G amd64 superage. */ m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT); MPASS(m != NULL); *last = *first + atop(pagesizes[psind]) - 1; return (VM_PAGER_OK); } static boolean_t shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { int psind; psind = object->un_pager.phys.data_val; if (psind == 0 || pindex >= object->size) return (FALSE); if (before != NULL) { *before = pindex - rounddown2(pindex, pagesizes[psind] / PAGE_SIZE); } if (after != NULL) { *after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) - pindex; } return (TRUE); } static void shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred) { } static void shm_largepage_phys_dtor(vm_object_t object) { int psind; psind = object->un_pager.phys.data_val; if (psind != 0) { atomic_subtract_long(&count_largepages[psind], object->size / (pagesizes[psind] / PAGE_SIZE)); vm_wire_sub(object->size); } else { KASSERT(object->size == 0, ("largepage phys obj %p not initialized bit size %#jx > 0", object, (uintmax_t)object->size)); } } static const struct phys_pager_ops shm_largepage_phys_ops = { .phys_pg_populate = shm_largepage_phys_populate, .phys_pg_haspage = shm_largepage_phys_haspage, .phys_pg_ctor = shm_largepage_phys_ctor, .phys_pg_dtor = shm_largepage_phys_dtor, }; bool shm_largepage(struct shmfd *shmfd) { return (shmfd->shm_object->type == OBJT_PHYS); } static void shm_pager_freespace(vm_object_t obj, vm_pindex_t start, vm_size_t size) { struct shmfd *shm; vm_size_t c; swap_pager_freespace(obj, start, size, &c); if (c == 0) return; shm = obj->un_pager.swp.swp_priv; if (shm == NULL) return; KASSERT(shm->shm_pages >= c, ("shm %p pages %jd free %jd", shm, (uintmax_t)shm->shm_pages, (uintmax_t)c)); shm->shm_pages -= c; } static void shm_page_inserted(vm_object_t obj, vm_page_t m) { struct shmfd *shm; shm = obj->un_pager.swp.swp_priv; if (shm == NULL) return; if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) shm->shm_pages += 1; } static void shm_page_removed(vm_object_t obj, vm_page_t m) { struct shmfd *shm; shm = obj->un_pager.swp.swp_priv; if (shm == NULL) return; if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) { KASSERT(shm->shm_pages >= 1, ("shm %p pages %jd free 1", shm, (uintmax_t)shm->shm_pages)); shm->shm_pages -= 1; } } static struct pagerops 
shm_swap_pager_ops = { .pgo_kvme_type = KVME_TYPE_SWAP, .pgo_freespace = shm_pager_freespace, .pgo_page_inserted = shm_page_inserted, .pgo_page_removed = shm_page_removed, }; static int shmfd_pager_type = -1; static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); rl_cookie = shm_rangelock_rlock(shmfd, uio->uio_offset, uio->uio_offset + uio->uio_resid); error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); shm_rangelock_unlock(shmfd, rl_cookie); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; off_t size; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0) return (EINVAL); foffset_lock_uio(fp, uio, flags); if (uio->uio_resid > OFF_MAX - uio->uio_offset) { /* * Overflow is only an error if we're supposed to expand on * write. Otherwise, we'll just truncate the write to the * size of the file, which can only grow up to OFF_MAX. 
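 *
 * As an illustration of the check below (editorial example, not from the
 * original source): with uio_offset == OFF_MAX - 100 and uio_resid == 200
 * the sum would overflow.  With SHM_GROW_ON_WRITE set the request is
 * rejected with EFBIG, since the object would have to grow past OFF_MAX;
 * without it, the write is simply bounded by the current shm_size when
 * uiomove_object() is called further down.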
*/ if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0) { foffset_unlock_uio(fp, uio, flags); return (EFBIG); } size = shmfd->shm_size; } else { size = uio->uio_offset + uio->uio_resid; } if ((flags & FOF_OFFSET) == 0) rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); else rl_cookie = shm_rangelock_wlock(shmfd, uio->uio_offset, size); if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) { error = EPERM; } else { error = 0; if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0 && size > shmfd->shm_size) { error = shm_dotruncate_cookie(shmfd, size, rl_cookie); } if (error == 0) error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); } shm_rangelock_unlock(shmfd, rl_cookie); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } int shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; struct shm_largepage_conf *conf; void *rl_cookie; shmfd = fp->f_data; switch (com) { case FIONBIO: case FIOASYNC: /* * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work, * just like it would on an unlinked regular file */ return (0); case FIOSSHMLPGCNF: if (!shm_largepage(shmfd)) return (ENOTTY); conf = data; if (shmfd->shm_lp_psind != 0 && conf->psind != shmfd->shm_lp_psind) return (EINVAL); if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES || pagesizes[conf->psind] == 0) return (EINVAL); if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT && conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT && conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD) return (EINVAL); rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); shmfd->shm_lp_psind = conf->psind; shmfd->shm_lp_alloc_policy = conf->alloc_policy; shmfd->shm_object->un_pager.phys.data_val = conf->psind; shm_rangelock_unlock(shmfd, rl_cookie); return (0); case FIOGSHMLPGCNF: if (!shm_largepage(shmfd)) return (ENOTTY); conf = data; rl_cookie = shm_rangelock_rlock(shmfd, 0, OFF_MAX); conf->psind = shmfd->shm_lp_psind; conf->alloc_policy = shmfd->shm_lp_alloc_policy; shm_rangelock_unlock(shmfd, rl_cookie); return (0); default: return (ENOTTY); } } static int shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. 
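 *
 * Illustrative userspace sketch (not part of this file), using the
 * standard shm_open(2)/ftruncate(2)/fstat(2) interfaces:
 *
 *	#include <sys/mman.h>
 *	#include <sys/stat.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = shm_open(SHM_ANON, O_RDWR, 0600);
 *	ftruncate(fd, 65536);
 *	struct stat sb;
 *	fstat(fd, &sb);
 *	// sb.st_size == 65536, S_ISREG(sb.st_mode), sb.st_blksize == PAGE_SIZE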
*/ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; sb->st_nlink = shmfd->shm_object->ref_count; if (shm_largepage(shmfd)) { sb->st_blocks = shmfd->shm_object->size / (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT); } else { sb->st_blocks = shmfd->shm_pages; } return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } static int shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out) { int error; char *path; const char *pr_path; size_t pr_pathlen; path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(userpath_in, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error != 0) goto out; #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. */ if (path[pr_pathlen] != '/') { error = EINVAL; goto out; } *path_out = path; out: if (error != 0) free(path, M_SHMFD); return (error); } static int shm_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base, int end) { vm_page_t m; int rv; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(base >= 0, ("%s: base %d", __func__, base)); KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base, end)); retry: m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT); if (m != NULL) { MPASS(vm_page_all_valid(m)); } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) goto retry; vm_object_pip_add(object, 1); VM_OBJECT_WUNLOCK(object); rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); VM_OBJECT_WLOCK(object); vm_object_pip_wakeup(object); if (rv == VM_PAGER_OK) { /* * Since the page was not resident, and therefore not * recently accessed, immediately enqueue it for * asynchronous laundering. The current operation is * not regarded as an access. */ vm_page_launder(m); } else { vm_page_free(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, end - base); KASSERT(vm_page_all_valid(m), ("%s: page %p is invalid", __func__, m)); vm_page_set_dirty(m); vm_page_xunbusy(m); } return (0); } static int shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie) { vm_object_t object; vm_pindex_t nobjsize; vm_ooffset_t delta; int base, error; KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; VM_OBJECT_ASSERT_WLOCKED(object); rangelock_cookie_assert(rl_cookie, RA_WLOCKED); if (length == shmfd->shm_size) return (0); nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) return (EPERM); /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) return (EBUSY); /* * Zero the truncated part of the last page. 
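 *
 * Worked example (4 KB pages, editorial): truncating from 16384 to a length
 * of 10000 gives base = 10000 & PAGE_MASK = 1808, so bytes 1808..4095 of the
 * page at index OFF_TO_IDX(10000) == 2 are zeroed below, while nobjsize =
 * OFF_TO_IDX(10000 + PAGE_MASK) == 3 means pages at index 3 and above are
 * removed from the object.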
*/ base = length & PAGE_MASK; if (base != 0) { error = shm_partial_page_invalidate(object, OFF_TO_IDX(length), base, PAGE_SIZE); if (error) return (error); } delta = IDX_TO_OFF(object->size - nobjsize); if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { if ((shmfd->shm_seals & F_SEAL_GROW) != 0) return (EPERM); /* Try to reserve additional swap space. */ delta = IDX_TO_OFF(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) return (ENOMEM); object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; return (0); } static int shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie) { vm_object_t object; vm_page_t m; vm_pindex_t newobjsz; vm_pindex_t oldobjsz __unused; int aflags, error, i, psind, try; KASSERT(length >= 0, ("shm_dotruncate: length < 0")); object = shmfd->shm_object; VM_OBJECT_ASSERT_WLOCKED(object); rangelock_cookie_assert(rl_cookie, RA_WLOCKED); oldobjsz = object->size; newobjsz = OFF_TO_IDX(length); if (length == shmfd->shm_size) return (0); psind = shmfd->shm_lp_psind; if (psind == 0 && length != 0) return (EINVAL); if ((length & (pagesizes[psind] - 1)) != 0) return (EINVAL); if (length < shmfd->shm_size) { if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0) return (EPERM); if (shmfd->shm_kmappings > 0) return (EBUSY); return (ENOTSUP); /* Pages are unmanaged. */ #if 0 vm_object_page_remove(object, newobjsz, oldobjsz, 0); object->size = newobjsz; shmfd->shm_size = length; return (0); #endif } if ((shmfd->shm_seals & F_SEAL_GROW) != 0) return (EPERM); aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO; if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT) aflags |= VM_ALLOC_WAITFAIL; try = 0; /* * Extend shmfd and object, keeping all already fully * allocated large pages intact even on error, because dropped * object lock might allowed mapping of them. */ while (object->size < newobjsz) { m = vm_page_alloc_contig(object, object->size, aflags, pagesizes[psind] / PAGE_SIZE, 0, ~0, pagesizes[psind], 0, VM_MEMATTR_DEFAULT); if (m == NULL) { VM_OBJECT_WUNLOCK(object); if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT || (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_DEFAULT && try >= largepage_reclaim_tries)) { VM_OBJECT_WLOCK(object); return (ENOMEM); } error = vm_page_reclaim_contig(aflags, pagesizes[psind] / PAGE_SIZE, 0, ~0, pagesizes[psind], 0) ? 0 : vm_wait_intr(object); if (error != 0) { VM_OBJECT_WLOCK(object); return (error); } try++; VM_OBJECT_WLOCK(object); continue; } try = 0; for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) { if ((m[i].flags & PG_ZERO) == 0) pmap_zero_page(&m[i]); vm_page_valid(&m[i]); vm_page_xunbusy(&m[i]); } object->size += OFF_TO_IDX(pagesizes[psind]); shmfd->shm_size += pagesizes[psind]; atomic_add_long(&count_largepages[psind], 1); vm_wire_add(atop(pagesizes[psind])); } return (0); } static int shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie) { int error; VM_OBJECT_WLOCK(shmfd->shm_object); error = shm_largepage(shmfd) ? 
shm_dotruncate_largepage(shmfd, length, rl_cookie) : shm_dotruncate_locked(shmfd, length, rl_cookie); VM_OBJECT_WUNLOCK(shmfd->shm_object); return (error); } int shm_dotruncate(struct shmfd *shmfd, off_t length) { void *rl_cookie; int error; rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); error = shm_dotruncate_cookie(shmfd, length, rl_cookie); shm_rangelock_unlock(shmfd, rl_cookie); return (error); } /* * shmfd object management including creation and reference counting * routines. */ struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode, bool largepage) { struct shmfd *shmfd; vm_object_t obj; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; if (largepage) { obj = phys_pager_allocate(NULL, &shm_largepage_phys_ops, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); obj->un_pager.phys.phys_priv = shmfd; shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT; } else { obj = vm_pager_allocate(shmfd_pager_type, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); obj->un_pager.swp.swp_priv = shmfd; } KASSERT(obj != NULL, ("shm_create: vm_pager_allocate")); VM_OBJECT_WLOCK(obj); vm_object_set_flag(obj, OBJ_POSIXSHM); VM_OBJECT_WUNLOCK(obj); shmfd->shm_object = obj; vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; shmfd->shm_ino = alloc_unr64(&shm_ino_unr); refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } void shm_drop(struct shmfd *shmfd) { vm_object_t obj; if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); if (shm_largepage(shmfd)) obj->un_pager.phys.phys_priv = NULL; else obj->un_pager.swp.swp_priv = NULL; VM_OBJECT_WUNLOCK(obj); vm_object_deallocate(obj); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. 
*/ int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred); mtx_unlock(&shm_timestamp_lock); return (error); } static void shm_init(void *arg) { char name[32]; int i; mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); new_unrhdr64(&shm_ino_unr, 1); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); shmfd_pager_type = vm_pager_alloc_dyn_type(&shm_swap_pager_ops, OBJT_SWAP); MPASS(shmfd_pager_type != -1); for (i = 1; i < MAXPAGESIZES; i++) { if (pagesizes[i] == 0) break; #define M (1024 * 1024) #define G (1024 * M) if (pagesizes[i] >= G) snprintf(name, sizeof(name), "%luG", pagesizes[i] / G); else if (pagesizes[i] >= M) snprintf(name, sizeof(name), "%luM", pagesizes[i] / M); else snprintf(name, sizeof(name), "%lu", pagesizes[i]); #undef G #undef M SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages), OID_AUTO, name, CTLFLAG_RD, &count_largepages[i], "number of non-transient largepages allocated"); } } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); /* * Remove all shared memory objects that belong to a prison. */ void shm_remove_prison(struct prison *pr) { struct shm_mapping *shmm, *tshmm; u_long i; sx_xlock(&shm_dict_lock); for (i = 0; i < shm_hash + 1; i++) { LIST_FOREACH_SAFE(shmm, &shm_dictionary[i], sm_link, tshmm) { if (shmm->sm_shmfd->shm_object->cred && shmm->sm_shmfd->shm_object->cred->cr_prison == pr) shm_doremove(shmm); } } sx_xunlock(&shm_dict_lock); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. 
*/ static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); shm_doremove(map); return (0); } } return (ENOENT); } static void shm_doremove(struct shm_mapping *map) { map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); } int kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode, int shmflags, struct filecaps *fcaps, const char *name __unused) { struct pwddesc *pdp; struct shmfd *shmfd; struct file *fp; char *path; void *rl_cookie; Fnv32_t fnv; mode_t cmode; int error, fd, initial_seals; bool largepage; if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE | SHM_LARGEPAGE)) != 0) return (EINVAL); initial_seals = F_SEAL_SEAL; if ((shmflags & SHM_ALLOW_SEALING) != 0) initial_seals &= ~F_SEAL_SEAL; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); largepage = (shmflags & SHM_LARGEPAGE) != 0; if (largepage && !PMAP_HAS_LARGEPAGES) return (ENOTTY); /* * Currently only F_SEAL_SEAL may be set when creating or opening shmfd. * If the decision is made later to allow additional seals, care must be * taken below to ensure that the seals are properly set if the shmfd * already existed -- this currently assumes that only F_SEAL_SEAL can * be set and doesn't take further precautions to ensure the validity of * the seals being added with respect to current mappings. */ if ((initial_seals & ~F_SEAL_SEAL) != 0) return (EINVAL); if (userpath != SHM_ANON) { error = shm_copyin_path(td, userpath, &path); if (error != 0) return (error); #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_NAMEI, path); if (IN_CAPABILITY_MODE(td)) { free(path, M_SHMFD); return (ECAPMODE); } #endif AUDIT_ARG_UPATH1_CANON(path); } else { path = NULL; } pdp = td->td_proc->p_pd; cmode = (mode & ~pdp->pd_cmask) & ACCESSPERMS; /* * shm_open(2) created shm should always have O_CLOEXEC set, as mandated * by POSIX. We allow it to be unset here so that an in-kernel * interface may be written as a thin layer around shm, optionally not * setting CLOEXEC. For shm_open(2), O_CLOEXEC is set unconditionally * in sys_shm_open() to keep this implementation compliant. */ error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps); if (error) { free(path, M_SHMFD); return (error); } /* A SHM_ANON path pointer creates an anonymous object. */ if (userpath == SHM_ANON) { /* A read-only anonymous object is pointless. 
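 *
 * Illustrative userspace sketch (not part of this file): an anonymous
 * object is requested by passing SHM_ANON as the path and must be opened
 * read/write:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = shm_open(SHM_ANON, O_RDWR, 0600);	// OK
 *	int bad = shm_open(SHM_ANON, O_RDONLY, 0600);	// fails with EINVAL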
*/ if ((flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode, largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; } else { fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode, largepage); shmfd->shm_seals = initial_seals; shmfd->shm_flags = shmflags; shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); /* * kern_shm_open() likely shouldn't ever error out on * trying to set a seal that already exists, unlike * F_ADD_SEALS. This would break terribly as * shm_open(2) actually sets F_SEAL_SEAL to maintain * historical behavior where the underlying file could * not be sealed. */ initial_seals &= ~shmfd->shm_seals; /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); /* * initial_seals can't set additional seals if we've * already been set F_SEAL_SEAL. If F_SEAL_SEAL is set, * then we've already removed that one from * initial_seals. This is currently redundant as we * only allow setting F_SEAL_SEAL at creation time, but * it's cheap to check and decreases the effort required * to allow additional seals. */ if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 && initial_seals != 0) error = EPERM; else if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else if (shmflags != 0 && shmflags != shmfd->shm_flags) error = EINVAL; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { VM_OBJECT_WLOCK(shmfd->shm_object); #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif error = shm_dotruncate_locked(shmfd, 0, rl_cookie); VM_OBJECT_WUNLOCK(shmfd->shm_object); } if (error == 0) { /* * Currently we only allow F_SEAL_SEAL to be * set initially. As noted above, this would * need to be reworked should that change. */ shmfd->shm_seals |= initial_seals; shm_hold(shmfd); } shm_rangelock_unlock(shmfd, rl_cookie); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* System calls. 
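 *
 * Illustrative userspace sketch (not part of this file) of the named-object
 * path handled by kern_shm_open2() above and sys_shm_unlink() below; the
 * path name is made up:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = shm_open("/myshm", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, 4096);
 *	// ... map and use the object ...
 *	shm_unlink("/myshm");	// name removed; object lives until last close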
*/ #ifdef COMPAT_FREEBSD12 int freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC, uap->mode, NULL)); } #endif int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; Fnv32_t fnv; int error; error = shm_copyin_path(td, uap->path, &path); if (error != 0) return (error); AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_SHMFD); return (error); } int sys_shm_rename(struct thread *td, struct shm_rename_args *uap) { char *path_from = NULL, *path_to = NULL; Fnv32_t fnv_from, fnv_to; struct shmfd *fd_from; struct shmfd *fd_to; int error; int flags; flags = uap->flags; AUDIT_ARG_FFLAGS(flags); /* * Make sure the user passed only valid flags. * If you add a new flag, please add a new term here. */ if ((flags & ~( SHM_RENAME_NOREPLACE | SHM_RENAME_EXCHANGE )) != 0) { error = EINVAL; goto out; } /* * EXCHANGE and NOREPLACE don't quite make sense together. Let's * force the user to choose one or the other. */ if ((flags & SHM_RENAME_NOREPLACE) != 0 && (flags & SHM_RENAME_EXCHANGE) != 0) { error = EINVAL; goto out; } /* Renaming to or from anonymous makes no sense */ if (uap->path_from == SHM_ANON || uap->path_to == SHM_ANON) { error = EINVAL; goto out; } error = shm_copyin_path(td, uap->path_from, &path_from); if (error != 0) goto out; error = shm_copyin_path(td, uap->path_to, &path_to); if (error != 0) goto out; AUDIT_ARG_UPATH1_CANON(path_from); AUDIT_ARG_UPATH2_CANON(path_to); /* Rename with from/to equal is a no-op */ if (strcmp(path_from, path_to) == 0) goto out; fnv_from = fnv_32_str(path_from, FNV1_32_INIT); fnv_to = fnv_32_str(path_to, FNV1_32_INIT); sx_xlock(&shm_dict_lock); fd_from = shm_lookup(path_from, fnv_from); if (fd_from == NULL) { error = ENOENT; goto out_locked; } fd_to = shm_lookup(path_to, fnv_to); if ((flags & SHM_RENAME_NOREPLACE) != 0 && fd_to != NULL) { error = EEXIST; goto out_locked; } /* * Unconditionally prevents shm_remove from invalidating the 'from' * shm's state. */ shm_hold(fd_from); error = shm_remove(path_from, fnv_from, td->td_ucred); /* * One of my assumptions failed if ENOENT (e.g. locking didn't * protect us) */ KASSERT(error != ENOENT, ("Our shm disappeared during shm_rename: %s", path_from)); if (error != 0) { shm_drop(fd_from); goto out_locked; } /* * If we are exchanging, we need to ensure the shm_remove below * doesn't invalidate the dest shm's state. */ if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) shm_hold(fd_to); /* * NOTE: if path_to is not already in the hash, c'est la vie; * it simply means we have nothing already at path_to to unlink. * That is the ENOENT case. * * If we somehow don't have access to unlink this guy, but * did for the shm at path_from, then relink the shm to path_from * and abort with EACCES. * * All other errors: that is weird; let's relink and abort the * operation. 
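 *
 * Illustrative userspace sketch (not part of this file), assuming the
 * shm_rename(2) interface implemented by this function; the path names
 * are made up:
 *
 *	#include <sys/mman.h>
 *
 *	shm_rename("/staging", "/live", 0);		// unlink "/live" if present
 *	shm_rename("/a", "/b", SHM_RENAME_NOREPLACE);	// EEXIST if "/b" exists
 *	shm_rename("/a", "/b", SHM_RENAME_EXCHANGE);	// atomically swap the names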
*/ error = shm_remove(path_to, fnv_to, td->td_ucred); if (error != 0 && error != ENOENT) { shm_insert(path_from, fnv_from, fd_from); shm_drop(fd_from); /* Don't free path_from now, since the hash references it */ path_from = NULL; goto out_locked; } error = 0; shm_insert(path_to, fnv_to, fd_from); /* Don't free path_to now, since the hash references it */ path_to = NULL; /* We kept a ref when we removed, and incremented again in insert */ shm_drop(fd_from); KASSERT(fd_from->shm_refs > 0, ("Expected >0 refs; got: %d\n", fd_from->shm_refs)); if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) { shm_insert(path_from, fnv_from, fd_to); path_from = NULL; shm_drop(fd_to); KASSERT(fd_to->shm_refs > 0, ("Expected >0 refs; got: %d\n", fd_to->shm_refs)); } out_locked: sx_xunlock(&shm_dict_lock); out: free(path_from, M_SHMFD); free(path_to, M_SHMFD); return (error); } static int shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags, vm_ooffset_t foff, struct thread *td) { struct vmspace *vms; vm_map_entry_t next_entry, prev_entry; vm_offset_t align, mask, maxaddr; int docow, error, rv, try; bool curmap; if (shmfd->shm_lp_psind == 0) return (EINVAL); /* MAP_PRIVATE is disabled */ if ((flags & ~(MAP_SHARED | MAP_FIXED | MAP_EXCL | MAP_NOCORE | MAP_32BIT | MAP_ALIGNMENT_MASK)) != 0) return (EINVAL); vms = td->td_proc->p_vmspace; curmap = map == &vms->vm_map; if (curmap) { error = kern_mmap_racct_check(td, map, size); if (error != 0) return (error); } docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT; docow |= MAP_INHERIT_SHARE; if ((flags & MAP_NOCORE) != 0) docow |= MAP_DISABLE_COREDUMP; mask = pagesizes[shmfd->shm_lp_psind] - 1; if ((foff & mask) != 0) return (EINVAL); maxaddr = vm_map_max(map); if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR) maxaddr = MAP_32BIT_MAX_ADDR; if (size == 0 || (size & mask) != 0 || (*addr != 0 && ((*addr & mask) != 0 || *addr + size < *addr || *addr + size > maxaddr))) return (EINVAL); align = flags & MAP_ALIGNMENT_MASK; if (align == 0) { align = pagesizes[shmfd->shm_lp_psind]; } else if (align == MAP_ALIGNED_SUPER) { if (shmfd->shm_lp_psind != 1) return (EINVAL); align = pagesizes[1]; } else { align >>= MAP_ALIGNMENT_SHIFT; align = 1ULL << align; /* Also handles overflow. 
*/ if (align < pagesizes[shmfd->shm_lp_psind]) return (EINVAL); } vm_map_lock(map); if ((flags & MAP_FIXED) == 0) { try = 1; if (curmap && (*addr == 0 || (*addr >= round_page((vm_offset_t)vms->vm_taddr) && *addr < round_page((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA))))) { *addr = roundup2((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA), pagesizes[shmfd->shm_lp_psind]); } again: rv = vm_map_find_aligned(map, addr, size, maxaddr, align); if (rv != KERN_SUCCESS) { if (try == 1) { try = 2; *addr = vm_map_min(map); if ((*addr & mask) != 0) *addr = (*addr + mask) & mask; goto again; } goto fail1; } } else if ((flags & MAP_EXCL) == 0) { rv = vm_map_delete(map, *addr, *addr + size); if (rv != KERN_SUCCESS) goto fail1; } else { error = ENOSPC; if (vm_map_lookup_entry(map, *addr, &prev_entry)) goto fail; next_entry = vm_map_entry_succ(prev_entry); if (next_entry->start < *addr + size) goto fail; } rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size, prot, max_prot, docow); fail1: error = vm_mmap_to_errno(rv); fail: vm_map_unlock(map); return (error); } static int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t max_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; vm_prot_t maxprot; int error; bool writecnt; void *rl_cookie; shmfd = fp->f_data; maxprot = VM_PROT_NONE; rl_cookie = shm_rangelock_rlock(shmfd, 0, objsize); /* FREAD should always be set. */ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; /* * If FWRITE's set, we can allow VM_PROT_WRITE unless it's a shared * mapping with a write seal applied. Private mappings are always * writeable. */ if ((flags & MAP_SHARED) == 0) { if ((max_maxprot & VM_PROT_WRITE) != 0) maxprot |= VM_PROT_WRITE; writecnt = false; } else { if ((fp->f_flag & FWRITE) != 0 && (shmfd->shm_seals & F_SEAL_WRITE) == 0) maxprot |= VM_PROT_WRITE; /* * Any mappings from a writable descriptor may be upgraded to * VM_PROT_WRITE with mprotect(2), unless a write-seal was * applied between the open and subsequent mmap(2). We want to * reject application of a write seal as long as any such * mapping exists so that the seal cannot be trivially bypassed. */ writecnt = (maxprot & VM_PROT_WRITE) != 0; if (!writecnt && (prot & VM_PROT_WRITE) != 0) { error = EACCES; goto out; } } maxprot &= max_maxprot; /* See comment in vn_mmap(). */ if ( #ifdef _LP64 objsize > OFF_MAX || #endif foff > OFF_MAX - objsize) { error = EINVAL; goto out; } #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) goto out; #endif mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); if (shm_largepage(shmfd)) { writecnt = false; error = shm_mmap_large(shmfd, map, addr, objsize, prot, maxprot, flags, foff, td); } else { if (writecnt) { vm_pager_update_writecount(shmfd->shm_object, 0, objsize); } error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, shmfd->shm_object, foff, writecnt, td); } if (error != 0) { if (writecnt) vm_pager_release_writecount(shmfd->shm_object, 0, objsize); vm_object_deallocate(shmfd->shm_object); } out: shm_rangelock_unlock(shmfd, rl_cookie); return (error); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. 
* Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed. 
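 *
 * Sketch of the intended in-kernel usage (illustrative only): a consumer
 * holding a DTYPE_SHM file maps a region, uses it, and later unmaps exactly
 * the range it mapped:
 *
 *	void *mem;
 *	if (shm_map(fp, size, offset, &mem) == 0) {
 *		// ... access the wired memory at 'mem' ...
 *		shm_unmap(fp, mem, size);
 *	}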
*/ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list) { const char *path, *pr_path; size_t pr_pathlen; bool visible; sx_assert(&shm_dict_lock, SA_LOCKED); kif->kf_type = KF_TYPE_SHM; kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { path = shmfd->shm_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); visible = strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/'; if (list && !visible) return (EPERM); if (visible) path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp __unused) { int res; sx_slock(&shm_dict_lock); res = shm_fill_kinfo_locked(fp->f_data, kif, false); sx_sunlock(&shm_dict_lock); return (res); } static int shm_add_seals(struct file *fp, int seals) { struct shmfd *shmfd; void *rl_cookie; vm_ooffset_t writemappings; int error, nseals; error = 0; shmfd = fp->f_data; rl_cookie = shm_rangelock_wlock(shmfd, 0, OFF_MAX); /* Even already-set seals should result in EPERM. */ if ((shmfd->shm_seals & F_SEAL_SEAL) != 0) { error = EPERM; goto out; } nseals = seals & ~shmfd->shm_seals; if ((nseals & F_SEAL_WRITE) != 0) { if (shm_largepage(shmfd)) { error = ENOTSUP; goto out; } /* * The rangelock above prevents writable mappings from being * added after we've started applying seals. The RLOCK here * is to avoid torn reads on ILP32 arches as unmapping/reducing * writemappings will be done without a rangelock. 
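 *
 * Illustrative userspace sketch (not part of this file), assuming the
 * object was created with sealing enabled (F_SEAL_SEAL not set): a write
 * seal is refused while a writable shared mapping exists:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);	// fails with EBUSY
 *	munmap(p, len);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);	// now succeeds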
*/ VM_OBJECT_RLOCK(shmfd->shm_object); writemappings = shmfd->shm_object->un_pager.swp.writemappings; VM_OBJECT_RUNLOCK(shmfd->shm_object); /* kmappings are also writable */ if (writemappings > 0) { error = EBUSY; goto out; } } shmfd->shm_seals |= nseals; out: shm_rangelock_unlock(shmfd, rl_cookie); return (error); } static int shm_get_seals(struct file *fp, int *seals) { struct shmfd *shmfd; shmfd = fp->f_data; *seals = shmfd->shm_seals; return (0); } static int shm_deallocate(struct shmfd *shmfd, off_t *offset, off_t *length, int flags) { vm_object_t object; vm_pindex_t pistart, pi, piend; vm_ooffset_t off, len; int startofs, endofs, end; int error; off = *offset; len = *length; KASSERT(off + len <= (vm_ooffset_t)OFF_MAX, ("off + len overflows")); if (off + len > shmfd->shm_size) len = shmfd->shm_size - off; object = shmfd->shm_object; startofs = off & PAGE_MASK; endofs = (off + len) & PAGE_MASK; pistart = OFF_TO_IDX(off); piend = OFF_TO_IDX(off + len); pi = OFF_TO_IDX(off + PAGE_MASK); error = 0; /* Handle the case when offset is on or beyond shm size. */ if ((off_t)len <= 0) { *length = 0; return (0); } VM_OBJECT_WLOCK(object); if (startofs != 0) { end = pistart != piend ? PAGE_SIZE : endofs; error = shm_partial_page_invalidate(object, pistart, startofs, end); if (error) goto out; off += end - startofs; len -= end - startofs; } if (pi < piend) { vm_object_page_remove(object, pi, piend, 0); off += IDX_TO_OFF(piend - pi); len -= IDX_TO_OFF(piend - pi); } if (endofs != 0 && pistart != piend) { error = shm_partial_page_invalidate(object, piend, 0, endofs); if (error) goto out; off += endofs; len -= endofs; } out: VM_OBJECT_WUNLOCK(shmfd->shm_object); *offset = off; *length = len; return (error); } static int shm_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td) { void *rl_cookie; struct shmfd *shmfd; off_t off, len; int error; KASSERT(cmd == SPACECTL_DEALLOC, ("shm_fspacectl: Invalid cmd")); KASSERT((flags & ~SPACECTL_F_SUPPORTED) == 0, ("shm_fspacectl: non-zero flags")); KASSERT(*offset >= 0 && *length > 0 && *length <= OFF_MAX - *offset, ("shm_fspacectl: offset/length overflow or underflow")); error = EINVAL; shmfd = fp->f_data; off = *offset; len = *length; rl_cookie = shm_rangelock_wlock(shmfd, off, off + len); switch (cmd) { case SPACECTL_DEALLOC: if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) { error = EPERM; break; } error = shm_deallocate(shmfd, &off, &len, flags); *offset = off; *length = len; break; default: __assert_unreachable(); } shm_rangelock_unlock(shmfd, rl_cookie); return (error); } static int shm_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) { void *rl_cookie; struct shmfd *shmfd; size_t size; int error; /* This assumes that the caller already checked for overflow. */ error = 0; shmfd = fp->f_data; size = offset + len; /* * Just grab the rangelock for the range that we may be attempting to * grow, rather than blocking read/write for regions we won't be * touching while this (potential) resize is in progress. Other * attempts to resize the shmfd will have to take a write lock from 0 to * OFF_MAX, so this being potentially beyond the current usable range of * the shmfd is not necessarily a concern. If other mechanisms are * added to grow a shmfd, this may need to be re-evaluated. 
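 *
 * Illustrative userspace sketch (not part of this file): posix_fallocate(2)
 * on a shared memory descriptor reaches this function through fo_fallocate
 * and grows the object if needed; an ENOMEM from the swap reservation is
 * reported as ENOSPC:
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = shm_open(SHM_ANON, O_RDWR, 0600);
 *	int error = posix_fallocate(fd, 0, 1 << 20);	// 0, or ENOSPC on failure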
*/ rl_cookie = shm_rangelock_wlock(shmfd, offset, size); if (size > shmfd->shm_size) error = shm_dotruncate_cookie(shmfd, size, rl_cookie); shm_rangelock_unlock(shmfd, rl_cookie); /* Translate to posix_fallocate(2) return value as needed. */ if (error == ENOMEM) error = ENOSPC; return (error); } static int sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS) { struct shm_mapping *shmm; struct sbuf sb; struct kinfo_file kif; u_long i; int error, error2; sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req); sbuf_clear_flags(&sb, SBUF_INCLUDENUL); error = 0; sx_slock(&shm_dict_lock); for (i = 0; i < shm_hash + 1; i++) { LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) { error = shm_fill_kinfo_locked(shmm->sm_shmfd, &kif, true); if (error == EPERM) { error = 0; continue; } if (error != 0) break; pack_kinfo(&kif); error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ? 0 : ENOMEM; if (error != 0) break; } } sx_sunlock(&shm_dict_lock); error2 = sbuf_finish(&sb); sbuf_delete(&sb); return (error != 0 ? error : error2); } SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list, CTLFLAG_RD | CTLFLAG_PRISON | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE, NULL, 0, sysctl_posix_shm_list, "", "POSIX SHM list"); int kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode, struct filecaps *caps) { return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL)); } /* * This version of the shm_open() interface leaves CLOEXEC behavior up to the * caller, and libc will enforce it for the traditional shm_open() call. This * allows other consumers, like memfd_create(), to opt-in for CLOEXEC. This * interface also includes a 'name' argument that is currently unused, but could * potentially be exported later via some interface for debugging purposes. * From the kernel's perspective, it is optional. Individual consumers like * memfd_create() may require it in order to be compatible with other systems * implementing the same function. */ int sys_shm_open2(struct thread *td, struct shm_open2_args *uap) { return (kern_shm_open2(td, uap->path, uap->flags, uap->mode, uap->shmflags, NULL, uap->name)); } int shm_get_path(struct vm_object *obj, char *path, size_t sz) { struct shmfd *shmfd; int error; error = 0; shmfd = NULL; sx_slock(&shm_dict_lock); VM_OBJECT_RLOCK(obj); if ((obj->flags & OBJ_POSIXSHM) == 0) { error = EINVAL; } else { if (obj->type == shmfd_pager_type) shmfd = obj->un_pager.swp.swp_priv; else if (obj->type == OBJT_PHYS) shmfd = obj->un_pager.phys.phys_priv; if (shmfd == NULL) { error = ENXIO; } else { strlcpy(path, shmfd->shm_path == NULL ? "anon" : shmfd->shm_path, sz); } } if (error != 0) path[0] = '\0'; VM_OBJECT_RUNLOCK(obj); sx_sunlock(&shm_dict_lock); return (error); } diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 4ecd69d509ed..162c489ea6fe 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,5168 +1,5168 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. * * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. 
This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_ktrace.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_generic_locked(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp); static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb, struct sockaddr **psa, struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int flags); static int sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_lock(void *, int); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_lock(void *, int); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; -static struct filterops soread_filtops = { +static const struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; -static struct filterops sowrite_filtops = { +static const struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; -static struct filterops soempty_filtops = { +static const struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ 
MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) #ifdef COMPAT_FREEBSD32 #ifdef __amd64__ /* off_t has 4-byte alignment on i386 but not on other 32-bit platforms. */ #define __splice32_packed __packed #else #define __splice32_packed #endif struct splice32 { int32_t sp_fd; int64_t sp_max; struct timeval32 sp_idle; } __splice32_packed; #undef __splice32_packed #endif /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. */ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. 
*/ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static int splice_init_state; static struct sx splice_init_lock; SX_SYSINIT(splice_init_lock, &splice_init_lock, "splice_init"); static SYSCTL_NODE(_kern_ipc, OID_AUTO, splice, CTLFLAG_RW, 0, "Settings relating to the SO_SPLICE socket option"); static bool splice_receive_stream = true; SYSCTL_BOOL(_kern_ipc_splice, OID_AUTO, receive_stream, CTLFLAG_RWTUN, &splice_receive_stream, 0, "Use soreceive_stream() for stream splices"); static uma_zone_t splice_zone; static struct proc *splice_proc; struct splice_wq { struct mtx mtx; STAILQ_HEAD(, so_splice) head; bool running; } __aligned(CACHE_LINE_SIZE); static struct splice_wq *splice_wq; static uint32_t splice_index = 0; static void so_splice_timeout(void *arg, int pending); static void so_splice_xfer(struct so_splice *s); static int so_unsplice(struct socket *so, bool timeout); static void splice_work_thread(void *ctx) { struct splice_wq *wq = ctx; struct so_splice *s, *s_temp; STAILQ_HEAD(, so_splice) local_head; int cpu; cpu = wq - splice_wq; if (bootverbose) printf("starting so_splice worker thread for CPU %d\n", cpu); for (;;) { mtx_lock(&wq->mtx); while (STAILQ_EMPTY(&wq->head)) { wq->running = false; mtx_sleep(wq, &wq->mtx, 0, "-", 0); wq->running = true; } STAILQ_INIT(&local_head); STAILQ_CONCAT(&local_head, &wq->head); STAILQ_INIT(&wq->head); mtx_unlock(&wq->mtx); STAILQ_FOREACH_SAFE(s, &local_head, next, s_temp) { mtx_lock(&s->mtx); CURVNET_SET(s->src->so_vnet); so_splice_xfer(s); CURVNET_RESTORE(); } } } static void so_splice_dispatch_async(struct so_splice *sp) { struct splice_wq *wq; bool running; wq = &splice_wq[sp->wq_index]; mtx_lock(&wq->mtx); STAILQ_INSERT_TAIL(&wq->head, sp, next); running = wq->running; mtx_unlock(&wq->mtx); if (!running) wakeup(wq); } void so_splice_dispatch(struct so_splice *sp) { mtx_assert(&sp->mtx, MA_OWNED); if (sp->state != SPLICE_IDLE) { mtx_unlock(&sp->mtx); } else { sp->state = SPLICE_QUEUED; mtx_unlock(&sp->mtx); so_splice_dispatch_async(sp); } } static int splice_zinit(void *mem, int size __unused, int flags __unused) { struct so_splice *s; s = (struct so_splice *)mem; mtx_init(&s->mtx, "so_splice", NULL, MTX_DEF); return (0); } static void splice_zfini(void *mem, int size) { struct so_splice *s; s = (struct so_splice *)mem; mtx_destroy(&s->mtx); } static int splice_init(void) { struct thread *td; int error, i, state; state = atomic_load_acq_int(&splice_init_state); if (__predict_true(state > 0)) return (0); if (state < 0) return (ENXIO); sx_xlock(&splice_init_lock); if (splice_init_state != 0) { sx_xunlock(&splice_init_lock); return (0); } splice_zone = uma_zcreate("splice", sizeof(struct so_splice), NULL, NULL, splice_zinit, splice_zfini, UMA_ALIGN_CACHE, 0); splice_wq = mallocarray(mp_maxid + 1, sizeof(*splice_wq), M_TEMP, M_WAITOK | M_ZERO); /* * Initialize the workqueues to run the splice work. We create a * work queue for each CPU. */ CPU_FOREACH(i) { STAILQ_INIT(&splice_wq[i].head); mtx_init(&splice_wq[i].mtx, "splice work queue", NULL, MTX_DEF); } /* Start kthreads for each workqueue. */ error = 0; CPU_FOREACH(i) { error = kproc_kthread_add(splice_work_thread, &splice_wq[i], &splice_proc, &td, 0, 0, "so_splice", "thr_%d", i); if (error) { printf("Can't add so_splice thread %d error %d\n", i, error); break; } /* * It's possible to create loops with SO_SPLICE; ensure that * worker threads aren't able to starve the system too easily. 
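 *
 * Illustrative userspace sketch (not part of this file), assuming the
 * SO_SPLICE option and a struct splice layout matching the splice32 compat
 * shim above (sp_fd, sp_max, sp_idle); in_fd/out_fd are made-up names:
 *
 *	#include <sys/socket.h>
 *
 *	struct splice sp = {
 *		.sp_fd = out_fd,	// destination socket
 *		.sp_max = 0,		// no byte limit
 *		.sp_idle = { 0, 0 },	// no idle timeout
 *	};
 *	setsockopt(in_fd, SOL_SOCKET, SO_SPLICE, &sp, sizeof(sp));
 *	// later, to unsplice: sp.sp_fd = -1; setsockopt(in_fd, ...) again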
*/ thread_lock(td); sched_prio(td, PUSER); thread_unlock(td); } splice_init_state = error != 0 ? -1 : 1; sx_xunlock(&splice_init_lock); return (error); } /* * Lock a pair of socket's I/O locks for splicing. Avoid blocking while holding * one lock in order to avoid potential deadlocks in case there is some other * code path which acquires more than one I/O lock at a time. */ static void splice_lock_pair(struct socket *so_src, struct socket *so_dst) { int error; for (;;) { error = SOCK_IO_SEND_LOCK(so_dst, SBL_WAIT | SBL_NOINTR); KASSERT(error == 0, ("%s: failed to lock send I/O lock: %d", __func__, error)); error = SOCK_IO_RECV_LOCK(so_src, 0); KASSERT(error == 0 || error == EWOULDBLOCK, ("%s: failed to lock recv I/O lock: %d", __func__, error)); if (error == 0) break; SOCK_IO_SEND_UNLOCK(so_dst); error = SOCK_IO_RECV_LOCK(so_src, SBL_WAIT | SBL_NOINTR); KASSERT(error == 0, ("%s: failed to lock recv I/O lock: %d", __func__, error)); error = SOCK_IO_SEND_LOCK(so_dst, 0); KASSERT(error == 0 || error == EWOULDBLOCK, ("%s: failed to lock send I/O lock: %d", __func__, error)); if (error == 0) break; SOCK_IO_RECV_UNLOCK(so_src); } } static void splice_unlock_pair(struct socket *so_src, struct socket *so_dst) { SOCK_IO_RECV_UNLOCK(so_src); SOCK_IO_SEND_UNLOCK(so_dst); } /* * Move data from the source to the sink. Assumes that both of the relevant * socket I/O locks are held. */ static int so_splice_xfer_data(struct socket *so_src, struct socket *so_dst, off_t max, ssize_t *lenp) { struct uio uio; struct mbuf *m; struct sockbuf *sb_src, *sb_dst; ssize_t len; long space; int error, flags; SOCK_IO_RECV_ASSERT_LOCKED(so_src); SOCK_IO_SEND_ASSERT_LOCKED(so_dst); error = 0; m = NULL; memset(&uio, 0, sizeof(uio)); sb_src = &so_src->so_rcv; sb_dst = &so_dst->so_snd; space = sbspace(sb_dst); if (space < 0) space = 0; len = MIN(max, MIN(space, sbavail(sb_src))); if (len == 0) { SOCK_RECVBUF_LOCK(so_src); if ((sb_src->sb_state & SBS_CANTRCVMORE) != 0) error = EPIPE; SOCK_RECVBUF_UNLOCK(so_src); } else { flags = MSG_DONTWAIT; uio.uio_resid = len; if (splice_receive_stream && sb_src->sb_tls_info == NULL) { error = soreceive_stream_locked(so_src, sb_src, NULL, &uio, &m, NULL, flags); } else { error = soreceive_generic_locked(so_src, NULL, &uio, &m, NULL, &flags); } if (error != 0 && m != NULL) { m_freem(m); m = NULL; } } if (m != NULL) { len -= uio.uio_resid; error = sosend_generic_locked(so_dst, NULL, NULL, m, NULL, MSG_DONTWAIT, curthread); } else if (error == 0) { len = 0; SOCK_SENDBUF_LOCK(so_dst); if ((sb_dst->sb_state & SBS_CANTSENDMORE) != 0) error = EPIPE; SOCK_SENDBUF_UNLOCK(so_dst); } if (error == 0) *lenp = len; return (error); } /* * Transfer data from the source to the sink. * * If "direct" is true, the transfer is done in the context of whichever thread * is operating on one of the socket buffers. We do not know which locks are * held, so we can only trylock the socket buffers; if this fails, we fall back * to the worker thread, which invokes this routine with "direct" set to false. */ static void so_splice_xfer(struct so_splice *sp) { struct socket *so_src, *so_dst; off_t max; ssize_t len; int error; mtx_assert(&sp->mtx, MA_OWNED); KASSERT(sp->state == SPLICE_QUEUED || sp->state == SPLICE_CLOSING, ("so_splice_xfer: invalid state %d", sp->state)); KASSERT(sp->max != 0, ("so_splice_xfer: max == 0")); if (sp->state == SPLICE_CLOSING) { /* Userspace asked us to close the splice. */ goto closing; } sp->state = SPLICE_RUNNING; so_src = sp->src; so_dst = sp->dst; max = sp->max > 0 ? 
sp->max - so_src->so_splice_sent : OFF_MAX; if (max < 0) max = 0; /* * Lock the sockets in order to block userspace from doing anything * sneaky. If an error occurs or one of the sockets can no longer * transfer data, we will automatically unsplice. */ mtx_unlock(&sp->mtx); splice_lock_pair(so_src, so_dst); error = so_splice_xfer_data(so_src, so_dst, max, &len); mtx_lock(&sp->mtx); /* * Update our stats while still holding the socket locks. This * synchronizes with getsockopt(SO_SPLICE), see the comment there. */ if (error == 0) { KASSERT(len >= 0, ("%s: len %zd < 0", __func__, len)); so_src->so_splice_sent += len; } splice_unlock_pair(so_src, so_dst); switch (sp->state) { case SPLICE_CLOSING: closing: sp->state = SPLICE_CLOSED; wakeup(sp); mtx_unlock(&sp->mtx); break; case SPLICE_RUNNING: if (error != 0 || (sp->max > 0 && so_src->so_splice_sent >= sp->max)) { sp->state = SPLICE_EXCEPTION; soref(so_src); mtx_unlock(&sp->mtx); (void)so_unsplice(so_src, false); sorele(so_src); } else { /* * Locklessly check for additional bytes in the source's * receive buffer and queue more work if possible. We * may end up queuing needless work, but that's ok, and * if we race with a thread inserting more data into the * buffer and observe sbavail() == 0, the splice mutex * ensures that splice_push() will queue more work for * us. */ if (sbavail(&so_src->so_rcv) > 0 && sbspace(&so_dst->so_snd) > 0) { sp->state = SPLICE_QUEUED; mtx_unlock(&sp->mtx); so_splice_dispatch_async(sp); } else { sp->state = SPLICE_IDLE; mtx_unlock(&sp->mtx); } } break; default: __assert_unreachable(); } } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. 
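 *
 * (Illustrative aside: because the handler below only accepts increases,
 * and only up to maxfiles, a tuning command such as
 *
 *     sysctl kern.ipc.maxsockets=262144
 *
 * succeeds only if the new value is larger than the current limit and does
 * not exceed kern.maxfiles.)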
*/ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr && newmaxsockets != maxsockets) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. */ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows to lock 2 sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. */ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF); mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd_sx, "so_snd_sx"); sx_init(&so->so_rcv_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. 
*/ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd_sx); sx_destroy(&so->so_rcv_sx); mtx_destroy(&so->so_snd_mtx); mtx_destroy(&so->so_rcv_mtx); } crfree(so->so_cred); mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1 and a file descriptor * reference. The socket should be closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; /* * XXX: divert(4) historically abused PF_INET. Keep this compatibility * shim until all applications have been updated. */ if (__predict_false(dom == PF_INET && type == SOCK_RAW && proto == IPPROTO_DIVERT)) { dom = PF_DIVERT; printf("%s uses obsolete way to create divert(4) socket\n", td->td_proc->p_comm); } prp = pffindproto(dom, type, proto); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type. */ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } MPASS(prp->pr_attach); if ((prp->pr_flags & PR_CAPATTACH) == 0) { if (CAP_TRACING(td)) ktrcapfail(CAPFAIL_PROTO, &proto); if (IN_CAPABILITY_MODE(td)) return (ECAPMODE); } if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); if ((prp->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. 
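 *
 * (Illustrative aside: a protocol opts in by setting the auto-sizing flag
 * on its buffers in pr_attach; an application that wants a fixed size can
 * typically pin it from userland instead, e.g.
 *
 *     int sz = 256 * 1024;
 *     setsockopt(s, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
 *
 * subject to the kern.ipc.maxsockbuf limit.)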
 */
    CURVNET_SET(so->so_vnet);
    error = prp->pr_attach(so, proto, td);
    CURVNET_RESTORE();
    if (error) {
        sodealloc(so);
        return (error);
    }
    soref(so);
    *aso = so;
    return (0);
}

#ifdef REGRESSION
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif

static int sooverprio = LOG_DEBUG;
SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW,
    &sooverprio, 0,
    "Log priority for listen socket overflows: 0..7 or -1 to disable");

static struct timeval overinterval = { 60, 0 };
SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
    &overinterval,
    "Delay in seconds between warnings for listen socket overflows");

/*
 * When an attempt at a new connection is noted on a socket which supports
 * accept(2), the protocol has two options:
 * 1) Call legacy sonewconn() function, which would call protocol attach
 *    method, same as used for socket(2).
 * 2) Call solisten_clone(), do attach that is specific to a cloned connection,
 *    and then call solisten_enqueue().
 *
 * Note: the ref count on the socket is 0 on return.
 */
struct socket *
solisten_clone(struct socket *head)
{
    struct sbuf descrsb;
    struct socket *so;
    int len, overcount;
    u_int qlen;
    const char localprefix[] = "local:";
    char descrbuf[SUNPATHLEN + sizeof(localprefix)];
#if defined(INET6)
    char addrbuf[INET6_ADDRSTRLEN];
#elif defined(INET)
    char addrbuf[INET_ADDRSTRLEN];
#endif
    bool dolog, over;

    SOLISTEN_LOCK(head);
    over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
#ifdef REGRESSION
    if (regression_sonewconn_earlytest && over) {
#else
    if (over) {
#endif
        head->sol_overcount++;
        dolog = (sooverprio >= 0) &&
            !!ratecheck(&head->sol_lastover, &overinterval);

        /*
         * If we're going to log, copy the overflow count and queue
         * length from the listen socket before dropping the lock.
         * Also, reset the overflow count.
         */
        if (dolog) {
            overcount = head->sol_overcount;
            head->sol_overcount = 0;
            qlen = head->sol_qlen;
        }
        SOLISTEN_UNLOCK(head);

        if (dolog) {
            /*
             * Try to print something descriptive about the
             * socket for the error message.
             */
            sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
                SBUF_FIXEDLEN);
            switch (head->so_proto->pr_domain->dom_family) {
#if defined(INET) || defined(INET6)
#ifdef INET
            case AF_INET:
#endif
#ifdef INET6
            case AF_INET6:
                if (head->so_proto->pr_domain->dom_family ==
                    AF_INET6 ||
                    (sotoinpcb(head)->inp_inc.inc_flags &
                    INC_ISIPV6)) {
                    ip6_sprintf(addrbuf,
                        &sotoinpcb(head)->inp_inc.inc6_laddr);
                    sbuf_printf(&descrsb, "[%s]", addrbuf);
                } else
#endif
                {
#ifdef INET
                    inet_ntoa_r(
                        sotoinpcb(head)->inp_inc.inc_laddr,
                        addrbuf);
                    sbuf_cat(&descrsb, addrbuf);
#endif
                }
                sbuf_printf(&descrsb, ":%hu (proto %u)",
                    ntohs(sotoinpcb(head)->inp_inc.inc_lport),
                    head->so_proto->pr_protocol);
                break;
#endif /* INET || INET6 */
            case AF_UNIX:
                sbuf_cat(&descrsb, localprefix);
                if (sotounpcb(head)->unp_addr != NULL)
                    len =
                        sotounpcb(head)->unp_addr->sun_len -
                        offsetof(struct sockaddr_un, sun_path);
                else
                    len = 0;
                if (len > 0)
                    sbuf_bcat(&descrsb,
                        sotounpcb(head)->unp_addr->sun_path,
                        len);
                else
                    sbuf_cat(&descrsb, "(unknown)");
                break;
            }

            /*
             * If we can't print something more specific, at least
             * print the domain name.
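 *
 * (For illustration only: with the format used below, a logged overflow
 * reads roughly like
 *
 *     sonewconn: pcb 0xfffff80012345678 (192.0.2.10:80 (proto 6)):
 *         Listen queue overflow: 193 already in queue awaiting acceptance
 *         (4 occurrences)
 *
 * possibly followed by credential and jail information; the numbers here
 * are made up for the example.)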
*/ if (sbuf_finish(&descrsb) != 0 || sbuf_len(&descrsb) <= 0) { sbuf_clear(&descrsb); sbuf_cat(&descrsb, head->so_proto->pr_domain->dom_name ?: "unknown"); sbuf_finish(&descrsb); } KASSERT(sbuf_len(&descrsb) > 0, ("%s: sbuf creation failed", __func__)); /* * Preserve the historic listen queue overflow log * message, that starts with "sonewconn:". It has * been known to sysadmins for years and also test * sys/kern/sonewconn_overflow checks for it. */ if (head->so_cred == 0) { log(LOG_PRI(sooverprio), "sonewconn: pcb %p (%s): " "Listen queue overflow: %i already in " "queue awaiting acceptance (%d " "occurrences)\n", head->so_pcb, sbuf_data(&descrsb), qlen, overcount); } else { log(LOG_PRI(sooverprio), "sonewconn: pcb %p (%s): " "Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences), euid %d, rgid %d, jail %s\n", head->so_pcb, sbuf_data(&descrsb), qlen, overcount, head->so_cred->cr_uid, head->so_cred->cr_rgid, head->so_cred->cr_prison ? head->so_cred->cr_prison->pr_name : "not_jailed"); } sbuf_delete(&descrsb); overcount = 0; } return (NULL); } SOLISTEN_UNLOCK(head); VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_listen = head; so->so_type = head->so_type; /* * POSIX is ambiguous on what options an accept(2)ed socket should * inherit from the listener. Words "create a new socket" may be * interpreted as not inheriting anything. Best programming practice * for application developers is to not rely on such inheritance. * FreeBSD had historically inherited all so_options excluding * SO_ACCEPTCONN, which virtually means all SOL_SOCKET level options, * including those completely irrelevant to a new born socket. For * compatibility with older versions we will inherit a list of * meaningful options. */ so->so_options = head->so_options & (SO_KEEPALIVE | SO_DONTROUTE | SO_LINGER | SO_OOBINLINE | SO_NOSIGPIPE); so->so_linger = head->so_linger; so->so_state = head->so_state; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; so->so_snd.sb_lowat = head->sol_sbsnd_lowat; so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE; so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE; if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } return (so); } /* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. 
*/ struct socket * sonewconn(struct socket *head, int connstatus) { struct socket *so; if ((so = solisten_clone(head)) == NULL) return (NULL); if (so->so_proto->pr_attach(so, 0, NULL) != 0) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n", __func__, head->so_pcb); return (NULL); } (void)solisten_enqueue(so, connstatus); return (so); } /* * Enqueue socket cloned by solisten_clone() to the listen queue of the * listener it has been cloned from. * * Return 'true' if socket landed on complete queue, otherwise 'false'. */ bool solisten_enqueue(struct socket *so, int connstatus) { struct socket *head = so->so_listen; MPASS(refcount_load(&so->so_count) == 0); refcount_init(&so->so_count, 1); SOLISTEN_LOCK(head); if (head->sol_accept_filter != NULL) connstatus = 0; so->so_state |= connstatus; soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); so->so_qstate = SQ_COMP; head->sol_qlen++; solisten_wakeup(head); /* unlocks */ return (true); } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->sol_incomp); TAILQ_REMOVE(&head->sol_incomp, sp, so_list); head->sol_incqlen--; SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); sorele_locked(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); SOLISTEN_LOCK(head); } TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); so->so_qstate = SQ_INCOMP; head->sol_incqlen++; SOLISTEN_UNLOCK(head); return (false); } } #if defined(SCTP) || defined(SCTP_SUPPORT) /* * Socket part of sctp_peeloff(). Detach a new socket from an * association. The new socket is returned with a reference. * * XXXGL: reduce copy-paste with solisten_clone(). 
*/ struct socket * sopeeloff(struct socket *head) { struct socket *so; VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_type = head->so_type; so->so_options = head->so_options; so->so_linger = head->so_linger; so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) { so->so_snd.sb_mtx = &so->so_snd_mtx; so->so_rcv.sb_mtx = &so->so_rcv_mtx; } soref(so); return (so); } #endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_bind(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_bindat(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. * * Protocol implementors are advised to hold the socket lock across the * socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_listen(so, backlog, td); CURVNET_RESTORE(); return (error); } /* * Prepare for a call to solisten_proto(). Acquire all socket buffer locks in * order to interlock with socket I/O. */ int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) != 0) return (EINVAL); /* * Sleeping is not permitted here, so simply fail if userspace is * attempting to transmit or receive on the socket. This kind of * transient failure is not ideal, but it should occur only if userspace * is misusing the socket interfaces. 
*/ if (!sx_try_xlock(&so->so_snd_sx)) return (EAGAIN); if (!sx_try_xlock(&so->so_rcv_sx)) { sx_xunlock(&so->so_snd_sx); return (EAGAIN); } mtx_lock(&so->so_snd_mtx); mtx_lock(&so->so_rcv_mtx); /* Interlock with soo_aio_queue() and KTLS. */ if (!SOLISTENING(so)) { bool ktls; #ifdef KERN_TLS ktls = so->so_snd.sb_tls_info != NULL || so->so_rcv.sb_tls_info != NULL; #else ktls = false; #endif if (ktls || (so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 || (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0) { solisten_proto_abort(so); return (EINVAL); } } return (0); } /* * Undo the setup done by solisten_proto_check(). */ void solisten_proto_abort(struct socket *so) { mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } void solisten_proto(struct socket *so, int backlog) { int sbrcv_lowat, sbsnd_lowat; u_int sbrcv_hiwat, sbsnd_hiwat; short sbrcv_flags, sbsnd_flags; sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0, ("%s: bad socket state %p", __func__, so)); if (SOLISTENING(so)) goto listening; /* * Change this socket to listening state. */ sbrcv_lowat = so->so_rcv.sb_lowat; sbsnd_lowat = so->so_snd.sb_lowat; sbrcv_hiwat = so->so_rcv.sb_hiwat; sbsnd_hiwat = so->so_snd.sb_hiwat; sbrcv_flags = so->so_rcv.sb_flags; sbsnd_flags = so->so_snd.sb_flags; sbrcv_timeo = so->so_rcv.sb_timeo; sbsnd_timeo = so->so_snd.sb_timeo; #ifdef MAC mac_socketpeer_label_free(so->so_peerlabel); #endif sbdestroy(so, SO_SND); sbdestroy(so, SO_RCV); #ifdef INVARIANTS bzero(&so->so_rcv, sizeof(struct socket) - offsetof(struct socket, so_rcv)); #endif so->sol_sbrcv_lowat = sbrcv_lowat; so->sol_sbsnd_lowat = sbsnd_lowat; so->sol_sbrcv_hiwat = sbrcv_hiwat; so->sol_sbsnd_hiwat = sbsnd_hiwat; so->sol_sbrcv_flags = sbrcv_flags; so->sol_sbsnd_flags = sbsnd_flags; so->sol_sbrcv_timeo = sbrcv_timeo; so->sol_sbsnd_timeo = sbsnd_timeo; so->sol_qlen = so->sol_incqlen = 0; TAILQ_INIT(&so->sol_incomp); TAILQ_INIT(&so->sol_comp); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->sol_upcall = NULL; so->sol_upcallarg = NULL; so->so_options |= SO_ACCEPTCONN; listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; mtx_unlock(&so->so_snd_mtx); mtx_unlock(&so->so_rcv_mtx); sx_xunlock(&so->so_snd_sx); sx_xunlock(&so->so_rcv_sx); } /* * Wakeup listeners/subsystems once we have a complete connection. * Enters with lock, returns unlocked. */ void solisten_wakeup(struct socket *sol) { if (sol->sol_upcall != NULL) (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); else { selwakeuppri(&sol->so_rdsel, PSOCK); KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) pgsigio(&sol->so_sigio, SIGIO, 0); } /* * Return single connection off a listening socket queue. Main consumer of * the function is kern_accept4(). Some modules, that do their own accept * management also use the function. The socket reference held by the * listen queue is handed to the caller. * * Listening socket must be locked on entry and is returned unlocked on * return. * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. 
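 *
 * (Illustrative sketch of a caller, in the style of kern_accept4() described
 * above:
 *
 *     SOLISTEN_LOCK(head);
 *     error = solisten_dequeue(head, &so, ACCEPT4_INHERIT);
 *     if (error != 0)
 *             return (error);
 *     ... use 'so', then drop the reference handed over by the listen queue ...
 *
 * solisten_dequeue() returns with 'head' unlocked in all cases.)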
*/ int solisten_dequeue(struct socket *head, struct socket **ret, int flags) { struct socket *so; int error; SOLISTEN_LOCK_ASSERT(head); while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH, "accept", 0); if (error != 0) { SOLISTEN_UNLOCK(head); return (error); } } if (head->so_error) { error = head->so_error; head->so_error = 0; } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) error = EWOULDBLOCK; else error = 0; if (error) { SOLISTEN_UNLOCK(head); return (error); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); KASSERT(so->so_qstate == SQ_COMP, ("%s: so %p not SQ_COMP", __func__, so)); head->sol_qlen--; so->so_qstate = SQ_NONE; so->so_listen = NULL; TAILQ_REMOVE(&head->sol_comp, so, so_list); if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; SOCK_UNLOCK(so); sorele_locked(head); *ret = so; return (0); } static struct so_splice * so_splice_alloc(off_t max) { struct so_splice *sp; sp = uma_zalloc(splice_zone, M_WAITOK); sp->src = NULL; sp->dst = NULL; sp->max = max > 0 ? max : -1; do { sp->wq_index = atomic_fetchadd_32(&splice_index, 1) % (mp_maxid + 1); } while (CPU_ABSENT(sp->wq_index)); sp->state = SPLICE_IDLE; TIMEOUT_TASK_INIT(taskqueue_thread, &sp->timeout, 0, so_splice_timeout, sp); return (sp); } static void so_splice_free(struct so_splice *sp) { KASSERT(sp->state == SPLICE_CLOSED, ("so_splice_free: sp %p not closed", sp)); uma_zfree(splice_zone, sp); } static void so_splice_timeout(void *arg, int pending __unused) { struct so_splice *sp; sp = arg; (void)so_unsplice(sp->src, true); } /* * Splice the output from so to the input of so2. */ static int so_splice(struct socket *so, struct socket *so2, struct splice *splice) { struct so_splice *sp; int error; if (splice->sp_max < 0) return (EINVAL); /* Handle only TCP for now; TODO: other streaming protos */ if (so->so_proto->pr_protocol != IPPROTO_TCP || so2->so_proto->pr_protocol != IPPROTO_TCP) return (EPROTONOSUPPORT); if (so->so_vnet != so2->so_vnet) return (EINVAL); /* so_splice_xfer() assumes that we're using these implementations. 
*/ KASSERT(so->so_proto->pr_sosend == sosend_generic, ("so_splice: sosend not sosend_generic")); KASSERT(so2->so_proto->pr_soreceive == soreceive_generic || so2->so_proto->pr_soreceive == soreceive_stream, ("so_splice: soreceive not soreceive_generic/stream")); sp = so_splice_alloc(splice->sp_max); so->so_splice_sent = 0; sp->src = so; sp->dst = so2; error = 0; SOCK_LOCK(so); if (SOLISTENING(so)) error = EINVAL; else if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) error = ENOTCONN; else if (so->so_splice != NULL) error = EBUSY; if (error != 0) { SOCK_UNLOCK(so); uma_zfree(splice_zone, sp); return (error); } soref(so); so->so_splice = sp; SOCK_RECVBUF_LOCK(so); so->so_rcv.sb_flags |= SB_SPLICED; SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); error = 0; SOCK_LOCK(so2); if (SOLISTENING(so2)) error = EINVAL; else if ((so2->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0) error = ENOTCONN; else if (so2->so_splice_back != NULL) error = EBUSY; if (error != 0) { SOCK_UNLOCK(so2); SOCK_LOCK(so); so->so_splice = NULL; SOCK_RECVBUF_LOCK(so); so->so_rcv.sb_flags &= ~SB_SPLICED; SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); sorele(so); uma_zfree(splice_zone, sp); return (error); } soref(so2); so2->so_splice_back = sp; SOCK_SENDBUF_LOCK(so2); so2->so_snd.sb_flags |= SB_SPLICED; mtx_lock(&sp->mtx); SOCK_SENDBUF_UNLOCK(so2); SOCK_UNLOCK(so2); if (splice->sp_idle.tv_sec != 0 || splice->sp_idle.tv_usec != 0) { taskqueue_enqueue_timeout_sbt(taskqueue_thread, &sp->timeout, tvtosbt(splice->sp_idle), 0, C_PREL(4)); } /* * Transfer any data already present in the socket buffer. */ sp->state = SPLICE_QUEUED; so_splice_xfer(sp); return (0); } static int so_unsplice(struct socket *so, bool timeout) { struct socket *so2; struct so_splice *sp; bool drain; /* * First unset SB_SPLICED and hide the splice structure so that * wakeup routines will stop enqueuing work. This also ensures that * a only a single thread will proceed with the unsplice. */ SOCK_LOCK(so); if (SOLISTENING(so)) { SOCK_UNLOCK(so); return (EINVAL); } SOCK_RECVBUF_LOCK(so); if ((so->so_rcv.sb_flags & SB_SPLICED) == 0) { SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); return (ENOTCONN); } so->so_rcv.sb_flags &= ~SB_SPLICED; sp = so->so_splice; so->so_splice = NULL; SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); so2 = sp->dst; SOCK_LOCK(so2); KASSERT(!SOLISTENING(so2), ("%s: so2 is listening", __func__)); SOCK_SENDBUF_LOCK(so2); KASSERT((so2->so_snd.sb_flags & SB_SPLICED) != 0, ("%s: so2 is not spliced", __func__)); KASSERT(so2->so_splice_back == sp, ("%s: so_splice_back != sp", __func__)); so2->so_snd.sb_flags &= ~SB_SPLICED; so2->so_splice_back = NULL; SOCK_SENDBUF_UNLOCK(so2); SOCK_UNLOCK(so2); /* * No new work is being enqueued. The worker thread might be * splicing data right now, in which case we want to wait for it to * finish before proceeding. */ mtx_lock(&sp->mtx); switch (sp->state) { case SPLICE_QUEUED: case SPLICE_RUNNING: sp->state = SPLICE_CLOSING; while (sp->state == SPLICE_CLOSING) msleep(sp, &sp->mtx, PSOCK, "unsplice", 0); break; case SPLICE_IDLE: case SPLICE_EXCEPTION: sp->state = SPLICE_CLOSED; break; default: __assert_unreachable(); } if (!timeout) { drain = taskqueue_cancel_timeout(taskqueue_thread, &sp->timeout, NULL) != 0; } else { drain = false; } mtx_unlock(&sp->mtx); if (drain) taskqueue_drain_timeout(taskqueue_thread, &sp->timeout); /* * Now we hold the sole reference to the splice structure. * Clean up: signal userspace and release socket references. 
*/ sorwakeup(so); CURVNET_SET(so->so_vnet); sorele(so); sowwakeup(so2); sorele(so2); CURVNET_RESTORE(); so_splice_free(sp); return (0); } /* * Free socket upon release of the very last reference. */ static void sofree(struct socket *so) { struct protosw *pr = so->so_proto; SOCK_LOCK_ASSERT(so); KASSERT(refcount_load(&so->so_count) == 0, ("%s: so %p has references", __func__, so)); KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE, ("%s: so %p is on listen queue", __func__, so)); KASSERT(SOLISTENING(so) || (so->so_rcv.sb_flags & SB_SPLICED) == 0, ("%s: so %p rcvbuf is spliced", __func__, so)); KASSERT(SOLISTENING(so) || (so->so_snd.sb_flags & SB_SPLICED) == 0, ("%s: so %p sndbuf is spliced", __func__, so)); KASSERT(so->so_splice == NULL && so->so_splice_back == NULL, ("%s: so %p has spliced data", __func__, so)); SOCK_UNLOCK(so); if (so->so_dtor != NULL) so->so_dtor(so); VNET_SO_ASSERT(so); if ((pr->pr_flags & PR_RIGHTS) && !SOLISTENING(so)) { MPASS(pr->pr_domain->dom_dispose != NULL); (*pr->pr_domain->dom_dispose)(so); } if (pr->pr_detach != NULL) pr->pr_detach(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. */ if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) { sbdestroy(so, SO_SND); sbdestroy(so, SO_RCV); } seldrain(&so->so_rdsel); seldrain(&so->so_wrsel); knlist_destroy(&so->so_rdsel.si_note); knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Release a reference on a socket while holding the socket lock. * Unlocks the socket lock before returning. */ void sorele_locked(struct socket *so) { SOCK_LOCK_ASSERT(so); if (refcount_release(&so->so_count)) sofree(so); else SOCK_UNLOCK(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. 
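 *
 * (Illustrative aside: the lingering close handled below is what a process
 * enables with, e.g.,
 *
 *     struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *     setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *     close(s);
 *
 * in which case soclose() sleeps for up to l_linger seconds (wchan "soclos")
 * waiting for the disconnect to complete, unless the socket is non-blocking.)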
*/ int soclose(struct socket *so) { struct accept_queue lqueue; int error = 0; bool listening, last __diagused; CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_close != NULL) so->so_proto->pr_close(so); SOCK_LOCK(so); if ((listening = SOLISTENING(so))) { struct socket *sp; TAILQ_INIT(&lqueue); TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); so->sol_qlen = so->sol_incqlen = 0; TAILQ_FOREACH(sp, &lqueue, so_list) { SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); last = refcount_release(&so->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, so)); } } sorele_locked(so); if (listening) { struct socket *sp, *tsp; TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) soabort(sp); } CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted. * * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on. Likely this thread holds the last * reference on the socket and soabort() will proceed with sofree(). But * it might be not the last, as the sockets on the listen queues are seen * from the protocol side. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. * * Usually socket should have a single reference left, but this is not a * requirement. In the past, when we have had named references for file * descriptor and protocol, we asserted that none of them are being held. */ void soabort(struct socket *so) { VNET_SO_ASSERT(so); if (so->so_proto->pr_abort != NULL) so->so_proto->pr_abort(so); SOCK_LOCK(so); sorele_locked(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_accept(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. * * Note, this check is racy and may need to be re-evaluated at the * protocol layer. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. 
*/ so->so_error = 0; if (fd == AT_FDCWD) { error = so->so_proto->pr_connect(so, nam, td); } else { error = so->so_proto->pr_connectat(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = so1->so_proto->pr_connect2(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = so->so_proto->pr_disconnect(so); return (error); } int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. 
We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ static int sosend_generic_locked(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pr_send_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_send_flag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif SOCK_IO_SEND_ASSERT_LOCKED(so); if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) resid = top->m_pkthdr.len; else resid = m_length(top, NULL); /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. 
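 *
 * (Illustrative: a caller doing send(s, buf, len, MSG_EOR) on a SOCK_STREAM
 * socket is therefore rejected with EINVAL by the check below.)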
*/ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; #ifdef KERN_TLS tls_send_flag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->mode == TCP_TLS_MODE_SW) tls_send_flag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } if (resid == 0 && !ktls_permit_empty_frames(tls)) { error = EINVAL; goto out; } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto out; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto out; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto out; } error = sbwait(so, SO_SND); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto out; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; #ifdef KERN_TLS if (tls != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); tls_rtype = TLS_RLTYPE_APP; } #endif } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods to panic. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_EXTPG | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. 
We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pr_send_flag |= tls_send_flag; #endif error = so->so_proto->pr_send(so, pr_send_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { if (error != 0) { m_freem(top); top = NULL; } else { soref(so); ktls_enqueue(top, so, tls_enq_cnt); } } #endif clen = 0; control = NULL; top = NULL; if (error) goto out; } while (resid && space > 0); } while (resid); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); error = sosend_generic_locked(so, addr, uio, top, control, flags, td); SOCK_IO_SEND_UNLOCK(so); return (error); } /* * Send to a socket from a kernel thread. * * XXXGL: in almost all cases uio is NULL and the mbuf is supplied. * Exception is nfs/bootp_subr.c. It is arguable that the VNET context needs * to be set at all. This function should just boil down to a static inline * calling the protocol method. */ int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_sosend(so, addr, uio, top, control, flags, td); CURVNET_RESTORE(); return (error); } /* * send(2), write(2) or aio_write(2) on a socket. */ int sousrsend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *control, int flags, struct proc *userproc) { struct thread *td; ssize_t len; int error; td = uio->uio_td; len = uio->uio_resid; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_sosend(so, addr, uio, NULL, control, flags, td); CURVNET_RESTORE(); if (error != 0) { /* * Clear transient errors for stream protocols if they made * some progress. Make exclusion for aio(4) that would * schedule a new write in case of EWOULDBLOCK and clear * error itself. See soaio_process_job(). */ if (uio->uio_resid != len && (so->so_proto->pr_flags & PR_ATOMIC) == 0 && userproc == NULL && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket. */ if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0 && (flags & MSG_NOSIGNAL) == 0) { if (userproc != NULL) { /* aio(4) job */ PROC_LOCK(userproc); kern_psignal(userproc, SIGPIPE); PROC_UNLOCK(userproc); } else { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } } return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. 
* * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = pr->pr_rcvoob(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. */ static int soreceive_generic_locked(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) { struct mbuf *m; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; bool report_real_len = false; SOCK_IO_RECV_ASSERT_LOCKED(so); error = 0; if (flagsp != NULL) { report_real_len = *flagsp & MSG_TRUNC; *flagsp &= ~MSG_TRUNC; flags = *flagsp &~ MSG_EOR; } else flags = 0; restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. 
MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error || so->so_rerror) { if (m != NULL) goto dontblock; if (so->so_error) error = so->so_error; else error = so->so_rerror; if ((flags & MSG_PEEK) == 0) { if (so->so_error) so->so_error = 0; else so->so_rerror = 0; } SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m != NULL) goto dontblock; #ifdef KERN_TLS else if (so->so_rcv.sb_tlsdcc == 0 && so->so_rcv.sb_tlscc == 0) { #else else { #endif SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0 && !report_real_len) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; #ifdef KERN_TLS struct cmsghdr *cmsg; struct tls_get_record tgr; /* * For MSG_TLSAPPDATA, check for an alert record. * If found, return ENXIO without removing * it from the receive queue. This allows a subsequent * call without MSG_TLSAPPDATA to receive it. 
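 *
 * (Illustrative sketch: a KTLS-aware consumer can request only application
 * data with something like
 *
 *     n = recv(s, buf, sizeof(buf), MSG_TLSAPPDATA);
 *
 * and, on an ENXIO return, repeat the call without MSG_TLSAPPDATA to pick up
 * the pending alert record and its TLS_GET_RECORD control message.)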
* Note that, for TLS, there should only be a single * control mbuf with the TLS_GET_RECORD message in it. */ if (flags & MSG_TLSAPPDATA) { cmsg = mtod(m, struct cmsghdr *); if (cmsg->cmsg_type == TLS_GET_RECORD && cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); if (__predict_false(tgr.tls_type == TLS_RLTYPE_ALERT)) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENXIO; goto release; } } } #endif do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastercord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied. Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. 
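 * (uiomove() may fault on the user buffer and sleep, which is why the
 * sockbuf lock is dropped around the copy below and re-acquired before
 * the sockbuf pointers are examined again.)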
*/ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if ((m->m_flags & M_EXTPG) != 0) error = m_unmapped_uiomove(m, moff, uio, (int)len); else error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. */ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rerror || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); pr->pr_rcvd(so, flags); SOCKBUF_LOCK(&so->so_rcv); if (__predict_false(so->so_rcv.sb_mb == NULL && (so->so_error || so->so_rerror || so->so_rcv.sb_state & SBS_CANTRCVMORE))) break; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while was notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(so, SO_RCV); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { if (report_real_len) uio->uio_resid -= m_length(m, NULL) - moff; flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). 
Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then don't need to generate ACK to peer to update window, * since ACK will be generated on return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); pr->pr_rcvd(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: return (error); } int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) { int error, flags; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) { flags = *flagsp; if ((flags & MSG_OOB) != 0) return (soreceive_rcvoob(so, uio, flags)); } else { flags = 0; } if (mp != NULL) *mp = NULL; if ((so->so_proto->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); so->so_proto->pr_rcvd(so, 0); } error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); error = soreceive_generic_locked(so, psa, uio, mp, controlp, flagsp); SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ static int soreceive_stream_locked(struct socket *so, struct sockbuf *sb, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int flags) { int len = 0, error = 0, oresid; struct mbuf *m, *n = NULL; SOCK_IO_RECV_ASSERT_LOCKED(so); /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) return (EINVAL); oresid = uio->uio_resid; SOCKBUF_LOCK(sb); /* We will never ever get anything unless we are or were connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. 
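 * (sbwait() releases the sockbuf lock while sleeping and re-acquires it
 * before returning, so on wakeup control jumps back to the restart label
 * and re-evaluates the socket state from scratch.)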
*/ error = sbwait(so, SO_RCV); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); so->so_proto->pr_rcvd(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); return (error); } int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct sockbuf *sb; int error, flags; sb = &so->so_rcv; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (flagsp != NULL) flags = *flagsp & ~MSG_EOR; else flags = 0; if (controlp != NULL) *controlp = NULL; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; #ifdef KERN_TLS /* * KTLS store TLS records as records with a control message to * describe the framing. * * We check once here before acquiring locks to optimize the * common case. */ if (sb->sb_tls_info != NULL) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); #endif /* * Prevent other threads from reading from the socket. This lock may be * dropped in order to sleep waiting for data to arrive. */ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); #ifdef KERN_TLS if (__predict_false(sb->sb_tls_info != NULL)) { SOCK_IO_RECV_UNLOCK(so); return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); } #endif error = soreceive_stream_locked(so, sb, psa, uio, mp0, controlp, flags); SOCK_IO_RECV_UNLOCK(so); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. 
* Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC))) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(so, SO_RCV); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. 
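 * For SCM_RIGHTS-style messages the dom_externalize hook is what turns
 * in-kernel file references into descriptors in the receiving process;
 * note below that it is invoked even when controlp is NULL so that it can
 * dispose of those references rather than leak them.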
*/ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp); CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, int how) { struct protosw *pr; int error, soerror_enotconn; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); soerror_enotconn = 0; SOCK_LOCK(so); if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { /* * POSIX mandates us to return ENOTCONN when shutdown(2) is * invoked on a datagram sockets, however historically we would * actually tear socket down. This is known to be leveraged by * some applications to unblock process waiting in recvXXX(2) * by other process that it shares that socket with. Try to meet * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still asking protocol to perform pru_shutdown(). */ if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) { SOCK_UNLOCK(so); return (ENOTCONN); } soerror_enotconn = 1; } if (SOLISTENING(so)) { if (how != SHUT_WR) { so->so_error = ECONNABORTED; solisten_wakeup(so); /* unlocks so */ } else { SOCK_UNLOCK(so); } goto done; } SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); pr = so->so_proto; if (pr->pr_flush != NULL) pr->pr_flush(so, how); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { error = pr->pr_shutdown(so); wakeup(&so->so_timeo); CURVNET_RESTORE(); return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); } wakeup(&so->so_timeo); CURVNET_RESTORE(); done: return (soerror_enotconn ? ENOTCONN : 0); } void sorflush(struct socket *so) { struct protosw *pr; int error; VNET_SO_ASSERT(so); /* * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptable waiting. */ socantrcvmore(so); error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR); if (error != 0) { KASSERT(SOLISTENING(so), ("%s: soiolock(%p) failed", __func__, so)); return; } pr = so->so_proto; if (pr->pr_flags & PR_RIGHTS) { MPASS(pr->pr_domain->dom_dispose != NULL); (*pr->pr_domain->dom_dispose)(so); } else { sbrelease(so, SO_RCV); SOCK_IO_RECV_UNLOCK(so); } } /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. 
*/ static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). * * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val, *valp; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; goto bad; } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = 
so->so_proto->pr_setsbopt(so, sopt); if (error) goto bad; break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); SOCK_LOCK(so); valp = sopt->sopt_name == SO_SNDTIMEO ? (SOLISTENING(so) ? &so->sol_sbsnd_timeo : &so->so_snd.sb_timeo) : (SOLISTENING(so) ? &so->sol_sbrcv_timeo : &so->so_rcv.sb_timeo); *valp = val; SOCK_UNLOCK(so); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; case SO_SPLICE: { struct splice splice; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct splice32 splice32; error = sooptcopyin(sopt, &splice32, sizeof(splice32), sizeof(splice32)); if (error == 0) { splice.sp_fd = splice32.sp_fd; splice.sp_max = splice32.sp_max; CP(splice32.sp_idle, splice.sp_idle, tv_sec); CP(splice32.sp_idle, splice.sp_idle, tv_usec); } } else #endif { error = sooptcopyin(sopt, &splice, sizeof(splice), sizeof(splice)); } if (error) goto bad; #ifdef KTRACE if (KTRPOINT(curthread, KTR_STRUCT)) ktrsplice(&splice); #endif error = splice_init(); if (error != 0) goto bad; if (splice.sp_fd >= 0) { struct file *fp; struct socket *so2; if (!cap_rights_contains(sopt->sopt_rights, &cap_recv_rights)) { error = ENOTCAPABLE; goto bad; } error = getsock(sopt->sopt_td, splice.sp_fd, &cap_send_rights, &fp); if (error != 0) goto bad; so2 = fp->f_data; error = so_splice(so, so2, &splice); fdrop(fp, sopt->sopt_td); } else { error = so_unsplice(so, false); } break; } default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. 
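 * As a concrete illustration (not part of this change): fetching a 4-byte
 * integer option with a 2-byte buffer copies out only the first two bytes
 * and sets sopt_valsize to 2, so the caller observes a silently truncated
 * value rather than an error.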
*/ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: case SO_RERROR: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_DOMAIN: optval = so->so_proto->pr_domain->dom_family; goto integer; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); if (so->so_error) { optval = so->so_error; so->so_error = 0; } else { optval = so->so_rerror; so->so_rerror = 0; } SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat; SOCK_UNLOCK(so); goto integer; case SO_RCVBUF: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat; SOCK_UNLOCK(so); goto integer; case SO_SNDLOWAT: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : so->so_snd.sb_lowat; SOCK_UNLOCK(so); goto integer; case SO_RCVLOWAT: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : so->so_rcv.sb_lowat; SOCK_UNLOCK(so); goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: SOCK_LOCK(so); tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? (SOLISTENING(so) ? so->sol_sbsnd_timeo : so->so_snd.sb_timeo) : (SOLISTENING(so) ? so->sol_sbrcv_timeo : so->so_rcv.sb_timeo)); SOCK_UNLOCK(so); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_qlimit : 0; SOCK_UNLOCK(so); goto integer; case SO_LISTENQLEN: SOCK_LOCK(so); optval = SOLISTENING(so) ? 
so->sol_qlen : 0; SOCK_UNLOCK(so); goto integer; case SO_LISTENINCQLEN: SOCK_LOCK(so); optval = SOLISTENING(so) ? so->sol_incqlen : 0; SOCK_UNLOCK(so); goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; case SO_SPLICE: { off_t n; /* * Acquire the I/O lock to serialize with * so_splice_xfer(). This is not required for * correctness, but makes testing simpler: once a byte * has been transmitted to the sink and observed (e.g., * by reading from the socket to which the sink is * connected), a subsequent getsockopt(SO_SPLICE) will * return an up-to-date value. */ error = SOCK_IO_RECV_LOCK(so, SBL_WAIT); if (error != 0) goto bad; SOCK_LOCK(so); if (SOLISTENING(so)) { n = 0; } else { n = so->so_splice_sent; } SOCK_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); error = sooptcopyout(sopt, &n, sizeof(n)); break; } default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } bad: CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* enough soopt buffer should be given from user-land */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. 
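 * A userland consumer typically arranges for this notification ahead of
 * time; a minimal sketch (illustrative only, not part of this change):
 *
 *	signal(SIGURG, urg_handler);
 *	fcntl(s, F_SETOWN, getpid());
 *
 * after which pgsigio() below delivers SIGURG, or the process can simply
 * wait for POLLPRI/exceptfds readiness and read the mark with
 * recv(s, &b, 1, MSG_OOB).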
*/ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic(). */ return (so->so_proto->pr_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents; SOCK_LOCK(so); if (SOLISTENING(so)) { if (!(events & (POLLIN | POLLRDNORM))) revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); else if ((events & POLLINIGNEOF) == 0 && so->so_error) revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; } } else { revents = 0; SOCK_SENDBUF_LOCK(so); SOCK_RECVBUF_LOCK(so); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so) && !isspliced(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so) && !issplicedback(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (so->so_rcv.sb_state & SBS_CANTRCVMORE) revents |= events & POLLRDHUP; if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) { selrecord(td, &so->so_rdsel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_wrsel); so->so_snd.sb_flags |= SB_SEL; } } SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_UNLOCK(so); } SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; sb_which which; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; knl = &so->so_rdsel.si_note; sb = &so->so_rcv; which = SO_RCV; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; which = SO_SND; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; which = SO_SND; break; default: return (EINVAL); } SOCK_LOCK(so); if (SOLISTENING(so)) { knlist_add(knl, kn, 1); } else { SOCK_BUF_LOCK(so, which); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; SOCK_BUF_UNLOCK(so, which); } SOCK_UNLOCK(so); return (0); } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) return (0); SOCK_RECVBUF_LOCK_ASSERT(so); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error || so->so_rerror) return 
(1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCK_SENDBUF_LOCK_ASSERT(so); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCK_SENDBUF_LOCK_ASSERT(so); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. 
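 * In short, the sequences described above are:
 *
 *	active open:	connect(2) -> soisconnecting() ... soisconnected()
 *	teardown:	disconnect -> soisdisconnecting() ... soisdisconnected()
 *	passive open:	sonewconn() queues the socket on so_incomp and a
 *			later soisconnected() moves it to so_comp, where
 *			accept(2) can claim it.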
*/ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { bool last __diagused; SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; if (so->so_qstate == SQ_INCOMP) { struct socket *head = so->so_listen; int ret; KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); /* * Promoting a socket from incomplete queue to complete, we * need to go through reverse order of locking. We first do * trylock, and if that doesn't succeed, we go the hard way * leaving a reference and rechecking consistency after proper * locking. */ if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { soref(head); SOCK_UNLOCK(so); SOLISTEN_LOCK(head); SOCK_LOCK(so); if (__predict_false(head != so->so_listen)) { /* * The socket went off the listen queue, * should be lost race to close(2) of sol. * The socket is about to soabort(). */ SOCK_UNLOCK(so); sorele_locked(head); return; } last = refcount_release(&head->so_count); KASSERT(!last, ("%s: released last reference for %p", __func__, head)); } again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { TAILQ_REMOVE(&head->sol_incomp, so, so_list); head->sol_incqlen--; TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); head->sol_qlen++; so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); solisten_wakeup(head); /* unlocks */ } else { SOCK_RECVBUF_LOCK(so); soupcall_set(so, SO_RCV, head->sol_accept_filter->accf_callback, head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->sol_accept_filter->accf_callback(so, head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); SOCK_RECVBUF_UNLOCK(so); goto again; } SOCK_RECVBUF_UNLOCK(so); SOCK_UNLOCK(so); SOLISTEN_UNLOCK(head); } return; } SOCK_UNLOCK(so); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; if (!SOLISTENING(so)) { SOCK_RECVBUF_LOCK(so); socantrcvmore_locked(so); SOCK_SENDBUF_LOCK(so); socantsendmore_locked(so); } SOCK_UNLOCK(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { SOCK_LOCK(so); /* * There is at least one reader of so_state that does not * acquire socket lock, namely soreceive_generic(). Ensure * that it never sees all flags that track connection status * cleared, by ordering the update with a barrier semantic of * our release thread fence. */ so->so_state |= SS_ISDISCONNECTED; atomic_thread_fence_rel(); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); if (!SOLISTENING(so)) { SOCK_UNLOCK(so); SOCK_RECVBUF_LOCK(so); socantrcvmore_locked(so); SOCK_SENDBUF_LOCK(so); sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); socantsendmore_locked(so); } else SOCK_UNLOCK(so); wakeup(&so->so_timeo); } int soiolock(struct socket *so, struct sx *sx, int flags) { int error; KASSERT((flags & SBL_VALID) == flags, ("soiolock: invalid flags %#x", flags)); if ((flags & SBL_WAIT) != 0) { if ((flags & SBL_NOINTR) != 0) { sx_xlock(sx); } else { error = sx_xlock_sig(sx); if (error != 0) return (error); } } else if (!sx_try_xlock(sx)) { return (EWOULDBLOCK); } if (__predict_false(SOLISTENING(so))) { sx_xunlock(sx); return (ENOTCONN); } return (0); } void soiounlock(struct sx *sx) { sx_xunlock(sx); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. 
*/ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket destructor. */ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. */ void soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; } SOCK_BUF_LOCK_ASSERT(so, which); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, sb_which which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; } SOCK_BUF_LOCK_ASSERT(so, which); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; retry: if (SOLISTENING(so)) { SOLISTEN_LOCK(so); } else { SOCK_RECVBUF_LOCK(so); if (__predict_false(SOLISTENING(so))) { SOCK_RECVBUF_UNLOCK(so); goto retry; } } } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOLISTEN_UNLOCK(so); else SOCK_RECVBUF_UNLOCK(so); } static void so_rdknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOLISTEN_LOCK_ASSERT(so); else SOCK_RECVBUF_LOCK_ASSERT(so); } else { if (SOLISTENING(so)) SOLISTEN_UNLOCK_ASSERT(so); else SOCK_RECVBUF_UNLOCK_ASSERT(so); } } static void so_wrknl_lock(void *arg) { struct socket *so = arg; retry: if (SOLISTENING(so)) { SOLISTEN_LOCK(so); } else { SOCK_SENDBUF_LOCK(so); if (__predict_false(SOLISTENING(so))) { SOCK_SENDBUF_UNLOCK(so); goto retry; } } } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOLISTEN_UNLOCK(so); else SOCK_SENDBUF_UNLOCK(so); } static void so_wrknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOLISTEN_LOCK_ASSERT(so); else SOCK_SENDBUF_LOCK_ASSERT(so); } else { if (SOLISTENING(so)) SOLISTEN_UNLOCK_ASSERT(so); else SOCK_SENDBUF_UNLOCK_ASSERT(so); } } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. 
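 * (Consumers of the xsocket form are expected to be monitoring tools,
 * e.g. netstat(1) and sockstat(1) reading the pcblist sysctls.)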
*/ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; SOCK_LOCK(so); xso->so_fibnum = so->so_fibnum; if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); if ((so->so_rcv.sb_flags & SB_SPLICED) != 0) xso->so_splice_so = (uintptr_t)so->so_splice->dst; } SOCK_UNLOCK(so); } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), ("%s: val %d out of range", __func__, val)); so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index fd6682ef03b0..c5b0c7896a17 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -1,3164 +1,3164 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1997 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. John S. Dyson's name may not be used to endorse or promote products * derived from this software without specific prior written permission. * * DISCLAIMER: This code isn't warranted to do anything useful. Anything * bad that happens because of using this software isn't the responsibility * of the author. This software is distributed AS-IS. */ /* * This file contains support for the POSIX 1003.1B AIO/LIO facility. 
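 * For reference, the userland face of this facility is the standard POSIX
 * AIO API; a minimal usage sketch (illustrative only, not part of this
 * change, with fd and buf assumed to exist) looks like:
 *
 *	struct aiocb cb = { .aio_fildes = fd, .aio_buf = buf,
 *	    .aio_nbytes = sizeof(buf), .aio_offset = 0 };
 *
 *	if (aio_read(&cb) == -1)
 *		err(1, "aio_read");
 *	while (aio_error(&cb) == EINPROGRESS)
 *		;
 *	ssize_t n = aio_return(&cb);
 *
 * (A real program would block in aio_suspend() or request a sigevent
 * instead of spinning on aio_error().)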
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Counter for allocating reference ids to new jobs. Wrapped to 1 on * overflow. (XXX will be removed soon.) */ static u_long jobrefid; /* * Counter for aio_fsync. */ static uint64_t jobseqno; #ifndef MAX_AIO_PER_PROC #define MAX_AIO_PER_PROC 32 #endif #ifndef MAX_AIO_QUEUE_PER_PROC #define MAX_AIO_QUEUE_PER_PROC 256 #endif #ifndef MAX_AIO_QUEUE #define MAX_AIO_QUEUE 1024 /* Bigger than MAX_AIO_QUEUE_PER_PROC */ #endif #ifndef MAX_BUF_AIO #define MAX_BUF_AIO 16 #endif FEATURE(aio, "Asynchronous I/O"); SYSCTL_DECL(_p1003_1b); static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list"); static MALLOC_DEFINE(M_AIO, "aio", "structures for asynchronous I/O"); static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Async IO management"); static int enable_aio_unsafe = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, enable_unsafe, CTLFLAG_RW, &enable_aio_unsafe, 0, "Permit asynchronous IO on all file types, not just known-safe types"); static unsigned int unsafe_warningcnt = 1; SYSCTL_UINT(_vfs_aio, OID_AUTO, unsafe_warningcnt, CTLFLAG_RW, &unsafe_warningcnt, 0, "Warnings that will be triggered upon failed IO requests on unsafe files"); static int max_aio_procs = MAX_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, CTLFLAG_RW, &max_aio_procs, 0, "Maximum number of kernel processes to use for handling async IO "); static int num_aio_procs = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, CTLFLAG_RD, &num_aio_procs, 0, "Number of presently active kernel processes for async IO"); /* * The code will adjust the actual number of AIO processes towards this * number when it gets a chance. 
*/ static int target_aio_procs = TARGET_AIO_PROCS; SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs, 0, "Preferred number of ready kernel processes for async IO"); static int max_queue_count = MAX_AIO_QUEUE; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0, "Maximum number of aio requests to queue, globally"); static int num_queue_count = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0, "Number of queued aio requests"); static int num_buf_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0, "Number of aio requests presently handled by the buf subsystem"); static int num_unmapped_aio = 0; SYSCTL_INT(_vfs_aio, OID_AUTO, num_unmapped_aio, CTLFLAG_RD, &num_unmapped_aio, 0, "Number of aio requests presently handled by unmapped I/O buffers"); /* Number of async I/O processes in the process of being started */ /* XXX This should be local to aio_aqueue() */ static int num_aio_resv_start = 0; static int aiod_lifetime; SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0, "Maximum lifetime for idle aiod"); static int max_aio_per_proc = MAX_AIO_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc, 0, "Maximum active aio requests per process"); static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC; SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW, &max_aio_queue_per_proc, 0, "Maximum queued aio requests per process"); static int max_buf_aio = MAX_BUF_AIO; SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0, "Maximum buf aio requests per process"); /* * Though redundant with vfs.aio.max_aio_queue_per_proc, POSIX requires * sysconf(3) to support AIO_LISTIO_MAX, and we implement that with * vfs.aio.aio_listio_max. */ SYSCTL_INT(_p1003_1b, CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max, CTLFLAG_RD | CTLFLAG_CAPRD, &max_aio_queue_per_proc, 0, "Maximum aio requests for a single lio_listio call"); #ifdef COMPAT_FREEBSD6 typedef struct oaiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ volatile void *aio_buf; /* I/O buffer in process space */ size_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private _aiocb_private; } oaiocb_t; #endif /* * Below is a key of locks used to protect each member of struct kaiocb * aioliojob and kaioinfo and any backends. * * * - need not protected * a - locked by kaioinfo lock * b - locked by backend lock, the backend lock can be null in some cases, * for example, BIO belongs to this type, in this case, proc lock is * reused. * c - locked by aio_job_mtx, the lock for the generic file I/O backend. */ /* * If the routine that services an AIO request blocks while running in an * AIO kernel process it can starve other I/O requests. BIO requests * queued via aio_qbio() complete asynchronously and do not use AIO kernel * processes at all. Socket I/O requests use a separate pool of * kprocs and also force non-blocking I/O. Other file I/O requests * use the generic fo_read/fo_write operations which can block. The * fsync and mlock operations can also block while executing. Ideally * none of these requests would block while executing. 
* * Note that the service routines cannot toggle O_NONBLOCK in the file * structure directly while handling a request due to races with * userland threads. */ /* jobflags */ #define KAIOCB_QUEUEING 0x01 #define KAIOCB_CANCELLED 0x02 #define KAIOCB_CANCELLING 0x04 #define KAIOCB_CHECKSYNC 0x08 #define KAIOCB_CLEARED 0x10 #define KAIOCB_FINISHED 0x20 /* ioflags */ #define KAIOCB_IO_FOFFSET 0x01 /* * AIO process info */ #define AIOP_FREE 0x1 /* proc on free queue */ struct aioproc { int aioprocflags; /* (c) AIO proc flags */ TAILQ_ENTRY(aioproc) list; /* (c) list of processes */ struct proc *aioproc; /* (*) the AIO proc */ }; /* * data-structure for lio signal management */ struct aioliojob { int lioj_flags; /* (a) listio flags */ int lioj_count; /* (a) count of jobs */ int lioj_finished_count; /* (a) count of finished jobs */ struct sigevent lioj_signal; /* (a) signal on all I/O done */ TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */ struct knlist klist; /* (a) list of knotes */ ksiginfo_t lioj_ksi; /* (a) Realtime signal info */ }; #define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ #define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ #define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */ /* * per process aio data structure */ struct kaioinfo { struct mtx kaio_mtx; /* the lock to protect this struct */ int kaio_flags; /* (a) per process kaio flags */ int kaio_active_count; /* (c) number of currently used AIOs */ int kaio_count; /* (a) size of AIO queue */ int kaio_buffer_count; /* (a) number of bio buffers */ TAILQ_HEAD(,kaiocb) kaio_all; /* (a) all AIOs in a process */ TAILQ_HEAD(,kaiocb) kaio_done; /* (a) done queue for process */ TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */ TAILQ_HEAD(,kaiocb) kaio_jobqueue; /* (a) job queue for process */ TAILQ_HEAD(,kaiocb) kaio_syncqueue; /* (a) queue for aio_fsync */ TAILQ_HEAD(,kaiocb) kaio_syncready; /* (a) second q for aio_fsync */ struct task kaio_task; /* (*) task to kick aio processes */ struct task kaio_sync_task; /* (*) task to schedule fsync jobs */ }; #define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx) #define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx) #define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f)) #define AIO_MTX(ki) (&(ki)->kaio_mtx) #define KAIO_RUNDOWN 0x1 /* process is being run down */ #define KAIO_WAKEUP 0x2 /* wakeup process when AIO completes */ /* * Operations used to interact with userland aio control blocks. * Different ABIs provide their own operations. 
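 * (For example, the native ABI, the COMPAT_FREEBSD6 signal-event shim and,
 * when configured, the COMPAT_FREEBSD32 layer each provide their own
 * copyin/copyout and status/error store routines for their aiocb layouts.)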
*/ struct aiocb_ops { int (*aio_copyin)(struct aiocb *ujob, struct kaiocb *kjob, int ty); long (*fetch_status)(struct aiocb *ujob); long (*fetch_error)(struct aiocb *ujob); int (*store_status)(struct aiocb *ujob, long status); int (*store_error)(struct aiocb *ujob, long error); int (*store_kernelinfo)(struct aiocb *ujob, long jobref); int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob); }; static TAILQ_HEAD(,aioproc) aio_freeproc; /* (c) Idle daemons */ static struct sema aio_newproc_sem; static struct mtx aio_job_mtx; static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; static void aio_biocleanup(struct bio *bp); void aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct kaiocb *job); static void aio_process_rw(struct kaiocb *job); static void aio_process_sync(struct kaiocb *job); static void aio_process_mlock(struct kaiocb *job); static void aio_schedule_fsync(void *context, int pending); static int aio_newproc(int *); int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lio, int type, struct aiocb_ops *ops); static int aio_queue_file(struct file *fp, struct kaiocb *job); static void aio_biowakeup(struct bio *bp); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); static int aio_qbio(struct proc *p, struct kaiocb *job); static void aio_daemon(void *param); static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job); static bool aio_clear_cancel_function_locked(struct kaiocb *job); static int aio_kick(struct proc *userp); static void aio_kick_nowait(struct proc *userp); static void aio_kick_helper(void *context, int pending); static int filt_aioattach(struct knote *kn); static void filt_aiodetach(struct knote *kn); static int filt_aio(struct knote *kn, long hint); static int filt_lioattach(struct knote *kn); static void filt_liodetach(struct knote *kn); static int filt_lio(struct knote *kn, long hint); /* * Zones for: * kaio Per process async io info * aiocb async io jobs * aiolio list io jobs */ static uma_zone_t kaio_zone, aiocb_zone, aiolio_zone; /* kqueue filters for aio */ -static struct filterops aio_filtops = { +static const struct filterops aio_filtops = { .f_isfd = 0, .f_attach = filt_aioattach, .f_detach = filt_aiodetach, .f_event = filt_aio, }; -static struct filterops lio_filtops = { +static const struct filterops lio_filtops = { .f_isfd = 0, .f_attach = filt_lioattach, .f_detach = filt_liodetach, .f_event = filt_lio }; static eventhandler_tag exit_tag, exec_tag; TASKQUEUE_DEFINE_THREAD(aiod_kick); /* * Main operations function for use as a kernel module. 
*/ static int aio_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: aio_onceonly(); break; case MOD_SHUTDOWN: break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t aio_mod = { "aio", &aio_modload, NULL }; DECLARE_MODULE(aio, aio_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(aio, 1); /* * Startup initialization */ static int aio_onceonly(void) { exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL, EVENTHANDLER_PRI_ANY); exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL, EVENTHANDLER_PRI_ANY); kqueue_add_filteropts(EVFILT_AIO, &aio_filtops); kqueue_add_filteropts(EVFILT_LIO, &lio_filtops); TAILQ_INIT(&aio_freeproc); sema_init(&aio_newproc_sem, 0, "aio_new_proc"); mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF); TAILQ_INIT(&aio_jobs); aiod_unr = new_unrhdr(1, INT_MAX, NULL); kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); aiocb_zone = uma_zcreate("AIOCB", sizeof(struct kaiocb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); aiod_lifetime = AIOD_LIFETIME_DEFAULT; jobrefid = 1; p31b_setcfg(CTL_P1003_1B_ASYNCHRONOUS_IO, _POSIX_ASYNCHRONOUS_IO); p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE); p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0); return (0); } /* * Init the per-process aioinfo structure. The aioinfo limits are set * per-process for user limit (resource) management. */ void aio_init_aioinfo(struct proc *p) { struct kaioinfo *ki; ki = uma_zalloc(kaio_zone, M_WAITOK); mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF | MTX_NEW); ki->kaio_flags = 0; ki->kaio_active_count = 0; ki->kaio_count = 0; ki->kaio_buffer_count = 0; TAILQ_INIT(&ki->kaio_all); TAILQ_INIT(&ki->kaio_done); TAILQ_INIT(&ki->kaio_jobqueue); TAILQ_INIT(&ki->kaio_liojoblist); TAILQ_INIT(&ki->kaio_syncqueue); TAILQ_INIT(&ki->kaio_syncready); TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p); TASK_INIT(&ki->kaio_sync_task, 0, aio_schedule_fsync, ki); PROC_LOCK(p); if (p->p_aioinfo == NULL) { p->p_aioinfo = ki; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); } while (num_aio_procs < MIN(target_aio_procs, max_aio_procs)) aio_newproc(NULL); } static int aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi, bool ext) { struct thread *td; int error; error = sigev_findtd(p, sigev, &td); if (error) return (error); if (!KSI_ONQ(ksi)) { ksiginfo_set_sigev(ksi, sigev); ksi->ksi_code = SI_ASYNCIO; ksi->ksi_flags |= ext ? (KSI_EXT | KSI_INS) : 0; tdsendsignal(p, td, ksi->ksi_signo, ksi); } PROC_UNLOCK(p); return (error); } /* * Free a job entry. Wait for completion if it is currently active, but don't * delay forever. If we delay, we return a flag that says that we have to * restart the queue scan. 
*/ static int aio_free_entry(struct kaiocb *job) { struct kaioinfo *ki; struct aioliojob *lj; struct proc *p; p = job->userproc; MPASS(curproc == p); ki = p->p_aioinfo; MPASS(ki != NULL); AIO_LOCK_ASSERT(ki, MA_OWNED); MPASS(job->jobflags & KAIOCB_FINISHED); atomic_subtract_int(&num_queue_count, 1); ki->kaio_count--; MPASS(ki->kaio_count >= 0); TAILQ_REMOVE(&ki->kaio_done, job, plist); TAILQ_REMOVE(&ki->kaio_all, job, allist); lj = job->lio; if (lj) { lj->lioj_count--; lj->lioj_finished_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); /* lio is going away, we need to destroy any knotes */ knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } } /* job is going away, we need to destroy any knotes */ knlist_delete(&job->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&job->ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); /* * The thread argument here is used to find the owning process * and is also passed to fo_close() which may pass it to various * places such as devsw close() routines. Because of that, we * need a thread pointer from the process owning the job that is * persistent and won't disappear out from under us or move to * another process. * * Currently, all the callers of this function call it to remove * a kaiocb from the current process' job list either via a * syscall or due to the current process calling exit() or * execve(). Thus, we know that p == curproc. We also know that * curthread can't exit since we are curthread. * * Therefore, we use curthread as the thread to pass to * knlist_delete(). This does mean that it is possible for the * thread pointer at close time to differ from the thread pointer * at open time, but this is already true of file descriptors in * a multithreaded process. */ if (job->fd_file) fdrop(job->fd_file, curthread); crfree(job->cred); if (job->uiop != &job->uio) freeuio(job->uiop); uma_zfree(aiocb_zone, job); AIO_LOCK(ki); return (0); } static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { aio_proc_rundown(arg, p); } static int aio_cancel_job(struct proc *p, struct kaioinfo *ki, struct kaiocb *job) { aio_cancel_fn_t *func; int cancelled; AIO_LOCK_ASSERT(ki, MA_OWNED); if (job->jobflags & (KAIOCB_CANCELLED | KAIOCB_FINISHED)) return (0); MPASS((job->jobflags & KAIOCB_CANCELLING) == 0); job->jobflags |= KAIOCB_CANCELLED; func = job->cancel_fn; /* * If there is no cancel routine, just leave the job marked as * cancelled. The job should be in active use by a caller who * should complete it normally or when it fails to install a * cancel routine. */ if (func == NULL) return (0); /* * Set the CANCELLING flag so that aio_complete() will defer * completions of this job. This prevents the job from being * freed out from under the cancel callback. After the * callback any deferred completion (whether from the callback * or any other source) will be completed. */ job->jobflags |= KAIOCB_CANCELLING; AIO_UNLOCK(ki); func(job); AIO_LOCK(ki); job->jobflags &= ~KAIOCB_CANCELLING; if (job->jobflags & KAIOCB_FINISHED) { cancelled = job->uaiocb._aiocb_private.error == ECANCELED; TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(p, job); } else { /* * The cancel callback might have scheduled an * operation to cancel this request, but it is * only counted as cancelled if the request is * cancelled when the callback returns. */ cancelled = 0; } return (cancelled); } /* * Rundown the jobs for a given process. 
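 *
 * This runs when the owning process exits or execs: aio_onceonly()
 * registers aio_proc_rundown() for process_exit and
 * aio_proc_rundown_exec() above for process_exec.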
*/ static void aio_proc_rundown(void *arg, struct proc *p) { struct kaioinfo *ki; struct aioliojob *lj; struct kaiocb *job, *jobn; KASSERT(curthread->td_proc == p, ("%s: called on non-curproc", __func__)); ki = p->p_aioinfo; if (ki == NULL) return; AIO_LOCK(ki); ki->kaio_flags |= KAIO_RUNDOWN; restart: /* * Try to cancel all pending requests. This code simulates * aio_cancel on all pending I/O requests. */ TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { aio_cancel_job(p, ki, job); } /* Wait for all running I/O to be finished */ if (TAILQ_FIRST(&ki->kaio_jobqueue) || ki->kaio_active_count != 0) { ki->kaio_flags |= KAIO_WAKEUP; msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz); goto restart; } /* Free all completed I/O requests. */ while ((job = TAILQ_FIRST(&ki->kaio_done)) != NULL) aio_free_entry(job); while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) { if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); uma_zfree(aiolio_zone, lj); } else { panic("LIO job not cleaned up: C:%d, FC:%d\n", lj->lioj_count, lj->lioj_finished_count); } } AIO_UNLOCK(ki); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_task); taskqueue_drain(taskqueue_aiod_kick, &ki->kaio_sync_task); mtx_destroy(&ki->kaio_mtx); uma_zfree(kaio_zone, ki); p->p_aioinfo = NULL; } /* * Select a job to run (called by an AIO daemon). */ static struct kaiocb * aio_selectjob(struct aioproc *aiop) { struct kaiocb *job; struct kaioinfo *ki; struct proc *userp; mtx_assert(&aio_job_mtx, MA_OWNED); restart: TAILQ_FOREACH(job, &aio_jobs, list) { userp = job->userproc; ki = userp->p_aioinfo; if (ki->kaio_active_count < max_aio_per_proc) { TAILQ_REMOVE(&aio_jobs, job, list); if (!aio_clear_cancel_function(job)) goto restart; /* Account for currently active jobs. */ ki->kaio_active_count++; break; } } return (job); } /* * Move all data to a permanent storage device. This code * simulates the fsync and fdatasync syscalls. */ static int aio_fsync_vnode(struct thread *td, struct vnode *vp, int op) { struct mount *mp; int error; for (;;) { error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error != 0) break; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vnode_pager_clean_async(vp); if (op == LIO_DSYNC) error = VOP_FDATASYNC(vp, td); else error = VOP_FSYNC(vp, MNT_WAIT, td); VOP_UNLOCK(vp); vn_finished_write(mp); if (error != ERELOOKUP) break; } return (error); } /* * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that * does the I/O request for the non-bio version of the operations. The normal * vn operations are used, and this code should work in all instances for every * type of file, including pipes, sockets, fifos, and regular files. * * XXX I don't think it works well for socket, pipe, and fifo. 
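 *
 * The LIO_READV/LIO_WRITEV cases are reached through aio_readv() and
 * aio_writev(), where aio_iov/aio_iovcnt take the place of
 * aio_buf/aio_nbytes.  An illustrative userland fragment (hdr, data and
 * datalen are placeholders):
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = sizeof(hdr) },
 *		{ .iov_base = data, .iov_len = datalen },
 *	};
 *	cb.aio_iov = iov;
 *	cb.aio_iovcnt = 2;
 *	if (aio_writev(&cb) == -1)
 *		err(1, "aio_writev");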
*/ static void aio_process_rw(struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct file *fp; ssize_t cnt; long msgsnd_st, msgsnd_end; long msgrcv_st, msgrcv_end; long oublock_st, oublock_end; long inblock_st, inblock_end; int error, opcode; KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || job->uaiocb.aio_lio_opcode == LIO_READV || job->uaiocb.aio_lio_opcode == LIO_WRITE || job->uaiocb.aio_lio_opcode == LIO_WRITEV, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); td = curthread; td_savedcred = td->td_ucred; td->td_ucred = job->cred; job->uiop->uio_td = td; fp = job->fd_file; opcode = job->uaiocb.aio_lio_opcode; cnt = job->uiop->uio_resid; msgrcv_st = td->td_ru.ru_msgrcv; msgsnd_st = td->td_ru.ru_msgsnd; inblock_st = td->td_ru.ru_inblock; oublock_st = td->td_ru.ru_oublock; /* * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ if (opcode == LIO_READ || opcode == LIO_READV) { if (job->uiop->uio_resid == 0) error = 0; else error = fo_read(fp, job->uiop, fp->f_cred, (job->ioflags & KAIOCB_IO_FOFFSET) != 0 ? 0 : FOF_OFFSET, td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); error = fo_write(fp, job->uiop, fp->f_cred, (job->ioflags & KAIOCB_IO_FOFFSET) != 0 ? 0 : FOF_OFFSET, td); } msgrcv_end = td->td_ru.ru_msgrcv; msgsnd_end = td->td_ru.ru_msgsnd; inblock_end = td->td_ru.ru_inblock; oublock_end = td->td_ru.ru_oublock; job->msgrcv = msgrcv_end - msgrcv_st; job->msgsnd = msgsnd_end - msgsnd_st; job->inblock = inblock_end - inblock_st; job->outblock = oublock_end - oublock_st; if (error != 0 && job->uiop->uio_resid != cnt) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; if (error == EPIPE && (opcode & LIO_WRITE)) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } cnt -= job->uiop->uio_resid; td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, cnt, 0); } static void aio_process_sync(struct kaiocb *job) { struct thread *td = curthread; struct ucred *td_savedcred = td->td_ucred; struct file *fp = job->fd_file; int error = 0; KASSERT(job->uaiocb.aio_lio_opcode & LIO_SYNC, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); td->td_ucred = job->cred; if (fp->f_vnode != NULL) { error = aio_fsync_vnode(td, fp->f_vnode, job->uaiocb.aio_lio_opcode); } td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); else aio_complete(job, 0, 0); } static void aio_process_mlock(struct kaiocb *job) { struct aiocb *cb = &job->uaiocb; int error; KASSERT(job->uaiocb.aio_lio_opcode == LIO_MLOCK, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); error = kern_mlock(job->userproc, job->cred, __DEVOLATILE(uintptr_t, cb->aio_buf), cb->aio_nbytes); aio_complete(job, error != 0 ? 
-1 : 0, error); } static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job) { struct aioliojob *lj; struct kaioinfo *ki; struct kaiocb *sjob, *sjobn; int lj_done; bool schedule_fsync; ki = userp->p_aioinfo; AIO_LOCK_ASSERT(ki, MA_OWNED); lj = job->lio; lj_done = 0; if (lj) { lj->lioj_finished_count++; if (lj->lioj_count == lj->lioj_finished_count) lj_done = 1; } TAILQ_INSERT_TAIL(&ki->kaio_done, job, plist); MPASS(job->jobflags & KAIOCB_FINISHED); if (ki->kaio_flags & KAIO_RUNDOWN) goto notification_done; if (job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) aio_sendsig(userp, &job->uaiocb.aio_sigevent, &job->ksi, true); KNOTE_LOCKED(&job->klist, 1); if (lj_done) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL | LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi, true); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } notification_done: if (job->jobflags & KAIOCB_CHECKSYNC) { schedule_fsync = false; TAILQ_FOREACH_SAFE(sjob, &ki->kaio_syncqueue, list, sjobn) { if (job->fd_file != sjob->fd_file || job->seqno >= sjob->seqno) continue; if (--sjob->pending > 0) continue; TAILQ_REMOVE(&ki->kaio_syncqueue, sjob, list); if (!aio_clear_cancel_function_locked(sjob)) continue; TAILQ_INSERT_TAIL(&ki->kaio_syncready, sjob, list); schedule_fsync = true; } if (schedule_fsync) taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_sync_task); } if (ki->kaio_flags & KAIO_WAKEUP) { ki->kaio_flags &= ~KAIO_WAKEUP; wakeup(&userp->p_aioinfo); } } static void aio_schedule_fsync(void *context, int pending) { struct kaioinfo *ki; struct kaiocb *job; ki = context; AIO_LOCK(ki); while (!TAILQ_EMPTY(&ki->kaio_syncready)) { job = TAILQ_FIRST(&ki->kaio_syncready); TAILQ_REMOVE(&ki->kaio_syncready, job, list); AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); AIO_LOCK(ki); } AIO_UNLOCK(ki); } bool aio_cancel_cleared(struct kaiocb *job) { /* * The caller should hold the same queue lock held when * aio_clear_cancel_function() was called and set this flag * ensuring this check sees an up-to-date value. However, * there is no way to assert that. 
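 *
 * A sketch of the pattern the backends in this file follow, mirroring
 * aio_schedule() and aio_cancel_daemon_job() below (backend_lock,
 * backend_queue and backend_cancel are illustrative names):
 *
 *	mtx_lock(&backend_lock);
 *	if (!aio_set_cancel_function(job, backend_cancel)) {
 *		mtx_unlock(&backend_lock);
 *		aio_cancel(job);
 *		return;
 *	}
 *	TAILQ_INSERT_TAIL(&backend_queue, job, list);
 *	mtx_unlock(&backend_lock);
 *
 * and, in backend_cancel():
 *
 *	mtx_lock(&backend_lock);
 *	if (!aio_cancel_cleared(job))
 *		TAILQ_REMOVE(&backend_queue, job, list);
 *	mtx_unlock(&backend_lock);
 *	aio_cancel(job);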
*/ return ((job->jobflags & KAIOCB_CLEARED) != 0); } static bool aio_clear_cancel_function_locked(struct kaiocb *job) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); MPASS(job->cancel_fn != NULL); if (job->jobflags & KAIOCB_CANCELLING) { job->jobflags |= KAIOCB_CLEARED; return (false); } job->cancel_fn = NULL; return (true); } bool aio_clear_cancel_function(struct kaiocb *job) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_clear_cancel_function_locked(job); AIO_UNLOCK(ki); return (ret); } static bool aio_set_cancel_function_locked(struct kaiocb *job, aio_cancel_fn_t *func) { AIO_LOCK_ASSERT(job->userproc->p_aioinfo, MA_OWNED); if (job->jobflags & KAIOCB_CANCELLED) return (false); job->cancel_fn = func; return (true); } bool aio_set_cancel_function(struct kaiocb *job, aio_cancel_fn_t *func) { struct kaioinfo *ki; bool ret; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ret = aio_set_cancel_function_locked(job, func); AIO_UNLOCK(ki); return (ret); } void aio_complete(struct kaiocb *job, long status, int error) { struct kaioinfo *ki; struct proc *userp; job->uaiocb._aiocb_private.error = error; job->uaiocb._aiocb_private.status = status; userp = job->userproc; ki = userp->p_aioinfo; AIO_LOCK(ki); KASSERT(!(job->jobflags & KAIOCB_FINISHED), ("duplicate aio_complete")); job->jobflags |= KAIOCB_FINISHED; if ((job->jobflags & (KAIOCB_QUEUEING | KAIOCB_CANCELLING)) == 0) { TAILQ_REMOVE(&ki->kaio_jobqueue, job, plist); aio_bio_done_notify(userp, job); } AIO_UNLOCK(ki); } void aio_cancel(struct kaiocb *job) { aio_complete(job, -1, ECANCELED); } void aio_switch_vmspace(struct kaiocb *job) { vmspace_switch_aio(job->userproc->p_vmspace); } /* * The AIO daemon, most of the actual work is done in aio_process_*, * but the setup (and address space mgmt) is done in this routine. */ static void aio_daemon(void *_id) { struct kaiocb *job; struct aioproc *aiop; struct kaioinfo *ki; struct proc *p; struct vmspace *myvm; struct thread *td = curthread; int id = (intptr_t)_id; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. */ p = td->td_proc; myvm = vmspace_acquire_ref(p); KASSERT(p->p_textvp == NULL, ("kthread has a textvp")); /* * Allocate and ready the aio control info. There is one aiop structure * per daemon. */ aiop = malloc(sizeof(*aiop), M_AIO, M_WAITOK); aiop->aioproc = p; aiop->aioprocflags = 0; /* * Wakeup parent process. (Parent sleeps to keep from blasting away * and creating too many daemons.) */ sema_post(&aio_newproc_sem); mtx_lock(&aio_job_mtx); for (;;) { /* * Take daemon off of free queue */ if (aiop->aioprocflags & AIOP_FREE) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; } /* * Check for jobs. */ while ((job = aio_selectjob(aiop)) != NULL) { mtx_unlock(&aio_job_mtx); ki = job->userproc->p_aioinfo; job->handle_fn(job); mtx_lock(&aio_job_mtx); /* Decrement the active job count. */ ki->kaio_active_count--; } /* * Disconnect from user address space. */ if (p->p_vmspace != myvm) { mtx_unlock(&aio_job_mtx); vmspace_switch_aio(myvm); mtx_lock(&aio_job_mtx); /* * We have to restart to avoid race, we only sleep if * no job can be selected. */ continue; } mtx_assert(&aio_job_mtx, MA_OWNED); TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); aiop->aioprocflags |= AIOP_FREE; /* * If daemon is inactive for a long time, allow it to exit, * thereby freeing resources. 
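 *
 * Note that aiod_lifetime is expressed in scheduler ticks: it is passed
 * straight to msleep() as the timeout below and is seeded with
 * AIOD_LIFETIME_DEFAULT in aio_onceonly(), so the vfs.aio.aiod_lifetime
 * sysctl is tick-valued as well.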
*/ if (msleep(p, &aio_job_mtx, PRIBIO, "aiordy", aiod_lifetime) == EWOULDBLOCK && TAILQ_EMPTY(&aio_jobs) && (aiop->aioprocflags & AIOP_FREE) && num_aio_procs > target_aio_procs) break; } TAILQ_REMOVE(&aio_freeproc, aiop, list); num_aio_procs--; mtx_unlock(&aio_job_mtx); free(aiop, M_AIO); free_unr(aiod_unr, id); vmspace_free(myvm); KASSERT(p->p_vmspace == myvm, ("AIOD: bad vmspace for exiting daemon")); KASSERT(refcount_load(&myvm->vm_refcnt) > 1, ("AIOD: bad vm refcnt for exiting daemon: %d", refcount_load(&myvm->vm_refcnt))); kproc_exit(0); } /* * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The * AIO daemon modifies its environment itself. */ static int aio_newproc(int *start) { int error; struct proc *p; int id; id = alloc_unr(aiod_unr); error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p, RFNOWAIT, 0, "aiod%d", id); if (error == 0) { /* * Wait until daemon is started. */ sema_wait(&aio_newproc_sem); mtx_lock(&aio_job_mtx); num_aio_procs++; if (start != NULL) (*start)--; mtx_unlock(&aio_job_mtx); } else { free_unr(aiod_unr, id); } return (error); } /* * Try the high-performance, low-overhead bio method for eligible * VCHR devices. This method doesn't use an aio helper thread, and * thus has very low overhead. * * Assumes that the caller, aio_aqueue(), has incremented the file * structure's reference count, preventing its deallocation for the * duration of this call. */ static int aio_qbio(struct proc *p, struct kaiocb *job) { struct aiocb *cb; struct file *fp; struct buf *pbuf; struct vnode *vp; struct cdevsw *csw; struct cdev *dev; struct kaioinfo *ki; struct bio **bios = NULL; off_t offset; int bio_cmd, error, i, iovcnt, opcode, poff, ref; vm_prot_t prot; bool use_unmapped; cb = &job->uaiocb; fp = job->fd_file; opcode = cb->aio_lio_opcode; if (!(opcode == LIO_WRITE || opcode == LIO_WRITEV || opcode == LIO_READ || opcode == LIO_READV)) return (-1); if (fp == NULL || fp->f_type != DTYPE_VNODE) return (-1); vp = fp->f_vnode; if (vp->v_type != VCHR) return (-1); if (vp->v_bufobj.bo_bsize == 0) return (-1); bio_cmd = (opcode & LIO_WRITE) ? 
BIO_WRITE : BIO_READ; iovcnt = job->uiop->uio_iovcnt; if (iovcnt > max_buf_aio) return (-1); for (i = 0; i < iovcnt; i++) { if (job->uiop->uio_iov[i].iov_len % vp->v_bufobj.bo_bsize != 0) return (-1); if (job->uiop->uio_iov[i].iov_len > maxphys) { error = -1; return (-1); } } offset = cb->aio_offset; ref = 0; csw = devvn_refthread(vp, &dev, &ref); if (csw == NULL) return (ENXIO); if ((csw->d_flags & D_DISK) == 0) { error = -1; goto unref; } if (job->uiop->uio_resid > dev->si_iosize_max) { error = -1; goto unref; } ki = p->p_aioinfo; job->error = 0; use_unmapped = (dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed; if (!use_unmapped) { AIO_LOCK(ki); if (ki->kaio_buffer_count + iovcnt > max_buf_aio) { AIO_UNLOCK(ki); error = EAGAIN; goto unref; } ki->kaio_buffer_count += iovcnt; AIO_UNLOCK(ki); } bios = malloc(sizeof(struct bio *) * iovcnt, M_TEMP, M_WAITOK); refcount_init(&job->nbio, iovcnt); for (i = 0; i < iovcnt; i++) { struct vm_page** pages; struct bio *bp; void *buf; size_t nbytes; int npages; buf = job->uiop->uio_iov[i].iov_base; nbytes = job->uiop->uio_iov[i].iov_len; bios[i] = g_alloc_bio(); bp = bios[i]; poff = (vm_offset_t)buf & PAGE_MASK; if (use_unmapped) { pbuf = NULL; pages = malloc(sizeof(vm_page_t) * (atop(round_page( nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO); } else { pbuf = uma_zalloc(pbuf_zone, M_WAITOK); BUF_KERNPROC(pbuf); pages = pbuf->b_pages; } bp->bio_length = nbytes; bp->bio_bcount = nbytes; bp->bio_done = aio_biowakeup; bp->bio_offset = offset; bp->bio_cmd = bio_cmd; bp->bio_dev = dev; bp->bio_caller1 = job; bp->bio_caller2 = pbuf; prot = VM_PROT_READ; if (opcode == LIO_READ || opcode == LIO_READV) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)buf, bp->bio_length, prot, pages, atop(maxphys) + 1); if (npages < 0) { if (pbuf != NULL) uma_zfree(pbuf_zone, pbuf); else free(pages, M_TEMP); error = EFAULT; g_destroy_bio(bp); i--; goto destroy_bios; } if (pbuf != NULL) { pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages); bp->bio_data = pbuf->b_data + poff; pbuf->b_npages = npages; atomic_add_int(&num_buf_aio, 1); } else { bp->bio_ma = pages; bp->bio_ma_n = npages; bp->bio_ma_offset = poff; bp->bio_data = unmapped_buf; bp->bio_flags |= BIO_UNMAPPED; atomic_add_int(&num_unmapped_aio, 1); } offset += nbytes; } /* Perform transfer. */ for (i = 0; i < iovcnt; i++) csw->d_strategy(bios[i]); free(bios, M_TEMP); dev_relthread(dev, ref); return (0); destroy_bios: for (; i >= 0; i--) aio_biocleanup(bios[i]); free(bios, M_TEMP); unref: dev_relthread(dev, ref); return (error); } #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. 
*/ nsig->sigev_notify = osig->sigev_notify; switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr; break; default: return (EINVAL); } return (0); } static int aiocb_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob, int type __unused) { struct oaiocb *ojob; struct aiocb *kcb = &kjob->uaiocb; int error; bzero(kcb, sizeof(struct aiocb)); error = copyin(ujob, kcb, sizeof(struct oaiocb)); if (error) return (error); /* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */ ojob = (struct oaiocb *)kcb; return (convert_old_sigevent(&ojob->aio_sigevent, &kcb->aio_sigevent)); } #endif static int aiocb_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type) { struct aiocb *kcb = &kjob->uaiocb; int error; error = copyin(ujob, kcb, sizeof(struct aiocb)); if (error) return (error); if (type == LIO_NOP) type = kcb->aio_lio_opcode; if (type & LIO_VECTORED) { /* malloc a uio and copy in the iovec */ error = copyinuio(__DEVOLATILE(struct iovec*, kcb->aio_iov), kcb->aio_iovcnt, &kjob->uiop); } return (error); } static long aiocb_fetch_status(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.status)); } static long aiocb_fetch_error(struct aiocb *ujob) { return (fuword(&ujob->_aiocb_private.error)); } static int aiocb_store_status(struct aiocb *ujob, long status) { return (suword(&ujob->_aiocb_private.status, status)); } static int aiocb_store_error(struct aiocb *ujob, long error) { return (suword(&ujob->_aiocb_private.error, error)); } static int aiocb_store_kernelinfo(struct aiocb *ujob, long jobref) { return (suword(&ujob->_aiocb_private.kernelinfo, jobref)); } static int aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword(ujobp, (long)ujob)); } static struct aiocb_ops aiocb_ops = { .aio_copyin = aiocb_copyin, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb_ops_osigevent = { .aio_copyin = aiocb_copyin_old_sigevent, .fetch_status = aiocb_fetch_status, .fetch_error = aiocb_fetch_error, .store_status = aiocb_store_status, .store_error = aiocb_store_error, .store_kernelinfo = aiocb_store_kernelinfo, .store_aiocb = aiocb_store_aiocb, }; #endif /* * Queue a new AIO request. Choosing either the threaded or direct bio VCHR * technique is done in this code. 
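 *
 * Completion can also be collected through kqueue(2); an illustrative
 * userland pairing for the SIGEV_KEVENT path registered below (error
 * handling omitted, cb is the caller's aiocb, <sys/event.h> assumed):
 *
 *	int kq = kqueue();
 *	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
 *	cb.aio_sigevent.sigev_notify_kqueue = kq;
 *	cb.aio_sigevent.sigev_value.sival_ptr = &cb;
 *	aio_write(&cb);
 *
 *	struct kevent ev;
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 *	ssize_t done = aio_return((struct aiocb *)ev.ident);
 *
 * where ev.ident is the userland aiocb pointer registered below and
 * kn_data of the EVFILT_AIO knote carries the request's error value.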
*/ int aio_aqueue(struct thread *td, struct aiocb *ujob, struct aioliojob *lj, int type, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct file *fp = NULL; struct kaiocb *job; struct kaioinfo *ki; struct kevent kev; int opcode; int error; int fd, kqfd; int jid; u_short evflags; if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; ops->store_status(ujob, -1); ops->store_error(ujob, 0); ops->store_kernelinfo(ujob, -1); if (num_queue_count >= max_queue_count || ki->kaio_count >= max_aio_queue_per_proc) { error = EAGAIN; goto err1; } job = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO); knlist_init_mtx(&job->klist, AIO_MTX(ki)); error = ops->aio_copyin(ujob, job, type); if (error) goto err2; if (job->uaiocb.aio_nbytes > IOSIZE_MAX) { error = EINVAL; goto err2; } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID && job->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) { error = EINVAL; goto err2; } if ((job->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL || job->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(job->uaiocb.aio_sigevent.sigev_signo)) { error = EINVAL; goto err2; } /* Get the opcode. */ if (type == LIO_NOP) { switch (job->uaiocb.aio_lio_opcode & ~LIO_FOFFSET) { case LIO_WRITE: case LIO_WRITEV: case LIO_NOP: case LIO_READ: case LIO_READV: opcode = job->uaiocb.aio_lio_opcode & ~LIO_FOFFSET; if ((job->uaiocb.aio_lio_opcode & LIO_FOFFSET) != 0) job->ioflags |= KAIOCB_IO_FOFFSET; break; default: error = EINVAL; goto err2; } } else opcode = job->uaiocb.aio_lio_opcode = type; ksiginfo_init(&job->ksi); /* Save userspace address of the job info. */ job->ujob = ujob; /* * Validate the opcode and fetch the file object for the specified * file descriptor. * * XXXRW: Moved the opcode validation up here so that we don't * retrieve a file descriptor without knowing what the capabiltity * should be. 
*/ fd = job->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: case LIO_WRITEV: error = fget_write(td, fd, &cap_pwrite_rights, &fp); break; case LIO_READ: case LIO_READV: error = fget_read(td, fd, &cap_pread_rights, &fp); break; case LIO_SYNC: case LIO_DSYNC: error = fget(td, fd, &cap_fsync_rights, &fp); break; case LIO_MLOCK: break; case LIO_NOP: error = fget(td, fd, &cap_no_rights, &fp); break; default: error = EINVAL; } if (error) goto err3; if ((opcode & LIO_SYNC) && fp->f_vnode == NULL) { error = EINVAL; goto err3; } if ((opcode == LIO_READ || opcode == LIO_READV || opcode == LIO_WRITE || opcode == LIO_WRITEV) && job->uaiocb.aio_offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) { error = EINVAL; goto err3; } if (fp != NULL && fp->f_ops == &path_fileops) { error = EBADF; goto err3; } job->fd_file = fp; mtx_lock(&aio_job_mtx); jid = jobrefid++; job->seqno = jobseqno++; mtx_unlock(&aio_job_mtx); error = ops->store_kernelinfo(ujob, jid); if (error) { error = EINVAL; goto err3; } job->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid; if (opcode == LIO_NOP) { fdrop(fp, td); MPASS(job->uiop == &job->uio || job->uiop == NULL); uma_zfree(aiocb_zone, job); return (0); } if (job->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT) goto no_kqueue; evflags = job->uaiocb.aio_sigevent.sigev_notify_kevent_flags; if ((evflags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0) { error = EINVAL; goto err3; } kqfd = job->uaiocb.aio_sigevent.sigev_notify_kqueue; memset(&kev, 0, sizeof(kev)); kev.ident = (uintptr_t)job->ujob; kev.filter = EVFILT_AIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | evflags; kev.data = (intptr_t)job; kev.udata = job->uaiocb.aio_sigevent.sigev_value.sival_ptr; error = kqfd_register(kqfd, &kev, td, M_WAITOK); if (error) goto err3; no_kqueue: ops->store_error(ujob, EINPROGRESS); job->uaiocb._aiocb_private.error = EINPROGRESS; job->userproc = p; job->cred = crhold(td->td_ucred); job->jobflags = KAIOCB_QUEUEING; job->lio = lj; if (opcode & LIO_VECTORED) { /* Use the uio copied in by aio_copyin */ MPASS(job->uiop != &job->uio && job->uiop != NULL); } else { /* Setup the inline uio */ job->iov[0].iov_base = (void *)(uintptr_t)job->uaiocb.aio_buf; job->iov[0].iov_len = job->uaiocb.aio_nbytes; job->uio.uio_iov = job->iov; job->uio.uio_iovcnt = 1; job->uio.uio_resid = job->uaiocb.aio_nbytes; job->uio.uio_segflg = UIO_USERSPACE; job->uiop = &job->uio; } switch (opcode & (LIO_READ | LIO_WRITE)) { case LIO_READ: job->uiop->uio_rw = UIO_READ; break; case LIO_WRITE: job->uiop->uio_rw = UIO_WRITE; break; } job->uiop->uio_offset = job->uaiocb.aio_offset; job->uiop->uio_td = td; if (opcode == LIO_MLOCK) { aio_schedule(job, aio_process_mlock); error = 0; } else if (fp->f_ops->fo_aio_queue == NULL) error = aio_queue_file(fp, job); else error = fo_aio_queue(fp, job); if (error) goto err4; AIO_LOCK(ki); job->jobflags &= ~KAIOCB_QUEUEING; TAILQ_INSERT_TAIL(&ki->kaio_all, job, allist); ki->kaio_count++; if (lj) lj->lioj_count++; atomic_add_int(&num_queue_count, 1); if (job->jobflags & KAIOCB_FINISHED) { /* * The queue callback completed the request synchronously. * The bulk of the completion is deferred in that case * until this point. 
*/ aio_bio_done_notify(p, job); } else TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, job, plist); AIO_UNLOCK(ki); return (0); err4: crfree(job->cred); err3: if (fp) fdrop(fp, td); knlist_delete(&job->klist, curthread, 0); err2: if (job->uiop != &job->uio) freeuio(job->uiop); uma_zfree(aiocb_zone, job); err1: ops->store_error(ujob, error); return (error); } static void aio_cancel_daemon_job(struct kaiocb *job) { mtx_lock(&aio_job_mtx); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&aio_jobs, job, list); mtx_unlock(&aio_job_mtx); aio_cancel(job); } void aio_schedule(struct kaiocb *job, aio_handle_fn_t *func) { mtx_lock(&aio_job_mtx); if (!aio_set_cancel_function(job, aio_cancel_daemon_job)) { mtx_unlock(&aio_job_mtx); aio_cancel(job); return; } job->handle_fn = func; TAILQ_INSERT_TAIL(&aio_jobs, job, list); aio_kick_nowait(job->userproc); mtx_unlock(&aio_job_mtx); } static void aio_cancel_sync(struct kaiocb *job) { struct kaioinfo *ki; ki = job->userproc->p_aioinfo; AIO_LOCK(ki); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); aio_cancel(job); } int aio_queue_file(struct file *fp, struct kaiocb *job) { struct kaioinfo *ki; struct kaiocb *job2; struct vnode *vp; struct mount *mp; int error; bool safe; ki = job->userproc->p_aioinfo; error = aio_qbio(job->userproc, job); if (error >= 0) return (error); safe = false; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vp->v_type == VREG || vp->v_type == VDIR) { mp = fp->f_vnode->v_mount; if (mp == NULL || (mp->mnt_flag & MNT_LOCAL) != 0) safe = true; } } if (!(safe || enable_aio_unsafe)) { counted_warning(&unsafe_warningcnt, "is attempting to use unsafe AIO requests"); return (EOPNOTSUPP); } if (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) { aio_schedule(job, aio_process_rw); error = 0; } else if (job->uaiocb.aio_lio_opcode & LIO_SYNC) { AIO_LOCK(ki); TAILQ_FOREACH(job2, &ki->kaio_jobqueue, plist) { if (job2->fd_file == job->fd_file && ((job2->uaiocb.aio_lio_opcode & LIO_SYNC) == 0) && job2->seqno < job->seqno) { job2->jobflags |= KAIOCB_CHECKSYNC; job->pending++; } } if (job->pending != 0) { if (!aio_set_cancel_function_locked(job, aio_cancel_sync)) { AIO_UNLOCK(ki); aio_cancel(job); return (0); } TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, job, list); AIO_UNLOCK(ki); return (0); } AIO_UNLOCK(ki); aio_schedule(job, aio_process_sync); error = 0; } else { error = EINVAL; } return (error); } static void aio_kick_nowait(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; mtx_assert(&aio_job_mtx, MA_OWNED); if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { taskqueue_enqueue(taskqueue_aiod_kick, &ki->kaio_task); } } static int aio_kick(struct proc *userp) { struct kaioinfo *ki = userp->p_aioinfo; struct aioproc *aiop; int error, ret = 0; mtx_assert(&aio_job_mtx, MA_OWNED); retryproc: if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) { TAILQ_REMOVE(&aio_freeproc, aiop, list); aiop->aioprocflags &= ~AIOP_FREE; wakeup(aiop->aioproc); } else if (num_aio_resv_start + num_aio_procs < max_aio_procs && ki->kaio_active_count + num_aio_resv_start < max_aio_per_proc) { num_aio_resv_start++; mtx_unlock(&aio_job_mtx); error = aio_newproc(&num_aio_resv_start); mtx_lock(&aio_job_mtx); if (error) { num_aio_resv_start--; goto retryproc; } } else { ret = -1; } return (ret); 
} static void aio_kick_helper(void *context, int pending) { struct proc *userp = context; mtx_lock(&aio_job_mtx); while (--pending >= 0) { if (aio_kick(userp)) break; } mtx_unlock(&aio_job_mtx); } /* * Support the aio_return system call, as a side-effect, kernel resources are * released. */ static int kern_aio_return(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; long status, error; ki = p->p_aioinfo; if (ki == NULL) return (EINVAL); AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_done, plist) { if (job->ujob == ujob) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_error(ujob, error); ops->store_status(ujob, status); } else { error = EINVAL; AIO_UNLOCK(ki); } return (error); } int sys_aio_return(struct thread *td, struct aio_return_args *uap) { return (kern_aio_return(td, uap->aiocbp, &aiocb_ops)); } /* * Allow a process to wakeup when any of the I/O requests are completed. */ static int kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist, struct timespec *ts) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *firstjob, *job; int error, i, timo; timo = 0; if (ts) { if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } ki = p->p_aioinfo; if (ki == NULL) return (EAGAIN); if (njoblist == 0) return (0); AIO_LOCK(ki); for (;;) { firstjob = NULL; error = 0; TAILQ_FOREACH(job, &ki->kaio_all, allist) { for (i = 0; i < njoblist; i++) { if (job->ujob == ujoblist[i]) { if (firstjob == NULL) firstjob = job; if (job->jobflags & KAIOCB_FINISHED) goto RETURN; } } } /* All tasks were finished. */ if (firstjob == NULL) break; ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", timo); if (error == ERESTART) error = EINTR; if (error) break; } RETURN: AIO_UNLOCK(ki); return (error); } int sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap) { struct timespec ts, *tsp; struct aiocb **ujoblist; int error; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0) return (error); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIO, M_WAITOK); error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0])); if (error == 0) error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); free(ujoblist, M_AIO); return (error); } /* * aio_cancel cancels any non-bio aio operations not currently in progress. */ int sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap) { struct proc *p = td->td_proc; struct kaioinfo *ki; struct kaiocb *job, *jobn; struct file *fp; int error; int cancelled = 0; int notcancelled = 0; struct vnode *vp; /* Lookup file object. 
*/ error = fget(td, uap->fd, &cap_no_rights, &fp); if (error) return (error); ki = p->p_aioinfo; if (ki == NULL) goto done; if (fp->f_type == DTYPE_VNODE) { vp = fp->f_vnode; if (vn_isdisk(vp)) { fdrop(fp, td); td->td_retval[0] = AIO_NOTCANCELED; return (0); } } AIO_LOCK(ki); TAILQ_FOREACH_SAFE(job, &ki->kaio_jobqueue, plist, jobn) { if ((uap->fd == job->uaiocb.aio_fildes) && ((uap->aiocbp == NULL) || (uap->aiocbp == job->ujob))) { if (aio_cancel_job(p, ki, job)) { cancelled++; } else { notcancelled++; } if (uap->aiocbp != NULL) break; } } AIO_UNLOCK(ki); done: fdrop(fp, td); if (uap->aiocbp != NULL) { if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } } if (notcancelled) { td->td_retval[0] = AIO_NOTCANCELED; return (0); } if (cancelled) { td->td_retval[0] = AIO_CANCELED; return (0); } td->td_retval[0] = AIO_ALLDONE; return (0); } /* * aio_error is implemented in the kernel level for compatibility purposes * only. For a user mode async implementation, it would be best to do it in * a userland subroutine. */ static int kern_aio_error(struct thread *td, struct aiocb *ujob, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct kaiocb *job; struct kaioinfo *ki; int status; ki = p->p_aioinfo; if (ki == NULL) { td->td_retval[0] = EINVAL; return (0); } AIO_LOCK(ki); TAILQ_FOREACH(job, &ki->kaio_all, allist) { if (job->ujob == ujob) { if (job->jobflags & KAIOCB_FINISHED) td->td_retval[0] = job->uaiocb._aiocb_private.error; else td->td_retval[0] = EINPROGRESS; AIO_UNLOCK(ki); return (0); } } AIO_UNLOCK(ki); /* * Hack for failure of aio_aqueue. */ status = ops->fetch_status(ujob); if (status == -1) { td->td_retval[0] = ops->fetch_error(ujob); return (0); } td->td_retval[0] = EINVAL; return (0); } int sys_aio_error(struct thread *td, struct aio_error_args *uap) { return (kern_aio_error(td, uap->aiocbp, &aiocb_ops)); } /* syscall - asynchronous read from a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_read(struct thread *td, struct freebsd6_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb_ops_osigevent)); } #endif int sys_aio_read(struct thread *td, struct aio_read_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); } int sys_aio_readv(struct thread *td, struct aio_readv_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READV, &aiocb_ops)); } /* syscall - asynchronous write to a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_aio_write(struct thread *td, struct freebsd6_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops_osigevent)); } #endif int sys_aio_write(struct thread *td, struct aio_write_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); } int sys_aio_writev(struct thread *td, struct aio_writev_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITEV, &aiocb_ops)); } int sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) { return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops)); } static int kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list, struct aiocb **acb_list, int nent, struct sigevent *sig, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct aiocb *job; struct kaioinfo *ki; struct aioliojob *lj; struct kevent kev; int error; int nagain, nerror; int i; if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT)) return (EINVAL); if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (p->p_aioinfo == NULL) 
aio_init_aioinfo(p); ki = p->p_aioinfo; lj = uma_zalloc(aiolio_zone, M_WAITOK); lj->lioj_flags = 0; lj->lioj_count = 0; lj->lioj_finished_count = 0; lj->lioj_signal.sigev_notify = SIGEV_NONE; knlist_init_mtx(&lj->klist, AIO_MTX(ki)); ksiginfo_init(&lj->lioj_ksi); /* * Setup signal. */ if (sig && (mode == LIO_NOWAIT)) { bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal)); if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { /* Assume only new style KEVENT */ memset(&kev, 0, sizeof(kev)); kev.filter = EVFILT_LIO; kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1; kev.ident = (uintptr_t)uacb_list; /* something unique */ kev.data = (intptr_t)lj; /* pass user defined sigval data */ kev.udata = lj->lioj_signal.sigev_value.sival_ptr; error = kqfd_register( lj->lioj_signal.sigev_notify_kqueue, &kev, td, M_WAITOK); if (error) { uma_zfree(aiolio_zone, lj); return (error); } } else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) { ; } else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) { if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) { uma_zfree(aiolio_zone, lj); return EINVAL; } lj->lioj_flags |= LIOJ_SIGNAL; } else { uma_zfree(aiolio_zone, lj); return EINVAL; } } AIO_LOCK(ki); TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); /* * Add extra aiocb count to avoid the lio to be freed * by other threads doing aio_waitcomplete or aio_return, * and prevent event from being sent until we have queued * all tasks. */ lj->lioj_count = 1; AIO_UNLOCK(ki); /* * Get pointers to the list of I/O requests. */ nagain = 0; nerror = 0; for (i = 0; i < nent; i++) { job = acb_list[i]; if (job != NULL) { error = aio_aqueue(td, job, lj, LIO_NOP, ops); if (error == EAGAIN) nagain++; else if (error != 0) nerror++; } } error = 0; AIO_LOCK(ki); if (mode == LIO_WAIT) { while (lj->lioj_count - 1 != lj->lioj_finished_count) { ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiospn", 0); if (error == ERESTART) error = EINTR; if (error) break; } } else { if (lj->lioj_count - 1 == lj->lioj_finished_count) { if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) { lj->lioj_flags |= LIOJ_KEVENT_POSTED; KNOTE_LOCKED(&lj->klist, 1); } if ((lj->lioj_flags & (LIOJ_SIGNAL | LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL || lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) { aio_sendsig(p, &lj->lioj_signal, &lj->lioj_ksi, lj->lioj_count != 1); lj->lioj_flags |= LIOJ_SIGNAL_POSTED; } } } lj->lioj_count--; if (lj->lioj_count == 0) { TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); knlist_delete(&lj->klist, curthread, 1); PROC_LOCK(p); sigqueue_take(&lj->lioj_ksi); PROC_UNLOCK(p); AIO_UNLOCK(ki); uma_zfree(aiolio_zone, lj); } else AIO_UNLOCK(ki); if (nerror) return (EIO); else if (nagain) return (EAGAIN); else return (error); } /* syscall - list directed I/O (REALTIME) */ #ifdef COMPAT_FREEBSD6 int freebsd6_lio_listio(struct thread *td, struct freebsd6_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent osig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = 
copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif /* syscall - list directed I/O (REALTIME) */ int sys_lio_listio(struct thread *td, struct lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; int error, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig, sizeof(sig)); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0])); if (error == 0) error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list, nent, sigp, &aiocb_ops); free(acb_list, M_LIO); return (error); } static void aio_biocleanup(struct bio *bp) { struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; struct kaioinfo *ki; struct buf *pbuf = (struct buf *)bp->bio_caller2; /* Release mapping into kernel space. */ if (pbuf != NULL) { MPASS(pbuf->b_npages <= atop(maxphys) + 1); pmap_qremove((vm_offset_t)pbuf->b_data, pbuf->b_npages); vm_page_unhold_pages(pbuf->b_pages, pbuf->b_npages); uma_zfree(pbuf_zone, pbuf); atomic_subtract_int(&num_buf_aio, 1); ki = job->userproc->p_aioinfo; AIO_LOCK(ki); ki->kaio_buffer_count--; AIO_UNLOCK(ki); } else { MPASS(bp->bio_ma_n <= atop(maxphys) + 1); vm_page_unhold_pages(bp->bio_ma, bp->bio_ma_n); free(bp->bio_ma, M_TEMP); atomic_subtract_int(&num_unmapped_aio, 1); } g_destroy_bio(bp); } static void aio_biowakeup(struct bio *bp) { struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; size_t nbytes; long bcount = bp->bio_bcount; long resid = bp->bio_resid; int opcode, nblks; int bio_error = bp->bio_error; uint16_t flags = bp->bio_flags; opcode = job->uaiocb.aio_lio_opcode; aio_biocleanup(bp); nbytes = bcount - resid; atomic_add_acq_long(&job->nbytes, nbytes); nblks = btodb(nbytes); /* * If multiple bios experienced an error, the job will reflect the * error of whichever failed bio completed last. 
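 *
 * The block accounting below is in DEV_BSIZE units: a completed
 * 4096-byte bio, for example, adds btodb(4096) == 8 to job->inblock or
 * job->outblock, which kern_aio_return() later folds into the caller's
 * ru_inblock/ru_oublock.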
*/ if (flags & BIO_ERROR) atomic_store_int(&job->error, bio_error); if (opcode & LIO_WRITE) atomic_add_int(&job->outblock, nblks); else atomic_add_int(&job->inblock, nblks); if (refcount_release(&job->nbio)) { bio_error = atomic_load_int(&job->error); if (bio_error != 0) aio_complete(job, -1, bio_error); else aio_complete(job, atomic_load_long(&job->nbytes), 0); } } /* syscall - wait for the next completion of an aio request */ static int kern_aio_waitcomplete(struct thread *td, struct aiocb **ujobp, struct timespec *ts, struct aiocb_ops *ops) { struct proc *p = td->td_proc; struct timeval atv; struct kaioinfo *ki; struct kaiocb *job; struct aiocb *ujob; long error, status; int timo; ops->store_aiocb(ujobp, NULL); if (ts == NULL) { timo = 0; } else if (ts->tv_sec == 0 && ts->tv_nsec == 0) { timo = -1; } else { if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000)) return (EINVAL); TIMESPEC_TO_TIMEVAL(&atv, ts); if (itimerfix(&atv)) return (EINVAL); timo = tvtohz(&atv); } if (p->p_aioinfo == NULL) aio_init_aioinfo(p); ki = p->p_aioinfo; error = 0; job = NULL; AIO_LOCK(ki); while ((job = TAILQ_FIRST(&ki->kaio_done)) == NULL) { if (timo == -1) { error = EWOULDBLOCK; break; } ki->kaio_flags |= KAIO_WAKEUP; error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH, "aiowc", timo); if (timo && error == ERESTART) error = EINTR; if (error) break; } if (job != NULL) { MPASS(job->jobflags & KAIOCB_FINISHED); ujob = job->ujob; status = job->uaiocb._aiocb_private.status; error = job->uaiocb._aiocb_private.error; td->td_retval[0] = status; td->td_ru.ru_oublock += job->outblock; td->td_ru.ru_inblock += job->inblock; td->td_ru.ru_msgsnd += job->msgsnd; td->td_ru.ru_msgrcv += job->msgrcv; aio_free_entry(job); AIO_UNLOCK(ki); ops->store_aiocb(ujobp, ujob); ops->store_error(ujob, error); ops->store_status(ujob, status); } else AIO_UNLOCK(ki); return (error); } int sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap) { struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. */ error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops)); } static int kern_aio_fsync(struct thread *td, int op, struct aiocb *ujob, struct aiocb_ops *ops) { int listop; switch (op) { case O_SYNC: listop = LIO_SYNC; break; case O_DSYNC: listop = LIO_DSYNC; break; default: return (EINVAL); } return (aio_aqueue(td, ujob, NULL, listop, ops)); } int sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops)); } /* kqueue attach function */ static int filt_aioattach(struct knote *kn) { struct kaiocb *job; job = (struct kaiocb *)(uintptr_t)kn->kn_sdata; /* * The job pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. 
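 *
 * For EVFILT_AIO that registration happens in aio_aqueue(), which builds
 * the kevent with EV_ADD | EV_ENABLE | EV_FLAG1 and carries the kaiocb
 * pointer in the kevent data, which is how kn_sdata gets its value here.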
*/ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_aio = job; kn->kn_flags &= ~EV_FLAG1; knlist_add(&job->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_aiodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_aio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_aio(struct knote *kn, long hint) { struct kaiocb *job = kn->kn_ptr.p_aio; kn->kn_data = job->uaiocb._aiocb_private.error; if (!(job->jobflags & KAIOCB_FINISHED)) return (0); kn->kn_flags |= EV_EOF; return (1); } /* kqueue attach function */ static int filt_lioattach(struct knote *kn) { struct aioliojob *lj; lj = (struct aioliojob *)(uintptr_t)kn->kn_sdata; /* * The aioliojob pointer must be validated before using it, so * registration is restricted to the kernel; the user cannot * set EV_FLAG1. */ if ((kn->kn_flags & EV_FLAG1) == 0) return (EPERM); kn->kn_ptr.p_lio = lj; kn->kn_flags &= ~EV_FLAG1; knlist_add(&lj->klist, kn, 0); return (0); } /* kqueue detach function */ static void filt_liodetach(struct knote *kn) { struct knlist *knl; knl = &kn->kn_ptr.p_lio->klist; knl->kl_lock(knl->kl_lockarg); if (!knlist_empty(knl)) knlist_remove(knl, kn, 1); knl->kl_unlock(knl->kl_lockarg); } /* kqueue filter function */ /*ARGSUSED*/ static int filt_lio(struct knote *kn, long hint) { struct aioliojob * lj = kn->kn_ptr.p_lio; return (lj->lioj_flags & LIOJ_KEVENT_POSTED); } #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include #include #include struct __aiocb_private32 { int32_t status; int32_t error; uint32_t kernelinfo; }; #ifdef COMPAT_FREEBSD6 typedef struct oaiocb32 { int aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ struct osigevent32 aio_sigevent; /* Signal to deliver */ int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; } oaiocb32_t; #endif typedef struct aiocb32 { int32_t aio_fildes; /* File descriptor */ uint64_t aio_offset __packed; /* File offset for I/O */ uint32_t aio_buf; /* I/O buffer in process space */ uint32_t aio_nbytes; /* Number of bytes for I/O */ int __spare__[2]; uint32_t __spare2__; int aio_lio_opcode; /* LIO opcode */ int aio_reqprio; /* Request priority -- ignored */ struct __aiocb_private32 _aiocb_private; struct sigevent32 aio_sigevent; /* Signal to deliver */ } aiocb32_t; #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig) { /* * Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are * supported by AIO with the old sigevent structure. 
*/ CP(*osig, *nsig, sigev_notify); switch (nsig->sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: nsig->sigev_signo = osig->__sigev_u.__sigev_signo; break; case SIGEV_KEVENT: nsig->sigev_notify_kqueue = osig->__sigev_u.__sigev_notify_kqueue; PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr); break; default: return (EINVAL); } return (0); } static int aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob, int type __unused) { struct oaiocb32 job32; struct aiocb *kcb = &kjob->uaiocb; int error; bzero(kcb, sizeof(struct aiocb)); error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); /* No need to copyin aio_iov, because it did not exist in FreeBSD 6 */ CP(job32, *kcb, aio_fildes); CP(job32, *kcb, aio_offset); PTRIN_CP(job32, *kcb, aio_buf); CP(job32, *kcb, aio_nbytes); CP(job32, *kcb, aio_lio_opcode); CP(job32, *kcb, aio_reqprio); CP(job32, *kcb, _aiocb_private.status); CP(job32, *kcb, _aiocb_private.error); PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo); return (convert_old_sigevent32(&job32.aio_sigevent, &kcb->aio_sigevent)); } #endif static int aiocb32_copyin(struct aiocb *ujob, struct kaiocb *kjob, int type) { struct aiocb32 job32; struct aiocb *kcb = &kjob->uaiocb; struct iovec32 *iov32; int error; error = copyin(ujob, &job32, sizeof(job32)); if (error) return (error); CP(job32, *kcb, aio_fildes); CP(job32, *kcb, aio_offset); CP(job32, *kcb, aio_lio_opcode); if (type == LIO_NOP) type = kcb->aio_lio_opcode; if (type & LIO_VECTORED) { iov32 = PTRIN(job32.aio_iov); CP(job32, *kcb, aio_iovcnt); /* malloc a uio and copy in the iovec */ error = freebsd32_copyinuio(iov32, kcb->aio_iovcnt, &kjob->uiop); if (error) return (error); } else { PTRIN_CP(job32, *kcb, aio_buf); CP(job32, *kcb, aio_nbytes); } CP(job32, *kcb, aio_reqprio); CP(job32, *kcb, _aiocb_private.status); CP(job32, *kcb, _aiocb_private.error); PTRIN_CP(job32, *kcb, _aiocb_private.kernelinfo); error = convert_sigevent32(&job32.aio_sigevent, &kcb->aio_sigevent); return (error); } static long aiocb32_fetch_status(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.status)); } static long aiocb32_fetch_error(struct aiocb *ujob) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (fuword32(&ujob32->_aiocb_private.error)); } static int aiocb32_store_status(struct aiocb *ujob, long status) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.status, status)); } static int aiocb32_store_error(struct aiocb *ujob, long error) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.error, error)); } static int aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref) { struct aiocb32 *ujob32; ujob32 = (struct aiocb32 *)ujob; return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref)); } static int aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob) { return (suword32(ujobp, (long)ujob)); } static struct aiocb_ops aiocb32_ops = { .aio_copyin = aiocb32_copyin, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error = aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #ifdef COMPAT_FREEBSD6 static struct aiocb_ops aiocb32_ops_osigevent = { .aio_copyin = aiocb32_copyin_old_sigevent, .fetch_status = aiocb32_fetch_status, .fetch_error = aiocb32_fetch_error, .store_status = aiocb32_store_status, .store_error 
= aiocb32_store_error, .store_kernelinfo = aiocb32_store_kernelinfo, .store_aiocb = aiocb32_store_aiocb, }; #endif int freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap) { return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } int freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; struct aiocb **ujoblist; uint32_t *ujoblist32; int error, i; if (uap->nent < 0 || uap->nent > max_aio_queue_per_proc) return (EINVAL); if (uap->timeout) { /* Get timespec struct. */ if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; ujoblist = malloc(uap->nent * sizeof(ujoblist[0]), M_AIO, M_WAITOK); ujoblist32 = (uint32_t *)ujoblist; error = copyin(uap->aiocbp, ujoblist32, uap->nent * sizeof(ujoblist32[0])); if (error == 0) { for (i = uap->nent - 1; i >= 0; i--) ujoblist[i] = PTRIN(ujoblist32[i]); error = kern_aio_suspend(td, uap->nent, ujoblist, tsp); } free(ujoblist, M_AIO); return (error); } int freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap) { return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_read(struct thread *td, struct freebsd6_freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, &aiocb32_ops)); } int freebsd32_aio_readv(struct thread *td, struct freebsd32_aio_readv_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READV, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_write(struct thread *td, struct freebsd6_freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops_osigevent)); } #endif int freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, &aiocb32_ops)); } int freebsd32_aio_writev(struct thread *td, struct freebsd32_aio_writev_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITEV, &aiocb32_ops)); } int freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) { return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK, &aiocb32_ops)); } int freebsd32_aio_waitcomplete(struct thread *td, struct freebsd32_aio_waitcomplete_args *uap) { struct timespec32 ts32; struct timespec ts, *tsp; int error; if (uap->timeout) { /* Get timespec struct. 
*/ error = copyin(uap->timeout, &ts32, sizeof(ts32)); if (error) return (error); CP(ts32, ts, tv_sec); CP(ts32, ts, tv_nsec); tsp = &ts; } else tsp = NULL; return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp, &aiocb32_ops)); } int freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap) { return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp, &aiocb32_ops)); } #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_lio_listio(struct thread *td, struct freebsd6_freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct osigevent32 osig; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &osig, sizeof(osig)); if (error) return (error); error = convert_old_sigevent32(&osig, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops_osigevent); free(acb_list, M_LIO); return (error); } #endif int freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap) { struct aiocb **acb_list; struct sigevent *sigp, sig; struct sigevent32 sig32; uint32_t *acb_list32; int error, i, nent; if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) return (EINVAL); nent = uap->nent; if (nent < 0 || nent > max_aio_queue_per_proc) return (EINVAL); if (uap->sig && (uap->mode == LIO_NOWAIT)) { error = copyin(uap->sig, &sig32, sizeof(sig32)); if (error) return (error); error = convert_sigevent32(&sig32, &sig); if (error) return (error); sigp = &sig; } else sigp = NULL; acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK); error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t)); if (error) { free(acb_list32, M_LIO); return (error); } acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK); for (i = 0; i < nent; i++) acb_list[i] = PTRIN(acb_list32[i]); free(acb_list32, M_LIO); error = kern_lio_listio(td, uap->mode, (struct aiocb * const *)uap->acb_list, acb_list, nent, sigp, &aiocb32_ops); free(acb_list, M_LIO); return (error); } #endif diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 747cdf722cc9..404c51b7db77 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,7331 +1,7331 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(DEBUG_VFS_LOCKS) && (!defined(INVARIANTS) || !defined(WITNESS)) #error DEBUG_VFS_LOCKS requires INVARIANTS and WITNESS #endif #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp, bool isvnlru); static void v_init_counters(struct vnode *); static void vn_seqc_init(struct vnode *); static void vn_seqc_write_end_free(struct vnode *vp); static void vgonel(struct vnode *); static bool vhold_recycle_free(struct vnode *); static void vdropl_recycle(struct vnode *vp); static void vdrop_recycle(struct vnode *vp); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_lock(void *arg, int what); static void destroy_vpollinfo(struct vpollinfo *vi); static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn); static void vnlru_recalc(void); static SYSCTL_NODE(_vfs, OID_AUTO, vnode, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "vnode configuration and statistics"); static SYSCTL_NODE(_vfs_vnode, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "vnode configuration"); static SYSCTL_NODE(_vfs_vnode, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "vnode statistics"); static SYSCTL_NODE(_vfs_vnode, OID_AUTO, vnlru, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "vnode recycling"); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. 
 */
static u_long __exclusive_cache_line numvnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence (legacy)");
SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, count, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode (legacy)");
SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, created, CTLFLAG_RD, &vnodes_created,
    "Number of vnodes created by getnewvnode");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
__enum_uint8(vtype) iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};

/*
 * List of allocated vnodes in the system.
 */
static TAILQ_HEAD(freelst, vnode) vnode_list;
static struct vnode *vnode_list_free_marker;
static struct vnode *vnode_list_reclaim_marker;

/*
 * "Free" vnode target. Free vnodes are rarely completely free, but are
 * just ones that are cheap to recycle. Usually they are for files which
 * have been stat'd but not read; these usually have inode and namecache
 * data attached to them. This target is the preferred minimum size of a
 * sub-cache consisting mostly of such files. The system balances the size
 * of this sub-cache with its complement to try to prevent either from
 * thrashing while the other is relatively inactive. The targets express
 * a preference for the best balance.
 *
 * "Above" this target there are 2 further targets (watermarks) related
 * to recycling of free vnodes. In the best-operating case, the cache is
 * exactly full, the free list has size between vlowat and vhiwat above the
 * free target, and recycling from it and normal use maintains this state.
 * Sometimes the free list is below vlowat or even empty, but this state
 * is even better for immediate use provided the cache is not full.
 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
 * ones) to reach one of these states. The watermarks are currently hard-
 * coded as 4% and 9% of the available space higher. These and the default
 * of 25% for wantfreevnodes are too large if the memory size is large.
 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
 * whenever vnlru_proc() becomes active.
*/ static long wantfreevnodes; static long __exclusive_cache_line freevnodes; static long freevnodes_old; static u_long recycles_count; SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, &recycles_count, 0, "Number of vnodes recycled to meet vnode cache targets (legacy)"); SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles, CTLFLAG_RD | CTLFLAG_STATS, &recycles_count, 0, "Number of vnodes recycled to meet vnode cache targets"); static u_long recycles_free_count; SYSCTL_ULONG(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS, &recycles_free_count, 0, "Number of free vnodes recycled to meet vnode cache targets (legacy)"); SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, recycles_free, CTLFLAG_RD | CTLFLAG_STATS, &recycles_free_count, 0, "Number of free vnodes recycled to meet vnode cache targets"); static counter_u64_t direct_recycles_free_count; SYSCTL_COUNTER_U64(_vfs_vnode_vnlru, OID_AUTO, direct_recycles_free, CTLFLAG_RD, &direct_recycles_free_count, "Number of free vnodes recycled by vn_alloc callers to meet vnode cache targets"); static counter_u64_t vnode_skipped_requeues; SYSCTL_COUNTER_U64(_vfs_vnode_stats, OID_AUTO, skipped_requeues, CTLFLAG_RD, &vnode_skipped_requeues, "Number of times LRU requeue was skipped due to lock contention"); static __read_mostly bool vnode_can_skip_requeue; SYSCTL_BOOL(_vfs_vnode_param, OID_AUTO, can_skip_requeue, CTLFLAG_RW, &vnode_can_skip_requeue, 0, "Is LRU requeue skippable"); static u_long deferred_inact; SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, 0, "Number of times inactive processing was deferred"); /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_list * numvnodes * freevnodes */ static struct mtx __exclusive_cache_line vnode_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; static smr_t buf_trie_smr; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll"); __read_frequently smr_t vfs_smr; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. 
* Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); #define VDBATCH_SIZE 8 struct vdbatch { u_int index; struct mtx lock; struct vnode *tab[VDBATCH_SIZE]; }; DPCPU_DEFINE_STATIC(struct vdbatch, vd); static void vdbatch_dequeue(struct vnode *vp); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* Target for maximum number of vnodes. */ u_long desiredvnodes; static u_long gapvnodes; /* gap between wanted and desired */ static u_long vhiwat; /* enough extras after expansion */ static u_long vlowat; /* minimal extras before expansion */ static bool vstir; /* nonzero to stir non-free vnodes */ static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static u_long vnlru_read_freevnodes(void); /* * Note that no attempt is made to sanitize these parameters. */ static int sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = desiredvnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == desiredvnodes) return (0); mtx_lock(&vnode_list_mtx); desiredvnodes = val; wantfreevnodes = desiredvnodes / 4; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); /* * XXX There is no protection against multiple threads changing * desiredvnodes at the same time. Locking above only helps vnlru and * getnewvnode. 
*/ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); return (0); } SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, "LU", "Target for maximum number of vnodes (legacy)"); SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, limit, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, "LU", "Target for maximum number of vnodes"); static int sysctl_freevnodes(SYSCTL_HANDLER_ARGS) { u_long rfreevnodes; rfreevnodes = vnlru_read_freevnodes(); return (sysctl_handle_long(oidp, &rfreevnodes, 0, req)); } SYSCTL_PROC(_vfs, OID_AUTO, freevnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes, "LU", "Number of \"free\" vnodes (legacy)"); SYSCTL_PROC(_vfs_vnode_stats, OID_AUTO, free, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_freevnodes, "LU", "Number of \"free\" vnodes"); static int sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = wantfreevnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == wantfreevnodes) return (0); mtx_lock(&vnode_list_mtx); wantfreevnodes = val; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); return (0); } SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, "LU", "Target for minimum number of \"free\" vnodes (legacy)"); SYSCTL_PROC(_vfs_vnode_param, OID_AUTO, wantfree, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, "LU", "Target for minimum number of \"free\" vnodes"); static int vnlru_nowhere; SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, failed_runs, CTLFLAG_RD | CTLFLAG_STATS, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); static int sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct vnode *vp; struct nameidata nd; char *buf; unsigned long ndflags; int error; if (req->newptr == NULL) return (EINVAL); if (req->newlen >= PATH_MAX) return (E2BIG); buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) goto out; buf[req->newlen] = '\0'; ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1; NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; if (VN_IS_DOOMED(vp)) { /* * This vnode is being recycled. Return != 0 to let the caller * know that the sysctl had no effect. 
Return EAGAIN because a * subsequent call will likely succeed (since namei will create * a new vnode if necessary) */ error = EAGAIN; goto putvnode; } vgone(vp); putvnode: vput(vp); NDFREE_PNBUF(&nd); out: free(buf, M_TEMP); return (error); } static int sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct vnode *vp; struct file *fp; int error; int fd; if (req->newptr == NULL) return (EBADF); error = sysctl_handle_int(oidp, &fd, 0, req); if (error != 0) return (error); error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) goto drop; vgone(vp); VOP_UNLOCK(vp); drop: fdrop(fp, td); return (error); } SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_ftry_reclaim_vnode, "I", "Try to reclaim a vnode by its file descriptor"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ #define vnsz2log 8 #ifndef DEBUG_LOCKS _Static_assert(sizeof(struct vnode) >= 1UL << vnsz2log && sizeof(struct vnode) < 1UL << (vnsz2log + 1), "vnsz2log needs to be updated"); #endif /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree_smr(buf_trie_zone, node); } PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, buf_trie_smr); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of the memory size in KB to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ #endif static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static struct vnode * vn_alloc_marker(struct mount *mp) { struct vnode *vp; vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); vp->v_type = VMARKER; vp->v_mount = mp; return (vp); } static void vn_free_marker(struct vnode *vp) { MPASS(vp->v_type == VMARKER); free(vp, M_VNODE_MARKER); } #ifdef KASAN static int vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused) { kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0); return (0); } static void vnode_dtor(void *mem, int size, void *arg __unused) { size_t end1, end2, off1, off2; _Static_assert(offsetof(struct vnode, v_vnodelist) < offsetof(struct vnode, v_dbatchcpu), "KASAN marks require updating"); off1 = offsetof(struct vnode, v_vnodelist); off2 = offsetof(struct vnode, v_dbatchcpu); end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist); end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu); /* * Access to the v_vnodelist and v_dbatchcpu fields are permitted even * after the vnode has been freed. Try to get some KASAN coverage by * marking everything except those two fields as invalid. Because * KASAN's tracking is not byte-granular, any preceding fields sharing * the same 8-byte aligned word must also be marked valid. */ /* Handle the area from the start until v_vnodelist... */ off1 = rounddown2(off1, KASAN_SHADOW_SCALE); kasan_mark(mem, off1, off1, KASAN_UMA_FREED); /* ... 
then the area between v_vnodelist and v_dbatchcpu ... */ off1 = roundup2(end1, KASAN_SHADOW_SCALE); off2 = rounddown2(off2, KASAN_SHADOW_SCALE); if (off2 > off1) kasan_mark((void *)((char *)mem + off1), off2 - off1, off2 - off1, KASAN_UMA_FREED); /* ... and finally the area from v_dbatchcpu to the end. */ off2 = roundup2(end2, KASAN_SHADOW_SCALE); kasan_mark((void *)((char *)mem + off2), size - off2, size - off2, KASAN_UMA_FREED); } #endif /* KASAN */ /* * Initialize a vnode as it first enters the zone. */ static int vnode_init(void *mem, int size, int flags) { struct vnode *vp; vp = mem; bzero(vp, size); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems opt-in. */ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bufobj_init(&vp->v_bufobj, vp); /* * Initialize namecache. */ cache_vnode_init(vp); /* * Initialize rangelocks. */ rangelock_init(&vp->v_rl); vp->v_dbatchcpu = NOCPU; vp->v_state = VSTATE_DEAD; /* * Check vhold_recycle_free for an explanation. */ vp->v_holdcnt = VHOLD_NO_SMR; vp->v_type = VNON; mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (0); } /* * Free a vnode when it is cleared from the zone. */ static void vnode_fini(void *mem, int size) { struct vnode *vp; struct bufobj *bo; vp = mem; vdbatch_dequeue(vp); mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); bo = &vp->v_bufobj; rw_destroy(BO_LOCKPTR(bo)); kasan_mark(mem, size, size, 0); } /* * Provide the size of NFS nclnode and NFS fh for calculation of the * vnode memory consumption. The size is specified directly to * eliminate dependency on NFS-private header. * * Other filesystems may use bigger or smaller (like UFS and ZFS) * private inode data, but the NFS-based estimation is ample enough. * Still, we care about differences in the size between 64- and 32-bit * platforms. * * Namecache structure size is heuristically * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. */ #ifdef _LP64 #define NFS_NCLNODE_SZ (528 + 64) #define NC_SZ 148 #else #define NFS_NCLNODE_SZ (360 + 32) #define NC_SZ 92 #endif static void vntblinit(void *dummy __unused) { struct vdbatch *vd; uma_ctor ctor; uma_dtor dtor; int cpu, physvnodes, virtvnodes; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to the physical * memory size is 1:16 until desiredvnodes exceeds 98,304. * Thereafter, the * marginal ratio of desiredvnodes to the physical memory size is * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * must not exceed 1/10th of the kernel's heap size. 
*/ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %lu -> %lu\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_list); mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); /* * The lock is taken to appease WITNESS. */ mtx_lock(&vnode_list_mtx); vnlru_recalc(); mtx_unlock(&vnode_list_mtx); vnode_list_free_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); vnode_list_reclaim_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); #ifdef KASAN ctor = vnode_ctor; dtor = vnode_dtor; #else ctor = NULL; dtor = NULL; #endif vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor, vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN); uma_zone_set_smr(vnode_zone, vfs_smr); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_SMR); buf_trie_smr = uma_zone_get_smr(buf_trie_zone); uma_prealloc(buf_trie_zone, nbuf); vnodes_created = counter_u64_alloc(M_WAITOK); direct_recycles_free_count = counter_u64_alloc(M_WAITOK); vnode_skipped_requeues = counter_u64_alloc(M_WAITOK); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); bzero(vd, sizeof(*vd)); mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * 1. VOP_LOOKUP() obtains B while A is held * 2. vfs_busy() obtains a shared lock on F while A and B are held * 3. vput() releases lock on B * 4. vput() releases lock on A * 5. VFS_ROOT() obtains lock on D while shared lock on F is held * 6. vfs_unbusy() releases shared lock on F * 7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. 
Note that for stacked * filesystems, D and B in the example above may be the same lock, * which introdues potential lock order reversal deadlock between * dounmount() and step 5 above. These filesystems may avoid the LOR * by setting VV_CROSSLOCK on the covered vnode so that lock B will * remain held until after step 5. */ int vfs_busy(struct mount *mp, int flags) { struct mount_pcpu *mpcpu; MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); vfs_mp_count_add_pcpu(mpcpu, ref, 1); vfs_mp_count_add_pcpu(mpcpu, lockref, 1); vfs_op_thread_exit(mp, mpcpu); if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); return (0); } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REF(mp); /* * If mount point is currently being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller the mount it tried to busy is no longer valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("%s: non-empty upper mount list with pending unmount", __func__)); if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { struct mount_pcpu *mpcpu; int c; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); vfs_mp_count_sub_pcpu(mpcpu, lockref, 1); vfs_mp_count_sub_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_lockref; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. 
* * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if (jailed(td->td_ucred)) { /* * If the jail of the calling thread lacks permission for * this type of file system, deny immediately. */ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) return (EPERM); /* * If the file system was mounted outside the jail of the * calling thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. 
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{
	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Try to reduce the total number of vnodes.
 *
 * This routine (and its user) are buggy in at least the following ways:
 * - all parameters were picked years ago when RAM sizes were significantly
 *   smaller
 * - it can pick vnodes based on pages used by the vm object, but filesystems
 *   like ZFS don't use it making the pick broken
 * - since ZFS has its own aging policy it gets partially combated by this one
 * - a dedicated method should be provided for filesystems to let them decide
 *   whether the vnode should be recycled
 *
 * This routine is called when we have too many vnodes. It attempts
 * to free vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this). Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * the buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use. It is not
 * desirable to reuse such vnodes. These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
 *
 * @param reclaim_nc_src Only reclaim directories with outgoing namecache
 *        entries if this argument is true
 * @param trigger Only reclaim vnodes with fewer than this many resident
 *        pages.
 * @param target How many vnodes to reclaim.
 * @return The number of vnodes that were reclaimed.
 */
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
	struct vnode *vp, *mvp;
	struct mount *mp;
	struct vm_object *object;
	u_long done;
	bool retried;

	mtx_assert(&vnode_list_mtx, MA_OWNED);

	retried = false;
	done = 0;

	mvp = vnode_list_reclaim_marker;
restart:
	vp = mvp;
	while (done < target) {
		vp = TAILQ_NEXT(vp, v_vnodelist);
		if (__predict_false(vp == NULL))
			break;
		if (__predict_false(vp->v_type == VMARKER))
			continue;

		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
* Also skip free vnodes. We are trying to make space * to expand the free list, not reduce it. */ if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) goto next_iter; if (vp->v_type == VBAD || vp->v_type == VNON) goto next_iter; object = atomic_load_ptr(&vp->v_object); if (object == NULL || object->resident_page_count > trigger) { goto next_iter; } /* * Handle races against vnode allocation. Filesystems lock the * vnode some time after it gets returned from getnewvnode, * despite type and hold count being manipulated earlier. * Resorting to checking v_mount restores guarantees present * before the global list was reworked to contain all vnodes. */ if (!VI_TRYLOCK(vp)) goto next_iter; if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { VI_UNLOCK(vp); goto next_iter; } if (vp->v_mount == NULL) { VI_UNLOCK(vp); goto next_iter; } vholdl(vp); VI_UNLOCK(vp); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop_recycle(vp); goto next_iter_unlocked; } if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { vdrop_recycle(vp); vn_finished_write(mp); goto next_iter_unlocked; } VI_LOCK(vp); if (vp->v_usecount > 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || (vp->v_object != NULL && vp->v_object->handle == vp && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(mp); goto next_iter_unlocked; } recycles_count++; vgonel(vp); VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(mp); done++; next_iter_unlocked: maybe_yield(); mtx_lock(&vnode_list_mtx); goto restart; next_iter: MPASS(vp->v_type != VMARKER); if (!should_yield()) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; } if (done == 0 && !retried) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); retried = true; goto restart; } return (done); } static int max_free_per_call = 10000; SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_free_per_call, 0, "limit on vnode free requests per call to the vnlru_free routine (legacy)"); SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, max_free_per_call, CTLFLAG_RW, &max_free_per_call, 0, "limit on vnode free requests per call to the vnlru_free routine"); /* * Attempt to reduce the free list by the requested amount. */ static int vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp, bool isvnlru) { struct vnode *vp; struct mount *mp; int ocount; bool retried; mtx_assert(&vnode_list_mtx, MA_OWNED); if (count > max_free_per_call) count = max_free_per_call; if (count == 0) { mtx_unlock(&vnode_list_mtx); return (0); } ocount = count; retried = false; vp = mvp; for (;;) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) { /* * The free vnode marker can be past eligible vnodes: * 1. if vdbatch_process trylock failed * 2. if vtryrecycle failed * * If so, start the scan from scratch. 
*/ if (!retried && vnlru_read_freevnodes() > 0) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); vp = mvp; retried = true; continue; } /* * Give up */ TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); break; } if (__predict_false(vp->v_type == VMARKER)) continue; if (vp->v_holdcnt > 0) continue; /* * Don't recycle if our vnode is from different type * of mount point. Note that mp is type-safe, the * check does not reach unmapped address even if * vnode is reclaimed. */ if (mnt_op != NULL && (mp = vp->v_mount) != NULL && mp->mnt_op != mnt_op) { continue; } if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { continue; } if (!vhold_recycle_free(vp)) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); /* * FIXME: ignores the return value, meaning it may be nothing * got recycled but it claims otherwise to the caller. * * Originally the value started being ignored in 2005 with * 114a1006a8204aa156e1f9ad6476cdff89cada7f . * * Respecting the value can run into significant stalls if most * vnodes belong to one file system and it has writes * suspended. In presence of many threads and millions of * vnodes they keep contending on the vnode_list_mtx lock only * to find vnodes they can't recycle. * * The solution would be to pre-check if the vnode is likely to * be recycle-able, but it needs to happen with the * vnode_list_mtx lock held. This runs into a problem where * VOP_GETWRITEMOUNT (currently needed to find out about if * writes are frozen) can take locks which LOR against it. * * Check nullfs for one example (null_getwritemount). */ vtryrecycle(vp, isvnlru); count--; if (count == 0) { break; } mtx_lock(&vnode_list_mtx); vp = mvp; } mtx_assert(&vnode_list_mtx, MA_NOTOWNED); return (ocount - count); } /* * XXX: returns without vnode_list_mtx locked! */ static int vnlru_free_locked_direct(int count) { int ret; mtx_assert(&vnode_list_mtx, MA_OWNED); ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, false); mtx_assert(&vnode_list_mtx, MA_NOTOWNED); return (ret); } static int vnlru_free_locked_vnlru(int count) { int ret; mtx_assert(&vnode_list_mtx, MA_OWNED); ret = vnlru_free_impl(count, NULL, vnode_list_free_marker, true); mtx_assert(&vnode_list_mtx, MA_NOTOWNED); return (ret); } static int vnlru_free_vnlru(int count) { mtx_lock(&vnode_list_mtx); return (vnlru_free_locked_vnlru(count)); } void vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp) { MPASS(mnt_op != NULL); MPASS(mvp != NULL); VNPASS(mvp->v_type == VMARKER, mvp); mtx_lock(&vnode_list_mtx); vnlru_free_impl(count, mnt_op, mvp, true); mtx_assert(&vnode_list_mtx, MA_NOTOWNED); } struct vnode * vnlru_alloc_marker(void) { struct vnode *mvp; mvp = vn_alloc_marker(NULL); mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (mvp); } void vnlru_free_marker(struct vnode *mvp) { mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); vn_free_marker(mvp); } static void vnlru_recalc(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ vlowat = vhiwat / 2; } /* * Attempt to recycle vnodes in a context that is always safe to block. 
* Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; static u_long vnlruproc_kicks; SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, kicks, CTLFLAG_RD, &vnlruproc_kicks, 0, "Number of times vnlru got woken up due to vnode shortage"); #define VNLRU_COUNT_SLOP 100 /* * The main freevnodes counter is only updated when a counter local to CPU * diverges from 0 by more than VNLRU_FREEVNODES_SLOP. CPUs are conditionally * walked to compute a more accurate total. * * Note: the actual value at any given moment can still exceed slop, but it * should not be by significant margin in practice. */ #define VNLRU_FREEVNODES_SLOP 126 static void __noinline vfs_freevnodes_rollup(int8_t *lfreevnodes) { atomic_add_long(&freevnodes, *lfreevnodes); *lfreevnodes = 0; critical_exit(); } static __inline void vfs_freevnodes_inc(void) { int8_t *lfreevnodes; critical_enter(); lfreevnodes = PCPU_PTR(vfs_freevnodes); (*lfreevnodes)++; if (__predict_false(*lfreevnodes == VNLRU_FREEVNODES_SLOP)) vfs_freevnodes_rollup(lfreevnodes); else critical_exit(); } static __inline void vfs_freevnodes_dec(void) { int8_t *lfreevnodes; critical_enter(); lfreevnodes = PCPU_PTR(vfs_freevnodes); (*lfreevnodes)--; if (__predict_false(*lfreevnodes == -VNLRU_FREEVNODES_SLOP)) vfs_freevnodes_rollup(lfreevnodes); else critical_exit(); } static u_long vnlru_read_freevnodes(void) { long slop, rfreevnodes, rfreevnodes_old; int cpu; rfreevnodes = atomic_load_long(&freevnodes); rfreevnodes_old = atomic_load_long(&freevnodes_old); if (rfreevnodes > rfreevnodes_old) slop = rfreevnodes - rfreevnodes_old; else slop = rfreevnodes_old - rfreevnodes; if (slop < VNLRU_FREEVNODES_SLOP) return (rfreevnodes >= 0 ? rfreevnodes : 0); CPU_FOREACH(cpu) { rfreevnodes += cpuid_to_pcpu[cpu]->pc_vfs_freevnodes; } atomic_store_long(&freevnodes_old, rfreevnodes); return (freevnodes_old >= 0 ? freevnodes_old : 0); } static bool vnlru_under(u_long rnumvnodes, u_long limit) { u_long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = vnlru_read_freevnodes(); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static void vnlru_kick_locked(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); if (vnlruproc_sig == 0) { vnlruproc_sig = 1; vnlruproc_kicks++; wakeup(vnlruproc); } } static void vnlru_kick_cond(void) { if (vnlru_read_freevnodes() > wantfreevnodes) return; if (vnlruproc_sig) return; mtx_lock(&vnode_list_mtx); vnlru_kick_locked(); mtx_unlock(&vnode_list_mtx); } static void vnlru_proc_sleep(void) { if (vnlruproc_sig) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); } msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); } /* * A lighter version of the machinery below. * * Tries to reach goals only by recycling free vnodes and does not invoke * uma_reclaim(UMA_RECLAIM_DRAIN). * * This works around pathological behavior in vnlru in presence of tons of free * vnodes, but without having to rewrite the machinery at this time. Said * behavior boils down to continuously trying to reclaim all kinds of vnodes * (cycling through all levels of "force") when the count is transiently above * limit. This happens a lot when all vnodes are used up and vn_alloc * speculatively increments the counter. 
* * Sample testcase: vnode limit 8388608, 20 separate directory trees each with * 1 million files in total and 20 find(1) processes stating them in parallel * (one per each tree). * * On a kernel with only stock machinery this needs anywhere between 60 and 120 * seconds to execute (time varies *wildly* between runs). With the workaround * it consistently stays around 20 seconds [it got further down with later * changes]. * * That is to say the entire thing needs a fundamental redesign (most notably * to accommodate faster recycling), the above only tries to get it ouf the way. * * Return values are: * -1 -- fallback to regular vnlru loop * 0 -- do nothing, go to sleep * >0 -- recycle this many vnodes */ static long vnlru_proc_light_pick(void) { u_long rnumvnodes, rfreevnodes; if (vstir || vnlruproc_sig == 1) return (-1); rnumvnodes = atomic_load_long(&numvnodes); rfreevnodes = vnlru_read_freevnodes(); /* * vnode limit might have changed and now we may be at a significant * excess. Bail if we can't sort it out with free vnodes. * * Due to atomic updates the count can legitimately go above * the limit for a short period, don't bother doing anything in * that case. */ if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP + 10) { if (rnumvnodes - rfreevnodes >= desiredvnodes || rfreevnodes <= wantfreevnodes) { return (-1); } return (rnumvnodes - desiredvnodes); } /* * Don't try to reach wantfreevnodes target if there are too few vnodes * to begin with. */ if (rnumvnodes < wantfreevnodes) { return (0); } if (rfreevnodes < wantfreevnodes) { return (-1); } return (0); } static bool vnlru_proc_light(void) { long freecount; mtx_assert(&vnode_list_mtx, MA_NOTOWNED); freecount = vnlru_proc_light_pick(); if (freecount == -1) return (false); if (freecount != 0) { vnlru_free_vnlru(freecount); } mtx_lock(&vnode_list_mtx); vnlru_proc_sleep(); mtx_assert(&vnode_list_mtx, MA_NOTOWNED); return (true); } static u_long uma_reclaim_calls; SYSCTL_ULONG(_vfs_vnode_vnlru, OID_AUTO, uma_reclaim_calls, CTLFLAG_RD | CTLFLAG_STATS, &uma_reclaim_calls, 0, "Number of calls to uma_reclaim"); static void vnlru_proc(void) { u_long rnumvnodes, rfreevnodes, target; unsigned long onumvnodes; int done, force, trigger, usevnodes; bool reclaim_nc_src, want_reread; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); force = 0; want_reread = false; for (;;) { kproc_suspend_check(vnlruproc); if (force == 0 && vnlru_proc_light()) continue; mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (want_reread) { force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; want_reread = false; } /* * If numvnodes is too large (due to desiredvnodes being * adjusted using its sysctl, or emergency growth), first * try to reduce it by discarding from the free list. */ if (rnumvnodes > desiredvnodes + 10) { vnlru_free_locked_vnlru(rnumvnodes - desiredvnodes); mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); } /* * Sleep if the vnode cache is in a good state. This is * when it is not over-full and has space for about a 4% * or 9% expansion (by growing its size or inexcessively * reducing its free list). Otherwise, try to reclaim * space for a 10% expansion. */ if (vstir && force == 0) { force = 1; vstir = false; } if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { vnlru_proc_sleep(); continue; } rfreevnodes = vnlru_read_freevnodes(); onumvnodes = rnumvnodes; /* * Calculate parameters for recycling. These are the same * throughout the loop to give some semblance of fairness. 
* The trigger point is to avoid recycling vnodes with lots * of resident pages. We aren't trying to free memory; we * are trying to recycle or at least free vnodes. */ if (rnumvnodes <= desiredvnodes) usevnodes = rnumvnodes - rfreevnodes; else usevnodes = rnumvnodes; if (usevnodes <= 0) usevnodes = 1; /* * The trigger value is chosen to give a conservatively * large value to ensure that it alone doesn't prevent * making progress. The value can easily be so large that * it is effectively infinite in some congested and * misconfigured cases, and this is necessary. Normally * it is about 8 to 100 (pages), which is quite large. */ trigger = vm_cnt.v_page_count * 2 / usevnodes; if (force < 2) trigger = vsmalltrigger; reclaim_nc_src = force >= 3; target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); target = target / 10 + 1; done = vlrureclaim(reclaim_nc_src, trigger, target); mtx_unlock(&vnode_list_mtx); /* * Total number of vnodes can transiently go slightly above the * limit (see vn_alloc_hard), no need to call uma_reclaim if * this happens. */ if (onumvnodes + VNLRU_COUNT_SLOP + 1000 > desiredvnodes && numvnodes <= desiredvnodes) { uma_reclaim_calls++; uma_reclaim(UMA_RECLAIM_DRAIN); } if (done == 0) { if (force == 0 || force == 1) { force = 2; continue; } if (force == 2) { force = 3; continue; } want_reread = true; force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else { want_reread = true; kern_yield(PRI_USER); } } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp, bool isvnlru) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNPASS(vp->v_holdcnt > 0, vp); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); vdrop_recycle(vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); vdrop_recycle(vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if (!VN_IS_DOOMED(vp)) { if (isvnlru) recycles_free_count++; else counter_u64_add(direct_recycles_free_count, 1); vgonel(vp); } VOP_UNLOCK(vp); vdropl_recycle(vp); vn_finished_write(vnmp); return (0); } /* * Allocate a new vnode. * * The operation never returns an error. Returning an error was disabled * in r145385 (dated 2005) with the following comment: * * XXX Not all VFS_VGET/ffs_vget callers check returns. 
* * Given the age of this commit (almost 15 years at the time of writing this * comment) restoring the ability to fail requires a significant audit of * all codepaths. * * The routine can try to free a vnode or stall for up to 1 second waiting for * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. */ static u_long vn_alloc_cyclecount; static u_long vn_alloc_sleeps; SYSCTL_ULONG(_vfs_vnode_stats, OID_AUTO, alloc_sleeps, CTLFLAG_RD, &vn_alloc_sleeps, 0, "Number of times vnode allocation blocked waiting on vnlru"); static struct vnode * __noinline vn_alloc_hard(struct mount *mp, u_long rnumvnodes, bool bumped) { u_long rfreevnodes; if (bumped) { if (rnumvnodes > desiredvnodes + VNLRU_COUNT_SLOP) { atomic_subtract_long(&numvnodes, 1); bumped = false; } } mtx_lock(&vnode_list_mtx); rfreevnodes = vnlru_read_freevnodes(); if (vn_alloc_cyclecount++ >= rfreevnodes) { vn_alloc_cyclecount = 0; vstir = true; } /* * Grow the vnode cache if it will not be above its target max * after growing. Otherwise, if the free list is nonempty, try * to reclaim 1 item from it before growing the cache (possibly * above its target max if the reclamation failed or is delayed). * Otherwise, wait for some space. In all cases, schedule * vnlru_proc() if we are getting short of space. The watermarks * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ if (vnlru_free_locked_direct(1) > 0) goto alloc; mtx_assert(&vnode_list_mtx, MA_NOTOWNED); if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { /* * Wait for space for a new vnode. */ if (bumped) { atomic_subtract_long(&numvnodes, 1); bumped = false; } mtx_lock(&vnode_list_mtx); vnlru_kick_locked(); vn_alloc_sleeps++; msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && vnlru_read_freevnodes() > 1) vnlru_free_locked_direct(1); else mtx_unlock(&vnode_list_mtx); } alloc: mtx_assert(&vnode_list_mtx, MA_NOTOWNED); if (!bumped) atomic_add_long(&numvnodes, 1); vnlru_kick_cond(); return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static struct vnode * vn_alloc(struct mount *mp) { u_long rnumvnodes; if (__predict_false(vn_alloc_cyclecount != 0)) return (vn_alloc_hard(mp, 0, false)); rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (__predict_false(vnlru_under(rnumvnodes, vlowat))) { return (vn_alloc_hard(mp, rnumvnodes, true)); } return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static void vn_free(struct vnode *vp) { atomic_subtract_long(&numvnodes, 1); uma_zfree_smr(vnode_zone, vp); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct thread *td; struct lock_object *lo; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); KASSERT(vops->registered, ("%s: not registered vector op %p\n", __func__, vops)); cache_validate_vop_vector(mp, vops); td = curthread; if (td->td_vp_reserved != NULL) { vp = td->td_vp_reserved; td->td_vp_reserved = NULL; } else { vp = vn_alloc(mp); } counter_u64_add(vnodes_created, 1); vn_set_state(vp, VSTATE_UNINITIALIZED); /* * Locks are given the generic name "vnode" when created. * Follow the historic practice of using the filesystem * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. * * Locks live in a witness group keyed on their name. Thus, * when a lock is renamed, it must also move from the witness * group of its old name to the witness group of its new name. 
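 * As a hypothetical illustration: a vnode last used under a "ufs"
 * mount that is now handed out to a "zfs" mount gets its lock renamed
 * from "ufs" to "zfs" and re-registered with witness under the new
 * name (the tags themselves are whatever the filesystems pass in).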
* * The change only needs to be made when the vnode moves * from one filesystem type to another. We ensure that each * filesystem use a single static name pointer for its tag so * that we can compare pointers rather than doing a strcmp(). */ lo = &vp->v_vnlock->lock_object; #ifdef WITNESS if (lo->lo_name != tag) { #endif lo->lo_name = tag; #ifdef WITNESS WITNESS_DESTROY(lo); WITNESS_INIT(lo, tag); } #endif /* * By default, don't allow shared locks unless filesystems opt-in. */ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. */ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_op = vops; vp->v_irflag = 0; v_init_counters(vp); vn_seqc_init(vp); vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef DIAGNOSTIC if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode(9), tag %s\n", tag); #endif #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); #endif if (mp != NULL) { vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; } /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } void getnewvnode_reserve(void) { struct thread *td; td = curthread; MPASS(td->td_vp_reserved == NULL); td->td_vp_reserved = vn_alloc(NULL); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; if (td->td_vp_reserved != NULL) { vn_free(td->td_vp_reserved); td->td_vp_reserved = NULL; } } static void __noinline freevnode(struct vnode *vp) { struct bufobj *bo; /* * The vnode has been marked for destruction, so free it. * * The vnode will be returned to the zone where it will * normally remain until it is needed for another vnode. We * need to cleanup (or verify that the cleanup has already * been done) any residual data left from its current use * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); /* * Paired with vgone. */ vn_seqc_write_end_free(vp); bo = &vp->v_bufobj; VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, ("Leaked inactivation")); VI_UNLOCK(vp); cache_assert_no_entries(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { /* * Use LK_NOWAIT to shut up witness about the lock. We may get * here while having another vnode locked when trying to * satisfy a lookup and needing to recycle. 
*/ VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT); destroy_vpollinfo(vp->v_pollinfo); VOP_UNLOCK(vp); vp->v_pollinfo = NULL; } vp->v_mountedhere = NULL; vp->v_unpcb = NULL; vp->v_rdev = NULL; vp->v_fifoinfo = NULL; vp->v_iflag = 0; vp->v_vflag = 0; bo->bo_flag = 0; vn_free(vp); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); mp = vp->v_mount; MNT_ILOCK(mp); VI_LOCK(vp); vp->v_mount = NULL; VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); /* * The caller expects the interlock to be still held. */ ASSERT_VI_LOCKED(vp, __func__); } static int insmntque1_int(struct vnode *vp, struct mount *mp, bool dtr) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); if ((mp->mnt_kern_flag & MNTK_UNLOCKED_INSMNTQUE) == 0) { ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); } else { KASSERT(!dtr, ("%s: can't have MNTK_UNLOCKED_INSMNTQUE and cleanup", __func__)); } /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. */ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } /* * Insert into list of vnodes for the new mount point, if available. * insmntque() reclaims the vnode on insertion failure, insmntque1() * leaves handling of the vnode to the caller. */ int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1_int(vp, mp, true)); } int insmntque1(struct vnode *vp, struct mount *mp) { return (insmntque1_int(vp, mp, false)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); do { error = BO_SYNC(bo, MNT_WAIT); } while (error == ERELOOKUP); if (error != 0) return (error); BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); return (EBUSY); } } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. 
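 * In particular, flushbuflist() returns EAGAIN when it had to drop
 * and reacquire the bufobj lock and the buffer list may have changed
 * underneath it; the loop below simply restarts until a full pass
 * completes without error.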
*/ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { BO_UNLOCK(bo); vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); BO_LOCK(bo); } } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: flush dirty failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { /* * If we are flushing both V_NORMAL and V_ALT buffers then * do not skip any buffers. If we are flushing only V_NORMAL * buffers then skip buffers marked as BX_ALTDATA. If we are * flushing only V_ALT buffers then skip buffers not marked * as BX_ALTDATA. */ if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { continue; } if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? 
*/ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp == NULL) break; nbp = gbincore(bo, lblkno); if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags) break; /* nbp invalid */ } return (retval); } int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) { struct buf *bp; int error; daddr_t lblkno; ASSERT_BO_LOCKED(bo); for (lblkno = startn;;) { again: bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); if (bp == NULL || bp->b_lblkno >= endn || bp->b_lblkno < startn) break; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); if (error != 0) { BO_RLOCK(bo); if (error == ENOLCK) goto again; return (error); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); lblkno = bp->b_lblkno + 1; if ((bp->b_flags & B_MANAGED) == 0) bremfree(bp); bp->b_flags |= B_RELBUF; /* * In the VMIO case, use the B_NOREUSE flag to hint that the * pages backing each buffer in the range are unlikely to be * reused. Dirty buffers will have the hint applied once * they've been written. */ if ((bp->b_flags & B_VMIO) != 0) bp->b_flags |= B_NOREUSE; brelse(bp); BO_RLOCK(bo); } return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; struct bufobj *bo; daddr_t startlbn; CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, vp, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ startlbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); bo = &vp->v_bufobj; restart_unlocked: BO_LOCK(bo); while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) ; if (length > 0) { /* * Write out vnode metadata, e.g. indirect blocks. */ restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno >= 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart_unlocked; VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * Invalidate the cached pages of a file's buffer within the range of block * numbers [startlbn, endlbn). 
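 * A sketch of the expected usage (hypothetical caller, not a
 * reference to any particular filesystem): code about to rewrite
 * blocks [a, b) of a locked vnode out of band would issue
 *
 *	v_inval_buf_range(vp, a, b, bo->bo_bsize);
 *
 * so that stale buffers and cached pages do not shadow the new
 * contents.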
*/ void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize) { struct bufobj *bo; off_t start, end; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); start = blksize * startlbn; end = blksize * endlbn; bo = &vp->v_bufobj; BO_LOCK(bo); MPASS(blksize == bo->bo_bsize); while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) ; BO_UNLOCK(bo); vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); } static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn) { struct buf *bp, *nbp; bool anyfreed; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); ASSERT_BO_LOCKED(bo); do { anyfreed = false; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || nbp->b_vp != vp || (nbp->b_flags & B_DELWRI) != 0)) return (EAGAIN); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) return (EAGAIN); } } while (anyfreed); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; b_xflags_t flags; flags = bp->b_xflags; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), ("%s: buffer %p has invalid queue state", __func__, bp)); if ((flags & BX_VNDIRTY) != 0) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_NOBUFS) == 0, ("buf_vlist_add: bo %p does not allow bufs", bo)); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. 
*/ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); } /* * Look up a buf using the buffer tries, without the bufobj lock. This relies * on SMR for safe lookup, and bufs being in a no-free zone to provide type * stability of the result. Like other lockless lookups, the found buf may * already be invalid by the time this function returns. */ struct buf * gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_UNLOCKED(bo); bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); buf_vlist_remove(bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. 
*/ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = bo2vnode(*bo); if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } MPASSERT(mp == NULL || (curthread->td_pflags & TDP_IGNSUSP) != 0 || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0, mp, ("suspended mp syncing vp %p", vp)); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining... "); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. 
*/ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. 
*/ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Move the buffer between the clean and dirty lists of its vnode. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; KASSERT((bp->b_flags & B_PAGING) == 0, ("%s: cannot reassign paging buffer %p", __func__, bp)); CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); BO_LOCK(bo); buf_vlist_remove(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } static void v_init_counters(struct vnode *vp) { VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, vp, ("%s called for an initialized vnode", __FUNCTION__)); ASSERT_VI_UNLOCKED(vp, __FUNCTION__); refcount_init(&vp->v_holdcnt, 1); refcount_init(&vp->v_usecount, 1); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VIRF_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * usecount is manipulated using atomics without holding any locks. * * holdcnt can be manipulated using atomics without holding any locks, * except when transitioning 1<->0, in which case the interlock is held. * * Consumers which don't guarantee liveness of the vnode can use SMR to * try to get a reference. Note this operation can fail since the vnode * may be awaiting getting freed by the time they get to it. 
*/ enum vgetstate vget_prep_smr(struct vnode *vp) { enum vgetstate vs; VFS_SMR_ASSERT_ENTERED(); if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { if (vhold_smr(vp)) vs = VGET_HOLDCNT; else vs = VGET_NONE; } return (vs); } enum vgetstate vget_prep(struct vnode *vp) { enum vgetstate vs; if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { vhold(vp); vs = VGET_HOLDCNT; } return (vs); } void vget_abort(struct vnode *vp, enum vgetstate vs) { switch (vs) { case VGET_USECOUNT: vrele(vp); break; case VGET_HOLDCNT: vdrop(vp); break; default: __assert_unreachable(); } } int vget(struct vnode *vp, int flags) { enum vgetstate vs; vs = vget_prep(vp); return (vget_finish(vp, flags, vs)); } int vget_finish(struct vnode *vp, int flags, enum vgetstate vs) { int error; if ((flags & LK_INTERLOCK) != 0) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); error = vn_lock(vp, flags); if (__predict_false(error != 0)) { vget_abort(vp, vs); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } vget_finish_ref(vp, vs); return (0); } void vget_finish_ref(struct vnode *vp, enum vgetstate vs) { int old; VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); if (vs == VGET_USECOUNT) return; /* * We hold the vnode. If the usecount is 0 it will be utilized to keep * the vnode around. Otherwise someone else lended their hold count and * we have to drop ours. */ old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); if (old != 0) { #ifdef INVARIANTS old = atomic_fetchadd_int(&vp->v_holdcnt, -1); VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); #else refcount_release(&vp->v_holdcnt); #endif } } void vref(struct vnode *vp) { enum vgetstate vs; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vs = vget_prep(vp); vget_finish_ref(vp, vs); } void vrefact(struct vnode *vp) { int old __diagused; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = refcount_acquire(&vp->v_usecount); VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); } void vlazy(struct vnode *vp) { struct mount *mp; VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); if ((vp->v_mflag & VMP_LAZYLIST) != 0) return; /* * We may get here for inactive routines after the vnode got doomed. */ if (VN_IS_DOOMED(vp)) return; mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); if ((vp->v_mflag & VMP_LAZYLIST) == 0) { vp->v_mflag |= VMP_LAZYLIST; TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize++; } mtx_unlock(&mp->mnt_listmtx); } static void vunlazy(struct vnode *vp) { struct mount *mp; ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * Don't remove the vnode from the lazy list if another thread * has increased the hold count. It may have re-enqueued the * vnode to the lazy list and is now responsible for its * removal. */ if (vp->v_holdcnt == 0) { vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; } mtx_unlock(&mp->mnt_listmtx); } /* * This routine is only meant to be called from vgonel prior to dooming * the vnode. 
*/ static void vunlazy_gone(struct vnode *vp) { struct mount *mp; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } } static void vdefer_inactive(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_holdcnt > 0, vp); if (VN_IS_DOOMED(vp)) { vdropl(vp); return; } if (vp->v_iflag & VI_DEFINACT) { VNPASS(vp->v_holdcnt > 1, vp); vdropl(vp); return; } if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; vdropl(vp); return; } vlazy(vp); vp->v_iflag |= VI_DEFINACT; VI_UNLOCK(vp); atomic_add_long(&deferred_inact, 1); } static void vdefer_inactive_unlocked(struct vnode *vp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } vdefer_inactive(vp); } enum vput_op { VRELE, VPUT, VUNREF }; /* * Handle ->v_usecount transitioning to 0. * * By releasing the last usecount we take ownership of the hold count which * provides liveness of the vnode, meaning we have to vdrop. * * For all vnodes we may need to perform inactive processing. It requires an * exclusive lock on the vnode, while it is legal to call here with only a * shared lock (or no locks). If locking the vnode in an expected manner fails, * inactive processing gets deferred to the syncer. * * XXX Some filesystems pass in an exclusively locked vnode and strongly depend * on the lock being held all the way until VOP_INACTIVE. This in particular * happens with UFS which adds half-constructed vnodes to the hash, where they * can be found by other code. */ static void vput_final(struct vnode *vp, enum vput_op func) { int error; bool want_unlock; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNPASS(vp->v_holdcnt > 0, vp); VI_LOCK(vp); /* * By the time we got here someone else might have transitioned * the count back to > 0. */ if (vp->v_usecount > 0) goto out; /* * If the vnode is doomed vgone already performed inactive processing * (if needed). */ if (VN_IS_DOOMED(vp)) goto out; if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) goto out; if (vp->v_iflag & VI_DOINGINACT) goto out; /* * Locking operations here will drop the interlock and possibly the * vnode lock, opening a window where the vnode can get doomed all the * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to * perform inactive. */ vp->v_iflag |= VI_OWEINACT; want_unlock = false; error = 0; switch (func) { case VRELE: switch (VOP_ISLOCKED(vp)) { case LK_EXCLUSIVE: break; case LK_EXCLOTHER: case 0: want_unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; default: /* * The lock has at least one sharer, but we have no way * to conclude whether this is us. Play it safe and * defer processing. 
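 * Leaving error set to EAGAIN below means the vdefer_inactive()
 * path is taken at the end of this function: unless the vnode got
 * doomed or re-referenced in the meantime it is tagged VI_DEFINACT
 * and the inactive processing is left for the syncer.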
*/ error = EAGAIN; break; } break; case VPUT: want_unlock = true; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (error == 0) { if (func == VUNREF) { VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp, ("recursive vunref")); vp->v_vflag |= VV_UNREF; } for (;;) { error = vinactive(vp); if (want_unlock) VOP_UNLOCK(vp); if (error != ERELOOKUP || !want_unlock) break; VOP_LOCK(vp, LK_EXCLUSIVE); } if (func == VUNREF) vp->v_vflag &= ~VV_UNREF; vdropl(vp); } else { vdefer_inactive(vp); } return; out: if (func == VPUT) VOP_UNLOCK(vp); vdropl(vp); } /* * Decrement ->v_usecount for a vnode. * * Releasing the last use count requires additional processing, see vput_final * above for details. * * Comment above each variant denotes lock state on entry and exit. */ /* * in: any * out: same as passed in */ void vrele(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VRELE); } /* * in: locked * out: unlocked */ void vput(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) { VOP_UNLOCK(vp); return; } vput_final(vp, VPUT); } /* * in: locked * out: locked */ void vunref(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VUNREF); } void vhold(struct vnode *vp) { int old; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); if (old == 0) vfs_freevnodes_dec(); } void vholdnz(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); #else atomic_add_int(&vp->v_holdcnt, 1); #endif } /* * Grab a hold count unless the vnode is freed. * * Only use this routine if vfs smr is the only protection you have against * freeing the vnode. * * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag * is not set. After the flag is set the vnode becomes immutable to anyone but * the thread which managed to set the flag. * * It may be tempting to replace the loop with: * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); * if (count & VHOLD_NO_SMR) { * backpedal and error out; * } * * However, while this is more performant, it hinders debugging by eliminating * the previously mentioned invariant. */ bool vhold_smr(struct vnode *vp) { int count; VFS_SMR_ASSERT_ENTERED(); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { if (count == 0) vfs_freevnodes_dec(); return (true); } } } /* * Hold a free vnode for recycling. * * Note: vnode_init references this comment. * * Attempts to recycle only need the global vnode list lock and have no use for * SMR. * * However, vnodes get inserted into the global list before they get fully * initialized and stay there until UMA decides to free the memory. 
This in * particular means the target can be found before it becomes usable and after * it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to * VHOLD_NO_SMR. * * Note: the vnode may gain more references after we transition the count 0->1. */ static bool vhold_recycle_free(struct vnode *vp) { int count; mtx_assert(&vnode_list_mtx, MA_OWNED); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (count > 0) { return (false); } if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) { vfs_freevnodes_dec(); return (true); } } } static void __noinline vdbatch_process(struct vdbatch *vd) { struct vnode *vp; int i; mtx_assert(&vd->lock, MA_OWNED); MPASS(curthread->td_pinned > 0); MPASS(vd->index == VDBATCH_SIZE); /* * Attempt to requeue the passed batch, but give up easily. * * Despite batching the mechanism is prone to transient *significant* * lock contention, where vnode_list_mtx becomes the primary bottleneck * if multiple CPUs get here (one real-world example is highly parallel * do-nothing make , which will stat *tons* of vnodes). Since it is * quasi-LRU (read: not that great even if fully honoured) provide an * option to just dodge the problem. Parties which don't like it are * welcome to implement something better. */ if (vnode_can_skip_requeue) { if (!mtx_trylock(&vnode_list_mtx)) { counter_u64_add(vnode_skipped_requeues, 1); critical_enter(); for (i = 0; i < VDBATCH_SIZE; i++) { vp = vd->tab[i]; vd->tab[i] = NULL; MPASS(vp->v_dbatchcpu != NOCPU); vp->v_dbatchcpu = NOCPU; } vd->index = 0; critical_exit(); return; } /* fallthrough to locked processing */ } else { mtx_lock(&vnode_list_mtx); } mtx_assert(&vnode_list_mtx, MA_OWNED); critical_enter(); for (i = 0; i < VDBATCH_SIZE; i++) { vp = vd->tab[i]; vd->tab[i] = NULL; TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); MPASS(vp->v_dbatchcpu != NOCPU); vp->v_dbatchcpu = NOCPU; } mtx_unlock(&vnode_list_mtx); vd->index = 0; critical_exit(); } static void vdbatch_enqueue(struct vnode *vp) { struct vdbatch *vd; ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); if (vp->v_dbatchcpu != NOCPU) { VI_UNLOCK(vp); return; } sched_pin(); vd = DPCPU_PTR(vd); mtx_lock(&vd->lock); MPASS(vd->index < VDBATCH_SIZE); MPASS(vd->tab[vd->index] == NULL); /* * A hack: we depend on being pinned so that we know what to put in * ->v_dbatchcpu. */ vp->v_dbatchcpu = curcpu; vd->tab[vd->index] = vp; vd->index++; VI_UNLOCK(vp); if (vd->index == VDBATCH_SIZE) vdbatch_process(vd); mtx_unlock(&vd->lock); sched_unpin(); } /* * This routine must only be called for vnodes which are about to be * deallocated. Supporting dequeue for arbitrary vndoes would require * validating that the locked batch matches. */ static void vdbatch_dequeue(struct vnode *vp) { struct vdbatch *vd; int i; short cpu; VNPASS(vp->v_type == VBAD || vp->v_type == VNON, vp); cpu = vp->v_dbatchcpu; if (cpu == NOCPU) return; vd = DPCPU_ID_PTR(cpu, vd); mtx_lock(&vd->lock); for (i = 0; i < vd->index; i++) { if (vd->tab[i] != vp) continue; vp->v_dbatchcpu = NOCPU; vd->index--; vd->tab[i] = vd->tab[vd->index]; vd->tab[vd->index] = NULL; break; } mtx_unlock(&vd->lock); /* * Either we dequeued the vnode above or the target CPU beat us to it. */ MPASS(vp->v_dbatchcpu == NOCPU); } /* * Drop the hold count of the vnode. 
If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VIRF_DOOMED) in which case we will free it. * * Because the vnode vm object keeps a hold reference on the vnode if * there is at least one resident non-cached page, the vnode cannot * leave the active list without the page cleanup done. */ static void __noinline vdropl_final(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(VN_IS_DOOMED(vp), vp); /* * Set the VHOLD_NO_SMR flag. * * We may be racing against vhold_smr. If they win we can just pretend * we never got this far, they will vdrop later. */ if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) { vfs_freevnodes_inc(); VI_UNLOCK(vp); /* * We lost the aforementioned race. Any subsequent access is * invalid as they might have managed to vdropl on their own. */ return; } /* * Don't bump freevnodes as this one is going away. */ freevnode(vp); } void vdrop(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (refcount_release_if_not_last(&vp->v_holdcnt)) return; VI_LOCK(vp); vdropl(vp); } static void __always_inline vdropl_impl(struct vnode *vp, bool enqueue) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (!refcount_release(&vp->v_holdcnt)) { VI_UNLOCK(vp); return; } VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp); VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); if (VN_IS_DOOMED(vp)) { vdropl_final(vp); return; } vfs_freevnodes_inc(); if (vp->v_mflag & VMP_LAZYLIST) { vunlazy(vp); } if (!enqueue) { VI_UNLOCK(vp); return; } /* * Also unlocks the interlock. We can't assert on it as we * released our hold and by now the vnode might have been * freed. */ vdbatch_enqueue(vp); } void vdropl(struct vnode *vp) { vdropl_impl(vp, true); } /* * vdrop a vnode when recycling * * This is a special case routine only to be used when recycling, differs from * regular vdrop by not requeieing the vnode on LRU. * * Consider a case where vtryrecycle continuously fails with all vnodes (due to * e.g., frozen writes on the filesystem), filling the batch and causing it to * be requeued. Then vnlru will end up revisiting the same vnodes. This is a * loop which can last for as long as writes are frozen. */ static void vdropl_recycle(struct vnode *vp) { vdropl_impl(vp, false); } static void vdrop_recycle(struct vnode *vp) { VI_LOCK(vp); vdropl_recycle(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. */ static int vinactivef(struct vnode *vp) { int error; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNPASS((vp->v_iflag & VI_DOINGINACT) == 0, vp); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are converted into the vnode's dirty * buffers, since these will no longer be checked once the * vnode is on the inactive list. * * The write-out of the dirty pages is asynchronous. At the * point that VOP_INACTIVE() is called, there could still be * pending I/O and dirty pages in the object. 
*/ if ((vp->v_vflag & VV_NOSYNC) == 0) vnode_pager_clean_async(vp); error = VOP_INACTIVE(vp); VI_LOCK(vp); VNPASS(vp->v_iflag & VI_DOINGINACT, vp); vp->v_iflag &= ~VI_DOINGINACT; return (error); } int vinactive(struct vnode *vp) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if ((vp->v_iflag & VI_OWEINACT) == 0) return (0); if (vp->v_iflag & VI_DOINGINACT) return (0); if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; return (0); } return (vinactivef(vp)); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { vnode_pager_clean_async(vp); do { error = VOP_FSYNC(vp, MNT_WAIT, td); } while (error == ERELOOKUP); if (error != 0) { VOP_UNLOCK(vp); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount <= 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vn_printf(vp, "vflush: busy vnode "); #endif } VOP_UNLOCK(vp); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. 
*/ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; VI_LOCK(vp); recycled = vrecyclel(vp); VI_UNLOCK(vp); return (recycled); } /* * vrecycle, with the vp interlock held. */ int vrecyclel(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * Notify upper mounts about reclaimed or unlinked vnode. */ void vfs_notify_upper(struct vnode *vp, enum vfs_notify_upper_type event) { struct mount *mp; struct mount_upper_node *ump; mp = atomic_load_ptr(&vp->v_mount); if (mp == NULL) return; if (TAILQ_EMPTY(&mp->mnt_notify)) return; MNT_ILOCK(mp); mp->mnt_upper_pending++; KASSERT(mp->mnt_upper_pending > 0, ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump->mp, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump->mp, vp); break; } MNT_ILOCK(mp); } mp->mnt_upper_pending--; if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && mp->mnt_upper_pending == 0) { mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; wakeup(&mp->mnt_uppers); } MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ static void vgonel(struct vnode *vp) { struct thread *td; struct mount *mp; vm_object_t object; bool active, doinginact, oweinact; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (VN_IS_DOOMED(vp)) { VNPASS(vn_get_state(vp) == VSTATE_DESTROYING || \ vn_get_state(vp) == VSTATE_DEAD, vp); return; } /* * Paired with freevnode. */ vn_seqc_write_begin_locked(vp); vunlazy_gone(vp); vn_irflag_set_locked(vp, VIRF_DOOMED); vn_set_state(vp, VSTATE_DESTROYING); /* * Check to see if the vnode is in use. If so, we have to * call VOP_CLOSE() and VOP_INACTIVE(). * * It could be that VOP_INACTIVE() requested reclamation, in * which case we should avoid recursion, so check * VI_DOINGINACT. This is not precise but good enough. */ active = vp->v_usecount > 0; oweinact = (vp->v_iflag & VI_OWEINACT) != 0; doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; /* * If we need to do inactive VI_OWEINACT will be set. */ if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vp->v_iflag &= ~VI_DEFINACT; vdropl(vp); } else { VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); VI_UNLOCK(vp); } cache_purge_vgone(vp); vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. 
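 * The order below is: VOP_CLOSE() if the vnode was in use, inactive
 * processing unless it is already in progress, buffer and VM object
 * teardown, and finally VOP_RECLAIM().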
*/ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (!doinginact) { do { if (oweinact || active) { VI_LOCK(vp); vinactivef(vp); oweinact = (vp->v_iflag & VI_OWEINACT) != 0; VI_UNLOCK(vp); } } while (oweinact); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); /* * For VMIO bufobj, BO_DEAD is set later, or in * vm_object_terminate() after the object's page queue is * flushed. */ object = vp->v_bufobj.bo_object; if (object == NULL) vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); /* * Handle the VM part. Tmpfs handles v_object on its own (the * OBJT_VNODE check). Nullfs or other bypassing filesystems * should not touch the object borrowed from the lower vnode * (the handle check). */ if (object != NULL && object->type == OBJT_VNODE && object->handle == vp) vnode_destroy_vobject(vp); /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p", vp)); /* * Clear the advisory locks and wake up waiting threads. */ if (vp->v_lockf != NULL) { (void)VOP_ADVLOCKPURGE(vp); vp->v_lockf = NULL; } /* * Delete from old mount point vnode list. */ if (vp->v_mount == NULL) { VI_LOCK(vp); } else { delmntque(vp); ASSERT_VI_LOCKED(vp, "vgonel 2"); } /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_type = VBAD; vn_set_state(vp, VSTATE_DEAD); } /* * Print out a description of a vnode. */ static const char *const vtypename[] = { [VNON] = "VNON", [VREG] = "VREG", [VDIR] = "VDIR", [VBLK] = "VBLK", [VCHR] = "VCHR", [VLNK] = "VLNK", [VSOCK] = "VSOCK", [VFIFO] = "VFIFO", [VBAD] = "VBAD", [VMARKER] = "VMARKER", }; _Static_assert(nitems(vtypename) == VLASTTYPE + 1, "vnode type name not added to vtypename"); static const char *const vstatename[] = { [VSTATE_UNINITIALIZED] = "VSTATE_UNINITIALIZED", [VSTATE_CONSTRUCTED] = "VSTATE_CONSTRUCTED", [VSTATE_DESTROYING] = "VSTATE_DESTROYING", [VSTATE_DEAD] = "VSTATE_DEAD", }; _Static_assert(nitems(vstatename) == VLASTSTATE + 1, "vnode state name not added to vstatename"); _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, "new hold count flag not added to vn_printf"); void vn_printf(struct vnode *vp, const char *fmt, ...) 
{ va_list ap; char buf[256], buf2[16]; u_long flags; u_int holdcnt; short irflag; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("type %s state %s op %p\n", vtypename[vp->v_type], vstatename[vp->v_state], vp->v_op); holdcnt = atomic_load_int(&vp->v_holdcnt); printf(" usecount %d, writecount %d, refcount %d seqc users %d", vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, vp->v_seqc_users); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); break; case VCHR: printf(" rdev %p\n", vp->v_rdev); break; case VSOCK: printf(" socket %p\n", vp->v_unpcb); break; case VFIFO: printf(" fifoinfo %p\n", vp->v_fifoinfo); break; default: printf("\n"); break; } buf[0] = '\0'; buf[1] = '\0'; if (holdcnt & VHOLD_NO_SMR) strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); printf(" hold count flags (%s)\n", buf + 1); buf[0] = '\0'; buf[1] = '\0'; irflag = vn_irflag_read(vp); if (irflag & VIRF_DOOMED) strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); if (irflag & VIRF_PGREAD) strlcat(buf, "|VIRF_PGREAD", sizeof(buf)); if (irflag & VIRF_MOUNTPOINT) strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf)); if (irflag & VIRF_TEXT_REF) strlcat(buf, "|VIRF_TEXT_REF", sizeof(buf)); flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT | VIRF_TEXT_REF); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_VMSIZEVNLOCK) strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); if (vp->v_vflag & VV_READLINK) strlcat(buf, "|VV_READLINK", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_DELETED | VV_MD | VV_FORCEINSMQ | VV_READLINK); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); if (vp->v_iflag & VI_DEFINACT) strlcat(buf, "|VI_DEFINACT", sizeof(buf)); if (vp->v_iflag & VI_FOPENING) strlcat(buf, "|VI_FOPENING", sizeof(buf)); flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT | VI_OWEINACT | VI_DEFINACT | VI_FOPENING); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_mflag & VMP_LAZYLIST) strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); flags = vp->v_mflag & ~(VMP_LAZYLIST); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); printf("\n"); if 
(vp->v_object != NULL)
		printf(" v_object %p ref %d pages %d "
		    "cleanbuf %d dirtybuf %d\n",
		    vp->v_object, vp->v_object->ref_count,
		    vp->v_object->resident_page_count,
		    vp->v_bufobj.bo_clean.bv_cnt,
		    vp->v_bufobj.bo_dirty.bv_cnt);
	printf(" ");
	lockmgr_printinfo(vp->v_vnlock);
	if (vp->v_data != NULL)
		VOP_PRINT(vp);
}

#ifdef DDB
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND_FLAGS(lockedvnods, lockedvnodes, DB_CMD_MEMSAFE)
{
	struct mount *mp;
	struct vnode *vp;

	/*
	 * Note: because this is DDB, we can't obey the locking semantics
	 * for these structures, which means we could catch an inconsistent
	 * state and dereference a nasty pointer. Not much to be done
	 * about that.
	 */
	db_printf("Locked vnodes\n");
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
				vn_printf(vp, "vnode ");
		}
	}
}

/*
 * Show details about the given vnode.
 */
DB_SHOW_COMMAND(vnode, db_show_vnode)
{
	struct vnode *vp;

	if (!have_addr)
		return;
	vp = (struct vnode *)addr;
	vn_printf(vp, "vnode ");
}

/*
 * Show details about the given mount point.
 */
DB_SHOW_COMMAND(mount, db_show_mount)
{
	struct mount *mp;
	struct vfsopt *opt;
	struct statfs *sp;
	struct vnode *vp;
	char buf[512];
	uint64_t mflags;
	u_int flags;

	if (!have_addr) {
		/* No address given, print short info about all mount points. */
		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
			db_printf("%p %s on %s (%s)\n", mp,
			    mp->mnt_stat.f_mntfromname,
			    mp->mnt_stat.f_mntonname,
			    mp->mnt_stat.f_fstypename);
			if (db_pager_quit)
				break;
		}
		db_printf("\nMore info: show mount <addr>\n");
		return;
	}

	mp = (struct mount *)addr;
	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);

	buf[0] = '\0';
	mflags = mp->mnt_flag;
#define MNT_FLAG(flag) do { \
	if (mflags & (flag)) { \
		if (buf[0] != '\0') \
			strlcat(buf, ", ", sizeof(buf)); \
		strlcat(buf, (#flag) + 4, sizeof(buf)); \
		mflags &= ~(flag); \
	} \
} while (0)
	MNT_FLAG(MNT_RDONLY);
	MNT_FLAG(MNT_SYNCHRONOUS);
	MNT_FLAG(MNT_NOEXEC);
	MNT_FLAG(MNT_NOSUID);
	MNT_FLAG(MNT_NFS4ACLS);
	MNT_FLAG(MNT_UNION);
	MNT_FLAG(MNT_ASYNC);
	MNT_FLAG(MNT_SUIDDIR);
	MNT_FLAG(MNT_SOFTDEP);
	MNT_FLAG(MNT_NOSYMFOLLOW);
	MNT_FLAG(MNT_GJOURNAL);
	MNT_FLAG(MNT_MULTILABEL);
	MNT_FLAG(MNT_ACLS);
	MNT_FLAG(MNT_NOATIME);
	MNT_FLAG(MNT_NOCLUSTERR);
	MNT_FLAG(MNT_NOCLUSTERW);
	MNT_FLAG(MNT_SUJ);
	MNT_FLAG(MNT_EXRDONLY);
	MNT_FLAG(MNT_EXPORTED);
	MNT_FLAG(MNT_DEFEXPORTED);
	MNT_FLAG(MNT_EXPORTANON);
	MNT_FLAG(MNT_EXKERB);
	MNT_FLAG(MNT_EXPUBLIC);
	MNT_FLAG(MNT_LOCAL);
	MNT_FLAG(MNT_QUOTA);
	MNT_FLAG(MNT_ROOTFS);
	MNT_FLAG(MNT_USER);
	MNT_FLAG(MNT_IGNORE);
	MNT_FLAG(MNT_UPDATE);
	MNT_FLAG(MNT_DELEXPORT);
	MNT_FLAG(MNT_RELOAD);
	MNT_FLAG(MNT_FORCE);
	MNT_FLAG(MNT_SNAPSHOT);
	MNT_FLAG(MNT_BYFSID);
#undef MNT_FLAG
	if (mflags != 0) {
		if (buf[0] != '\0')
			strlcat(buf, ", ", sizeof(buf));
		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
		    "0x%016jx", mflags);
	}
	db_printf(" mnt_flag = %s\n", buf);

	buf[0] = '\0';
	flags = mp->mnt_kern_flag;
#define MNT_KERN_FLAG(flag) do { \
	if (flags & (flag)) { \
		if (buf[0] != '\0') \
			strlcat(buf, ", ", sizeof(buf)); \
		strlcat(buf, (#flag) + 5, sizeof(buf)); \
		flags &= ~(flag); \
	} \
} while (0)
	MNT_KERN_FLAG(MNTK_UNMOUNTF);
	MNT_KERN_FLAG(MNTK_ASYNC);
	MNT_KERN_FLAG(MNTK_SOFTDEP);
	MNT_KERN_FLAG(MNTK_NOMSYNC);
	MNT_KERN_FLAG(MNTK_DRAINING);
	MNT_KERN_FLAG(MNTK_REFEXPIRE);
	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
	MNT_KERN_FLAG(MNTK_NO_IOPF);
	MNT_KERN_FLAG(MNTK_RECURSE);
MNT_KERN_FLAG(MNTK_UPPER_WAITER); MNT_KERN_FLAG(MNTK_UNLOCKED_INSMNTQUE); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_VMSETSIZE_BUG); MNT_KERN_FLAG(MNTK_FPLOOKUP); MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_NULL_NOCACHE); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_lazyvnodelistsize = %d\n", mp->mnt_lazyvnodelistsize); db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. 
*/ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ static void unmount_or_warn(struct mount *mp) { int error; error = dounmount(mp, MNT_FORCE, curthread); if (error != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp, *tmp; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); /* * Since this only runs when rebooting, it is not interlocked. */ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { vfs_ref(mp); /* * Forcibly unmounting "/dev" before "/" would prevent clean * unmount of the latter. */ if (mp == rootdevmp) continue; unmount_or_warn(mp); } if (rootdevmp != NULL) unmount_or_warn(rootdevmp); } static void vfs_deferred_inactive(struct vnode *vp, int lkflags) { ASSERT_VI_LOCKED(vp, __func__); VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } if (vn_lock(vp, lkflags) == 0) { VI_LOCK(vp); vinactive(vp); VOP_UNLOCK(vp); vdropl(vp); return; } vdefer_inactive_unlocked(vp); } static int vfs_periodic_inactive_filter(struct vnode *vp, void *arg) { return (vp->v_iflag & VI_DEFINACT); } static void __noinline vfs_periodic_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; int lkflags; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) lkflags |= LK_NOWAIT; MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { if ((vp->v_iflag & VI_DEFINACT) == 0) { VI_UNLOCK(vp); continue; } vp->v_iflag &= ~VI_DEFINACT; vfs_deferred_inactive(vp, lkflags); } } static inline bool vfs_want_msync(struct vnode *vp) { struct vm_object *obj; /* * This test may be performed without any locks held. * We rely on vm_object's type stability. 
 */
	if (vp->v_vflag & VV_NOSYNC)
		return (false);
	obj = vp->v_object;
	return (obj != NULL && vm_object_mightbedirty(obj));
}

static int
vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
{
	if (vp->v_vflag & VV_NOSYNC)
		return (false);
	if (vp->v_iflag & VI_DEFINACT)
		return (true);
	return (vfs_want_msync(vp));
}

static void __noinline
vfs_periodic_msync_inactive(struct mount *mp, int flags)
{
	struct vnode *vp, *mvp;
	int lkflags;
	bool seen_defer;

	lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
	if (flags != MNT_WAIT)
		lkflags |= LK_NOWAIT;

	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
		seen_defer = false;
		if (vp->v_iflag & VI_DEFINACT) {
			vp->v_iflag &= ~VI_DEFINACT;
			seen_defer = true;
		}
		if (!vfs_want_msync(vp)) {
			if (seen_defer)
				vfs_deferred_inactive(vp, lkflags);
			else
				VI_UNLOCK(vp);
			continue;
		}
		if (vget(vp, lkflags) == 0) {
			if ((vp->v_vflag & VV_NOSYNC) == 0) {
				if (flags == MNT_WAIT)
					vnode_pager_clean_sync(vp);
				else
					vnode_pager_clean_async(vp);
			}
			vput(vp);
			if (seen_defer)
				vdrop(vp);
		} else {
			if (seen_defer)
				vdefer_inactive_unlocked(vp);
		}
	}
}

void
vfs_periodic(struct mount *mp, int flags)
{
	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);

	if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
		vfs_periodic_inactive(mp, flags);
	else
		vfs_periodic_msync_inactive(mp, flags);
}

static void
destroy_vpollinfo_free(struct vpollinfo *vi)
{
	knlist_destroy(&vi->vpi_selinfo.si_note);
	mtx_destroy(&vi->vpi_lock);
	free(vi, M_VNODEPOLL);
}

static void
destroy_vpollinfo(struct vpollinfo *vi)
{
	knlist_clear(&vi->vpi_selinfo.si_note, 1);
	seldrain(&vi->vpi_selinfo);
	destroy_vpollinfo_free(vi);
}

/*
 * Initialize per-vnode helper structure to hold poll-related state.
 */
void
v_addpollinfo(struct vnode *vp)
{
	struct vpollinfo *vi;

	if (vp->v_pollinfo != NULL)
		return;
	vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
	    vfs_knlunlock, vfs_knl_assert_lock);
	VI_LOCK(vp);
	if (vp->v_pollinfo != NULL) {
		VI_UNLOCK(vp);
		destroy_vpollinfo_free(vi);
		return;
	}
	vp->v_pollinfo = vi;
	VI_UNLOCK(vp);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode. Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions. (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{
	v_addpollinfo(vp);
	mtx_lock(&vp->v_pollinfo->vpi_lock);
	if (vp->v_pollinfo->vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo->vpi_revents;
		vp->v_pollinfo->vpi_revents &= ~events;
		mtx_unlock(&vp->v_pollinfo->vpi_lock);
		return (events);
	}
	vp->v_pollinfo->vpi_events |= events;
	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
	mtx_unlock(&vp->v_pollinfo->vpi_lock);
	return (0);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close ((int (*)(struct vop_close_args *))nullop)
static int sync_fsync(struct vop_fsync_args *);
static int sync_inactive(struct vop_inactive_args *);
static int sync_reclaim(struct vop_reclaim_args *);

static struct vop_vector sync_vnodeops = {
	.vop_bypass = VOP_EOPNOTSUPP,
	.vop_close = sync_close,
	.vop_fsync = sync_fsync,
	.vop_getwritemount = vop_stdgetwritemount,
	.vop_inactive = sync_inactive,
	.vop_need_inactive = vop_stdneed_inactive,
	.vop_reclaim = sync_reclaim,
	.vop_lock1 = vop_stdlock,
	.vop_unlock = vop_stdunlock,
	.vop_islocked = vop_stdislocked,
	.vop_fplookup_vexec = VOP_EAGAIN,
	.vop_fplookup_symlink = VOP_EAGAIN,
};
VFS_VOP_VECTOR_REGISTER(sync_vnodeops);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
void
vfs_allocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;
	struct bufobj *bo;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: getnewvnode() failed");
	vp->v_type = VNON;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	vp->v_vflag |= VV_FORCEINSMQ;
	error = insmntque1(vp, mp);
	if (error != 0)
		panic("vfs_allocate_syncvnode: insmntque() failed");
	vp->v_vflag &= ~VV_FORCEINSMQ;
	vn_set_state(vp, VSTATE_CONSTRUCTED);
	VOP_UNLOCK(vp);
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
	mtx_lock(&sync_mtx);
	sync_vnode_count++;
	if (mp->mnt_syncer == NULL) {
		mp->mnt_syncer = vp;
		vp = NULL;
	}
	mtx_unlock(&sync_mtx);
	BO_UNLOCK(bo);
	if (vp != NULL) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		vgone(vp);
		vput(vp);
	}
}

void
vfs_deallocate_syncvnode(struct mount *mp)
{
	struct vnode *vp;

	mtx_lock(&sync_mtx);
	vp = mp->mnt_syncer;
	if (vp != NULL)
		mp->mnt_syncer = NULL;
	mtx_unlock(&sync_mtx);
	if (vp != NULL)
		vrele(vp);
}

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(struct vop_fsync_args *ap)
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	int error, save;
	struct bufobj *bo;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	bo = &syncvp->v_bufobj;
	BO_LOCK(bo);
	vn_syncer_add_to_worklist(bo, syncdelay);
	BO_UNLOCK(bo);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	if (vfs_busy(mp, MBF_NOWAIT) != 0)
		return (0);
	VOP_UNLOCK(syncvp);
	save = curthread_pflags_set(TDP_SYNCIO);
	/*
	 * The filesystem at hand may be idle with free vnodes stored in the
	 * batch. Return them instead of letting them stay there indefinitely.
	 */
	vfs_periodic(mp, MNT_NOWAIT);
	error = VFS_SYNC(mp, MNT_LAZY);
	curthread_pflags_restore(save);
	vn_lock(syncvp, LK_EXCLUSIVE | LK_RETRY);
	vfs_unbusy(mp);
	return (error);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(struct vop_inactive_args *ap)
{
	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected by sync_mtx.
*/ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } int vn_need_pageq_flush(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)); } /* * Check if vnode represents a disk device */ bool vn_isdisk_error(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: *errp = error; return (error == 0); } bool vn_isdisk(struct vnode *vp) { int error; return (vn_isdisk_error(vp, &error)); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) { int error; VFS_SMR_ASSERT_ENTERED(); /* Check the owner. */ if (cred->cr_uid == file_uid) { if (file_mode & S_IXUSR) return (0); goto out_error; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) return (0); goto out_error; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) return (0); out_error: /* * Permission check failed, but it is possible denial will get overwritten * (e.g., when root is traversing through a 700 directory owned by someone * else). * * vaccess() calls priv_check_cred which in turn can descent into MAC * modules overriding this result. It's quite unclear what semantics * are allowed for them to operate, thus for safety we don't call them * from within the SMR section. This also means if any such modules * are present, we have to let the regular lookup decide. */ error = priv_check_cred_vfs_lookup_nomac(cred); switch (error) { case 0: return (0); case EAGAIN: /* * MAC modules present. */ return (EAGAIN); case EPERM: return (EACCES); default: return (error); } } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, and credentials. * Returns 0 on success, or an errno on failure. */ int vaccess(__enum_uint8(vtype) type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ dac_granted = 0; /* Check the owner. 
*/ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. 
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 0, "Print vnode details on lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_vnode) vn_printf(vp, "vnode "); if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; #ifdef WITNESS if ((vp->v_irflag & VIRF_CROSSMP) == 0 && witness_is_owned(&vp->v_vnlock->lock_object) == -1) #else int locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) #endif vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; #ifdef WITNESS if ((vp->v_irflag & VIRF_CROSSMP) == 0 && witness_is_owned(&vp->v_vnlock->lock_object) == 1) #else if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) #endif vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (KERNEL_PANICKED() || vp == NULL) return; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif /* * It may be tempting to add vn_seqc_write_begin/end calls here and * in vop_rename_post but that's not going to work out since some * filesystems relookup vnodes mid-rename. This is probably a bug. * * For now filesystems are expected to do the relevant calls after they * decide what vnodes to operate on. 
*/ if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_vexec_debugpost(void *ap, int rc) { struct vop_fplookup_vexec_args *a; struct vnode *vp; a = ap; vp = a->a_vp; VFS_SMR_ASSERT_ENTERED(); if (rc == EOPNOTSUPP) VNPASS(VN_IS_DOOMED(vp), vp); } void vop_fplookup_symlink_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } static void vop_fsync_debugprepost(struct vnode *vp, const char *name) { if (vp->v_type == VCHR) ; /* * The shared vs. exclusive locking policy for fsync() * is actually determined by vp's write mount as indicated * by VOP_GETWRITEMOUNT(), which for stacked filesystems * may not be the same as vp->v_mount. However, if the * underlying filesystem which really handles the fsync() * supports shared locking, the stacked filesystem must also * be prepared for its VOP_FSYNC() operation to be called * with only a shared lock. On the other hand, if the * stacked filesystem claims support for shared write * locking but the underlying filesystem does not, and the * caller incorrectly uses a shared lock, this condition * should still be caught when the stacked filesystem * invokes VOP_FSYNC() on the underlying filesystem. */ else if (MNT_SHARED_WRITES(vp->v_mount)) ASSERT_VOP_LOCKED(vp, name); else ASSERT_VOP_ELOCKED(vp, name); } void vop_fsync_debugpre(void *a) { struct vop_fsync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fsync_debugpost(void *a, int rc __unused) { struct vop_fsync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fdatasync_debugpre(void *a) { struct vop_fdatasync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_fdatasync_debugpost(void *a, int rc __unused) { struct vop_fdatasync_args *ap; ap = a; vop_fsync_debugprepost(ap->a_vp, "fsync"); } void vop_strategy_debugpre(void *ap) { struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. 
*/ if ((bp->b_flags & B_CLUSTER) != 0) return; if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } } void vop_lock_debugpre(void *ap) { struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_lock_debugpost(void *ap, int rc) { struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_unlock_debugpre(void *ap) { struct vop_unlock_args *a = ap; struct vnode *vp = a->a_vp; VNPASS(vn_get_state(vp) != VSTATE_UNINITIALIZED, vp); ASSERT_VOP_LOCKED(vp, "VOP_UNLOCK"); } void vop_need_inactive_debugpre(void *ap) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } void vop_need_inactive_debugpost(void *ap, int rc) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } #endif void vop_create_pre(void *ap) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_create_post(void *ap, int rc) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_whiteout_pre(void *ap) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_whiteout_post(void *ap, int rc) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); } void vop_deleteextattr_pre(void *ap) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_pre(void *ap) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_begin(vp); vn_seqc_write_begin(tdvp); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_end(vp); vn_seqc_write_end(tdvp); if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); } } void vop_mkdir_pre(void *ap) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); } #ifdef DEBUG_VFS_LOCKS void vop_mkdir_debugpost(void *ap, int rc) { struct vop_mkdir_args *a; a = ap; if (!rc) cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp); } #endif void vop_mknod_pre(void *ap) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { struct vop_reclaim_args *a; struct vnode *vp; a = ap; vp = a->a_vp; ASSERT_VOP_IN_SEQC(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); } void vop_remove_pre(void *ap) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; 
vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; long hint; if (!rc) { hint = NOTE_WRITE; if (a->a_fdvp == a->a_tdvp) { if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } else { hint |= NOTE_EXTEND; if (a->a_fvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint &= ~NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_pre(void *ap) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { vp->v_vflag |= VV_UNLINKED; VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_setattr_pre(void *ap) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_setacl_pre(void *ap) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setacl_post(void *ap, int rc __unused) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); } void vop_setextattr_pre(void *ap) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_symlink_pre(void *ap) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); } void vop_close_post(void *ap, int rc) { struct vop_close_args *a = ap; if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? 
NOTE_CLOSE_WRITE : NOTE_CLOSE); } } void vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } void vop_read_pgcache_post(void *ap, int rc) { struct vop_read_pgcache_args *a = ap; if (!rc) VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ); } void vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); -struct filterops fs_filtops = { +const struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= kn->kn_sfflags & hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? 
*/ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); -static struct filterops vfsread_filtops = { +static const struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; -static struct filterops vfswrite_filtops = { +static const struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; -static struct filterops vfsvnode_filtops = { +static const struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp); } static void vfs_knl_assert_lock(void *arg, int what) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; if (what == LA_LOCKED) ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); else ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; KASSERT(vp->v_type != VFIFO || (kn->kn_filter != EVFILT_READ && kn->kn_filter != EVFILT_WRITE), ("READ/WRITE filter on a FIFO leaked through")); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; off_t size; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (vn_getsize_locked(vp, &size, curthread->td_ucred) != 0) return (0); VI_LOCK(vp); kn->kn_data = size - kn->kn_fp->f_offset; res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. 
*/ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(uint64_t), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; *ap->a_ncookies += 1; return (0); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * Clear out a doomed vnode (if any) and replace it with a new one as long * as the fs is not being unmounted. Return the root vnode to the caller. */ static int __noinline vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; restart: if (mp->mnt_rootvnode != NULL) { MNT_ILOCK(mp); vp = mp->mnt_rootvnode; if (vp != NULL) { if (!VN_IS_DOOMED(vp)) { vrefact(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, flags); if (error == 0) { *vpp = vp; return (0); } vrele(vp); goto restart; } /* * Clear the old one. 
*/ mp->mnt_rootvnode = NULL; } MNT_IUNLOCK(mp); if (vp != NULL) { vfs_op_barrier_wait(mp); vrele(vp); } } error = VFS_CACHEDROOT(mp, flags, vpp); if (error != 0) return (error); if (mp->mnt_vfs_ops == 0) { MNT_ILOCK(mp); if (mp->mnt_vfs_ops != 0) { MNT_IUNLOCK(mp); return (0); } if (mp->mnt_rootvnode == NULL) { vrefact(*vpp); mp->mnt_rootvnode = *vpp; } else { if (mp->mnt_rootvnode != *vpp) { if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { panic("%s: mismatch between vnode returned " " by VFS_CACHEDROOT and the one cached " " (%p != %p)", __func__, *vpp, mp->mnt_rootvnode); } } } MNT_IUNLOCK(mp); } return (0); } int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) { struct mount_pcpu *mpcpu; struct vnode *vp; int error; if (!vfs_op_thread_enter(mp, mpcpu)) return (vfs_cache_root_fallback(mp, flags, vpp)); vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL || VN_IS_DOOMED(vp)) { vfs_op_thread_exit(mp, mpcpu); return (vfs_cache_root_fallback(mp, flags, vpp)); } vrefact(vp); vfs_op_thread_exit(mp, mpcpu); error = vn_lock(vp, flags); if (error != 0) { vrele(vp); return (vfs_cache_root_fallback(mp, flags, vpp)); } *vpp = vp; return (0); } struct vnode * vfs_cache_root_clear(struct mount *mp) { struct vnode *vp; /* * ops > 0 guarantees there is nobody who can see this vnode */ MPASS(mp->mnt_vfs_ops > 0); vp = mp->mnt_rootvnode; if (vp != NULL) vn_seqc_write_begin(vp); mp->mnt_rootvnode = NULL; return (vp); } void vfs_cache_root_set(struct mount *mp, struct vnode *vp) { MPASS(mp->mnt_vfs_ops > 0); vrefact(vp); mp->mnt_rootvnode = vp; } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH. */ struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; maybe_yield(); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; vp = TAILQ_NEXT(vp, v_nmntvnodes)) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. 
*/ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h */ static void mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * Relock the mp mount vnode list lock with the vp vnode interlock in the * conventional lock order during mnt_vnode_next_lazy iteration. * * On entry, the mount vnode list lock is held and the vnode interlock is not. * The list lock is dropped and reacquired. On success, both locks are held. * On failure, the mount vnode list lock is held but the vnode interlock is * not, and the procedure may have yielded. */ static bool mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, struct vnode *vp) { VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, ("%s: bad marker", __func__)); VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, ("%s: inappropriate vnode", __func__)); ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); /* * Note we may be racing against vdrop which transitioned the hold * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, * if we are the only user after we get the interlock we will just * vdrop. */ vhold(vp); mtx_unlock(&mp->mnt_listmtx); VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); goto out_lost; } VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * There is nothing to do if we are the last user. */ if (!refcount_release_if_not_last(&vp->v_holdcnt)) goto out_lost; mtx_lock(&mp->mnt_listmtx); return (true); out_lost: vdropl(vp); maybe_yield(); mtx_lock(&mp->mnt_listmtx); return (false); } static struct vnode * mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_lazylist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } /* * See if we want to process the vnode. Note we may encounter a * long string of vnodes we don't care about and hog the list * as a result. Check for it and requeue the marker. */ VNPASS(!VN_IS_DOOMED(vp), vp); if (!cb(vp, cbarg)) { if (!should_yield()) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); goto restart; } /* * Try-lock because this is the wrong lock order. 
*/ if (!VI_TRYLOCK(vp) && !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) goto restart; KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the lazy list %p %p", vp, mp)); VNPASS(vp->v_mount == mp, vp); VNPASS(!VN_IS_DOOMED(vp), vp); break; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); /* Check if we are done */ if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); ASSERT_VI_LOCKED(vp, "lazy iter"); return (vp); } struct vnode * __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { maybe_yield(); mtx_lock(&mp->mnt_listmtx); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } struct vnode * __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) return (NULL); *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); mtx_lock(&mp->mnt_listmtx); vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); } int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) { if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; return (0); } return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread)); } /* * Do not use this variant unless you have means other than the hold count * to prevent the vnode from getting freed. */ void vn_seqc_write_begin_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vp->v_seqc_users >= 0, vp); vp->v_seqc_users++; if (vp->v_seqc_users == 1) seqc_sleepable_write_begin(&vp->v_seqc); } void vn_seqc_write_begin(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_end_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users > 0, vp); vp->v_seqc_users--; if (vp->v_seqc_users == 0) seqc_sleepable_write_end(&vp->v_seqc); } void vn_seqc_write_end(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_end_locked(vp); VI_UNLOCK(vp); } /* * Special case handling for allocating and freeing vnodes. * * The counter remains unchanged on free so that a doomed vnode will * keep testing as in modify as long as it is accessible with SMR. 
*/ static void vn_seqc_init(struct vnode *vp) { vp->v_seqc = 0; vp->v_seqc_users = 0; } static void vn_seqc_write_end_free(struct vnode *vp) { VNPASS(seqc_in_modify(vp->v_seqc), vp); VNPASS(vp->v_seqc_users == 1, vp); } void vn_irflag_set_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & toset) == 0, vp, ("%s: some of the passed flags already set (have %d, passed %d)\n", __func__, flags, toset)); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_set_cond_locked(struct vnode *vp, short toset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); atomic_store_short(&vp->v_irflag, flags | toset); } void vn_irflag_set_cond(struct vnode *vp, short toset) { VI_LOCK(vp); vn_irflag_set_cond_locked(vp, toset); VI_UNLOCK(vp); } void vn_irflag_unset_locked(struct vnode *vp, short tounset) { short flags; ASSERT_VI_LOCKED(vp, __func__); flags = vn_irflag_read(vp); VNASSERT((flags & tounset) == tounset, vp, ("%s: some of the passed flags not set (have %d, passed %d)\n", __func__, flags, tounset)); atomic_store_short(&vp->v_irflag, flags & ~tounset); } void vn_irflag_unset(struct vnode *vp, short tounset) { VI_LOCK(vp); vn_irflag_unset_locked(vp, tounset); VI_UNLOCK(vp); } int vn_getsize_locked(struct vnode *vp, off_t *size, struct ucred *cred) { struct vattr vattr; int error; ASSERT_VOP_LOCKED(vp, __func__); error = VOP_GETATTR(vp, &vattr, cred); if (__predict_true(error == 0)) { if (vattr.va_size <= OFF_MAX) *size = vattr.va_size; else error = EFBIG; } return (error); } int vn_getsize(struct vnode *vp, off_t *size, struct ucred *cred) { int error; VOP_LOCK(vp, LK_SHARED); error = vn_getsize_locked(vp, size, cred); VOP_UNLOCK(vp); return (error); } #ifdef INVARIANTS void vn_set_state_validate(struct vnode *vp, __enum_uint8(vstate) state) { switch (vp->v_state) { case VSTATE_UNINITIALIZED: switch (state) { case VSTATE_CONSTRUCTED: case VSTATE_DESTROYING: return; default: break; } break; case VSTATE_CONSTRUCTED: ASSERT_VOP_ELOCKED(vp, __func__); switch (state) { case VSTATE_DESTROYING: return; default: break; } break; case VSTATE_DESTROYING: ASSERT_VOP_ELOCKED(vp, __func__); switch (state) { case VSTATE_DEAD: return; default: break; } break; case VSTATE_DEAD: switch (state) { case VSTATE_UNINITIALIZED: return; default: break; } break; } vn_printf(vp, "invalid state transition %d -> %d\n", vp->v_state, state); panic("invalid state transition %d -> %d\n", vp->v_state, state); } #endif diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 53a2ddf94862..c28d6e66853f 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1,4272 +1,4272 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2012 Konstantin Belousov * Copyright (c) 2013, 2014 The FreeBSD Foundation * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 */ #include #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif static fo_rdwr_t vn_read; static fo_rdwr_t vn_write; static fo_rdwr_t vn_io_fault; static fo_truncate_t vn_truncate; static fo_ioctl_t vn_ioctl; static fo_poll_t vn_poll; static fo_kqfilter_t vn_kqfilter; static fo_close_t vn_closefile; static fo_mmap_t vn_mmap; static fo_fallocate_t vn_fallocate; static fo_fspacectl_t vn_fspacectl; -struct fileops vnops = { +const struct fileops vnops = { .fo_read = vn_io_fault, .fo_write = vn_io_fault, .fo_truncate = vn_truncate, .fo_ioctl = vn_ioctl, .fo_poll = vn_poll, .fo_kqfilter = vn_kqfilter, .fo_stat = vn_statfile, .fo_close = vn_closefile, .fo_chmod = vn_chmod, .fo_chown = vn_chown, .fo_sendfile = vn_sendfile, .fo_seek = vn_seek, .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = vn_mmap, .fo_fallocate = vn_fallocate, .fo_fspacectl = vn_fspacectl, .fo_cmp = vn_cmp, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; const u_int io_hold_cnt = 16; static int vn_io_fault_enable = 1; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN, &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance"); static int vn_io_fault_prefault = 0; SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN, &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting"); static int vn_io_pgcache_read_enable = 1; SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN, &vn_io_pgcache_read_enable, 0, "Enable copying from page cache for reads, avoiding fs"); static u_long vn_io_faults_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD, &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers"); static int 
vfs_allow_read_dir = 0; SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW, &vfs_allow_read_dir, 0, "Enable read(2) of directory by root for filesystems that support it"); /* * Returns true if vn_io_fault mode of handling the i/o request should * be used. */ static bool do_vn_io_fault(struct vnode *vp, struct uio *uio) { struct mount *mp; return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG && (mp = vp->v_mount) != NULL && (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable); } /* * Structure used to pass arguments to vn_io_fault1(), to do either * file- or vnode-based I/O calls. */ struct vn_io_fault_args { enum { VN_IO_FAULT_FOP, VN_IO_FAULT_VOP } kind; struct ucred *cred; int flags; union { struct fop_args_tag { struct file *fp; fo_rdwr_t *doio; } fop_args; struct vop_args_tag { struct vnode *vp; } vop_args; } args; }; static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td); int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp) { struct thread *td = curthread; return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp)); } static uint64_t open2nameif(int fmode, u_int vn_open_flags) { uint64_t res; res = ISOPEN | LOCKLEAF; if ((fmode & O_RESOLVE_BENEATH) != 0) res |= RBENEATH; if ((fmode & O_EMPTY_PATH) != 0) res |= EMPTYPATH; if ((fmode & FREAD) != 0) res |= OPENREAD; if ((fmode & FWRITE) != 0) res |= OPENWRITE; if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0) res |= AUDITVNODE1; if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0) res |= NOCAPCHECK; if ((vn_open_flags & VN_OPEN_WANTIOCTLCAPS) != 0) res |= WANTIOCTLCAPS; return (res); } /* * Common code for vnode open operations via a name lookup. * Lookup the vnode and invoke VOP_CREATE if needed. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. */ int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp) { struct vnode *vp; struct mount *mp; struct vattr vat; struct vattr *vap = &vat; int fmode, error; bool first_open; restart: first_open = false; fmode = *flagp; if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT | O_EXCL | O_DIRECTORY) || (fmode & (O_CREAT | O_EMPTY_PATH)) == (O_CREAT | O_EMPTY_PATH)) return (EINVAL); else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags); /* * Set NOCACHE to avoid flushing the cache when * rolling in many files at once. * * Set NC_KEEPPOSENTRY to keep positive entries if they already * exist despite NOCACHE. 
*/ ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; if ((vn_open_flags & VN_OPEN_INVFS) == 0) bwillwrite(); if ((error = namei(ndp)) != 0) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE_PNBUF(ndp); vput(ndp->ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH)) != 0) return (error); NDREINIT(ndp); goto restart; } if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0) ndp->ni_cnd.cn_flags |= MAKEENTRY; #ifdef MAC error = mac_vnode_check_create(cred, ndp->ni_dvp, &ndp->ni_cnd, vap); if (error == 0) #endif error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); vp = ndp->ni_vp; if (error == 0 && (fmode & O_EXCL) != 0 && (fmode & (O_EXLOCK | O_SHLOCK)) != 0) { VI_LOCK(vp); vp->v_iflag |= VI_FOPENING; VI_UNLOCK(vp); first_open = true; } VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL, false); vn_finished_write(mp); if (error) { NDFREE_PNBUF(ndp); if (error == ERELOOKUP) { NDREINIT(ndp); goto restart; } return (error); } fmode &= ~O_TRUNC; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } if (vp->v_type == VDIR) { error = EISDIR; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags); ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW : FOLLOW; if ((fmode & FWRITE) == 0) ndp->ni_cnd.cn_flags |= LOCKSHARED; if ((error = namei(ndp)) != 0) return (error); vp = ndp->ni_vp; } error = vn_open_vnode(vp, fmode, cred, curthread, fp); if (first_open) { VI_LOCK(vp); vp->v_iflag &= ~VI_FOPENING; wakeup(vp); VI_UNLOCK(vp); } if (error) goto bad; *flagp = fmode; return (0); bad: NDFREE_PNBUF(ndp); vput(vp); *flagp = fmode; ndp->ni_vp = NULL; return (error); } static int vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp) { struct flock lf; int error, lock_flags, type; ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock"); if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0) return (0); KASSERT(fp != NULL, ("open with flock requires fp")); if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) return (EOPNOTSUPP); lock_flags = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) type |= F_FIRSTOPEN; error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type); if (error == 0) fp->f_flag |= FHASLOCK; vn_lock(vp, lock_flags | LK_RETRY); return (error); } /* * Common code for vnode open operations once a vnode is located. * Check permissions, and call the VOP_OPEN routine. 
*/ int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp) { accmode_t accmode; int error; if (vp->v_type == VLNK) { if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0) return (EMLINK); } if (vp->v_type != VDIR && fmode & O_DIRECTORY) return (ENOTDIR); accmode = 0; if ((fmode & O_PATH) == 0) { if (vp->v_type == VSOCK) return (EOPNOTSUPP); if ((fmode & (FWRITE | O_TRUNC)) != 0) { if (vp->v_type == VDIR) return (EISDIR); accmode |= VWRITE; } if ((fmode & FREAD) != 0) accmode |= VREAD; if ((fmode & O_APPEND) && (fmode & FWRITE)) accmode |= VAPPEND; #ifdef MAC if ((fmode & O_CREAT) != 0) accmode |= VCREAT; #endif } if ((fmode & FEXEC) != 0) accmode |= VEXEC; #ifdef MAC if ((fmode & O_VERIFY) != 0) accmode |= VVERIFY; error = mac_vnode_check_open(cred, vp, accmode); if (error != 0) return (error); accmode &= ~(VCREAT | VVERIFY); #endif if ((fmode & O_CREAT) == 0 && accmode != 0) { error = VOP_ACCESS(vp, accmode, cred, td); if (error != 0) return (error); } if ((fmode & O_PATH) != 0) { if (vp->v_type != VFIFO && vp->v_type != VSOCK && VOP_ACCESS(vp, VREAD, cred, td) == 0) fp->f_flag |= FKQALLOWED; return (0); } if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); error = VOP_OPEN(vp, fmode, cred, td, fp); if (error != 0) return (error); error = vn_open_vnode_advlock(vp, fmode, fp); if (error == 0 && (fmode & FWRITE) != 0) { error = VOP_ADD_WRITECOUNT(vp, 1); if (error == 0) { CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } } /* * Error from advlock or VOP_ADD_WRITECOUNT() still requires * calling VOP_CLOSE() to pair with earlier VOP_OPEN(). */ if (error != 0) { if (fp != NULL) { /* * Arrange the call by having fdrop() to use * vn_closefile(). This is to satisfy * filesystems like devfs or tmpfs, which * override fo_close(). */ fp->f_flag |= FOPENFAILED; fp->f_vnode = vp; if (fp->f_ops == &badfileops) { fp->f_type = DTYPE_VNODE; fp->f_ops = &vnops; } vref(vp); } else { /* * If there is no fp, due to kernel-mode open, * we can call VOP_CLOSE() now. */ if ((vp->v_type == VFIFO || !MNT_EXTENDED_SHARED(vp->v_mount)) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vn_lock(vp, LK_UPGRADE | LK_RETRY); (void)VOP_CLOSE(vp, fmode & (FREAD | FWRITE | FEXEC), cred, td); } } ASSERT_VOP_LOCKED(vp, "vn_open_vnode"); return (error); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. * It is racy. */ int vn_writechk(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, "vn_writechk"); /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (VOP_IS_TEXT(vp)) return (ETXTBSY); return (0); } /* * Vnode close call */ static int vn_close1(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td, bool keep_ref) { struct mount *mp; int error, lock_flags; lock_flags = vp->v_type != VFIFO && MNT_EXTENDED_SHARED(vp->v_mount) ? 
LK_SHARED : LK_EXCLUSIVE; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, lock_flags | LK_RETRY); AUDIT_ARG_VNODE1(vp); if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) { VOP_ADD_WRITECOUNT_CHECKED(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } error = VOP_CLOSE(vp, flags, file_cred, td); if (keep_ref) VOP_UNLOCK(vp); else vput(vp); vn_finished_write(mp); return (error); } int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td) { return (vn_close1(vp, flags, file_cred, td, false)); } /* * Heuristic to detect sequential operation. */ static int sequential_heuristic(struct uio *uio, struct file *fp) { enum uio_rw rw; ASSERT_VOP_LOCKED(fp->f_vnode, __func__); rw = uio->uio_rw; if (fp->f_flag & FRDAHEAD) return (fp->f_seqcount[rw] << IO_SEQSHIFT); /* * Offset 0 is handled specially. open() sets f_seqcount to 1 so * that the first I/O is normally considered to be slightly * sequential. Seeking to offset 0 doesn't change sequentiality * unless previous seeks have reduced f_seqcount to 0, in which * case offset 0 is not special. */ if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) || uio->uio_offset == fp->f_nextoff[rw]) { /* * f_seqcount is in units of fixed-size blocks so that it * depends mainly on the amount of sequential I/O and not * much on the number of sequential I/O's. The fixed size * of 16384 is hard-coded here since it is (not quite) just * a magic size that works well here. This size is more * closely related to the best I/O size for real disks than * to any block size used by software. */ if (uio->uio_resid >= IO_SEQMAX * 16384) fp->f_seqcount[rw] = IO_SEQMAX; else { fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384); if (fp->f_seqcount[rw] > IO_SEQMAX) fp->f_seqcount[rw] = IO_SEQMAX; } return (fp->f_seqcount[rw] << IO_SEQSHIFT); } /* Not sequential. Quickly draw-down sequentiality. */ if (fp->f_seqcount[rw] > 1) fp->f_seqcount[rw] = 1; else fp->f_seqcount[rw] = 0; return (0); } /* * Package up an I/O request on a vnode into a uio and do it. 
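 * Illustrative sketch (not part of the original source): a kernel
 * caller reading the first 512 bytes of an unlocked vnode could use
 *
 *	char buf[512];
 *	ssize_t resid;
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0, UIO_SYSSPACE,
 *	    0, td->td_ucred, NOCRED, &resid, td);
 *
 * where the zero ioflg lets vn_rdwr() take the range lock and the
 * vnode lock itself.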
*/ int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td) { struct uio auio; struct iovec aiov; struct mount *mp; struct ucred *cred; void *rl_cookie; struct vn_io_fault_args args; int error, lock_flags; if (offset < 0 && vp->v_type != VCHR) return (EINVAL); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; error = 0; if ((ioflg & IO_NODELOCKED) == 0) { if ((ioflg & IO_RANGELOCKED) == 0) { if (rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, offset, offset + len); } else if ((ioflg & IO_APPEND) != 0) { rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); } else { rl_cookie = vn_rangelock_wlock(vp, offset, offset + len); } } else rl_cookie = NULL; mp = NULL; if (rw == UIO_WRITE) { if (vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0) goto out; lock_flags = vn_lktype_write(mp, vp); } else lock_flags = LK_SHARED; vn_lock(vp, lock_flags | LK_RETRY); } else rl_cookie = NULL; ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); #ifdef MAC if ((ioflg & IO_NOMACCHECK) == 0) { if (rw == UIO_READ) error = mac_vnode_check_read(active_cred, file_cred, vp); else error = mac_vnode_check_write(active_cred, file_cred, vp); } #endif if (error == 0) { if (file_cred != NULL) cred = file_cred; else cred = active_cred; if (do_vn_io_fault(vp, &auio)) { args.kind = VN_IO_FAULT_VOP; args.cred = cred; args.flags = ioflg; args.args.vop_args.vp = vp; error = vn_io_fault1(vp, &auio, &args, td); } else if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else /* if (rw == UIO_WRITE) */ { error = VOP_WRITE(vp, &auio, ioflg, cred); } } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp); if (mp != NULL) vn_finished_write(mp); } out: if (rl_cookie != NULL) vn_rangelock_unlock(vp, rl_cookie); return (error); } /* * Package up an I/O request on a vnode into a uio and do it. The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call kern_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). */ int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td) { int error = 0; ssize_t iaresid; do { int chunk; /* * Force `offset' to a multiple of MAXBSIZE except possibly * for the first chunk, so that filesystems only need to * write full blocks except possibly for the first and last * chunks. 
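 * For example (illustrative numbers, assuming MAXBSIZE is 65536):
 * with a starting offset of 70000 the first chunk is
 * 65536 - (70000 % 65536) = 61072 bytes, which brings the next offset
 * to 131072, a MAXBSIZE boundary; subsequent chunks are then full
 * MAXBSIZE blocks until the final, possibly short, one.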
*/ chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE; if (chunk > len) chunk = len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); iaresid = 0; error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, active_cred, file_cred, &iaresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base = (char *)base + chunk; kern_yield(PRI_USER); } while (len); if (aresid) *aresid = len + iaresid; return (error); } #if OFF_MAX <= LONG_MAX off_t foffset_lock(struct file *fp, int flags) { volatile short *flagsp; off_t res; short state; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); if ((flags & FOF_NOLOCK) != 0) return (atomic_load_long(&fp->f_offset)); /* * According to McKusick the vn lock was protecting f_offset here. * It is now protected by the FOFFSET_LOCKED flag. */ flagsp = &fp->f_vnread_flags; if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED)) return (atomic_load_long(&fp->f_offset)); sleepq_lock(&fp->f_vnread_flags); state = atomic_load_16(flagsp); for (;;) { if ((state & FOFFSET_LOCKED) == 0) { if (!atomic_fcmpset_acq_16(flagsp, &state, FOFFSET_LOCKED)) continue; break; } if ((state & FOFFSET_LOCK_WAITING) == 0) { if (!atomic_fcmpset_acq_16(flagsp, &state, state | FOFFSET_LOCK_WAITING)) continue; } DROP_GIANT(); sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0); sleepq_wait(&fp->f_vnread_flags, PUSER -1); PICKUP_GIANT(); sleepq_lock(&fp->f_vnread_flags); state = atomic_load_16(flagsp); } res = atomic_load_long(&fp->f_offset); sleepq_release(&fp->f_vnread_flags); return (res); } void foffset_unlock(struct file *fp, off_t val, int flags) { volatile short *flagsp; short state; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); if ((flags & FOF_NOUPDATE) == 0) atomic_store_long(&fp->f_offset, val); if ((flags & FOF_NEXTOFF_R) != 0) fp->f_nextoff[UIO_READ] = val; if ((flags & FOF_NEXTOFF_W) != 0) fp->f_nextoff[UIO_WRITE] = val; if ((flags & FOF_NOLOCK) != 0) return; flagsp = &fp->f_vnread_flags; state = atomic_load_16(flagsp); if ((state & FOFFSET_LOCK_WAITING) == 0 && atomic_cmpset_rel_16(flagsp, state, 0)) return; sleepq_lock(&fp->f_vnread_flags); MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0); MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0); fp->f_vnread_flags = 0; sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0); sleepq_release(&fp->f_vnread_flags); } static off_t foffset_read(struct file *fp) { return (atomic_load_long(&fp->f_offset)); } #else off_t foffset_lock(struct file *fp, int flags) { struct mtx *mtxp; off_t res; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOLOCK) == 0) { while (fp->f_vnread_flags & FOFFSET_LOCKED) { fp->f_vnread_flags |= FOFFSET_LOCK_WAITING; msleep(&fp->f_vnread_flags, mtxp, PUSER -1, "vofflock", 0); } fp->f_vnread_flags |= FOFFSET_LOCKED; } res = fp->f_offset; mtx_unlock(mtxp); return (res); } void foffset_unlock(struct file *fp, off_t val, int flags) { struct mtx *mtxp; KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed")); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if ((flags & FOF_NOUPDATE) == 0) fp->f_offset = val; if ((flags & FOF_NEXTOFF_R) != 0) fp->f_nextoff[UIO_READ] = val; if ((flags & FOF_NEXTOFF_W) != 0) fp->f_nextoff[UIO_WRITE] = val; if ((flags & FOF_NOLOCK) == 0) { KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0, ("Lost FOFFSET_LOCKED")); if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING) wakeup(&fp->f_vnread_flags); fp->f_vnread_flags = 0; } 
mtx_unlock(mtxp); } static off_t foffset_read(struct file *fp) { return (foffset_lock(fp, FOF_NOLOCK)); } #endif void foffset_lock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) uio->uio_offset = foffset_lock(fp, flags); } void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags) { if ((flags & FOF_OFFSET) == 0) foffset_unlock(fp, uio->uio_offset, flags); } static int get_advice(struct file *fp, struct uio *uio) { struct mtx *mtxp; int ret; ret = POSIX_FADV_NORMAL; if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG) return (ret); mtxp = mtx_pool_find(mtxpool_sleep, fp); mtx_lock(mtxp); if (fp->f_advice != NULL && uio->uio_offset >= fp->f_advice->fa_start && uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end) ret = fp->f_advice->fa_advice; mtx_unlock(mtxp); return (ret); } static int get_write_ioflag(struct file *fp) { int ioflag; struct mount *mp; struct vnode *vp; ioflag = 0; vp = fp->f_vnode; mp = atomic_load_ptr(&vp->v_mount); if ((fp->f_flag & O_DIRECT) != 0) ioflag |= IO_DIRECT; if ((fp->f_flag & O_FSYNC) != 0 || (mp != NULL && (mp->mnt_flag & MNT_SYNCHRONOUS) != 0)) ioflag |= IO_SYNC; /* * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE() * or VOP_DEALLOCATE() implementations that don't understand IO_DATASYNC * fall back to full O_SYNC behavior. */ if ((fp->f_flag & O_DSYNC) != 0) ioflag |= IO_SYNC | IO_DATASYNC; return (ioflag); } int vn_read_from_obj(struct vnode *vp, struct uio *uio) { vm_object_t obj; vm_page_t ma[io_hold_cnt + 2]; off_t off, vsz; ssize_t resid; int error, i, j; MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2)); obj = atomic_load_ptr(&vp->v_object); if (obj == NULL) return (EJUSTRETURN); /* * Depends on type stability of vm_objects. */ vm_object_pip_add(obj, 1); if ((obj->flags & OBJ_DEAD) != 0) { /* * Note that object might be already reused from the * vnode, and the OBJ_DEAD flag cleared. This is fine, * we recheck for DOOMED vnode state after all pages * are busied, and retract then. * * But we check for OBJ_DEAD to ensure that we do not * busy pages while vm_object_terminate_pages() * processes the queue. */ error = EJUSTRETURN; goto out_pip; } resid = uio->uio_resid; off = uio->uio_offset; for (i = 0; resid > 0; i++) { MPASS(i < io_hold_cnt + 2); ma[i] = vm_page_grab_unlocked(obj, atop(off), VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOWAIT); if (ma[i] == NULL) break; /* * Skip invalid pages. Valid mask can be partial only * at EOF, and we clip later. */ if (vm_page_none_valid(ma[i])) { vm_page_sunbusy(ma[i]); break; } resid -= PAGE_SIZE; off += PAGE_SIZE; } if (i == 0) { error = EJUSTRETURN; goto out_pip; } /* * Check VIRF_DOOMED after we busied our pages. Since * vgonel() terminates the vnode' vm_object, it cannot * process past pages busied by us. */ if (VN_IS_DOOMED(vp)) { error = EJUSTRETURN; goto out; } resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1); if (resid > uio->uio_resid) resid = uio->uio_resid; /* * Unlocked read of vnp_size is safe because truncation cannot * pass busied page. But we load vnp_size into a local * variable so that possible concurrent extension does not * break calculation. 
*/ #if defined(__powerpc__) && !defined(__powerpc64__) vsz = obj->un_pager.vnp.vnp_size; #else vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size); #endif if (uio->uio_offset >= vsz) { error = EJUSTRETURN; goto out; } if (uio->uio_offset + resid > vsz) resid = vsz - uio->uio_offset; error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio); out: for (j = 0; j < i; j++) { if (error == 0) vm_page_reference(ma[j]); vm_page_sunbusy(ma[j]); } out_pip: vm_object_pip_wakeup(obj); if (error != 0) return (error); return (uio->uio_resid == 0 ? 0 : EJUSTRETURN); } /* * File table vnode read routine. */ static int vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct vnode *vp; off_t orig_offset; int error, ioflag; int advice; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; /* * Try to read from page cache. VIRF_DOOMED check is racy but * allows us to avoid unneeded work outright. */ if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() && (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) { error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred); if (error == 0) { fp->f_nextoff[UIO_READ] = uio->uio_offset; return (0); } if (error != EJUSTRETURN) return (error); } advice = get_advice(fp, uio); vn_lock(vp, LK_SHARED | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* Disable read-ahead for random I/O. */ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_READ(vp, uio, ioflag, fp->f_cred); fp->f_nextoff[UIO_READ] = uio->uio_offset; VOP_UNLOCK(vp); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * read(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); return (error); } /* * File table vnode write routine. */ static int vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct vnode *vp; struct mount *mp; off_t orig_offset; int error, ioflag; int advice; bool need_finished_write; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET")); vp = fp->f_vnode; if (vp->v_type == VREG) bwillwrite(); ioflag = IO_UNIT; if (vp->v_type == VREG && (fp->f_flag & O_APPEND) != 0) ioflag |= IO_APPEND; if ((fp->f_flag & FNONBLOCK) != 0) ioflag |= IO_NDELAY; ioflag |= get_write_ioflag(fp); mp = NULL; need_finished_write = false; if (vp->v_type != VCHR) { error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error != 0) goto unlock; need_finished_write = true; } advice = get_advice(fp, uio); vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); switch (advice) { case POSIX_FADV_NORMAL: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_NOREUSE: ioflag |= sequential_heuristic(uio, fp); break; case POSIX_FADV_RANDOM: /* XXX: Is this correct? 
*/ break; } orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_WRITE(vp, uio, ioflag, fp->f_cred); fp->f_nextoff[UIO_WRITE] = uio->uio_offset; VOP_UNLOCK(vp); if (need_finished_write) vn_finished_write(mp); if (error == 0 && advice == POSIX_FADV_NOREUSE && orig_offset != uio->uio_offset) /* * Use POSIX_FADV_DONTNEED to flush pages and buffers * for the backing file after a POSIX_FADV_NOREUSE * write(2). */ error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, POSIX_FADV_DONTNEED); unlock: return (error); } /* * The vn_io_fault() is a wrapper around vn_read() and vn_write() to * prevent the following deadlock: * * Assume that the thread A reads from the vnode vp1 into userspace * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is * currently not resident, then system ends up with the call chain * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] -> * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2) * which establishes lock order vp1->vn_lock, then vp2->vn_lock. * If, at the same time, thread B reads from vnode vp2 into buffer buf2 * backed by the pages of vnode vp1, and some page in buf2 is not * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock. * * To prevent the lock order reversal and deadlock, vn_io_fault() does * not allow page faults to happen during VOP_READ() or VOP_WRITE(). * Instead, it first tries to do the whole range i/o with pagefaults * disabled. If all pages in the i/o buffer are resident and mapped, * VOP will succeed (ignoring the genuine filesystem errors). * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do * i/o in chunks, with all pages in the chunk prefaulted and held * using vm_fault_quick_hold_pages(). * * Filesystems using this deadlock avoidance scheme should use the * array of the held pages from uio, saved in the curthread->td_ma, * instead of doing uiomove(). A helper function * vn_io_fault_uiomove() converts uiomove request into * uiomove_fromphys() over td_ma array. * * Since vnode locks do not cover the whole i/o anymore, rangelocks * make the current i/o request atomic with respect to other i/os and * truncations. */ /* * Decode vn_io_fault_args and perform the corresponding i/o. 
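 * (Illustrative, not original code: per the vn_io_fault() comment
 * above, a filesystem that sets MNTK_NO_IOPF moves data in its
 * VOP_READ()/VOP_WRITE() with something like
 *
 *	error = vn_io_fault_uiomove((char *)bp->b_data + blkoff,
 *	    xfersize, uio);
 *
 * instead of uiomove(), so the pages held in td_ma are used while
 * pagefaults are disabled; bp, blkoff and xfersize stand for
 * hypothetical locals of such an implementation.)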
*/ static int vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio, struct thread *td) { int error, save; error = 0; save = vm_fault_disable_pagefaults(); switch (args->kind) { case VN_IO_FAULT_FOP: error = (args->args.fop_args.doio)(args->args.fop_args.fp, uio, args->cred, args->flags, td); break; case VN_IO_FAULT_VOP: switch (uio->uio_rw) { case UIO_READ: error = VOP_READ(args->args.vop_args.vp, uio, args->flags, args->cred); break; case UIO_WRITE: error = VOP_WRITE(args->args.vop_args.vp, uio, args->flags, args->cred); break; } break; default: panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind, uio->uio_rw); } vm_fault_enable_pagefaults(save); return (error); } static int vn_io_fault_touch(char *base, const struct uio *uio) { int r; r = fubyte(base); if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1)) return (EFAULT); return (0); } static int vn_io_fault_prefault_user(const struct uio *uio) { char *base; const struct iovec *iov; size_t len; ssize_t resid; int error, i; KASSERT(uio->uio_segflg == UIO_USERSPACE, ("vn_io_fault_prefault userspace")); error = i = 0; iov = uio->uio_iov; resid = uio->uio_resid; base = iov->iov_base; len = iov->iov_len; while (resid > 0) { error = vn_io_fault_touch(base, uio); if (error != 0) break; if (len < PAGE_SIZE) { if (len != 0) { error = vn_io_fault_touch(base + len - 1, uio); if (error != 0) break; resid -= len; } if (++i >= uio->uio_iovcnt) break; iov = uio->uio_iov + i; base = iov->iov_base; len = iov->iov_len; } else { len -= PAGE_SIZE; base += PAGE_SIZE; resid -= PAGE_SIZE; } } return (error); } /* * Common code for vn_io_fault(), agnostic to the kind of i/o request. * Uses vn_io_fault_doio() to make the call to an actual i/o function. * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request * into args and call vn_io_fault1() to handle faults during the user * mode buffer accesses. */ static int vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args, struct thread *td) { vm_page_t ma[io_hold_cnt + 2]; struct uio *uio_clone, short_uio; struct iovec short_iovec[1]; vm_page_t *prev_td_ma; vm_prot_t prot; vm_offset_t addr, end; size_t len, resid; ssize_t adv; int error, cnt, saveheld, prev_td_ma_cnt; if (vn_io_fault_prefault) { error = vn_io_fault_prefault_user(uio); if (error != 0) return (error); /* Or ignore ? */ } prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ; /* * The UFS follows IO_UNIT directive and replays back both * uio_offset and uio_resid if an error is encountered during the * operation. But, since the iovec may be already advanced, * uio is still in an inconsistent state. * * Cache a copy of the original uio, which is advanced to the redo * point using UIO_NOCOPY below. 
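 * Replaying the clone with uio_segflg temporarily set to UIO_NOCOPY
 * (the uiomove(NULL, ...) call below) advances its iovec and offset
 * without touching user memory.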
*/ uio_clone = cloneuio(uio); resid = uio->uio_resid; short_uio.uio_segflg = UIO_USERSPACE; short_uio.uio_rw = uio->uio_rw; short_uio.uio_td = uio->uio_td; error = vn_io_fault_doio(args, uio, td); if (error != EFAULT) goto out; atomic_add_long(&vn_io_faults_cnt, 1); uio_clone->uio_segflg = UIO_NOCOPY; uiomove(NULL, resid - uio->uio_resid, uio_clone); uio_clone->uio_segflg = uio->uio_segflg; saveheld = curthread_pflags_set(TDP_UIOHELD); prev_td_ma = td->td_ma; prev_td_ma_cnt = td->td_ma_cnt; while (uio_clone->uio_resid != 0) { len = uio_clone->uio_iov->iov_len; if (len == 0) { KASSERT(uio_clone->uio_iovcnt >= 1, ("iovcnt underflow")); uio_clone->uio_iov++; uio_clone->uio_iovcnt--; continue; } if (len > ptoa(io_hold_cnt)) len = ptoa(io_hold_cnt); addr = (uintptr_t)uio_clone->uio_iov->iov_base; end = round_page(addr + len); if (end < addr) { error = EFAULT; break; } /* * A perfectly misaligned address and length could cause * both the start and the end of the chunk to use partial * page. +2 accounts for such a situation. */ cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map, addr, len, prot, ma, io_hold_cnt + 2); if (cnt == -1) { error = EFAULT; break; } short_uio.uio_iov = &short_iovec[0]; short_iovec[0].iov_base = (void *)addr; short_uio.uio_iovcnt = 1; short_uio.uio_resid = short_iovec[0].iov_len = len; short_uio.uio_offset = uio_clone->uio_offset; td->td_ma = ma; td->td_ma_cnt = cnt; error = vn_io_fault_doio(args, &short_uio, td); vm_page_unhold_pages(ma, cnt); adv = len - short_uio.uio_resid; uio_clone->uio_iov->iov_base = (char *)uio_clone->uio_iov->iov_base + adv; uio_clone->uio_iov->iov_len -= adv; uio_clone->uio_resid -= adv; uio_clone->uio_offset += adv; uio->uio_resid -= adv; uio->uio_offset += adv; if (error != 0 || adv == 0) break; } td->td_ma = prev_td_ma; td->td_ma_cnt = prev_td_ma_cnt; curthread_pflags_restore(saveheld); out: freeuio(uio_clone); return (error); } static int vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { fo_rdwr_t *doio; struct vnode *vp; void *rl_cookie; struct vn_io_fault_args args; int error; bool do_io_fault, do_rangelock; doio = uio->uio_rw == UIO_READ ? vn_read : vn_write; vp = fp->f_vnode; /* * The ability to read(2) on a directory has historically been * allowed for all users, but this can and has been the source of * at least one security issue in the past. As such, it is now hidden * away behind a sysctl for those that actually need it to use it, and * restricted to root when it's turned on to make it relatively safe to * leave on for longer sessions of need. */ if (vp->v_type == VDIR) { KASSERT(uio->uio_rw == UIO_READ, ("illegal write attempted on a directory")); if (!vfs_allow_read_dir) return (EISDIR); if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0) return (EISDIR); } do_io_fault = do_vn_io_fault(vp, uio); do_rangelock = do_io_fault || (vn_irflag_read(vp) & VIRF_PGREAD) != 0; foffset_lock_uio(fp, uio, flags); if (do_rangelock) { if (uio->uio_rw == UIO_READ) { rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } else if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0) { /* For appenders, punt and lock the whole range. 
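 * With O_APPEND the offset actually written to is the file size at
 * the time the vnode lock is taken, which is not known yet here, so
 * the whole file is write-locked rather than a byte range.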
*/ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); } else { rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset, uio->uio_offset + uio->uio_resid); } } if (do_io_fault) { args.kind = VN_IO_FAULT_FOP; args.args.fop_args.fp = fp; args.args.fop_args.doio = doio; args.cred = active_cred; args.flags = flags | FOF_OFFSET; error = vn_io_fault1(vp, uio, &args, td); } else { error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td); } if (do_rangelock) vn_rangelock_unlock(vp, rl_cookie); foffset_unlock_uio(fp, uio, flags); return (error); } /* * Helper function to perform the requested uiomove operation using * the held pages for io->uio_iov[0].iov_base buffer instead of * copyin/copyout. Access to the pages with uiomove_fromphys() * instead of iov_base prevents page faults that could occur due to * pmap_collect() invalidating the mapping created by * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or * object cleanup revoking the write access from page mappings. * * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove() * instead of plain uiomove(). */ int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio) { struct uio transp_uio; struct iovec transp_iov[1]; struct thread *td; size_t adv; int error, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove(data, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); transp_iov[0].iov_base = data; transp_uio.uio_iov = &transp_iov[0]; transp_uio.uio_iovcnt = 1; if (xfersize > uio->uio_resid) xfersize = uio->uio_resid; transp_uio.uio_resid = transp_iov[0].iov_len = xfersize; transp_uio.uio_offset = 0; transp_uio.uio_segflg = UIO_SYSSPACE; /* * Since transp_iov points to data, and td_ma page array * corresponds to original uio->uio_iov, we need to invert the * direction of the i/o operation as passed to * uiomove_fromphys(). */ switch (uio->uio_rw) { case UIO_WRITE: transp_uio.uio_rw = UIO_READ; break; case UIO_READ: transp_uio.uio_rw = UIO_WRITE; break; } transp_uio.uio_td = uio->uio_td; error = uiomove_fromphys(td->td_ma, ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK, xfersize, &transp_uio); adv = xfersize - transp_uio.uio_resid; pgadv = (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) - (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv; uio->uio_iov->iov_len -= adv; uio->uio_resid -= adv; uio->uio_offset += adv; return (error); } int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio) { struct thread *td; vm_offset_t iov_base; int cnt, pgadv; td = curthread; if ((td->td_pflags & TDP_UIOHELD) == 0 || uio->uio_segflg != UIO_USERSPACE) return (uiomove_fromphys(ma, offset, xfersize, uio)); KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt)); cnt = xfersize > uio->uio_resid ? 
uio->uio_resid : xfersize; iov_base = (vm_offset_t)uio->uio_iov->iov_base; switch (uio->uio_rw) { case UIO_WRITE: pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma, offset, cnt); break; case UIO_READ: pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK, cnt); break; } pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT); td->td_ma += pgadv; KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt, pgadv)); td->td_ma_cnt -= pgadv; uio->uio_iov->iov_base = (char *)(iov_base + cnt); uio->uio_iov->iov_len -= cnt; uio->uio_resid -= cnt; uio->uio_offset += cnt; return (0); } /* * File table truncate routine. */ static int vn_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct mount *mp; struct vnode *vp; void *rl_cookie; int error; vp = fp->f_vnode; retry: /* * Lock the whole range for truncation. Otherwise split i/o * might happen partly before and partly after the truncation. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error) goto out1; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); if (vp->v_type == VDIR) { error = EISDIR; goto out; } #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); if (error) goto out; #endif error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0, fp->f_cred); out: VOP_UNLOCK(vp); vn_finished_write(mp); out1: vn_rangelock_unlock(vp, rl_cookie); if (error == ERELOOKUP) goto retry; return (error); } /* * Truncate a file that is already locked. */ int vn_truncate_locked(struct vnode *vp, off_t length, bool sync, struct ucred *cred) { struct vattr vattr; int error; error = VOP_ADD_WRITECOUNT(vp, 1); if (error == 0) { VATTR_NULL(&vattr); vattr.va_size = length; if (sync) vattr.va_vaflags |= VA_SYNC; error = VOP_SETATTR(vp, &vattr, cred); VOP_ADD_WRITECOUNT_CHECKED(vp, -1); } return (error); } /* * File table vnode stat routine. */ int vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred) { struct vnode *vp = fp->f_vnode; int error; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_STAT(vp, sb, active_cred, fp->f_cred); VOP_UNLOCK(vp); return (error); } /* * File table vnode ioctl routine. */ static int vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { struct vnode *vp; struct fiobmap2_arg *bmarg; off_t size; int error; vp = fp->f_vnode; switch (vp->v_type) { case VDIR: case VREG: switch (com) { case FIONREAD: error = vn_getsize(vp, &size, active_cred); if (error == 0) *(int *)data = size - fp->f_offset; return (error); case FIOBMAP2: bmarg = (struct fiobmap2_arg *)data; vn_lock(vp, LK_SHARED | LK_RETRY); #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); if (error == 0) #endif error = VOP_BMAP(vp, bmarg->bn, NULL, &bmarg->bn, &bmarg->runp, &bmarg->runb); VOP_UNLOCK(vp); return (error); case FIONBIO: case FIOASYNC: return (0); default: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); } break; case VCHR: return (VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td)); default: return (ENOTTY); } } /* * File table vnode poll routine. 
*/ static int vn_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct vnode *vp; int error; vp = fp->f_vnode; #if defined(MAC) || defined(AUDIT) if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(vp); error = mac_vnode_check_poll(active_cred, fp->f_cred, vp); VOP_UNLOCK(vp); if (error != 0) return (error); } #endif error = VOP_POLL(vp, events, fp->f_cred, td); return (error); } /* * Acquire the requested lock and then check for validity. LK_RETRY * permits vn_lock to return doomed vnodes. */ static int __noinline _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line, int error) { KASSERT((flags & LK_RETRY) == 0 || error == 0, ("vn_lock: error %d incompatible with flags %#x", error, flags)); if (error == 0) VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed")); if ((flags & LK_RETRY) == 0) { if (error == 0) { VOP_UNLOCK(vp); error = ENOENT; } return (error); } /* * LK_RETRY case. * * Nothing to do if we got the lock. */ if (error == 0) return (0); /* * Interlock was dropped by the call in _vn_lock. */ flags &= ~LK_INTERLOCK; do { error = VOP_LOCK1(vp, flags, file, line); } while (error != 0); return (0); } int _vn_lock(struct vnode *vp, int flags, const char *file, int line) { int error; VNASSERT((flags & LK_TYPE_MASK) != 0, vp, ("vn_lock: no locktype (%d passed)", flags)); VNPASS(vp->v_holdcnt > 0, vp); error = VOP_LOCK1(vp, flags, file, line); if (__predict_false(error != 0 || VN_IS_DOOMED(vp))) return (_vn_lock_fallback(vp, flags, file, line, error)); return (0); } /* * File table vnode close routine. */ static int vn_closefile(struct file *fp, struct thread *td) { struct vnode *vp; struct flock lf; int error; bool ref; vp = fp->f_vnode; fp->f_ops = &badfileops; ref = (fp->f_flag & FHASLOCK) != 0; error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref); if (__predict_false(ref)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK); vrele(vp); } return (error); } /* * Preparing to start a filesystem write operation. If the operation is * permitted, then we bump the count of operations in progress and * proceed. If a suspend request is in progress, we wait until the * suspension is over, and then proceed. */ static int vn_start_write_refed(struct mount *mp, int flags, bool mplocked) { struct mount_pcpu *mpcpu; int error, mflags; if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 && vfs_op_thread_enter(mp, mpcpu)) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1); vfs_op_thread_exit(mp, mpcpu); return (0); } if (mplocked) mtx_assert(MNT_MTX(mp), MA_OWNED); else MNT_ILOCK(mp); error = 0; /* * Check on status of suspension. 
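 * The owner of the suspension, marked with TDP_IGNSUSP, is exempted
 * from the wait so that it can still start writes (e.g. to sync the
 * filesystem) while the suspension is in force.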
*/ if ((curthread->td_pflags & TDP_IGNSUSP) == 0 || mp->mnt_susp_owner != curthread) { mflags = 0; if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) { if (flags & V_PCATCH) mflags |= PCATCH; } mflags |= (PUSER - 1); while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { if ((flags & V_NOWAIT) != 0) { error = EWOULDBLOCK; goto unlock; } error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0); if (error != 0) goto unlock; } } if ((flags & V_XSLEEP) != 0) goto unlock; mp->mnt_writeopcount++; unlock: if (error != 0 || (flags & V_XSLEEP) != 0) MNT_REL(mp); MNT_IUNLOCK(mp); return (error); } int vn_start_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error; KASSERT((flags & ~V_VALID_FLAGS) == 0, ("%s: invalid flags passed %d\n", __func__, flags)); error = 0; /* * If a vnode is provided, get and return the mount point that * to which it will write. */ if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } if ((mp = *mpp) == NULL) return (0); /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ if (vp == NULL) vfs_ref(mp); error = vn_start_write_refed(mp, flags, false); if (error != 0 && (flags & V_NOWAIT) == 0) *mpp = NULL; return (error); } /* * Secondary suspension. Used by operations such as vop_inactive * routines that are needed by the higher level functions. These * are allowed to proceed until all the higher level functions have * completed (indicated by mnt_writeopcount dropping to zero). At that * time, these operations are halted until the suspension is over. */ int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags) { struct mount *mp; int error, mflags; KASSERT((flags & (~V_VALID_FLAGS | V_XSLEEP)) == 0, ("%s: invalid flags passed %d\n", __func__, flags)); retry: if (vp != NULL) { if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) { *mpp = NULL; if (error != EOPNOTSUPP) return (error); return (0); } } /* * If we are not suspended or have not yet reached suspended * mode, then let the operation proceed. */ if ((mp = *mpp) == NULL) return (0); /* * VOP_GETWRITEMOUNT() returns with the mp refcount held through * a vfs_ref(). * As long as a vnode is not provided we need to acquire a * refcount for the provided mountpoint too, in order to * emulate a vfs_ref(). */ MNT_ILOCK(mp); if (vp == NULL) MNT_REF(mp); if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) { mp->mnt_secondary_writes++; mp->mnt_secondary_accwrites++; MNT_IUNLOCK(mp); return (0); } if ((flags & V_NOWAIT) != 0) { MNT_REL(mp); MNT_IUNLOCK(mp); *mpp = NULL; return (EWOULDBLOCK); } /* * Wait for the suspension to finish. */ mflags = 0; if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) { if ((flags & V_PCATCH) != 0) mflags |= PCATCH; } mflags |= (PUSER - 1) | PDROP; error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0); vfs_rel(mp); if (error == 0) goto retry; *mpp = NULL; return (error); } /* * Filesystem write operation has completed. If we are suspending and this * operation is the last one, notify the suspender that the suspension is * now in effect. 
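 * Illustrative sketch (not part of the original source) of the usual
 * pairing in a caller that modifies a vnode, with vattr and cred
 * prepared by the caller:
 *
 *	error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
 *	if (error != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_SETATTR(vp, &vattr, cred);
 *	VOP_UNLOCK(vp);
 *	vn_finished_write(mp);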
*/ void vn_finished_write(struct mount *mp) { struct mount_pcpu *mpcpu; int c; if (mp == NULL) return; if (vfs_op_thread_enter(mp, mpcpu)) { vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1); vfs_mp_count_sub_pcpu(mpcpu, ref, 1); vfs_op_thread_exit(mp, mpcpu); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_writeopcount; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); } /* * Filesystem secondary write operation has completed. If we are * suspending and this operation is the last one, notify the suspender * that the suspension is now in effect. */ void vn_finished_secondary_write(struct mount *mp) { if (mp == NULL) return; MNT_ILOCK(mp); MNT_REL(mp); mp->mnt_secondary_writes--; if (mp->mnt_secondary_writes < 0) panic("vn_finished_secondary_write: neg cnt"); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && mp->mnt_secondary_writes <= 0) wakeup(&mp->mnt_secondary_writes); MNT_IUNLOCK(mp); } /* * Request a filesystem to suspend write operations. */ int vfs_write_suspend(struct mount *mp, int flags) { int error; vfs_op_enter(mp); MNT_ILOCK(mp); vfs_assert_mount_counters(mp); if (mp->mnt_susp_owner == curthread) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); return (EALREADY); } while (mp->mnt_kern_flag & MNTK_SUSPEND) msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); /* * Unmount holds a write reference on the mount point. If we * own busy reference and drain for writers, we deadlock with * the reference draining in the unmount path. Callers of * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if * vfs_busy() reference is owned and caller is not in the * unmount context. */ if ((flags & VS_SKIP_UNMOUNT) != 0 && (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); return (EBUSY); } mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = curthread; if (mp->mnt_writeopcount > 0) (void) msleep(&mp->mnt_writeopcount, MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); else MNT_IUNLOCK(mp); if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) { vfs_write_resume(mp, 0); /* vfs_write_resume does vfs_op_exit() for us */ } return (error); } /* * Request a filesystem to resume write operations. */ void vfs_write_resume(struct mount *mp, int flags) { MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) { KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner")); mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 | MNTK_SUSPENDED); mp->mnt_susp_owner = NULL; wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); curthread->td_pflags &= ~TDP_IGNSUSP; if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); mp->mnt_writeopcount++; } MNT_IUNLOCK(mp); if ((flags & VR_NO_SUSPCLR) == 0) VFS_SUSP_CLEAN(mp); vfs_op_exit(mp); } else if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); vn_start_write_refed(mp, 0, true); } else { MNT_IUNLOCK(mp); } } /* * Helper loop around vfs_write_suspend() for filesystem unmount VFS * methods. */ int vfs_write_suspend_umnt(struct mount *mp) { int error; KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0, ("vfs_write_suspend_umnt: recursed")); /* dounmount() already called vn_start_write(). 
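 * That write count has to be dropped before each suspension attempt
 * (otherwise vfs_write_suspend() would wait for our own write to
 * drain) and re-acquired whenever the attempt fails or has to be
 * retried, which is what the loop below does.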
*/ for (;;) { vn_finished_write(mp); error = vfs_write_suspend(mp, 0); if (error != 0) { vn_start_write(NULL, &mp, V_WAIT); return (error); } MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0) break; MNT_IUNLOCK(mp); vn_start_write(NULL, &mp, V_WAIT); } mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); curthread->td_pflags |= TDP_IGNSUSP; return (0); } /* * Implement kqueues for files by translating it to vnode operation. */ static int vn_kqfilter(struct file *fp, struct knote *kn) { return (VOP_KQFILTER(fp->f_vnode, kn)); } int vn_kqfilter_opath(struct file *fp, struct knote *kn) { if ((fp->f_flag & FKQALLOWED) == 0) return (EBADF); return (vn_kqfilter(fp, kn)); } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_SHARED | LK_RETRY); ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? */ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held"); /* authorize attribute removal as kernel */ error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td); if (error == EOPNOTSUPP) error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp); } return (error); } static int vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp)); } int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp) { return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino, lkflags, rvp)); } int vn_vget_ino_gen(struct vnode *vp, 
vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp) { struct mount *mp; int ltype, error; ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get"); mp = vp->v_mount; ltype = VOP_ISLOCKED(vp); KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED, ("vn_vget_ino: vp not locked")); error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp); error = vfs_busy(mp, 0); vn_lock(vp, ltype | LK_RETRY); vfs_rel(mp); if (error != 0) return (ENOENT); if (VN_IS_DOOMED(vp)) { vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp); error = alloc(mp, alloc_arg, lkflags, rvp); vfs_unbusy(mp); if (error != 0 || *rvp != vp) vn_lock(vp, ltype | LK_RETRY); if (VN_IS_DOOMED(vp)) { if (error == 0) { if (*rvp == vp) vunref(vp); else vput(*rvp); } error = ENOENT; } return (error); } static void vn_send_sigxfsz(struct proc *p) { PROC_LOCK(p); kern_psignal(p, SIGXFSZ); PROC_UNLOCK(p); } int vn_rlimit_trunc(u_quad_t size, struct thread *td) { if (size <= lim_cur(td, RLIMIT_FSIZE)) return (0); vn_send_sigxfsz(td->td_proc); return (EFBIG); } static int vn_rlimit_fsizex1(const struct vnode *vp, struct uio *uio, off_t maxfsz, bool adj, struct thread *td) { off_t lim; bool ktr_write; if (vp->v_type != VREG) return (0); /* * Handle file system maximum file size. */ if (maxfsz != 0 && uio->uio_offset + uio->uio_resid > maxfsz) { if (!adj || uio->uio_offset >= maxfsz) return (EFBIG); uio->uio_resid = maxfsz - uio->uio_offset; } /* * This is a kernel write (e.g. vnode_pager) or an accounting * write; ignore the limit. */ if (td == NULL || (td->td_pflags2 & TDP2_ACCT) != 0) return (0); /* * Calculate file size limit. */ ktr_write = (td->td_pflags & TDP_INKTRACE) != 0; lim = __predict_false(ktr_write) ? td->td_ktr_io_lim : lim_cur(td, RLIMIT_FSIZE); /* * Is the limit reached? */ if (__predict_true((uoff_t)uio->uio_offset + uio->uio_resid <= lim)) return (0); /* * Prepared filesystems can handle writes truncated to the * file size limit. */ if (adj && (uoff_t)uio->uio_offset < lim) { uio->uio_resid = lim - (uoff_t)uio->uio_offset; return (0); } if (!ktr_write || ktr_filesize_limit_signal) vn_send_sigxfsz(td->td_proc); return (EFBIG); } /* * Helper for VOP_WRITE() implementations, the common code to * handle maximum supported file size on the filesystem, and * RLIMIT_FSIZE, except for special writes from accounting subsystem * and ktrace. * * For maximum file size (maxfsz argument): * - return EFBIG if uio_offset is beyond it * - otherwise, clamp uio_resid if write would extend file beyond maxfsz. * * For RLIMIT_FSIZE: * - return EFBIG and send SIGXFSZ if uio_offset is beyond the limit * - otherwise, clamp uio_resid if write would extend file beyond limit. * * If clamping occurred, the adjustment for uio_resid is stored in * *resid_adj, to be re-applied by vn_rlimit_fsizex_res() on return * from the VOP.
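 *
 * As an illustrative sketch only (not something this interface requires),
 * a VOP_WRITE() implementation would typically bracket its write with the
 * pair of calls, so that a clamped uio_resid is restored for the caller:
 *
 *	error = vn_rlimit_fsizex(vp, uio, fs_maxfilesize, &resid_adj, td);
 *	if (error != 0)
 *		return (error);
 *	(do the actual write with the possibly clamped uio)
 *	vn_rlimit_fsizex_res(uio, resid_adj);
 *
 * Here fs_maxfilesize is a placeholder for whatever per-filesystem limit
 * the caller has, not a name defined in this file.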
*/ int vn_rlimit_fsizex(const struct vnode *vp, struct uio *uio, off_t maxfsz, ssize_t *resid_adj, struct thread *td) { ssize_t resid_orig; int error; bool adj; resid_orig = uio->uio_resid; adj = resid_adj != NULL; error = vn_rlimit_fsizex1(vp, uio, maxfsz, adj, td); if (adj) *resid_adj = resid_orig - uio->uio_resid; return (error); } void vn_rlimit_fsizex_res(struct uio *uio, ssize_t resid_adj) { uio->uio_resid += resid_adj; } int vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, struct thread *td) { return (vn_rlimit_fsizex(vp, __DECONST(struct uio *, uio), 0, NULL, td)); } int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); #endif return (setfmode(td, active_cred, vp, mode)); } int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct vnode *vp; vp = fp->f_vnode; #ifdef AUDIT vn_lock(vp, LK_SHARED | LK_RETRY); AUDIT_ARG_VNODE1(vp); VOP_UNLOCK(vp); #endif return (setfown(td, active_cred, vp, uid, gid)); } /* * Remove pages in the range ["start", "end") from the vnode's VM object. If * "end" is 0, then the range extends to the end of the object. */ void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) { vm_object_t object; if ((object = vp->v_object) == NULL) return; VM_OBJECT_WLOCK(object); vm_object_page_remove(object, start, end, 0); VM_OBJECT_WUNLOCK(object); } /* * Like vn_pages_remove(), but skips invalid pages, which by definition are not * mapped into any process' address space. Filesystems may use this in * preference to vn_pages_remove() to avoid blocking on pages busied in * preparation for a VOP_GETPAGES. */ void vn_pages_remove_valid(struct vnode *vp, vm_pindex_t start, vm_pindex_t end) { vm_object_t object; if ((object = vp->v_object) == NULL) return; VM_OBJECT_WLOCK(object); vm_object_page_remove(object, start, end, OBJPR_VALIDONLY); VM_OBJECT_WUNLOCK(object); } int vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) { off_t size; daddr_t bn, bnp; uint64_t bsize; off_t noff; int error; KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, ("%s: Wrong command %lu", __func__, cmd)); ASSERT_VOP_ELOCKED(vp, "vn_bmap_seekhole_locked"); if (vp->v_type != VREG) { error = ENOTTY; goto out; } error = vn_getsize_locked(vp, &size, cred); if (error != 0) goto out; noff = *off; if (noff < 0 || noff >= size) { error = ENXIO; goto out; } /* See the comment in ufs_bmap_seekdata(). */ vnode_pager_clean_sync(vp); bsize = vp->v_mount->mnt_stat.f_iosize; for (bn = noff / bsize; noff < size; bn++, noff += bsize - noff % bsize) { error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); if (error == EOPNOTSUPP) { error = ENOTTY; goto out; } if ((bnp == -1 && cmd == FIOSEEKHOLE) || (bnp != -1 && cmd == FIOSEEKDATA)) { noff = bn * bsize; if (noff < *off) noff = *off; goto out; } } if (noff > size) noff = size; /* noff == size. There is an implicit hole at the end of file. 
*/ if (cmd == FIOSEEKDATA) error = ENXIO; out: if (error == 0) *off = noff; return (error); } int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) { int error; KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, ("%s: Wrong command %lu", __func__, cmd)); if (vn_lock(vp, LK_EXCLUSIVE) != 0) return (EBADF); error = vn_bmap_seekhole_locked(vp, cmd, off, cred); VOP_UNLOCK(vp); return (error); } int vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct ucred *cred; struct vnode *vp; off_t foffset, fsize, size; int error, noneg; cred = td->td_ucred; vp = fp->f_vnode; noneg = (vp->v_type != VCHR); /* * Try to dodge locking for common case of querying the offset. */ if (whence == L_INCR && offset == 0) { foffset = foffset_read(fp); if (__predict_false(foffset < 0 && noneg)) { return (EOVERFLOW); } td->td_uretoff.tdu_off = foffset; return (0); } foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (noneg && (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset))) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: error = vn_getsize(vp, &fsize, cred); if (error != 0) break; /* * If the file references a disk device, then fetch * the media size and use that to determine the ending * offset. */ if (fsize == 0 && vp->v_type == VCHR && fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0) fsize = size; if (noneg && offset > 0 && fsize > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += fsize; break; case L_SET: break; case SEEK_DATA: error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td); if (error == ENOTTY) error = EINVAL; break; case SEEK_HOLE: error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td); if (error == ENOTTY) error = EINVAL; break; default: error = EINVAL; } if (error == 0 && noneg && offset < 0) error = EINVAL; if (error != 0) goto drop; VFS_KNOTE_UNLOCKED(vp, 0); td->td_uretoff.tdu_off = offset; drop: foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0); return (error); } int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td) { int error; /* * Grant permission if the caller is the owner of the file, or * the super-user, or has ACL_WRITE_ATTRIBUTES permission on * on the file. If the time pointer is null, then write * permission on the file is also sufficient. * * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES * will be allowed to set the times [..] to the current * server time. */ error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) error = VOP_ACCESS(vp, VWRITE, cred, td); return (error); } int vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct vnode *vp; int error; if (fp->f_type == DTYPE_FIFO) kif->kf_type = KF_TYPE_FIFO; else kif->kf_type = KF_TYPE_VNODE; vp = fp->f_vnode; vref(vp); FILEDESC_SUNLOCK(fdp); error = vn_fill_kinfo_vnode(vp, kif); vrele(vp); FILEDESC_SLOCK(fdp); return (error); } static inline void vn_fill_junk(struct kinfo_file *kif) { size_t len, olen; /* * Simulate vn_fullpath returning changing values for a given * vp during e.g. coredump. 
*/ len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1; olen = strlen(kif->kf_path); if (len < olen) strcpy(&kif->kf_path[len - 1], "$"); else for (; olen < len; olen++) strcpy(&kif->kf_path[olen], "A"); } int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif) { struct vattr va; char *fullpath, *freepath; int error; kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type); freepath = NULL; fullpath = "-"; error = vn_fullpath(vp, &fullpath, &freepath); if (error == 0) { strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path)); } if (freepath != NULL) free(freepath, M_TEMP); KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path, vn_fill_junk(kif); ); /* * Retrieve vnode attributes. */ va.va_fsid = VNOVAL; va.va_rdev = NODEV; vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &va, curthread->td_ucred); VOP_UNLOCK(vp); if (error != 0) return (error); if (va.va_fsid != VNOVAL) kif->kf_un.kf_file.kf_file_fsid = va.va_fsid; else kif->kf_un.kf_file.kf_file_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; kif->kf_un.kf_file.kf_file_fsid_freebsd11 = kif->kf_un.kf_file.kf_file_fsid; /* truncate */ kif->kf_un.kf_file.kf_file_fileid = va.va_fileid; kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode); kif->kf_un.kf_file.kf_file_size = va.va_size; kif->kf_un.kf_file.kf_file_rdev = va.va_rdev; kif->kf_un.kf_file.kf_file_rdev_freebsd11 = kif->kf_un.kf_file.kf_file_rdev; /* truncate */ kif->kf_un.kf_file.kf_file_nlink = va.va_nlink; return (0); } int vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { #ifdef HWPMC_HOOKS struct pmckern_map_in pkm; #endif struct mount *mp; struct vnode *vp; vm_object_t object; vm_prot_t maxprot; boolean_t writecounted; int error; #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support * read(2)/write(2) -- or even open(2). Thus, we can * use MAP_ASYNC to trade on-disk coherence for speed. * The shm_open(3) library routine turns on the FPOSIXSHM * flag to request this behavior. */ if ((fp->f_flag & FPOSIXSHM) != 0) flags |= MAP_NOSYNC; #endif vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) { maxprot = VM_PROT_NONE; if ((prot & VM_PROT_EXECUTE) != 0) return (EACCES); } else maxprot = VM_PROT_EXECUTE; if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_READ; else if ((prot & VM_PROT_READ) != 0) return (EACCES); /* * If we are sharing potential changes via MAP_SHARED and we * are trying to get write permission although we opened it * without asking for it, bail out. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; else if ((prot & VM_PROT_WRITE) != 0) return (EACCES); } else { maxprot |= VM_PROT_WRITE; cap_maxprot |= VM_PROT_WRITE; } maxprot &= cap_maxprot; /* * For regular files and shared memory, POSIX requires that * the value of foff be a legitimate offset within the data * object. In particular, negative offsets are invalid. 
* Blocking negative offsets and overflows here avoids * possible wraparound or user-level access into reserved * ranges of the data object later. In contrast, POSIX does * not dictate how offsets are used by device drivers, so in * the case of a device mapping a negative offset is passed * on. */ if ( #ifdef _LP64 size > OFF_MAX || #endif foff > OFF_MAX - size) return (EINVAL); writecounted = FALSE; error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, &foff, &object, &writecounted); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, writecounted, td); if (error != 0) { /* * If this mapping was accounted for in the vnode's * writecount, then undo that now. */ if (writecounted) vm_pager_release_writecount(object, 0, size); vm_object_deallocate(object); } #ifdef HWPMC_HOOKS /* Inform hwpmc(4) if an executable is being mapped. */ if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) { if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) { pkm.pm_file = vp; pkm.pm_address = (uintptr_t) *addr; PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm); } } #endif return (error); } void vn_fsid(struct vnode *vp, struct vattr *va) { fsid_t *f; f = &vp->v_mount->mnt_stat.f_fsid; va->va_fsid = (uint32_t)f->val[1]; va->va_fsid <<= sizeof(f->val[1]) * NBBY; va->va_fsid += (uint32_t)f->val[0]; } int vn_fsync_buf(struct vnode *vp, int waitfor) { struct buf *bp, *nbp; struct bufobj *bo; struct mount *mp; int error, maxretry; error = 0; maxretry = 10000; /* large, arbitrarily chosen */ mp = NULL; if (vp->v_type == VCHR) { VI_LOCK(vp); mp = vp->v_rdev->si_mountpt; VI_UNLOCK(vp); } bo = &vp->v_bufobj; BO_LOCK(bo); loop1: /* * MARK/SCAN initialization to avoid infinite loops. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { bp->b_vflags &= ~BV_SCANNED; bp->b_error = 0; } /* * Flush all dirty buffers associated with a vnode. */ loop2: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) { if (waitfor != MNT_WAIT) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL, BO_LOCKPTR(bo)) != 0) { BO_LOCK(bo); goto loop1; } BO_LOCK(bo); } BO_UNLOCK(bo); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if ((bp->b_flags & B_DELWRI) == 0) panic("fsync: not dirty"); if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) { vfs_bio_awrite(bp); } else { bremfree(bp); bawrite(bp); } if (maxretry < 1000) pause("dirty", hz < 1000 ? 1 : hz / 1000); BO_LOCK(bo); goto loop2; } /* * If synchronous the caller expects us to completely resolve all * dirty buffers in the system. Wait for in-progress I/O to * complete (which could include background bitmap writes), then * retry if dirty blocks still exist. */ if (waitfor == MNT_WAIT) { bufobj_wwait(bo, 0, 0); if (bo->bo_dirty.bv_cnt > 0) { /* * If we are unable to write any of these buffers * then we fail now rather than trying endlessly * to write them out. */ TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) if ((error = bp->b_error) != 0) break; if ((mp != NULL && mp->mnt_secondary_writes > 0) || (error == 0 && --maxretry >= 0)) goto loop1; if (error == 0) error = EAGAIN; } } BO_UNLOCK(bo); if (error != 0) vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error); return (error); } /* * Copies a byte range from invp to outvp. 
Calls VOP_COPY_FILE_RANGE() * or vn_generic_copy_file_range() after rangelocking the byte ranges, * to do the actual copy. * vn_generic_copy_file_range() is factored out, so it can be called * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from * different file systems. */ int vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) { struct mount *inmp, *outmp; struct vnode *invpl, *outvpl; int error; size_t len; uint64_t uval; invpl = outvpl = NULL; len = *lenp; *lenp = 0; /* For error returns. */ error = 0; /* Do some sanity checks on the arguments. */ if (invp->v_type == VDIR || outvp->v_type == VDIR) error = EISDIR; else if (*inoffp < 0 || *outoffp < 0 || invp->v_type != VREG || outvp->v_type != VREG) error = EINVAL; if (error != 0) goto out; /* Ensure offset + len does not wrap around. */ uval = *inoffp; uval += len; if (uval > INT64_MAX) len = INT64_MAX - *inoffp; uval = *outoffp; uval += len; if (uval > INT64_MAX) len = INT64_MAX - *outoffp; if (len == 0) goto out; error = VOP_GETLOWVNODE(invp, &invpl, FREAD); if (error != 0) goto out; error = VOP_GETLOWVNODE(outvp, &outvpl, FWRITE); if (error != 0) goto out1; inmp = invpl->v_mount; outmp = outvpl->v_mount; if (inmp == NULL || outmp == NULL) goto out2; for (;;) { error = vfs_busy(inmp, 0); if (error != 0) goto out2; if (inmp == outmp) break; error = vfs_busy(outmp, MBF_NOWAIT); if (error != 0) { vfs_unbusy(inmp); error = vfs_busy(outmp, 0); if (error == 0) { vfs_unbusy(outmp); continue; } goto out2; } break; } /* * If the two vnodes are for the same file system type, call * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range() * which can handle copies across multiple file system types. */ *lenp = len; if (inmp == outmp || inmp->mnt_vfc == outmp->mnt_vfc) error = VOP_COPY_FILE_RANGE(invpl, inoffp, outvpl, outoffp, lenp, flags, incred, outcred, fsize_td); else error = ENOSYS; if (error == ENOSYS) error = vn_generic_copy_file_range(invpl, inoffp, outvpl, outoffp, lenp, flags, incred, outcred, fsize_td); vfs_unbusy(outmp); if (inmp != outmp) vfs_unbusy(inmp); out2: if (outvpl != NULL) vrele(outvpl); out1: if (invpl != NULL) vrele(invpl); out: return (error); } /* * Test len bytes of data starting at dat for all bytes == 0. * Return true if all bytes are zero, false otherwise. * Expects dat to be well aligned. */ static bool mem_iszero(void *dat, int len) { int i; const u_int *p; const char *cp; for (p = dat; len > 0; len -= sizeof(*p), p++) { if (len >= sizeof(*p)) { if (*p != 0) return (false); } else { cp = (const char *)p; for (i = 0; i < len; i++, cp++) if (*cp != '\0') return (false); } } return (true); } /* * Look for a hole in the output file and, if found, adjust *outoffp * and *xferp to skip past the hole. * *xferp is the entire hole length to be written and xfer2 is how many bytes * to be written as 0's upon return. 
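 *
 * A worked example with made-up numbers may help: suppose *outoffp is 100K,
 * *xferp is 1M and xfer2 is 64K.  If the output file already has a hole
 * covering [100K, 228K), this routine advances *outoffp to 228K and trims
 * *xferp to 896K, since nothing has to be written over an existing hole.
 * If the next hole then starts at 260K, xfer2 is cut down to 32K so that
 * the pending write of zero bytes ends where that hole begins.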
*/ static off_t vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp, off_t *dataoffp, off_t *holeoffp, struct ucred *cred) { int error; off_t delta; if (*holeoffp == 0 || *holeoffp <= *outoffp) { *dataoffp = *outoffp; error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred, curthread); if (error == 0) { *holeoffp = *dataoffp; error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred, curthread); } if (error != 0 || *holeoffp == *dataoffp) { /* * Since outvp is unlocked, it may be possible for * another thread to do a truncate(), lseek(), write() * creating a hole at startoff between the above * VOP_IOCTL() calls, if the other thread does not do * rangelocking. * If that happens, *holeoffp == *dataoffp and finding * the hole has failed, so disable vn_skip_hole(). */ *holeoffp = -1; /* Disable use of vn_skip_hole(). */ return (xfer2); } KASSERT(*dataoffp >= *outoffp, ("vn_skip_hole: dataoff=%jd < outoff=%jd", (intmax_t)*dataoffp, (intmax_t)*outoffp)); KASSERT(*holeoffp > *dataoffp, ("vn_skip_hole: holeoff=%jd <= dataoff=%jd", (intmax_t)*holeoffp, (intmax_t)*dataoffp)); } /* * If there is a hole before the data starts, advance *outoffp and * *xferp past the hole. */ if (*dataoffp > *outoffp) { delta = *dataoffp - *outoffp; if (delta >= *xferp) { /* Entire *xferp is a hole. */ *outoffp += *xferp; *xferp = 0; return (0); } *xferp -= delta; *outoffp += delta; xfer2 = MIN(xfer2, *xferp); } /* * If a hole starts before the end of this xfer2, reduce this xfer2 so * that the write ends at the start of the hole. * *holeoffp should always be greater than *outoffp, but for the * non-INVARIANTS case, check this to make sure xfer2 remains a sane * value. */ if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2) xfer2 = *holeoffp - *outoffp; return (xfer2); } /* * Write an xfer sized chunk to outvp in blksize blocks from dat. * dat is a maximum of blksize in length and can be written repeatedly in * the chunk. * If growfile == true, just grow the file via vn_truncate_locked() instead * of doing actual writes. * If checkhole == true, a hole is being punched, so skip over any hole * already in the output file. */ static int vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer, u_long blksize, bool growfile, bool checkhole, struct ucred *cred) { struct mount *mp; off_t dataoff, holeoff, xfer2; int error; /* * Loop around doing writes of blksize until write has been completed. * Lock/unlock on each loop iteration so that a bwillwrite() can be * done for each iteration, since the xfer argument can be very * large if there is a large hole to punch in the output file. */ error = 0; holeoff = 0; do { xfer2 = MIN(xfer, blksize); if (checkhole) { /* * Punching a hole. Skip writing if there is * already a hole in the output file. 
*/ xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer, &dataoff, &holeoff, cred); if (xfer == 0) break; if (holeoff < 0) checkhole = false; KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd", (intmax_t)xfer2)); } bwillwrite(); mp = NULL; error = vn_start_write(outvp, &mp, V_WAIT); if (error != 0) break; if (growfile) { error = vn_lock(outvp, LK_EXCLUSIVE); if (error == 0) { error = vn_truncate_locked(outvp, outoff + xfer, false, cred); VOP_UNLOCK(outvp); } } else { error = vn_lock(outvp, vn_lktype_write(mp, outvp)); if (error == 0) { error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2, outoff, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, cred, NULL, curthread); outoff += xfer2; xfer -= xfer2; VOP_UNLOCK(outvp); } } if (mp != NULL) vn_finished_write(mp); } while (!growfile && xfer > 0 && error == 0); return (error); } /* * Copy a byte range of one file to another. This function can handle the * case where invp and outvp are on different file systems. * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there * is no better file system specific way to do it. */ int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) { struct vattr inva; struct mount *mp; off_t startoff, endoff, xfer, xfer2; u_long blksize; int error, interrupted; bool cantseek, readzeros, eof, first, lastblock, holetoeof, sparse; ssize_t aresid, r = 0; size_t copylen, len, savlen; off_t outsize; char *dat; long holein, holeout; struct timespec curts, endts; holein = holeout = 0; savlen = len = *lenp; error = 0; interrupted = 0; dat = NULL; error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0) holein = 0; error = VOP_GETATTR(invp, &inva, incred); if (error == 0 && inva.va_size > OFF_MAX) error = EFBIG; VOP_UNLOCK(invp); if (error != 0) goto out; /* * Use va_bytes >= va_size as a hint that the file does not have * sufficient holes to justify the overhead of doing FIOSEEKHOLE. * This hint does not work well for file systems doing compression * and may fail when allocations for extended attributes increases * the value of va_bytes to >= va_size. */ sparse = true; if (holein != 0 && inva.va_bytes >= inva.va_size) { holein = 0; sparse = false; } mp = NULL; error = vn_start_write(outvp, &mp, V_WAIT); if (error == 0) error = vn_lock(outvp, LK_EXCLUSIVE); if (error == 0) { /* * If fsize_td != NULL, do a vn_rlimit_fsizex() call, * now that outvp is locked. */ if (fsize_td != NULL) { struct uio io; io.uio_offset = *outoffp; io.uio_resid = len; error = vn_rlimit_fsizex(outvp, &io, 0, &r, fsize_td); len = savlen = io.uio_resid; /* * No need to call vn_rlimit_fsizex_res before return, * since the uio is local. */ } if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0) holeout = 0; /* * Holes that are past EOF do not need to be written as a block * of zero bytes. So, truncate the output file as far as * possible and then use size to decide if writing 0 * bytes is necessary in the loop below. 
*/ if (error == 0) error = vn_getsize_locked(outvp, &outsize, outcred); if (error == 0 && outsize > *outoffp && *outoffp <= OFF_MAX - len && outsize <= *outoffp + len && *inoffp < inva.va_size && *outoffp <= OFF_MAX - (inva.va_size - *inoffp) && outsize <= *outoffp + (inva.va_size - *inoffp)) { #ifdef MAC error = mac_vnode_check_write(curthread->td_ucred, outcred, outvp); if (error == 0) #endif error = vn_truncate_locked(outvp, *outoffp, false, outcred); if (error == 0) outsize = *outoffp; } VOP_UNLOCK(outvp); } if (mp != NULL) vn_finished_write(mp); if (error != 0) goto out; if (sparse && holein == 0 && holeout > 0) { /* * For this special case, the input data will be scanned * for blocks of all 0 bytes. For these blocks, the * write can be skipped for the output file to create * an unallocated region. * Therefore, use the appropriate size for the output file. */ blksize = holeout; if (blksize <= 512) { /* * Use f_iosize, since ZFS reports a _PC_MIN_HOLE_SIZE * of 512, although it actually only creates * unallocated regions for blocks >= f_iosize. */ blksize = outvp->v_mount->mnt_stat.f_iosize; } } else { /* * Use the larger of the two f_iosize values. If they are * not the same size, one will normally be an exact multiple of * the other, since they are both likely to be a power of 2. */ blksize = MAX(invp->v_mount->mnt_stat.f_iosize, outvp->v_mount->mnt_stat.f_iosize); } /* Clip to sane limits. */ if (blksize < 4096) blksize = 4096; else if (blksize > maxphys) blksize = maxphys; dat = malloc(blksize, M_TEMP, M_WAITOK); /* * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA * to find holes. Otherwise, just scan the read block for all 0s * in the inner loop where the data copying is done. * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may * support holes on the server, but do not support FIOSEEKHOLE. * The kernel flag COPY_FILE_RANGE_TIMEO1SEC is used to indicate * that this function should return after 1second with a partial * completion. */ if ((flags & COPY_FILE_RANGE_TIMEO1SEC) != 0) { getnanouptime(&endts); endts.tv_sec++; } else timespecclear(&endts); first = true; holetoeof = eof = false; while (len > 0 && error == 0 && !eof && interrupted == 0) { endoff = 0; /* To shut up compilers. */ cantseek = true; startoff = *inoffp; copylen = len; /* * Find the next data area. If there is just a hole to EOF, * FIOSEEKDATA should fail with ENXIO. * (I do not know if any file system will report a hole to * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA * will fail for those file systems.) * * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE, * the code just falls through to the inner copy loop. */ error = EINVAL; if (holein > 0) { error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0, incred, curthread); if (error == ENXIO) { startoff = endoff = inva.va_size; eof = holetoeof = true; error = 0; } } if (error == 0 && !holetoeof) { endoff = startoff; error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0, incred, curthread); /* * Since invp is unlocked, it may be possible for * another thread to do a truncate(), lseek(), write() * creating a hole at startoff between the above * VOP_IOCTL() calls, if the other thread does not do * rangelocking. * If that happens, startoff == endoff and finding * the hole has failed, so set an error. */ if (error == 0 && startoff == endoff) error = EINVAL; /* Any error. Reset to 0. */ } if (error == 0) { if (startoff > *inoffp) { /* Found hole before data block. 
*/ xfer = MIN(startoff - *inoffp, len); if (*outoffp < outsize) { /* Must write 0s to punch hole. */ xfer2 = MIN(outsize - *outoffp, xfer); memset(dat, 0, MIN(xfer2, blksize)); error = vn_write_outvp(outvp, dat, *outoffp, xfer2, blksize, false, holeout > 0, outcred); } if (error == 0 && *outoffp + xfer > outsize && (xfer == len || holetoeof)) { /* Grow output file (hole at end). */ error = vn_write_outvp(outvp, dat, *outoffp, xfer, blksize, true, false, outcred); } if (error == 0) { *inoffp += xfer; *outoffp += xfer; len -= xfer; if (len < savlen) { interrupted = sig_intr(); if (timespecisset(&endts) && interrupted == 0) { getnanouptime(&curts); if (timespeccmp(&curts, &endts, >=)) interrupted = EINTR; } } } } copylen = MIN(len, endoff - startoff); cantseek = false; } else { cantseek = true; if (!sparse) cantseek = false; startoff = *inoffp; copylen = len; error = 0; } xfer = blksize; if (cantseek) { /* * Set first xfer to end at a block boundary, so that * holes are more likely detected in the loop below via * the for all bytes 0 method. */ xfer -= (*inoffp % blksize); } /* * Loop copying the data block. If this was our first attempt * to copy anything, allow a zero-length block so that the VOPs * get a chance to update metadata, specifically the atime. */ while (error == 0 && ((copylen > 0 && !eof) || first) && interrupted == 0) { if (copylen < xfer) xfer = copylen; first = false; error = vn_lock(invp, LK_SHARED); if (error != 0) goto out; error = vn_rdwr(UIO_READ, invp, dat, xfer, startoff, UIO_SYSSPACE, IO_NODELOCKED, curthread->td_ucred, incred, &aresid, curthread); VOP_UNLOCK(invp); lastblock = false; if (error == 0 && (xfer == 0 || aresid > 0)) { /* Stop the copy at EOF on the input file. */ xfer -= aresid; eof = true; lastblock = true; } if (error == 0) { /* * Skip the write for holes past the initial EOF * of the output file, unless this is the last * write of the output file at EOF. */ readzeros = cantseek ? mem_iszero(dat, xfer) : false; if (xfer == len) lastblock = true; if (!cantseek || *outoffp < outsize || lastblock || !readzeros) error = vn_write_outvp(outvp, dat, *outoffp, xfer, blksize, readzeros && lastblock && *outoffp >= outsize, false, outcred); if (error == 0) { *inoffp += xfer; startoff += xfer; *outoffp += xfer; copylen -= xfer; len -= xfer; if (len < savlen) { interrupted = sig_intr(); if (timespecisset(&endts) && interrupted == 0) { getnanouptime(&curts); if (timespeccmp(&curts, &endts, >=)) interrupted = EINTR; } } } } xfer = blksize; } } out: *lenp = savlen - len; free(dat, M_TEMP); return (error); } static int vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) { struct mount *mp; struct vnode *vp; off_t olen, ooffset; int error; #ifdef AUDIT int audited_vnode1 = 0; #endif vp = fp->f_vnode; if (vp->v_type != VREG) return (ENODEV); /* Allocating blocks may take a long time, so iterate. 
for (;;) { olen = len; ooffset = offset; bwillwrite(); mp = NULL; error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH); if (error != 0) break; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) { vn_finished_write(mp); break; } #ifdef AUDIT if (!audited_vnode1) { AUDIT_ARG_VNODE1(vp); audited_vnode1 = 1; } #endif #ifdef MAC error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp); if (error == 0) #endif error = VOP_ALLOCATE(vp, &offset, &len, 0, td->td_ucred); VOP_UNLOCK(vp); vn_finished_write(mp); if (olen + ooffset != offset + len) { panic("offset + len changed from %jx/%jx to %jx/%jx", ooffset, olen, offset, len); } if (error != 0 || len == 0) break; KASSERT(olen > len, ("Iteration did not make progress?")); maybe_yield(); } return (error); } static int vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags, int ioflag, struct ucred *cred, struct ucred *active_cred, struct ucred *file_cred) { struct mount *mp; void *rl_cookie; off_t off, len; int error; #ifdef AUDIT bool audited_vnode1 = false; #endif rl_cookie = NULL; error = 0; mp = NULL; off = *offset; len = *length; if ((ioflag & (IO_NODELOCKED | IO_RANGELOCKED)) == 0) rl_cookie = vn_rangelock_wlock(vp, off, off + len); while (len > 0 && error == 0) { /* * Try to deallocate the longest range in one pass. * If a pass takes too long to execute, it returns a * partial result. The residue will be processed in the next * pass. */ if ((ioflag & IO_NODELOCKED) == 0) { bwillwrite(); if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0) goto out; vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); } #ifdef AUDIT if (!audited_vnode1) { AUDIT_ARG_VNODE1(vp); audited_vnode1 = true; } #endif #ifdef MAC if ((ioflag & IO_NOMACCHECK) == 0) error = mac_vnode_check_write(active_cred, file_cred, vp); #endif if (error == 0) error = VOP_DEALLOCATE(vp, &off, &len, flags, ioflag, cred); if ((ioflag & IO_NODELOCKED) == 0) { VOP_UNLOCK(vp); if (mp != NULL) { vn_finished_write(mp); mp = NULL; } } if (error == 0 && len != 0) maybe_yield(); } out: if (rl_cookie != NULL) vn_rangelock_unlock(vp, rl_cookie); *offset = off; *length = len; return (error); } /* * This function is intended for situations where the deallocation * is not triggered by a user request. */ int vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags, int ioflag, struct ucred *active_cred, struct ucred *file_cred) { struct ucred *cred; if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset || flags != 0) return (EINVAL); if (vp->v_type != VREG) return (ENODEV); cred = file_cred != NOCRED ?
file_cred : active_cred; return (vn_deallocate_impl(vp, offset, length, flags, ioflag, cred, active_cred, file_cred)); } static int vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td) { int error; struct vnode *vp; int ioflag; KASSERT(cmd == SPACECTL_DEALLOC, ("vn_fspacectl: Invalid cmd")); KASSERT((flags & ~SPACECTL_F_SUPPORTED) == 0, ("vn_fspacectl: non-zero flags")); KASSERT(*offset >= 0 && *length > 0 && *length <= OFF_MAX - *offset, ("vn_fspacectl: offset/length overflow or underflow")); vp = fp->f_vnode; if (vp->v_type != VREG) return (ENODEV); ioflag = get_write_ioflag(fp); switch (cmd) { case SPACECTL_DEALLOC: error = vn_deallocate_impl(vp, offset, length, flags, ioflag, active_cred, active_cred, fp->f_cred); break; default: panic("vn_fspacectl: unknown cmd %d", cmd); } return (error); } /* * Keep this assert as long as sizeof(struct dirent) is used as the maximum * entry size. */ _Static_assert(_GENERIC_MAXDIRSIZ == sizeof(struct dirent), "'struct dirent' size must be a multiple of its alignment " "(see _GENERIC_DIRLEN())"); /* * Returns successive directory entries through some caller's provided buffer. * * This function automatically refills the provided buffer with calls to * VOP_READDIR() (after MAC permission checks). * * 'td' is used for credentials and passed to uiomove(). 'dirbuf' is the * caller's buffer to fill and 'dirbuflen' its allocated size. 'dirbuf' must * be properly aligned to access 'struct dirent' structures and 'dirbuflen' * must be greater than GENERIC_MAXDIRSIZ to avoid VOP_READDIR() returning * EINVAL (the latter is not a strong guarantee (yet); but EINVAL will always * be returned if this requirement is not verified). '*dpp' points to the * current directory entry in the buffer and '*len' contains the remaining * valid bytes in 'dirbuf' after 'dpp' (including the pointed entry). * * At first call (or when restarting the read), '*len' must have been set to 0, * '*off' to 0 (or any valid start offset) and '*eofflag' to 0. There are no * more entries as soon as '*len' is 0 after a call that returned 0. Calling * again this function after such a condition is considered an error and EINVAL * will be returned. Other possible error codes are those of VOP_READDIR(), * EINTEGRITY if the returned entries do not pass coherency tests, or EINVAL * (bad call). All errors are unrecoverable, i.e., the state ('*len', '*off' * and '*eofflag') must be re-initialized before a subsequent call. On error * or at end of directory, '*dpp' is reset to NULL. * * '*len', '*off' and '*eofflag' are internal state the caller should not * tamper with except as explained above. '*off' is the next directory offset * to read from to refill the buffer. '*eofflag' is set to 0 or 1 by the last * internal call to VOP_READDIR() that returned without error, indicating * whether it reached the end of the directory, and to 2 by this function after * all entries have been read. 
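 *
 * For illustration only, a typical caller therefore loops along these lines
 * (vn_dir_check_empty() below is the in-tree example of this pattern):
 *
 *	len = 0; off = 0; eofflag = 0;
 *	for (;;) {
 *		error = vn_dir_next_dirent(vp, td, dirbuf, dirbuflen,
 *		    &dp, &len, &off, &eofflag);
 *		if (error != 0 || len == 0)
 *			break;	(len == 0 means end of directory)
 *		(examine *dp)
 *	}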
*/ int vn_dir_next_dirent(struct vnode *vp, struct thread *td, char *dirbuf, size_t dirbuflen, struct dirent **dpp, size_t *len, off_t *off, int *eofflag) { struct dirent *dp = NULL; int reclen; int error; struct uio uio; struct iovec iov; ASSERT_VOP_LOCKED(vp, "vnode not locked"); VNASSERT(vp->v_type == VDIR, vp, ("vnode is not a directory")); MPASS2((uintptr_t)dirbuf < (uintptr_t)dirbuf + dirbuflen, "Address space overflow"); if (__predict_false(dirbuflen < GENERIC_MAXDIRSIZ)) { /* Don't take any chances in this case */ error = EINVAL; goto out; } if (*len != 0) { dp = *dpp; /* * The caller continued to call us after an error (we set dp to * NULL in a previous iteration). Bail out right now. */ if (__predict_false(dp == NULL)) return (EINVAL); MPASS(*len <= dirbuflen); MPASS2((uintptr_t)dirbuf <= (uintptr_t)dp && (uintptr_t)dp + *len <= (uintptr_t)dirbuf + dirbuflen, "Filled range not inside buffer"); reclen = dp->d_reclen; if (reclen >= *len) { /* End of buffer reached */ *len = 0; } else { dp = (struct dirent *)((char *)dp + reclen); *len -= reclen; } } if (*len == 0) { dp = NULL; /* Have to refill. */ switch (*eofflag) { case 0: break; case 1: /* Nothing more to read. */ *eofflag = 2; /* Remember the caller reached EOF. */ goto success; default: /* The caller didn't test for EOF. */ error = EINVAL; goto out; } iov.iov_base = dirbuf; iov.iov_len = dirbuflen; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = *off; uio.uio_resid = dirbuflen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag, NULL, NULL); if (error != 0) goto out; *len = dirbuflen - uio.uio_resid; *off = uio.uio_offset; if (*len == 0) { /* Sanity check on INVARIANTS. */ MPASS(*eofflag != 0); *eofflag = 1; goto success; } /* * Normalize the flag returned by VOP_READDIR(), since we use 2 * as a sentinel value. */ if (*eofflag != 0) *eofflag = 1; dp = (struct dirent *)dirbuf; } if (__predict_false(*len < GENERIC_MINDIRSIZ || dp->d_reclen < GENERIC_MINDIRSIZ)) { error = EINTEGRITY; dp = NULL; goto out; } success: error = 0; out: *dpp = dp; return (error); } /* * Checks whether a directory is empty or not. * * If the directory is empty, returns 0, and if it is not, ENOTEMPTY. Other * values are genuine errors preventing the check. */ int vn_dir_check_empty(struct vnode *vp) { struct thread *const td = curthread; char *dirbuf; size_t dirbuflen, len; off_t off; int eofflag, error; struct dirent *dp; struct vattr va; ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); VNPASS(vp->v_type == VDIR, vp); error = VOP_GETATTR(vp, &va, td->td_ucred); if (error != 0) return (error); dirbuflen = max(DEV_BSIZE, GENERIC_MAXDIRSIZ); if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = malloc(dirbuflen, M_TEMP, M_WAITOK); len = 0; off = 0; eofflag = 0; for (;;) { error = vn_dir_next_dirent(vp, td, dirbuf, dirbuflen, &dp, &len, &off, &eofflag); if (error != 0) goto end; if (len == 0) { /* EOF */ error = 0; goto end; } /* * Skip whiteouts. Unionfs operates on filesystems only and * not on hierarchies, so these whiteouts would be shadowed on * the system hierarchy but not for a union using the * filesystem of their directories as the upper layer. * Additionally, unionfs currently transparently exposes * union-specific metadata of its upper layer, meaning that * whiteouts can be seen through the union view in empty * directories. 
Taking into account these whiteouts would then * prevent mounting another filesystem on such effectively * empty directories. */ if (dp->d_type == DT_WHT) continue; /* * Any file in the directory which is not '.' or '..' indicates * the directory is not empty. */ switch (dp->d_namlen) { case 2: if (dp->d_name[1] != '.') { /* Can't be '..' (nor '.') */ error = ENOTEMPTY; goto end; } /* FALLTHROUGH */ case 1: if (dp->d_name[0] != '.') { /* Can't be '..' nor '.' */ error = ENOTEMPTY; goto end; } break; default: error = ENOTEMPTY; goto end; } } end: free(dirbuf, M_TEMP); return (error); } static u_long vn_lock_pair_pause_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD, &vn_lock_pair_pause_cnt, 0, "Count of vn_lock_pair deadlocks"); u_int vn_lock_pair_pause_max; SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW, &vn_lock_pair_pause_max, 0, "Max ticks for vn_lock_pair deadlock avoidance sleep"); static void vn_lock_pair_pause(const char *wmesg) { atomic_add_long(&vn_lock_pair_pause_cnt, 1); pause(wmesg, prng32_bounded(vn_lock_pair_pause_max)); } /* * Lock pair of (possibly same) vnodes vp1, vp2, avoiding lock order * reversal. vp1_locked indicates whether vp1 is locked; if not, vp1 * must be unlocked. Same for vp2 and vp2_locked. One of the vnodes * can be NULL. * * The function returns with both vnodes exclusively or shared locked, * according to corresponding lkflags, and guarantees that it does not * create lock order reversal with other threads during its execution. * Both vnodes could be unlocked temporary (and reclaimed). * * If requesting shared locking, locked vnode lock must not be recursed. * * Only one of LK_SHARED and LK_EXCLUSIVE must be specified. * LK_NODDLKTREAT can be optionally passed. * * If vp1 == vp2, only one, most exclusive, lock is obtained on it. */ void vn_lock_pair(struct vnode *vp1, bool vp1_locked, int lkflags1, struct vnode *vp2, bool vp2_locked, int lkflags2) { int error, locked1; MPASS((((lkflags1 & LK_SHARED) != 0) ^ ((lkflags1 & LK_EXCLUSIVE) != 0)) || (vp1 == NULL && lkflags1 == 0)); MPASS((lkflags1 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0); MPASS((((lkflags2 & LK_SHARED) != 0) ^ ((lkflags2 & LK_EXCLUSIVE) != 0)) || (vp2 == NULL && lkflags2 == 0)); MPASS((lkflags2 & ~(LK_SHARED | LK_EXCLUSIVE | LK_NODDLKTREAT)) == 0); if (vp1 == NULL && vp2 == NULL) return; if (vp1 == vp2) { MPASS(vp1_locked == vp2_locked); /* Select the most exclusive mode for lock. */ if ((lkflags1 & LK_TYPE_MASK) != (lkflags2 & LK_TYPE_MASK)) lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE; if (vp1_locked) { ASSERT_VOP_LOCKED(vp1, "vp1"); /* No need to relock if any lock is exclusive. 
*/ if ((vp1->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0) return; locked1 = VOP_ISLOCKED(vp1); if (((lkflags1 & LK_SHARED) != 0 && locked1 != LK_EXCLUSIVE) || ((lkflags1 & LK_EXCLUSIVE) != 0 && locked1 == LK_EXCLUSIVE)) return; VOP_UNLOCK(vp1); } ASSERT_VOP_UNLOCKED(vp1, "vp1"); vn_lock(vp1, lkflags1 | LK_RETRY); return; } if (vp1 != NULL) { if ((lkflags1 & LK_SHARED) != 0 && (vp1->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0) lkflags1 = (lkflags1 & ~LK_SHARED) | LK_EXCLUSIVE; if (vp1_locked && VOP_ISLOCKED(vp1) != LK_EXCLUSIVE) { ASSERT_VOP_LOCKED(vp1, "vp1"); if ((lkflags1 & LK_EXCLUSIVE) != 0) { VOP_UNLOCK(vp1); ASSERT_VOP_UNLOCKED(vp1, "vp1 shared recursed"); vp1_locked = false; } } else if (!vp1_locked) ASSERT_VOP_UNLOCKED(vp1, "vp1"); } else { vp1_locked = true; } if (vp2 != NULL) { if ((lkflags2 & LK_SHARED) != 0 && (vp2->v_vnlock->lock_object.lo_flags & LK_NOSHARE) != 0) lkflags2 = (lkflags2 & ~LK_SHARED) | LK_EXCLUSIVE; if (vp2_locked && VOP_ISLOCKED(vp2) != LK_EXCLUSIVE) { ASSERT_VOP_LOCKED(vp2, "vp2"); if ((lkflags2 & LK_EXCLUSIVE) != 0) { VOP_UNLOCK(vp2); ASSERT_VOP_UNLOCKED(vp2, "vp2 shared recursed"); vp2_locked = false; } } else if (!vp2_locked) ASSERT_VOP_UNLOCKED(vp2, "vp2"); } else { vp2_locked = true; } if (!vp1_locked && !vp2_locked) { vn_lock(vp1, lkflags1 | LK_RETRY); vp1_locked = true; } while (!vp1_locked || !vp2_locked) { if (vp1_locked && vp2 != NULL) { if (vp1 != NULL) { error = VOP_LOCK1(vp2, lkflags2 | LK_NOWAIT, __FILE__, __LINE__); if (error == 0) break; VOP_UNLOCK(vp1); vp1_locked = false; vn_lock_pair_pause("vlp1"); } vn_lock(vp2, lkflags2 | LK_RETRY); vp2_locked = true; } if (vp2_locked && vp1 != NULL) { if (vp2 != NULL) { error = VOP_LOCK1(vp1, lkflags1 | LK_NOWAIT, __FILE__, __LINE__); if (error == 0) break; VOP_UNLOCK(vp2); vp2_locked = false; vn_lock_pair_pause("vlp2"); } vn_lock(vp1, lkflags1 | LK_RETRY); vp1_locked = true; } } if (vp1 != NULL) { if (lkflags1 == LK_EXCLUSIVE) ASSERT_VOP_ELOCKED(vp1, "vp1 ret"); else ASSERT_VOP_LOCKED(vp1, "vp1 ret"); } if (vp2 != NULL) { if (lkflags2 == LK_EXCLUSIVE) ASSERT_VOP_ELOCKED(vp2, "vp2 ret"); else ASSERT_VOP_LOCKED(vp2, "vp2 ret"); } } int vn_lktype_write(struct mount *mp, struct vnode *vp) { if (MNT_SHARED_WRITES(mp) || (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) return (LK_SHARED); return (LK_EXCLUSIVE); } int vn_cmp(struct file *fp1, struct file *fp2, struct thread *td) { if (fp2->f_type != DTYPE_VNODE) return (3); return (kcmp_cmp((uintptr_t)fp1->f_vnode, (uintptr_t)fp2->f_vnode)); } diff --git a/sys/net/bpf.c b/sys/net/bpf.c index c0631591a10e..8a68f65a80f7 100644 --- a/sys/net/bpf.c +++ b/sys/net/bpf.c @@ -1,3221 +1,3221 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2019 Andrey V. Elsukov * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)bpf.c 8.4 (Berkeley) 1/9/95 */ #include #include "opt_bpf.h" #include "opt_ddb.h" #include "opt_netgraph.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #ifdef BPF_JITTER #include #endif #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); static const struct bpf_if_ext dead_bpf_if = { .bif_dlist = CK_LIST_HEAD_INITIALIZER() }; struct bpf_if { #define bif_next bif_ext.bif_next #define bif_dlist bif_ext.bif_dlist struct bpf_if_ext bif_ext; /* public members */ u_int bif_dlt; /* link layer type */ u_int bif_hdrlen; /* length of link header */ struct bpfd_list bif_wlist; /* writer-only list */ struct ifnet *bif_ifp; /* corresponding interface */ struct bpf_if **bif_bpf; /* Pointer to pointer to us */ volatile u_int bif_refcnt; struct epoch_context epoch_ctx; }; CTASSERT(offsetof(struct bpf_if, bif_ext) == 0); struct bpf_program_buffer { struct epoch_context epoch_ctx; #ifdef BPF_JITTER bpf_jit_filter *func; #endif void *buffer[0]; }; #if defined(DEV_BPF) || defined(NETGRAPH_BPF) #define PRINET 26 /* interruptible */ #define BPF_PRIO_MAX 7 #define SIZEOF_BPF_HDR(type) \ (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen)) #ifdef COMPAT_FREEBSD32 #include #include #define BPF_ALIGNMENT32 sizeof(int32_t) #define BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32) #ifndef BURN_BRIDGES /* * 32-bit version of structure prepended to each packet. We use this header * instead of the standard one for 32-bit streams. We mark the a stream as * 32-bit the first time we see a 32-bit compat ioctl request. 
*/ struct bpf_hdr32 { struct timeval32 bh_tstamp; /* time stamp */ uint32_t bh_caplen; /* length of captured portion */ uint32_t bh_datalen; /* original length of packet */ uint16_t bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; #endif struct bpf_program32 { u_int bf_len; uint32_t bf_insns; }; struct bpf_dltlist32 { u_int bfl_len; u_int bfl_list; }; #define BIOCSETF32 _IOW('B', 103, struct bpf_program32) #define BIOCSRTIMEOUT32 _IOW('B', 109, struct timeval32) #define BIOCGRTIMEOUT32 _IOR('B', 110, struct timeval32) #define BIOCGDLTLIST32 _IOWR('B', 121, struct bpf_dltlist32) #define BIOCSETWF32 _IOW('B', 123, struct bpf_program32) #define BIOCSETFNR32 _IOW('B', 130, struct bpf_program32) #endif #define BPF_LOCK() sx_xlock(&bpf_sx) #define BPF_UNLOCK() sx_xunlock(&bpf_sx) #define BPF_LOCK_ASSERT() sx_assert(&bpf_sx, SA_XLOCKED) /* * bpf_iflist is a list of BPF interface structures, each corresponding to a * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (i.e., 802.11 * frames, ethernet frames, etc). */ CK_LIST_HEAD(bpf_iflist, bpf_if); static struct bpf_iflist bpf_iflist; static struct sx bpf_sx; /* bpf global lock */ static int bpf_bpfd_cnt; static void bpfif_ref(struct bpf_if *); static void bpfif_rele(struct bpf_if *); static void bpfd_ref(struct bpf_d *); static void bpfd_rele(struct bpf_d *); static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_detachd_locked(struct bpf_d *, bool); static void bpfd_free(epoch_context_t); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_d *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); static int filt_bpfwrite(struct knote *, long); static void bpf_drvinit(void *); static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "bpf sysctl"); int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); static int bpf_zerocopy_enable = 0; SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions"); static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); VNET_DEFINE_STATIC(int, bpf_optimize_writers) = 0; #define V_bpf_optimize_writers VNET(bpf_optimize_writers) SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RWTUN, &VNET_NAME(bpf_optimize_writers), 0, "Do not send packets until BPF program is set"); static d_open_t bpfopen; static d_read_t bpfread; static d_write_t bpfwrite; static d_ioctl_t bpfioctl; static d_poll_t bpfpoll; static d_kqfilter_t bpfkqfilter; static struct cdevsw bpf_cdevsw = { .d_version = D_VERSION, .d_open = bpfopen, .d_read = bpfread, .d_write = bpfwrite, .d_ioctl = bpfioctl, .d_poll = 
bpfpoll, .d_name = "bpf", .d_kqfilter = bpfkqfilter, }; -static struct filterops bpfread_filtops = { +static const struct filterops bpfread_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfread, }; -static struct filterops bpfwrite_filtops = { +static const struct filterops bpfwrite_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfwrite, }; /* * LOCKING MODEL USED BY BPF * * Locks: * 1) global lock (BPF_LOCK). An sx lock used to protect some global counters, * all bpf_iflist changes, and to serialize ioctl access to bpf descriptors. * 2) Descriptor lock. A mutex used to protect BPF buffers and various * structure fields used by bpf_*tap* code. * * Lock order: global lock, then descriptor lock. * * There are several possible consumers: * * 1. The kernel registers an interface pointer with bpfattach(). * Each call allocates a new bpf_if structure, references the ifnet pointer * and links the bpf_if into the bpf_iflist chain. This is protected with the * global lock. * * 2. A userland application issues ioctl() calls on a bpf_d descriptor. * All such calls are serialized with the global lock. BPF filters can be * changed, but the pointer to the old filter will be freed using * NET_EPOCH_CALL(). Thus it should be safe for bpf_tap/bpf_mtap* code to * access filter pointers, even if a change happens during bpf_tap execution. * Destroying a bpf_d descriptor is also done using NET_EPOCH_CALL(). * * 3. A userland application can write packets into a bpf_d descriptor. * There we need to be sure that the ifnet won't disappear during bpfwrite(). * * 4. The kernel invokes the bpf_tap/bpf_mtap* functions. The access to * bif_dlist is protected by a net_epoch_preempt section, so it should * be safe to access a bpf_d descriptor inside the section. * * 5. The kernel invokes bpfdetach() on interface destruction. All lists * are modified with the global lock held and the actual free() is done using * NET_EPOCH_CALL(). */ static void bpfif_free(epoch_context_t ctx) { struct bpf_if *bp; bp = __containerof(ctx, struct bpf_if, epoch_ctx); if_rele(bp->bif_ifp); free(bp, M_BPF); } static void bpfif_ref(struct bpf_if *bp) { refcount_acquire(&bp->bif_refcnt); } static void bpfif_rele(struct bpf_if *bp) { if (!refcount_release(&bp->bif_refcnt)) return; NET_EPOCH_CALL(bpfif_free, &bp->epoch_ctx); } static void bpfd_ref(struct bpf_d *d) { refcount_acquire(&d->bd_refcnt); } static void bpfd_rele(struct bpf_d *d) { if (!refcount_release(&d->bd_refcnt)) return; NET_EPOCH_CALL(bpfd_free, &d->epoch_ctx); } static struct bpf_program_buffer* bpf_program_buffer_alloc(size_t size, int flags) { return (malloc(sizeof(struct bpf_program_buffer) + size, M_BPF, flags)); } static void bpf_program_buffer_free(epoch_context_t ctx) { struct bpf_program_buffer *ptr; ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx); #ifdef BPF_JITTER if (ptr->func != NULL) bpf_destroy_jit_filter(ptr->func); #endif free(ptr, M_BPF); } /* * Wrapper functions for various buffering methods. If the set of buffer * modes expands, we will probably want to introduce a switch data structure * similar to protosw, etc.
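 *
 * Purely as a hypothetical sketch (none of the names below exist in this
 * file today), such a table could look like:
 *
 *	struct bpf_bufops {
 *		void	(*bo_append_bytes)(struct bpf_d *, caddr_t, u_int,
 *			    void *, u_int);
 *		void	(*bo_append_mbuf)(struct bpf_d *, caddr_t, u_int,
 *			    void *, u_int);
 *		int	(*bo_canfreebuf)(struct bpf_d *);
 *		void	(*bo_free)(struct bpf_d *);
 *	};
 *
 * Each buffer mode would then supply one instance instead of adding a case
 * to every switch statement below.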
*/ static void bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_bytes(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: counter_u64_add(d->bd_zcopy, 1); return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); default: panic("bpf_buf_append_bytes"); } } static void bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: counter_u64_add(d->bd_zcopy, 1); return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); default: panic("bpf_buf_append_mbuf"); } } /* * This function gets called when the free buffer is re-assigned. */ static void bpf_buf_reclaimed(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return; case BPF_BUFMODE_ZBUF: bpf_zerocopy_buf_reclaimed(d); return; default: panic("bpf_buf_reclaimed"); } } /* * If the buffer mechanism has a way to decide that a held buffer can be made * free, then it is exposed via the bpf_canfreebuf() interface. (1) is * returned if the buffer can be discarded, (0) is returned if it cannot. */ static int bpf_canfreebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canfreebuf(d)); } return (0); } /* * Allow the buffer model to indicate that the current store buffer is * immutable, regardless of the appearance of space. Return (1) if the * buffer is writable, and (0) if not. */ static int bpf_canwritebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canwritebuf(d)); } return (1); } /* * Notify buffer model that an attempt to write to the store buffer has * resulted in a dropped packet, in which case the buffer may be considered * full. */ static void bpf_buffull(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_buffull(d); break; } } /* * Notify the buffer model that a buffer has moved into the hold position. */ void bpf_bufheld(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_bufheld(d); break; } } static void bpf_free(struct bpf_d *d) { switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_free(d)); case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_free(d)); default: panic("bpf_buf_free"); } } static int bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_uiomove(d, buf, len, uio)); } static int bpf_ioctl_sblen(struct bpf_d *d, u_int *i) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_ioctl_sblen(d, i)); } static int bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_getzmax(td, d, i)); } static int bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); } static int bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); } /* * General BPF functions. 
*/ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_d *d) { const struct ieee80211_bpf_params *p; struct ether_header *eh; struct mbuf *m; int error; int len; int hlen; int slen; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. * In the case of SLIP, there is no header and the packet * is forwarded as is. * Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? */ hlen = ETHER_HDR_LEN; break; case DLT_FDDI: sockp->sa_family = AF_IMPLINK; hlen = 0; break; case DLT_RAW: sockp->sa_family = AF_UNSPEC; hlen = 0; break; case DLT_NULL: /* * null interface types require a 4 byte pseudo header which * corresponds to the address family of the packet. */ sockp->sa_family = AF_UNSPEC; hlen = 4; break; case DLT_ATM_RFC1483: /* * en atm driver requires 4-byte atm pseudo header. * though it isn't standard, vpi:vci needs to be * specified anyway. */ sockp->sa_family = AF_UNSPEC; hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 4; /* This should match PPP_HDRLEN */ break; case DLT_IEEE802_11: /* IEEE 802.11 wireless */ sockp->sa_family = AF_IEEE80211; hlen = 0; break; case DLT_IEEE802_11_RADIO: /* IEEE 802.11 wireless w/ phy params */ sockp->sa_family = AF_IEEE80211; sockp->sa_len = 12; /* XXX != 0 */ hlen = sizeof(struct ieee80211_bpf_params); break; default: return (EIO); } len = uio->uio_resid; if (len < hlen || len - hlen > ifp->if_mtu) return (EMSGSIZE); /* Allocate a mbuf, up to MJUM16BYTES bytes, for our write. */ m = m_get3(len, M_WAITOK, MT_DATA, M_PKTHDR); if (m == NULL) return (EIO); m->m_pkthdr.len = m->m_len = len; *mp = m; error = uiomove(mtod(m, u_char *), len, uio); if (error) goto bad; slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; } /* Check for multicast destination */ switch (linktype) { case DLT_EN10MB: eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost, ETHER_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; } if (d->bd_hdrcmplt == 0) { memcpy(eh->ether_shost, IF_LLADDR(ifp), sizeof(eh->ether_shost)); } break; } /* * Make room for link header, and copy it to sockaddr */ if (hlen != 0) { if (sockp->sa_family == AF_IEEE80211) { /* * Collect true length from the parameter header * NB: sockp is known to be zero'd so if we do a * short copy unspecified parameters will be * zero. * NB: packet may not be aligned after stripping * bpf params * XXX check ibp_vers */ p = mtod(m, const struct ieee80211_bpf_params *); hlen = p->ibp_len; if (hlen > sizeof(sockp->sa_data)) { error = EINVAL; goto bad; } } bcopy(mtod(m, const void *), sockp->sa_data, hlen); } *hdrlen = hlen; return (0); bad: m_freem(m); return (error); } /* * Attach descriptor to the bpf interface, i.e. make d listen on bp, * then reset its buffers and counters with reset_d(). 
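On the write side, bpf_movein() expects the caller to prepend the link-level header itself for DLT_EN10MB, enforces the MTU-plus-header size limit, and, unless "header complete" mode is enabled, overwrites the source MAC with the interface address. A hedged userland sketch of injecting one broadcast frame; the interface name "em0", the zeroed source MAC and the experimental EtherType are made-up examples:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	unsigned char frame[ETHER_HDR_LEN + 64];
	struct ether_header *eh = (struct ether_header *)frame;
	struct ifreq ifr;
	u_int one = 1;
	int fd;

	fd = open("/dev/bpf", O_RDWR);

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));	/* example NIC */
	ioctl(fd, BIOCSETIF, &ifr);

	/* Keep our hand-built source MAC instead of the interface's. */
	ioctl(fd, BIOCSHDRCMPLT, &one);

	memset(frame, 0, sizeof(frame));
	memset(eh->ether_dhost, 0xff, ETHER_ADDR_LEN);	/* broadcast destination */
	eh->ether_type = htons(0x88b5);			/* example EtherType */

	if (write(fd, frame, sizeof(frame)) < 0)
		perror("write");	/* EMSGSIZE if larger than MTU + header */

	close(fd);
	return (0);
}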
*/ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { int op_w; BPF_LOCK_ASSERT(); /* * Save sysctl value to protect from sysctl change * between reads */ op_w = V_bpf_optimize_writers || d->bd_writer; if (d->bd_bif != NULL) bpf_detachd_locked(d, false); /* * Point d at bp, and add d to the interface's list. * Since there are many applications using BPF for * sending raw packets only (dhcpd, cdpd are good examples) * we can delay adding d to the list of active listeners until * some filter is configured. */ BPFD_LOCK(d); /* * Hold reference to bpif while descriptor uses this interface. */ bpfif_ref(bp); d->bd_bif = bp; if (op_w != 0) { /* Add to writers-only list */ CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); /* * We decrement bd_writer on every filter set operation. * First BIOCSETF is done by pcap_open_live() to set up * snap length. After that appliation usually sets its own * filter. */ d->bd_writer = 2; } else CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); reset_d(d); /* Trigger EVFILT_WRITE events. */ bpf_wakeup(d); BPFD_UNLOCK(d); bpf_bpfd_cnt++; CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list", __func__, d->bd_pid, d->bd_writer ? "writer" : "active"); if (op_w == 0) EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); } /* * Check if we need to upgrade our descriptor @d from write-only mode. */ static int bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) { int is_snap, need_upgrade; /* * Check if we've already upgraded or new filter is empty. */ if (d->bd_writer == 0 || fcode == NULL) return (0); need_upgrade = 0; /* * Check if cmd looks like snaplen setting from * pcap_bpf.c:pcap_open_live(). * Note we're not checking .k value here: * while pcap_open_live() definitely sets to non-zero value, * we'd prefer to treat k=0 (deny ALL) case the same way: e.g. * do not consider upgrading immediately */ if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K)) is_snap = 1; else is_snap = 0; if (is_snap == 0) { /* * We're setting first filter and it doesn't look like * setting snaplen. We're probably using bpf directly. * Upgrade immediately. */ need_upgrade = 1; } else { /* * Do not require upgrade by first BIOCSETF * (used to set snaplen) by pcap_open_live(). */ if (--d->bd_writer == 0) { /* * First snaplen filter has already * been set. This is probably catch-all * filter */ need_upgrade = 1; } } CTR5(KTR_NET, "%s: filter function set by pid %d, " "bd_writer counter %d, snap %d upgrade %d", __func__, d->bd_pid, d->bd_writer, is_snap, need_upgrade); return (need_upgrade); } /* * Detach a file from its interface. */ static void bpf_detachd(struct bpf_d *d) { BPF_LOCK(); bpf_detachd_locked(d, false); BPF_UNLOCK(); } static void bpf_detachd_locked(struct bpf_d *d, bool detached_ifp) { struct bpf_if *bp; struct ifnet *ifp; int error; BPF_LOCK_ASSERT(); CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); /* Check if descriptor is attached */ if ((bp = d->bd_bif) == NULL) return; BPFD_LOCK(d); /* Remove d from the interface's descriptor list. */ CK_LIST_REMOVE(d, bd_next); /* Save bd_writer value */ error = d->bd_writer; ifp = bp->bif_ifp; d->bd_bif = NULL; if (detached_ifp) { /* * Notify descriptor as it's detached, so that any * sleepers wake up and get ENXIO. */ bpf_wakeup(d); } BPFD_UNLOCK(d); bpf_bpfd_cnt--; /* Call event handler iff d is attached */ if (error == 0) EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0); /* * Check if this descriptor had requested promiscuous mode. 
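The heuristic in bpf_check_upgrade() keys on the very first filter libpcap installs: a single "return snaplen" instruction set by pcap_open_live() before the real filter arrives. Expressed with the classic macros (the 65535 snaplen is an arbitrary example), that program is just:

#include <net/bpf.h>

/* The one-instruction program bpf_check_upgrade() treats as "snaplen only". */
static struct bpf_insn snap_only[] = {
	BPF_STMT(BPF_RET | BPF_K, 65535),	/* accept, capture up to 65535 bytes */
};

static struct bpf_program snap_prog = {
	.bf_len = 1,			/* flen == 1 ...                        */
	.bf_insns = snap_only,		/* ... and code == BPF_RET | BPF_K      */
};

A write-capable descriptor stays on the writers-only list after this first BIOCSETF; installing any other filter, or a second one, upgrades it to the active readers list and fires the bpf_track event handler.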
* If so and ifnet is not detached, turn it off. */ if (d->bd_promisc && !detached_ifp) { d->bd_promisc = 0; CURVNET_SET(ifp->if_vnet); error = ifpromisc(ifp, 0); CURVNET_RESTORE(); if (error != 0 && error != ENXIO) { /* * ENXIO can happen if a pccard is unplugged * Something is really wrong if we were able to put * the driver into promiscuous mode, but can't * take it out. */ if_printf(bp->bif_ifp, "bpf_detach: ifpromisc failed (%d)\n", error); } } bpfif_rele(bp); } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. */ static void bpf_dtor(void *data) { struct bpf_d *d = data; BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); funsetown(&d->bd_sigio); bpf_detachd(d); #ifdef MAC mac_bpfdesc_destroy(d); #endif /* MAC */ seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); bpfd_rele(d); } /* * Open ethernet device. Returns ENXIO for illegal minor device number, * EBUSY if file is open by another process. */ /* ARGSUSED */ static int bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d; int error; d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO); error = devfs_set_cdevpriv(d, bpf_dtor); if (error != 0) { free(d, M_BPF); return (error); } /* Setup counters */ d->bd_rcount = counter_u64_alloc(M_WAITOK); d->bd_dcount = counter_u64_alloc(M_WAITOK); d->bd_fcount = counter_u64_alloc(M_WAITOK); d->bd_wcount = counter_u64_alloc(M_WAITOK); d->bd_wfcount = counter_u64_alloc(M_WAITOK); d->bd_wdcount = counter_u64_alloc(M_WAITOK); d->bd_zcopy = counter_u64_alloc(M_WAITOK); /* * For historical reasons, perform a one-time initialization call to * the buffer routines, even though we're not yet committed to a * particular buffer method. */ bpf_buffer_init(d); if ((flags & FREAD) == 0) d->bd_writer = 2; d->bd_hbuf_in_use = 0; d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; refcount_init(&d->bd_refcnt, 1); BPF_PID_REFRESH(d, td); #ifdef MAC mac_bpfdesc_init(d); mac_bpfdesc_create(td->td_ucred, d); #endif mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF); callout_init_mtx(&d->bd_callout, &d->bd_lock, 0); knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock); /* Disable VLAN pcp tagging. */ d->bd_pcp = 0; return (0); } /* * bpfread - read next chunk of packets from buffers */ static int bpfread(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; int error; int non_block; int timed_out; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Restrict application to use a buffer the same size as * as kernel buffers. */ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); non_block = ((ioflag & O_NONBLOCK) != 0); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { BPFD_UNLOCK(d); return (EOPNOTSUPP); } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; while (d->bd_hbuf_in_use) { error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET | PCATCH, "bd_hbuf", 0); if (error != 0) { BPFD_UNLOCK(d); return (error); } } /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ while (d->bd_hbuf == NULL) { if (d->bd_slen != 0) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. 
*/ if (d->bd_immediate || non_block || timed_out) { /* * Rotate the buffers and return what's here * if we are in immediate mode, non-blocking * flag is set, or this descriptor timed out. */ ROTATE_BUFFERS(d); break; } } /* * No data is available, check to see if the bpf device * is still pointed at a real interface. If not, return * ENXIO so that the userland process knows to rebind * it before using it again. */ if (d->bd_bif == NULL) { BPFD_UNLOCK(d); return (ENXIO); } if (non_block) { BPFD_UNLOCK(d); return (EWOULDBLOCK); } error = msleep(d, &d->bd_lock, PRINET | PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { BPFD_UNLOCK(d); return (error); } if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { BPFD_UNLOCK(d); return (0); } ROTATE_BUFFERS(d); break; } } /* * At this point, we know we have something in the hold slot. */ d->bd_hbuf_in_use = 1; BPFD_UNLOCK(d); /* * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. * * We do not have to worry about simultaneous reads because * we waited for sole access to the hold buffer above. */ error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf")); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); d->bd_hbuf_in_use = 0; wakeup(&d->bd_hbuf_in_use); BPFD_UNLOCK(d); return (error); } /* * If there are processes sleeping on this descriptor, wake them up. */ static __inline void bpf_wakeup(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (d->bd_state == BPF_WAITING) { callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; } wakeup(d); if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(&d->bd_sigio, d->bd_sig, 0); selwakeuppri(&d->bd_sel, PRINET); KNOTE_LOCKED(&d->bd_sel.si_note, 0); } static void bpf_timed_out(void *arg) { struct bpf_d *d = (struct bpf_d *)arg; BPFD_LOCK_ASSERT(d); if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout)) return; if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; if (d->bd_slen != 0) bpf_wakeup(d); } } static int bpf_ready(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (!bpf_canfreebuf(d) && d->bd_hlen != 0) return (1); if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0) return (1); return (0); } static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct route ro; struct sockaddr dst; struct epoch_tracker et; struct bpf_if *bp; struct bpf_d *d; struct ifnet *ifp; struct mbuf *m, *mc; int error, hlen; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); NET_EPOCH_ENTER(et); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); counter_u64_add(d->bd_wcount, 1); if ((bp = d->bd_bif) == NULL) { error = ENXIO; goto out_locked; } ifp = bp->bif_ifp; if ((ifp->if_flags & IFF_UP) == 0) { error = ENETDOWN; goto out_locked; } if (uio->uio_resid == 0) goto out_locked; bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; /* * Take extra reference, unlock d and exit from epoch section, * since bpf_movein() can sleep. 
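Because bpfread() insists the user buffer match the kernel buffer size exactly, and a single read() can return many packets, the usual consumer queries BIOCGBLEN first and then walks bpf headers through the returned block. A minimal, hedged read loop ("em0" is an example interface; the default timestamp format, which yields struct bpf_hdr records, is assumed):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/bpf.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	u_int blen, imm = 1;
	char *buf;
	ssize_t n;
	int fd;

	fd = open("/dev/bpf", O_RDONLY);

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));	/* example NIC */
	ioctl(fd, BIOCSETIF, &ifr);
	ioctl(fd, BIOCIMMEDIATE, &imm);		/* don't wait for a full buffer */
	ioctl(fd, BIOCGBLEN, &blen);		/* read() length must equal this */

	buf = malloc(blen);
	while ((n = read(fd, buf, blen)) > 0) {
		char *p = buf;

		while (p < buf + n) {
			struct bpf_hdr *bh = (struct bpf_hdr *)p;

			printf("captured %u of %u bytes\n",
			    bh->bh_caplen, bh->bh_datalen);
			/* Packet data starts at p + bh->bh_hdrlen. */
			p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
		}
	}
	return (0);
}

A read() of zero bytes indicates a timeout with nothing buffered; ENXIO means the descriptor lost its interface and must be rebound with BIOCSETIF.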
*/ bpfd_ref(d); NET_EPOCH_EXIT(et); BPFD_UNLOCK(d); error = bpf_movein(uio, (int)bp->bif_dlt, ifp, &m, &dst, &hlen, d); if (error != 0) { counter_u64_add(d->bd_wdcount, 1); bpfd_rele(d); return (error); } BPFD_LOCK(d); /* * Check that descriptor is still attached to the interface. * This can happen on bpfdetach(). To avoid access to detached * ifnet, free mbuf and return ENXIO. */ if (d->bd_bif == NULL) { counter_u64_add(d->bd_wdcount, 1); BPFD_UNLOCK(d); bpfd_rele(d); m_freem(m); return (ENXIO); } counter_u64_add(d->bd_wfcount, 1); if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; if (d->bd_feedback) { mc = m_dup(m, M_NOWAIT); if (mc != NULL) mc->m_pkthdr.rcvif = ifp; /* Set M_PROMISC for outgoing packets to be discarded. */ if (d->bd_direction == BPF_D_INOUT) m->m_flags |= M_PROMISC; } else mc = NULL; m->m_pkthdr.len -= hlen; m->m_len -= hlen; m->m_data += hlen; /* XXX */ CURVNET_SET(ifp->if_vnet); #ifdef MAC mac_bpfdesc_create_mbuf(d, m); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); #endif bzero(&ro, sizeof(ro)); if (hlen != 0) { ro.ro_prepend = (u_char *)&dst.sa_data; ro.ro_plen = hlen; ro.ro_flags = RT_HAS_HEADER; } if (d->bd_pcp != 0) vlan_set_pcp(m, d->bd_pcp); /* Avoid possible recursion on BPFD_LOCK(). */ NET_EPOCH_ENTER(et); BPFD_UNLOCK(d); error = (*ifp->if_output)(ifp, m, &dst, &ro); if (error) counter_u64_add(d->bd_wdcount, 1); if (mc != NULL) { if (error == 0) (*ifp->if_input)(ifp, mc); else m_freem(mc); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); bpfd_rele(d); return (error); out_locked: counter_u64_add(d->bd_wdcount, 1); NET_EPOCH_EXIT(et); BPFD_UNLOCK(d); return (error); } /* * Reset a descriptor by flushing its packet buffer and clearing the receive * and drop counts. This is doable for kernel-only buffers, but with * zero-copy buffers, we can't write to (or rotate) buffers that are * currently owned by userspace. It would be nice if we could encapsulate * this logic in the buffer code rather than here. */ static void reset_d(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if ((d->bd_hbuf != NULL) && (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } if (bpf_canwritebuf(d)) d->bd_slen = 0; counter_u64_zero(d->bd_rcount); counter_u64_zero(d->bd_dcount); counter_u64_zero(d->bd_fcount); counter_u64_zero(d->bd_wcount); counter_u64_zero(d->bd_wfcount); counter_u64_zero(d->bd_wdcount); counter_u64_zero(d->bd_zcopy); } /* * FIONREAD Check for read packet available. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set read filter. * BIOCSETFNR Set read filter without resetting descriptor. * BIOCSETWF Set write filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag * BIOCSHDRCMPLT Set "header already complete" flag * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag * BIOCGTSTAMP Get time stamp format and resolution. * BIOCSTSTAMP Set time stamp format and resolution. * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. 
* BIOCSETZBUF Set current zero-copy buffer locations. * BIOCGETZMAX Get maximum zero-copy buffer size. * BIOCROTZBUF Force rotation of zero-copy buffer * BIOCSETBUFMODE Set buffer mode. * BIOCGETBUFMODE Get current buffer mode. * BIOCSETVLANPCP Set VLAN PCP tag. */ /* ARGSUSED */ static int bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct bpf_d *d; int error; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); if (d->bd_locked == 1) { switch (cmd) { case BIOCGBLEN: case BIOCFLUSH: case BIOCGDLT: case BIOCGDLTLIST: #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: #endif case BIOCGETIF: case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCGRTIMEOUT32: #endif case BIOCGSTATS: case BIOCVERSION: case BIOCGRSIG: case BIOCGHDRCMPLT: case BIOCSTSTAMP: case BIOCFEEDBACK: case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCSRTIMEOUT32: #endif case BIOCIMMEDIATE: case TIOCGPGRP: case BIOCROTZBUF: break; default: return (EPERM); } } #ifdef COMPAT_FREEBSD32 /* * If we see a 32-bit compat ioctl, mark the stream as 32-bit so * that it will get 32-bit packet headers. */ switch (cmd) { case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: case BIOCGDLTLIST32: case BIOCGRTIMEOUT32: case BIOCSRTIMEOUT32: if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { BPFD_LOCK(d); d->bd_compat32 = 1; BPFD_UNLOCK(d); } } #endif CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; BPFD_LOCK(d); n = d->bd_slen; while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if (d->bd_hbuf) n += d->bd_hlen; BPFD_UNLOCK(d); *(int *)addr = n; break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufsize; BPFD_UNLOCK(d); break; /* * Set buffer length. */ case BIOCSBLEN: error = bpf_ioctl_sblen(d, (u_int *)addr); break; /* * Set link layer read filter. */ case BIOCSETF: case BIOCSETFNR: case BIOCSETWF: #ifdef COMPAT_FREEBSD32 case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: #endif error = bpf_setf(d, (struct bpf_program *)addr, cmd); break; /* * Flush read packet buffer. */ case BIOCFLUSH: BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: BPF_LOCK(); if (d->bd_bif == NULL) { /* * No interface attached yet. */ error = EINVAL; } else if (d->bd_promisc == 0) { error = ifpromisc(d->bd_bif->bif_ifp, 1); if (error == 0) d->bd_promisc = 1; } BPF_UNLOCK(); break; /* * Get current data link type. */ case BIOCGDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; BPF_UNLOCK(); break; /* * Get a list of supported data link types. 
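BIOCSETF, BIOCSETFNR and BIOCSETWF all funnel into bpf_setf(); they differ only in whether the descriptor is flushed and whether the read or write filter is replaced. A small, hedged example of a read filter that accepts only IPv4 Ethernet frames (offset 12 assumes DLT_EN10MB; 65535 is an arbitrary snapshot length):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/bpf.h>
#include <net/ethernet.h>

/* Accept IPv4 frames (up to 65535 bytes), drop everything else. */
static struct bpf_insn ipv4_only[] = {
	BPF_STMT(BPF_LD | BPF_H | BPF_ABS, 12),			/* load EtherType */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETHERTYPE_IP, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, 65535),			/* match: accept  */
	BPF_STMT(BPF_RET | BPF_K, 0),				/* no match: drop */
};

static struct bpf_program ipv4_prog = {
	.bf_len = sizeof(ipv4_only) / sizeof(ipv4_only[0]),
	.bf_insns = ipv4_only,
};

/* After BIOCSETIF:  ioctl(fd, BIOCSETF, &ipv4_prog);                      */
/* BIOCSETFNR installs the same filter without resetting buffers/counters. */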
*/ #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: { struct bpf_dltlist32 *list32; struct bpf_dltlist dltlist; list32 = (struct bpf_dltlist32 *)addr; dltlist.bfl_len = list32->bfl_len; dltlist.bfl_list = PTRIN(list32->bfl_list); BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { error = bpf_getdltlist(d, &dltlist); if (error == 0) list32->bfl_len = dltlist.bfl_len; } BPF_UNLOCK(); break; } #endif case BIOCGDLTLIST: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); BPF_UNLOCK(); break; /* * Set data link type. */ case BIOCSDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); BPF_UNLOCK(); break; /* * Get interface name. */ case BIOCGETIF: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { struct ifnet *const ifp = d->bd_bif->bif_ifp; struct ifreq *const ifr = (struct ifreq *)addr; strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); } BPF_UNLOCK(); break; /* * Set interface. */ case BIOCSETIF: { int alloc_buf, size; /* * Behavior here depends on the buffering model. If * we're using kernel memory buffers, then we can * allocate them here. If we're using zero-copy, * then the user process must have registered buffers * by the time we get here. */ alloc_buf = 0; BPFD_LOCK(d); if (d->bd_bufmode == BPF_BUFMODE_BUFFER && d->bd_sbuf == NULL) alloc_buf = 1; BPFD_UNLOCK(d); if (alloc_buf) { size = d->bd_bufsize; error = bpf_buffer_ioctl_sblen(d, &size); if (error != 0) break; } BPF_LOCK(); error = bpf_setif(d, (struct ifreq *)addr); BPF_UNLOCK(); break; } /* * Set read timeout. */ case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCSRTIMEOUT32: #endif { struct timeval *tv = (struct timeval *)addr; #if defined(COMPAT_FREEBSD32) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCSRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv = &tv64; tv->tv_sec = tv32->tv_sec; tv->tv_usec = tv32->tv_usec; } else #endif tv = (struct timeval *)addr; /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(tv)) == 0) d->bd_rtout = tvtohz(tv) - 1; break; } /* * Get read timeout. */ case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCGRTIMEOUT32: #endif { struct timeval *tv; #if defined(COMPAT_FREEBSD32) && defined(__amd64__) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCGRTIMEOUT32) tv = &tv64; else #endif tv = (struct timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; #if defined(COMPAT_FREEBSD32) && defined(__amd64__) if (cmd == BIOCGRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv32->tv_sec = tv->tv_sec; tv32->tv_usec = tv->tv_usec; } #endif break; } /* * Get packet stats. */ case BIOCGSTATS: { struct bpf_stat *bs = (struct bpf_stat *)addr; /* XXXCSJP overflow */ bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount); bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount); break; } /* * Set immediate mode. */ case BIOCIMMEDIATE: BPFD_LOCK(d); d->bd_immediate = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCVERSION: { struct bpf_version *bv = (struct bpf_version *)addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } /* * Get "header already complete" flag */ case BIOCGHDRCMPLT: BPFD_LOCK(d); *(u_int *)addr = d->bd_hdrcmplt; BPFD_UNLOCK(d); break; /* * Set "header already complete" flag */ case BIOCSHDRCMPLT: BPFD_LOCK(d); d->bd_hdrcmplt = *(u_int *)addr ? 
1 : 0; BPFD_UNLOCK(d); break; /* * Get packet direction flag */ case BIOCGDIRECTION: BPFD_LOCK(d); *(u_int *)addr = d->bd_direction; BPFD_UNLOCK(d); break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: BPFD_LOCK(d); d->bd_direction = direction; BPFD_UNLOCK(d); break; default: error = EINVAL; } } break; /* * Get packet timestamp format and resolution. */ case BIOCGTSTAMP: BPFD_LOCK(d); *(u_int *)addr = d->bd_tstamp; BPFD_UNLOCK(d); break; /* * Set packet timestamp format and resolution. */ case BIOCSTSTAMP: { u_int func; func = *(u_int *)addr; if (BPF_T_VALID(func)) d->bd_tstamp = func; else error = EINVAL; } break; case BIOCFEEDBACK: BPFD_LOCK(d); d->bd_feedback = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCLOCK: BPFD_LOCK(d); d->bd_locked = 1; BPFD_UNLOCK(d); break; case FIONBIO: /* Non-blocking I/O */ break; case FIOASYNC: /* Send signal on receive packets */ BPFD_LOCK(d); d->bd_async = *(int *)addr; BPFD_UNLOCK(d); break; case FIOSETOWN: /* * XXX: Add some sort of locking here? * fsetown() can sleep. */ error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: BPFD_LOCK(d); *(int *)addr = fgetown(&d->bd_sigio); BPFD_UNLOCK(d); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: error = fsetown(-(*(int *)addr), &d->bd_sigio); break; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)addr = -fgetown(&d->bd_sigio); break; case BIOCSRSIG: /* Set receive signal */ { u_int sig; sig = *(u_int *)addr; if (sig >= NSIG) error = EINVAL; else { BPFD_LOCK(d); d->bd_sig = sig; BPFD_UNLOCK(d); } break; } case BIOCGRSIG: BPFD_LOCK(d); *(u_int *)addr = d->bd_sig; BPFD_UNLOCK(d); break; case BIOCGETBUFMODE: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufmode; BPFD_UNLOCK(d); break; case BIOCSETBUFMODE: /* * Allow the buffering mode to be changed as long as we * haven't yet committed to a particular mode. Our * definition of commitment, for now, is whether or not a * buffer has been allocated or an interface attached, since * that's the point where things get tricky. */ switch (*(u_int *)addr) { case BPF_BUFMODE_BUFFER: break; case BPF_BUFMODE_ZBUF: if (bpf_zerocopy_enable) break; /* FALLSTHROUGH */ default: CURVNET_RESTORE(); return (EINVAL); } BPFD_LOCK(d); if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || d->bd_fbuf != NULL || d->bd_bif != NULL) { BPFD_UNLOCK(d); CURVNET_RESTORE(); return (EBUSY); } d->bd_bufmode = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCGETZMAX: error = bpf_ioctl_getzmax(td, d, (size_t *)addr); break; case BIOCSETZBUF: error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr); break; case BIOCROTZBUF: error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr); break; case BIOCSETVLANPCP: { u_int pcp; pcp = *(u_int *)addr; if (pcp > BPF_PRIO_MAX || pcp < 0) { error = EINVAL; break; } d->bd_pcp = pcp; break; } } CURVNET_RESTORE(); return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. * * Note we use global lock here to serialize bpf_setf() and bpf_setif() * calls. 
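Several of these knobs only shape the capture path: the read timeout is stored as ticks (tvtohz() minus one), BIOCSDIRECTION decides which way packets must travel to be seen, BIOCSTSTAMP selects the timestamp format and clock that catchpacket() later encodes, and BIOCGSTATS truncates the 64-bit counters to u_int. A hedged sketch combining them (the values are examples, and fd is assumed to be an already bound descriptor):

#include <sys/types.h>
#include <sys/time.h>
#include <sys/ioctl.h>
#include <net/bpf.h>
#include <stdio.h>

static void
tune_capture(int fd)
{
	struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
	u_int dir = BPF_D_IN;				/* inbound packets only   */
	u_int ts = BPF_T_NANOTIME | BPF_T_MONOTONIC;	/* ns on the uptime clock */
	struct bpf_stat bs;

	ioctl(fd, BIOCSRTIMEOUT, &tv);	/* wake readers at least once a second */
	ioctl(fd, BIOCSDIRECTION, &dir);
	if (ioctl(fd, BIOCSTSTAMP, &ts) < 0)
		perror("BIOCSTSTAMP");	/* EINVAL if BPF_T_VALID() rejects it */

	/* Later, from a monitoring loop: */
	if (ioctl(fd, BIOCGSTATS, &bs) == 0)
		printf("received %u, dropped %u\n", bs.bs_recv, bs.bs_drop);
}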
*/ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) { #ifdef COMPAT_FREEBSD32 struct bpf_program fp_swab; struct bpf_program32 *fp32; #endif struct bpf_program_buffer *fcode; struct bpf_insn *filter; #ifdef BPF_JITTER bpf_jit_filter *jfunc; #endif size_t size; u_int flen; bool track_event; #ifdef COMPAT_FREEBSD32 switch (cmd) { case BIOCSETF32: case BIOCSETWF32: case BIOCSETFNR32: fp32 = (struct bpf_program32 *)fp; fp_swab.bf_len = fp32->bf_len; fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns; fp = &fp_swab; switch (cmd) { case BIOCSETF32: cmd = BIOCSETF; break; case BIOCSETWF32: cmd = BIOCSETWF; break; } break; } #endif filter = NULL; #ifdef BPF_JITTER jfunc = NULL; #endif /* * Check new filter validness before acquiring any locks. * Allocate memory for new filter, if needed. */ flen = fp->bf_len; if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0)) return (EINVAL); size = flen * sizeof(*fp->bf_insns); if (size > 0) { /* We're setting up new filter. Copy and check actual data. */ fcode = bpf_program_buffer_alloc(size, M_WAITOK); filter = (struct bpf_insn *)fcode->buffer; if (copyin(fp->bf_insns, filter, size) != 0 || !bpf_validate(filter, flen)) { free(fcode, M_BPF); return (EINVAL); } #ifdef BPF_JITTER if (cmd != BIOCSETWF) { /* * Filter is copied inside fcode and is * perfectly valid. */ jfunc = bpf_jitter(filter, flen); } #endif } track_event = false; fcode = NULL; BPF_LOCK(); BPFD_LOCK(d); /* Set up new filter. */ if (cmd == BIOCSETWF) { if (d->bd_wfilter != NULL) { fcode = __containerof((void *)d->bd_wfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER fcode->func = NULL; #endif } d->bd_wfilter = filter; } else { if (d->bd_rfilter != NULL) { fcode = __containerof((void *)d->bd_rfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER fcode->func = d->bd_bfilter; #endif } d->bd_rfilter = filter; #ifdef BPF_JITTER d->bd_bfilter = jfunc; #endif if (cmd == BIOCSETF) reset_d(d); if (bpf_check_upgrade(cmd, d, filter, flen) != 0) { /* * Filter can be set several times without * specifying interface. In this case just mark d * as reader. */ d->bd_writer = 0; if (d->bd_bif != NULL) { /* * Remove descriptor from writers-only list * and add it to active readers list. */ CK_LIST_REMOVE(d, bd_next); CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist, d, bd_next); CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid); track_event = true; } } } BPFD_UNLOCK(d); if (fcode != NULL) NET_EPOCH_CALL(bpf_program_buffer_free, &fcode->epoch_ctx); if (track_event) EVENTHANDLER_INVOKE(bpf_track, d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1); BPF_UNLOCK(); return (0); } /* * Detach a file from its current interface (if attached at all) and attach * to the interface indicated by the name stored in ifr. * Return an errno or 0. */ static int bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; struct ifnet *theywant; BPF_LOCK_ASSERT(); theywant = ifunit(ifr->ifr_name); if (theywant == NULL || theywant->if_bpf == NULL) return (ENXIO); bp = theywant->if_bpf; /* * At this point, we expect the buffer is already allocated. If not, * return an error. 
*/ switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: case BPF_BUFMODE_ZBUF: if (d->bd_sbuf == NULL) return (EINVAL); break; default: panic("bpf_setif: bufmode %d", d->bd_bufmode); } if (bp != d->bd_bif) bpf_attachd(d, bp); else { BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); } return (0); } /* * Support for select() and poll() system calls * * Return true iff the specific operation will not block indefinitely. * Otherwise, return false but make a note that a selwakeup() must be done. */ static int bpfpoll(struct cdev *dev, int events, struct thread *td) { struct bpf_d *d; int revents; if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL) return (events & (POLLHUP | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)); /* * Refresh PID associated with this descriptor. */ revents = events & (POLLOUT | POLLWRNORM); BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (events & (POLLIN | POLLRDNORM)) { if (bpf_ready(d)) revents |= events & (POLLIN | POLLRDNORM); else { selrecord(td, &d->bd_sel); /* Start the read timeout if necessary. */ if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } } } BPFD_UNLOCK(d); return (revents); } /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. */ int bpfkqfilter(struct cdev *dev, struct knote *kn) { struct bpf_d *d; if (devfs_get_cdevpriv((void **)&d) != 0) return (1); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &bpfread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &bpfwrite_filtops; break; default: return (1); } /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); kn->kn_hook = d; knlist_add(&d->bd_sel.si_note, kn, 1); BPFD_UNLOCK(d); return (0); } static void filt_bpfdetach(struct knote *kn) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; knlist_remove(&d->bd_sel.si_note, kn, 0); } static int filt_bpfread(struct knote *kn, long hint) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int ready; BPFD_LOCK_ASSERT(d); ready = bpf_ready(d); if (ready) { kn->kn_data = d->bd_slen; /* * Ignore the hold buffer if it is being copied to user space. 
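bpfpoll() and bpfkqfilter() expose the same readiness test as bpf_ready(): data in the hold buffer, or store-buffer data when immediate mode is on or the read timeout has fired. A hedged kqueue sketch that waits for readable data on an already-configured descriptor (EVFILT_WRITE is also accepted and reports the interface MTU in kn_data):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>

static int
wait_readable(int fd)
{
	struct kevent kev, ev;
	int kq, n;

	kq = kqueue();
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	n = kevent(kq, &kev, 1, &ev, 1, NULL);	/* blocks until bpf_ready() */
	if (n == 1)
		printf("%jd bytes buffered\n", (intmax_t)ev.data);
	return (n);
}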
*/ if (!d->bd_hbuf_in_use && d->bd_hbuf) kn->kn_data += d->bd_hlen; } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } return (ready); } static int filt_bpfwrite(struct knote *kn, long hint) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; BPFD_LOCK_ASSERT(d); if (d->bd_bif == NULL) { kn->kn_data = 0; return (0); } else { kn->kn_data = d->bd_bif->bif_ifp->if_mtu; return (1); } } #define BPF_TSTAMP_NONE 0 #define BPF_TSTAMP_FAST 1 #define BPF_TSTAMP_NORMAL 2 #define BPF_TSTAMP_EXTERN 3 static int bpf_ts_quality(int tstype) { if (tstype == BPF_T_NONE) return (BPF_TSTAMP_NONE); if ((tstype & BPF_T_FAST) != 0) return (BPF_TSTAMP_FAST); return (BPF_TSTAMP_NORMAL); } static int bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m) { struct timespec ts; struct m_tag *tag; int quality; quality = bpf_ts_quality(tstype); if (quality == BPF_TSTAMP_NONE) return (quality); if (m != NULL) { if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts); timespec2bintime(&ts, bt); return (BPF_TSTAMP_EXTERN); } tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL); if (tag != NULL) { *bt = *(struct bintime *)(tag + 1); return (BPF_TSTAMP_EXTERN); } } if (quality == BPF_TSTAMP_NORMAL) binuptime(bt); else getbinuptime(bt); return (quality); } /* * Incoming linkage from device drivers. Process the packet pkt, of length * pktlen, which is stored in a contiguous buffer. The packet is parsed * by each process' filter, and if accepted, stashed into the corresponding * buffer. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int slen; int gottime; gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { counter_u64_add(d->bd_rcount, 1); /* * NB: We dont call BPF_CHECK_DIRECTION() here since there * is no way for the caller to indiciate to us whether this * packet is inbound or outbound. In the bpf_mtap() routines, * we use the interface pointers on the mbuf to figure it out. */ #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; if (bf != NULL) slen = (*(bf->func))(pkt, pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen); if (slen != 0) { /* * Filter matches. Let's to acquire write lock. */ BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, NULL); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, pkt, pktlen, slen, bpf_append_bytes, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } void bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen) { if (bpf_peers_present(ifp->if_bpf)) bpf_tap(ifp->if_bpf, pkt, pktlen); } #define BPF_CHECK_DIRECTION(d, r, i) \ (((d)->bd_direction == BPF_D_IN && (r) != (i)) || \ ((d)->bd_direction == BPF_D_OUT && (r) == (i))) /* * Incoming linkage from device drivers, when packet is in an mbuf chain. * Locking model is explained in bpf_tap(). */ void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. 
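Drivers normally reach these entry points through the bpf_*_if() wrappers, which return cheaply when no peers are attached. A schematic, non-runnable sketch of where such a call conventionally sits in a hypothetical driver's transmit path (mydrv_transmit and mydrv_enqueue_to_hw are placeholders, not real KPIs):

/* Sketch only: hypothetical driver transmit routine. */
static int
mydrv_transmit(if_t ifp, struct mbuf *m)
{
	/*
	 * Give listeners a copy before the frame is handed to hardware;
	 * bpf_mtap_if() is a no-op unless bpf_peers_present() is true.
	 */
	bpf_mtap_if(ifp, m);

	return (mydrv_enqueue_to_hw(ifp, m));	/* placeholder for the real path */
}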
*/ if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; /* XXX We cannot handle multiple mbufs. */ if (bf != NULL && m->m_next == NULL) slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } void bpf_mtap_if(if_t ifp, struct mbuf *m) { if (bpf_peers_present(ifp->if_bpf)) { M_ASSERTVALID(m); bpf_mtap(ifp->if_bpf, m); } } /* * Incoming linkage from device drivers, when packet is in * an mbuf chain and to be prepended by a contiguous header. */ void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { struct epoch_tracker et; struct bintime bt; struct mbuf mb; struct bpf_d *d; u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); /* * Craft on-stack mbuf suitable for passing to bpf_filter. * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. */ mb.m_flags = 0; mb.m_next = m; mb.m_data = data; mb.m_len = dlen; pktlen += dlen; gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } void bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m) { if (bpf_peers_present(ifp->if_bpf)) { M_ASSERTVALID(m); bpf_mtap2(ifp->if_bpf, data, dlen, m); } } #undef BPF_CHECK_DIRECTION #undef BPF_TSTAMP_NONE #undef BPF_TSTAMP_FAST #undef BPF_TSTAMP_NORMAL #undef BPF_TSTAMP_EXTERN static int bpf_hdrlen(struct bpf_d *d) { int hdrlen; hdrlen = d->bd_bif->bif_hdrlen; #ifndef BURN_BRIDGES if (d->bd_tstamp == BPF_T_NONE || BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME) #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen = BPF_WORDALIGN32(hdrlen); else #endif hdrlen = BPF_WORDALIGN(hdrlen); return (hdrlen - d->bd_bif->bif_hdrlen); } static void bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype) { struct bintime bt2, boottimebin; struct timeval tsm; struct timespec tsn; if ((tstype & BPF_T_MONOTONIC) == 0) { bt2 = *bt; getboottimebin(&boottimebin); bintime_add(&bt2, &boottimebin); bt = &bt2; } switch (BPF_T_FORMAT(tstype)) { case 
BPF_T_MICROTIME: bintime2timeval(bt, &tsm); ts->bt_sec = tsm.tv_sec; ts->bt_frac = tsm.tv_usec; break; case BPF_T_NANOTIME: bintime2timespec(bt, &tsn); ts->bt_sec = tsn.tv_sec; ts->bt_frac = tsn.tv_nsec; break; case BPF_T_BINTIME: ts->bt_sec = bt->sec; ts->bt_frac = bt->frac; break; } } /* * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *bt) { static char zeroes[BPF_ALIGNMENT]; struct bpf_xhdr hdr; #ifndef BURN_BRIDGES struct bpf_hdr hdr_old; #ifdef COMPAT_FREEBSD32 struct bpf_hdr32 hdr32_old; #endif #endif int caplen, curlen, hdrlen, pad, totlen; int do_wakeup = 0; int do_timestamp; int tstype; BPFD_LOCK_ASSERT(d); if (d->bd_bif == NULL) { /* Descriptor was detached in concurrent thread */ counter_u64_add(d->bd_dcount, 1); return; } /* * Detect whether user space has released a buffer back to us, and if * so, move it from being a hold buffer to a free buffer. This may * not be the best place to do it (for example, we might only want to * run this check if we need the space), but for now it's a reliable * spot to do it. */ if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) { d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). */ hdrlen = bpf_hdrlen(d); totlen = hdrlen + min(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * Round up the end of the previous packet to the next longword. * * Drop the packet if there's no room and no hope of room * If the packet would overflow the storage buffer or the storage * buffer is considered immutable by the buffer model, try to rotate * the buffer and wakeup pending processes. */ #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) curlen = BPF_WORDALIGN32(d->bd_slen); else #endif curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) { if (d->bd_fbuf == NULL) { /* * There's no room in the store buffer, and no * prospect of room, so drop the packet. Notify the * buffer model. */ bpf_buffull(d); counter_u64_add(d->bd_dcount, 1); return; } KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use")); ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else { if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) { /* * Immediate mode is set, or the read timeout has * already expired during a select call. A packet * arrived, so the reader should be woken up. */ do_wakeup = 1; } pad = curlen - d->bd_slen; KASSERT(pad >= 0 && pad <= sizeof(zeroes), ("%s: invalid pad byte count %d", __func__, pad)); if (pad > 0) { /* Zero pad bytes. 
*/ bpf_append_bytes(d, d->bd_sbuf, d->bd_slen, zeroes, pad); } } caplen = totlen - hdrlen; tstype = d->bd_tstamp; do_timestamp = tstype != BPF_T_NONE; #ifndef BURN_BRIDGES if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) { struct bpf_ts ts; if (do_timestamp) bpf_bintime2ts(bt, &ts, tstype); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) { bzero(&hdr32_old, sizeof(hdr32_old)); if (do_timestamp) { hdr32_old.bh_tstamp.tv_sec = ts.bt_sec; hdr32_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr32_old.bh_datalen = pktlen; hdr32_old.bh_hdrlen = hdrlen; hdr32_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old, sizeof(hdr32_old)); goto copy; } #endif bzero(&hdr_old, sizeof(hdr_old)); if (do_timestamp) { hdr_old.bh_tstamp.tv_sec = ts.bt_sec; hdr_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr_old.bh_datalen = pktlen; hdr_old.bh_hdrlen = hdrlen; hdr_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old, sizeof(hdr_old)); goto copy; } #endif /* * Append the bpf header. Note we append the actual header size, but * move forward the length of the header plus padding. */ bzero(&hdr, sizeof(hdr)); if (do_timestamp) bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype); hdr.bh_datalen = pktlen; hdr.bh_hdrlen = hdrlen; hdr.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); /* * Copy the packet data into the store buffer and update its length. */ #ifndef BURN_BRIDGES copy: #endif (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen); d->bd_slen = curlen + totlen; if (do_wakeup) bpf_wakeup(d); } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpfd_free(epoch_context_t ctx) { struct bpf_d *d; struct bpf_program_buffer *p; /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ d = __containerof(ctx, struct bpf_d, epoch_ctx); bpf_free(d); if (d->bd_rfilter != NULL) { p = __containerof((void *)d->bd_rfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER p->func = d->bd_bfilter; #endif bpf_program_buffer_free(&p->epoch_ctx); } if (d->bd_wfilter != NULL) { p = __containerof((void *)d->bd_wfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER p->func = NULL; #endif bpf_program_buffer_free(&p->epoch_ctx); } mtx_destroy(&d->bd_lock); counter_u64_free(d->bd_rcount); counter_u64_free(d->bd_dcount); counter_u64_free(d->bd_fcount); counter_u64_free(d->bd_wcount); counter_u64_free(d->bd_wfcount); counter_u64_free(d->bd_wdcount); counter_u64_free(d->bd_zcopy); free(d, M_BPF); } /* * Attach an interface to bpf. dlt is the link layer type; hdrlen is the * fixed size of the link header (variable length headers not yet supported). */ void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } /* * Attach an interface to bpf. ifp is a pointer to the structure * defining the interface to be attached, dlt is the link layer type, * and hdrlen is the fixed size of the link header (variable length * headers are not yet supporrted). 
*/ void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { struct bpf_if *bp; KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO); CK_LIST_INIT(&bp->bif_dlist); CK_LIST_INIT(&bp->bif_wlist); bp->bif_ifp = ifp; bp->bif_dlt = dlt; bp->bif_hdrlen = hdrlen; bp->bif_bpf = driverp; refcount_init(&bp->bif_refcnt, 1); *driverp = bp; /* * Reference ifnet pointer, so it won't freed until * we release it. */ if_ref(ifp); BPF_LOCK(); CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); BPF_UNLOCK(); if (bootverbose && IS_DEFAULT_VNET(curvnet)) if_printf(ifp, "bpf attached\n"); } #ifdef VIMAGE /* * When moving interfaces between vnet instances we need a way to * query the dlt and hdrlen before detach so we can re-attch the if_bpf * after the vmove. We unfortunately have no device driver infrastructure * to query the interface for these values after creation/attach, thus * add this as a workaround. */ int bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen) { if (bp == NULL) return (ENXIO); if (bif_dlt == NULL && bif_hdrlen == NULL) return (0); if (bif_dlt != NULL) *bif_dlt = bp->bif_dlt; if (bif_hdrlen != NULL) *bif_hdrlen = bp->bif_hdrlen; return (0); } #endif /* * Detach bpf from an interface. This involves detaching each descriptor * associated with the interface. Notify each descriptor as it's detached * so that any sleepers wake up and get ENXIO. */ void bpfdetach(struct ifnet *ifp) { struct bpf_if *bp, *bp_temp; struct bpf_d *d; BPF_LOCK(); /* Find all bpf_if struct's which reference ifp and detach them. */ CK_LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { if (ifp != bp->bif_ifp) continue; CK_LIST_REMOVE(bp, bif_next); *bp->bif_bpf = __DECONST(struct bpf_if *, &dead_bpf_if); CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p", __func__, bp->bif_dlt, bp, ifp); /* Detach common descriptors */ while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) { bpf_detachd_locked(d, true); } /* Detach writer-only descriptors */ while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) { bpf_detachd_locked(d, true); } bpfif_rele(bp); } BPF_UNLOCK(); } bool bpf_peers_present_if(struct ifnet *ifp) { return (bpf_peers_present(ifp->if_bpf)); } /* * Get a list of available data link type of the interface. */ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { struct ifnet *ifp; struct bpf_if *bp; u_int *lst; int error, n, n1; BPF_LOCK_ASSERT(); ifp = d->bd_bif->bif_ifp; n1 = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp) n1++; } if (bfl->bfl_list == NULL) { bfl->bfl_len = n1; return (0); } if (n1 > bfl->bfl_len) return (ENOMEM); lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK); n = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; lst[n++] = bp->bif_dlt; } error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n); free(lst, M_TEMP); bfl->bfl_len = n; return (error); } /* * Set the data link type of a BPF instance. */ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; BPF_LOCK_ASSERT(); MPASS(d->bd_bif != NULL); /* * It is safe to check bd_bif without BPFD_LOCK, it can not be * changed while we hold global lock. 
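bpf_getdltlist() implements the usual two-call protocol: a NULL bfl_list returns only the count, and a second call with a large enough array fills it (ENOMEM if the array is too small). A hedged userland sketch that enumerates the DLTs and then switches to one of them with BIOCSDLT (fd is assumed to be bound with BIOCSETIF; error handling trimmed):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/bpf.h>
#include <stdio.h>
#include <stdlib.h>

static void
pick_dlt(int fd)
{
	struct bpf_dltlist dl = { .bfl_len = 0, .bfl_list = NULL };
	u_int i;

	ioctl(fd, BIOCGDLTLIST, &dl);			/* first call: count only  */
	dl.bfl_list = calloc(dl.bfl_len, sizeof(u_int));
	ioctl(fd, BIOCGDLTLIST, &dl);			/* second call: fill list  */

	for (i = 0; i < dl.bfl_len; i++)
		printf("supported DLT %u\n", dl.bfl_list[i]);

	if (dl.bfl_len > 0)
		ioctl(fd, BIOCSDLT, &dl.bfl_list[0]);	/* switch to the first one */
	free(dl.bfl_list);
}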
*/ if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } if (bp == NULL) return (EINVAL); opromisc = d->bd_promisc; bpf_attachd(d, bp); if (opromisc) { error = ifpromisc(bp->bif_ifp, 1); if (error) if_printf(bp->bif_ifp, "%s: ifpromisc failed (%d)\n", __func__, error); else d->bd_promisc = 1; } return (0); } static void bpf_drvinit(void *unused) { struct cdev *dev; sx_init(&bpf_sx, "bpf global lock"); CK_LIST_INIT(&bpf_iflist); dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); } /* * Zero out the various packet counters associated with all of the bpf * descriptors. At some point, we will probably want to get a bit more * granular and allow the user to specify descriptors to be zeroed. */ static void bpf_zero_counters(void) { struct bpf_if *bp; struct bpf_d *bd; BPF_LOCK(); /* * We are protected by global lock here, interfaces and * descriptors can not be deleted while we hold it. */ CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { counter_u64_zero(bd->bd_rcount); counter_u64_zero(bd->bd_dcount); counter_u64_zero(bd->bd_fcount); counter_u64_zero(bd->bd_wcount); counter_u64_zero(bd->bd_wfcount); counter_u64_zero(bd->bd_zcopy); } } BPF_UNLOCK(); } /* * Fill filter statistics */ static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { BPF_LOCK_ASSERT(); bzero(d, sizeof(*d)); d->bd_structsize = sizeof(*d); d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; d->bd_direction = bd->bd_direction; d->bd_feedback = bd->bd_feedback; d->bd_async = bd->bd_async; d->bd_rcount = counter_u64_fetch(bd->bd_rcount); d->bd_dcount = counter_u64_fetch(bd->bd_dcount); d->bd_fcount = counter_u64_fetch(bd->bd_fcount); d->bd_sig = bd->bd_sig; d->bd_slen = bd->bd_slen; d->bd_hlen = bd->bd_hlen; d->bd_bufsize = bd->bd_bufsize; d->bd_pid = bd->bd_pid; strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; d->bd_wcount = counter_u64_fetch(bd->bd_wcount); d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount); d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount); d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy); d->bd_bufmode = bd->bd_bufmode; } /* * Handle `netstat -B' stats request */ static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) { static const struct xbpf_d zerostats; struct xbpf_d *xbdbuf, *xbd, tempstats; int index, error; struct bpf_if *bp; struct bpf_d *bd; /* * XXX This is not technically correct. It is possible for non * privileged users to open bpf devices. It would make sense * if the users who opened the devices were able to retrieve * the statistics for them, too. */ error = priv_check(req->td, PRIV_NET_BPF); if (error) return (error); /* * Check to see if the user is requesting that the counters be * zeroed out. Explicitly check that the supplied data is zeroed, * as we aren't allowing the user to set the counters currently. 
*/ if (req->newptr != NULL) { if (req->newlen != sizeof(tempstats)) return (EINVAL); memset(&tempstats, 0, sizeof(tempstats)); error = SYSCTL_IN(req, &tempstats, sizeof(tempstats)); if (error) return (error); if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0) return (EINVAL); bpf_zero_counters(); return (0); } if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd))); if (bpf_bpfd_cnt == 0) return (SYSCTL_OUT(req, 0, 0)); xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); BPF_LOCK(); if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { BPF_UNLOCK(); free(xbdbuf, M_BPF); return (ENOMEM); } index = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { /* Send writers-only first */ CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { xbd = &xbdbuf[index++]; bpfstats_fill_xbpf(xbd, bd); } CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; bpfstats_fill_xbpf(xbd, bd); } } BPF_UNLOCK(); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); free(xbdbuf, M_BPF); return (error); } SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, bpf_drvinit, NULL); #else /* !DEV_BPF && !NETGRAPH_BPF */ /* * NOP stubs to allow bpf-using drivers to load and function. * * A 'better' implementation would allow the core bpf functionality * to be loaded at runtime. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { } void bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen) { } void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { } void bpf_mtap_if(if_t ifp, struct mbuf *m) { } void bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m) { } void bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m) { } void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { *driverp = __DECONST(struct bpf_if *, &dead_bpf_if); } void bpfdetach(struct ifnet *ifp) { } bool bpf_peers_present_if(struct ifnet *ifp) { return (false); } u_int bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { return (-1); /* "no filter" behaviour */ } int bpf_validate(const struct bpf_insn *f, int len) { return (0); /* false */ } #endif /* !DEV_BPF && !NETGRAPH_BPF */ #ifdef DDB static void bpf_show_bpf_if(struct bpf_if *bpf_if) { if (bpf_if == NULL) return; db_printf("%p:\n", bpf_if); #define BPF_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, bpf_if->e); #define BPF_DB_PRINTF_RAW(f, e) db_printf(" %s = " f "\n", #e, e); /* bif_ext.bif_next */ /* bif_ext.bif_dlist */ BPF_DB_PRINTF("%#x", bif_dlt); BPF_DB_PRINTF("%u", bif_hdrlen); /* bif_wlist */ BPF_DB_PRINTF("%p", bif_ifp); BPF_DB_PRINTF("%p", bif_bpf); BPF_DB_PRINTF_RAW("%u", refcount_load(&bpf_if->bif_refcnt)); } DB_SHOW_COMMAND(bpf_if, db_show_bpf_if) { if (!have_addr) { db_printf("usage: show bpf_if \n"); return; } bpf_show_bpf_if((struct bpf_if *)addr); } #endif diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c index 0dee2260973d..a0275a7471e5 100644 --- a/sys/net/if_tuntap.c +++ b/sys/net/if_tuntap.c @@ -1,2071 +1,2071 @@ /* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * Copyright (c) 2019 Kyle Evans * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has it's * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/poll mode of * operation though. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #ifdef INET6 #include #include #endif #include #include #include #include #include #include #include #include #include #include struct tuntap_driver; /* * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. 
*/ struct tuntap_softc { TAILQ_ENTRY(tuntap_softc) tun_list; struct cdev *tun_alias; struct cdev *tun_dev; u_short tun_flags; /* misc flags */ #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_UNUSED1 0x0008 #define TUN_UNUSED2 0x0010 #define TUN_LMODE 0x0020 #define TUN_RWAIT 0x0040 #define TUN_ASYNC 0x0080 #define TUN_IFHEAD 0x0100 #define TUN_DYING 0x0200 #define TUN_L2 0x0400 #define TUN_VMNET 0x0800 #define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) #define TUN_READY (TUN_OPEN | TUN_INITED) pid_t tun_pid; /* owning pid */ struct ifnet *tun_ifp; /* the interface */ struct sigio *tun_sigio; /* async I/O info */ struct tuntap_driver *tun_drv; /* appropriate driver */ struct selinfo tun_rsel; /* read select */ struct mtx tun_mtx; /* softc field mutex */ struct cv tun_cv; /* for ref'd dev destroy */ struct ether_addr tun_ether; /* remote address */ int tun_busy; /* busy count */ int tun_vhdrlen; /* virtio-net header length */ struct lro_ctrl tun_lro; /* for TCP LRO */ bool tun_lro_ready; /* TCP LRO initialized */ }; #define TUN2IFP(sc) ((sc)->tun_ifp) #define TUNDEBUG if (tundebug) if_printf #define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) #define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) #define TUN_LOCK_ASSERT(tp) mtx_assert(&(tp)->tun_mtx, MA_OWNED); #define TUN_VMIO_FLAG_MASK 0x0fff /* * Interface capabilities of a tap device that supports the virtio-net * header. */ #define TAP_VNET_HDR_CAPS (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6 \ | IFCAP_VLAN_HWCSUM \ | IFCAP_TSO | IFCAP_LRO \ | IFCAP_VLAN_HWTSO) #define TAP_ALL_OFFLOAD (CSUM_TSO | CSUM_TCP | CSUM_UDP |\ CSUM_TCP_IPV6 | CSUM_UDP_IPV6) /* * All mutable global variables in if_tun are locked using tunmtx, with * the exception of tundebug, which is used unlocked, and the drivers' *clones, * which are static after setup. 
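 *
 * Illustrative only: membership of the global tunhead list is manipulated
 * under tunmtx, exactly as tun_create_device() and tun_clone_destroy() do
 * further down:
 *
 *	mtx_lock(&tunmtx);
 *	TAILQ_REMOVE(&tunhead, tp, tun_list);
 *	mtx_unlock(&tunmtx);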
*/ static struct mtx tunmtx; static eventhandler_tag arrival_tag; static eventhandler_tag clone_tag; static const char tunname[] = "tun"; static const char tapname[] = "tap"; static const char vmnetname[] = "vmnet"; static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; static int tap_allow_uopen = 0; /* allow user devfs cloning */ static int tapuponopen = 0; /* IFF_UP on open() */ static int tapdclone = 1; /* enable devfs cloning */ static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); static struct sx tun_ioctl_sx; SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); SYSCTL_DECL(_net_link); /* tun */ static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP tunnel software network interface"); SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, "Enable legacy devfs interface creation"); /* tap */ static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Ethernet tunnel software network interface"); SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, "Enable legacy devfs interface creation for all users"); SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, "Bring interface up when /dev/tap is opened"); SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, "Enable legacy devfs interface creation"); SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); static int tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, struct cdev **dev, const char *name); static int tun_busy_locked(struct tuntap_softc *tp); static void tun_unbusy_locked(struct tuntap_softc *tp); static int tun_busy(struct tuntap_softc *tp); static void tun_unbusy(struct tuntap_softc *tp); static int tuntap_name2info(const char *name, int *unit, int *flags); static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(struct cdev *dev); static void tundtor(void *data); static void tunrename(void *arg, struct ifnet *ifp); static int tunifioctl(struct ifnet *, u_long, caddr_t); static void tuninit(struct ifnet *); static void tunifinit(void *xtp); static int tuntapmodevent(module_t, int, void *); static int tunoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *ro); static void tunstart(struct ifnet *); static void tunstart_l2(struct ifnet *); static int tun_clone_match(struct if_clone *ifc, const char *name); static int tap_clone_match(struct if_clone *ifc, const char *name); static int vmnet_clone_match(struct if_clone *ifc, const char *name); static int tun_clone_create(struct if_clone *, char *, size_t, struct ifc_data *, struct ifnet **); static int tun_clone_destroy(struct if_clone *, struct ifnet *, uint32_t); static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen); static d_open_t tunopen; static d_read_t tunread; static d_write_t tunwrite; static d_ioctl_t tunioctl; static d_poll_t tunpoll; static d_kqfilter_t tunkqfilter; static int tunkqread(struct knote *, long); static int tunkqwrite(struct knote *, long); static void tunkqdetach(struct knote *); -static struct filterops tun_read_filterops = { +static const struct filterops tun_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, }; -static struct filterops tun_write_filterops = { +static const struct 
filterops tun_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqwrite, }; static struct tuntap_driver { struct cdevsw cdevsw; int ident_flags; struct unrhdr *unrhdr; struct clonedevs *clones; ifc_match_f *clone_match_fn; ifc_create_f *clone_create_fn; ifc_destroy_f *clone_destroy_fn; } tuntap_drivers[] = { { .ident_flags = 0, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tunname, }, .clone_match_fn = tun_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tapname, }, .clone_match_fn = tap_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2 | TUN_VMNET, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = vmnetname, }, .clone_match_fn = vmnet_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, }; struct tuntap_driver_cloner { SLIST_ENTRY(tuntap_driver_cloner) link; struct tuntap_driver *drv; struct if_clone *cloner; }; VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); #define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) /* * Mechanism for marking a tunnel device as busy so that we can safely do some * orthogonal operations (such as operations on devices) without racing against * tun_destroy. tun_destroy will wait on the condvar if we're at all busy or * open, to be woken up when the condition is alleviated. */ static int tun_busy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); if ((tp->tun_flags & TUN_DYING) != 0) { /* * Perhaps unintuitive, but the device is busy going away. * Other interpretations of EBUSY from tun_busy make little * sense, since making a busy device even more busy doesn't * sound like a problem. */ return (EBUSY); } ++tp->tun_busy; return (0); } static void tun_unbusy_locked(struct tuntap_softc *tp) { TUN_LOCK_ASSERT(tp); KASSERT(tp->tun_busy != 0, ("tun_unbusy: called for non-busy tunnel")); --tp->tun_busy; /* Wake up anything that may be waiting on our busy tunnel. */ if (tp->tun_busy == 0) cv_broadcast(&tp->tun_cv); } static int tun_busy(struct tuntap_softc *tp) { int ret; TUN_LOCK(tp); ret = tun_busy_locked(tp); TUN_UNLOCK(tp); return (ret); } static void tun_unbusy(struct tuntap_softc *tp) { TUN_LOCK(tp); tun_unbusy_locked(tp); TUN_UNLOCK(tp); } /* * Sets unit and/or flags given the device name. Must be called with correct * vnet context. */ static int tuntap_name2info(const char *name, int *outunit, int *outflags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; char *dname; int flags, unit; bool found; if (name == NULL) return (EINVAL); /* * Needed for dev_stdclone, but dev_stdclone will not modify, it just * wants to be able to pass back a char * through the second param. We * will always set that as NULL here, so we'll fake it. 
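 *
 * To illustrate the resulting mapping (examples only): a bare driver name
 * such as "tap" hits the strcmp() case below and yields unit -1 ("pick any
 * unit") plus that driver's ident_flags, while a unit-qualified name such
 * as "tap3" is parsed by dev_stdclone() and yields unit 3 with the same
 * flags; any other name falls through and the function returns ENXIO.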
*/ dname = __DECONST(char *, name); found = false; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if (strcmp(name, drv->cdevsw.d_name) == 0) { found = true; unit = -1; flags = drv->ident_flags; break; } if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { found = true; flags = drv->ident_flags; break; } } if (!found) return (ENXIO); if (outunit != NULL) *outunit = unit; if (outflags != NULL) *outflags = flags; return (0); } /* * Get driver information from a set of flags specified. Masks the identifying * part of the flags and compares it against all of the available * tuntap_drivers. Must be called with correct vnet context. */ static struct tuntap_driver * tuntap_driver_from_flags(int tun_flags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) return (drv); } return (NULL); } static int tun_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_L2) == 0) return (1); } return (0); } static int tap_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) return (1); } return (0); } static int vmnet_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_VMNET) != 0) return (1); } return (0); } static int tun_clone_create(struct if_clone *ifc, char *name, size_t len, struct ifc_data *ifd, struct ifnet **ifpp) { struct tuntap_driver *drv; struct cdev *dev; int err, i, tunflags, unit; tunflags = 0; /* The name here tells us exactly what we're creating */ err = tuntap_name2info(name, &unit, &tunflags); if (err != 0) return (err); drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) return (ENXIO); if (unit != -1) { /* If this unit number is still available that's okay. 
*/ if (alloc_unr_specific(drv->unrhdr, unit) == -1) return (EEXIST); } else { unit = alloc_unr(drv->unrhdr); } snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); /* find any existing device, or allocate new unit number */ dev = NULL; i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); /* No preexisting struct cdev *, create one */ if (i != 0) i = tun_create_device(drv, unit, NULL, &dev, name); if (i == 0) { dev_ref(dev); tuncreate(dev); struct tuntap_softc *tp = dev->si_drv1; *ifpp = tp->tun_ifp; } return (i); } static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { char devname[SPECNAMELEN + 1]; struct tuntap_driver *drv; int append_unit, i, u, tunflags; bool mayclone; if (*dev != NULL) return; tunflags = 0; CURVNET_SET(CRED_TO_VNET(cred)); if (tuntap_name2info(name, &u, &tunflags) != 0) goto out; /* Not recognized */ if (u != -1 && u > IF_MAXUNIT) goto out; /* Unit number too high */ mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0; if ((tunflags & TUN_L2) != 0) { /* tap/vmnet allow user open with a sysctl */ mayclone = (mayclone || tap_allow_uopen) && tapdclone; } else { mayclone = mayclone && tundclone; } /* * If tun cloning is enabled, only the superuser can create an * interface. */ if (!mayclone) goto out; if (u == -1) append_unit = 1; else append_unit = 0; drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) goto out; /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); if (i) { if (append_unit) { namelen = snprintf(devname, sizeof(devname), "%s%d", name, u); name = devname; } i = tun_create_device(drv, u, cred, dev, name); } if (i == 0) { dev_ref(*dev); if_clone_create(name, namelen, NULL); } out: CURVNET_RESTORE(); } static void tun_destroy(struct tuntap_softc *tp) { TUN_LOCK(tp); tp->tun_flags |= TUN_DYING; if (tp->tun_busy != 0) cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); else TUN_UNLOCK(tp); CURVNET_SET(TUN2IFP(tp)->if_vnet); /* destroy_dev will take care of any alias. 
*/ destroy_dev(tp->tun_dev); seldrain(&tp->tun_rsel); knlist_clear(&tp->tun_rsel.si_note, 0); knlist_destroy(&tp->tun_rsel.si_note); if ((tp->tun_flags & TUN_L2) != 0) { ether_ifdetach(TUN2IFP(tp)); } else { bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); } sx_xlock(&tun_ioctl_sx); TUN2IFP(tp)->if_softc = NULL; sx_xunlock(&tun_ioctl_sx); free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); if_free(TUN2IFP(tp)); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); free(tp, M_TUN); CURVNET_RESTORE(); } static int tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp, uint32_t flags) { struct tuntap_softc *tp = ifp->if_softc; mtx_lock(&tunmtx); TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); return (0); } static void vnet_tun_init(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; int i; for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); drvc->drv = drv; struct if_clone_addreq req = { .match_f = drv->clone_match_fn, .create_f = drv->clone_create_fn, .destroy_f = drv->clone_destroy_fn, }; drvc->cloner = ifc_attach_cloner(drv->cdevsw.d_name, &req); SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); }; } VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_init, NULL); static void vnet_tun_uninit(const void *unused __unused) { struct tuntap_driver_cloner *drvc; while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { drvc = SLIST_FIRST(&V_tuntap_driver_cloners); SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); if_clone_detach(drvc->cloner); free(drvc, M_TUN); } } VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_uninit, NULL); static void tun_uninit(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_softc *tp; int i; EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag); EVENTHANDLER_DEREGISTER(dev_clone, clone_tag); mtx_lock(&tunmtx); while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); mtx_lock(&tunmtx); } mtx_unlock(&tunmtx); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; delete_unrhdr(drv->unrhdr); clone_cleanup(&drv->clones); } mtx_destroy(&tunmtx); } SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); static struct tuntap_driver * tuntap_driver_from_ifnet(const struct ifnet *ifp) { struct tuntap_driver *drv; int i; if (ifp == NULL) return (NULL); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; if (strcmp(ifp->if_dname, drv->cdevsw.d_name) == 0) return (drv); } return (NULL); } static int tuntapmodevent(module_t mod, int type, void *data) { struct tuntap_driver *drv; int i; switch (type) { case MOD_LOAD: mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; clone_setup(&drv->clones); drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); } arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event, tunrename, 0, 1000); if (arrival_tag == NULL) return (ENOMEM); clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); if (clone_tag == NULL) return (ENOMEM); break; case MOD_UNLOAD: /* See tun_uninit, so it's done after the vnet_sysuninit() */ break; default: return EOPNOTSUPP; } return 0; } static moduledata_t tuntap_mod = { "if_tuntap", tuntapmodevent, 0 }; /* We'll only ever have these two, so no need for a macro. 
*/ static moduledata_t tun_mod = { "if_tun", NULL, 0 }; static moduledata_t tap_mod = { "if_tap", NULL, 0 }; DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tuntap, 1); DECLARE_MODULE(if_tun, tun_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tun, 1); DECLARE_MODULE(if_tap, tap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tap, 1); static int tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr, struct cdev **dev, const char *name) { struct make_dev_args args; struct tuntap_softc *tp; int error; tp = malloc(sizeof(*tp), M_TUN, M_WAITOK | M_ZERO); mtx_init(&tp->tun_mtx, "tun_mtx", NULL, MTX_DEF); cv_init(&tp->tun_cv, "tun_condvar"); tp->tun_flags = drv->ident_flags; tp->tun_drv = drv; make_dev_args_init(&args); if (cr != NULL) args.mda_flags = MAKEDEV_REF | MAKEDEV_CHECKNAME; args.mda_devsw = &drv->cdevsw; args.mda_cr = cr; args.mda_uid = UID_UUCP; args.mda_gid = GID_DIALER; args.mda_mode = 0600; args.mda_unit = unit; args.mda_si_drv1 = tp; error = make_dev_s(&args, dev, "%s", name); if (error != 0) { free(tp, M_TUN); return (error); } KASSERT((*dev)->si_drv1 != NULL, ("Failed to set si_drv1 at %s creation", name)); tp->tun_dev = *dev; knlist_init_mtx(&tp->tun_rsel.si_note, &tp->tun_mtx); mtx_lock(&tunmtx); TAILQ_INSERT_TAIL(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); return (0); } static void tunstart(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; struct mbuf *m; TUNDEBUG(ifp, "starting\n"); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, m); if (m == NULL) { IFQ_UNLOCK(&ifp->if_snd); return; } IFQ_UNLOCK(&ifp->if_snd); } TUN_LOCK(tp); if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); } else TUN_UNLOCK(tp); } /* * tunstart_l2 * * queue packets from higher level ready to put out */ static void tunstart_l2(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "starting\n"); /* * do not junk pending output if we are in VMnet mode. * XXX: can this do any harm because of queue overflow? */ TUN_LOCK(tp); if (((tp->tun_flags & TUN_VMNET) == 0) && ((tp->tun_flags & TUN_READY) != TUN_READY)) { struct mbuf *m; /* Unlocked read. */ TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); for (;;) { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else break; } TUN_UNLOCK(tp); return; } ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (!IFQ_IS_EMPTY(&ifp->if_snd)) { if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); TUN_LOCK(tp); } selwakeuppri(&tp->tun_rsel, PZERO+1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); } /* tunstart_l2 */ static int tap_transmit(struct ifnet *ifp, struct mbuf *m) { int error; BPF_MTAP(ifp, m); IFQ_HANDOFF(ifp, m, error); return (error); } /* XXX: should return an error code so it can fail. 
*/ static void tuncreate(struct cdev *dev) { struct tuntap_driver *drv; struct tuntap_softc *tp; struct ifnet *ifp; struct ether_addr eaddr; int iflags; u_char type; tp = dev->si_drv1; KASSERT(tp != NULL, ("si_drv1 should have been initialized at creation")); drv = tp->tun_drv; iflags = IFF_MULTICAST; if ((tp->tun_flags & TUN_L2) != 0) { type = IFT_ETHER; iflags |= IFF_BROADCAST | IFF_SIMPLEX; } else { type = IFT_PPP; iflags |= IFF_POINTOPOINT; } ifp = tp->tun_ifp = if_alloc(type); ifp->if_softc = tp; if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev)); ifp->if_ioctl = tunifioctl; ifp->if_flags = iflags; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_MEXTPG; if ((tp->tun_flags & TUN_L2) != 0) ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO; ifp->if_capenable |= IFCAP_LINKSTATE | IFCAP_MEXTPG; if ((tp->tun_flags & TUN_L2) != 0) { ifp->if_init = tunifinit; ifp->if_start = tunstart_l2; ifp->if_transmit = tap_transmit; ifp->if_qflush = if_qflush; ether_gen_addr(ifp, &eaddr); ether_ifattach(ifp, eaddr.octet); } else { ifp->if_mtu = TUNMTU; ifp->if_start = tunstart; ifp->if_output = tunoutput; ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); } TUN_LOCK(tp); tp->tun_flags |= TUN_INITED; TUN_UNLOCK(tp); TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", ifp->if_xname, dev2unit(dev)); } static void tunrename(void *arg __unused, struct ifnet *ifp) { struct tuntap_softc *tp; int error; if ((ifp->if_flags & IFF_RENAMING) == 0) return; if (tuntap_driver_from_ifnet(ifp) == NULL) return; /* * We need to grab the ioctl sx long enough to make sure the softc is * still there. If it is, we can safely try to busy the tun device. * The busy may fail if the device is currently dying, in which case * we do nothing. If it doesn't fail, the busy count stops the device * from dying until we've created the alias (that will then be * subsequently destroyed). */ sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { sx_xunlock(&tun_ioctl_sx); return; } error = tun_busy(tp); sx_xunlock(&tun_ioctl_sx); if (error != 0) return; if (tp->tun_alias != NULL) { destroy_dev(tp->tun_alias); tp->tun_alias = NULL; } if (strcmp(ifp->if_xname, tp->tun_dev->si_name) == 0) goto out; /* * Failure's ok, aliases are created on a best effort basis. If a * tun user/consumer decides to rename the interface to conflict with * another device (non-ifnet) on the system, we will assume they know * what they are doing. make_dev_alias_p won't touch tun_alias on * failure, so we use it but ignore the return value. 
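 *
 * For example (interface name purely hypothetical): renaming tun0 to "vpn0"
 * leaves the original /dev/tun0 node untouched and, on success, adds a
 * /dev/vpn0 alias pointing at it; a subsequent rename destroys that alias
 * first, as handled above.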
*/ make_dev_alias_p(MAKEDEV_CHECKNAME, &tp->tun_alias, tp->tun_dev, "%s", ifp->if_xname); out: tun_unbusy(tp); } static int tunopen(struct cdev *dev, int flag, int mode, struct thread *td) { struct ifnet *ifp; struct tuntap_softc *tp; int error __diagused, tunflags; tunflags = 0; CURVNET_SET(TD_TO_VNET(td)); error = tuntap_name2info(dev->si_name, NULL, &tunflags); if (error != 0) { CURVNET_RESTORE(); return (error); /* Shouldn't happen */ } tp = dev->si_drv1; KASSERT(tp != NULL, ("si_drv1 should have been initialized at creation")); TUN_LOCK(tp); if ((tp->tun_flags & TUN_INITED) == 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (ENXIO); } if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (EBUSY); } error = tun_busy_locked(tp); KASSERT(error == 0, ("Must be able to busy an unopen tunnel")); ifp = TUN2IFP(tp); if ((tp->tun_flags & TUN_L2) != 0) { bcopy(IF_LLADDR(ifp), tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (tapuponopen) ifp->if_flags |= IFF_UP; } tp->tun_pid = td->td_proc->p_pid; tp->tun_flags |= TUN_OPEN; if_link_state_change(ifp, LINK_STATE_UP); TUNDEBUG(ifp, "open\n"); TUN_UNLOCK(tp); /* * This can fail with either ENOENT or EBUSY. This is in the middle of * d_open, so ENOENT should not be possible. EBUSY is possible, but * the only cdevpriv dtor being set will be tundtor and the softc being * passed is constant for a given cdev. We ignore the possible error * because of this as either "unlikely" or "not actually a problem." */ (void)devfs_set_cdevpriv(tp, tundtor); CURVNET_RESTORE(); return (0); } /* * tundtor - tear down the device - mark i/f down & delete * routing info */ static void tundtor(void *data) { struct proc *p; struct tuntap_softc *tp; struct ifnet *ifp; bool l2tun; tp = data; p = curproc; ifp = TUN2IFP(tp); TUN_LOCK(tp); /* * Realistically, we can't be obstinate here. This only means that the * tuntap device was closed out of order, and the last closer wasn't the * controller. These are still good to know about, though, as software * should avoid multiple processes with a tuntap device open and * ill-defined transfer of control (e.g., handoff, TUNSIFPID, close in * parent). */ if (p->p_pid != tp->tun_pid) { log(LOG_INFO, "pid %d (%s), %s: tun/tap protocol violation, non-controlling process closed last.\n", p->p_pid, p->p_comm, tp->tun_dev->si_name); } /* * junk all pending output */ CURVNET_SET(ifp->if_vnet); l2tun = false; if ((tp->tun_flags & TUN_L2) != 0) { l2tun = true; IF_DRAIN(&ifp->if_snd); } else { IFQ_PURGE(&ifp->if_snd); } /* For vmnet, we won't do most of the address/route bits */ if ((tp->tun_flags & TUN_VMNET) != 0 || (l2tun && (ifp->if_flags & IFF_LINK0) != 0)) goto out; #if defined(INET) || defined(INET6) if (l2tun && tp->tun_lro_ready) { TUNDEBUG (ifp, "LRO disabled\n"); tcp_lro_free(&tp->tun_lro); tp->tun_lro_ready = false; } #endif if (ifp->if_flags & IFF_UP) { TUN_UNLOCK(tp); if_down(ifp); TUN_LOCK(tp); } /* Delete all addresses and routes which reference this interface. 
*/ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; TUN_UNLOCK(tp); if_purgeaddrs(ifp); TUN_LOCK(tp); } out: if_link_state_change(ifp, LINK_STATE_DOWN); CURVNET_RESTORE(); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); TUNDEBUG (ifp, "closed\n"); tp->tun_flags &= ~TUN_OPEN; tp->tun_pid = 0; tun_vnethdr_set(ifp, 0); tun_unbusy_locked(tp); TUN_UNLOCK(tp); } static void tuninit(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "tuninit\n"); TUN_LOCK(tp); ifp->if_drv_flags |= IFF_DRV_RUNNING; if ((tp->tun_flags & TUN_L2) == 0) { ifp->if_flags |= IFF_UP; getmicrotime(&ifp->if_lastchange); TUN_UNLOCK(tp); } else { #if defined(INET) || defined(INET6) if (tcp_lro_init(&tp->tun_lro) == 0) { TUNDEBUG(ifp, "LRO enabled\n"); tp->tun_lro.ifp = ifp; tp->tun_lro_ready = true; } else { TUNDEBUG(ifp, "Could not enable LRO\n"); tp->tun_lro_ready = false; } #endif ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); /* attempt to start output */ tunstart_l2(ifp); } } /* * Used only for l2 tunnel. */ static void tunifinit(void *xtp) { struct tuntap_softc *tp; tp = (struct tuntap_softc *)xtp; tuninit(tp->tun_ifp); } /* * To be called under TUN_LOCK. Update ifp->if_hwassist according to the * current value of ifp->if_capenable. */ static void tun_caps_changed(struct ifnet *ifp) { uint64_t hwassist = 0; TUN_LOCK_ASSERT((struct tuntap_softc *)ifp->if_softc); if (ifp->if_capenable & IFCAP_TXCSUM) hwassist |= CSUM_TCP | CSUM_UDP; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6; if (ifp->if_capenable & IFCAP_TSO4) hwassist |= CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO6) hwassist |= CSUM_IP6_TSO; ifp->if_hwassist = hwassist; } /* * To be called under TUN_LOCK. Update tp->tun_vhdrlen and adjust * if_capabilities and if_capenable as needed. */ static void tun_vnethdr_set(struct ifnet *ifp, int vhdrlen) { struct tuntap_softc *tp = ifp->if_softc; TUN_LOCK_ASSERT(tp); if (tp->tun_vhdrlen == vhdrlen) return; /* * Update if_capabilities to reflect the * functionalities offered by the virtio-net * header. */ if (vhdrlen != 0) ifp->if_capabilities |= TAP_VNET_HDR_CAPS; else ifp->if_capabilities &= ~TAP_VNET_HDR_CAPS; /* * Disable any capabilities that we don't * support anymore. */ ifp->if_capenable &= ifp->if_capabilities; tun_caps_changed(ifp); tp->tun_vhdrlen = vhdrlen; TUNDEBUG(ifp, "vnet_hdr_len=%d, if_capabilities=%x\n", vhdrlen, ifp->if_capabilities); } /* * Process an ioctl request. 
*/ static int tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct tuntap_softc *tp; struct ifstat *ifs; struct ifmediareq *ifmr; int dummy, error = 0; bool l2tun; ifmr = NULL; sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { error = ENXIO; goto bad; } l2tun = (tp->tun_flags & TUN_L2) != 0; switch(cmd) { case SIOCGIFSTATUS: ifs = (struct ifstat *)data; TUN_LOCK(tp); if (tp->tun_pid) snprintf(ifs->ascii, sizeof(ifs->ascii), "\tOpened by PID %d\n", tp->tun_pid); else ifs->ascii[0] = '\0'; TUN_UNLOCK(tp); break; case SIOCSIFADDR: if (l2tun) error = ether_ioctl(ifp, cmd, data); else tuninit(ifp); if (error == 0) TUNDEBUG(ifp, "address set\n"); break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; TUNDEBUG(ifp, "mtu set\n"); break; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGIFMEDIA: if (!l2tun) { error = EINVAL; break; } ifmr = (struct ifmediareq *)data; dummy = ifmr->ifm_count; ifmr->ifm_count = 1; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER | IFM_FDX | IFM_1000_T; if (tp->tun_flags & TUN_OPEN) ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_current = ifmr->ifm_active; if (dummy >= 1) { int media = IFM_ETHER; error = copyout(&media, ifmr->ifm_ulist, sizeof(int)); } break; case SIOCSIFCAP: TUN_LOCK(tp); ifp->if_capenable = ifr->ifr_reqcap; tun_caps_changed(ifp); TUN_UNLOCK(tp); VLAN_CAPABILITIES(ifp); break; default: if (l2tun) { error = ether_ioctl(ifp, cmd, data); } else { error = EINVAL; } } bad: sx_xunlock(&tun_ioctl_sx); return (error); } /* * tunoutput - queue packets from higher level ready to put out. */ static int tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { struct tuntap_softc *tp = ifp->if_softc; u_short cached_tun_flags; int error; u_int32_t af; TUNDEBUG (ifp, "tunoutput\n"); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m0); if (error) { m_freem(m0); return (error); } #endif /* Could be unlocked read? */ TUN_LOCK(tp); cached_tun_flags = tp->tun_flags; TUN_UNLOCK(tp); if ((cached_tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); m_freem (m0); return (EHOSTDOWN); } if ((ifp->if_flags & IFF_UP) != IFF_UP) { m_freem (m0); return (EHOSTDOWN); } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC || dst->sa_family == pseudo_AF_HDRCMPLT) bcopy(dst->sa_data, &af, sizeof(af)); else af = RO_GET_FAMILY(ro, dst); BPF_MTAP2(ifp, &af, sizeof(af), m0); /* prepend sockaddr? this may abort if the mbuf allocation fails */ if (cached_tun_flags & TUN_LMODE) { /* allocate space for sockaddr */ M_PREPEND(m0, dst->sa_len, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else { bcopy(dst, m0->m_data, dst->sa_len); } } if (cached_tun_flags & TUN_IFHEAD) { /* Prepend the address family */ M_PREPEND(m0, 4, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else *(u_int32_t *)m0->m_data = htonl(af); } else { #ifdef INET if (af != AF_INET) #endif { m_freem(m0); return (EAFNOSUPPORT); } } error = (ifp->if_transmit)(ifp, m0); if (error) return (ENOBUFS); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } /* * the cdevsw interface is now pretty minimal. 
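 *
 * Userspace sketch (illustrative, not part of this driver; the device path
 * is an assumption and error checking is omitted) exercising two of the
 * ioctls handled below: TUNSIFHEAD requests a 4-byte address-family prefix
 * on every packet, TUNGIFNAME recovers the backing interface name.
 *
 *	#include <sys/types.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <net/if_tun.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/dev/tun0", O_RDWR);
 *	int one = 1;
 *	struct ifreq ifr;
 *
 *	(void)ioctl(fd, TUNSIFHEAD, &one);
 *	(void)ioctl(fd, TUNGIFNAME, &ifr);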
*/ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct ifreq ifr, *ifrp; struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct tuninfo *tunp; int error, iflags, ival; bool l2tun; l2tun = (tp->tun_flags & TUN_L2) != 0; if (l2tun) { /* tap specific ioctls */ switch(cmd) { /* VMware/VMnet port ioctl's */ #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) case _IO('V', 0): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ iflags = *(int *)data; iflags &= TUN_VMIO_FLAG_MASK; iflags &= ~IFF_CANTCHANGE; iflags |= IFF_UP; TUN_LOCK(tp); ifp->if_flags = iflags | (ifp->if_flags & IFF_CANTCHANGE); TUN_UNLOCK(tp); return (0); case SIOCGIFADDR: /* get MAC address of the remote side */ TUN_LOCK(tp); bcopy(&tp->tun_ether.octet, data, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case SIOCSIFADDR: /* set MAC address of the remote side */ TUN_LOCK(tp); bcopy(data, &tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case TAPSVNETHDR: ival = *(int *)data; if (ival != 0 && ival != sizeof(struct virtio_net_hdr) && ival != sizeof(struct virtio_net_hdr_mrg_rxbuf)) { return (EINVAL); } TUN_LOCK(tp); tun_vnethdr_set(ifp, ival); TUN_UNLOCK(tp); return (0); case TAPGVNETHDR: TUN_LOCK(tp); *(int *)data = tp->tun_vhdrlen; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } else { switch (cmd) { case TUNSLMODE: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_LMODE; tp->tun_flags &= ~TUN_IFHEAD; } else tp->tun_flags &= ~TUN_LMODE; TUN_UNLOCK(tp); return (0); case TUNSIFHEAD: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_IFHEAD; tp->tun_flags &= ~TUN_LMODE; } else tp->tun_flags &= ~TUN_IFHEAD; TUN_UNLOCK(tp); return (0); case TUNGIFHEAD: TUN_LOCK(tp); *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 
1 : 0; TUN_UNLOCK(tp); return (0); case TUNSIFMODE: /* deny this if UP */ if (TUN2IFP(tp)->if_flags & IFF_UP) return (EBUSY); switch (*(int *)data & ~IFF_MULTICAST) { case IFF_POINTOPOINT: case IFF_BROADCAST: TUN_LOCK(tp); TUN2IFP(tp)->if_flags &= ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); TUN2IFP(tp)->if_flags |= *(int *)data; TUN_UNLOCK(tp); break; default: return (EINVAL); } return (0); case TUNSIFPID: TUN_LOCK(tp); tp->tun_pid = curthread->td_proc->p_pid; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } switch (cmd) { case TUNGIFNAME: ifrp = (struct ifreq *)data; strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); return (0); case TUNSIFINFO: tunp = (struct tuninfo *)data; if (TUN2IFP(tp)->if_type != tunp->type) return (EPROTOTYPE); TUN_LOCK(tp); if (TUN2IFP(tp)->if_mtu != tunp->mtu) { strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); ifr.ifr_mtu = tunp->mtu; CURVNET_SET(TUN2IFP(tp)->if_vnet); error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), (caddr_t)&ifr, td); CURVNET_RESTORE(); if (error) { TUN_UNLOCK(tp); return (error); } } TUN2IFP(tp)->if_baudrate = tunp->baudrate; TUN_UNLOCK(tp); break; case TUNGIFINFO: tunp = (struct tuninfo *)data; TUN_LOCK(tp); tunp->mtu = TUN2IFP(tp)->if_mtu; tunp->type = TUN2IFP(tp)->if_type; tunp->baudrate = TUN2IFP(tp)->if_baudrate; TUN_UNLOCK(tp); break; case TUNSDEBUG: tundebug = *(int *)data; break; case TUNGDEBUG: *(int *)data = tundebug; break; case FIONBIO: break; case FIOASYNC: TUN_LOCK(tp); if (*(int *)data) tp->tun_flags |= TUN_ASYNC; else tp->tun_flags &= ~TUN_ASYNC; TUN_UNLOCK(tp); break; case FIONREAD: if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { struct mbuf *mb; IFQ_LOCK(&TUN2IFP(tp)->if_snd); IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); for (*(int *)data = 0; mb != NULL; mb = mb->m_next) *(int *)data += mb->m_len; IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); } else *(int *)data = 0; break; case FIOSETOWN: return (fsetown(*(int *)data, &tp->tun_sigio)); case FIOGETOWN: *(int *)data = fgetown(&tp->tun_sigio); return (0); /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: return (fsetown(-(*(int *)data), &tp->tun_sigio)); /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&tp->tun_sigio); return (0); default: return (ENOTTY); } return (0); } /* * The cdevsw read interface - reads a packet at a time, or at * least as much of a packet as can be read. 
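 *
 * Illustrative consumer fragment (not part of this file; buffer size is an
 * assumption, headers and error handling omitted).  With TUNSIFHEAD enabled,
 * the first four bytes of each packet are the address family in network byte
 * order, as written by tunoutput():
 *
 *	char buf[2048];
 *	uint32_t af;
 *	ssize_t n;
 *
 *	n = read(fd, buf, sizeof(buf));
 *	if (n >= (ssize_t)sizeof(af)) {
 *		memcpy(&af, buf, sizeof(af));
 *		af = ntohl(af);
 *	}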
*/ static int tunread(struct cdev *dev, struct uio *uio, int flag) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; size_t len; int error = 0; TUNDEBUG (ifp, "read\n"); TUN_LOCK(tp); if ((tp->tun_flags & TUN_READY) != TUN_READY) { TUN_UNLOCK(tp); TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); return (EHOSTDOWN); } tp->tun_flags &= ~TUN_RWAIT; for (;;) { IFQ_DEQUEUE(&ifp->if_snd, m); if (m != NULL) break; if (flag & O_NONBLOCK) { TUN_UNLOCK(tp); return (EWOULDBLOCK); } tp->tun_flags |= TUN_RWAIT; error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), "tunread", 0); if (error != 0) { TUN_UNLOCK(tp); return (error); } } TUN_UNLOCK(tp); len = min(tp->tun_vhdrlen, uio->uio_resid); if (len > 0) { struct virtio_net_hdr_mrg_rxbuf vhdr; bzero(&vhdr, sizeof(vhdr)); if (m->m_pkthdr.csum_flags & TAP_ALL_OFFLOAD) { m = virtio_net_tx_offload(ifp, m, false, &vhdr.hdr); } TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, " "gs %u, cs %u, co %u\n", vhdr.hdr.flags, vhdr.hdr.gso_type, vhdr.hdr.hdr_len, vhdr.hdr.gso_size, vhdr.hdr.csum_start, vhdr.hdr.csum_offset); error = uiomove(&vhdr, len, uio); } if (error == 0) error = m_mbuftouio(uio, m, 0); m_freem(m); return (error); } static int tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m, struct virtio_net_hdr_mrg_rxbuf *vhdr) { struct epoch_tracker et; struct ether_header *eh; struct ifnet *ifp; ifp = TUN2IFP(tp); /* * Only pass a unicast frame to ether_input(), if it would * actually have been received by non-virtual hardware. */ if (m->m_len < sizeof(struct ether_header)) { m_freem(m); return (0); } eh = mtod(m, struct ether_header *); if ((ifp->if_flags & IFF_PROMISC) == 0 && !ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { m_freem(m); return (0); } if (vhdr != NULL) { if (virtio_net_rx_csum(m, &vhdr->hdr)) { m_freem(m); return (0); } } else { switch (ntohs(eh->ether_type)) { #ifdef INET case ETHERTYPE_IP: if (ifp->if_capenable & IFCAP_RXCSUM) { m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_SCTP_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } break; #endif #ifdef INET6 case ETHERTYPE_IPV6: if (ifp->if_capenable & IFCAP_RXCSUM_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_SCTP_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } break; #endif } } /* Pass packet up to parent. */ CURVNET_SET(ifp->if_vnet); NET_EPOCH_ENTER(et); #if defined(INET) || defined(INET6) if (tp->tun_lro_ready && ifp->if_capenable & IFCAP_LRO && tcp_lro_rx(&tp->tun_lro, m, 0) == 0) tcp_lro_flush_all(&tp->tun_lro); else #endif (*ifp->if_input)(ifp, m); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); /* ibytes are counted in parent */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); return (0); } static int tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m) { struct epoch_tracker et; struct ifnet *ifp; int family, isr; ifp = TUN2IFP(tp); /* Could be unlocked read? 
*/ TUN_LOCK(tp); if (tp->tun_flags & TUN_IFHEAD) { TUN_UNLOCK(tp); if (m->m_len < sizeof(family) && (m = m_pullup(m, sizeof(family))) == NULL) return (ENOBUFS); family = ntohl(*mtod(m, u_int32_t *)); m_adj(m, sizeof(family)); } else { TUN_UNLOCK(tp); family = AF_INET; } BPF_MTAP2(ifp, &family, sizeof(family), m); switch (family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); CURVNET_SET(ifp->if_vnet); M_SETFIB(m, ifp->if_fib); NET_EPOCH_ENTER(et); netisr_dispatch(isr, m); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } /* * the cdevsw write interface - an atomic write is a packet - or else! */ static int tunwrite(struct cdev *dev, struct uio *uio, int flag) { struct virtio_net_hdr_mrg_rxbuf vhdr; struct tuntap_softc *tp; struct ifnet *ifp; struct mbuf *m; uint32_t mru; int align, vhdrlen, error; bool l2tun; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUNDEBUG(ifp, "tunwrite\n"); if ((ifp->if_flags & IFF_UP) != IFF_UP) /* ignore silently */ return (0); if (uio->uio_resid == 0) return (0); l2tun = (tp->tun_flags & TUN_L2) != 0; mru = l2tun ? TAPMRU : TUNMRU; vhdrlen = tp->tun_vhdrlen; align = 0; if (l2tun) { align = ETHER_ALIGN; mru += vhdrlen; } else if ((tp->tun_flags & TUN_IFHEAD) != 0) mru += sizeof(uint32_t); /* family */ if (uio->uio_resid < 0 || uio->uio_resid > mru) { TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); return (EIO); } if (vhdrlen > 0) { error = uiomove(&vhdr, vhdrlen, uio); if (error != 0) return (error); TUNDEBUG(ifp, "txvhdr: f %u, gt %u, hl %u, " "gs %u, cs %u, co %u\n", vhdr.hdr.flags, vhdr.hdr.gso_type, vhdr.hdr.hdr_len, vhdr.hdr.gso_size, vhdr.hdr.csum_start, vhdr.hdr.csum_offset); } if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (ENOBUFS); } m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (l2tun) return (tunwrite_l2(tp, m, vhdrlen > 0 ? &vhdr : NULL)); return (tunwrite_l3(tp, m)); } /* * tunpoll - the poll interface, this is only useful on reads * really. The write detect always returns true, write never blocks * anyway, it either accepts the packet or drops it. */ static int tunpoll(struct cdev *dev, int events, struct thread *td) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { IFQ_LOCK(&ifp->if_snd); if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); } else { TUNDEBUG(ifp, "tunpoll waiting\n"); selrecord(td, &tp->tun_rsel); } IFQ_UNLOCK(&ifp->if_snd); } revents |= events & (POLLOUT | POLLWRNORM); return (revents); } /* * tunkqfilter - support for the kevent() system call. 
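 *
 * Illustrative consumer usage (not part of this file; headers and error
 * handling omitted): register for read readiness on the descriptor and
 * block until a packet is queued on the interface.
 *
 *	struct kevent kev, ev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
 *	(void)kevent(kq, NULL, 0, &ev, 1, NULL);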
*/ static int tunkqfilter(struct cdev *dev, struct knote *kn) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); switch(kn->kn_filter) { case EVFILT_READ: TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_read_filterops; break; case EVFILT_WRITE: TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_write_filterops; break; default: TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); return(EINVAL); } kn->kn_hook = tp; knlist_add(&tp->tun_rsel.si_note, kn, 0); return (0); } /* * Return true of there is data in the interface queue. */ static int tunkqread(struct knote *kn, long hint) { int ret; struct tuntap_softc *tp = kn->kn_hook; struct cdev *dev = tp->tun_dev; struct ifnet *ifp = TUN2IFP(tp); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TUNDEBUG(ifp, "%s have data in the queue. Len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); ret = 1; } else { TUNDEBUG(ifp, "%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); ret = 0; } return (ret); } /* * Always can write, always return MTU in kn->data. */ static int tunkqwrite(struct knote *kn, long hint) { struct tuntap_softc *tp = kn->kn_hook; struct ifnet *ifp = TUN2IFP(tp); kn->kn_data = ifp->if_mtu; return (1); } static void tunkqdetach(struct knote *kn) { struct tuntap_softc *tp = kn->kn_hook; knlist_remove(&tp->tun_rsel.si_note, kn, 0); } diff --git a/sys/security/audit/audit_pipe.c b/sys/security/audit/audit_pipe.c index 926865b499d1..c50287321cbd 100644 --- a/sys/security/audit/audit_pipe.c +++ b/sys/security/audit/audit_pipe.c @@ -1,1077 +1,1077 @@ /*- * Copyright (c) 2006 Robert N. M. Watson * Copyright (c) 2008-2009 Apple, Inc. * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Implementation of a clonable special device providing a live stream of BSM * audit data. 
Consumers receive a "tee" of the system audit trail by * default, but may also define alternative event selections using ioctls. * This interface provides unreliable but timely access to audit events. * Consumers should be very careful to avoid introducing event cycles. */ /* * Memory types. */ static MALLOC_DEFINE(M_AUDIT_PIPE, "audit_pipe", "Audit pipes"); static MALLOC_DEFINE(M_AUDIT_PIPE_ENTRY, "audit_pipeent", "Audit pipe entries and buffers"); static MALLOC_DEFINE(M_AUDIT_PIPE_PRESELECT, "audit_pipe_presel", "Audit pipe preselection structure"); /* * Audit pipe buffer parameters. */ #define AUDIT_PIPE_QLIMIT_DEFAULT (128) #define AUDIT_PIPE_QLIMIT_MIN (1) #define AUDIT_PIPE_QLIMIT_MAX (1024) /* * Description of an entry in an audit_pipe. */ struct audit_pipe_entry { void *ape_record; u_int ape_record_len; TAILQ_ENTRY(audit_pipe_entry) ape_queue; }; /* * Audit pipes allow processes to express "interest" in the set of records * that are delivered via the pipe. They do this in a similar manner to the * mechanism for audit trail configuration, by expressing two global masks, * and optionally expressing per-auid masks. The following data structure is * the per-auid mask description. The global state is stored in the audit * pipe data structure. * * We may want to consider a more space/time-efficient data structure once * usage patterns for per-auid specifications are clear. */ struct audit_pipe_preselect { au_id_t app_auid; au_mask_t app_mask; TAILQ_ENTRY(audit_pipe_preselect) app_list; }; /* * Description of an individual audit_pipe. Consists largely of a bounded * length queue. */ #define AUDIT_PIPE_ASYNC 0x00000001 #define AUDIT_PIPE_NBIO 0x00000002 struct audit_pipe { u_int ap_flags; struct selinfo ap_selinfo; struct sigio *ap_sigio; /* * Per-pipe mutex protecting most fields in this data structure. */ struct mtx ap_mtx; /* * Per-pipe sleep lock serializing user-generated reads and flushes. * uiomove() is called to copy out the current head record's data * while the record remains in the queue, so we prevent other threads * from removing it using this lock. */ struct sx ap_sx; /* * Condition variable to signal when data has been delivered to a * pipe. */ struct cv ap_cv; /* * Various queue-reated variables: qlen and qlimit are a count of * records in the queue; qbyteslen is the number of bytes of data * across all records, and qoffset is the amount read so far of the * first record in the queue. The number of bytes available for * reading in the queue is qbyteslen - qoffset. */ u_int ap_qlen; u_int ap_qlimit; u_int ap_qbyteslen; u_int ap_qoffset; /* * Per-pipe operation statistics. */ u_int64_t ap_inserts; /* Records added. */ u_int64_t ap_reads; /* Records read. */ u_int64_t ap_drops; /* Records dropped. */ /* * Fields relating to pipe interest: global masks for unmatched * processes (attributable, non-attributable), and a list of specific * interest specifications by auid. */ int ap_preselect_mode; au_mask_t ap_preselect_flags; au_mask_t ap_preselect_naflags; TAILQ_HEAD(, audit_pipe_preselect) ap_preselect_list; /* * Current pending record list. Protected by a combination of ap_mtx * and ap_sx. Note particularly that *both* locks are required to * remove a record from the head of the queue, as an in-progress read * may sleep while copying and therefore cannot hold ap_mtx. */ TAILQ_HEAD(, audit_pipe_entry) ap_queue; /* * Global pipe list. 
*/ TAILQ_ENTRY(audit_pipe) ap_list; }; #define AUDIT_PIPE_LOCK(ap) mtx_lock(&(ap)->ap_mtx) #define AUDIT_PIPE_LOCK_ASSERT(ap) mtx_assert(&(ap)->ap_mtx, MA_OWNED) #define AUDIT_PIPE_LOCK_DESTROY(ap) mtx_destroy(&(ap)->ap_mtx) #define AUDIT_PIPE_LOCK_INIT(ap) mtx_init(&(ap)->ap_mtx, \ "audit_pipe_mtx", NULL, MTX_DEF) #define AUDIT_PIPE_UNLOCK(ap) mtx_unlock(&(ap)->ap_mtx) #define AUDIT_PIPE_MTX(ap) (&(ap)->ap_mtx) #define AUDIT_PIPE_SX_LOCK_DESTROY(ap) sx_destroy(&(ap)->ap_sx) #define AUDIT_PIPE_SX_LOCK_INIT(ap) sx_init(&(ap)->ap_sx, "audit_pipe_sx") #define AUDIT_PIPE_SX_XLOCK_ASSERT(ap) sx_assert(&(ap)->ap_sx, SA_XLOCKED) #define AUDIT_PIPE_SX_XLOCK_SIG(ap) sx_xlock_sig(&(ap)->ap_sx) #define AUDIT_PIPE_SX_XUNLOCK(ap) sx_xunlock(&(ap)->ap_sx) /* * Global list of audit pipes, rwlock to protect it. Individual record * queues on pipes are protected by per-pipe locks; these locks synchronize * between threads walking the list to deliver to individual pipes and add/ * remove of pipes, and are mostly acquired for read. */ static TAILQ_HEAD(, audit_pipe) audit_pipe_list; static struct rwlock audit_pipe_lock; #define AUDIT_PIPE_LIST_LOCK_INIT() rw_init(&audit_pipe_lock, \ "audit_pipe_list_lock") #define AUDIT_PIPE_LIST_LOCK_DESTROY() rw_destroy(&audit_pipe_lock) #define AUDIT_PIPE_LIST_RLOCK() rw_rlock(&audit_pipe_lock) #define AUDIT_PIPE_LIST_RUNLOCK() rw_runlock(&audit_pipe_lock) #define AUDIT_PIPE_LIST_WLOCK() rw_wlock(&audit_pipe_lock) #define AUDIT_PIPE_LIST_WLOCK_ASSERT() rw_assert(&audit_pipe_lock, \ RA_WLOCKED) #define AUDIT_PIPE_LIST_WUNLOCK() rw_wunlock(&audit_pipe_lock) /* * Audit pipe device. */ static struct cdev *audit_pipe_dev; #define AUDIT_PIPE_NAME "auditpipe" /* * Special device methods and definition. */ static d_open_t audit_pipe_open; static d_read_t audit_pipe_read; static d_ioctl_t audit_pipe_ioctl; static d_poll_t audit_pipe_poll; static d_kqfilter_t audit_pipe_kqfilter; static struct cdevsw audit_pipe_cdevsw = { .d_version = D_VERSION, .d_open = audit_pipe_open, .d_read = audit_pipe_read, .d_ioctl = audit_pipe_ioctl, .d_poll = audit_pipe_poll, .d_kqfilter = audit_pipe_kqfilter, .d_name = AUDIT_PIPE_NAME, }; static int audit_pipe_kqread(struct knote *note, long hint); static void audit_pipe_kqdetach(struct knote *note); -static struct filterops audit_pipe_read_filterops = { +static const struct filterops audit_pipe_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = audit_pipe_kqdetach, .f_event = audit_pipe_kqread, }; /* * Some global statistics on audit pipes. */ static int audit_pipe_count; /* Current number of pipes. */ static u_int64_t audit_pipe_ever; /* Pipes ever allocated. */ static u_int64_t audit_pipe_records; /* Records seen. */ static u_int64_t audit_pipe_drops; /* Global record drop count. */ /* * Free an audit pipe entry. */ static void audit_pipe_entry_free(struct audit_pipe_entry *ape) { free(ape->ape_record, M_AUDIT_PIPE_ENTRY); free(ape, M_AUDIT_PIPE_ENTRY); } /* * Find an audit pipe preselection specification for an auid, if any. */ static struct audit_pipe_preselect * audit_pipe_preselect_find(struct audit_pipe *ap, au_id_t auid) { struct audit_pipe_preselect *app; AUDIT_PIPE_LOCK_ASSERT(ap); TAILQ_FOREACH(app, &ap->ap_preselect_list, app_list) { if (app->app_auid == auid) return (app); } return (NULL); } /* * Query the per-pipe mask for a specific auid. 
*/ static int audit_pipe_preselect_get(struct audit_pipe *ap, au_id_t auid, au_mask_t *maskp) { struct audit_pipe_preselect *app; int error; AUDIT_PIPE_LOCK(ap); app = audit_pipe_preselect_find(ap, auid); if (app != NULL) { *maskp = app->app_mask; error = 0; } else error = ENOENT; AUDIT_PIPE_UNLOCK(ap); return (error); } /* * Set the per-pipe mask for a specific auid. Add a new entry if needed; * otherwise, update the current entry. */ static void audit_pipe_preselect_set(struct audit_pipe *ap, au_id_t auid, au_mask_t mask) { struct audit_pipe_preselect *app, *app_new; /* * Pessimistically assume that the auid doesn't already have a mask * set, and allocate. We will free it if it is unneeded. */ app_new = malloc(sizeof(*app_new), M_AUDIT_PIPE_PRESELECT, M_WAITOK); AUDIT_PIPE_LOCK(ap); app = audit_pipe_preselect_find(ap, auid); if (app == NULL) { app = app_new; app_new = NULL; app->app_auid = auid; TAILQ_INSERT_TAIL(&ap->ap_preselect_list, app, app_list); } app->app_mask = mask; AUDIT_PIPE_UNLOCK(ap); if (app_new != NULL) free(app_new, M_AUDIT_PIPE_PRESELECT); } /* * Delete a per-auid mask on an audit pipe. */ static int audit_pipe_preselect_delete(struct audit_pipe *ap, au_id_t auid) { struct audit_pipe_preselect *app; int error; AUDIT_PIPE_LOCK(ap); app = audit_pipe_preselect_find(ap, auid); if (app != NULL) { TAILQ_REMOVE(&ap->ap_preselect_list, app, app_list); error = 0; } else error = ENOENT; AUDIT_PIPE_UNLOCK(ap); if (app != NULL) free(app, M_AUDIT_PIPE_PRESELECT); return (error); } /* * Delete all per-auid masks on an audit pipe. */ static void audit_pipe_preselect_flush_locked(struct audit_pipe *ap) { struct audit_pipe_preselect *app; AUDIT_PIPE_LOCK_ASSERT(ap); while ((app = TAILQ_FIRST(&ap->ap_preselect_list)) != NULL) { TAILQ_REMOVE(&ap->ap_preselect_list, app, app_list); free(app, M_AUDIT_PIPE_PRESELECT); } } static void audit_pipe_preselect_flush(struct audit_pipe *ap) { AUDIT_PIPE_LOCK(ap); audit_pipe_preselect_flush_locked(ap); AUDIT_PIPE_UNLOCK(ap); } /*- * Determine whether a specific audit pipe matches a record with these * properties. Algorithm is as follows: * * - If the pipe is configured to track the default trail configuration, then * use the results of global preselection matching. * - If not, search for a specifically configured auid entry matching the * event. If an entry is found, use that. * - Otherwise, use the default flags or naflags configured for the pipe. */ static int audit_pipe_preselect_check(struct audit_pipe *ap, au_id_t auid, au_event_t event, au_class_t class, int sorf, int trail_preselect) { struct audit_pipe_preselect *app; AUDIT_PIPE_LOCK_ASSERT(ap); switch (ap->ap_preselect_mode) { case AUDITPIPE_PRESELECT_MODE_TRAIL: return (trail_preselect); case AUDITPIPE_PRESELECT_MODE_LOCAL: app = audit_pipe_preselect_find(ap, auid); if (app == NULL) { if (auid == AU_DEFAUDITID) return (au_preselect(event, class, &ap->ap_preselect_naflags, sorf)); else return (au_preselect(event, class, &ap->ap_preselect_flags, sorf)); } else return (au_preselect(event, class, &app->app_mask, sorf)); default: panic("audit_pipe_preselect_check: mode %d", ap->ap_preselect_mode); } return (0); } /* * Determine whether there exists a pipe interested in a record with specific * properties. */ int audit_pipe_preselect(au_id_t auid, au_event_t event, au_class_t class, int sorf, int trail_preselect) { struct audit_pipe *ap; /* Lockless read to avoid acquiring the global lock if not needed. 
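 *
 * Missing a pipe that is concurrently being attached is presumed tolerable
 * here: the worst case is that a single record is not offered to the
 * brand-new pipe, which is consistent with the unreliable-delivery
 * semantics described at the top of this file.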
*/ if (TAILQ_EMPTY(&audit_pipe_list)) return (0); AUDIT_PIPE_LIST_RLOCK(); TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) { AUDIT_PIPE_LOCK(ap); if (audit_pipe_preselect_check(ap, auid, event, class, sorf, trail_preselect)) { AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_LIST_RUNLOCK(); return (1); } AUDIT_PIPE_UNLOCK(ap); } AUDIT_PIPE_LIST_RUNLOCK(); return (0); } /* * Append individual record to a queue -- allocate queue-local buffer, and * add to the queue. If the queue is full or we can't allocate memory, drop * the newest record. */ static void audit_pipe_append(struct audit_pipe *ap, void *record, u_int record_len) { struct audit_pipe_entry *ape; AUDIT_PIPE_LOCK_ASSERT(ap); if (ap->ap_qlen >= ap->ap_qlimit) { ap->ap_drops++; audit_pipe_drops++; return; } ape = malloc(sizeof(*ape), M_AUDIT_PIPE_ENTRY, M_NOWAIT | M_ZERO); if (ape == NULL) { ap->ap_drops++; audit_pipe_drops++; return; } ape->ape_record = malloc(record_len, M_AUDIT_PIPE_ENTRY, M_NOWAIT); if (ape->ape_record == NULL) { free(ape, M_AUDIT_PIPE_ENTRY); ap->ap_drops++; audit_pipe_drops++; return; } bcopy(record, ape->ape_record, record_len); ape->ape_record_len = record_len; TAILQ_INSERT_TAIL(&ap->ap_queue, ape, ape_queue); ap->ap_inserts++; ap->ap_qlen++; ap->ap_qbyteslen += ape->ape_record_len; selwakeuppri(&ap->ap_selinfo, PSOCK); KNOTE_LOCKED(&ap->ap_selinfo.si_note, 0); if (ap->ap_flags & AUDIT_PIPE_ASYNC) pgsigio(&ap->ap_sigio, SIGIO, 0); cv_broadcast(&ap->ap_cv); } /* * audit_pipe_submit(): audit_worker submits audit records via this * interface, which arranges for them to be delivered to pipe queues. */ void audit_pipe_submit(au_id_t auid, au_event_t event, au_class_t class, int sorf, int trail_select, void *record, u_int record_len) { struct audit_pipe *ap; /* * Lockless read to avoid lock overhead if pipes are not in use. */ if (TAILQ_FIRST(&audit_pipe_list) == NULL) return; AUDIT_PIPE_LIST_RLOCK(); TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) { AUDIT_PIPE_LOCK(ap); if (audit_pipe_preselect_check(ap, auid, event, class, sorf, trail_select)) audit_pipe_append(ap, record, record_len); AUDIT_PIPE_UNLOCK(ap); } AUDIT_PIPE_LIST_RUNLOCK(); /* Unlocked increment. */ audit_pipe_records++; } /* * audit_pipe_submit_user(): the same as audit_pipe_submit(), except that * since we don't currently have selection information available, it is * delivered to the pipe unconditionally. * * XXXRW: This is a bug. The BSM check routine for submitting a user record * should parse that information and return it. */ void audit_pipe_submit_user(void *record, u_int record_len) { struct audit_pipe *ap; /* * Lockless read to avoid lock overhead if pipes are not in use. */ if (TAILQ_FIRST(&audit_pipe_list) == NULL) return; AUDIT_PIPE_LIST_RLOCK(); TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) { AUDIT_PIPE_LOCK(ap); audit_pipe_append(ap, record, record_len); AUDIT_PIPE_UNLOCK(ap); } AUDIT_PIPE_LIST_RUNLOCK(); /* Unlocked increment. */ audit_pipe_records++; } /* * Allocate a new audit pipe. Connects the pipe, on success, to the global * list and updates statistics. */ static struct audit_pipe * audit_pipe_alloc(void) { struct audit_pipe *ap; ap = malloc(sizeof(*ap), M_AUDIT_PIPE, M_NOWAIT | M_ZERO); if (ap == NULL) return (NULL); ap->ap_qlimit = AUDIT_PIPE_QLIMIT_DEFAULT; TAILQ_INIT(&ap->ap_queue); knlist_init_mtx(&ap->ap_selinfo.si_note, AUDIT_PIPE_MTX(ap)); AUDIT_PIPE_LOCK_INIT(ap); AUDIT_PIPE_SX_LOCK_INIT(ap); cv_init(&ap->ap_cv, "audit_pipe"); /* * Default flags, naflags, and auid-specific preselection settings to * 0. 
Initialize the mode to the global trail so that if praudit(1) * is run on /dev/auditpipe, it sees events associated with the * default trail. Pipe-aware application can clear the flag, set * custom masks, and flush the pipe as needed. */ bzero(&ap->ap_preselect_flags, sizeof(ap->ap_preselect_flags)); bzero(&ap->ap_preselect_naflags, sizeof(ap->ap_preselect_naflags)); TAILQ_INIT(&ap->ap_preselect_list); ap->ap_preselect_mode = AUDITPIPE_PRESELECT_MODE_TRAIL; /* * Add to global list and update global statistics. */ AUDIT_PIPE_LIST_WLOCK(); TAILQ_INSERT_HEAD(&audit_pipe_list, ap, ap_list); audit_pipe_count++; audit_pipe_ever++; AUDIT_PIPE_LIST_WUNLOCK(); return (ap); } /* * Flush all records currently present in an audit pipe; assume mutex is held. */ static void audit_pipe_flush(struct audit_pipe *ap) { struct audit_pipe_entry *ape; AUDIT_PIPE_LOCK_ASSERT(ap); while ((ape = TAILQ_FIRST(&ap->ap_queue)) != NULL) { TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue); ap->ap_qbyteslen -= ape->ape_record_len; audit_pipe_entry_free(ape); ap->ap_qlen--; } ap->ap_qoffset = 0; KASSERT(ap->ap_qlen == 0, ("audit_pipe_free: ap_qbyteslen")); KASSERT(ap->ap_qbyteslen == 0, ("audit_pipe_flush: ap_qbyteslen")); } /* * Free an audit pipe; this means freeing all preselection state and all * records in the pipe. Assumes global write lock and pipe mutex are held to * prevent any new records from being inserted during the free, and that the * audit pipe is still on the global list. */ static void audit_pipe_free(struct audit_pipe *ap) { AUDIT_PIPE_LIST_WLOCK_ASSERT(); AUDIT_PIPE_LOCK_ASSERT(ap); audit_pipe_preselect_flush_locked(ap); audit_pipe_flush(ap); cv_destroy(&ap->ap_cv); AUDIT_PIPE_SX_LOCK_DESTROY(ap); AUDIT_PIPE_LOCK_DESTROY(ap); seldrain(&ap->ap_selinfo); knlist_destroy(&ap->ap_selinfo.si_note); TAILQ_REMOVE(&audit_pipe_list, ap, ap_list); free(ap, M_AUDIT_PIPE); audit_pipe_count--; } static void audit_pipe_dtor(void *arg) { struct audit_pipe *ap; ap = arg; funsetown(&ap->ap_sigio); AUDIT_PIPE_LIST_WLOCK(); AUDIT_PIPE_LOCK(ap); audit_pipe_free(ap); AUDIT_PIPE_LIST_WUNLOCK(); } /* * Audit pipe open method. Explicit privilege check isn't used as this * allows file permissions on the special device to be used to grant audit * review access. Those file permissions should be managed carefully. */ static int audit_pipe_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct audit_pipe *ap; int error; ap = audit_pipe_alloc(); if (ap == NULL) return (ENOMEM); fsetown(td->td_proc->p_pid, &ap->ap_sigio); error = devfs_set_cdevpriv(ap, audit_pipe_dtor); if (error != 0) audit_pipe_dtor(ap); return (error); } /* * Audit pipe ioctl() routine. Handle file descriptor and audit pipe layer * commands. */ static int audit_pipe_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct auditpipe_ioctl_preselect *aip; struct audit_pipe *ap; au_mask_t *maskp; int error, mode; au_id_t auid; error = devfs_get_cdevpriv((void **)&ap); if (error != 0) return (error); /* * Audit pipe ioctls: first come standard device node ioctls, then * manipulation of pipe settings, and finally, statistics query * ioctls. 
*/ switch (cmd) { case FIONBIO: AUDIT_PIPE_LOCK(ap); if (*(int *)data) ap->ap_flags |= AUDIT_PIPE_NBIO; else ap->ap_flags &= ~AUDIT_PIPE_NBIO; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case FIONREAD: AUDIT_PIPE_LOCK(ap); *(int *)data = ap->ap_qbyteslen - ap->ap_qoffset; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case FIOASYNC: AUDIT_PIPE_LOCK(ap); if (*(int *)data) ap->ap_flags |= AUDIT_PIPE_ASYNC; else ap->ap_flags &= ~AUDIT_PIPE_ASYNC; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case FIOSETOWN: error = fsetown(*(int *)data, &ap->ap_sigio); break; case FIOGETOWN: *(int *)data = fgetown(&ap->ap_sigio); error = 0; break; case AUDITPIPE_GET_QLEN: *(u_int *)data = ap->ap_qlen; error = 0; break; case AUDITPIPE_GET_QLIMIT: *(u_int *)data = ap->ap_qlimit; error = 0; break; case AUDITPIPE_SET_QLIMIT: /* Lockless integer write. */ if (*(u_int *)data >= AUDIT_PIPE_QLIMIT_MIN && *(u_int *)data <= AUDIT_PIPE_QLIMIT_MAX) { ap->ap_qlimit = *(u_int *)data; error = 0; } else error = EINVAL; break; case AUDITPIPE_GET_QLIMIT_MIN: *(u_int *)data = AUDIT_PIPE_QLIMIT_MIN; error = 0; break; case AUDITPIPE_GET_QLIMIT_MAX: *(u_int *)data = AUDIT_PIPE_QLIMIT_MAX; error = 0; break; case AUDITPIPE_GET_PRESELECT_FLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; *maskp = ap->ap_preselect_flags; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_SET_PRESELECT_FLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; ap->ap_preselect_flags = *maskp; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_GET_PRESELECT_NAFLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; *maskp = ap->ap_preselect_naflags; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_SET_PRESELECT_NAFLAGS: AUDIT_PIPE_LOCK(ap); maskp = (au_mask_t *)data; ap->ap_preselect_naflags = *maskp; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_GET_PRESELECT_AUID: aip = (struct auditpipe_ioctl_preselect *)data; error = audit_pipe_preselect_get(ap, aip->aip_auid, &aip->aip_mask); break; case AUDITPIPE_SET_PRESELECT_AUID: aip = (struct auditpipe_ioctl_preselect *)data; audit_pipe_preselect_set(ap, aip->aip_auid, aip->aip_mask); error = 0; break; case AUDITPIPE_DELETE_PRESELECT_AUID: auid = *(au_id_t *)data; error = audit_pipe_preselect_delete(ap, auid); break; case AUDITPIPE_FLUSH_PRESELECT_AUID: audit_pipe_preselect_flush(ap); error = 0; break; case AUDITPIPE_GET_PRESELECT_MODE: AUDIT_PIPE_LOCK(ap); *(int *)data = ap->ap_preselect_mode; AUDIT_PIPE_UNLOCK(ap); error = 0; break; case AUDITPIPE_SET_PRESELECT_MODE: mode = *(int *)data; switch (mode) { case AUDITPIPE_PRESELECT_MODE_TRAIL: case AUDITPIPE_PRESELECT_MODE_LOCAL: AUDIT_PIPE_LOCK(ap); ap->ap_preselect_mode = mode; AUDIT_PIPE_UNLOCK(ap); error = 0; break; default: error = EINVAL; } break; case AUDITPIPE_FLUSH: if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0) return (EINTR); AUDIT_PIPE_LOCK(ap); audit_pipe_flush(ap); AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); error = 0; break; case AUDITPIPE_GET_MAXAUDITDATA: *(u_int *)data = MAXAUDITDATA; error = 0; break; case AUDITPIPE_GET_INSERTS: *(u_int *)data = ap->ap_inserts; error = 0; break; case AUDITPIPE_GET_READS: *(u_int *)data = ap->ap_reads; error = 0; break; case AUDITPIPE_GET_DROPS: *(u_int *)data = ap->ap_drops; error = 0; break; case AUDITPIPE_GET_TRUNCATES: *(u_int *)data = 0; error = 0; break; default: error = ENOTTY; } return (error); } /* * Audit pipe read. Read one or more partial or complete records to user * memory. 
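For reference, a minimal userspace sketch of driving the auditpipe ioctls handled above: switch the pipe from the default trail mode to local preselection, install a mask, and read whole records. The header paths and the catch-all mask value are assumptions for illustration; a real consumer would normally build the mask with getauditflagsbin(3) and hand the record bytes to a BSM parser, as praudit(1) does.

#include <sys/types.h>
#include <sys/ioctl.h>

#include <bsm/audit.h>
#include <security/audit/audit_ioctl.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
    au_mask_t mask;
    u_int maxdata;
    ssize_t len;
    char *buf;
    int fd, mode;

    fd = open("/dev/auditpipe", O_RDONLY);
    if (fd < 0)
        err(1, "open(/dev/auditpipe)");

    /* Stop tracking the global trail; use pipe-local preselection. */
    mode = AUDITPIPE_PRESELECT_MODE_LOCAL;
    if (ioctl(fd, AUDITPIPE_SET_PRESELECT_MODE, &mode) < 0)
        err(1, "AUDITPIPE_SET_PRESELECT_MODE");

    /* Illustrative catch-all mask; normally built with getauditflagsbin(3). */
    mask.am_success = mask.am_failure = 0xffffffff;
    if (ioctl(fd, AUDITPIPE_SET_PRESELECT_FLAGS, &mask) < 0)
        err(1, "AUDITPIPE_SET_PRESELECT_FLAGS");

    /* Size the buffer from the largest record the pipe will deliver. */
    if (ioctl(fd, AUDITPIPE_GET_MAXAUDITDATA, &maxdata) < 0)
        err(1, "AUDITPIPE_GET_MAXAUDITDATA");
    if ((buf = malloc(maxdata)) == NULL)
        err(1, "malloc");

    /* Each read() drains BSM record bytes queued by audit_pipe_append(). */
    while ((len = read(fd, buf, maxdata)) > 0)
        printf("got %zd bytes of audit record data\n", len);
    if (len < 0)
        err(1, "read");
    return (0);
}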
*/ static int audit_pipe_read(struct cdev *dev, struct uio *uio, int flag) { struct audit_pipe_entry *ape; struct audit_pipe *ap; u_int toread; int error; error = devfs_get_cdevpriv((void **)&ap); if (error != 0) return (error); /* * We hold an sx(9) lock over read and flush because we rely on the * stability of a record in the queue during uiomove(9). */ if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0) return (EINTR); AUDIT_PIPE_LOCK(ap); while (TAILQ_EMPTY(&ap->ap_queue)) { if (ap->ap_flags & AUDIT_PIPE_NBIO) { AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); return (EAGAIN); } error = cv_wait_sig(&ap->ap_cv, AUDIT_PIPE_MTX(ap)); if (error) { AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); return (error); } } /* * Copy as many remaining bytes from the current record to userspace * as we can. Keep processing records until we run out of records in * the queue, or until the user buffer runs out of space. * * Note: we rely on the SX lock to maintain ape's stability here. */ ap->ap_reads++; while ((ape = TAILQ_FIRST(&ap->ap_queue)) != NULL && uio->uio_resid > 0) { AUDIT_PIPE_LOCK_ASSERT(ap); KASSERT(ape->ape_record_len > ap->ap_qoffset, ("audit_pipe_read: record_len > qoffset (1)")); toread = MIN(ape->ape_record_len - ap->ap_qoffset, uio->uio_resid); AUDIT_PIPE_UNLOCK(ap); error = uiomove((char *)ape->ape_record + ap->ap_qoffset, toread, uio); if (error) { AUDIT_PIPE_SX_XUNLOCK(ap); return (error); } /* * If the copy succeeded, update book-keeping, and if no * bytes remain in the current record, free it. */ AUDIT_PIPE_LOCK(ap); KASSERT(TAILQ_FIRST(&ap->ap_queue) == ape, ("audit_pipe_read: queue out of sync after uiomove")); ap->ap_qoffset += toread; KASSERT(ape->ape_record_len >= ap->ap_qoffset, ("audit_pipe_read: record_len >= qoffset (2)")); if (ap->ap_qoffset == ape->ape_record_len) { TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue); ap->ap_qbyteslen -= ape->ape_record_len; audit_pipe_entry_free(ape); ap->ap_qlen--; ap->ap_qoffset = 0; } } AUDIT_PIPE_UNLOCK(ap); AUDIT_PIPE_SX_XUNLOCK(ap); return (0); } /* * Audit pipe poll. */ static int audit_pipe_poll(struct cdev *dev, int events, struct thread *td) { struct audit_pipe *ap; int error, revents; revents = 0; error = devfs_get_cdevpriv((void **)&ap); if (error != 0) return (error); if (events & (POLLIN | POLLRDNORM)) { AUDIT_PIPE_LOCK(ap); if (TAILQ_FIRST(&ap->ap_queue) != NULL) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &ap->ap_selinfo); AUDIT_PIPE_UNLOCK(ap); } return (revents); } /* * Audit pipe kqfilter. */ static int audit_pipe_kqfilter(struct cdev *dev, struct knote *kn) { struct audit_pipe *ap; int error; error = devfs_get_cdevpriv((void **)&ap); if (error != 0) return (error); if (kn->kn_filter != EVFILT_READ) return (EINVAL); kn->kn_fop = &audit_pipe_read_filterops; kn->kn_hook = ap; AUDIT_PIPE_LOCK(ap); knlist_add(&ap->ap_selinfo.si_note, kn, 1); AUDIT_PIPE_UNLOCK(ap); return (0); } /* * Return true if there are records available for reading on the pipe. */ static int audit_pipe_kqread(struct knote *kn, long hint) { struct audit_pipe *ap; ap = (struct audit_pipe *)kn->kn_hook; AUDIT_PIPE_LOCK_ASSERT(ap); if (ap->ap_qlen != 0) { kn->kn_data = ap->ap_qbyteslen - ap->ap_qoffset; return (1); } else { kn->kn_data = 0; return (0); } } /* * Detach kqueue state from audit pipe. */ static void audit_pipe_kqdetach(struct knote *kn) { struct audit_pipe *ap; ap = (struct audit_pipe *)kn->kn_hook; AUDIT_PIPE_LOCK(ap); knlist_remove(&ap->ap_selinfo.si_note, kn, 1); AUDIT_PIPE_UNLOCK(ap); } /* * Initialize the audit pipe system. 
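The poll/kqueue support above can be exercised from userspace along these lines: register EVFILT_READ on the pipe descriptor and drain it whenever the filter fires, using FIONBIO so the drain loop does not block. Buffer size and output are arbitrary; this is a sketch, not a complete consumer.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/ioctl.h>

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent ev;
    char buf[4096];
    ssize_t n;
    int fd, kq, on;

    fd = open("/dev/auditpipe", O_RDONLY);
    if (fd < 0)
        err(1, "open(/dev/auditpipe)");
    on = 1;
    if (ioctl(fd, FIONBIO, &on) < 0)
        err(1, "FIONBIO");

    kq = kqueue();
    if (kq < 0)
        err(1, "kqueue");

    /* Registered via audit_pipe_kqfilter(); only EVFILT_READ is accepted. */
    EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
    if (kevent(kq, &ev, 1, NULL, 0, NULL) < 0)
        err(1, "kevent register");

    for (;;) {
        /* ev.data is the pending byte count reported by audit_pipe_kqread(). */
        if (kevent(kq, NULL, 0, &ev, 1, NULL) < 1)
            err(1, "kevent wait");
        while ((n = read(fd, buf, sizeof(buf))) > 0)
            printf("drained %zd bytes (%jd pending at wakeup)\n",
                n, (intmax_t)ev.data);
        if (n < 0 && errno != EAGAIN)
            err(1, "read");
    }
}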
*/ static void audit_pipe_init(void *unused) { TAILQ_INIT(&audit_pipe_list); AUDIT_PIPE_LIST_LOCK_INIT(); audit_pipe_dev = make_dev(&audit_pipe_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "%s", AUDIT_PIPE_NAME); if (audit_pipe_dev == NULL) { AUDIT_PIPE_LIST_LOCK_DESTROY(); panic("Can't initialize audit pipe subsystem"); } } SYSINIT(audit_pipe_init, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, audit_pipe_init, NULL); diff --git a/sys/sys/file.h b/sys/sys/file.h index c1439b9bbaac..da96f3e332fc 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -1,510 +1,510 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)file.h 8.3 (Berkeley) 1/9/95 */ #ifndef _SYS_FILE_H_ #define _SYS_FILE_H_ #ifndef _KERNEL #include /* XXX */ #include #include #else #include #include #include #include #include struct filedesc; struct stat; struct thread; struct uio; struct knote; struct vnode; struct nameidata; #endif /* _KERNEL */ #define DTYPE_NONE 0 /* not yet initialized */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ #define DTYPE_CRYPTO 6 /* crypto */ #define DTYPE_MQUEUE 7 /* posix message queue */ #define DTYPE_SHM 8 /* swap-backed shared memory */ #define DTYPE_SEM 9 /* posix semaphore */ #define DTYPE_PTS 10 /* pseudo teletype master device */ #define DTYPE_DEV 11 /* Device specific fd type */ #define DTYPE_PROCDESC 12 /* process descriptor */ #define DTYPE_EVENTFD 13 /* eventfd */ #define DTYPE_TIMERFD 14 /* timerfd */ #ifdef _KERNEL struct file; struct filecaps; struct kaiocb; struct kinfo_file; struct ucred; #define FOF_OFFSET 0x01 /* Use the offset in uio argument */ #define FOF_NOLOCK 0x02 /* Do not take FOFFSET_LOCK */ #define FOF_NEXTOFF_R 0x04 /* Also update f_nextoff[UIO_READ] */ #define FOF_NEXTOFF_W 0x08 /* Also update f_nextoff[UIO_WRITE] */ #define FOF_NOUPDATE 0x10 /* Do not update f_offset */ off_t foffset_lock(struct file *fp, int flags); void foffset_lock_uio(struct file *fp, struct uio *uio, int flags); void foffset_unlock(struct file *fp, off_t val, int flags); void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags); static inline off_t foffset_get(struct file *fp) { return (foffset_lock(fp, FOF_NOLOCK)); } typedef int fo_rdwr_t(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td); typedef int fo_truncate_t(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td); typedef int fo_ioctl_t(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td); typedef int fo_poll_t(struct file *fp, int events, struct ucred *active_cred, struct thread *td); typedef int fo_kqfilter_t(struct file *fp, struct knote *kn); typedef int fo_stat_t(struct file *fp, struct stat *sb, struct ucred *active_cred); typedef int fo_close_t(struct file *fp, struct thread *td); typedef int fo_chmod_t(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); typedef int fo_chown_t(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); typedef int fo_sendfile_t(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td); typedef int fo_seek_t(struct file *fp, off_t offset, int whence, struct thread *td); typedef int fo_fill_kinfo_t(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp); typedef int fo_mmap_t(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td); typedef int fo_aio_queue_t(struct file *fp, struct kaiocb *job); typedef int fo_add_seals_t(struct file *fp, int flags); typedef int fo_get_seals_t(struct file *fp, int *flags); typedef int fo_fallocate_t(struct file *fp, off_t offset, off_t len, struct thread *td); typedef int fo_fspacectl_t(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td); typedef int fo_cmp_t(struct file *fp, struct file *fp1, struct thread 
*td); typedef int fo_spare_t(struct file *fp); typedef int fo_flags_t; struct fileops { fo_rdwr_t *fo_read; fo_rdwr_t *fo_write; fo_truncate_t *fo_truncate; fo_ioctl_t *fo_ioctl; fo_poll_t *fo_poll; fo_kqfilter_t *fo_kqfilter; fo_stat_t *fo_stat; fo_close_t *fo_close; fo_chmod_t *fo_chmod; fo_chown_t *fo_chown; fo_sendfile_t *fo_sendfile; fo_seek_t *fo_seek; fo_fill_kinfo_t *fo_fill_kinfo; fo_mmap_t *fo_mmap; fo_aio_queue_t *fo_aio_queue; fo_add_seals_t *fo_add_seals; fo_get_seals_t *fo_get_seals; fo_fallocate_t *fo_fallocate; fo_fspacectl_t *fo_fspacectl; fo_cmp_t *fo_cmp; fo_spare_t *fo_spares[7]; /* Spare slots */ fo_flags_t fo_flags; /* DFLAG_* below */ }; #define DFLAG_PASSABLE 0x01 /* may be passed via unix sockets. */ #define DFLAG_SEEKABLE 0x02 /* seekable / nonsequential */ #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_FILE) /* * Kernel descriptor table. * One entry for each open kernel vnode and socket. * * Below is the list of locks that protects members in struct file. * * (a) f_vnode lock required (shared allows both reads and writes) * (f) updated with atomics and blocking on sleepq * (d) cdevpriv_mtx * none not locked */ #if __BSD_VISIBLE struct fadvise_info { int fa_advice; /* (f) FADV_* type. */ off_t fa_start; /* (f) Region start. */ off_t fa_end; /* (f) Region end. */ }; struct file { volatile u_int f_flag; /* see fcntl.h */ volatile u_int f_count; /* reference count */ void *f_data; /* file descriptor specific data */ const struct fileops *f_ops; /* File operations */ struct vnode *f_vnode; /* NULL or applicable vnode */ struct ucred *f_cred; /* associated credentials. */ short f_type; /* descriptor type */ short f_vnread_flags; /* (f) Sleep lock for f_offset */ /* * DTYPE_VNODE specific fields. */ union { int16_t f_seqcount[2]; /* (a) Count of seq. reads and writes. */ int f_pipegen; }; off_t f_nextoff[2]; /* next expected read/write offset. */ union { struct cdev_privdata *fvn_cdevpriv; /* (d) Private data for the cdev. 
*/ struct fadvise_info *fvn_advice; } f_vnun; /* * DFLAG_SEEKABLE specific fields */ off_t f_offset; }; #define f_cdevpriv f_vnun.fvn_cdevpriv #define f_advice f_vnun.fvn_advice #define FOFFSET_LOCKED 0x1 #define FOFFSET_LOCK_WAITING 0x2 #endif /* __BSD_VISIBLE */ #endif /* _KERNEL || _WANT_FILE */ /* * Userland version of struct file, for sysctl */ #if __BSD_VISIBLE struct xfile { ksize_t xf_size; /* size of struct xfile */ pid_t xf_pid; /* owning process */ uid_t xf_uid; /* effective uid of owning process */ int xf_fd; /* descriptor number */ int _xf_int_pad1; kvaddr_t xf_file; /* address of struct file */ short xf_type; /* descriptor type */ short _xf_short_pad1; int xf_count; /* reference count */ int xf_msgcount; /* references from message queue */ int _xf_int_pad2; off_t xf_offset; /* file offset */ kvaddr_t xf_data; /* file descriptor specific data */ kvaddr_t xf_vnode; /* vnode pointer */ u_int xf_flag; /* flags (see fcntl.h) */ int _xf_int_pad3; int64_t _xf_int64_pad[6]; }; #endif /* __BSD_VISIBLE */ #ifdef _KERNEL -extern struct fileops vnops; -extern struct fileops badfileops; -extern struct fileops path_fileops; -extern struct fileops socketops; +extern const struct fileops vnops; +extern const struct fileops badfileops; +extern const struct fileops path_fileops; +extern const struct fileops socketops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp, struct file **fpp); int fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp); int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl, struct file **fpp); int _fdrop(struct file *fp, struct thread *td); int fget_remote(struct thread *td, struct proc *p, int fd, struct file **fpp); fo_rdwr_t invfo_rdwr; fo_truncate_t invfo_truncate; fo_ioctl_t invfo_ioctl; fo_poll_t invfo_poll; fo_kqfilter_t invfo_kqfilter; fo_chmod_t invfo_chmod; fo_chown_t invfo_chown; fo_sendfile_t invfo_sendfile; fo_stat_t vn_statfile; fo_sendfile_t vn_sendfile; fo_seek_t vn_seek; fo_fill_kinfo_t vn_fill_kinfo; fo_kqfilter_t vn_kqfilter_opath; int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif); int file_kcmp_generic(struct file *fp1, struct file *fp2, struct thread *td); -void finit(struct file *, u_int, short, void *, struct fileops *); -void finit_vnode(struct file *, u_int, void *, struct fileops *); +void finit(struct file *, u_int, short, void *, const struct fileops *); +void finit_vnode(struct file *, u_int, void *, const struct fileops *); int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp); int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_lookup_smr(struct nameidata *ndp, struct vnode **vpp, bool *fsearch); int fgetvp_lookup(struct nameidata *ndp, struct vnode **vpp); static __inline __result_use_check bool fhold(struct file *fp) { return (refcount_acquire_checked(&fp->f_count)); } #define 
fdrop(fp, td) ({ \ struct file *_fp; \ int _error; \ \ _error = 0; \ _fp = (fp); \ if (__predict_false(refcount_release(&_fp->f_count))) \ _error = _fdrop(_fp, td); \ _error; \ }) #define fdrop_close(fp, td) ({ \ struct file *_fp; \ int _error; \ \ _error = 0; \ _fp = (fp); \ if (__predict_true(refcount_release(&_fp->f_count))) \ _error = _fdrop(_fp, td); \ _error; \ }) static __inline fo_rdwr_t fo_read; static __inline fo_rdwr_t fo_write; static __inline fo_truncate_t fo_truncate; static __inline fo_ioctl_t fo_ioctl; static __inline fo_poll_t fo_poll; static __inline fo_kqfilter_t fo_kqfilter; static __inline fo_stat_t fo_stat; static __inline fo_close_t fo_close; static __inline fo_chmod_t fo_chmod; static __inline fo_chown_t fo_chown; static __inline fo_sendfile_t fo_sendfile; static __inline int fo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_read)(fp, uio, active_cred, flags, td)); } static __inline int fo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { return ((*fp->f_ops->fo_write)(fp, uio, active_cred, flags, td)); } static __inline int fo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_truncate)(fp, length, active_cred, td)); } static __inline int fo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_ioctl)(fp, com, data, active_cred, td)); } static __inline int fo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_poll)(fp, events, active_cred, td)); } static __inline int fo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) { return ((*fp->f_ops->fo_stat)(fp, sb, active_cred)); } static __inline int fo_close(struct file *fp, struct thread *td) { return ((*fp->f_ops->fo_close)(fp, td)); } static __inline int fo_kqfilter(struct file *fp, struct knote *kn) { return ((*fp->f_ops->fo_kqfilter)(fp, kn)); } static __inline int fo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chmod)(fp, mode, active_cred, td)); } static __inline int fo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { return ((*fp->f_ops->fo_chown)(fp, uid, gid, active_cred, td)); } static __inline int fo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, struct thread *td) { return ((*fp->f_ops->fo_sendfile)(fp, sockfd, hdr_uio, trl_uio, offset, nbytes, sent, flags, td)); } static __inline int fo_seek(struct file *fp, off_t offset, int whence, struct thread *td) { return ((*fp->f_ops->fo_seek)(fp, offset, whence, td)); } static __inline int fo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { return ((*fp->f_ops->fo_fill_kinfo)(fp, kif, fdp)); } static __inline int fo_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { if (fp->f_ops->fo_mmap == NULL) return (ENODEV); return ((*fp->f_ops->fo_mmap)(fp, map, addr, size, prot, cap_maxprot, flags, foff, td)); } static __inline int fo_aio_queue(struct file *fp, struct kaiocb *job) { return ((*fp->f_ops->fo_aio_queue)(fp, job)); } static __inline int fo_add_seals(struct file *fp, int seals) { if (fp->f_ops->fo_add_seals == NULL) return 
(EINVAL); return ((*fp->f_ops->fo_add_seals)(fp, seals)); } static __inline int fo_get_seals(struct file *fp, int *seals) { if (fp->f_ops->fo_get_seals == NULL) return (EINVAL); return ((*fp->f_ops->fo_get_seals)(fp, seals)); } static __inline int fo_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td) { if (fp->f_ops->fo_fallocate == NULL) return (ENODEV); return ((*fp->f_ops->fo_fallocate)(fp, offset, len, td)); } static __inline int fo_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, struct ucred *active_cred, struct thread *td) { if (fp->f_ops->fo_fspacectl == NULL) return (ENODEV); return ((*fp->f_ops->fo_fspacectl)(fp, cmd, offset, length, flags, active_cred, td)); } static __inline int fo_cmp(struct file *fp1, struct file *fp2, struct thread *td) { if (fp1->f_ops->fo_cmp == NULL) return (ENODEV); return ((*fp1->f_ops->fo_cmp)(fp1, fp2, td)); } #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ diff --git a/sys/sys/mman.h b/sys/sys/mman.h index 36973b941e61..d444f02d3c89 100644 --- a/sys/sys/mman.h +++ b/sys/sys/mman.h @@ -1,359 +1,359 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mman.h 8.2 (Berkeley) 1/9/95 */ #ifndef _SYS_MMAN_H_ #define _SYS_MMAN_H_ #include #include #if __BSD_VISIBLE /* * Inheritance for minherit() */ #define INHERIT_SHARE 0 #define INHERIT_COPY 1 #define INHERIT_NONE 2 #define INHERIT_ZERO 3 #endif /* * Protections are chosen from these bits, or-ed together */ #define PROT_NONE 0x00 /* no permissions */ #define PROT_READ 0x01 /* pages can be read */ #define PROT_WRITE 0x02 /* pages can be written */ #define PROT_EXEC 0x04 /* pages can be executed */ #if __BSD_VISIBLE #define _PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) #define PROT_EXTRACT(prot) ((prot) & _PROT_ALL) #define _PROT_MAX_SHIFT 16 #define PROT_MAX(prot) ((prot) << _PROT_MAX_SHIFT) #define PROT_MAX_EXTRACT(prot) (((prot) >> _PROT_MAX_SHIFT) & _PROT_ALL) #endif /* * Flags contain sharing type and options. 
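To show what the constification in sys/file.h above means for a consumer, here is a rough in-kernel sketch of an operations vector declared under the new prototypes. The example_* handlers, the M_TEMP allocation, and the choice of defaults are illustrative assumptions; only a subset of the handlers is filled in, with the invfo_* stubs declared above covering the rest.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/stat.h>
#include <sys/user.h>

static int
example_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{
    bzero(sb, sizeof(*sb));
    return (0);
}

static int
example_close(struct file *fp, struct thread *td)
{
    /* Hypothetical per-open state hung off f_data by the open path. */
    free(fp->f_data, M_TEMP);
    fp->f_data = NULL;
    return (0);
}

static int
example_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
{
    kif->kf_type = KF_TYPE_UNKNOWN;
    return (0);
}

/*
 * With the const-qualified declarations the whole vector can live in
 * read-only data; unsupported operations fall back to the invfo_* stubs.
 */
static const struct fileops exampleops = {
    .fo_read = invfo_rdwr,
    .fo_write = invfo_rdwr,
    .fo_truncate = invfo_truncate,
    .fo_ioctl = invfo_ioctl,
    .fo_poll = invfo_poll,
    .fo_kqfilter = invfo_kqfilter,
    .fo_stat = example_stat,
    .fo_close = example_close,
    .fo_chmod = invfo_chmod,
    .fo_chown = invfo_chown,
    .fo_sendfile = invfo_sendfile,
    .fo_fill_kinfo = example_fill_kinfo,
    .fo_flags = DFLAG_PASSABLE,
};

An open routine would then attach the vector with finit(), which after this change accepts the const-qualified struct fileops pointer.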
* Sharing types; choose one. */ #define MAP_SHARED 0x0001 /* share changes */ #define MAP_PRIVATE 0x0002 /* changes are private */ #if __BSD_VISIBLE #define MAP_COPY MAP_PRIVATE /* Obsolete */ #endif /* * Other flags */ #define MAP_FIXED 0x0010 /* map addr must be exactly as requested */ #if __BSD_VISIBLE #define MAP_RESERVED0020 0x0020 /* previously unimplemented MAP_RENAME */ #define MAP_RESERVED0040 0x0040 /* previously unimplemented MAP_NORESERVE */ #define MAP_RESERVED0080 0x0080 /* previously misimplemented MAP_INHERIT */ #define MAP_RESERVED0100 0x0100 /* previously unimplemented MAP_NOEXTEND */ #define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */ #define MAP_STACK 0x0400 /* region grows down, like a stack */ #define MAP_NOSYNC 0x0800 /* page to but do not sync underlying file */ /* * Mapping type */ #define MAP_FILE 0x0000 /* map from file (default) */ #define MAP_ANON 0x1000 /* allocated from memory, swap space */ #ifndef _KERNEL #define MAP_ANONYMOUS MAP_ANON /* For compatibility. */ #endif /* !_KERNEL */ /* * Extended flags */ #define MAP_GUARD 0x00002000 /* reserve but don't map address range */ #define MAP_EXCL 0x00004000 /* for MAP_FIXED, fail if address is used */ #define MAP_NOCORE 0x00020000 /* dont include these pages in a coredump */ #define MAP_PREFAULT_READ 0x00040000 /* prefault mapping for reading */ #define MAP_32BIT 0x00080000 /* map in the low 2GB of address space */ /* * Request specific alignment (n == log2 of the desired alignment). * * MAP_ALIGNED_SUPER requests optimal superpage alignment, but does * not enforce a specific alignment. */ #define MAP_ALIGNED(n) ((n) << MAP_ALIGNMENT_SHIFT) #define MAP_ALIGNMENT_SHIFT 24 #define MAP_ALIGNMENT_MASK MAP_ALIGNED(0xff) #define MAP_ALIGNED_SUPER MAP_ALIGNED(1) /* align on a superpage */ /* * Flags provided to shm_rename */ /* Don't overwrite dest, if it exists */ #define SHM_RENAME_NOREPLACE (1 << 0) /* Atomically swap src and dest */ #define SHM_RENAME_EXCHANGE (1 << 1) #endif /* __BSD_VISIBLE */ #if __POSIX_VISIBLE >= 199309 /* * Process memory locking */ #define MCL_CURRENT 0x0001 /* Lock only current memory */ #define MCL_FUTURE 0x0002 /* Lock all future memory as well */ #endif /* * Error return from mmap() */ #define MAP_FAILED ((void *)-1) /* * msync() flags */ #define MS_SYNC 0x0000 /* msync synchronously */ #define MS_ASYNC 0x0001 /* return immediately */ #define MS_INVALIDATE 0x0002 /* invalidate all cached data */ /* * Advice to madvise */ #define _MADV_NORMAL 0 /* no further special treatment */ #define _MADV_RANDOM 1 /* expect random page references */ #define _MADV_SEQUENTIAL 2 /* expect sequential page references */ #define _MADV_WILLNEED 3 /* will need these pages */ #define _MADV_DONTNEED 4 /* dont need these pages */ #if __BSD_VISIBLE #define MADV_NORMAL _MADV_NORMAL #define MADV_RANDOM _MADV_RANDOM #define MADV_SEQUENTIAL _MADV_SEQUENTIAL #define MADV_WILLNEED _MADV_WILLNEED #define MADV_DONTNEED _MADV_DONTNEED #define MADV_FREE 5 /* dont need these pages, and junk contents */ #define MADV_NOSYNC 6 /* try to avoid flushes to physical media */ #define MADV_AUTOSYNC 7 /* revert to default flushing strategy */ #define MADV_NOCORE 8 /* do not include these pages in a core file */ #define MADV_CORE 9 /* revert to including pages in a core file */ #define MADV_PROTECT 10 /* protect process from pageout kill */ /* * Return bits from mincore */ #define MINCORE_INCORE 0x1 /* Page is incore */ #define MINCORE_REFERENCED 0x2 /* Page has been referenced by us */ #define MINCORE_MODIFIED 0x4 /* 
Page has been modified by us */ #define MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */ #define MINCORE_MODIFIED_OTHER 0x10 /* Page has been modified */ #define MINCORE_SUPER 0x60 /* Page is a "super" page */ #define MINCORE_PSIND(i) (((i) << 5) & MINCORE_SUPER) /* Page size */ /* * Anonymous object constant for shm_open(). */ #define SHM_ANON ((char *)1) /* * shmflags for shm_open2() */ #define SHM_ALLOW_SEALING 0x00000001 #define SHM_GROW_ON_WRITE 0x00000002 #define SHM_LARGEPAGE 0x00000004 #define SHM_LARGEPAGE_ALLOC_DEFAULT 0 #define SHM_LARGEPAGE_ALLOC_NOWAIT 1 #define SHM_LARGEPAGE_ALLOC_HARD 2 struct shm_largepage_conf { int psind; int alloc_policy; int pad[10]; }; /* * Flags for memfd_create(). */ #define MFD_CLOEXEC 0x00000001 #define MFD_ALLOW_SEALING 0x00000002 #define MFD_HUGETLB 0x00000004 #define MFD_HUGE_MASK 0xFC000000 #define MFD_HUGE_SHIFT 26 #define MFD_HUGE_64KB (16 << MFD_HUGE_SHIFT) #define MFD_HUGE_512KB (19 << MFD_HUGE_SHIFT) #define MFD_HUGE_1MB (20 << MFD_HUGE_SHIFT) #define MFD_HUGE_2MB (21 << MFD_HUGE_SHIFT) #define MFD_HUGE_8MB (23 << MFD_HUGE_SHIFT) #define MFD_HUGE_16MB (24 << MFD_HUGE_SHIFT) #define MFD_HUGE_32MB (25 << MFD_HUGE_SHIFT) #define MFD_HUGE_256MB (28 << MFD_HUGE_SHIFT) #define MFD_HUGE_512MB (29 << MFD_HUGE_SHIFT) #define MFD_HUGE_1GB (30 << MFD_HUGE_SHIFT) #define MFD_HUGE_2GB (31 << MFD_HUGE_SHIFT) #define MFD_HUGE_16GB (34 << MFD_HUGE_SHIFT) #endif /* __BSD_VISIBLE */ /* * XXX missing POSIX_TYPED_MEM_* macros and * posix_typed_mem_info structure. */ #if __POSIX_VISIBLE >= 200112 #define POSIX_MADV_NORMAL _MADV_NORMAL #define POSIX_MADV_RANDOM _MADV_RANDOM #define POSIX_MADV_SEQUENTIAL _MADV_SEQUENTIAL #define POSIX_MADV_WILLNEED _MADV_WILLNEED #define POSIX_MADV_DONTNEED _MADV_DONTNEED #endif #ifndef _MODE_T_DECLARED typedef __mode_t mode_t; #define _MODE_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _SIZE_T_DECLARED typedef __size_t size_t; #define _SIZE_T_DECLARED #endif #if defined(_KERNEL) || defined(_WANT_FILE) #include #include #include #include #include struct file; struct shmfd { vm_ooffset_t shm_size; vm_object_t shm_object; vm_pindex_t shm_pages; /* allocated pages */ int shm_refs; uid_t shm_uid; gid_t shm_gid; mode_t shm_mode; int shm_kmappings; /* * Values maintained solely to make this a better-behaved file * descriptor for fstat() to run on. 
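As a small userspace illustration of the PROT_MAX() and MAP_ALIGNED_SUPER flags defined above: the maximum protection of a mapping is fixed at mmap(2) time, and a later mprotect(2) may only move within that ceiling. Sizes and the fill pattern here are arbitrary.

#include <sys/mman.h>

#include <err.h>
#include <string.h>

int
main(void)
{
    size_t len = 2 * 1024 * 1024;
    void *p;

    /* Superpage-aligned anonymous memory, currently RW, at most RWX. */
    p = mmap(NULL, len,
        PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE | PROT_EXEC),
        MAP_ANON | MAP_PRIVATE | MAP_ALIGNED_SUPER, -1, 0);
    if (p == MAP_FAILED)
        err(1, "mmap");

    memset(p, 0xc3, len);        /* arbitrary payload */

    /* Permitted: PROT_EXEC stays within the PROT_MAX ceiling set above. */
    if (mprotect(p, len, PROT_READ | PROT_EXEC) != 0)
        err(1, "mprotect");

    return (munmap(p, len));
}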
*/ struct timespec shm_atime; struct timespec shm_mtime; struct timespec shm_ctime; struct timespec shm_birthtime; ino_t shm_ino; struct label *shm_label; /* MAC label */ const char *shm_path; struct rangelock shm_rl; struct mtx shm_mtx; int shm_flags; int shm_seals; /* largepage config */ int shm_lp_psind; int shm_lp_alloc_policy; }; #endif #ifdef _KERNEL struct prison; int shm_map(struct file *fp, size_t size, off_t offset, void **memp); int shm_unmap(struct file *fp, void *mem, size_t size); int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode, bool largepage); struct shmfd *shm_hold(struct shmfd *shmfd); void shm_drop(struct shmfd *shmfd); int shm_dotruncate(struct shmfd *shmfd, off_t length); bool shm_largepage(struct shmfd *shmfd); void shm_remove_prison(struct prison *pr); int shm_get_path(struct vm_object *obj, char *path, size_t sz); -extern struct fileops shm_ops; +extern const struct fileops shm_ops; #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) #else /* !_KERNEL */ __BEGIN_DECLS /* * XXX not yet implemented: posix_mem_offset(), posix_typed_mem_get_info(), * posix_typed_mem_open(). */ #if __BSD_VISIBLE int getpagesizes(size_t *, int); int madvise(void *, size_t, int); int mincore(const void *, size_t, char *); int minherit(void *, size_t, int); #endif int mlock(const void *, size_t); #ifndef _MMAP_DECLARED #define _MMAP_DECLARED void * mmap(void *, size_t, int, int, int, off_t); #endif int mprotect(void *, size_t, int); int msync(void *, size_t, int); int munlock(const void *, size_t); int munmap(void *, size_t); #if __POSIX_VISIBLE >= 200112 int posix_madvise(void *, size_t, int); #endif #if __POSIX_VISIBLE >= 199309 int mlockall(int); int munlockall(void); int shm_open(const char *, int, mode_t); int shm_unlink(const char *); #endif #if __BSD_VISIBLE int memfd_create(const char *, unsigned int); int shm_create_largepage(const char *, int, int, int, mode_t); int shm_rename(const char *, const char *, int); #endif __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_MMAN_H_ */ diff --git a/sys/sys/pipe.h b/sys/sys/pipe.h index 0f35316432eb..a83ea800c677 100644 --- a/sys/sys/pipe.h +++ b/sys/sys/pipe.h @@ -1,152 +1,152 @@ /*- * Copyright (c) 1996 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. This work was done expressly for inclusion into FreeBSD. Other use * is allowed if this notation is included. * 5. Modifications may be freely made to this file if the above conditions * are met. */ #ifndef _SYS_PIPE_H_ #define _SYS_PIPE_H_ /* * Pipe buffer size, keep moderate in value, pipes take kva space. */ #ifndef PIPE_SIZE #define PIPE_SIZE 16384 #endif #ifndef BIG_PIPE_SIZE #define BIG_PIPE_SIZE (64*1024) #endif #ifndef SMALL_PIPE_SIZE #define SMALL_PIPE_SIZE PAGE_SIZE #endif /* * PIPE_MINDIRECT MUST be smaller than PIPE_SIZE and MUST be bigger * than PIPE_BUF. 
*/ #ifndef PIPE_MINDIRECT #define PIPE_MINDIRECT 8192 #endif #define PIPENPAGES (BIG_PIPE_SIZE / PAGE_SIZE + 1) #ifdef _KERNEL /* * See sys_pipe.c for info on what these limits mean. */ extern long maxpipekva; -extern struct fileops pipeops; +extern const struct fileops pipeops; #endif /* * Pipe buffer information. * Separate in, out, cnt are used to simplify calculations. * Buffered write is active when the buffer.cnt field is set. */ struct pipebuf { u_int cnt; /* number of chars currently in buffer */ u_int in; /* in pointer */ u_int out; /* out pointer */ u_int size; /* size of buffer */ caddr_t buffer; /* kva of buffer */ }; /* * Information to support direct transfers between processes for pipes. */ struct pipemapping { u_int cnt; /* number of chars in buffer */ u_int pos; /* current position of transfer */ int npages; /* number of pages */ vm_page_t ms[PIPENPAGES]; /* pages in source process */ }; /* * Bits in pipe_state. */ #define PIPE_ASYNC 0x004 /* Async? I/O. */ #define PIPE_WANTR 0x008 /* Reader wants some characters. */ #define PIPE_WANTW 0x010 /* Writer wants space to put characters. */ #define PIPE_WANT 0x020 /* Pipe is wanted to be run-down. */ #define PIPE_SEL 0x040 /* Pipe has a select active. */ #define PIPE_EOF 0x080 /* Pipe is in EOF condition. */ #define PIPE_LOCKFL 0x100 /* Process has exclusive access to pointers/data. */ #define PIPE_LWANT 0x200 /* Process wants exclusive access to pointers/data. */ #define PIPE_DIRECTW 0x400 /* Pipe direct write active. */ #define PIPE_DIRECTOK 0x800 /* Direct mode ok. */ /* * Bits in pipe_type. */ #define PIPE_TYPE_NAMED 0x001 /* Is a named pipe. */ /* * Per-pipe data structure. * Two of these are linked together to produce bi-directional pipes. */ struct pipe { struct pipebuf pipe_buffer; /* data storage */ struct pipemapping pipe_pages; /* wired pages for direct I/O */ struct selinfo pipe_sel; /* for compat with select */ struct timespec pipe_atime; /* time of last access */ struct timespec pipe_mtime; /* time of last modify */ struct timespec pipe_ctime; /* time of status change */ struct sigio *pipe_sigio; /* information for async I/O */ struct pipe *pipe_peer; /* link with other direction */ struct pipepair *pipe_pair; /* container structure pointer */ u_short pipe_state; /* pipe status info */ u_char pipe_type; /* pipe type info */ u_char pipe_present; /* still present? */ int pipe_waiters; /* pipelock waiters */ int pipe_busy; /* busy flag, mostly to handle rundown sanely */ int pipe_wgen; /* writer generation for named pipe */ ino_t pipe_ino; /* fake inode for stat(2) */ }; /* * Values for the pipe_present. */ #define PIPE_ACTIVE 1 #define PIPE_CLOSING 2 #define PIPE_FINALIZED 3 /* * Container structure to hold the two pipe endpoints, mutex, and label * pointer. 
*/ struct pipepair { struct pipe pp_rpipe; struct pipe pp_wpipe; struct mtx pp_mtx; struct label *pp_label; struct ucred *pp_owner; /* to dec pipe usage count */ }; #define PIPE_MTX(pipe) (&(pipe)->pipe_pair->pp_mtx) #define PIPE_LOCK(pipe) mtx_lock(PIPE_MTX(pipe)) #define PIPE_UNLOCK(pipe) mtx_unlock(PIPE_MTX(pipe)) #define PIPE_LOCK_ASSERT(pipe, type) mtx_assert(PIPE_MTX(pipe), (type)) #ifdef _KERNEL void pipe_dtor(struct pipe *dpipe); int pipe_named_ctor(struct pipe **ppipe, struct thread *td); void pipeselwakeup(struct pipe *cpipe); #endif #endif /* !_SYS_PIPE_H_ */ diff --git a/sys/x86/acpica/acpi_apm.c b/sys/x86/acpica/acpi_apm.c index 4e880c3e5411..e7e4b0f1a546 100644 --- a/sys/x86/acpica/acpi_apm.c +++ b/sys/x86/acpica/acpi_apm.c @@ -1,452 +1,452 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2001 Mitsuru IWASAKI * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * APM driver emulation */ #define APM_UNKNOWN 0xff static int apm_active; static MALLOC_DEFINE(M_APMDEV, "apmdev", "APM device emulation"); static d_open_t apmopen; static d_write_t apmwrite; static d_ioctl_t apmioctl; static d_poll_t apmpoll; static d_kqfilter_t apmkqfilter; static void apmreadfiltdetach(struct knote *kn); static int apmreadfilt(struct knote *kn, long hint); -static struct filterops apm_readfiltops = { +static const struct filterops apm_readfiltops = { .f_isfd = 1, .f_detach = apmreadfiltdetach, .f_event = apmreadfilt, }; static struct cdevsw apm_cdevsw = { .d_version = D_VERSION, .d_open = apmopen, .d_write = apmwrite, .d_ioctl = apmioctl, .d_poll = apmpoll, .d_name = "apm", .d_kqfilter = apmkqfilter }; static int acpi_capm_convert_battstate(struct acpi_battinfo *battp) { int state; state = APM_UNKNOWN; if (battp->state & ACPI_BATT_STAT_DISCHARG) { if (battp->cap >= 50) state = 0; /* high */ else state = 1; /* low */ } if (battp->state & ACPI_BATT_STAT_CRITICAL) state = 2; /* critical */ if (battp->state & ACPI_BATT_STAT_CHARGING) state = 3; /* charging */ /* If still unknown, determine it based on the battery capacity. 
*/ if (state == APM_UNKNOWN) { if (battp->cap >= 50) state = 0; /* high */ else state = 1; /* low */ } return (state); } static int acpi_capm_convert_battflags(struct acpi_battinfo *battp) { int flags; flags = 0; if (battp->cap >= 50) flags |= APM_BATT_HIGH; else { if (battp->state & ACPI_BATT_STAT_CRITICAL) flags |= APM_BATT_CRITICAL; else flags |= APM_BATT_LOW; } if (battp->state & ACPI_BATT_STAT_CHARGING) flags |= APM_BATT_CHARGING; if (battp->state == ACPI_BATT_STAT_NOT_PRESENT) flags = APM_BATT_NOT_PRESENT; return (flags); } static int acpi_capm_get_info(apm_info_t aip) { int acline; struct acpi_battinfo batt; aip->ai_infoversion = 1; aip->ai_major = 1; aip->ai_minor = 2; aip->ai_status = apm_active; aip->ai_capabilities= 0xff00; /* unknown */ if (acpi_acad_get_acline(&acline)) aip->ai_acline = APM_UNKNOWN; /* unknown */ else aip->ai_acline = acline; /* on/off */ if (acpi_battery_get_battinfo(NULL, &batt) != 0) { aip->ai_batt_stat = APM_UNKNOWN; aip->ai_batt_life = APM_UNKNOWN; aip->ai_batt_time = -1; /* unknown */ aip->ai_batteries = ~0U; /* unknown */ } else { aip->ai_batt_stat = acpi_capm_convert_battstate(&batt); aip->ai_batt_life = batt.cap; aip->ai_batt_time = (batt.min == -1) ? -1 : batt.min * 60; aip->ai_batteries = acpi_battery_get_units(); } return (0); } static int acpi_capm_get_pwstatus(apm_pwstatus_t app) { device_t dev; int acline, unit, error; struct acpi_battinfo batt; if (app->ap_device != PMDV_ALLDEV && (app->ap_device < PMDV_BATT0 || app->ap_device > PMDV_BATT_ALL)) return (1); if (app->ap_device == PMDV_ALLDEV) error = acpi_battery_get_battinfo(NULL, &batt); else { unit = app->ap_device - PMDV_BATT0; dev = devclass_get_device(devclass_find("battery"), unit); if (dev != NULL) error = acpi_battery_get_battinfo(dev, &batt); else error = ENXIO; } if (error) return (1); app->ap_batt_stat = acpi_capm_convert_battstate(&batt); app->ap_batt_flag = acpi_capm_convert_battflags(&batt); app->ap_batt_life = batt.cap; app->ap_batt_time = (batt.min == -1) ? -1 : batt.min * 60; if (acpi_acad_get_acline(&acline)) app->ap_acline = APM_UNKNOWN; else app->ap_acline = acline; /* on/off */ return (0); } /* Create a struct for tracking per-device suspend notification. */ static struct apm_clone_data * apm_create_clone(struct cdev *dev, struct acpi_softc *acpi_sc) { struct apm_clone_data *clone; clone = malloc(sizeof(*clone), M_APMDEV, M_WAITOK); clone->cdev = dev; clone->acpi_sc = acpi_sc; clone->notify_status = APM_EV_NONE; bzero(&clone->sel_read, sizeof(clone->sel_read)); knlist_init_mtx(&clone->sel_read.si_note, &acpi_mutex); /* * The acpi device is always managed by devd(8) and is considered * writable (i.e., ack is required to allow suspend to proceed.) */ if (strcmp("acpi", devtoname(dev)) == 0) clone->flags = ACPI_EVF_DEVD | ACPI_EVF_WRITE; else clone->flags = ACPI_EVF_NONE; ACPI_LOCK(acpi); STAILQ_INSERT_TAIL(&acpi_sc->apm_cdevs, clone, entries); ACPI_UNLOCK(acpi); return (clone); } static void apmdtor(void *data) { struct apm_clone_data *clone; struct acpi_softc *acpi_sc; clone = data; acpi_sc = clone->acpi_sc; /* We are about to lose a reference so check if suspend should occur */ if (acpi_sc->acpi_next_sstate != 0 && clone->notify_status != APM_EV_ACKED) acpi_AckSleepState(clone, 0); /* Remove this clone's data from the list and free it. 
*/ ACPI_LOCK(acpi); STAILQ_REMOVE(&acpi_sc->apm_cdevs, clone, apm_clone_data, entries); seldrain(&clone->sel_read); knlist_destroy(&clone->sel_read.si_note); ACPI_UNLOCK(acpi); free(clone, M_APMDEV); } static int apmopen(struct cdev *dev, int flag, int fmt, struct thread *td) { struct acpi_softc *acpi_sc; struct apm_clone_data *clone; acpi_sc = devclass_get_softc(devclass_find("acpi"), 0); clone = apm_create_clone(dev, acpi_sc); devfs_set_cdevpriv(clone, apmdtor); /* If the device is opened for write, record that. */ if ((flag & FWRITE) != 0) clone->flags |= ACPI_EVF_WRITE; return (0); } static int apmioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { int error; struct apm_clone_data *clone; struct acpi_softc *acpi_sc; struct apm_info info; struct apm_event_info *ev_info; apm_info_old_t aiop; error = 0; devfs_get_cdevpriv((void **)&clone); acpi_sc = clone->acpi_sc; switch (cmd) { case APMIO_SUSPEND: if ((flag & FWRITE) == 0) return (EPERM); if (acpi_sc->acpi_next_sstate == 0) { if (acpi_sc->acpi_suspend_sx != ACPI_STATE_S5) { error = acpi_ReqSleepState(acpi_sc, acpi_sc->acpi_suspend_sx); } else { printf( "power off via apm suspend not supported\n"); error = ENXIO; } } else error = acpi_AckSleepState(clone, 0); break; case APMIO_STANDBY: if ((flag & FWRITE) == 0) return (EPERM); if (acpi_sc->acpi_next_sstate == 0) { if (acpi_sc->acpi_standby_sx != ACPI_STATE_S5) { error = acpi_ReqSleepState(acpi_sc, acpi_sc->acpi_standby_sx); } else { printf( "power off via apm standby not supported\n"); error = ENXIO; } } else error = acpi_AckSleepState(clone, 0); break; case APMIO_NEXTEVENT: printf("apm nextevent start\n"); ACPI_LOCK(acpi); if (acpi_sc->acpi_next_sstate != 0 && clone->notify_status == APM_EV_NONE) { ev_info = (struct apm_event_info *)addr; if (acpi_sc->acpi_next_sstate <= ACPI_STATE_S3) ev_info->type = PMEV_STANDBYREQ; else ev_info->type = PMEV_SUSPENDREQ; ev_info->index = 0; clone->notify_status = APM_EV_NOTIFIED; printf("apm event returning %d\n", ev_info->type); } else error = EAGAIN; ACPI_UNLOCK(acpi); break; case APMIO_GETINFO_OLD: if (acpi_capm_get_info(&info)) error = ENXIO; aiop = (apm_info_old_t)addr; aiop->ai_major = info.ai_major; aiop->ai_minor = info.ai_minor; aiop->ai_acline = info.ai_acline; aiop->ai_batt_stat = info.ai_batt_stat; aiop->ai_batt_life = info.ai_batt_life; aiop->ai_status = info.ai_status; break; case APMIO_GETINFO: if (acpi_capm_get_info((apm_info_t)addr)) error = ENXIO; break; case APMIO_GETPWSTATUS: if (acpi_capm_get_pwstatus((apm_pwstatus_t)addr)) error = ENXIO; break; case APMIO_ENABLE: if ((flag & FWRITE) == 0) return (EPERM); apm_active = 1; break; case APMIO_DISABLE: if ((flag & FWRITE) == 0) return (EPERM); apm_active = 0; break; case APMIO_HALTCPU: break; case APMIO_NOTHALTCPU: break; case APMIO_DISPLAY: if ((flag & FWRITE) == 0) return (EPERM); break; case APMIO_BIOS: if ((flag & FWRITE) == 0) return (EPERM); bzero(addr, sizeof(struct apm_bios_arg)); break; default: error = EINVAL; break; } return (error); } static int apmwrite(struct cdev *dev, struct uio *uio, int ioflag) { return (uio->uio_resid); } static int apmpoll(struct cdev *dev, int events, struct thread *td) { struct apm_clone_data *clone; int revents; revents = 0; devfs_get_cdevpriv((void **)&clone); ACPI_LOCK(acpi); if (clone->acpi_sc->acpi_next_sstate) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &clone->sel_read); ACPI_UNLOCK(acpi); return (revents); } static int apmkqfilter(struct cdev *dev, struct knote *kn) { struct apm_clone_data 
*clone; devfs_get_cdevpriv((void **)&clone); ACPI_LOCK(acpi); kn->kn_hook = clone; kn->kn_fop = &apm_readfiltops; knlist_add(&clone->sel_read.si_note, kn, 0); ACPI_UNLOCK(acpi); return (0); } static void apmreadfiltdetach(struct knote *kn) { struct apm_clone_data *clone; ACPI_LOCK(acpi); clone = kn->kn_hook; knlist_remove(&clone->sel_read.si_note, kn, 0); ACPI_UNLOCK(acpi); } static int apmreadfilt(struct knote *kn, long hint) { struct apm_clone_data *clone; int sleeping; ACPI_LOCK(acpi); clone = kn->kn_hook; sleeping = clone->acpi_sc->acpi_next_sstate ? 1 : 0; ACPI_UNLOCK(acpi); return (sleeping); } void acpi_apm_init(struct acpi_softc *sc) { /* Create a clone for /dev/acpi also. */ STAILQ_INIT(&sc->apm_cdevs); sc->acpi_clone = apm_create_clone(sc->acpi_dev_t, sc); make_dev(&apm_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0660, "apmctl"); make_dev(&apm_cdevsw, 0, UID_ROOT, GID_OPERATOR, 0664, "apm"); }
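Finally, a short userspace sketch of the APM emulation this file provides: query AC-line and battery state through APMIO_GETINFO on /dev/apm, which is served by acpi_capm_get_info() above. The <machine/apm_bios.h> header location is an assumption here.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <machine/apm_bios.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct apm_info info;
    int fd;

    fd = open("/dev/apm", O_RDONLY);
    if (fd < 0)
        err(1, "open(/dev/apm)");
    if (ioctl(fd, APMIO_GETINFO, &info) < 0)
        err(1, "APMIO_GETINFO");
    printf("AC line: %u, battery state: %u, battery life: %u%%\n",
        info.ai_acline, info.ai_batt_stat, info.ai_batt_life);
    close(fd);
    return (0);
}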