Index: head/sys/dev/iscsi/icl.c
===================================================================
--- head/sys/dev/iscsi/icl.c (revision 357970)
+++ head/sys/dev/iscsi/icl.c (revision 357971)
@@ -1,331 +1,332 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by Edward Tomasz Napierala under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * iSCSI Common Layer. It's used by both the initiator and target to send
 * and receive iSCSI PDUs.
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct icl_module { TAILQ_ENTRY(icl_module) im_next; char *im_name; bool im_iser; int im_priority; int (*im_limits)(struct icl_drv_limits *idl); struct icl_conn *(*im_new_conn)(const char *name, struct mtx *lock); }; struct icl_softc { struct sx sc_lock; TAILQ_HEAD(, icl_module) sc_modules; }; static int sysctl_kern_icl_offloads(SYSCTL_HANDLER_ARGS); static MALLOC_DEFINE(M_ICL, "icl", "iSCSI Common Layer"); static struct icl_softc *sc; -SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer"); +SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "iSCSI Common Layer"); int icl_debug = 1; SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RWTUN, &icl_debug, 0, "Enable debug messages"); SYSCTL_PROC(_kern_icl, OID_AUTO, offloads, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, false, sysctl_kern_icl_offloads, "A", "List of ICL modules"); SYSCTL_PROC(_kern_icl, OID_AUTO, iser_offloads, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, true, sysctl_kern_icl_offloads, "A", "List of iSER ICL modules"); static int sysctl_kern_icl_offloads(SYSCTL_HANDLER_ARGS) { const struct icl_module *im; struct sbuf sb; bool iser = arg2; int error; sbuf_new(&sb, NULL, 256, SBUF_AUTOEXTEND | SBUF_INCLUDENUL); sx_slock(&sc->sc_lock); TAILQ_FOREACH(im, &sc->sc_modules, im_next) { if (im->im_iser != iser) continue; if (im != TAILQ_FIRST(&sc->sc_modules)) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s", im->im_name); } sx_sunlock(&sc->sc_lock); error = sbuf_finish(&sb); if (error == 0) error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return (error); } static struct icl_module * icl_find(const char *name, bool iser, bool quiet) { struct icl_module *im, *im_max; sx_assert(&sc->sc_lock, SA_LOCKED); /* * If the name was not specified, pick a module with highest * priority. */ if (name == NULL || name[0] == '\0') { im_max = NULL; TAILQ_FOREACH(im, &sc->sc_modules, im_next) { if (im->im_iser != iser) continue; if (im_max == NULL || im->im_priority > im_max->im_priority) im_max = im; } if (iser && im_max == NULL && !quiet) ICL_WARN("no iSER-capable offload found"); return (im_max); } TAILQ_FOREACH(im, &sc->sc_modules, im_next) { if (strcasecmp(im->im_name, name) != 0) continue; if (!im->im_iser && iser && !quiet) { ICL_WARN("offload \"%s\" is not iSER-capable", name); return (NULL); } if (im->im_iser && !iser && !quiet) { ICL_WARN("offload \"%s\" is iSER-only", name); return (NULL); } return (im); } if (!quiet) ICL_WARN("offload \"%s\" not found", name); return (NULL); } struct icl_conn * icl_new_conn(const char *offload, bool iser, const char *name, struct mtx *lock) { struct icl_module *im; struct icl_conn *ic; sx_slock(&sc->sc_lock); im = icl_find(offload, iser, false); if (im == NULL) { sx_sunlock(&sc->sc_lock); return (NULL); } ic = im->im_new_conn(name, lock); sx_sunlock(&sc->sc_lock); return (ic); } int icl_limits(const char *offload, bool iser, struct icl_drv_limits *idl) { struct icl_module *im; int error; bzero(idl, sizeof(*idl)); sx_slock(&sc->sc_lock); im = icl_find(offload, iser, false); if (im == NULL) { sx_sunlock(&sc->sc_lock); return (ENXIO); } error = im->im_limits(idl); sx_sunlock(&sc->sc_lock); /* * Validate the limits provided by the driver against values allowed by * the iSCSI RFC. 0 means iscsid/ctld should pick a reasonable value. 
* * Note that max_send_dsl is an internal implementation detail and not * part of the RFC. */ #define OUT_OF_RANGE(x, lo, hi) ((x) != 0 && ((x) < (lo) || (x) > (hi))) if (error == 0 && (OUT_OF_RANGE(idl->idl_max_recv_data_segment_length, 512, 16777215) || OUT_OF_RANGE(idl->idl_max_send_data_segment_length, 512, 16777215) || OUT_OF_RANGE(idl->idl_max_burst_length, 512, 16777215) || OUT_OF_RANGE(idl->idl_first_burst_length, 512, 16777215))) { error = EINVAL; } #undef OUT_OF_RANGE /* * If both first_burst and max_burst are provided then first_burst must * not exceed max_burst. */ if (error == 0 && idl->idl_first_burst_length > 0 && idl->idl_max_burst_length > 0 && idl->idl_first_burst_length > idl->idl_max_burst_length) { error = EINVAL; } return (error); } int icl_register(const char *offload, bool iser, int priority, int (*limits)(struct icl_drv_limits *), struct icl_conn *(*new_conn)(const char *, struct mtx *)) { struct icl_module *im; sx_xlock(&sc->sc_lock); im = icl_find(offload, iser, true); if (im != NULL) { ICL_WARN("offload \"%s\" already registered", offload); sx_xunlock(&sc->sc_lock); return (EBUSY); } im = malloc(sizeof(*im), M_ICL, M_ZERO | M_WAITOK); im->im_name = strdup(offload, M_ICL); im->im_iser = iser; im->im_priority = priority; im->im_limits = limits; im->im_new_conn = new_conn; TAILQ_INSERT_HEAD(&sc->sc_modules, im, im_next); sx_xunlock(&sc->sc_lock); ICL_DEBUG("offload \"%s\" registered", offload); return (0); } int icl_unregister(const char *offload, bool rdma) { struct icl_module *im; sx_xlock(&sc->sc_lock); im = icl_find(offload, rdma, true); if (im == NULL) { ICL_WARN("offload \"%s\" not registered", offload); sx_xunlock(&sc->sc_lock); return (ENXIO); } TAILQ_REMOVE(&sc->sc_modules, im, im_next); sx_xunlock(&sc->sc_lock); free(im->im_name, M_ICL); free(im, M_ICL); ICL_DEBUG("offload \"%s\" unregistered", offload); return (0); } static int icl_load(void) { sc = malloc(sizeof(*sc), M_ICL, M_ZERO | M_WAITOK); sx_init(&sc->sc_lock, "icl"); TAILQ_INIT(&sc->sc_modules); return (0); } static int icl_unload(void) { sx_slock(&sc->sc_lock); KASSERT(TAILQ_EMPTY(&sc->sc_modules), ("still have modules")); sx_sunlock(&sc->sc_lock); sx_destroy(&sc->sc_lock); free(sc, M_ICL); return (0); } static int icl_modevent(module_t mod, int what, void *arg) { switch (what) { case MOD_LOAD: return (icl_load()); case MOD_UNLOAD: return (icl_unload()); default: return (EINVAL); } } moduledata_t icl_data = { "icl", icl_modevent, 0 }; DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST); MODULE_VERSION(icl, 1); Index: head/sys/dev/iscsi/iscsi.c =================================================================== --- head/sys/dev/iscsi/iscsi.c (revision 357970) +++ head/sys/dev/iscsi/iscsi.c (revision 357971) @@ -1,2612 +1,2613 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ICL_KERNEL_PROXY #include #endif #ifdef ICL_KERNEL_PROXY FEATURE(iscsi_kernel_proxy, "iSCSI initiator built with ICL_KERNEL_PROXY"); #endif /* * XXX: This is global so the iscsi_unload() can access it. * Think about how to do this properly. */ static struct iscsi_softc *sc; -SYSCTL_NODE(_kern, OID_AUTO, iscsi, CTLFLAG_RD, 0, "iSCSI initiator"); +SYSCTL_NODE(_kern, OID_AUTO, iscsi, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "iSCSI initiator"); static int debug = 1; SYSCTL_INT(_kern_iscsi, OID_AUTO, debug, CTLFLAG_RWTUN, &debug, 0, "Enable debug messages"); static int ping_timeout = 5; SYSCTL_INT(_kern_iscsi, OID_AUTO, ping_timeout, CTLFLAG_RWTUN, &ping_timeout, 0, "Timeout for ping (NOP-Out) requests, in seconds"); static int iscsid_timeout = 60; SYSCTL_INT(_kern_iscsi, OID_AUTO, iscsid_timeout, CTLFLAG_RWTUN, &iscsid_timeout, 0, "Time to wait for iscsid(8) to handle reconnection, in seconds"); static int login_timeout = 60; SYSCTL_INT(_kern_iscsi, OID_AUTO, login_timeout, CTLFLAG_RWTUN, &login_timeout, 0, "Time to wait for iscsid(8) to finish Login Phase, in seconds"); static int maxtags = 255; SYSCTL_INT(_kern_iscsi, OID_AUTO, maxtags, CTLFLAG_RWTUN, &maxtags, 0, "Max number of IO requests queued"); static int fail_on_disconnection = 0; SYSCTL_INT(_kern_iscsi, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN, &fail_on_disconnection, 0, "Destroy CAM SIM on connection failure"); static int fail_on_shutdown = 1; SYSCTL_INT(_kern_iscsi, OID_AUTO, fail_on_shutdown, CTLFLAG_RWTUN, &fail_on_shutdown, 0, "Fail disconnected sessions on shutdown"); static MALLOC_DEFINE(M_ISCSI, "iSCSI", "iSCSI initiator"); static uma_zone_t iscsi_outstanding_zone; #define CONN_SESSION(X) ((struct iscsi_session *)X->ic_prv0) #define PDU_SESSION(X) (CONN_SESSION(X->ip_conn)) #define ISCSI_DEBUG(X, ...) \ do { \ if (debug > 1) \ printf("%s: " X "\n", __func__, ## __VA_ARGS__);\ } while (0) #define ISCSI_WARN(X, ...) \ do { \ if (debug > 0) { \ printf("WARNING: %s: " X "\n", \ __func__, ## __VA_ARGS__); \ } \ } while (0) #define ISCSI_SESSION_DEBUG(S, X, ...) \ do { \ if (debug > 1) { \ printf("%s: %s (%s): " X "\n", \ __func__, S->is_conf.isc_target_addr, \ S->is_conf.isc_target, ## __VA_ARGS__); \ } \ } while (0) #define ISCSI_SESSION_WARN(S, X, ...) 
\ do { \ if (debug > 0) { \ printf("WARNING: %s (%s): " X "\n", \ S->is_conf.isc_target_addr, \ S->is_conf.isc_target, ## __VA_ARGS__); \ } \ } while (0) #define ISCSI_SESSION_LOCK(X) mtx_lock(&X->is_lock) #define ISCSI_SESSION_UNLOCK(X) mtx_unlock(&X->is_lock) #define ISCSI_SESSION_LOCK_ASSERT(X) mtx_assert(&X->is_lock, MA_OWNED) #define ISCSI_SESSION_LOCK_ASSERT_NOT(X) mtx_assert(&X->is_lock, MA_NOTOWNED) static int iscsi_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int mode, struct thread *td); static struct cdevsw iscsi_cdevsw = { .d_version = D_VERSION, .d_ioctl = iscsi_ioctl, .d_name = "iscsi", }; static void iscsi_pdu_queue_locked(struct icl_pdu *request); static void iscsi_pdu_queue(struct icl_pdu *request); static void iscsi_pdu_update_statsn(const struct icl_pdu *response); static void iscsi_pdu_handle_nop_in(struct icl_pdu *response); static void iscsi_pdu_handle_scsi_response(struct icl_pdu *response); static void iscsi_pdu_handle_task_response(struct icl_pdu *response); static void iscsi_pdu_handle_data_in(struct icl_pdu *response); static void iscsi_pdu_handle_logout_response(struct icl_pdu *response); static void iscsi_pdu_handle_r2t(struct icl_pdu *response); static void iscsi_pdu_handle_async_message(struct icl_pdu *response); static void iscsi_pdu_handle_reject(struct icl_pdu *response); static void iscsi_session_reconnect(struct iscsi_session *is); static void iscsi_session_terminate(struct iscsi_session *is); static void iscsi_action(struct cam_sim *sim, union ccb *ccb); static void iscsi_poll(struct cam_sim *sim); static struct iscsi_outstanding *iscsi_outstanding_find(struct iscsi_session *is, uint32_t initiator_task_tag); static struct iscsi_outstanding *iscsi_outstanding_add(struct iscsi_session *is, struct icl_pdu *request, union ccb *ccb, uint32_t *initiator_task_tagp); static void iscsi_outstanding_remove(struct iscsi_session *is, struct iscsi_outstanding *io); static bool iscsi_pdu_prepare(struct icl_pdu *request) { struct iscsi_session *is; struct iscsi_bhs_scsi_command *bhssc; is = PDU_SESSION(request); ISCSI_SESSION_LOCK_ASSERT(is); /* * We're only using fields common for all the request * (initiator -> target) PDUs. */ bhssc = (struct iscsi_bhs_scsi_command *)request->ip_bhs; /* * Data-Out PDU does not contain CmdSN. */ if (bhssc->bhssc_opcode != ISCSI_BHS_OPCODE_SCSI_DATA_OUT) { if (ISCSI_SNGT(is->is_cmdsn, is->is_maxcmdsn) && (bhssc->bhssc_opcode & ISCSI_BHS_OPCODE_IMMEDIATE) == 0) { /* * Current MaxCmdSN prevents us from sending any more * SCSI Command PDUs to the target; postpone the PDU. * It will get resent by either iscsi_pdu_queue(), * or by maintenance thread. 
*/ #if 0 ISCSI_SESSION_DEBUG(is, "postponing send, CmdSN %u, " "ExpCmdSN %u, MaxCmdSN %u, opcode 0x%x", is->is_cmdsn, is->is_expcmdsn, is->is_maxcmdsn, bhssc->bhssc_opcode); #endif return (true); } bhssc->bhssc_cmdsn = htonl(is->is_cmdsn); if ((bhssc->bhssc_opcode & ISCSI_BHS_OPCODE_IMMEDIATE) == 0) is->is_cmdsn++; } bhssc->bhssc_expstatsn = htonl(is->is_statsn + 1); return (false); } static void iscsi_session_send_postponed(struct iscsi_session *is) { struct icl_pdu *request; bool postpone; ISCSI_SESSION_LOCK_ASSERT(is); if (STAILQ_EMPTY(&is->is_postponed)) return; while ((request = STAILQ_FIRST(&is->is_postponed)) != NULL) { postpone = iscsi_pdu_prepare(request); if (postpone) return; STAILQ_REMOVE_HEAD(&is->is_postponed, ip_next); icl_pdu_queue(request); } xpt_release_simq(is->is_sim, 1); } static void iscsi_pdu_queue_locked(struct icl_pdu *request) { struct iscsi_session *is; bool postpone; is = PDU_SESSION(request); ISCSI_SESSION_LOCK_ASSERT(is); iscsi_session_send_postponed(is); postpone = iscsi_pdu_prepare(request); if (postpone) { if (STAILQ_EMPTY(&is->is_postponed)) xpt_freeze_simq(is->is_sim, 1); STAILQ_INSERT_TAIL(&is->is_postponed, request, ip_next); return; } icl_pdu_queue(request); } static void iscsi_pdu_queue(struct icl_pdu *request) { struct iscsi_session *is; is = PDU_SESSION(request); ISCSI_SESSION_LOCK(is); iscsi_pdu_queue_locked(request); ISCSI_SESSION_UNLOCK(is); } static void iscsi_session_logout(struct iscsi_session *is) { struct icl_pdu *request; struct iscsi_bhs_logout_request *bhslr; request = icl_pdu_new(is->is_conn, M_NOWAIT); if (request == NULL) return; bhslr = (struct iscsi_bhs_logout_request *)request->ip_bhs; bhslr->bhslr_opcode = ISCSI_BHS_OPCODE_LOGOUT_REQUEST; bhslr->bhslr_reason = BHSLR_REASON_CLOSE_SESSION; iscsi_pdu_queue_locked(request); } static void iscsi_session_terminate_task(struct iscsi_session *is, struct iscsi_outstanding *io, cam_status status) { ISCSI_SESSION_LOCK_ASSERT(is); if (io->io_ccb != NULL) { io->io_ccb->ccb_h.status &= ~(CAM_SIM_QUEUED | CAM_STATUS_MASK); io->io_ccb->ccb_h.status |= status; if ((io->io_ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) { io->io_ccb->ccb_h.status |= CAM_DEV_QFRZN; xpt_freeze_devq(io->io_ccb->ccb_h.path, 1); ISCSI_SESSION_DEBUG(is, "freezing devq"); } xpt_done(io->io_ccb); } iscsi_outstanding_remove(is, io); } static void iscsi_session_terminate_tasks(struct iscsi_session *is, cam_status status) { struct iscsi_outstanding *io, *tmp; ISCSI_SESSION_LOCK_ASSERT(is); TAILQ_FOREACH_SAFE(io, &is->is_outstanding, io_next, tmp) { iscsi_session_terminate_task(is, io, status); } } static void iscsi_session_cleanup(struct iscsi_session *is, bool destroy_sim) { struct icl_pdu *pdu; ISCSI_SESSION_LOCK_ASSERT(is); /* * Don't queue any new PDUs. */ if (is->is_sim != NULL && is->is_simq_frozen == false) { ISCSI_SESSION_DEBUG(is, "freezing"); xpt_freeze_simq(is->is_sim, 1); is->is_simq_frozen = true; } /* * Remove postponed PDUs. */ if (!STAILQ_EMPTY(&is->is_postponed)) xpt_release_simq(is->is_sim, 1); while ((pdu = STAILQ_FIRST(&is->is_postponed)) != NULL) { STAILQ_REMOVE_HEAD(&is->is_postponed, ip_next); icl_pdu_free(pdu); } if (destroy_sim == false) { /* * Terminate SCSI tasks, asking CAM to requeue them. 
*/ iscsi_session_terminate_tasks(is, CAM_REQUEUE_REQ); return; } iscsi_session_terminate_tasks(is, CAM_DEV_NOT_THERE); if (is->is_sim == NULL) return; ISCSI_SESSION_DEBUG(is, "deregistering SIM"); xpt_async(AC_LOST_DEVICE, is->is_path, NULL); if (is->is_simq_frozen) { xpt_release_simq(is->is_sim, 1); is->is_simq_frozen = false; } xpt_free_path(is->is_path); is->is_path = NULL; xpt_bus_deregister(cam_sim_path(is->is_sim)); cam_sim_free(is->is_sim, TRUE /*free_devq*/); is->is_sim = NULL; is->is_devq = NULL; } static void iscsi_maintenance_thread_reconnect(struct iscsi_session *is) { icl_conn_close(is->is_conn); ISCSI_SESSION_LOCK(is); is->is_connected = false; is->is_reconnecting = false; is->is_login_phase = false; #ifdef ICL_KERNEL_PROXY if (is->is_login_pdu != NULL) { icl_pdu_free(is->is_login_pdu); is->is_login_pdu = NULL; } cv_signal(&is->is_login_cv); #endif if (fail_on_disconnection) { ISCSI_SESSION_DEBUG(is, "connection failed, destroying devices"); iscsi_session_cleanup(is, true); } else { iscsi_session_cleanup(is, false); } KASSERT(TAILQ_EMPTY(&is->is_outstanding), ("destroying session with active tasks")); KASSERT(STAILQ_EMPTY(&is->is_postponed), ("destroying session with postponed PDUs")); if (is->is_conf.isc_enable == 0 && is->is_conf.isc_discovery == 0) { ISCSI_SESSION_UNLOCK(is); return; } /* * Request immediate reconnection from iscsid(8). */ //ISCSI_SESSION_DEBUG(is, "waking up iscsid(8)"); is->is_waiting_for_iscsid = true; strlcpy(is->is_reason, "Waiting for iscsid(8)", sizeof(is->is_reason)); is->is_timeout = 0; ISCSI_SESSION_UNLOCK(is); cv_signal(&is->is_softc->sc_cv); } static void iscsi_maintenance_thread_terminate(struct iscsi_session *is) { struct iscsi_softc *sc; sc = is->is_softc; sx_xlock(&sc->sc_lock); TAILQ_REMOVE(&sc->sc_sessions, is, is_next); sx_xunlock(&sc->sc_lock); icl_conn_close(is->is_conn); callout_drain(&is->is_callout); ISCSI_SESSION_LOCK(is); KASSERT(is->is_terminating, ("is_terminating == false")); #ifdef ICL_KERNEL_PROXY if (is->is_login_pdu != NULL) { icl_pdu_free(is->is_login_pdu); is->is_login_pdu = NULL; } cv_signal(&is->is_login_cv); #endif iscsi_session_cleanup(is, true); KASSERT(TAILQ_EMPTY(&is->is_outstanding), ("destroying session with active tasks")); KASSERT(STAILQ_EMPTY(&is->is_postponed), ("destroying session with postponed PDUs")); ISCSI_SESSION_UNLOCK(is); icl_conn_free(is->is_conn); mtx_destroy(&is->is_lock); cv_destroy(&is->is_maintenance_cv); #ifdef ICL_KERNEL_PROXY cv_destroy(&is->is_login_cv); #endif ISCSI_SESSION_DEBUG(is, "terminated"); free(is, M_ISCSI); /* * The iscsi_unload() routine might be waiting. */ cv_signal(&sc->sc_cv); } static void iscsi_maintenance_thread(void *arg) { struct iscsi_session *is = arg; ISCSI_SESSION_LOCK(is); for (;;) { if (is->is_reconnecting == false && is->is_terminating == false && (STAILQ_EMPTY(&is->is_postponed) || ISCSI_SNGT(is->is_cmdsn, is->is_maxcmdsn))) cv_wait(&is->is_maintenance_cv, &is->is_lock); /* Terminate supersedes reconnect. */ if (is->is_terminating) { ISCSI_SESSION_UNLOCK(is); iscsi_maintenance_thread_terminate(is); kthread_exit(); return; } if (is->is_reconnecting) { ISCSI_SESSION_UNLOCK(is); iscsi_maintenance_thread_reconnect(is); ISCSI_SESSION_LOCK(is); continue; } iscsi_session_send_postponed(is); } ISCSI_SESSION_UNLOCK(is); } static void iscsi_session_reconnect(struct iscsi_session *is) { /* * XXX: We can't use locking here, because * it's being called from various contexts. * Hope it doesn't break anything. 
*/ if (is->is_reconnecting) return; is->is_reconnecting = true; cv_signal(&is->is_maintenance_cv); } static void iscsi_session_terminate(struct iscsi_session *is) { if (is->is_terminating) return; is->is_terminating = true; #if 0 iscsi_session_logout(is); #endif cv_signal(&is->is_maintenance_cv); } static void iscsi_callout(void *context) { struct icl_pdu *request; struct iscsi_bhs_nop_out *bhsno; struct iscsi_session *is; bool reconnect_needed = false; is = context; ISCSI_SESSION_LOCK(is); if (is->is_terminating) { ISCSI_SESSION_UNLOCK(is); return; } callout_schedule(&is->is_callout, 1 * hz); if (is->is_conf.isc_enable == 0) goto out; is->is_timeout++; if (is->is_waiting_for_iscsid) { if (iscsid_timeout > 0 && is->is_timeout > iscsid_timeout) { ISCSI_SESSION_WARN(is, "timed out waiting for iscsid(8) " "for %d seconds; reconnecting", is->is_timeout); reconnect_needed = true; } goto out; } if (is->is_login_phase) { if (login_timeout > 0 && is->is_timeout > login_timeout) { ISCSI_SESSION_WARN(is, "login timed out after %d seconds; " "reconnecting", is->is_timeout); reconnect_needed = true; } goto out; } if (ping_timeout <= 0) { /* * Pings are disabled. Don't send NOP-Out in this case. * Reset the timeout, to avoid triggering reconnection, * should the user decide to reenable them. */ is->is_timeout = 0; goto out; } if (is->is_timeout >= ping_timeout) { ISCSI_SESSION_WARN(is, "no ping reply (NOP-In) after %d seconds; " "reconnecting", ping_timeout); reconnect_needed = true; goto out; } ISCSI_SESSION_UNLOCK(is); /* * If the ping was reset less than one second ago - which means * that we've received some PDU during the last second - assume * the traffic flows correctly and don't bother sending a NOP-Out. * * (It's 2 - one for one second, and one for incrementing is_timeout * earlier in this routine.) */ if (is->is_timeout < 2) return; request = icl_pdu_new(is->is_conn, M_NOWAIT); if (request == NULL) { ISCSI_SESSION_WARN(is, "failed to allocate PDU"); return; } bhsno = (struct iscsi_bhs_nop_out *)request->ip_bhs; bhsno->bhsno_opcode = ISCSI_BHS_OPCODE_NOP_OUT | ISCSI_BHS_OPCODE_IMMEDIATE; bhsno->bhsno_flags = 0x80; bhsno->bhsno_target_transfer_tag = 0xffffffff; iscsi_pdu_queue(request); return; out: if (is->is_terminating) { ISCSI_SESSION_UNLOCK(is); return; } ISCSI_SESSION_UNLOCK(is); if (reconnect_needed) iscsi_session_reconnect(is); } static void iscsi_pdu_update_statsn(const struct icl_pdu *response) { const struct iscsi_bhs_data_in *bhsdi; struct iscsi_session *is; uint32_t expcmdsn, maxcmdsn, statsn; is = PDU_SESSION(response); ISCSI_SESSION_LOCK_ASSERT(is); /* * We're only using fields common for all the response * (target -> initiator) PDUs. */ bhsdi = (const struct iscsi_bhs_data_in *)response->ip_bhs; /* * Ok, I lied. In case of Data-In, "The fields StatSN, Status, * and Residual Count only have meaningful content if the S bit * is set to 1", so we also need to check the bit specific for * Data-In PDU. 
*/ if (bhsdi->bhsdi_opcode != ISCSI_BHS_OPCODE_SCSI_DATA_IN || (bhsdi->bhsdi_flags & BHSDI_FLAGS_S) != 0) { statsn = ntohl(bhsdi->bhsdi_statsn); if (statsn != is->is_statsn && statsn != (is->is_statsn + 1)) { /* XXX: This is normal situation for MCS */ ISCSI_SESSION_WARN(is, "PDU 0x%x StatSN %u != " "session ExpStatSN %u (or + 1); reconnecting", bhsdi->bhsdi_opcode, statsn, is->is_statsn); iscsi_session_reconnect(is); } if (ISCSI_SNGT(statsn, is->is_statsn)) is->is_statsn = statsn; } expcmdsn = ntohl(bhsdi->bhsdi_expcmdsn); maxcmdsn = ntohl(bhsdi->bhsdi_maxcmdsn); if (ISCSI_SNLT(maxcmdsn + 1, expcmdsn)) { ISCSI_SESSION_DEBUG(is, "PDU MaxCmdSN %u + 1 < PDU ExpCmdSN %u; ignoring", maxcmdsn, expcmdsn); } else { if (ISCSI_SNGT(maxcmdsn, is->is_maxcmdsn)) { is->is_maxcmdsn = maxcmdsn; /* * Command window increased; kick the maintanance thread * to send out postponed commands. */ if (!STAILQ_EMPTY(&is->is_postponed)) cv_signal(&is->is_maintenance_cv); } else if (ISCSI_SNLT(maxcmdsn, is->is_maxcmdsn)) { /* XXX: This is normal situation for MCS */ ISCSI_SESSION_DEBUG(is, "PDU MaxCmdSN %u < session MaxCmdSN %u; ignoring", maxcmdsn, is->is_maxcmdsn); } if (ISCSI_SNGT(expcmdsn, is->is_expcmdsn)) { is->is_expcmdsn = expcmdsn; } else if (ISCSI_SNLT(expcmdsn, is->is_expcmdsn)) { /* XXX: This is normal situation for MCS */ ISCSI_SESSION_DEBUG(is, "PDU ExpCmdSN %u < session ExpCmdSN %u; ignoring", expcmdsn, is->is_expcmdsn); } } /* * Every incoming PDU - not just NOP-In - resets the ping timer. * The purpose of the timeout is to reset the connection when it stalls; * we don't want this to happen when NOP-In or NOP-Out ends up delayed * in some queue. */ is->is_timeout = 0; } static void iscsi_receive_callback(struct icl_pdu *response) { struct iscsi_session *is; is = PDU_SESSION(response); ISCSI_SESSION_LOCK(is); iscsi_pdu_update_statsn(response); #ifdef ICL_KERNEL_PROXY if (is->is_login_phase) { if (is->is_login_pdu == NULL) is->is_login_pdu = response; else icl_pdu_free(response); ISCSI_SESSION_UNLOCK(is); cv_signal(&is->is_login_cv); return; } #endif /* * The handling routine is responsible for freeing the PDU * when it's no longer needed. */ switch (response->ip_bhs->bhs_opcode) { case ISCSI_BHS_OPCODE_NOP_IN: iscsi_pdu_handle_nop_in(response); ISCSI_SESSION_UNLOCK(is); break; case ISCSI_BHS_OPCODE_SCSI_RESPONSE: iscsi_pdu_handle_scsi_response(response); /* Session lock dropped inside. */ ISCSI_SESSION_LOCK_ASSERT_NOT(is); break; case ISCSI_BHS_OPCODE_TASK_RESPONSE: iscsi_pdu_handle_task_response(response); ISCSI_SESSION_UNLOCK(is); break; case ISCSI_BHS_OPCODE_SCSI_DATA_IN: iscsi_pdu_handle_data_in(response); /* Session lock dropped inside. 
*/ ISCSI_SESSION_LOCK_ASSERT_NOT(is); break; case ISCSI_BHS_OPCODE_LOGOUT_RESPONSE: iscsi_pdu_handle_logout_response(response); ISCSI_SESSION_UNLOCK(is); break; case ISCSI_BHS_OPCODE_R2T: iscsi_pdu_handle_r2t(response); ISCSI_SESSION_UNLOCK(is); break; case ISCSI_BHS_OPCODE_ASYNC_MESSAGE: iscsi_pdu_handle_async_message(response); ISCSI_SESSION_UNLOCK(is); break; case ISCSI_BHS_OPCODE_REJECT: iscsi_pdu_handle_reject(response); ISCSI_SESSION_UNLOCK(is); break; default: ISCSI_SESSION_WARN(is, "received PDU with unsupported " "opcode 0x%x; reconnecting", response->ip_bhs->bhs_opcode); iscsi_session_reconnect(is); ISCSI_SESSION_UNLOCK(is); icl_pdu_free(response); } } static void iscsi_error_callback(struct icl_conn *ic) { struct iscsi_session *is; is = CONN_SESSION(ic); ISCSI_SESSION_WARN(is, "connection error; reconnecting"); iscsi_session_reconnect(is); } static void iscsi_pdu_handle_nop_in(struct icl_pdu *response) { struct iscsi_session *is; struct iscsi_bhs_nop_out *bhsno; struct iscsi_bhs_nop_in *bhsni; struct icl_pdu *request; void *data = NULL; size_t datasize; int error; is = PDU_SESSION(response); bhsni = (struct iscsi_bhs_nop_in *)response->ip_bhs; if (bhsni->bhsni_target_transfer_tag == 0xffffffff) { /* * Nothing to do; iscsi_pdu_update_statsn() already * zeroed the timeout. */ icl_pdu_free(response); return; } datasize = icl_pdu_data_segment_length(response); if (datasize > 0) { data = malloc(datasize, M_ISCSI, M_NOWAIT | M_ZERO); if (data == NULL) { ISCSI_SESSION_WARN(is, "failed to allocate memory; " "reconnecting"); icl_pdu_free(response); iscsi_session_reconnect(is); return; } icl_pdu_get_data(response, 0, data, datasize); } request = icl_pdu_new(response->ip_conn, M_NOWAIT); if (request == NULL) { ISCSI_SESSION_WARN(is, "failed to allocate memory; " "reconnecting"); free(data, M_ISCSI); icl_pdu_free(response); iscsi_session_reconnect(is); return; } bhsno = (struct iscsi_bhs_nop_out *)request->ip_bhs; bhsno->bhsno_opcode = ISCSI_BHS_OPCODE_NOP_OUT | ISCSI_BHS_OPCODE_IMMEDIATE; bhsno->bhsno_flags = 0x80; bhsno->bhsno_initiator_task_tag = 0xffffffff; bhsno->bhsno_target_transfer_tag = bhsni->bhsni_target_transfer_tag; if (datasize > 0) { error = icl_pdu_append_data(request, data, datasize, M_NOWAIT); if (error != 0) { ISCSI_SESSION_WARN(is, "failed to allocate memory; " "reconnecting"); free(data, M_ISCSI); icl_pdu_free(request); icl_pdu_free(response); iscsi_session_reconnect(is); return; } free(data, M_ISCSI); } icl_pdu_free(response); iscsi_pdu_queue_locked(request); } static void iscsi_pdu_handle_scsi_response(struct icl_pdu *response) { struct iscsi_bhs_scsi_response *bhssr; struct iscsi_outstanding *io; struct iscsi_session *is; union ccb *ccb; struct ccb_scsiio *csio; size_t data_segment_len, received; uint16_t sense_len; uint32_t resid; is = PDU_SESSION(response); bhssr = (struct iscsi_bhs_scsi_response *)response->ip_bhs; io = iscsi_outstanding_find(is, bhssr->bhssr_initiator_task_tag); if (io == NULL || io->io_ccb == NULL) { ISCSI_SESSION_WARN(is, "bad itt 0x%x", bhssr->bhssr_initiator_task_tag); icl_pdu_free(response); iscsi_session_reconnect(is); ISCSI_SESSION_UNLOCK(is); return; } ccb = io->io_ccb; /* * With iSER, after getting good response we can be sure * that all the data has been successfully transferred. 
*/ if (is->is_conn->ic_iser) { resid = ntohl(bhssr->bhssr_residual_count); if (bhssr->bhssr_flags & BHSSR_FLAGS_RESIDUAL_UNDERFLOW) { io->io_received = ccb->csio.dxfer_len - resid; } else if (bhssr->bhssr_flags & BHSSR_FLAGS_RESIDUAL_OVERFLOW) { ISCSI_SESSION_WARN(is, "overflow: target indicates %d", resid); } else { io->io_received = ccb->csio.dxfer_len; } } received = io->io_received; iscsi_outstanding_remove(is, io); ISCSI_SESSION_UNLOCK(is); if (bhssr->bhssr_response != BHSSR_RESPONSE_COMMAND_COMPLETED) { ISCSI_SESSION_WARN(is, "service response 0x%x", bhssr->bhssr_response); if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) { xpt_freeze_devq(ccb->ccb_h.path, 1); ISCSI_SESSION_DEBUG(is, "freezing devq"); } ccb->ccb_h.status = CAM_REQ_CMP_ERR | CAM_DEV_QFRZN; } else if (bhssr->bhssr_status == 0) { ccb->ccb_h.status = CAM_REQ_CMP; } else { if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) { xpt_freeze_devq(ccb->ccb_h.path, 1); ISCSI_SESSION_DEBUG(is, "freezing devq"); } ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR | CAM_DEV_QFRZN; ccb->csio.scsi_status = bhssr->bhssr_status; } csio = &ccb->csio; data_segment_len = icl_pdu_data_segment_length(response); if (data_segment_len > 0) { if (data_segment_len < sizeof(sense_len)) { ISCSI_SESSION_WARN(is, "truncated data segment (%zd bytes)", data_segment_len); if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) { xpt_freeze_devq(ccb->ccb_h.path, 1); ISCSI_SESSION_DEBUG(is, "freezing devq"); } ccb->ccb_h.status = CAM_REQ_CMP_ERR | CAM_DEV_QFRZN; goto out; } icl_pdu_get_data(response, 0, &sense_len, sizeof(sense_len)); sense_len = ntohs(sense_len); #if 0 ISCSI_SESSION_DEBUG(is, "sense_len %d, data len %zd", sense_len, data_segment_len); #endif if (sizeof(sense_len) + sense_len > data_segment_len) { ISCSI_SESSION_WARN(is, "truncated data segment " "(%zd bytes, should be %zd)", data_segment_len, sizeof(sense_len) + sense_len); if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) { xpt_freeze_devq(ccb->ccb_h.path, 1); ISCSI_SESSION_DEBUG(is, "freezing devq"); } ccb->ccb_h.status = CAM_REQ_CMP_ERR | CAM_DEV_QFRZN; goto out; } else if (sizeof(sense_len) + sense_len < data_segment_len) ISCSI_SESSION_WARN(is, "oversize data segment " "(%zd bytes, should be %zd)", data_segment_len, sizeof(sense_len) + sense_len); if (sense_len > csio->sense_len) { ISCSI_SESSION_DEBUG(is, "truncating sense from %d to %d", sense_len, csio->sense_len); sense_len = csio->sense_len; } icl_pdu_get_data(response, sizeof(sense_len), &csio->sense_data, sense_len); csio->sense_resid = csio->sense_len - sense_len; ccb->ccb_h.status |= CAM_AUTOSNS_VALID; } out: if (bhssr->bhssr_flags & BHSSR_FLAGS_RESIDUAL_UNDERFLOW) csio->resid = ntohl(bhssr->bhssr_residual_count); if ((csio->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) { KASSERT(received <= csio->dxfer_len, ("received > csio->dxfer_len")); if (received < csio->dxfer_len) { if (csio->resid != csio->dxfer_len - received) { ISCSI_SESSION_WARN(is, "underflow mismatch: " "target indicates %d, we calculated %zd", csio->resid, csio->dxfer_len - received); } csio->resid = csio->dxfer_len - received; } } xpt_done(ccb); icl_pdu_free(response); } static void iscsi_pdu_handle_task_response(struct icl_pdu *response) { struct iscsi_bhs_task_management_response *bhstmr; struct iscsi_outstanding *io, *aio; struct iscsi_session *is; is = PDU_SESSION(response); bhstmr = (struct iscsi_bhs_task_management_response *)response->ip_bhs; io = iscsi_outstanding_find(is, bhstmr->bhstmr_initiator_task_tag); if (io == NULL || io->io_ccb != NULL) { ISCSI_SESSION_WARN(is, "bad itt 0x%x", 
bhstmr->bhstmr_initiator_task_tag); icl_pdu_free(response); iscsi_session_reconnect(is); return; } if (bhstmr->bhstmr_response != BHSTMR_RESPONSE_FUNCTION_COMPLETE) { ISCSI_SESSION_WARN(is, "task response 0x%x", bhstmr->bhstmr_response); } else { aio = iscsi_outstanding_find(is, io->io_datasn); if (aio != NULL && aio->io_ccb != NULL) iscsi_session_terminate_task(is, aio, CAM_REQ_ABORTED); } iscsi_outstanding_remove(is, io); icl_pdu_free(response); } static void iscsi_pdu_handle_data_in(struct icl_pdu *response) { struct iscsi_bhs_data_in *bhsdi; struct iscsi_outstanding *io; struct iscsi_session *is; union ccb *ccb; struct ccb_scsiio *csio; size_t data_segment_len, received, oreceived; is = PDU_SESSION(response); bhsdi = (struct iscsi_bhs_data_in *)response->ip_bhs; io = iscsi_outstanding_find(is, bhsdi->bhsdi_initiator_task_tag); if (io == NULL || io->io_ccb == NULL) { ISCSI_SESSION_WARN(is, "bad itt 0x%x", bhsdi->bhsdi_initiator_task_tag); icl_pdu_free(response); iscsi_session_reconnect(is); ISCSI_SESSION_UNLOCK(is); return; } data_segment_len = icl_pdu_data_segment_length(response); if (data_segment_len == 0) { /* * "The sending of 0 length data segments should be avoided, * but initiators and targets MUST be able to properly receive * 0 length data segments." */ ISCSI_SESSION_UNLOCK(is); icl_pdu_free(response); return; } /* * We need to track this for security reasons - without it, malicious target * could respond to SCSI READ without sending Data-In PDUs, which would result * in read operation on the initiator side returning random kernel data. */ if (ntohl(bhsdi->bhsdi_buffer_offset) != io->io_received) { ISCSI_SESSION_WARN(is, "data out of order; expected offset %zd, got %zd", io->io_received, (size_t)ntohl(bhsdi->bhsdi_buffer_offset)); icl_pdu_free(response); iscsi_session_reconnect(is); ISCSI_SESSION_UNLOCK(is); return; } ccb = io->io_ccb; csio = &ccb->csio; if (io->io_received + data_segment_len > csio->dxfer_len) { ISCSI_SESSION_WARN(is, "oversize data segment (%zd bytes " "at offset %zd, buffer is %d)", data_segment_len, io->io_received, csio->dxfer_len); icl_pdu_free(response); iscsi_session_reconnect(is); ISCSI_SESSION_UNLOCK(is); return; } oreceived = io->io_received; io->io_received += data_segment_len; received = io->io_received; if ((bhsdi->bhsdi_flags & BHSDI_FLAGS_S) != 0) iscsi_outstanding_remove(is, io); ISCSI_SESSION_UNLOCK(is); icl_pdu_get_data(response, 0, csio->data_ptr + oreceived, data_segment_len); /* * XXX: Check DataSN. * XXX: Check F. */ if ((bhsdi->bhsdi_flags & BHSDI_FLAGS_S) == 0) { /* * Nothing more to do. 
*/ icl_pdu_free(response); return; } //ISCSI_SESSION_DEBUG(is, "got S flag; status 0x%x", bhsdi->bhsdi_status); if (bhsdi->bhsdi_status == 0) { ccb->ccb_h.status = CAM_REQ_CMP; } else { if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) { xpt_freeze_devq(ccb->ccb_h.path, 1); ISCSI_SESSION_DEBUG(is, "freezing devq"); } ccb->ccb_h.status = CAM_SCSI_STATUS_ERROR | CAM_DEV_QFRZN; csio->scsi_status = bhsdi->bhsdi_status; } if ((csio->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) { KASSERT(received <= csio->dxfer_len, ("received > csio->dxfer_len")); if (received < csio->dxfer_len) { csio->resid = ntohl(bhsdi->bhsdi_residual_count); if (csio->resid != csio->dxfer_len - received) { ISCSI_SESSION_WARN(is, "underflow mismatch: " "target indicates %d, we calculated %zd", csio->resid, csio->dxfer_len - received); } csio->resid = csio->dxfer_len - received; } } xpt_done(ccb); icl_pdu_free(response); } static void iscsi_pdu_handle_logout_response(struct icl_pdu *response) { ISCSI_SESSION_DEBUG(PDU_SESSION(response), "logout response"); icl_pdu_free(response); } static void iscsi_pdu_handle_r2t(struct icl_pdu *response) { struct icl_pdu *request; struct iscsi_session *is; struct iscsi_bhs_r2t *bhsr2t; struct iscsi_bhs_data_out *bhsdo; struct iscsi_outstanding *io; struct ccb_scsiio *csio; size_t off, len, total_len; int error; is = PDU_SESSION(response); bhsr2t = (struct iscsi_bhs_r2t *)response->ip_bhs; io = iscsi_outstanding_find(is, bhsr2t->bhsr2t_initiator_task_tag); if (io == NULL || io->io_ccb == NULL) { ISCSI_SESSION_WARN(is, "bad itt 0x%x; reconnecting", bhsr2t->bhsr2t_initiator_task_tag); icl_pdu_free(response); iscsi_session_reconnect(is); return; } csio = &io->io_ccb->csio; if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_OUT) { ISCSI_SESSION_WARN(is, "received R2T for read command; reconnecting"); icl_pdu_free(response); iscsi_session_reconnect(is); return; } /* * XXX: Verify R2TSN. 
*/ io->io_datasn = 0; off = ntohl(bhsr2t->bhsr2t_buffer_offset); if (off > csio->dxfer_len) { ISCSI_SESSION_WARN(is, "target requested invalid offset " "%zd, buffer is is %d; reconnecting", off, csio->dxfer_len); icl_pdu_free(response); iscsi_session_reconnect(is); return; } total_len = ntohl(bhsr2t->bhsr2t_desired_data_transfer_length); if (total_len == 0 || total_len > csio->dxfer_len) { ISCSI_SESSION_WARN(is, "target requested invalid length " "%zd, buffer is %d; reconnecting", total_len, csio->dxfer_len); icl_pdu_free(response); iscsi_session_reconnect(is); return; } //ISCSI_SESSION_DEBUG(is, "r2t; off %zd, len %zd", off, total_len); for (;;) { len = total_len; if (len > is->is_max_send_data_segment_length) len = is->is_max_send_data_segment_length; if (off + len > csio->dxfer_len) { ISCSI_SESSION_WARN(is, "target requested invalid " "length/offset %zd, buffer is %d; reconnecting", off + len, csio->dxfer_len); icl_pdu_free(response); iscsi_session_reconnect(is); return; } request = icl_pdu_new(response->ip_conn, M_NOWAIT); if (request == NULL) { icl_pdu_free(response); iscsi_session_reconnect(is); return; } bhsdo = (struct iscsi_bhs_data_out *)request->ip_bhs; bhsdo->bhsdo_opcode = ISCSI_BHS_OPCODE_SCSI_DATA_OUT; bhsdo->bhsdo_lun = bhsr2t->bhsr2t_lun; bhsdo->bhsdo_initiator_task_tag = bhsr2t->bhsr2t_initiator_task_tag; bhsdo->bhsdo_target_transfer_tag = bhsr2t->bhsr2t_target_transfer_tag; bhsdo->bhsdo_datasn = htonl(io->io_datasn++); bhsdo->bhsdo_buffer_offset = htonl(off); error = icl_pdu_append_data(request, csio->data_ptr + off, len, M_NOWAIT); if (error != 0) { ISCSI_SESSION_WARN(is, "failed to allocate memory; " "reconnecting"); icl_pdu_free(request); icl_pdu_free(response); iscsi_session_reconnect(is); return; } off += len; total_len -= len; if (total_len == 0) { bhsdo->bhsdo_flags |= BHSDO_FLAGS_F; //ISCSI_SESSION_DEBUG(is, "setting F, off %zd", off); } else { //ISCSI_SESSION_DEBUG(is, "not finished, off %zd", off); } iscsi_pdu_queue_locked(request); if (total_len == 0) break; } icl_pdu_free(response); } static void iscsi_pdu_handle_async_message(struct icl_pdu *response) { struct iscsi_bhs_asynchronous_message *bhsam; struct iscsi_session *is; is = PDU_SESSION(response); bhsam = (struct iscsi_bhs_asynchronous_message *)response->ip_bhs; switch (bhsam->bhsam_async_event) { case BHSAM_EVENT_TARGET_REQUESTS_LOGOUT: ISCSI_SESSION_WARN(is, "target requests logout; removing session"); iscsi_session_logout(is); iscsi_session_terminate(is); break; case BHSAM_EVENT_TARGET_TERMINATES_CONNECTION: ISCSI_SESSION_WARN(is, "target indicates it will drop the connection"); break; case BHSAM_EVENT_TARGET_TERMINATES_SESSION: ISCSI_SESSION_WARN(is, "target indicates it will drop the session"); break; default: /* * XXX: Technically, we're obligated to also handle * parameter renegotiation. 
*/ ISCSI_SESSION_WARN(is, "ignoring AsyncEvent %d", bhsam->bhsam_async_event); break; } icl_pdu_free(response); } static void iscsi_pdu_handle_reject(struct icl_pdu *response) { struct iscsi_bhs_reject *bhsr; struct iscsi_session *is; is = PDU_SESSION(response); bhsr = (struct iscsi_bhs_reject *)response->ip_bhs; ISCSI_SESSION_WARN(is, "received Reject PDU, reason 0x%x; protocol error?", bhsr->bhsr_reason); icl_pdu_free(response); } static int iscsi_ioctl_daemon_wait(struct iscsi_softc *sc, struct iscsi_daemon_request *request) { struct iscsi_session *is; struct icl_drv_limits idl; int error; sx_slock(&sc->sc_lock); for (;;) { TAILQ_FOREACH(is, &sc->sc_sessions, is_next) { ISCSI_SESSION_LOCK(is); if (is->is_conf.isc_enable == 0 && is->is_conf.isc_discovery == 0) { ISCSI_SESSION_UNLOCK(is); continue; } if (is->is_waiting_for_iscsid) break; ISCSI_SESSION_UNLOCK(is); } if (is == NULL) { /* * No session requires attention from iscsid(8); wait. */ error = cv_wait_sig(&sc->sc_cv, &sc->sc_lock); if (error != 0) { sx_sunlock(&sc->sc_lock); return (error); } continue; } is->is_waiting_for_iscsid = false; is->is_login_phase = true; is->is_reason[0] = '\0'; ISCSI_SESSION_UNLOCK(is); request->idr_session_id = is->is_id; memcpy(&request->idr_isid, &is->is_isid, sizeof(request->idr_isid)); request->idr_tsih = 0; /* New or reinstated session. */ memcpy(&request->idr_conf, &is->is_conf, sizeof(request->idr_conf)); error = icl_limits(is->is_conf.isc_offload, is->is_conf.isc_iser, &idl); if (error != 0) { ISCSI_SESSION_WARN(is, "icl_limits for offload \"%s\" " "failed with error %d", is->is_conf.isc_offload, error); sx_sunlock(&sc->sc_lock); return (error); } request->idr_limits.isl_max_recv_data_segment_length = idl.idl_max_recv_data_segment_length; request->idr_limits.isl_max_send_data_segment_length = idl.idl_max_send_data_segment_length; request->idr_limits.isl_max_burst_length = idl.idl_max_burst_length; request->idr_limits.isl_first_burst_length = idl.idl_first_burst_length; sx_sunlock(&sc->sc_lock); return (0); } } static int iscsi_ioctl_daemon_handoff(struct iscsi_softc *sc, struct iscsi_daemon_handoff *handoff) { struct iscsi_session *is; struct icl_conn *ic; int error; sx_slock(&sc->sc_lock); /* * Find the session to hand off socket to. */ TAILQ_FOREACH(is, &sc->sc_sessions, is_next) { if (is->is_id == handoff->idh_session_id) break; } if (is == NULL) { sx_sunlock(&sc->sc_lock); return (ESRCH); } ISCSI_SESSION_LOCK(is); ic = is->is_conn; if (is->is_conf.isc_discovery || is->is_terminating) { ISCSI_SESSION_UNLOCK(is); sx_sunlock(&sc->sc_lock); return (EINVAL); } if (is->is_connected) { /* * This might have happened because another iscsid(8) * instance handed off the connection in the meantime. * Just return. 
*/ ISCSI_SESSION_WARN(is, "handoff on already connected " "session"); ISCSI_SESSION_UNLOCK(is); sx_sunlock(&sc->sc_lock); return (EBUSY); } strlcpy(is->is_target_alias, handoff->idh_target_alias, sizeof(is->is_target_alias)); is->is_tsih = handoff->idh_tsih; is->is_statsn = handoff->idh_statsn; is->is_initial_r2t = handoff->idh_initial_r2t; is->is_immediate_data = handoff->idh_immediate_data; is->is_max_recv_data_segment_length = handoff->idh_max_recv_data_segment_length; is->is_max_send_data_segment_length = handoff->idh_max_send_data_segment_length; is->is_max_burst_length = handoff->idh_max_burst_length; is->is_first_burst_length = handoff->idh_first_burst_length; if (handoff->idh_header_digest == ISCSI_DIGEST_CRC32C) ic->ic_header_crc32c = true; else ic->ic_header_crc32c = false; if (handoff->idh_data_digest == ISCSI_DIGEST_CRC32C) ic->ic_data_crc32c = true; else ic->ic_data_crc32c = false; ic->ic_maxtags = maxtags; is->is_cmdsn = 0; is->is_expcmdsn = 0; is->is_maxcmdsn = 0; is->is_waiting_for_iscsid = false; is->is_login_phase = false; is->is_timeout = 0; is->is_connected = true; is->is_reason[0] = '\0'; ISCSI_SESSION_UNLOCK(is); /* * If we're going through the proxy, the idh_socket will be 0, * and the ICL module can simply ignore this call. It can also * use it to determine it's no longer in the Login phase. */ error = icl_conn_handoff(ic, handoff->idh_socket); if (error != 0) { sx_sunlock(&sc->sc_lock); iscsi_session_terminate(is); return (error); } sx_sunlock(&sc->sc_lock); if (is->is_sim != NULL) { /* * When reconnecting, there already is SIM allocated for the session. */ KASSERT(is->is_simq_frozen, ("reconnect without frozen simq")); ISCSI_SESSION_LOCK(is); ISCSI_SESSION_DEBUG(is, "releasing"); xpt_release_simq(is->is_sim, 1); is->is_simq_frozen = false; ISCSI_SESSION_UNLOCK(is); } else { ISCSI_SESSION_LOCK(is); is->is_devq = cam_simq_alloc(ic->ic_maxtags); if (is->is_devq == NULL) { ISCSI_SESSION_WARN(is, "failed to allocate simq"); iscsi_session_terminate(is); return (ENOMEM); } is->is_sim = cam_sim_alloc(iscsi_action, iscsi_poll, "iscsi", is, is->is_id /* unit */, &is->is_lock, 1, ic->ic_maxtags, is->is_devq); if (is->is_sim == NULL) { ISCSI_SESSION_UNLOCK(is); ISCSI_SESSION_WARN(is, "failed to allocate SIM"); cam_simq_free(is->is_devq); iscsi_session_terminate(is); return (ENOMEM); } error = xpt_bus_register(is->is_sim, NULL, 0); if (error != 0) { ISCSI_SESSION_UNLOCK(is); ISCSI_SESSION_WARN(is, "failed to register bus"); iscsi_session_terminate(is); return (ENOMEM); } error = xpt_create_path(&is->is_path, /*periph*/NULL, cam_sim_path(is->is_sim), CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD); if (error != CAM_REQ_CMP) { ISCSI_SESSION_UNLOCK(is); ISCSI_SESSION_WARN(is, "failed to create path"); iscsi_session_terminate(is); return (ENOMEM); } ISCSI_SESSION_UNLOCK(is); } return (0); } static int iscsi_ioctl_daemon_fail(struct iscsi_softc *sc, struct iscsi_daemon_fail *fail) { struct iscsi_session *is; sx_slock(&sc->sc_lock); TAILQ_FOREACH(is, &sc->sc_sessions, is_next) { if (is->is_id == fail->idf_session_id) break; } if (is == NULL) { sx_sunlock(&sc->sc_lock); return (ESRCH); } ISCSI_SESSION_LOCK(is); ISCSI_SESSION_DEBUG(is, "iscsid(8) failed: %s", fail->idf_reason); strlcpy(is->is_reason, fail->idf_reason, sizeof(is->is_reason)); //is->is_waiting_for_iscsid = false; //is->is_login_phase = true; //iscsi_session_reconnect(is); ISCSI_SESSION_UNLOCK(is); sx_sunlock(&sc->sc_lock); return (0); } #ifdef ICL_KERNEL_PROXY static int iscsi_ioctl_daemon_connect(struct iscsi_softc *sc, struct 
iscsi_daemon_connect *idc) { struct iscsi_session *is; struct sockaddr *from_sa, *to_sa; int error; sx_slock(&sc->sc_lock); TAILQ_FOREACH(is, &sc->sc_sessions, is_next) { if (is->is_id == idc->idc_session_id) break; } if (is == NULL) { sx_sunlock(&sc->sc_lock); return (ESRCH); } sx_sunlock(&sc->sc_lock); if (idc->idc_from_addrlen > 0) { error = getsockaddr(&from_sa, (void *)idc->idc_from_addr, idc->idc_from_addrlen); if (error != 0) { ISCSI_SESSION_WARN(is, "getsockaddr failed with error %d", error); return (error); } } else { from_sa = NULL; } error = getsockaddr(&to_sa, (void *)idc->idc_to_addr, idc->idc_to_addrlen); if (error != 0) { ISCSI_SESSION_WARN(is, "getsockaddr failed with error %d", error); free(from_sa, M_SONAME); return (error); } ISCSI_SESSION_LOCK(is); is->is_statsn = 0; is->is_cmdsn = 0; is->is_expcmdsn = 0; is->is_maxcmdsn = 0; is->is_waiting_for_iscsid = false; is->is_login_phase = true; is->is_timeout = 0; ISCSI_SESSION_UNLOCK(is); error = icl_conn_connect(is->is_conn, idc->idc_domain, idc->idc_socktype, idc->idc_protocol, from_sa, to_sa); free(from_sa, M_SONAME); free(to_sa, M_SONAME); /* * Digests are always disabled during login phase. */ is->is_conn->ic_header_crc32c = false; is->is_conn->ic_data_crc32c = false; return (error); } static int iscsi_ioctl_daemon_send(struct iscsi_softc *sc, struct iscsi_daemon_send *ids) { struct iscsi_session *is; struct icl_pdu *ip; size_t datalen; void *data; int error; sx_slock(&sc->sc_lock); TAILQ_FOREACH(is, &sc->sc_sessions, is_next) { if (is->is_id == ids->ids_session_id) break; } if (is == NULL) { sx_sunlock(&sc->sc_lock); return (ESRCH); } sx_sunlock(&sc->sc_lock); if (is->is_login_phase == false) return (EBUSY); if (is->is_terminating || is->is_reconnecting) return (EIO); datalen = ids->ids_data_segment_len; if (datalen > is->is_max_send_data_segment_length) return (EINVAL); if (datalen > 0) { data = malloc(datalen, M_ISCSI, M_WAITOK); error = copyin(ids->ids_data_segment, data, datalen); if (error != 0) { free(data, M_ISCSI); return (error); } } ip = icl_pdu_new(is->is_conn, M_WAITOK); memcpy(ip->ip_bhs, ids->ids_bhs, sizeof(*ip->ip_bhs)); if (datalen > 0) { error = icl_pdu_append_data(ip, data, datalen, M_WAITOK); KASSERT(error == 0, ("icl_pdu_append_data(..., M_WAITOK) failed")); free(data, M_ISCSI); } iscsi_pdu_queue(ip); return (0); } static int iscsi_ioctl_daemon_receive(struct iscsi_softc *sc, struct iscsi_daemon_receive *idr) { struct iscsi_session *is; struct icl_pdu *ip; void *data; int error; sx_slock(&sc->sc_lock); TAILQ_FOREACH(is, &sc->sc_sessions, is_next) { if (is->is_id == idr->idr_session_id) break; } if (is == NULL) { sx_sunlock(&sc->sc_lock); return (ESRCH); } sx_sunlock(&sc->sc_lock); if (is->is_login_phase == false) return (EBUSY); ISCSI_SESSION_LOCK(is); while (is->is_login_pdu == NULL && is->is_terminating == false && is->is_reconnecting == false) { error = cv_wait_sig(&is->is_login_cv, &is->is_lock); if (error != 0) { ISCSI_SESSION_UNLOCK(is); return (error); } } if (is->is_terminating || is->is_reconnecting) { ISCSI_SESSION_UNLOCK(is); return (EIO); } ip = is->is_login_pdu; is->is_login_pdu = NULL; ISCSI_SESSION_UNLOCK(is); if (ip->ip_data_len > idr->idr_data_segment_len) { icl_pdu_free(ip); return (EMSGSIZE); } copyout(ip->ip_bhs, idr->idr_bhs, sizeof(*ip->ip_bhs)); if (ip->ip_data_len > 0) { data = malloc(ip->ip_data_len, M_ISCSI, M_WAITOK); icl_pdu_get_data(ip, 0, data, ip->ip_data_len); copyout(data, idr->idr_data_segment, ip->ip_data_len); free(data, M_ISCSI); } icl_pdu_free(ip); return (0); } 
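/*
 * Illustrative sketch (not part of the driver): with ICL_KERNEL_PROXY the
 * connection lives in the kernel and iscsid(8) only shuttles Login Phase
 * PDUs through the three ioctls above.  The skeleton below shows the
 * expected ordering; the structure members and ioctl names come from this
 * file, while the helper functions, the BHS buffer handling and the include
 * paths are assumptions made for illustration.
 */
#if 0
#include <sys/ioctl.h>
#include <string.h>
#include <dev/iscsi/iscsi_ioctl.h>	/* assumed include path */
#include <dev/iscsi/iscsi_proto.h>	/* assumed include path */

static int
proxy_login(int iscsi_fd, unsigned int session_id)
{
	struct iscsi_daemon_connect idc;
	struct iscsi_daemon_send ids;
	struct iscsi_daemon_receive idr;
	struct iscsi_bhs req_bhs, resp_bhs;
	char data[8192];

	/* 1. Ask the kernel to establish the transport connection. */
	memset(&idc, 0, sizeof(idc));
	idc.idc_session_id = session_id;
	/* idc_domain, idc_socktype, idc_protocol and the from/to sockaddrs
	   would be filled in from the resolved target address; omitted here. */
	if (ioctl(iscsi_fd, ISCSIDCONNECT, &idc) == -1)
		return (-1);

	/* 2. Exchange Login Request/Response PDUs until login completes. */
	for (;;) {
		build_login_request(&req_bhs, data);	/* assumed helper */

		memset(&ids, 0, sizeof(ids));
		ids.ids_session_id = session_id;
		ids.ids_bhs = &req_bhs;			/* assumed pointer field */
		ids.ids_data_segment = data;
		ids.ids_data_segment_len = login_data_len(data); /* assumed helper */
		if (ioctl(iscsi_fd, ISCSIDSEND, &ids) == -1)
			return (-1);

		memset(&idr, 0, sizeof(idr));
		idr.idr_session_id = session_id;
		idr.idr_bhs = &resp_bhs;
		idr.idr_data_segment = data;
		idr.idr_data_segment_len = sizeof(data);
		if (ioctl(iscsi_fd, ISCSIDRECEIVE, &idr) == -1)
			return (-1);

		if (login_complete(&resp_bhs))		/* assumed helper */
			break;
	}

	/*
	 * 3. Hand off as usual; as noted in iscsi_ioctl_daemon_handoff(),
	 *    idh_socket stays 0 when going through the proxy.
	 */
	return (0);
}
#endif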
#endif /* ICL_KERNEL_PROXY */ static void iscsi_sanitize_session_conf(struct iscsi_session_conf *isc) { /* * Just make sure all the fields are null-terminated. * * XXX: This is not particularly secure. We should * create our own conf and then copy in relevant * fields. */ isc->isc_initiator[ISCSI_NAME_LEN - 1] = '\0'; isc->isc_initiator_addr[ISCSI_ADDR_LEN - 1] = '\0'; isc->isc_initiator_alias[ISCSI_ALIAS_LEN - 1] = '\0'; isc->isc_target[ISCSI_NAME_LEN - 1] = '\0'; isc->isc_target_addr[ISCSI_ADDR_LEN - 1] = '\0'; isc->isc_user[ISCSI_NAME_LEN - 1] = '\0'; isc->isc_secret[ISCSI_SECRET_LEN - 1] = '\0'; isc->isc_mutual_user[ISCSI_NAME_LEN - 1] = '\0'; isc->isc_mutual_secret[ISCSI_SECRET_LEN - 1] = '\0'; } static bool iscsi_valid_session_conf(const struct iscsi_session_conf *isc) { if (isc->isc_initiator[0] == '\0') { ISCSI_DEBUG("empty isc_initiator"); return (false); } if (isc->isc_target_addr[0] == '\0') { ISCSI_DEBUG("empty isc_target_addr"); return (false); } if (isc->isc_discovery != 0 && isc->isc_target[0] != 0) { ISCSI_DEBUG("non-empty isc_target for discovery session"); return (false); } if (isc->isc_discovery == 0 && isc->isc_target[0] == 0) { ISCSI_DEBUG("empty isc_target for non-discovery session"); return (false); } return (true); } static int iscsi_ioctl_session_add(struct iscsi_softc *sc, struct iscsi_session_add *isa) { struct iscsi_session *is; const struct iscsi_session *is2; int error; iscsi_sanitize_session_conf(&isa->isa_conf); if (iscsi_valid_session_conf(&isa->isa_conf) == false) return (EINVAL); is = malloc(sizeof(*is), M_ISCSI, M_ZERO | M_WAITOK); memcpy(&is->is_conf, &isa->isa_conf, sizeof(is->is_conf)); /* * Set some default values, from RFC 3720, section 12. * * These values are updated by the handoff IOCTL, but are * needed prior to the handoff to support sending the ISER * login PDU. */ is->is_max_recv_data_segment_length = 8192; is->is_max_send_data_segment_length = 8192; is->is_max_burst_length = 262144; is->is_first_burst_length = 65536; sx_xlock(&sc->sc_lock); /* * Prevent duplicates. */ TAILQ_FOREACH(is2, &sc->sc_sessions, is_next) { if (!!is->is_conf.isc_discovery != !!is2->is_conf.isc_discovery) continue; if (strcmp(is->is_conf.isc_target_addr, is2->is_conf.isc_target_addr) != 0) continue; if (is->is_conf.isc_discovery == 0 && strcmp(is->is_conf.isc_target, is2->is_conf.isc_target) != 0) continue; sx_xunlock(&sc->sc_lock); free(is, M_ISCSI); return (EBUSY); } is->is_conn = icl_new_conn(is->is_conf.isc_offload, is->is_conf.isc_iser, "iscsi", &is->is_lock); if (is->is_conn == NULL) { sx_xunlock(&sc->sc_lock); free(is, M_ISCSI); return (EINVAL); } is->is_conn->ic_receive = iscsi_receive_callback; is->is_conn->ic_error = iscsi_error_callback; is->is_conn->ic_prv0 = is; TAILQ_INIT(&is->is_outstanding); STAILQ_INIT(&is->is_postponed); mtx_init(&is->is_lock, "iscsi_lock", NULL, MTX_DEF); cv_init(&is->is_maintenance_cv, "iscsi_mt"); #ifdef ICL_KERNEL_PROXY cv_init(&is->is_login_cv, "iscsi_login"); #endif is->is_softc = sc; sc->sc_last_session_id++; is->is_id = sc->sc_last_session_id; is->is_isid[0] = 0x80; /* RFC 3720, 10.12.5: 10b, "Random" ISID. 
*/ arc4rand(&is->is_isid[1], 5, 0); is->is_tsih = 0; callout_init(&is->is_callout, 1); error = kthread_add(iscsi_maintenance_thread, is, NULL, NULL, 0, 0, "iscsimt"); if (error != 0) { ISCSI_SESSION_WARN(is, "kthread_add(9) failed with error %d", error); sx_xunlock(&sc->sc_lock); return (error); } callout_reset(&is->is_callout, 1 * hz, iscsi_callout, is); TAILQ_INSERT_TAIL(&sc->sc_sessions, is, is_next); ISCSI_SESSION_LOCK(is); /* * Don't notify iscsid(8) if the session is disabled and it's not * a discovery session, */ if (is->is_conf.isc_enable == 0 && is->is_conf.isc_discovery == 0) { ISCSI_SESSION_UNLOCK(is); sx_xunlock(&sc->sc_lock); return (0); } is->is_waiting_for_iscsid = true; strlcpy(is->is_reason, "Waiting for iscsid(8)", sizeof(is->is_reason)); ISCSI_SESSION_UNLOCK(is); cv_signal(&sc->sc_cv); sx_xunlock(&sc->sc_lock); return (0); } static bool iscsi_session_conf_matches(unsigned int id1, const struct iscsi_session_conf *c1, unsigned int id2, const struct iscsi_session_conf *c2) { if (id2 != 0 && id2 != id1) return (false); if (c2->isc_target[0] != '\0' && strcmp(c1->isc_target, c2->isc_target) != 0) return (false); if (c2->isc_target_addr[0] != '\0' && strcmp(c1->isc_target_addr, c2->isc_target_addr) != 0) return (false); return (true); } static int iscsi_ioctl_session_remove(struct iscsi_softc *sc, struct iscsi_session_remove *isr) { struct iscsi_session *is, *tmp; bool found = false; iscsi_sanitize_session_conf(&isr->isr_conf); sx_xlock(&sc->sc_lock); TAILQ_FOREACH_SAFE(is, &sc->sc_sessions, is_next, tmp) { ISCSI_SESSION_LOCK(is); if (iscsi_session_conf_matches(is->is_id, &is->is_conf, isr->isr_session_id, &isr->isr_conf)) { found = true; iscsi_session_logout(is); iscsi_session_terminate(is); } ISCSI_SESSION_UNLOCK(is); } sx_xunlock(&sc->sc_lock); if (!found) return (ESRCH); return (0); } static int iscsi_ioctl_session_list(struct iscsi_softc *sc, struct iscsi_session_list *isl) { int error; unsigned int i = 0; struct iscsi_session *is; struct iscsi_session_state iss; sx_slock(&sc->sc_lock); TAILQ_FOREACH(is, &sc->sc_sessions, is_next) { if (i >= isl->isl_nentries) { sx_sunlock(&sc->sc_lock); return (EMSGSIZE); } memset(&iss, 0, sizeof(iss)); memcpy(&iss.iss_conf, &is->is_conf, sizeof(iss.iss_conf)); iss.iss_id = is->is_id; strlcpy(iss.iss_target_alias, is->is_target_alias, sizeof(iss.iss_target_alias)); strlcpy(iss.iss_reason, is->is_reason, sizeof(iss.iss_reason)); strlcpy(iss.iss_offload, is->is_conn->ic_offload, sizeof(iss.iss_offload)); if (is->is_conn->ic_header_crc32c) iss.iss_header_digest = ISCSI_DIGEST_CRC32C; else iss.iss_header_digest = ISCSI_DIGEST_NONE; if (is->is_conn->ic_data_crc32c) iss.iss_data_digest = ISCSI_DIGEST_CRC32C; else iss.iss_data_digest = ISCSI_DIGEST_NONE; iss.iss_max_send_data_segment_length = is->is_max_send_data_segment_length; iss.iss_max_recv_data_segment_length = is->is_max_recv_data_segment_length; iss.iss_max_burst_length = is->is_max_burst_length; iss.iss_first_burst_length = is->is_first_burst_length; iss.iss_immediate_data = is->is_immediate_data; iss.iss_connected = is->is_connected; error = copyout(&iss, isl->isl_pstates + i, sizeof(iss)); if (error != 0) { sx_sunlock(&sc->sc_lock); return (error); } i++; } sx_sunlock(&sc->sc_lock); isl->isl_nentries = i; return (0); } static int iscsi_ioctl_session_modify(struct iscsi_softc *sc, struct iscsi_session_modify *ism) { struct iscsi_session *is; const struct iscsi_session *is2; iscsi_sanitize_session_conf(&ism->ism_conf); if (iscsi_valid_session_conf(&ism->ism_conf) == false) return 
(EINVAL); sx_xlock(&sc->sc_lock); TAILQ_FOREACH(is, &sc->sc_sessions, is_next) { ISCSI_SESSION_LOCK(is); if (is->is_id == ism->ism_session_id) { /* Note that the session remains locked. */ break; } ISCSI_SESSION_UNLOCK(is); } if (is == NULL) { sx_xunlock(&sc->sc_lock); return (ESRCH); } /* * Prevent duplicates. */ TAILQ_FOREACH(is2, &sc->sc_sessions, is_next) { if (is == is2) continue; if (!!ism->ism_conf.isc_discovery != !!is2->is_conf.isc_discovery) continue; if (strcmp(ism->ism_conf.isc_target_addr, is2->is_conf.isc_target_addr) != 0) continue; if (ism->ism_conf.isc_discovery == 0 && strcmp(ism->ism_conf.isc_target, is2->is_conf.isc_target) != 0) continue; ISCSI_SESSION_UNLOCK(is); sx_xunlock(&sc->sc_lock); return (EBUSY); } sx_xunlock(&sc->sc_lock); memcpy(&is->is_conf, &ism->ism_conf, sizeof(is->is_conf)); ISCSI_SESSION_UNLOCK(is); iscsi_session_reconnect(is); return (0); } static int iscsi_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int mode, struct thread *td) { struct iscsi_softc *sc; sc = dev->si_drv1; switch (cmd) { case ISCSIDWAIT: return (iscsi_ioctl_daemon_wait(sc, (struct iscsi_daemon_request *)arg)); case ISCSIDHANDOFF: return (iscsi_ioctl_daemon_handoff(sc, (struct iscsi_daemon_handoff *)arg)); case ISCSIDFAIL: return (iscsi_ioctl_daemon_fail(sc, (struct iscsi_daemon_fail *)arg)); #ifdef ICL_KERNEL_PROXY case ISCSIDCONNECT: return (iscsi_ioctl_daemon_connect(sc, (struct iscsi_daemon_connect *)arg)); case ISCSIDSEND: return (iscsi_ioctl_daemon_send(sc, (struct iscsi_daemon_send *)arg)); case ISCSIDRECEIVE: return (iscsi_ioctl_daemon_receive(sc, (struct iscsi_daemon_receive *)arg)); #endif /* ICL_KERNEL_PROXY */ case ISCSISADD: return (iscsi_ioctl_session_add(sc, (struct iscsi_session_add *)arg)); case ISCSISREMOVE: return (iscsi_ioctl_session_remove(sc, (struct iscsi_session_remove *)arg)); case ISCSISLIST: return (iscsi_ioctl_session_list(sc, (struct iscsi_session_list *)arg)); case ISCSISMODIFY: return (iscsi_ioctl_session_modify(sc, (struct iscsi_session_modify *)arg)); default: return (EINVAL); } } static struct iscsi_outstanding * iscsi_outstanding_find(struct iscsi_session *is, uint32_t initiator_task_tag) { struct iscsi_outstanding *io; ISCSI_SESSION_LOCK_ASSERT(is); TAILQ_FOREACH(io, &is->is_outstanding, io_next) { if (io->io_initiator_task_tag == initiator_task_tag) return (io); } return (NULL); } static struct iscsi_outstanding * iscsi_outstanding_find_ccb(struct iscsi_session *is, union ccb *ccb) { struct iscsi_outstanding *io; ISCSI_SESSION_LOCK_ASSERT(is); TAILQ_FOREACH(io, &is->is_outstanding, io_next) { if (io->io_ccb == ccb) return (io); } return (NULL); } static struct iscsi_outstanding * iscsi_outstanding_add(struct iscsi_session *is, struct icl_pdu *request, union ccb *ccb, uint32_t *initiator_task_tagp) { struct iscsi_outstanding *io; int error; ISCSI_SESSION_LOCK_ASSERT(is); io = uma_zalloc(iscsi_outstanding_zone, M_NOWAIT | M_ZERO); if (io == NULL) { ISCSI_SESSION_WARN(is, "failed to allocate %zd bytes", sizeof(*io)); return (NULL); } error = icl_conn_task_setup(is->is_conn, request, &ccb->csio, initiator_task_tagp, &io->io_icl_prv); if (error != 0) { ISCSI_SESSION_WARN(is, "icl_conn_task_setup() failed with error %d", error); uma_zfree(iscsi_outstanding_zone, io); return (NULL); } KASSERT(iscsi_outstanding_find(is, *initiator_task_tagp) == NULL, ("initiator_task_tag 0x%x already added", *initiator_task_tagp)); io->io_initiator_task_tag = *initiator_task_tagp; io->io_ccb = ccb; TAILQ_INSERT_TAIL(&is->is_outstanding, io, io_next); return (io); } 
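/*
 * Counterpart to iscsi_outstanding_add(): release the ICL per-task state
 * and free the tracking structure.  Like the lookup routines above, this
 * must be called with the session lock held.  A simplified lifecycle of
 * an outstanding request looks like:
 *
 *	io = iscsi_outstanding_add(is, request, ccb, &initiator_task_tag);
 *	...
 *	io = iscsi_outstanding_find(is, initiator_task_tag);
 *	iscsi_outstanding_remove(is, io);
 */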
static void iscsi_outstanding_remove(struct iscsi_session *is, struct iscsi_outstanding *io) { ISCSI_SESSION_LOCK_ASSERT(is); icl_conn_task_done(is->is_conn, io->io_icl_prv); TAILQ_REMOVE(&is->is_outstanding, io, io_next); uma_zfree(iscsi_outstanding_zone, io); } static void iscsi_action_abort(struct iscsi_session *is, union ccb *ccb) { struct icl_pdu *request; struct iscsi_bhs_task_management_request *bhstmr; struct ccb_abort *cab = &ccb->cab; struct iscsi_outstanding *io, *aio; uint32_t initiator_task_tag; ISCSI_SESSION_LOCK_ASSERT(is); #if 0 KASSERT(is->is_login_phase == false, ("%s called during Login Phase", __func__)); #else if (is->is_login_phase) { ccb->ccb_h.status = CAM_REQ_ABORTED; xpt_done(ccb); return; } #endif aio = iscsi_outstanding_find_ccb(is, cab->abort_ccb); if (aio == NULL) { ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); return; } request = icl_pdu_new(is->is_conn, M_NOWAIT); if (request == NULL) { ccb->ccb_h.status = CAM_RESRC_UNAVAIL; xpt_done(ccb); return; } initiator_task_tag = is->is_initiator_task_tag++; io = iscsi_outstanding_add(is, request, NULL, &initiator_task_tag); if (io == NULL) { icl_pdu_free(request); ccb->ccb_h.status = CAM_RESRC_UNAVAIL; xpt_done(ccb); return; } io->io_datasn = aio->io_initiator_task_tag; bhstmr = (struct iscsi_bhs_task_management_request *)request->ip_bhs; bhstmr->bhstmr_opcode = ISCSI_BHS_OPCODE_TASK_REQUEST; bhstmr->bhstmr_function = 0x80 | BHSTMR_FUNCTION_ABORT_TASK; bhstmr->bhstmr_lun = htobe64(CAM_EXTLUN_BYTE_SWIZZLE(ccb->ccb_h.target_lun)); bhstmr->bhstmr_initiator_task_tag = initiator_task_tag; bhstmr->bhstmr_referenced_task_tag = aio->io_initiator_task_tag; iscsi_pdu_queue_locked(request); } static void iscsi_action_scsiio(struct iscsi_session *is, union ccb *ccb) { struct icl_pdu *request; struct iscsi_bhs_scsi_command *bhssc; struct ccb_scsiio *csio; struct iscsi_outstanding *io; size_t len; uint32_t initiator_task_tag; int error; ISCSI_SESSION_LOCK_ASSERT(is); #if 0 KASSERT(is->is_login_phase == false, ("%s called during Login Phase", __func__)); #else if (is->is_login_phase) { ISCSI_SESSION_DEBUG(is, "called during login phase"); if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) { xpt_freeze_devq(ccb->ccb_h.path, 1); ISCSI_SESSION_DEBUG(is, "freezing devq"); } ccb->ccb_h.status = CAM_REQ_ABORTED | CAM_DEV_QFRZN; xpt_done(ccb); return; } #endif request = icl_pdu_new(is->is_conn, M_NOWAIT); if (request == NULL) { if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) { xpt_freeze_devq(ccb->ccb_h.path, 1); ISCSI_SESSION_DEBUG(is, "freezing devq"); } ccb->ccb_h.status = CAM_RESRC_UNAVAIL | CAM_DEV_QFRZN; xpt_done(ccb); return; } initiator_task_tag = is->is_initiator_task_tag++; io = iscsi_outstanding_add(is, request, ccb, &initiator_task_tag); if (io == NULL) { icl_pdu_free(request); if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) { xpt_freeze_devq(ccb->ccb_h.path, 1); ISCSI_SESSION_DEBUG(is, "freezing devq"); } ccb->ccb_h.status = CAM_RESRC_UNAVAIL | CAM_DEV_QFRZN; xpt_done(ccb); return; } csio = &ccb->csio; bhssc = (struct iscsi_bhs_scsi_command *)request->ip_bhs; bhssc->bhssc_opcode = ISCSI_BHS_OPCODE_SCSI_COMMAND; bhssc->bhssc_flags |= BHSSC_FLAGS_F; switch (csio->ccb_h.flags & CAM_DIR_MASK) { case CAM_DIR_IN: bhssc->bhssc_flags |= BHSSC_FLAGS_R; break; case CAM_DIR_OUT: bhssc->bhssc_flags |= BHSSC_FLAGS_W; break; } if ((ccb->ccb_h.flags & CAM_TAG_ACTION_VALID) != 0) { switch (csio->tag_action) { case MSG_HEAD_OF_Q_TAG: bhssc->bhssc_flags |= BHSSC_FLAGS_ATTR_HOQ; break; case MSG_ORDERED_Q_TAG: bhssc->bhssc_flags |= 
BHSSC_FLAGS_ATTR_ORDERED; break; case MSG_ACA_TASK: bhssc->bhssc_flags |= BHSSC_FLAGS_ATTR_ACA; break; case MSG_SIMPLE_Q_TAG: default: bhssc->bhssc_flags |= BHSSC_FLAGS_ATTR_SIMPLE; break; } } else bhssc->bhssc_flags |= BHSSC_FLAGS_ATTR_UNTAGGED; bhssc->bhssc_lun = htobe64(CAM_EXTLUN_BYTE_SWIZZLE(ccb->ccb_h.target_lun)); bhssc->bhssc_initiator_task_tag = initiator_task_tag; bhssc->bhssc_expected_data_transfer_length = htonl(csio->dxfer_len); KASSERT(csio->cdb_len <= sizeof(bhssc->bhssc_cdb), ("unsupported CDB size %zd", (size_t)csio->cdb_len)); if (csio->ccb_h.flags & CAM_CDB_POINTER) memcpy(&bhssc->bhssc_cdb, csio->cdb_io.cdb_ptr, csio->cdb_len); else memcpy(&bhssc->bhssc_cdb, csio->cdb_io.cdb_bytes, csio->cdb_len); if (is->is_immediate_data && (csio->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT) { len = csio->dxfer_len; //ISCSI_SESSION_DEBUG(is, "adding %zd of immediate data", len); if (len > is->is_first_burst_length) { ISCSI_SESSION_DEBUG(is, "len %zd -> %d", len, is->is_first_burst_length); len = is->is_first_burst_length; } if (len > is->is_max_send_data_segment_length) { ISCSI_SESSION_DEBUG(is, "len %zd -> %d", len, is->is_max_send_data_segment_length); len = is->is_max_send_data_segment_length; } error = icl_pdu_append_data(request, csio->data_ptr, len, M_NOWAIT); if (error != 0) { iscsi_outstanding_remove(is, io); icl_pdu_free(request); if ((ccb->ccb_h.status & CAM_DEV_QFRZN) == 0) { xpt_freeze_devq(ccb->ccb_h.path, 1); ISCSI_SESSION_DEBUG(is, "freezing devq"); } ccb->ccb_h.status = CAM_RESRC_UNAVAIL | CAM_DEV_QFRZN; xpt_done(ccb); return; } } iscsi_pdu_queue_locked(request); } static void iscsi_action(struct cam_sim *sim, union ccb *ccb) { struct iscsi_session *is; is = cam_sim_softc(sim); ISCSI_SESSION_LOCK_ASSERT(is); if (is->is_terminating || (is->is_connected == false && fail_on_disconnection)) { ccb->ccb_h.status = CAM_DEV_NOT_THERE; xpt_done(ccb); return; } switch (ccb->ccb_h.func_code) { case XPT_PATH_INQ: { struct ccb_pathinq *cpi = &ccb->cpi; cpi->version_num = 1; cpi->hba_inquiry = PI_TAG_ABLE; cpi->target_sprt = 0; cpi->hba_misc = PIM_EXTLUNS; /* * XXX: It shouldn't ever be NULL; this could be turned * into a KASSERT eventually. */ if (is->is_conn == NULL) ISCSI_WARN("NULL conn"); else if (is->is_conn->ic_unmapped) cpi->hba_misc |= PIM_UNMAPPED; cpi->hba_eng_cnt = 0; cpi->max_target = 0; /* * Note that the variable below is only relevant for targets * that don't claim compliance with anything above SPC2, which * means they don't support REPORT_LUNS. */ cpi->max_lun = 255; cpi->initiator_id = ~0; strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); strlcpy(cpi->hba_vid, "iSCSI", HBA_IDLEN); strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN); cpi->unit_number = cam_sim_unit(sim); cpi->bus_id = cam_sim_bus(sim); cpi->base_transfer_speed = 150000; /* XXX */ cpi->transport = XPORT_ISCSI; cpi->transport_version = 0; cpi->protocol = PROTO_SCSI; cpi->protocol_version = SCSI_REV_SPC3; cpi->maxio = MAXPHYS; cpi->ccb_h.status = CAM_REQ_CMP; break; } case XPT_GET_TRAN_SETTINGS: { struct ccb_trans_settings *cts; struct ccb_trans_settings_scsi *scsi; cts = &ccb->cts; scsi = &cts->proto_specific.scsi; cts->protocol = PROTO_SCSI; cts->protocol_version = SCSI_REV_SPC3; cts->transport = XPORT_ISCSI; cts->transport_version = 0; scsi->valid = CTS_SCSI_VALID_TQ; scsi->flags = CTS_SCSI_FLAGS_TAG_ENB; cts->ccb_h.status = CAM_REQ_CMP; break; } case XPT_CALC_GEOMETRY: cam_calc_geometry(&ccb->ccg, /*extended*/1); ccb->ccb_h.status = CAM_REQ_CMP; break; #if 0 /* * XXX: What's the point? 
*/ case XPT_RESET_BUS: case XPT_TERM_IO: ISCSI_SESSION_DEBUG(is, "faking success for reset, abort, or term_io"); ccb->ccb_h.status = CAM_REQ_CMP; break; #endif case XPT_ABORT: iscsi_action_abort(is, ccb); return; case XPT_SCSI_IO: iscsi_action_scsiio(is, ccb); return; default: #if 0 ISCSI_SESSION_DEBUG(is, "got unsupported code 0x%x", ccb->ccb_h.func_code); #endif ccb->ccb_h.status = CAM_FUNC_NOTAVAIL; break; } xpt_done(ccb); } static void iscsi_poll(struct cam_sim *sim) { KASSERT(0, ("%s: you're not supposed to be here", __func__)); } static void iscsi_terminate_sessions(struct iscsi_softc *sc) { struct iscsi_session *is; sx_slock(&sc->sc_lock); TAILQ_FOREACH(is, &sc->sc_sessions, is_next) iscsi_session_terminate(is); while(!TAILQ_EMPTY(&sc->sc_sessions)) { ISCSI_DEBUG("waiting for sessions to terminate"); cv_wait(&sc->sc_cv, &sc->sc_lock); } ISCSI_DEBUG("all sessions terminated"); sx_sunlock(&sc->sc_lock); } static void iscsi_shutdown_pre(struct iscsi_softc *sc) { struct iscsi_session *is; if (!fail_on_shutdown) return; /* * If we have any sessions waiting for reconnection, request * maintenance thread to fail them immediately instead of waiting * for reconnect timeout. * * This prevents LUNs with mounted filesystems that are supported * by disconnected iSCSI sessions from hanging, however it will * fail all queued BIOs. */ ISCSI_DEBUG("forcing failing all disconnected sessions due to shutdown"); fail_on_disconnection = 1; sx_slock(&sc->sc_lock); TAILQ_FOREACH(is, &sc->sc_sessions, is_next) { ISCSI_SESSION_LOCK(is); if (!is->is_connected) { ISCSI_SESSION_DEBUG(is, "force failing disconnected session early"); iscsi_session_reconnect(is); } ISCSI_SESSION_UNLOCK(is); } sx_sunlock(&sc->sc_lock); } static void iscsi_shutdown_post(struct iscsi_softc *sc) { if (!KERNEL_PANICKED()) { ISCSI_DEBUG("removing all sessions due to shutdown"); iscsi_terminate_sessions(sc); } } static int iscsi_load(void) { int error; sc = malloc(sizeof(*sc), M_ISCSI, M_ZERO | M_WAITOK); sx_init(&sc->sc_lock, "iscsi"); TAILQ_INIT(&sc->sc_sessions); cv_init(&sc->sc_cv, "iscsi_cv"); iscsi_outstanding_zone = uma_zcreate("iscsi_outstanding", sizeof(struct iscsi_outstanding), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); error = make_dev_p(MAKEDEV_CHECKNAME, &sc->sc_cdev, &iscsi_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "iscsi"); if (error != 0) { ISCSI_WARN("failed to create device node, error %d", error); return (error); } sc->sc_cdev->si_drv1 = sc; sc->sc_shutdown_pre_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync, iscsi_shutdown_pre, sc, SHUTDOWN_PRI_FIRST); /* * shutdown_post_sync needs to run after filesystem shutdown and before * CAM shutdown - otherwise when rebooting with an iSCSI session that is * disconnected but has outstanding requests, dashutdown() will hang on * cam_periph_runccb(). 
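 * Registering below at SHUTDOWN_PRI_DEFAULT - 1 is what provides that
 * ordering: shutdown eventhandlers fire in order of increasing priority
 * value, so this handler runs just ahead of the default-priority
 * handlers (such as dashutdown()) on the shutdown_post_sync list.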
*/ sc->sc_shutdown_post_eh = EVENTHANDLER_REGISTER(shutdown_post_sync, iscsi_shutdown_post, sc, SHUTDOWN_PRI_DEFAULT - 1); return (0); } static int iscsi_unload(void) { if (sc->sc_cdev != NULL) { ISCSI_DEBUG("removing device node"); destroy_dev(sc->sc_cdev); ISCSI_DEBUG("device node removed"); } if (sc->sc_shutdown_pre_eh != NULL) EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->sc_shutdown_pre_eh); if (sc->sc_shutdown_post_eh != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->sc_shutdown_post_eh); iscsi_terminate_sessions(sc); uma_zdestroy(iscsi_outstanding_zone); sx_destroy(&sc->sc_lock); cv_destroy(&sc->sc_cv); free(sc, M_ISCSI); return (0); } static int iscsi_quiesce(void) { sx_slock(&sc->sc_lock); if (!TAILQ_EMPTY(&sc->sc_sessions)) { sx_sunlock(&sc->sc_lock); return (EBUSY); } sx_sunlock(&sc->sc_lock); return (0); } static int iscsi_modevent(module_t mod, int what, void *arg) { int error; switch (what) { case MOD_LOAD: error = iscsi_load(); break; case MOD_UNLOAD: error = iscsi_unload(); break; case MOD_QUIESCE: error = iscsi_quiesce(); break; default: error = EINVAL; break; } return (error); } moduledata_t iscsi_data = { "iscsi", iscsi_modevent, 0 }; DECLARE_MODULE(iscsi, iscsi_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); MODULE_DEPEND(iscsi, cam, 1, 1, 1); MODULE_DEPEND(iscsi, icl, 1, 1, 1); Index: head/sys/fs/autofs/autofs.c =================================================================== --- head/sys/fs/autofs/autofs.c (revision 357970) +++ head/sys/fs/autofs/autofs.c (revision 357971) @@ -1,705 +1,706 @@ /*- * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /*- * Copyright (c) 1989, 1991, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_AUTOFS, "autofs", "Automounter filesystem"); uma_zone_t autofs_request_zone; uma_zone_t autofs_node_zone; static int autofs_open(struct cdev *dev, int flags, int fmt, struct thread *td); static int autofs_close(struct cdev *dev, int flag, int fmt, struct thread *td); static int autofs_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int mode, struct thread *td); static struct cdevsw autofs_cdevsw = { .d_version = D_VERSION, .d_open = autofs_open, .d_close = autofs_close, .d_ioctl = autofs_ioctl, .d_name = "autofs", }; /* * List of signals that can interrupt an autofs trigger. Might be a good * idea to keep it synchronised with list in sys/fs/nfs/nfs_commonkrpc.c. 
*/ int autofs_sig_set[] = { SIGINT, SIGTERM, SIGHUP, SIGKILL, SIGQUIT }; struct autofs_softc *autofs_softc; -SYSCTL_NODE(_vfs, OID_AUTO, autofs, CTLFLAG_RD, 0, "Automounter filesystem"); +SYSCTL_NODE(_vfs, OID_AUTO, autofs, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "Automounter filesystem"); int autofs_debug = 1; TUNABLE_INT("vfs.autofs.debug", &autofs_debug); SYSCTL_INT(_vfs_autofs, OID_AUTO, debug, CTLFLAG_RWTUN, &autofs_debug, 1, "Enable debug messages"); int autofs_mount_on_stat = 0; TUNABLE_INT("vfs.autofs.mount_on_stat", &autofs_mount_on_stat); SYSCTL_INT(_vfs_autofs, OID_AUTO, mount_on_stat, CTLFLAG_RWTUN, &autofs_mount_on_stat, 0, "Trigger mount on stat(2) on mountpoint"); int autofs_timeout = 30; TUNABLE_INT("vfs.autofs.timeout", &autofs_timeout); SYSCTL_INT(_vfs_autofs, OID_AUTO, timeout, CTLFLAG_RWTUN, &autofs_timeout, 30, "Number of seconds to wait for automountd(8)"); int autofs_cache = 600; TUNABLE_INT("vfs.autofs.cache", &autofs_cache); SYSCTL_INT(_vfs_autofs, OID_AUTO, cache, CTLFLAG_RWTUN, &autofs_cache, 600, "Number of seconds to wait before reinvoking " "automountd(8) for any given file or directory"); int autofs_retry_attempts = 3; TUNABLE_INT("vfs.autofs.retry_attempts", &autofs_retry_attempts); SYSCTL_INT(_vfs_autofs, OID_AUTO, retry_attempts, CTLFLAG_RWTUN, &autofs_retry_attempts, 3, "Number of attempts before failing mount"); int autofs_retry_delay = 1; TUNABLE_INT("vfs.autofs.retry_delay", &autofs_retry_delay); SYSCTL_INT(_vfs_autofs, OID_AUTO, retry_delay, CTLFLAG_RWTUN, &autofs_retry_delay, 1, "Number of seconds before retrying"); int autofs_interruptible = 1; TUNABLE_INT("vfs.autofs.interruptible", &autofs_interruptible); SYSCTL_INT(_vfs_autofs, OID_AUTO, interruptible, CTLFLAG_RWTUN, &autofs_interruptible, 1, "Allow requests to be interrupted by signal"); static int autofs_node_cmp(const struct autofs_node *a, const struct autofs_node *b) { return (strcmp(a->an_name, b->an_name)); } RB_GENERATE(autofs_node_tree, autofs_node, an_link, autofs_node_cmp); int autofs_init(struct vfsconf *vfsp) { int error; KASSERT(autofs_softc == NULL, ("softc %p, should be NULL", autofs_softc)); autofs_softc = malloc(sizeof(*autofs_softc), M_AUTOFS, M_WAITOK | M_ZERO); autofs_request_zone = uma_zcreate("autofs_request", sizeof(struct autofs_request), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); autofs_node_zone = uma_zcreate("autofs_node", sizeof(struct autofs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); TAILQ_INIT(&autofs_softc->sc_requests); cv_init(&autofs_softc->sc_cv, "autofscv"); sx_init(&autofs_softc->sc_lock, "autofslk"); error = make_dev_p(MAKEDEV_CHECKNAME, &autofs_softc->sc_cdev, &autofs_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "autofs"); if (error != 0) { AUTOFS_WARN("failed to create device node, error %d", error); uma_zdestroy(autofs_request_zone); uma_zdestroy(autofs_node_zone); free(autofs_softc, M_AUTOFS); return (error); } autofs_softc->sc_cdev->si_drv1 = autofs_softc; return (0); } int autofs_uninit(struct vfsconf *vfsp) { sx_xlock(&autofs_softc->sc_lock); if (autofs_softc->sc_dev_opened) { sx_xunlock(&autofs_softc->sc_lock); return (EBUSY); } if (autofs_softc->sc_cdev != NULL) destroy_dev(autofs_softc->sc_cdev); uma_zdestroy(autofs_request_zone); uma_zdestroy(autofs_node_zone); sx_xunlock(&autofs_softc->sc_lock); /* * XXX: Race with open? 
*/ free(autofs_softc, M_AUTOFS); return (0); } bool autofs_ignore_thread(const struct thread *td) { struct proc *p; p = td->td_proc; if (autofs_softc->sc_dev_opened == false) return (false); PROC_LOCK(p); if (p->p_session->s_sid == autofs_softc->sc_dev_sid) { PROC_UNLOCK(p); return (true); } PROC_UNLOCK(p); return (false); } static char * autofs_path(struct autofs_node *anp) { struct autofs_mount *amp; char *path, *tmp; amp = anp->an_mount; path = strdup("", M_AUTOFS); for (; anp->an_parent != NULL; anp = anp->an_parent) { tmp = malloc(strlen(anp->an_name) + strlen(path) + 2, M_AUTOFS, M_WAITOK); strcpy(tmp, anp->an_name); strcat(tmp, "/"); strcat(tmp, path); free(path, M_AUTOFS); path = tmp; } tmp = malloc(strlen(amp->am_mountpoint) + strlen(path) + 2, M_AUTOFS, M_WAITOK); strcpy(tmp, amp->am_mountpoint); strcat(tmp, "/"); strcat(tmp, path); free(path, M_AUTOFS); path = tmp; return (path); } static void autofs_task(void *context, int pending) { struct autofs_request *ar; ar = context; sx_xlock(&autofs_softc->sc_lock); AUTOFS_WARN("request %d for %s timed out after %d seconds", ar->ar_id, ar->ar_path, autofs_timeout); /* * XXX: EIO perhaps? */ ar->ar_error = ETIMEDOUT; ar->ar_wildcards = true; ar->ar_done = true; ar->ar_in_progress = false; cv_broadcast(&autofs_softc->sc_cv); sx_xunlock(&autofs_softc->sc_lock); } bool autofs_cached(struct autofs_node *anp, const char *component, int componentlen) { int error; struct autofs_mount *amp; amp = anp->an_mount; AUTOFS_ASSERT_UNLOCKED(amp); /* * For root node we need to request automountd(8) assistance even * if the node is marked as cached, but the requested top-level * directory does not exist. This is necessary for wildcard indirect * map keys to work. We don't do this if we know that there are * no wildcards. */ if (anp->an_parent == NULL && componentlen != 0 && anp->an_wildcards) { AUTOFS_SLOCK(amp); error = autofs_node_find(anp, component, componentlen, NULL); AUTOFS_SUNLOCK(amp); if (error != 0) return (false); } return (anp->an_cached); } static void autofs_cache_callout(void *context) { struct autofs_node *anp; anp = context; anp->an_cached = false; } void autofs_flush(struct autofs_mount *amp) { /* * XXX: This will do for now, but ideally we should iterate * over all the nodes. */ amp->am_root->an_cached = false; AUTOFS_DEBUG("%s flushed", amp->am_mountpoint); } /* * The set/restore sigmask functions are used to (temporarily) overwrite * the thread td_sigmask during triggering. */ static void autofs_set_sigmask(sigset_t *oldset) { sigset_t newset; int i; SIGFILLSET(newset); /* Remove the autofs set of signals from newset */ PROC_LOCK(curproc); mtx_lock(&curproc->p_sigacts->ps_mtx); for (i = 0 ; i < nitems(autofs_sig_set); i++) { /* * But make sure we leave the ones already masked * by the process, i.e. remove the signal from the * temporary signalmask only if it wasn't already * in p_sigmask. 
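 * (The test below additionally leaves a signal masked when the process
 * ignores it, via ps_sigignore, since an ignored signal could not
 * interrupt the wait anyway.)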
*/ if (!SIGISMEMBER(curthread->td_sigmask, autofs_sig_set[i]) && !SIGISMEMBER(curproc->p_sigacts->ps_sigignore, autofs_sig_set[i])) { SIGDELSET(newset, autofs_sig_set[i]); } } mtx_unlock(&curproc->p_sigacts->ps_mtx); kern_sigprocmask(curthread, SIG_SETMASK, &newset, oldset, SIGPROCMASK_PROC_LOCKED); PROC_UNLOCK(curproc); } static void autofs_restore_sigmask(sigset_t *set) { kern_sigprocmask(curthread, SIG_SETMASK, set, NULL, 0); } static int autofs_trigger_one(struct autofs_node *anp, const char *component, int componentlen) { sigset_t oldset; struct autofs_mount *amp; struct autofs_node *firstanp; struct autofs_request *ar; char *key, *path; int error = 0, request_error, last; bool wildcards; amp = anp->an_mount; sx_assert(&autofs_softc->sc_lock, SA_XLOCKED); if (anp->an_parent == NULL) { key = strndup(component, componentlen, M_AUTOFS); } else { for (firstanp = anp; firstanp->an_parent->an_parent != NULL; firstanp = firstanp->an_parent) continue; key = strdup(firstanp->an_name, M_AUTOFS); } path = autofs_path(anp); TAILQ_FOREACH(ar, &autofs_softc->sc_requests, ar_next) { if (strcmp(ar->ar_path, path) != 0) continue; if (strcmp(ar->ar_key, key) != 0) continue; KASSERT(strcmp(ar->ar_from, amp->am_from) == 0, ("from changed; %s != %s", ar->ar_from, amp->am_from)); KASSERT(strcmp(ar->ar_prefix, amp->am_prefix) == 0, ("prefix changed; %s != %s", ar->ar_prefix, amp->am_prefix)); KASSERT(strcmp(ar->ar_options, amp->am_options) == 0, ("options changed; %s != %s", ar->ar_options, amp->am_options)); break; } if (ar != NULL) { refcount_acquire(&ar->ar_refcount); } else { ar = uma_zalloc(autofs_request_zone, M_WAITOK | M_ZERO); ar->ar_mount = amp; ar->ar_id = atomic_fetchadd_int(&autofs_softc->sc_last_request_id, 1); strlcpy(ar->ar_from, amp->am_from, sizeof(ar->ar_from)); strlcpy(ar->ar_path, path, sizeof(ar->ar_path)); strlcpy(ar->ar_prefix, amp->am_prefix, sizeof(ar->ar_prefix)); strlcpy(ar->ar_key, key, sizeof(ar->ar_key)); strlcpy(ar->ar_options, amp->am_options, sizeof(ar->ar_options)); TIMEOUT_TASK_INIT(taskqueue_thread, &ar->ar_task, 0, autofs_task, ar); taskqueue_enqueue_timeout(taskqueue_thread, &ar->ar_task, autofs_timeout * hz); refcount_init(&ar->ar_refcount, 1); TAILQ_INSERT_TAIL(&autofs_softc->sc_requests, ar, ar_next); } cv_broadcast(&autofs_softc->sc_cv); while (ar->ar_done == false) { if (autofs_interruptible != 0) { autofs_set_sigmask(&oldset); error = cv_wait_sig(&autofs_softc->sc_cv, &autofs_softc->sc_lock); autofs_restore_sigmask(&oldset); if (error != 0) { AUTOFS_WARN("cv_wait_sig for %s failed " "with error %d", ar->ar_path, error); break; } } else { cv_wait(&autofs_softc->sc_cv, &autofs_softc->sc_lock); } } request_error = ar->ar_error; if (request_error != 0) { AUTOFS_WARN("request for %s completed with error %d", ar->ar_path, request_error); } wildcards = ar->ar_wildcards; last = refcount_release(&ar->ar_refcount); if (last) { TAILQ_REMOVE(&autofs_softc->sc_requests, ar, ar_next); /* * Unlock the sc_lock, so that autofs_task() can complete. */ sx_xunlock(&autofs_softc->sc_lock); taskqueue_cancel_timeout(taskqueue_thread, &ar->ar_task, NULL); taskqueue_drain_timeout(taskqueue_thread, &ar->ar_task); uma_zfree(autofs_request_zone, ar); sx_xlock(&autofs_softc->sc_lock); } /* * Note that we do not do negative caching on purpose. This * way the user can retry access at any time, e.g. after fixing * the failure reason, without waiting for cache timer to expire. 
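 * (Positive caching happens just below: an_cached is set only when both
 * the local error and the error reported by automountd(8) are zero, and
 * autofs_cache_callout() clears it again once the vfs.autofs.cache
 * period expires.)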
*/ if (error == 0 && request_error == 0 && autofs_cache > 0) { anp->an_cached = true; anp->an_wildcards = wildcards; callout_reset(&anp->an_callout, autofs_cache * hz, autofs_cache_callout, anp); } free(key, M_AUTOFS); free(path, M_AUTOFS); if (error != 0) return (error); return (request_error); } /* * Send request to automountd(8) and wait for completion. */ int autofs_trigger(struct autofs_node *anp, const char *component, int componentlen) { int error; for (;;) { error = autofs_trigger_one(anp, component, componentlen); if (error == 0) { anp->an_retries = 0; return (0); } if (error == EINTR || error == ERESTART) { AUTOFS_DEBUG("trigger interrupted by signal, " "not retrying"); anp->an_retries = 0; return (error); } anp->an_retries++; if (anp->an_retries >= autofs_retry_attempts) { AUTOFS_DEBUG("trigger failed %d times; returning " "error %d", anp->an_retries, error); anp->an_retries = 0; return (error); } AUTOFS_DEBUG("trigger failed with error %d; will retry in " "%d seconds, %d attempts left", error, autofs_retry_delay, autofs_retry_attempts - anp->an_retries); sx_xunlock(&autofs_softc->sc_lock); pause("autofs_retry", autofs_retry_delay * hz); sx_xlock(&autofs_softc->sc_lock); } } static int autofs_ioctl_request(struct autofs_daemon_request *adr) { struct autofs_request *ar; int error; sx_xlock(&autofs_softc->sc_lock); for (;;) { TAILQ_FOREACH(ar, &autofs_softc->sc_requests, ar_next) { if (ar->ar_done) continue; if (ar->ar_in_progress) continue; break; } if (ar != NULL) break; error = cv_wait_sig(&autofs_softc->sc_cv, &autofs_softc->sc_lock); if (error != 0) { sx_xunlock(&autofs_softc->sc_lock); return (error); } } ar->ar_in_progress = true; sx_xunlock(&autofs_softc->sc_lock); adr->adr_id = ar->ar_id; strlcpy(adr->adr_from, ar->ar_from, sizeof(adr->adr_from)); strlcpy(adr->adr_path, ar->ar_path, sizeof(adr->adr_path)); strlcpy(adr->adr_prefix, ar->ar_prefix, sizeof(adr->adr_prefix)); strlcpy(adr->adr_key, ar->ar_key, sizeof(adr->adr_key)); strlcpy(adr->adr_options, ar->ar_options, sizeof(adr->adr_options)); PROC_LOCK(curproc); autofs_softc->sc_dev_sid = curproc->p_session->s_sid; PROC_UNLOCK(curproc); return (0); } static int autofs_ioctl_done_101(struct autofs_daemon_done_101 *add) { struct autofs_request *ar; sx_xlock(&autofs_softc->sc_lock); TAILQ_FOREACH(ar, &autofs_softc->sc_requests, ar_next) { if (ar->ar_id == add->add_id) break; } if (ar == NULL) { sx_xunlock(&autofs_softc->sc_lock); AUTOFS_DEBUG("id %d not found", add->add_id); return (ESRCH); } ar->ar_error = add->add_error; ar->ar_wildcards = true; ar->ar_done = true; ar->ar_in_progress = false; cv_broadcast(&autofs_softc->sc_cv); sx_xunlock(&autofs_softc->sc_lock); return (0); } static int autofs_ioctl_done(struct autofs_daemon_done *add) { struct autofs_request *ar; sx_xlock(&autofs_softc->sc_lock); TAILQ_FOREACH(ar, &autofs_softc->sc_requests, ar_next) { if (ar->ar_id == add->add_id) break; } if (ar == NULL) { sx_xunlock(&autofs_softc->sc_lock); AUTOFS_DEBUG("id %d not found", add->add_id); return (ESRCH); } ar->ar_error = add->add_error; ar->ar_wildcards = add->add_wildcards; ar->ar_done = true; ar->ar_in_progress = false; cv_broadcast(&autofs_softc->sc_cv); sx_xunlock(&autofs_softc->sc_lock); return (0); } static int autofs_open(struct cdev *dev, int flags, int fmt, struct thread *td) { sx_xlock(&autofs_softc->sc_lock); /* * We must never block automountd(8) and its descendants, and we use * session ID to determine that: we store session id of the process * that opened the device, and then compare it with session ids * 
of triggering processes. This means running a second automountd(8) * instance would break the previous one. The check below prevents * it from happening. */ if (autofs_softc->sc_dev_opened) { sx_xunlock(&autofs_softc->sc_lock); return (EBUSY); } autofs_softc->sc_dev_opened = true; sx_xunlock(&autofs_softc->sc_lock); return (0); } static int autofs_close(struct cdev *dev, int flag, int fmt, struct thread *td) { sx_xlock(&autofs_softc->sc_lock); KASSERT(autofs_softc->sc_dev_opened, ("not opened?")); autofs_softc->sc_dev_opened = false; sx_xunlock(&autofs_softc->sc_lock); return (0); } static int autofs_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int mode, struct thread *td) { KASSERT(autofs_softc->sc_dev_opened, ("not opened?")); switch (cmd) { case AUTOFSREQUEST: return (autofs_ioctl_request( (struct autofs_daemon_request *)arg)); case AUTOFSDONE101: return (autofs_ioctl_done_101( (struct autofs_daemon_done_101 *)arg)); case AUTOFSDONE: return (autofs_ioctl_done( (struct autofs_daemon_done *)arg)); default: AUTOFS_DEBUG("invalid cmd %lx", cmd); return (EINVAL); } } Index: head/sys/kern/kern_rctl.c =================================================================== --- head/sys/kern/kern_rctl.c (revision 357970) +++ head/sys/kern/kern_rctl.c (revision 357971) @@ -1,2238 +1,2243 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RCTL #ifndef RACCT #error "The RCTL option requires the RACCT option" #endif FEATURE(rctl, "Resource Limits"); #define HRF_DEFAULT 0 #define HRF_DONT_INHERIT 1 #define HRF_DONT_ACCUMULATE 2 #define RCTL_MAX_INBUFSIZE 4 * 1024 #define RCTL_MAX_OUTBUFSIZE 16 * 1024 * 1024 #define RCTL_LOG_BUFSIZE 128 #define RCTL_PCPU_SHIFT (10 * 1000000) static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE; static int rctl_log_rate_limit = 10; static int rctl_devctl_rate_limit = 10; /* * Values below are initialized in rctl_init(). */ static int rctl_throttle_min = -1; static int rctl_throttle_max = -1; static int rctl_throttle_pct = -1; static int rctl_throttle_pct2 = -1; static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS); static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS); -SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits"); +SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Resource Limits"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN, &rctl_maxbufsize, 0, "Maximum output buffer size"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW, &rctl_log_rate_limit, 0, "Maximum number of log messages per second"); SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN, &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second"); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min, - CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU", + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, + &rctl_throttle_min_sysctl, "IU", "Shortest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max, - CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU", + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, + &rctl_throttle_max_sysctl, "IU", "Longest throttling duration, in hz"); TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct, - CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU", + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, + &rctl_throttle_pct_sysctl, "IU", "Throttling penalty for process consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct); SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2, - CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU", + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, + &rctl_throttle_pct2_sysctl, "IU", "Throttling penalty for container consumption, in percent"); TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2); /* * 'rctl_rule_link' connects a rule with every racct it's related to. * For example, rule 'user:X:openfiles:deny=N/process' is linked * with uidinfo for user X, and to each process of that user. 
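 * The general syntax, as parsed by rctl_string_to_rule() below, is
 * subject:subject-id:resource:action=amount/per; a fully specified
 * instance would be e.g. "user:123:openfiles:deny=1024/process", which
 * prevents any single process owned by uid 123 from holding more than
 * 1024 open file descriptors.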
*/ struct rctl_rule_link { LIST_ENTRY(rctl_rule_link) rrl_next; struct rctl_rule *rrl_rule; int rrl_exceeded; }; struct dict { const char *d_name; int d_value; }; static struct dict subjectnames[] = { { "process", RCTL_SUBJECT_TYPE_PROCESS }, { "user", RCTL_SUBJECT_TYPE_USER }, { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS }, { "jail", RCTL_SUBJECT_TYPE_JAIL }, { NULL, -1 }}; static struct dict resourcenames[] = { { "cputime", RACCT_CPU }, { "datasize", RACCT_DATA }, { "stacksize", RACCT_STACK }, { "coredumpsize", RACCT_CORE }, { "memoryuse", RACCT_RSS }, { "memorylocked", RACCT_MEMLOCK }, { "maxproc", RACCT_NPROC }, { "openfiles", RACCT_NOFILE }, { "vmemoryuse", RACCT_VMEM }, { "pseudoterminals", RACCT_NPTS }, { "swapuse", RACCT_SWAP }, { "nthr", RACCT_NTHR }, { "msgqqueued", RACCT_MSGQQUEUED }, { "msgqsize", RACCT_MSGQSIZE }, { "nmsgq", RACCT_NMSGQ }, { "nsem", RACCT_NSEM }, { "nsemop", RACCT_NSEMOP }, { "nshm", RACCT_NSHM }, { "shmsize", RACCT_SHMSIZE }, { "wallclock", RACCT_WALLCLOCK }, { "pcpu", RACCT_PCTCPU }, { "readbps", RACCT_READBPS }, { "writebps", RACCT_WRITEBPS }, { "readiops", RACCT_READIOPS }, { "writeiops", RACCT_WRITEIOPS }, { NULL, -1 }}; static struct dict actionnames[] = { { "sighup", RCTL_ACTION_SIGHUP }, { "sigint", RCTL_ACTION_SIGINT }, { "sigquit", RCTL_ACTION_SIGQUIT }, { "sigill", RCTL_ACTION_SIGILL }, { "sigtrap", RCTL_ACTION_SIGTRAP }, { "sigabrt", RCTL_ACTION_SIGABRT }, { "sigemt", RCTL_ACTION_SIGEMT }, { "sigfpe", RCTL_ACTION_SIGFPE }, { "sigkill", RCTL_ACTION_SIGKILL }, { "sigbus", RCTL_ACTION_SIGBUS }, { "sigsegv", RCTL_ACTION_SIGSEGV }, { "sigsys", RCTL_ACTION_SIGSYS }, { "sigpipe", RCTL_ACTION_SIGPIPE }, { "sigalrm", RCTL_ACTION_SIGALRM }, { "sigterm", RCTL_ACTION_SIGTERM }, { "sigurg", RCTL_ACTION_SIGURG }, { "sigstop", RCTL_ACTION_SIGSTOP }, { "sigtstp", RCTL_ACTION_SIGTSTP }, { "sigchld", RCTL_ACTION_SIGCHLD }, { "sigttin", RCTL_ACTION_SIGTTIN }, { "sigttou", RCTL_ACTION_SIGTTOU }, { "sigio", RCTL_ACTION_SIGIO }, { "sigxcpu", RCTL_ACTION_SIGXCPU }, { "sigxfsz", RCTL_ACTION_SIGXFSZ }, { "sigvtalrm", RCTL_ACTION_SIGVTALRM }, { "sigprof", RCTL_ACTION_SIGPROF }, { "sigwinch", RCTL_ACTION_SIGWINCH }, { "siginfo", RCTL_ACTION_SIGINFO }, { "sigusr1", RCTL_ACTION_SIGUSR1 }, { "sigusr2", RCTL_ACTION_SIGUSR2 }, { "sigthr", RCTL_ACTION_SIGTHR }, { "deny", RCTL_ACTION_DENY }, { "log", RCTL_ACTION_LOG }, { "devctl", RCTL_ACTION_DEVCTL }, { "throttle", RCTL_ACTION_THROTTLE }, { NULL, -1 }}; static void rctl_init(void); SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL); static uma_zone_t rctl_rule_zone; static uma_zone_t rctl_rule_link_zone; static int rctl_rule_fully_specified(const struct rctl_rule *rule); static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule); static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits"); static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_min; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 1 || val > rctl_throttle_max) return (EINVAL); RACCT_LOCK(); rctl_throttle_min = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_max; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < rctl_throttle_min) return (EINVAL); RACCT_LOCK(); rctl_throttle_max = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct; error = 
sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct = val; RACCT_UNLOCK(); return (0); } static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS) { int error, val = rctl_throttle_pct2; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr) return (error); if (val < 0) return (EINVAL); RACCT_LOCK(); rctl_throttle_pct2 = val; RACCT_UNLOCK(); return (0); } static const char * rctl_subject_type_name(int subject) { int i; for (i = 0; subjectnames[i].d_name != NULL; i++) { if (subjectnames[i].d_value == subject) return (subjectnames[i].d_name); } panic("rctl_subject_type_name: unknown subject type %d", subject); } static const char * rctl_action_name(int action) { int i; for (i = 0; actionnames[i].d_name != NULL; i++) { if (actionnames[i].d_value == action) return (actionnames[i].d_name); } panic("rctl_action_name: unknown action %d", action); } const char * rctl_resource_name(int resource) { int i; for (i = 0; resourcenames[i].d_name != NULL; i++) { if (resourcenames[i].d_value == resource) return (resourcenames[i].d_name); } panic("rctl_resource_name: unknown resource %d", resource); } static struct racct * rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule) { struct ucred *cred = p->p_ucred; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); switch (rule->rr_per) { case RCTL_SUBJECT_TYPE_PROCESS: return (p->p_racct); case RCTL_SUBJECT_TYPE_USER: return (cred->cr_ruidinfo->ui_racct); case RCTL_SUBJECT_TYPE_LOGINCLASS: return (cred->cr_loginclass->lc_racct); case RCTL_SUBJECT_TYPE_JAIL: return (cred->cr_prison->pr_prison_racct->prr_racct); default: panic("%s: unknown per %d", __func__, rule->rr_per); } } /* * Return the amount of resource that can be allocated by 'p' before * hitting 'rule'. */ static int64_t rctl_available_resource(const struct proc *p, const struct rctl_rule *rule) { const struct racct *racct; int64_t available; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); racct = rctl_proc_rule_to_racct(p, rule); available = rule->rr_amount - racct->r_resources[rule->rr_resource]; return (available); } /* * Called every second for proc, uidinfo, loginclass, and jail containers. * If the limit isn't exceeded, it decreases the usage amount to zero. * Otherwise, it decreases it by the value of the limit. This way * resource consumption exceeding the limit "carries over" to the next * period. */ void rctl_throttle_decay(struct racct *racct, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t minavailable; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_THROTTLE) continue; if (rule->rr_amount < minavailable) minavailable = rule->rr_amount; } if (racct->r_resources[resource] < minavailable) { racct->r_resources[resource] = 0; } else { /* * Cap utilization counter at ten times the limit. Otherwise, * if we changed the rule lowering the allowed amount, it could * take unreasonably long time for the accumulated resource * usage to drop. */ if (racct->r_resources[resource] > minavailable * 10) racct->r_resources[resource] = minavailable * 10; racct->r_resources[resource] -= minavailable; } } /* * Special version of rctl_get_available() for the %CPU resource. * We slightly cheat here and return less than we normally would. 
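 * Since RACCT_PCTCPU is accounted in millionths of a percent (it is one
 * of the RACCT_IS_IN_MILLIONS resources), RCTL_PCPU_SHIFT corresponds to
 * 10%.  For example, under a 50% deny rule (more than
 * 2 * RCTL_PCPU_SHIFT) the value returned is 10 percentage points below
 * the real headroom; under a 10% rule it is half the limit, i.e. 5
 * points, below.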
*/ int64_t rctl_pcpu_available(const struct proc *p) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, limit; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); minavailable = INT64_MAX; limit = 0; LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != RACCT_PCTCPU) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) { minavailable = available; limit = rule->rr_amount; } } /* * Return slightly less than actual value of the available * %cpu resource. This makes %cpu throttling more aggressive * and lets us act sooner than the limits are already exceeded. */ if (limit != 0) { if (limit > 2 * RCTL_PCPU_SHIFT) minavailable -= RCTL_PCPU_SHIFT; else minavailable -= (limit / 2); } return (minavailable); } static uint64_t xadd(uint64_t a, uint64_t b) { uint64_t c; c = a + b; /* * Detect overflow. */ if (c < a || c < b) return (UINT64_MAX); return (c); } static uint64_t xmul(uint64_t a, uint64_t b) { if (b != 0 && a > UINT64_MAX / b) return (UINT64_MAX); return (a * b); } /* * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition * to what it keeps allocated now. Returns non-zero if the allocation should * be denied, 0 otherwise. */ int rctl_enforce(struct proc *p, int resource, uint64_t amount) { static struct timeval log_lasttime, devctl_lasttime; static int log_curtime = 0, devctl_curtime = 0; struct rctl_rule *rule; struct rctl_rule_link *link; struct sbuf sb; char *buf; int64_t available; uint64_t sleep_ms, sleep_ratio; int should_deny = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; available = rctl_available_resource(p, rule); if (available >= (int64_t)amount) { link->rrl_exceeded = 0; continue; } switch (rule->rr_action) { case RCTL_ACTION_DENY: should_deny = 1; continue; case RCTL_ACTION_LOG: /* * If rrl_exceeded != 0, it means we've already * logged a warning for this process. */ if (link->rrl_exceeded != 0) continue; /* * If the process state is not fully initialized yet, * we can't access most of the required fields, e.g. * p->p_comm. This happens when called from fork1(). * Ignore this rule for now; it will be processed just * after fork, when called from racct_proc_fork_done(). 
*/ if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&log_lasttime, &log_curtime, rctl_log_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); rctl_rule_to_sbuf(&sb, rule); sbuf_finish(&sb); printf("rctl: rule \"%s\" matched by pid %d " "(%s), uid %d, jail %s\n", sbuf_data(&sb), p->p_pid, p->p_comm, p->p_ucred->cr_uid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_DEVCTL: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; if (!ppsratecheck(&devctl_lasttime, &devctl_curtime, rctl_devctl_rate_limit)) continue; buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT); if (buf == NULL) { printf("rctl_enforce: out of memory\n"); continue; } sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN); sbuf_printf(&sb, "rule="); rctl_rule_to_sbuf(&sb, rule); sbuf_printf(&sb, " pid=%d ruid=%d jail=%s", p->p_pid, p->p_ucred->cr_ruid, p->p_ucred->cr_prison->pr_prison_racct->prr_name); sbuf_finish(&sb); devctl_notify_f("RCTL", "rule", "matched", sbuf_data(&sb), M_NOWAIT); sbuf_delete(&sb); free(buf, M_RCTL); link->rrl_exceeded = 1; continue; case RCTL_ACTION_THROTTLE: if (p->p_state != PRS_NORMAL) continue; /* * Make the process sleep for a fraction of second * proportional to the ratio of process' resource * utilization compared to the limit. The point is * to penalize resource hogs: processes that consume * more of the available resources sleep for longer. * * We're trying to defer division until the very end, * to minimize the rounding effects. The following * calculation could have been written in a clearer * way like this: * * sleep_ms = hz * p->p_racct->r_resources[resource] / * rule->rr_amount; * sleep_ms *= rctl_throttle_pct / 100; * if (sleep_ms < rctl_throttle_min) * sleep_ms = rctl_throttle_min; * */ sleep_ms = xmul(hz, p->p_racct->r_resources[resource]); sleep_ms = xmul(sleep_ms, rctl_throttle_pct) / 100; if (sleep_ms < rctl_throttle_min * rule->rr_amount) sleep_ms = rctl_throttle_min * rule->rr_amount; /* * Multiply that by the ratio of the resource * consumption for the container compared to the limit, * squared. In other words, a process in a container * that is two times over the limit will be throttled * four times as much for hitting the same rule. The * point is to penalize processes more if the container * itself (eg certain UID or jail) is above the limit. */ if (available < 0) sleep_ratio = -available / rule->rr_amount; else sleep_ratio = 0; sleep_ratio = xmul(sleep_ratio, sleep_ratio); sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100; sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio)); /* * Finally the division. 
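 * As a worked example, with illustrative tunable values of
 * rctl_throttle_pct = 100 and rctl_throttle_pct2 = 100: a process that
 * has consumed twice its rr_amount, inside a container that is itself a
 * full rr_amount over the limit (available == -rr_amount), starts with
 * sleep_ms = hz * 2 * rr_amount, gets sleep_ratio = (1 * 1 * 100) / 100
 * = 1, which doubles sleep_ms to hz * 4 * rr_amount, and the division
 * below then yields 4 * hz ticks, i.e. about four seconds, subject to
 * the rctl_throttle_max clamp (and assuming the rctl_throttle_min clamp
 * did not apply).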
*/ sleep_ms /= rule->rr_amount; if (sleep_ms > rctl_throttle_max) sleep_ms = rctl_throttle_max; #if 0 printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n", __func__, p->p_pid, p->p_comm, p->p_racct->r_resources[resource], rule->rr_amount, (uintmax_t)sleep_ms, (uintmax_t)sleep_ratio, (intmax_t)available); #endif KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n", __func__, (uintmax_t)sleep_ms, rctl_throttle_min)); racct_proc_throttle(p, sleep_ms); continue; default: if (link->rrl_exceeded != 0) continue; if (p->p_state != PRS_NORMAL) continue; KASSERT(rule->rr_action > 0 && rule->rr_action <= RCTL_ACTION_SIGNAL_MAX, ("rctl_enforce: unknown action %d", rule->rr_action)); /* * We're using the fact that RCTL_ACTION_SIG* values * are equal to their counterparts from sys/signal.h. */ kern_psignal(p, rule->rr_action); link->rrl_exceeded = 1; continue; } } if (should_deny) { /* * Return fake error code; the caller should change it * into one proper for the situation - EFSIZ, ENOMEM etc. */ return (EDOOFUS); } return (0); } uint64_t rctl_get_limit(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; uint64_t amount = UINT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; if (rule->rr_amount < amount) amount = rule->rr_amount; } return (amount); } uint64_t rctl_get_available(struct proc *p, int resource) { struct rctl_rule *rule; struct rctl_rule_link *link; int64_t available, minavailable, allocated; minavailable = INT64_MAX; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); /* * There may be more than one matching rule; go through all of them. * Denial should be done last, after logging and sending signals. */ LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { rule = link->rrl_rule; if (rule->rr_resource != resource) continue; if (rule->rr_action != RCTL_ACTION_DENY) continue; available = rctl_available_resource(p, rule); if (available < minavailable) minavailable = available; } /* * XXX: Think about this _hard_. 
*/ allocated = p->p_racct->r_resources[resource]; if (minavailable < INT64_MAX - allocated) minavailable += allocated; if (minavailable < 0) minavailable = 0; return (minavailable); } static int rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter) { ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_subject_type != filter->rr_subject_type) return (0); switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (filter->rr_subject.rs_proc != NULL && rule->rr_subject.rs_proc != filter->rr_subject.rs_proc) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (filter->rr_subject.rs_uip != NULL && rule->rr_subject.rs_uip != filter->rr_subject.rs_uip) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (filter->rr_subject.rs_loginclass != NULL && rule->rr_subject.rs_loginclass != filter->rr_subject.rs_loginclass) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (filter->rr_subject.rs_prison_racct != NULL && rule->rr_subject.rs_prison_racct != filter->rr_subject.rs_prison_racct) return (0); break; default: panic("rctl_rule_matches: unknown subject type %d", filter->rr_subject_type); } } if (filter->rr_resource != RACCT_UNDEFINED) { if (rule->rr_resource != filter->rr_resource) return (0); } if (filter->rr_action != RCTL_ACTION_UNDEFINED) { if (rule->rr_action != filter->rr_action) return (0); } if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) { if (rule->rr_amount != filter->rr_amount) return (0); } if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) { if (rule->rr_per != filter->rr_per) return (0); } return (1); } static int str2value(const char *str, int *value, struct dict *table) { int i; if (value == NULL) return (EINVAL); for (i = 0; table[i].d_name != NULL; i++) { if (strcasecmp(table[i].d_name, str) == 0) { *value = table[i].d_value; return (0); } } return (EINVAL); } static int str2id(const char *str, id_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); return (0); } static int str2int64(const char *str, int64_t *value) { char *end; if (str == NULL) return (EINVAL); *value = strtoul(str, &end, 10); if ((size_t)(end - str) != strlen(str)) return (EINVAL); if (*value < 0) return (ERANGE); return (0); } /* * Connect the rule to the racct, increasing refcount for the rule. */ static void rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); rctl_rule_acquire(rule); link = uma_zalloc(rctl_rule_link_zone, M_WAITOK); link->rrl_rule = rule; link->rrl_exceeded = 0; RACCT_LOCK(); LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); RACCT_UNLOCK(); } static int rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); RACCT_LOCK_ASSERT(); link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT); if (link == NULL) return (ENOMEM); rctl_rule_acquire(rule); link->rrl_rule = rule; link->rrl_exceeded = 0; LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next); return (0); } /* * Remove limits for a rules matching the filter and release * the refcounts for the rules, possibly freeing them. Returns * the number of limit structures removed. 
*/ static int rctl_racct_remove_rules(struct racct *racct, const struct rctl_rule *filter) { struct rctl_rule_link *link, *linktmp; int removed = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); removed++; } return (removed); } static void rctl_rule_acquire_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_hold(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uihold(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_hold(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_acquire_subject: unknown subject type %d", rule->rr_subject_type); } } static void rctl_rule_release_subject(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: case RCTL_SUBJECT_TYPE_PROCESS: break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct != NULL) prison_racct_free(rule->rr_subject.rs_prison_racct); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip != NULL) uifree(rule->rr_subject.rs_uip); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass != NULL) loginclass_free(rule->rr_subject.rs_loginclass); break; default: panic("rctl_rule_release_subject: unknown subject type %d", rule->rr_subject_type); } } struct rctl_rule * rctl_rule_alloc(int flags) { struct rctl_rule *rule; ASSERT_RACCT_ENABLED(); rule = uma_zalloc(rctl_rule_zone, flags); if (rule == NULL) return (NULL); rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; rule->rr_resource = RACCT_UNDEFINED; rule->rr_action = RCTL_ACTION_UNDEFINED; rule->rr_amount = RCTL_AMOUNT_UNDEFINED; refcount_init(&rule->rr_refcount, 1); return (rule); } struct rctl_rule * rctl_rule_duplicate(const struct rctl_rule *rule, int flags) { struct rctl_rule *copy; ASSERT_RACCT_ENABLED(); copy = uma_zalloc(rctl_rule_zone, flags); if (copy == NULL) return (NULL); copy->rr_subject_type = rule->rr_subject_type; copy->rr_subject.rs_proc = rule->rr_subject.rs_proc; copy->rr_subject.rs_uip = rule->rr_subject.rs_uip; copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass; copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct; copy->rr_per = rule->rr_per; copy->rr_resource = rule->rr_resource; copy->rr_action = rule->rr_action; copy->rr_amount = rule->rr_amount; refcount_init(©->rr_refcount, 1); rctl_rule_acquire_subject(copy); return (copy); } void rctl_rule_acquire(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); refcount_acquire(&rule->rr_refcount); } static void rctl_rule_free(void *context, int pending) { struct rctl_rule *rule; rule = (struct rctl_rule *)context; ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0")); /* * We don't need locking here; rule is guaranteed to be inaccessible. 
*/ rctl_rule_release_subject(rule); uma_zfree(rctl_rule_zone, rule); } void rctl_rule_release(struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0")); if (refcount_release(&rule->rr_refcount)) { /* * rctl_rule_release() is often called when iterating * over all the uidinfo structures in the system, * holding uihashtbl_lock. Since rctl_rule_free() * might end up calling uifree(), this would lead * to lock recursion. Use taskqueue to avoid this. */ TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule); taskqueue_enqueue(taskqueue_thread, &rule->rr_task); } } static int rctl_rule_fully_specified(const struct rctl_rule *rule) { ASSERT_RACCT_ENABLED(); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: return (0); case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) return (0); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) return (0); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) return (0); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) return (0); break; default: panic("rctl_rule_fully_specified: unknown subject type %d", rule->rr_subject_type); } if (rule->rr_resource == RACCT_UNDEFINED) return (0); if (rule->rr_action == RCTL_ACTION_UNDEFINED) return (0); if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED) return (0); if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED) return (0); return (1); } static int rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep) { struct rctl_rule *rule; char *subjectstr, *subject_idstr, *resourcestr, *actionstr, *amountstr, *perstr; id_t id; int error = 0; ASSERT_RACCT_ENABLED(); rule = rctl_rule_alloc(M_WAITOK); subjectstr = strsep(&rulestr, ":"); subject_idstr = strsep(&rulestr, ":"); resourcestr = strsep(&rulestr, ":"); actionstr = strsep(&rulestr, "=/"); amountstr = strsep(&rulestr, "/"); perstr = rulestr; if (subjectstr == NULL || subjectstr[0] == '\0') rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(subjectstr, &rule->rr_subject_type, subjectnames); if (error != 0) goto out; } if (subject_idstr == NULL || subject_idstr[0] == '\0') { rule->rr_subject.rs_proc = NULL; rule->rr_subject.rs_uip = NULL; rule->rr_subject.rs_loginclass = NULL; rule->rr_subject.rs_prison_racct = NULL; } else { switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_UNDEFINED: error = EINVAL; goto out; case RCTL_SUBJECT_TYPE_PROCESS: error = str2id(subject_idstr, &id); if (error != 0) goto out; sx_assert(&allproc_lock, SA_LOCKED); rule->rr_subject.rs_proc = pfind(id); if (rule->rr_subject.rs_proc == NULL) { error = ESRCH; goto out; } PROC_UNLOCK(rule->rr_subject.rs_proc); break; case RCTL_SUBJECT_TYPE_USER: error = str2id(subject_idstr, &id); if (error != 0) goto out; rule->rr_subject.rs_uip = uifind(id); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: rule->rr_subject.rs_loginclass = loginclass_find(subject_idstr); if (rule->rr_subject.rs_loginclass == NULL) { error = ENAMETOOLONG; goto out; } break; case RCTL_SUBJECT_TYPE_JAIL: rule->rr_subject.rs_prison_racct = prison_racct_find(subject_idstr); if (rule->rr_subject.rs_prison_racct == NULL) { error = ENAMETOOLONG; goto out; } break; default: panic("rctl_string_to_rule: unknown subject type %d", rule->rr_subject_type); } } if (resourcestr == NULL || resourcestr[0] == '\0') rule->rr_resource = RACCT_UNDEFINED; else { error = str2value(resourcestr, &rule->rr_resource, resourcenames); if (error != 0) goto out; } if (actionstr == 
NULL || actionstr[0] == '\0') rule->rr_action = RCTL_ACTION_UNDEFINED; else { error = str2value(actionstr, &rule->rr_action, actionnames); if (error != 0) goto out; } if (amountstr == NULL || amountstr[0] == '\0') rule->rr_amount = RCTL_AMOUNT_UNDEFINED; else { error = str2int64(amountstr, &rule->rr_amount); if (error != 0) goto out; if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) { if (rule->rr_amount > INT64_MAX / 1000000) { error = ERANGE; goto out; } rule->rr_amount *= 1000000; } } if (perstr == NULL || perstr[0] == '\0') rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED; else { error = str2value(perstr, &rule->rr_per, subjectnames); if (error != 0) goto out; } out: if (error == 0) *rulep = rule; else rctl_rule_release(rule); return (error); } /* * Link a rule with all the subjects it applies to. */ int rctl_rule_add(struct rctl_rule *rule) { struct proc *p; struct ucred *cred; struct uidinfo *uip; struct prison *pr; struct prison_racct *prr; struct loginclass *lc; struct rctl_rule *rule2; int match; ASSERT_RACCT_ENABLED(); KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified")); /* * Some rules just don't make sense, like a "deny" rule for an undeniable * resource. The exceptions are the RSS and %CPU resources - they are * not deniable in the racct sense, but the limit is enforced in * a different way. */ if (rule->rr_action == RCTL_ACTION_DENY && !RACCT_IS_DENIABLE(rule->rr_resource) && rule->rr_resource != RACCT_RSS && rule->rr_resource != RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && !RACCT_IS_DECAYING(rule->rr_resource)) { return (EOPNOTSUPP); } if (rule->rr_action == RCTL_ACTION_THROTTLE && rule->rr_resource == RACCT_PCTCPU) { return (EOPNOTSUPP); } if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS && RACCT_IS_SLOPPY(rule->rr_resource)) { return (EOPNOTSUPP); } /* * Make sure there are no duplicated rules. Also, for the "deny" * rules, remove ones differing only by "amount". */ if (rule->rr_action == RCTL_ACTION_DENY) { rule2 = rctl_rule_duplicate(rule, M_WAITOK); rule2->rr_amount = RCTL_AMOUNT_UNDEFINED; rctl_rule_remove(rule2); rctl_rule_release(rule2); } else rctl_rule_remove(rule); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = rule->rr_subject.rs_proc; KASSERT(p != NULL, ("rctl_rule_add: NULL proc")); rctl_racct_add_rule(p->p_racct, rule); /* * In the case of a per-process rule, we don't have anything more * to do. */ return (0); case RCTL_SUBJECT_TYPE_USER: uip = rule->rr_subject.rs_uip; KASSERT(uip != NULL, ("rctl_rule_add: NULL uip")); rctl_racct_add_rule(uip->ui_racct, rule); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = rule->rr_subject.rs_loginclass; KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass")); rctl_racct_add_rule(lc->lc_racct, rule); break; case RCTL_SUBJECT_TYPE_JAIL: prr = rule->rr_subject.rs_prison_racct; KASSERT(prr != NULL, ("rctl_rule_add: NULL pr")); rctl_racct_add_rule(prr->prr_racct, rule); break; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } /* * Now go through all the processes and add the new rule to the ones * it applies to.
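* (Per-process rules were handled by the early return above, so only user, login class, and jail subjects reach the loop below.)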
*/ sx_assert(&allproc_lock, SA_LOCKED); FOREACH_PROC_IN_SYSTEM(p) { cred = p->p_ucred; switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_USER: if (cred->cr_uidinfo == rule->rr_subject.rs_uip || cred->cr_ruidinfo == rule->rr_subject.rs_uip) break; continue; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (cred->cr_loginclass == rule->rr_subject.rs_loginclass) break; continue; case RCTL_SUBJECT_TYPE_JAIL: match = 0; for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) { if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) { match = 1; break; } } if (match) break; continue; default: panic("rctl_rule_add: unknown subject type %d", rule->rr_subject_type); } rctl_racct_add_rule(p->p_racct, rule); } return (0); } static void rctl_rule_pre_callback(void) { RACCT_LOCK(); } static void rctl_rule_post_callback(void) { RACCT_UNLOCK(); } static void rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; int found = 0; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); found += rctl_racct_remove_rules(racct, filter); *((int *)arg3) += found; } /* * Remove all rules that match the filter. */ int rctl_rule_remove(struct rctl_rule *filter) { struct proc *p; int found = 0; ASSERT_RACCT_ENABLED(); if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS && filter->rr_subject.rs_proc != NULL) { p = filter->rr_subject.rs_proc; RACCT_LOCK(); found = rctl_racct_remove_rules(p->p_racct, filter); RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } loginclass_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); ui_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); prison_racct_foreach(rctl_rule_remove_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, (void *)&found); sx_assert(&allproc_lock, SA_LOCKED); RACCT_LOCK(); FOREACH_PROC_IN_SYSTEM(p) { found += rctl_racct_remove_rules(p->p_racct, filter); } RACCT_UNLOCK(); if (found) return (0); return (ESRCH); } /* * Appends a rule to the sbuf. */ static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule) { int64_t amount; ASSERT_RACCT_ENABLED(); sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type)); switch (rule->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: if (rule->rr_subject.rs_proc == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_proc->p_pid); break; case RCTL_SUBJECT_TYPE_USER: if (rule->rr_subject.rs_uip == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%d:", rule->rr_subject.rs_uip->ui_uid); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: if (rule->rr_subject.rs_loginclass == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_loginclass->lc_name); break; case RCTL_SUBJECT_TYPE_JAIL: if (rule->rr_subject.rs_prison_racct == NULL) sbuf_printf(sb, ":"); else sbuf_printf(sb, "%s:", rule->rr_subject.rs_prison_racct->prr_name); break; default: panic("rctl_rule_to_sbuf: unknown subject type %d", rule->rr_subject_type); } amount = rule->rr_amount; if (amount != RCTL_AMOUNT_UNDEFINED && RACCT_IS_IN_MILLIONS(rule->rr_resource)) amount /= 1000000; sbuf_printf(sb, "%s:%s=%jd", rctl_resource_name(rule->rr_resource), rctl_action_name(rule->rr_action), amount); if (rule->rr_per != rule->rr_subject_type) sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per)); } /* * Routine used by RCTL syscalls to read in input string. 
*/ static int rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen) { char *str; int error; ASSERT_RACCT_ENABLED(); if (inbuflen <= 0) return (EINVAL); if (inbuflen > RCTL_MAX_INBUFSIZE) return (E2BIG); str = malloc(inbuflen + 1, M_RCTL, M_WAITOK); error = copyinstr(inbufp, str, inbuflen, NULL); if (error != 0) { free(str, M_RCTL); return (error); } *inputstr = str; return (0); } /* * Routine used by RCTL syscalls to write out output string. */ static int rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen) { int error; ASSERT_RACCT_ENABLED(); if (outputsbuf == NULL) return (0); sbuf_finish(outputsbuf); if (outbuflen < sbuf_len(outputsbuf) + 1) { sbuf_delete(outputsbuf); return (ERANGE); } error = copyout(sbuf_data(outputsbuf), outbufp, sbuf_len(outputsbuf) + 1); sbuf_delete(outputsbuf); return (error); } static struct sbuf * rctl_racct_to_sbuf(struct racct *racct, int sloppy) { struct sbuf *sb; int64_t amount; int i; ASSERT_RACCT_ENABLED(); sb = sbuf_new_auto(); for (i = 0; i <= RACCT_MAX; i++) { if (sloppy == 0 && RACCT_IS_SLOPPY(i)) continue; RACCT_LOCK(); amount = racct->r_resources[i]; RACCT_UNLOCK(); if (RACCT_IS_IN_MILLIONS(i)) amount /= 1000000; sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount); } sbuf_setpos(sb, sbuf_len(sb) - 1); return (sb); } int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { struct rctl_rule *filter; struct sbuf *outputsbuf = NULL; struct proc *p; struct uidinfo *uip; struct loginclass *lc; struct prison_racct *prr; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_RACCT); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } switch (filter->rr_subject_type) { case RCTL_SUBJECT_TYPE_PROCESS: p = filter->rr_subject.rs_proc; if (p == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0); break; case RCTL_SUBJECT_TYPE_USER: uip = filter->rr_subject.rs_uip; if (uip == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1); break; case RCTL_SUBJECT_TYPE_LOGINCLASS: lc = filter->rr_subject.rs_loginclass; if (lc == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1); break; case RCTL_SUBJECT_TYPE_JAIL: prr = filter->rr_subject.rs_prison_racct; if (prr == NULL) { error = EINVAL; goto out; } outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1); break; default: error = EINVAL; } out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); if (error != 0) return (error); error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen); return (error); } static void rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3) { struct rctl_rule *filter = (struct rctl_rule *)arg2; struct rctl_rule_link *link; struct sbuf *sb = (struct sbuf *)arg3; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); LIST_FOREACH(link, &racct->r_rule_links, rrl_next) { if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; struct proc *p; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = 
priv_check(td, PRIV_RCTL_GET_RULES); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); FOREACH_PROC_IN_SYSTEM(p) { RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { /* * Non-process rules will be added to the buffer later. * Adding them here would result in duplicated output. */ if (link->rrl_rule->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) continue; if (!rctl_rule_matches(link->rrl_rule, filter)) continue; rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } RACCT_UNLOCK(); } loginclass_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); ui_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); prison_racct_foreach(rctl_get_rules_callback, rctl_rule_pre_callback, rctl_rule_post_callback, filter, sb); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; goto out; } /* * Remove trailing ",". */ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { struct sbuf *sb; struct rctl_rule *filter; struct rctl_rule_link *link; char *inputstr, *buf; size_t bufsize; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_GET_LIMITS); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EOPNOTSUPP); } if (filter->rr_subject.rs_proc == NULL) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (EINVAL); } bufsize = uap->outbuflen; if (bufsize > rctl_maxbufsize) { rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (E2BIG); } buf = malloc(bufsize, M_RCTL, M_WAITOK); sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN); KASSERT(sb != NULL, ("sbuf_new failed")); RACCT_LOCK(); LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links, rrl_next) { rctl_rule_to_sbuf(sb, link->rrl_rule); sbuf_printf(sb, ","); } RACCT_UNLOCK(); if (sbuf_error(sb) == ENOMEM) { error = ERANGE; sbuf_delete(sb); goto out; } /* * Remove trailing ",". 
*/ if (sbuf_len(sb) > 0) sbuf_setpos(sb, sbuf_len(sb) - 1); error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen); out: rctl_rule_release(filter); sx_sunlock(&allproc_lock); free(buf, M_RCTL); return (error); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { struct rctl_rule *rule; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_ADD_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &rule); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } /* * The 'per' part of a rule is optional. */ if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED && rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) rule->rr_per = rule->rr_subject_type; if (!rctl_rule_fully_specified(rule)) { error = EINVAL; goto out; } error = rctl_rule_add(rule); out: rctl_rule_release(rule); sx_sunlock(&allproc_lock); return (error); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { struct rctl_rule *filter; char *inputstr; int error; if (!racct_enable) return (ENOSYS); error = priv_check(td, PRIV_RCTL_REMOVE_RULE); if (error != 0) return (error); error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen); if (error != 0) return (error); sx_slock(&allproc_lock); error = rctl_string_to_rule(inputstr, &filter); free(inputstr, M_RCTL); if (error != 0) { sx_sunlock(&allproc_lock); return (error); } error = rctl_rule_remove(filter); rctl_rule_release(filter); sx_sunlock(&allproc_lock); return (error); } /* * Update RCTL rule list after credential change. */ void rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred) { LIST_HEAD(, rctl_rule_link) newrules; struct rctl_rule_link *link, *newlink; struct uidinfo *newuip; struct loginclass *newlc; struct prison_racct *newprr; int rulecnt, i; if (!racct_enable) return; PROC_LOCK_ASSERT(p, MA_NOTOWNED); newuip = newcred->cr_ruidinfo; newlc = newcred->cr_loginclass; newprr = newcred->cr_prison->pr_prison_racct; LIST_INIT(&newrules); again: /* * First, count the rules that apply to the process with new * credentials. */ rulecnt = 0; RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) rulecnt++; } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) rulecnt++; LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) rulecnt++; RACCT_UNLOCK(); /* * Create temporary list. We've dropped the rctl_lock in order * to use M_WAITOK. */ for (i = 0; i < rulecnt; i++) { newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK); newlink->rrl_rule = NULL; newlink->rrl_exceeded = 0; LIST_INSERT_HEAD(&newrules, newlink, rrl_next); } newlink = LIST_FIRST(&newrules); /* * Assign rules to the newly allocated list entries. 
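* (If the preallocated links run out, or rulecnt does not reach zero, the rule set changed while the lock was dropped; the code below then jumps to "goaround", frees the temporary list, and retries.)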
*/ RACCT_LOCK(); LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } } LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) { if (newlink == NULL) goto goaround; rctl_rule_acquire(link->rrl_rule); newlink->rrl_rule = link->rrl_rule; newlink->rrl_exceeded = link->rrl_exceeded; newlink = LIST_NEXT(newlink, rrl_next); rulecnt--; } if (rulecnt == 0) { /* * Free the old rule list. */ while (!LIST_EMPTY(&p->p_racct->r_rule_links)) { link = LIST_FIRST(&p->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } /* * Replace lists and we're done. * * XXX: Is there any way to switch list heads instead * of iterating here? */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); LIST_INSERT_HEAD(&p->p_racct->r_rule_links, newlink, rrl_next); } RACCT_UNLOCK(); return; } goaround: RACCT_UNLOCK(); /* * Rule list changed while we were not holding the rctl_lock. * Free the new list and try again. */ while (!LIST_EMPTY(&newrules)) { newlink = LIST_FIRST(&newrules); LIST_REMOVE(newlink, rrl_next); if (newlink->rrl_rule != NULL) rctl_rule_release(newlink->rrl_rule); uma_zfree(rctl_rule_link_zone, newlink); } goto again; } /* * Assign RCTL rules to the newly created process. */ int rctl_proc_fork(struct proc *parent, struct proc *child) { struct rctl_rule *rule; struct rctl_rule_link *link; int error; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent)); LIST_INIT(&child->p_racct->r_rule_links); /* * Go through limits applicable to the parent and assign them * to the child. Rules with 'process' subject have to be duplicated * in order to make their rr_subject point to the new process. */ LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) { if (link->rrl_rule->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS) { rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT); if (rule == NULL) goto fail; KASSERT(rule->rr_subject.rs_proc == parent, ("rule->rr_subject.rs_proc != parent")); rule->rr_subject.rs_proc = child; error = rctl_racct_add_rule_locked(child->p_racct, rule); rctl_rule_release(rule); if (error != 0) goto fail; } else { error = rctl_racct_add_rule_locked(child->p_racct, link->rrl_rule); if (error != 0) goto fail; } } return (0); fail: while (!LIST_EMPTY(&child->p_racct->r_rule_links)) { link = LIST_FIRST(&child->p_racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } return (EAGAIN); } /* * Release rules attached to the racct. 
*/ void rctl_racct_release(struct racct *racct) { struct rctl_rule_link *link; ASSERT_RACCT_ENABLED(); RACCT_LOCK_ASSERT(); while (!LIST_EMPTY(&racct->r_rule_links)) { link = LIST_FIRST(&racct->r_rule_links); LIST_REMOVE(link, rrl_next); rctl_rule_release(link->rrl_rule); uma_zfree(rctl_rule_link_zone, link); } } static void rctl_init(void) { if (!racct_enable) return; rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); rctl_rule_link_zone = uma_zcreate("rctl_rule_link", sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Set default values, making sure not to overwrite the ones * fetched from tunables. Most of those could be set at the * declaration, except for the rctl_throttle_max - we cannot * set it there due to hz not being compile time constant. */ if (rctl_throttle_min < 1) rctl_throttle_min = 1; if (rctl_throttle_max < rctl_throttle_min) rctl_throttle_max = 2 * hz; if (rctl_throttle_pct < 0) rctl_throttle_pct = 100; if (rctl_throttle_pct2 < 0) rctl_throttle_pct2 = 100; } #else /* !RCTL */ int sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap) { return (ENOSYS); } int sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap) { return (ENOSYS); } int sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap) { return (ENOSYS); } int sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap) { return (ENOSYS); } int sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap) { return (ENOSYS); } #endif /* !RCTL */
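For illustration only (this note and the sketch below are editorial additions, not part of the source file): the rule and filter strings parsed by rctl_string_to_rule() have the form "subject:subject-id:resource:action=amount/per", and userland reaches the sys_rctl_* handlers above through the rctl_add_rule(2), rctl_get_racct(2) and related system calls declared in <sys/rctl.h>. A minimal userland sketch follows, assuming kern.racct.enable is set, the process has the required privilege, and using an arbitrary uid and limit; note that the in-kernel str2int64() accepts plain integers only, so human-readable suffixes such as "1g" must be expanded in userland (rctl(8) does this) before making the call.

#include <sys/types.h>
#include <sys/rctl.h>

#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* subject:subject-id:resource:action=amount[/per]; uid and amount are example values. */
	const char *rule = "user:1001:vmemoryuse:deny=1073741824";
	const char *filter = "user:1001";
	char outbuf[4096];

	/* inbuflen counts the terminating NUL, matching the copyinstr() in rctl_read_inbuf(). */
	if (rctl_add_rule(rule, strlen(rule) + 1, NULL, 0) != 0)
		err(1, "rctl_add_rule");

	/* Dump the current racct counters for the same subject. */
	if (rctl_get_racct(filter, strlen(filter) + 1, outbuf, sizeof(outbuf)) != 0)
		err(1, "rctl_get_racct");
	printf("%s\n", outbuf);

	return (0);
}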