diff --git a/lib/libnvmf/libnvmf.h b/lib/libnvmf/libnvmf.h
--- a/lib/libnvmf/libnvmf.h
+++ b/lib/libnvmf/libnvmf.h
@@ -342,7 +342,8 @@
  */
 int	nvmf_handoff_host(const struct nvme_discovery_log_entry *dle,
     const char *hostnqn, struct nvmf_qpair *admin_qp, u_int num_queues,
-    struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata);
+    struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata,
+    uint32_t reconnect_delay, uint32_t controller_loss_timeout);
 
 /*
  * Disconnect an active host association previously handed off to the
@@ -370,7 +371,8 @@
  */
 int	nvmf_reconnect_host(int fd, const struct nvme_discovery_log_entry *dle,
     const char *hostnqn, struct nvmf_qpair *admin_qp, u_int num_queues,
-    struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata);
+    struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata,
+    uint32_t reconnect_delay, uint32_t controller_loss_timeout);
 
 /*
  * Fetch connection status from an existing kernel host.
diff --git a/lib/libnvmf/nvmf_host.c b/lib/libnvmf/nvmf_host.c
--- a/lib/libnvmf/nvmf_host.c
+++ b/lib/libnvmf/nvmf_host.c
@@ -792,7 +792,8 @@
 prepare_queues_for_handoff(struct nvmf_ioc_nv *nv,
     const struct nvme_discovery_log_entry *dle, const char *hostnqn,
     struct nvmf_qpair *admin_qp, u_int num_queues,
-    struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata)
+    struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata,
+    uint32_t reconnect_delay, uint32_t controller_loss_timeout)
 {
 	const struct nvmf_association *na = admin_qp->nq_association;
 	nvlist_t *nvl, *nvl_qp, *nvl_rparams;
@@ -820,6 +821,9 @@
 	nvlist_add_string(nvl_rparams, "hostnqn", hostnqn);
 	nvlist_add_number(nvl_rparams, "num_io_queues", num_queues);
 	nvlist_add_number(nvl_rparams, "kato", admin_qp->nq_kato);
+	nvlist_add_number(nvl_rparams, "reconnect_delay", reconnect_delay);
+	nvlist_add_number(nvl_rparams, "controller_loss_timeout",
+	    controller_loss_timeout);
 	nvlist_add_number(nvl_rparams, "io_qsize", io_queues[0]->nq_qsize);
 	nvlist_add_bool(nvl_rparams, "sq_flow_control",
 	    na->na_params.sq_flow_control);
@@ -842,6 +846,9 @@
 	nvl = nvlist_create(0);
 	nvlist_add_number(nvl, "trtype", na->na_trtype);
 	nvlist_add_number(nvl, "kato", admin_qp->nq_kato);
+	nvlist_add_number(nvl, "reconnect_delay", reconnect_delay);
+	nvlist_add_number(nvl, "controller_loss_timeout",
+	    controller_loss_timeout);
 	nvlist_move_nvlist(nvl, "rparams", nvl_rparams);
 
 	/* First, the admin queue. */
@@ -872,7 +879,8 @@
 int
 nvmf_handoff_host(const struct nvme_discovery_log_entry *dle,
     const char *hostnqn, struct nvmf_qpair *admin_qp, u_int num_queues,
-    struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata)
+    struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata,
+    uint32_t reconnect_delay, uint32_t controller_loss_timeout)
 {
 	struct nvmf_ioc_nv nv;
 	u_int i;
@@ -885,7 +893,8 @@
 	}
 
 	error = prepare_queues_for_handoff(&nv, dle, hostnqn, admin_qp,
-	    num_queues, io_queues, cdata);
+	    num_queues, io_queues, cdata, reconnect_delay,
+	    controller_loss_timeout);
 	if (error != 0)
 		goto out;
 
@@ -981,14 +990,16 @@
 int
 nvmf_reconnect_host(int fd, const struct nvme_discovery_log_entry *dle,
     const char *hostnqn, struct nvmf_qpair *admin_qp, u_int num_queues,
-    struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata)
+    struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata,
+    uint32_t reconnect_delay, uint32_t controller_loss_timeout)
 {
 	struct nvmf_ioc_nv nv;
 	u_int i;
 	int error;
 
 	error = prepare_queues_for_handoff(&nv, dle, hostnqn, admin_qp,
-	    num_queues, io_queues, cdata);
+	    num_queues, io_queues, cdata, reconnect_delay,
+	    controller_loss_timeout);
 	if (error != 0)
 		goto out;
 
diff --git a/sbin/devd/Makefile b/sbin/devd/Makefile
--- a/sbin/devd/Makefile
+++ b/sbin/devd/Makefile
@@ -46,6 +46,11 @@
 HYPERVPACKAGE=	hyperv-tools
 .endif
 
+CONFGROUPS+=	NVME
+NVMEDIR=	${DEVDDIR}
+NVME+=		nvmf.conf
+NVMEPACKAGE=	nvme-tools
+
 .if ${MK_USB} != "no"
 DEVD+=	uath.conf ulpt.conf
 .endif
diff --git a/sbin/devd/devd.conf.5 b/sbin/devd/devd.conf.5
--- a/sbin/devd/devd.conf.5
+++ b/sbin/devd/devd.conf.5
@@ -38,7 +38,7 @@
 .\" ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 .\" SOFTWARE.
 .\"
-.Dd July 8, 2025
+.Dd July 9, 2025
 .Dt DEVD.CONF 5
 .Os
 .Sh NAME
@@ -517,6 +517,8 @@
 representing the start of a controller reset, the successful completion of a
 controller reset, or a timeout while waiting for the controller to reset,
 respectively.
+.It Li nvme Ta Li controller Ta Li RECONNECT Ta
+An NVMe over Fabrics host has disconnected and is requesting a reconnect.
 .El
 .Pp
 .Bl -column "SYSTEM" "SUBSYSTEM" "SHUTDOWN-THRESHOLD" -compact
diff --git a/sbin/devd/nvmf.conf b/sbin/devd/nvmf.conf
new file mode 100644
--- /dev/null
+++ b/sbin/devd/nvmf.conf
@@ -0,0 +1,7 @@
+# Attempt to reconnect NVMeoF host devices when requested
+notify 100 {
+	match "system"		"nvme";
+	match "subsystem"	"controller";
+	match "type"		"RECONNECT";
+	action "nvmecontrol reconnect $name";
+};
diff --git a/sbin/nvmecontrol/connect.c b/sbin/nvmecontrol/connect.c
--- a/sbin/nvmecontrol/connect.c
+++ b/sbin/nvmecontrol/connect.c
@@ -31,6 +31,8 @@
 	const char *subnqn;
 	const char *hostnqn;
 	uint32_t kato;
+	uint32_t reconnect_delay;
+	uint32_t controller_loss_timeout;
 	uint16_t num_io_queues;
 	uint16_t queue_size;
 	bool data_digests;
@@ -43,6 +45,8 @@
 	.subnqn = NULL,
 	.hostnqn = NULL,
 	.kato = NVMF_KATO_DEFAULT / 1000,
+	.reconnect_delay = NVMF_DEFAULT_RECONNECT_DELAY,
+	.controller_loss_timeout = NVMF_DEFAULT_CONTROLLER_LOSS,
 	.num_io_queues = 1,
 	.queue_size = 0,
 	.data_digests = false,
@@ -107,7 +111,7 @@
 	}
 
 	error = nvmf_handoff_host(dle, hostnqn, admin, opt.num_io_queues, io,
-	    &cdata);
+	    &cdata, opt.reconnect_delay, opt.controller_loss_timeout);
 	if (error != 0) {
 		warnc(error, "Failed to handoff queues to kernel");
 		free(io);
@@ -259,6 +263,11 @@
 	    "Number of entries in each I/O queue"),
 	OPT("keep-alive-tmo", 'k', arg_uint32, opt, kato,
 	    "Keep Alive timeout (in seconds)"),
+	OPT("reconnect-delay", 'r', arg_uint32, opt, reconnect_delay,
+	    "Delay between reconnect attempts after connection loss "
+	    "(in seconds)"),
+	OPT("ctrl-loss-tmo", 'l', arg_uint32, opt, controller_loss_timeout,
+	    "Controller loss timeout after connection loss (in seconds)"),
 	OPT("hostnqn", 'q', arg_string, opt, hostnqn,
 	    "Host NQN"),
 	OPT("flow_control", 'F', arg_none, opt, flow_control,
diff --git a/sbin/nvmecontrol/nvmecontrol.8 b/sbin/nvmecontrol/nvmecontrol.8
--- a/sbin/nvmecontrol/nvmecontrol.8
+++ b/sbin/nvmecontrol/nvmecontrol.8
@@ -33,7 +33,7 @@
 .\"
 .\" Author: Jim Harris
 .\"
-.Dd April 29, 2025
+.Dd July 9, 2025
 .Dt NVMECONTROL 8
 .Os
 .Sh NAME
@@ -216,6 +216,8 @@
 .Op Fl c Ar cntl-id
 .Op Fl i Ar queues
 .Op Fl k Ar seconds
+.Op Fl l Ar seconds
+.Op Fl r Ar seconds
 .Op Fl t Ar transport
 .Op Fl q Ar HostNQN
 .Op Fl Q Ar entries
@@ -226,6 +228,8 @@
 .Op Fl FGg
 .Op Fl i Ar queues
 .Op Fl k Ar seconds
+.Op Fl l Ar seconds
+.Op Fl r Ar seconds
 .Op Fl t Ar transport
 .Op Fl q Ar HostNQN
 .Op Fl Q Ar entries
@@ -241,6 +245,8 @@
 .Op Fl FGg
 .Op Fl i Ar queues
 .Op Fl k Ar seconds
+.Op Fl l Ar seconds
+.Op Fl r Ar seconds
 .Op Fl t Ar transport
 .Op Fl q Ar HostNQN
 .Op Fl Q Ar entries
@@ -786,6 +792,29 @@
 .It Fl k Ar seconds
 Keep Alive timer duration in seconds.
 The default is 120.
+.It Fl l Ar seconds
+Controller Loss timer duration in seconds.
+The default is 600.
+.Pp
+This timer starts when an association is lost with a remote I/O controller
+and is cancelled when a new association is established.
+If the timer expires, the controller device is deleted.
+A setting of zero disables this timer.
+.It Fl r Ar seconds
+Reconnect timer duration in seconds.
+The default is 10.
+.Pp
+When an association is lost with a remote I/O controller,
+the controller device will request reconnection via periodic
+.Xr devctl 4
+notifications until either a new association is established or the controller
+device is deleted.
+This timer sets the interval between each
+.Xr devctl 4
+notification.
+Note that the first notification is triggered immediately after an association
+is lost.
+A setting of zero disables this timer.
 .It Fl t Ar transport
 Transport to use.
 The default is
diff --git a/sbin/nvmecontrol/reconnect.c b/sbin/nvmecontrol/reconnect.c
--- a/sbin/nvmecontrol/reconnect.c
+++ b/sbin/nvmecontrol/reconnect.c
@@ -27,6 +27,8 @@
 	const char *transport;
 	const char *hostnqn;
 	uint32_t kato;
+	uint32_t reconnect_delay;
+	uint32_t controller_loss_timeout;
 	uint16_t num_io_queues;
 	uint16_t queue_size;
 	bool data_digests;
@@ -37,6 +39,8 @@
 	.transport = "tcp",
 	.hostnqn = NULL,
 	.kato = NVMF_KATO_DEFAULT / 1000,
+	.reconnect_delay = NVMF_DEFAULT_RECONNECT_DELAY,
+	.controller_loss_timeout = NVMF_DEFAULT_CONTROLLER_LOSS,
 	.num_io_queues = 1,
 	.queue_size = 0,
 	.data_digests = false,
@@ -59,6 +63,7 @@
 reconnect_nvm_controller(int fd, const struct nvmf_association_params *aparams,
     enum nvmf_trtype trtype, int adrfam, const char *address, const char *port,
     uint16_t cntlid, const char *subnqn, const char *hostnqn, uint32_t kato,
+    uint32_t reconnect_delay, uint32_t controller_loss_timeout,
     u_int num_io_queues, u_int queue_size,
     const struct nvme_discovery_log_entry *dle)
 {
@@ -88,7 +93,7 @@
 	}
 
 	error = nvmf_reconnect_host(fd, dle, hostnqn, admin, num_io_queues, io,
-	    &cdata);
+	    &cdata, reconnect_delay, controller_loss_timeout);
 	if (error != 0) {
 		warnc(error, "Failed to handoff queues to kernel");
 		free(io);
@@ -137,7 +142,8 @@
 
 	error = reconnect_nvm_controller(fd, &aparams, trtype, AF_UNSPEC,
 	    address, port, le16toh(dle->cntlid), subnqn, hostnqn,
-	    opt.kato * 1000, opt.num_io_queues, opt.queue_size, NULL);
+	    opt.kato * 1000, opt.reconnect_delay, opt.controller_loss_timeout,
+	    opt.num_io_queues, opt.queue_size, NULL);
 	free(subnqn);
 	free(tofree);
 	return (error);
@@ -196,6 +202,8 @@
 	    address, port, le16toh(dle->cntlid), dle->subnqn,
 	    nvlist_get_string(rparams, "hostnqn"),
 	    dnvlist_get_number(rparams, "kato", 0),
+	    dnvlist_get_number(rparams, "reconnect_delay", 0),
+	    dnvlist_get_number(rparams, "controller_loss_timeout", 0),
 	    nvlist_get_number(rparams, "num_io_queues"),
 	    nvlist_get_number(rparams, "io_qsize"), dle);
 	free(subnqn);
@@ -291,6 +299,11 @@
 	    "Number of entries in each I/O queue"),
 	OPT("keep-alive-tmo", 'k', arg_uint32, opt, kato,
 	    "Keep Alive timeout (in seconds)"),
+	OPT("reconnect-delay", 'r', arg_uint32, opt, reconnect_delay,
+	    "Delay between reconnect attempts after connection loss "
+	    "(in seconds)"),
+	OPT("ctrl-loss-tmo", 'l', arg_uint32, opt, controller_loss_timeout,
+	    "Controller loss timeout after connection loss (in seconds)"),
 	OPT("hostnqn", 'q', arg_string, opt, hostnqn,
 	    "Host NQN"),
 	OPT("flow_control", 'F', arg_none, opt, flow_control,
diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c
--- a/sys/dev/nvmf/host/nvmf.c
+++ b/sys/dev/nvmf/host/nvmf.c
@@ -27,6 +27,7 @@
 #include
 
 static struct cdevsw nvmf_cdevsw;
+static struct taskqueue *nvmf_tq;
 
 bool nvmf_fail_disconnect = false;
 SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN,
@@ -34,7 +35,10 @@
 
 MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
 
+static void	nvmf_controller_loss_task(void *arg, int pending);
 static void	nvmf_disconnect_task(void *arg, int pending);
+static void	nvmf_request_reconnect(struct nvmf_softc *sc);
+static void	nvmf_request_reconnect_task(void *arg, int pending);
 static void	nvmf_shutdown_pre_sync(void *arg, int howto);
 static void	nvmf_shutdown_post_sync(void *arg, int howto);
@@ -294,6 +298,9 @@
 	admin = nvlist_get_nvlist(nvl, "admin");
 	io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues);
 	kato = dnvlist_get_number(nvl, "kato", 0);
+	sc->reconnect_delay = dnvlist_get_number(nvl, "reconnect_delay", 0);
+	sc->controller_loss_timeout = dnvlist_get_number(nvl,
+	    "controller_loss_timeout", 0);
 
 	/* Setup the admin queue. */
 	sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0);
@@ -504,6 +511,10 @@
 	callout_init(&sc->ka_tx_timer, 1);
 	sx_init(&sc->connection_lock, "nvmf connection");
 	TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
+	TIMEOUT_TASK_INIT(nvmf_tq, &sc->controller_loss_task, 0,
+	    nvmf_controller_loss_task, sc);
+	TIMEOUT_TASK_INIT(nvmf_tq, &sc->request_reconnect_task, 0,
+	    nvmf_request_reconnect_task, sc);
 
 	oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev),
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq",
@@ -603,7 +614,9 @@
 
 	nvmf_destroy_aer(sc);
 
-	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+	taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
+	taskqueue_drain_timeout(nvmf_tq, &sc->controller_loss_task);
+	taskqueue_drain(nvmf_tq, &sc->disconnect_task);
 	sx_destroy(&sc->connection_lock);
 	nvlist_destroy(sc->rparams);
 	free(sc->cdata, M_NVMF);
@@ -613,7 +626,7 @@
 void
 nvmf_disconnect(struct nvmf_softc *sc)
 {
-	taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
+	taskqueue_enqueue(nvmf_tq, &sc->disconnect_task);
 }
 
 static void
@@ -676,6 +689,74 @@
 	nvmf_destroy_qp(sc->admin);
 	sc->admin = NULL;
 
+	if (sc->reconnect_delay != 0)
+		nvmf_request_reconnect(sc);
+	if (sc->controller_loss_timeout != 0)
+		taskqueue_enqueue_timeout(nvmf_tq,
+		    &sc->controller_loss_task, sc->controller_loss_timeout *
+		    hz);
+
+	sx_xunlock(&sc->connection_lock);
+}
+
+static void
+nvmf_controller_loss_task(void *arg, int pending)
+{
+	struct nvmf_softc *sc = arg;
+	device_t dev;
+	int error;
+
+	bus_topo_lock();
+	sx_xlock(&sc->connection_lock);
+	if (sc->admin != NULL || sc->detaching) {
+		/* Reconnected or already detaching. */
+		sx_xunlock(&sc->connection_lock);
+		bus_topo_unlock();
+		return;
+	}
+
+	sc->controller_timedout = true;
+	sx_xunlock(&sc->connection_lock);
+
+	/*
+	 * XXX: Doing this from here is a bit ugly.  We don't have an
+	 * extra reference on `dev` but bus_topo_lock should block any
+	 * concurrent device_delete_child invocations.
+	 */
+	dev = sc->dev;
+	error = device_delete_child(root_bus, dev);
+	if (error != 0)
+		device_printf(dev,
+		    "failed to detach after controller loss: %d\n", error);
+	bus_topo_unlock();
+}
+
+static void
+nvmf_request_reconnect(struct nvmf_softc *sc)
+{
+	char buf[64];
+
+	sx_assert(&sc->connection_lock, SX_LOCKED);
+
+	snprintf(buf, sizeof(buf), "name=\"%s\"", device_get_nameunit(sc->dev));
+	devctl_notify("nvme", "controller", "RECONNECT", buf);
+	taskqueue_enqueue_timeout(nvmf_tq, &sc->request_reconnect_task,
+	    sc->reconnect_delay * hz);
+}
+
+static void
+nvmf_request_reconnect_task(void *arg, int pending)
+{
+	struct nvmf_softc *sc = arg;
+
+	sx_xlock(&sc->connection_lock);
+	if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
+		/* Reconnected or already detaching. */
+		sx_xunlock(&sc->connection_lock);
+		return;
+	}
+
+	nvmf_request_reconnect(sc);
 	sx_xunlock(&sc->connection_lock);
 }
 
@@ -699,7 +780,7 @@
 	}
 
 	sx_xlock(&sc->connection_lock);
-	if (sc->admin != NULL || sc->detaching) {
+	if (sc->admin != NULL || sc->detaching || sc->controller_timedout) {
 		error = EBUSY;
 		goto out;
 	}
@@ -745,6 +826,9 @@
 	nvmf_reconnect_sim(sc);
 
 	nvmf_rescan_all_ns(sc);
+
+	taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task, NULL);
+	taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task, NULL);
 out:
 	sx_xunlock(&sc->connection_lock);
 	nvlist_destroy(nvl);
@@ -852,7 +936,21 @@
 	}
 	free(sc->io, M_NVMF);
 
-	taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
+	taskqueue_drain(nvmf_tq, &sc->disconnect_task);
+	if (taskqueue_cancel_timeout(nvmf_tq, &sc->request_reconnect_task,
+	    NULL) != 0)
+		taskqueue_drain_timeout(nvmf_tq, &sc->request_reconnect_task);
+
+	/*
+	 * Don't cancel/drain the controller loss task if that task
+	 * has fired and is triggering the detach.
+	 */
+	if (!sc->controller_timedout) {
+		if (taskqueue_cancel_timeout(nvmf_tq, &sc->controller_loss_task,
+		    NULL) != 0)
+			taskqueue_drain_timeout(nvmf_tq,
+			    &sc->controller_loss_task);
+	}
 
 	if (sc->admin != NULL)
 		nvmf_destroy_qp(sc->admin);
@@ -1154,14 +1252,25 @@
 static int
 nvmf_modevent(module_t mod, int what, void *arg)
 {
+	int error;
+
 	switch (what) {
 	case MOD_LOAD:
-		return (nvmf_ctl_load());
+		error = nvmf_ctl_load();
+		if (error != 0)
+			return (error);
+
+		nvmf_tq = taskqueue_create("nvmf", M_WAITOK | M_ZERO,
+		    taskqueue_thread_enqueue, &nvmf_tq);
+		taskqueue_start_threads(&nvmf_tq, 1, PWAIT, "nvmf taskq");
+		return (0);
 	case MOD_QUIESCE:
 		return (0);
 	case MOD_UNLOAD:
 		nvmf_ctl_unload();
 		destroy_dev_drain(&nvmf_cdevsw);
+		if (nvmf_tq != NULL)
+			taskqueue_free(nvmf_tq);
 		return (0);
 	default:
 		return (EOPNOTSUPP);
diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h
--- a/sys/dev/nvmf/host/nvmf_var.h
+++ b/sys/dev/nvmf/host/nvmf_var.h
@@ -75,9 +75,15 @@
 	struct callout ka_rx_timer;
 	sbintime_t ka_rx_sbt;
 
+	struct timeout_task request_reconnect_task;
+	struct timeout_task controller_loss_task;
+	uint32_t reconnect_delay;
+	uint32_t controller_loss_timeout;
+
 	struct sx connection_lock;
 	struct task disconnect_task;
 	bool detaching;
+	bool controller_timedout;
 
 	u_int num_aer;
 	struct nvmf_aer *aer;
diff --git a/sys/dev/nvmf/nvmf.h b/sys/dev/nvmf/nvmf.h
--- a/sys/dev/nvmf/nvmf.h
+++ b/sys/dev/nvmf/nvmf.h
@@ -26,6 +26,13 @@
 
 #define	NVMF_NN			(1024)
 
+/*
+ * Default timeouts for Fabrics hosts.  These match values used by
+ * Linux.
+ */
+#define	NVMF_DEFAULT_RECONNECT_DELAY	10
+#define	NVMF_DEFAULT_CONTROLLER_LOSS	600
+
 /*
  * (data, size) is the userspace buffer for a packed nvlist.
  *
@@ -68,6 +75,8 @@
  *
  * number		trtype
  * number		kato	(optional)
+ * number		reconnect_delay	(optional)
+ * number		controller_loss_timeout	(optional)
  * qpair handoff nvlist	admin
  * qpair handoff nvlist	array io
  * binary		cdata	struct nvme_controller_data
@@ -81,6 +90,8 @@
 * string		hostnqn
 * number		num_io_queues
 * number		kato	(optional)
+ * number		reconnect_delay	(optional)
+ * number		controller_loss_timeout	(optional)
 * number		io_qsize
 * bool			sq_flow_control
 *
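Usage sketch (reviewer illustration, not part of the patch): the commands below exercise the new -r and -l options as documented in the nvmecontrol.8 changes above. The transport address and subsystem NQN are hypothetical placeholders, and the address[:port] argument form follows the existing connect synopsis.

	# Connect with a 5 second reconnect interval and a 300 second
	# controller loss timeout (the defaults are 10 and 600 seconds):
	nvmecontrol connect -t tcp -r 5 -l 300 192.0.2.1:4420 \
	    nqn.2014-08.org.example:controller0

	# A value of zero disables the corresponding timer, e.g. never
	# delete the controller device after an association is lost:
	nvmecontrol connect -t tcp -l 0 192.0.2.1:4420 \
	    nqn.2014-08.org.example:controller0

After an association drops, the kernel posts a devctl RECONNECT notification immediately and then once per reconnect_delay interval; the sbin/devd/nvmf.conf rule above turns each notification into "nvmecontrol reconnect $name" until either a reconnect succeeds or the controller loss timer fires and deletes the device.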