diff --git a/sys/dev/nvmf/nvmf_transport.h b/sys/dev/nvmf/nvmf_transport.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/nvmf_transport.h
@@ -0,0 +1,140 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin
+ */
+
+#ifndef __NVMF_TRANSPORT_H__
+#define	__NVMF_TRANSPORT_H__
+
+/*
+ * Interface used by the Fabrics host (initiator) and controller
+ * (target) to send and receive capsules and associated data.
+ */
+
+#include <sys/sysctl.h>
+#include <dev/nvmf/nvmf_proto.h>
+
+struct mbuf;
+struct memdesc;
+struct nvmf_capsule;
+struct nvmf_connection;
+struct nvmf_qpair;
+struct nvmf_handoff_qpair_params;
+
+SYSCTL_DECL(_kern_nvmf);
+
+/*
+ * Callback to invoke when an error occurs on a qpair.  The last
+ * parameter is an error value.  If the error value is zero, the qpair
+ * has been closed at the transport level rather than a transport
+ * error occurring.
+ */
+typedef void nvmf_qpair_error_t(void *, int);
+
+/* Callback to invoke when a capsule is received. */
+typedef void nvmf_capsule_receive_t(void *, struct nvmf_capsule *);
+
+/*
+ * Callback to invoke when an I/O request has completed.  The second
+ * parameter is the amount of data transferred.  The last parameter is
+ * an error value which is non-zero if the request did not complete
+ * successfully.  A request with an error may complete partially.
+ */
+typedef void nvmf_io_complete_t(void *, size_t, int);
+
+/*
+ * A queue pair represents either an Admin or I/O
+ * submission/completion queue pair.  The params contains negotiated
+ * values passed in from userland.
+ *
+ * Unlike libnvmf in userland, the kernel transport interface does not
+ * have any notion of an association.  Instead, qpairs are
+ * independent.
+ */
+struct nvmf_qpair *nvmf_allocate_qpair(enum nvmf_trtype trtype,
+    bool controller, const struct nvmf_handoff_qpair_params *params,
+    nvmf_qpair_error_t *error_cb, void *error_cb_arg,
+    nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg);
+void nvmf_free_qpair(struct nvmf_qpair *qp);
+
+/*
+ * Capsules are either commands (host -> controller) or responses
+ * (controller -> host).  A data buffer may be associated with a
+ * command capsule.  Transmitted data is not copied by this API but
+ * instead must be preserved until the completion callback is invoked
+ * to indicate capsule transmission has completed.
+ */
+struct nvmf_capsule *nvmf_allocate_command(struct nvmf_qpair *qp,
+    const void *sqe, int how);
+struct nvmf_capsule *nvmf_allocate_response(struct nvmf_qpair *qp,
+    const void *cqe, int how);
+void nvmf_free_capsule(struct nvmf_capsule *nc);
+int nvmf_capsule_append_data(struct nvmf_capsule *nc,
+    struct memdesc *mem, size_t len, bool send,
+    nvmf_io_complete_t *complete_cb, void *cb_arg);
+int nvmf_transmit_capsule(struct nvmf_capsule *nc);
+void nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error);
+void *nvmf_capsule_sqe(struct nvmf_capsule *nc);
+void *nvmf_capsule_cqe(struct nvmf_capsule *nc);
+
+/* Controller-specific APIs. */
+
+/*
+ * A controller calls this function to check for any
+ * transport-specific errors (invalid fields) in a received command
+ * capsule.  The callback returns a generic command status value:
+ * NVME_SC_SUCCESS if no error is found.
+ */
+uint8_t nvmf_validate_command_capsule(struct nvmf_capsule *nc);
+
+/*
+ * A controller calls this function to query the amount of data
+ * associated with a command capsule.
+ */
+size_t nvmf_capsule_data_len(const struct nvmf_capsule *nc);
+
+/*
+ * A controller calls this function to receive data associated with a
+ * command capsule (e.g. the data for a WRITE command).  This can
+ * either return in-capsule data or fetch data from the host
+ * (e.g. using a R2T PDU over TCP).  The received command capsule
+ * should be passed in 'nc'.  The received data is stored in 'mem'.
+ * If this function returns success, then the callback will be invoked
+ * once the operation has completed.  Note that the callback might be
+ * invoked before this function returns.
+ */
+int nvmf_receive_controller_data(struct nvmf_capsule *nc,
+    uint32_t data_offset, struct memdesc *mem, size_t len,
+    nvmf_io_complete_t *complete_cb, void *cb_arg);
+
+/*
+ * A controller calls this function to send data in response to a
+ * command prior to sending a response capsule.  If an error occurs,
+ * the function returns a generic status completion code to be sent in
+ * the following CQE.  Note that the transfer might send a subset of
+ * the data requested by nc.  If the transfer succeeds, this function
+ * can return one of the following values:
+ *
+ * - NVME_SC_SUCCESS: The transfer has completed successfully and the
+ *   caller should send a success CQE in a response capsule.
+ *
+ * - NVMF_SUCCESS_SENT: The transfer has completed successfully and
+ *   the transport layer has sent an implicit success CQE to the
+ *   remote host (e.g. the SUCCESS flag for TCP).  The caller should
+ *   not send a response capsule.
+ *
+ * - NVMF_MORE: The transfer has completed successfully, but the
+ *   transfer did not complete the data buffer.
+ *
+ * The mbuf chain in 'm' is consumed by this function even if an error
+ * is returned.
+ */
+u_int nvmf_send_controller_data(struct nvmf_capsule *nc,
+    uint32_t data_offset, struct mbuf *m, size_t len);
+
+#define	NVMF_SUCCESS_SENT	0x100
+#define	NVMF_MORE		0x101
+
+#endif /* !__NVMF_TRANSPORT_H__ */
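(Review note: a minimal host-side sketch of how the capsule API above is meant to be driven. This is commentary, not part of the patch; the example_* helpers, the buffer arguments, and the error handling are hypothetical, and memdesc_vaddr() from <sys/memdesc.h> is assumed for wrapping a kernel virtual buffer.)

static void
example_qpair_error(void *arg, int error)
{
	/* error == 0 means the qpair was closed at the transport level. */
}

static void
example_receive(void *arg, struct nvmf_capsule *nc)
{
	struct nvme_completion *cqe = nvmf_capsule_cqe(nc);

	/* Match cqe->cid against the outstanding command, then release. */
	nvmf_free_capsule(nc);
}

static void
example_write_complete(void *arg, size_t xfered, int error)
{
	/* The command's data has been transmitted (or the I/O failed). */
}

/* Queue a WRITE-style command carrying 'len' bytes from 'buf'. */
static int
example_submit_write(struct nvmf_qpair *qp, const struct nvme_command *sqe,
    void *buf, size_t len)
{
	struct nvmf_capsule *nc;
	struct memdesc mem;
	int error;

	nc = nvmf_allocate_command(qp, sqe, M_WAITOK);
	if (nc == NULL)
		return (ENOMEM);

	/* 'buf' is not copied; it must stay valid until the callback runs. */
	mem = memdesc_vaddr(buf, len);
	error = nvmf_capsule_append_data(nc, &mem, len, true,
	    example_write_complete, NULL);
	if (error == 0)
		error = nvmf_transmit_capsule(nc);
	if (error != 0)
		nvmf_free_capsule(nc);
	return (error);
}

The qpair itself would come from nvmf_allocate_qpair(NVMF_TRTYPE_TCP, false, params, example_qpair_error, NULL, example_receive, NULL), with 'params' handed off from userland.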
diff --git a/sys/dev/nvmf/nvmf_transport.c b/sys/dev/nvmf/nvmf_transport.c
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/nvmf_transport.c
@@ -0,0 +1,344 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/memdesc.h>
+#include <sys/module.h>
+#include <sys/refcount.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <dev/nvme/nvme.h>
+#include <dev/nvmf/nvmf.h>
+#include <dev/nvmf/nvmf_transport.h>
+#include <dev/nvmf/nvmf_transport_internal.h>
+
+/* Transport-independent support for fabrics queue pairs and commands. */
+
+struct nvmf_transport {
+	struct nvmf_transport_ops *nt_ops;
+
+	volatile u_int nt_active_qpairs;
+	SLIST_ENTRY(nvmf_transport) nt_link;
+};
+
+/* nvmf_transports[nvmf_trtype] is sorted by priority */
+static SLIST_HEAD(, nvmf_transport) nvmf_transports[NVMF_TRTYPE_TCP + 1];
+static struct sx nvmf_transports_lock;
+
+static MALLOC_DEFINE(M_NVMF_TRANSPORT, "nvmf_xport",
+    "NVMe over Fabrics transport");
+
+SYSCTL_NODE(_kern, OID_AUTO, nvmf, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+    "NVMe over Fabrics");
+
+static bool
+nvmf_supported_trtype(enum nvmf_trtype trtype)
+{
+	return (trtype < nitems(nvmf_transports));
+}
+
+struct nvmf_qpair *
+nvmf_allocate_qpair(enum nvmf_trtype trtype, bool controller,
+    const struct nvmf_handoff_qpair_params *params,
+    nvmf_qpair_error_t *error_cb, void *error_cb_arg,
+    nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg)
+{
+	struct nvmf_transport *nt;
+	struct nvmf_qpair *qp;
+
+	if (!nvmf_supported_trtype(trtype))
+		return (NULL);
+
+	qp = NULL;
+	sx_slock(&nvmf_transports_lock);
+	SLIST_FOREACH(nt, &nvmf_transports[trtype], nt_link) {
+		qp = nt->nt_ops->allocate_qpair(controller, params);
+		if (qp != NULL) {
+			refcount_acquire(&nt->nt_active_qpairs);
+			break;
+		}
+	}
+	sx_sunlock(&nvmf_transports_lock);
+	if (qp == NULL)
+		return (NULL);
+
+	qp->nq_transport = nt;
+	qp->nq_ops = nt->nt_ops;
+	qp->nq_controller = controller;
+	qp->nq_error = error_cb;
+	qp->nq_error_arg = error_cb_arg;
+	qp->nq_receive = receive_cb;
+	qp->nq_receive_arg = receive_cb_arg;
+	qp->nq_admin = params->admin;
+	return (qp);
+}
+
+void
+nvmf_free_qpair(struct nvmf_qpair *qp)
+{
+	struct nvmf_transport *nt;
+
+	nt = qp->nq_transport;
+	qp->nq_ops->free_qpair(qp);
+	if (refcount_release(&nt->nt_active_qpairs))
+		wakeup(nt);
+}
+
+struct nvmf_capsule *
+nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe, int how)
+{
+	struct nvmf_capsule *nc;
+
+	KASSERT(how == M_WAITOK || how == M_NOWAIT,
+	    ("%s: invalid how", __func__));
+	nc = qp->nq_ops->allocate_capsule(qp, how);
+	if (nc == NULL)
+		return (NULL);
+
+	nc->nc_qpair = qp;
+	nc->nc_qe_len = sizeof(struct nvme_command);
+	memcpy(&nc->nc_sqe, sqe, nc->nc_qe_len);
+
+	/* 4.2 of NVMe base spec: Fabrics always uses SGL. */
+	nc->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT);
+	nc->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL);
+	return (nc);
+}
+
+struct nvmf_capsule *
+nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe, int how)
+{
+	struct nvmf_capsule *nc;
+
+	KASSERT(how == M_WAITOK || how == M_NOWAIT,
+	    ("%s: invalid how", __func__));
+	nc = qp->nq_ops->allocate_capsule(qp, how);
+	if (nc == NULL)
+		return (NULL);
+
+	nc->nc_qpair = qp;
+	nc->nc_qe_len = sizeof(struct nvme_completion);
+	memcpy(&nc->nc_cqe, cqe, nc->nc_qe_len);
+	return (nc);
+}
+
+int
+nvmf_capsule_append_data(struct nvmf_capsule *nc, struct memdesc *mem,
+    size_t len, bool send, nvmf_io_complete_t *complete_cb,
+    void *cb_arg)
+{
+	if (nc->nc_data.io_len != 0)
+		return (EBUSY);
+
+	nc->nc_send_data = send;
+	nc->nc_data.io_mem = *mem;
+	nc->nc_data.io_len = len;
+	nc->nc_data.io_complete = complete_cb;
+	nc->nc_data.io_complete_arg = cb_arg;
+	return (0);
+}
+
+void
+nvmf_free_capsule(struct nvmf_capsule *nc)
+{
+	nc->nc_qpair->nq_ops->free_capsule(nc);
+}
+
+int
+nvmf_transmit_capsule(struct nvmf_capsule *nc)
+{
+	return (nc->nc_qpair->nq_ops->transmit_capsule(nc));
+}
+
+void
+nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error)
+{
+	if (nc->nc_data.io_len != 0)
+		nvmf_complete_io_request(&nc->nc_data, 0, error);
+}
+
+void *
+nvmf_capsule_sqe(struct nvmf_capsule *nc)
+{
+	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
+	    ("%s: capsule %p is not a command capsule", __func__, nc));
+	return (&nc->nc_sqe);
+}
+
+void *
+nvmf_capsule_cqe(struct nvmf_capsule *nc)
+{
+	KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion),
+	    ("%s: capsule %p is not a response capsule", __func__, nc));
+	return (&nc->nc_cqe);
+}
+
+uint8_t
+nvmf_validate_command_capsule(struct nvmf_capsule *nc)
+{
+	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
+	    ("%s: capsule %p is not a command capsule", __func__, nc));
+
+	if (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) != NVME_PSDT_SGL)
+		return (NVME_SC_INVALID_FIELD);
+
+	return (nc->nc_qpair->nq_ops->validate_command_capsule(nc));
+}
+
+size_t
+nvmf_capsule_data_len(const struct nvmf_capsule *nc)
+{
+	return (nc->nc_qpair->nq_ops->capsule_data_len(nc));
+}
+
+int
+nvmf_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
+    struct memdesc *mem, size_t len, nvmf_io_complete_t *complete_cb,
+    void *cb_arg)
+{
+	struct nvmf_io_request io;
+
+	io.io_mem = *mem;
+	io.io_len = len;
+	io.io_complete = complete_cb;
+	io.io_complete_arg = cb_arg;
+	return (nc->nc_qpair->nq_ops->receive_controller_data(nc, data_offset,
+	    &io));
+}
+
+u_int
+nvmf_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
+    struct mbuf *m, size_t len)
+{
+	MPASS(m_length(m, NULL) == len);
+	return (nc->nc_qpair->nq_ops->send_controller_data(nc, data_offset, m,
+	    len));
+}
+
+int
+nvmf_transport_module_handler(struct module *mod, int what, void *arg)
+{
+	struct nvmf_transport_ops *ops = arg;
+	struct nvmf_transport *nt, *nt2, *prev;
+	int error;
+
+	switch (what) {
+	case MOD_LOAD:
+		if (!nvmf_supported_trtype(ops->trtype)) {
+			printf("NVMF: Unsupported transport %u\n",
+			    ops->trtype);
+			return (EINVAL);
+		}
+
+		nt = malloc(sizeof(*nt), M_NVMF_TRANSPORT, M_WAITOK | M_ZERO);
+		nt->nt_ops = arg;
+
+		sx_xlock(&nvmf_transports_lock);
+		if (SLIST_EMPTY(&nvmf_transports[ops->trtype])) {
+			SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype], nt,
+			    nt_link);
+		} else {
+			prev = NULL;
+			SLIST_FOREACH(nt2, &nvmf_transports[ops->trtype],
+			    nt_link) {
+				if (ops->priority > nt2->nt_ops->priority)
+					break;
+				prev = nt2;
+			}
+			if (prev == NULL)
+				SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype],
+				    nt, nt_link);
+			else
+				SLIST_INSERT_AFTER(prev, nt, nt_link);
+		}
+		sx_xunlock(&nvmf_transports_lock);
+		return (0);
+
+	case MOD_QUIESCE:
+		if (!nvmf_supported_trtype(ops->trtype))
+			return (0);
+
+		sx_slock(&nvmf_transports_lock);
+		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
+			if (nt->nt_ops == ops)
+				break;
+		}
+		if (nt == NULL) {
+			sx_sunlock(&nvmf_transports_lock);
+			return (0);
+		}
+		if (nt->nt_active_qpairs != 0) {
+			sx_sunlock(&nvmf_transports_lock);
+			return (EBUSY);
+		}
+		sx_sunlock(&nvmf_transports_lock);
+		return (0);
+
+	case MOD_UNLOAD:
+		if (!nvmf_supported_trtype(ops->trtype))
+			return (0);
+
+		sx_xlock(&nvmf_transports_lock);
+		prev = NULL;
+		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
+			if (nt->nt_ops == ops)
+				break;
+			prev = nt;
+		}
+		if (nt == NULL) {
+			sx_xunlock(&nvmf_transports_lock);
+			return (0);
+		}
+
+		if (prev == NULL)
+			SLIST_REMOVE_HEAD(&nvmf_transports[ops->trtype],
+			    nt_link);
+		else
+			SLIST_REMOVE_AFTER(prev, nt_link);
+
+		error = 0;
+		while (nt->nt_active_qpairs != 0 && error == 0)
+			error = sx_sleep(nt, &nvmf_transports_lock, PCATCH,
+			    "nftunld", 0);
+		sx_xunlock(&nvmf_transports_lock);
+		if (error != 0)
+			return (error);
+		free(nt, M_NVMF_TRANSPORT);
+		return (0);
+
+	default:
+		return (EOPNOTSUPP);
+	}
+}
+
+static int
+nvmf_transport_modevent(module_t mod __unused, int what, void *arg __unused)
+{
+	switch (what) {
+	case MOD_LOAD:
+		for (u_int i = 0; i < nitems(nvmf_transports); i++)
+			SLIST_INIT(&nvmf_transports[i]);
+		sx_init(&nvmf_transports_lock, "nvmf transports");
+		return (0);
+	default:
+		return (EOPNOTSUPP);
+	}
+}
+
+static moduledata_t nvmf_transport_mod = {
+	"nvmf_transport",
+	nvmf_transport_modevent,
+	0
+};
+
+DECLARE_MODULE(nvmf_transport, nvmf_transport_mod, SI_SUB_DRIVERS,
+    SI_ORDER_FIRST);
+MODULE_VERSION(nvmf_transport, 1);
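(Review note: the matching controller-side sketch for the data-transfer contract documented in nvmf_transport.h. Again commentary rather than part of the patch; the example_* names and the buffer arguments are hypothetical, and memdesc_vaddr() is assumed as above.)

static void
example_write_data_done(void *arg, size_t xfered, int error)
{
	/* Build and transmit a response capsule for the WRITE here. */
}

/* WRITE: pull the command's data from the host into 'buf'. */
static int
example_handle_write(struct nvmf_capsule *nc, void *buf)
{
	struct memdesc mem;
	uint8_t status;
	size_t len;

	status = nvmf_validate_command_capsule(nc);
	if (status != NVME_SC_SUCCESS)
		return (EINVAL);	/* Real code would complete with 'status'. */

	len = nvmf_capsule_data_len(nc);
	mem = memdesc_vaddr(buf, len);
	return (nvmf_receive_controller_data(nc, 0, &mem, len,
	    example_write_data_done, NULL));
}

/* READ: push 'm' back to the host, then complete the command. */
static void
example_handle_read(struct nvmf_capsule *nc, struct mbuf *m, size_t len)
{
	u_int status;

	status = nvmf_send_controller_data(nc, 0, m, len);
	switch (status) {
	case NVMF_SUCCESS_SENT:
		/* Transport sent an implicit success CQE; no response capsule. */
		break;
	case NVMF_MORE:
		/* Buffer not yet complete; send the remaining ranges. */
		break;
	default:
		/* Send a response capsule carrying 'status'. */
		break;
	}
}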
diff --git a/sys/dev/nvmf/nvmf_transport_internal.h b/sys/dev/nvmf/nvmf_transport_internal.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/nvmf/nvmf_transport_internal.h
@@ -0,0 +1,128 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2022-2024 Chelsio Communications, Inc.
+ * Written by: John Baldwin
+ */
+
+#ifndef __NVMF_TRANSPORT_INTERNAL_H__
+#define	__NVMF_TRANSPORT_INTERNAL_H__
+
+#include <sys/memdesc.h>
+
+/*
+ * Interface between the transport-independent APIs in
+ * nvmf_transport.c and individual transports.
+ */
+
+struct module;
+struct nvmf_io_request;
+
+struct nvmf_transport_ops {
+	/* Queue pair management. */
+	struct nvmf_qpair *(*allocate_qpair)(bool controller,
+	    const struct nvmf_handoff_qpair_params *params);
+	void (*free_qpair)(struct nvmf_qpair *qp);
+
+	/* Capsule operations. */
+	struct nvmf_capsule *(*allocate_capsule)(struct nvmf_qpair *qp,
+	    int how);
+	void (*free_capsule)(struct nvmf_capsule *nc);
+	int (*transmit_capsule)(struct nvmf_capsule *nc);
+	uint8_t (*validate_command_capsule)(struct nvmf_capsule *nc);
+
+	/* Transferring controller data. */
+	size_t (*capsule_data_len)(const struct nvmf_capsule *nc);
+	int (*receive_controller_data)(struct nvmf_capsule *nc,
+	    uint32_t data_offset, struct nvmf_io_request *io);
+	u_int (*send_controller_data)(struct nvmf_capsule *nc,
+	    uint32_t data_offset, struct mbuf *m, size_t len);
+
+	enum nvmf_trtype trtype;
+	int priority;
+};
+
+/* Either an Admin or I/O Submission/Completion Queue pair. */
+struct nvmf_qpair {
+	struct nvmf_transport *nq_transport;
+	struct nvmf_transport_ops *nq_ops;
+	bool nq_controller;
+
+	/* Callback to invoke for a received capsule. */
+	nvmf_capsule_receive_t *nq_receive;
+	void *nq_receive_arg;
+
+	/* Callback to invoke for an error. */
+	nvmf_qpair_error_t *nq_error;
+	void *nq_error_arg;
+
+	bool nq_admin;
+};
+
+struct nvmf_io_request {
+	/*
+	 * Data buffer contains io_len bytes in the backing store
+	 * described by mem.
+	 */
+	struct memdesc io_mem;
+	size_t io_len;
+	nvmf_io_complete_t *io_complete;
+	void *io_complete_arg;
+};
+
+/*
+ * Fabrics Command and Response Capsules.  The Fabrics host
+ * (initiator) and controller (target) drivers work with capsules that
+ * are transmitted and received by a specific transport.
+ */
+struct nvmf_capsule {
+	struct nvmf_qpair *nc_qpair;
+
+	/* Either a SQE or CQE. */
+	union {
+		struct nvme_command nc_sqe;
+		struct nvme_completion nc_cqe;
+	};
+	int nc_qe_len;
+
+	/*
+	 * Is SQHD in received capsule valid?  False for locally-
+	 * synthesized responses.
+	 */
+	bool nc_sqhd_valid;
+
+	bool nc_send_data;
+	struct nvmf_io_request nc_data;
+};
+
+static __inline void
+nvmf_qpair_error(struct nvmf_qpair *nq, int error)
+{
+	nq->nq_error(nq->nq_error_arg, error);
+}
+
+static __inline void
+nvmf_capsule_received(struct nvmf_qpair *nq, struct nvmf_capsule *nc)
+{
+	nq->nq_receive(nq->nq_receive_arg, nc);
+}
+
+static __inline void
+nvmf_complete_io_request(struct nvmf_io_request *io, size_t xfered, int error)
+{
+	io->io_complete(io->io_complete_arg, xfered, error);
+}
+
+int	nvmf_transport_module_handler(struct module *, int, void *);
+
+#define	NVMF_TRANSPORT(name, ops)					\
+static moduledata_t nvmf_transport_##name##_mod = {			\
+	"nvmf/" #name,							\
+	nvmf_transport_module_handler,					\
+	&(ops)								\
+};									\
+DECLARE_MODULE(nvmf_transport_##name, nvmf_transport_##name##_mod,	\
+    SI_SUB_DRIVERS, SI_ORDER_ANY);					\
+MODULE_DEPEND(nvmf_transport_##name, nvmf_transport, 1, 1, 1)
+
+#endif /* !__NVMF_TRANSPORT_INTERNAL_H__ */
diff --git a/sys/modules/Makefile b/sys/modules/Makefile
--- a/sys/modules/Makefile
+++ b/sys/modules/Makefile
@@ -296,6 +296,7 @@
 	nvd \
 	${_nvdimm} \
 	nvme \
+	nvmf \
 	${_nvram} \
 	oce \
 	${_ocs_fc} \
diff --git a/sys/modules/nvmf/Makefile b/sys/modules/nvmf/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/nvmf/Makefile
@@ -0,0 +1,3 @@
+SUBDIR=	nvmf_transport
+
+.include <bsd.subdir.mk>
diff --git a/sys/modules/nvmf/nvmf_transport/Makefile b/sys/modules/nvmf/nvmf_transport/Makefile
new file mode 100644
--- /dev/null
+++ b/sys/modules/nvmf/nvmf_transport/Makefile
@@ -0,0 +1,9 @@
+.PATH: ${SRCTOP}/sys/dev/nvmf
+
+KMOD=	nvmf_transport
+
+SRCS=	nvmf_transport.c
+
+EXPORT_SYMS=	YES
+
+.include <bsd.kmod.mk>
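(Review note: finally, a sketch of how a transport is expected to plug into this framework. This is a skeletal ops table with hypothetical example_* hooks standing in for a real implementation, registered via the NVMF_TRANSPORT() macro from nvmf_transport_internal.h; the macro expands to a moduledata_t routed through nvmf_transport_module_handler() plus a MODULE_DEPEND on nvmf_transport.)

static struct nvmf_transport_ops example_ops = {
	.allocate_qpair = example_allocate_qpair,
	.free_qpair = example_free_qpair,
	.allocate_capsule = example_allocate_capsule,
	.free_capsule = example_free_capsule,
	.transmit_capsule = example_transmit_capsule,
	.validate_command_capsule = example_validate_command_capsule,
	.capsule_data_len = example_capsule_data_len,
	.receive_controller_data = example_receive_controller_data,
	.send_controller_data = example_send_controller_data,
	.trtype = NVMF_TRTYPE_TCP,
	.priority = 0,
};

NVMF_TRANSPORT(example, example_ops);

Since MOD_LOAD inserts each transport into its per-trtype list in descending .priority order, and nvmf_allocate_qpair() tries transports in list order, a higher-priority entry (e.g. a hardware-offloaded transport) is preferred over a software fallback for the same trtype.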