diff --git a/lib/libnvmf/Makefile b/lib/libnvmf/Makefile index dbba6b476510..b01f5ab82cac 100644 --- a/lib/libnvmf/Makefile +++ b/lib/libnvmf/Makefile @@ -1,22 +1,24 @@ .PATH: ${SRCTOP}/sys/dev/nvmf/controller .PATH: ${SRCTOP}/sys/libkern LIB= nvmf INTERNALLIB= PACKAGE= nvmf INCS= libnvmf.h SRCS= gsb_crc32.c \ nvmf_controller.c \ nvmf_host.c \ nvmf_tcp.c \ nvmf_transport.c \ nvmft_subr.c +LIBADD= nv + CFLAGS+= -I${SRCTOP}/sys/dev/nvmf/controller CFLAGS+= -I${SRCTOP}/sys/dev/nvmf .include CWARNFLAGS.gsb_crc32.c= -Wno-cast-align diff --git a/lib/libnvmf/internal.h b/lib/libnvmf/internal.h index cf45c15ba2f0..7b3d4fbb03ef 100644 --- a/lib/libnvmf/internal.h +++ b/lib/libnvmf/internal.h @@ -1,116 +1,116 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #ifndef __LIBNVMF_INTERNAL_H__ #define __LIBNVMF_INTERNAL_H__ +#include #include struct nvmf_transport_ops { /* Association management. */ struct nvmf_association *(*allocate_association)(bool controller, const struct nvmf_association_params *params); void (*update_association)(struct nvmf_association *na, const struct nvme_controller_data *cdata); void (*free_association)(struct nvmf_association *na); /* Queue pair management. */ struct nvmf_qpair *(*allocate_qpair)(struct nvmf_association *na, const struct nvmf_qpair_params *params); void (*free_qpair)(struct nvmf_qpair *qp); - /* Create params for kernel handoff. */ - int (*kernel_handoff_params)(struct nvmf_qpair *qp, - struct nvmf_handoff_qpair_params *qparams); + /* Add params for kernel handoff. */ + void (*kernel_handoff_params)(struct nvmf_qpair *qp, nvlist_t *nvl); /* Capsule operations. */ struct nvmf_capsule *(*allocate_capsule)(struct nvmf_qpair *qp); void (*free_capsule)(struct nvmf_capsule *nc); int (*transmit_capsule)(struct nvmf_capsule *nc); int (*receive_capsule)(struct nvmf_qpair *qp, struct nvmf_capsule **ncp); uint8_t (*validate_command_capsule)(const struct nvmf_capsule *nc); /* Transferring controller data. */ size_t (*capsule_data_len)(const struct nvmf_capsule *nc); int (*receive_controller_data)(const struct nvmf_capsule *nc, uint32_t data_offset, void *buf, size_t len); int (*send_controller_data)(const struct nvmf_capsule *nc, const void *buf, size_t len); }; struct nvmf_association { struct nvmf_transport_ops *na_ops; enum nvmf_trtype na_trtype; bool na_controller; struct nvmf_association_params na_params; /* Each qpair holds a reference on an association. */ u_int na_refs; char *na_last_error; }; struct nvmf_qpair { struct nvmf_association *nq_association; bool nq_admin; uint16_t nq_cid; /* host only */ /* * Queue sizes. This assumes the same size for both the * completion and submission queues within a pair. */ u_int nq_qsize; /* Flow control management for submission queues. */ bool nq_flow_control; uint16_t nq_sqhd; uint16_t nq_sqtail; /* host only */ /* Value in response to/from CONNECT. */ uint16_t nq_cntlid; uint32_t nq_kato; /* valid on admin queue only */ TAILQ_HEAD(, nvmf_capsule) nq_rx_capsules; }; struct nvmf_capsule { struct nvmf_qpair *nc_qpair; /* Either a SQE or CQE. */ union { struct nvme_command nc_sqe; struct nvme_completion nc_cqe; }; int nc_qe_len; /* * Is SQHD in received capsule valid? False for locally- * synthesized responses. */ bool nc_sqhd_valid; /* Data buffer. 
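 * nc_data and nc_data_len describe the single data segment attached
 * with nvmf_capsule_append_data(); nc_send_data is true when the local
 * side transmits that buffer and false when it is to be filled with
 * received data.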
*/ bool nc_send_data; void *nc_data; size_t nc_data_len; TAILQ_ENTRY(nvmf_capsule) nc_link; }; extern struct nvmf_transport_ops tcp_ops; void na_clear_error(struct nvmf_association *na); void na_error(struct nvmf_association *na, const char *fmt, ...); -int nvmf_kernel_handoff_params(struct nvmf_qpair *qp, - struct nvmf_handoff_qpair_params *qparams); +int nvmf_kernel_handoff_params(struct nvmf_qpair *qp, nvlist_t **nvlp); +int nvmf_pack_ioc_nvlist(struct nvmf_ioc_nv *nv, nvlist_t *nvl); #endif /* !__LIBNVMF_INTERNAL_H__ */ diff --git a/lib/libnvmf/libnvmf.h b/lib/libnvmf/libnvmf.h index f15277a02621..44f13fda5ddd 100644 --- a/lib/libnvmf/libnvmf.h +++ b/lib/libnvmf/libnvmf.h @@ -1,363 +1,366 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #ifndef __LIBNVMF_H__ #define __LIBNVMF_H__ +#include #include #include #include #include #include #include struct nvmf_capsule; struct nvmf_association; struct nvmf_qpair; /* * Parameters shared by all queue-pairs of an association. Note that * this contains the requested values used to initiate transport * negotiation. */ struct nvmf_association_params { bool sq_flow_control; /* SQ flow control required. */ bool dynamic_controller_model; /* Controller only */ uint16_t max_admin_qsize; /* Controller only */ uint32_t max_io_qsize; /* Controller only, 0 for discovery */ union { struct { uint8_t pda; /* Tx-side PDA. */ bool header_digests; bool data_digests; uint32_t maxr2t; /* Host only */ uint32_t maxh2cdata; /* Controller only */ } tcp; }; }; /* Parameters specific to a single queue pair of an association. */ struct nvmf_qpair_params { bool admin; /* Host only */ union { struct { int fd; } tcp; }; }; /* Transport-independent APIs. */ /* * A host should allocate a new association for each association with * a controller. After the admin queue has been allocated and the * controller's data has been fetched, it should be passed to * nvmf_update_association to update internal transport-specific * parameters before allocating I/O queues. * * A controller uses a single association to manage all incoming * queues since it is not known until after parsing the CONNECT * command which transport queues are admin vs I/O and which * controller they are created against. */ struct nvmf_association *nvmf_allocate_association(enum nvmf_trtype trtype, bool controller, const struct nvmf_association_params *params); void nvmf_update_assocation(struct nvmf_association *na, const struct nvme_controller_data *cdata); void nvmf_free_association(struct nvmf_association *na); /* The most recent association-wide error message. */ const char *nvmf_association_error(const struct nvmf_association *na); /* * A queue pair represents either an Admin or I/O * submission/completion queue pair. * * Each open qpair holds a reference on its association. Once queue * pairs are allocated, callers can safely free the association to * ease bookkeeping. * * If nvmf_allocate_qpair fails, a detailed error message can be obtained * from nvmf_association_error. */ struct nvmf_qpair *nvmf_allocate_qpair(struct nvmf_association *na, const struct nvmf_qpair_params *params); void nvmf_free_qpair(struct nvmf_qpair *qp); /* * Capsules are either commands (host -> controller) or responses * (controller -> host). A single data buffer segment may be * associated with a command capsule. Transmitted data is not copied * by this API but instead must be preserved until the capsule is * transmitted and freed. 
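 *
 * A minimal host-side sketch of the capsule lifecycle (illustrative
 * only: error handling is omitted, and "qp" and "cdata" are assumed to
 * be an existing admin queue pair and a struct nvme_controller_data):
 *
 *	struct nvme_command sqe;
 *	struct nvmf_capsule *cc, *rc;
 *
 *	memset(&sqe, 0, sizeof(sqe));
 *	sqe.opc = NVME_OPC_IDENTIFY;
 *	sqe.cdw10 = htole32(1);			(CNS 0x01: controller data)
 *	cc = nvmf_allocate_command(qp, &sqe);
 *	nvmf_capsule_append_data(cc, &cdata, sizeof(cdata), false);
 *	nvmf_host_transmit_command(cc);
 *	nvmf_host_wait_for_response(cc, &rc);
 *	... check the status in nvmf_capsule_cqe(rc) ...
 *	nvmf_free_capsule(rc);
 *	nvmf_free_capsule(cc);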
*/ struct nvmf_capsule *nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe); struct nvmf_capsule *nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe); void nvmf_free_capsule(struct nvmf_capsule *nc); int nvmf_capsule_append_data(struct nvmf_capsule *nc, void *buf, size_t len, bool send); int nvmf_transmit_capsule(struct nvmf_capsule *nc); int nvmf_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp); const void *nvmf_capsule_sqe(const struct nvmf_capsule *nc); const void *nvmf_capsule_cqe(const struct nvmf_capsule *nc); /* Return a string name for a transport type. */ const char *nvmf_transport_type(uint8_t trtype); /* Validate a NVMe Qualified Name. */ bool nvmf_nqn_valid(const char *nqn); /* Controller-specific APIs. */ /* * A controller calls this function to check for any * transport-specific errors (invalid fields) in a received command * capsule. The callback returns a generic command status value: * NVME_SC_SUCCESS if no error is found. */ uint8_t nvmf_validate_command_capsule(const struct nvmf_capsule *nc); /* * A controller calls this function to query the amount of data * associated with a command capsule. */ size_t nvmf_capsule_data_len(const struct nvmf_capsule *cc); /* * A controller calls this function to receive data associated with a * command capsule (e.g. the data for a WRITE command). This can * either return in-capsule data or fetch data from the host * (e.g. using a R2T PDU over TCP). The received command capsule * should be passed in 'nc'. The received data is stored in '*buf'. */ int nvmf_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset, void *buf, size_t len); /* * A controller calls this function to send data in response to a * command along with a response capsule. If the data transfer * succeeds, a success response is sent. If the data transfer fails, * an appropriate error status capsule is sent. Regardless, a * response capsule is always sent. */ int nvmf_send_controller_data(const struct nvmf_capsule *nc, const void *buf, size_t len); /* * Construct a CQE for a reply to a command capsule in 'nc' with the * completion status 'status'. This is useful when additional CQE * info is required beyond the completion status. */ void nvmf_init_cqe(void *cqe, const struct nvmf_capsule *nc, uint16_t status); /* * Construct and send a response capsule to a command capsule with * the supplied CQE. */ int nvmf_send_response(const struct nvmf_capsule *nc, const void *cqe); /* * Wait for a single command capsule and return it in *ncp. This can * fail if an invalid capsule is received or an I/O error occurs. */ int nvmf_controller_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp); /* Send a response capsule from a controller. */ int nvmf_controller_transmit_response(struct nvmf_capsule *nc); /* Construct and send an error response capsule. */ int nvmf_send_error(const struct nvmf_capsule *cc, uint8_t sc_type, uint8_t sc_status); /* * Construct and send an error response capsule using a generic status * code. */ int nvmf_send_generic_error(const struct nvmf_capsule *nc, uint8_t sc_status); /* Construct and send a simple success response capsule. */ int nvmf_send_success(const struct nvmf_capsule *nc); /* * Allocate a new queue pair and wait for the CONNECT command capsule. * If this fails, a detailed error message can be obtained from * nvmf_association_error. On success, the command capsule is saved * in '*ccp' and the connect data is saved in 'data'. 
The caller * must send an explicit response and free the the command capsule. */ struct nvmf_qpair *nvmf_accept(struct nvmf_association *na, const struct nvmf_qpair_params *params, struct nvmf_capsule **ccp, struct nvmf_fabric_connect_data *data); /* * Construct and send a response capsule with the Fabrics CONNECT * invalid parameters error status. If data is true the offset is * relative to the CONNECT data structure, otherwise the offset is * relative to the SQE. */ void nvmf_connect_invalid_parameters(const struct nvmf_capsule *cc, bool data, uint16_t offset); /* Construct and send a response capsule for a successful CONNECT. */ int nvmf_finish_accept(const struct nvmf_capsule *cc, uint16_t cntlid); /* Compute the initial state of CAP for a controller. */ uint64_t nvmf_controller_cap(struct nvmf_qpair *qp); /* Generate a serial number string from a host ID. */ void nvmf_controller_serial(char *buf, size_t len, u_long hostid); /* * Populate an Identify Controller data structure for a Discovery * controller. */ void nvmf_init_discovery_controller_data(struct nvmf_qpair *qp, struct nvme_controller_data *cdata); /* * Populate an Identify Controller data structure for an I/O * controller. */ void nvmf_init_io_controller_data(struct nvmf_qpair *qp, const char *serial, const char *subnqn, int nn, uint32_t ioccsz, struct nvme_controller_data *cdata); /* * Validate if a new value for CC is legal given the existing values of * CAP and CC. */ bool nvmf_validate_cc(struct nvmf_qpair *qp, uint64_t cap, uint32_t old_cc, uint32_t new_cc); /* Return the log page id (LID) of a GET_LOG_PAGE command. */ uint8_t nvmf_get_log_page_id(const struct nvme_command *cmd); /* Return the requested data length of a GET_LOG_PAGE command. */ uint64_t nvmf_get_log_page_length(const struct nvme_command *cmd); /* Return the requested data offset of a GET_LOG_PAGE command. */ uint64_t nvmf_get_log_page_offset(const struct nvme_command *cmd); /* Prepare to handoff a controller qpair. */ int nvmf_handoff_controller_qpair(struct nvmf_qpair *qp, - struct nvmf_handoff_controller_qpair *h); + const struct nvmf_fabric_connect_cmd *cmd, + const struct nvmf_fabric_connect_data *data, struct nvmf_ioc_nv *nv); /* Host-specific APIs. */ /* * Connect to an admin or I/O queue. If this fails, a detailed error * message can be obtained from nvmf_association_error. */ struct nvmf_qpair *nvmf_connect(struct nvmf_association *na, const struct nvmf_qpair_params *params, uint16_t qid, u_int queue_size, const uint8_t hostid[16], uint16_t cntlid, const char *subnqn, const char *hostnqn, uint32_t kato); /* Return the CNTLID for a queue returned from CONNECT. */ uint16_t nvmf_cntlid(struct nvmf_qpair *qp); /* * Send a command to the controller. This can fail with EBUSY if the * submission queue is full. */ int nvmf_host_transmit_command(struct nvmf_capsule *nc); /* * Wait for a response to a command. If there are no outstanding * commands in the SQ, fails with EWOULDBLOCK. */ int nvmf_host_receive_response(struct nvmf_qpair *qp, struct nvmf_capsule **rcp); /* * Wait for a response to a specific command. The command must have been * succesfully sent previously. */ int nvmf_host_wait_for_response(struct nvmf_capsule *cc, struct nvmf_capsule **rcp); /* Build a KeepAlive command. */ struct nvmf_capsule *nvmf_keepalive(struct nvmf_qpair *qp); /* Read a controller property. */ int nvmf_read_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, uint64_t *value); /* Write a controller property. 
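 * For example, enabling a controller is a read-modify-write of the CC
 * property (offset 0x14, 4 bytes wide): read it with
 * nvmf_read_property(qp, 0x14, 4, &cc), set CC.EN, and write the new
 * value back with nvmf_write_property(qp, 0x14, 4, cc).  (A sketch
 * only; the offset is the standard NVMe CC register offset.)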
*/ int nvmf_write_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, uint64_t value); /* Construct a 16-byte HostId from kern.hostuuid. */ int nvmf_hostid_from_hostuuid(uint8_t hostid[16]); /* Construct a NQN from kern.hostuuid. */ int nvmf_nqn_from_hostuuid(char nqn[NVMF_NQN_MAX_LEN]); /* Fetch controller data via IDENTIFY. */ int nvmf_host_identify_controller(struct nvmf_qpair *qp, struct nvme_controller_data *data); /* Fetch namespace data via IDENTIFY. */ int nvmf_host_identify_namespace(struct nvmf_qpair *qp, uint32_t nsid, struct nvme_namespace_data *nsdata); /* * Fetch discovery log page. The memory for the log page is allocated * by malloc() and returned in *logp. The caller must free the * memory. */ int nvmf_host_fetch_discovery_log_page(struct nvmf_qpair *qp, struct nvme_discovery_log **logp); /* * Request a desired number of I/O queues via SET_FEATURES. The * number of actual I/O queues available is returned in *actual on * success. */ int nvmf_host_request_queues(struct nvmf_qpair *qp, u_int requested, u_int *actual); /* * Handoff active host association to the kernel. This frees the * qpairs (even on error). */ int nvmf_handoff_host(struct nvmf_qpair *admin_qp, u_int num_queues, struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata); /* * Disconnect an active host association previously handed off to the * kernel. *name is either the name of the device (nvmeX) for this * association or the remote subsystem NQN. */ int nvmf_disconnect_host(const char *host); /* * Disconnect all active host associations previously handed off to * the kernel. */ int nvmf_disconnect_all(void); /* * Fetch reconnect parameters from an existing kernel host to use for - * establishing a new association. + * establishing a new association. The caller must destroy the + * returned nvlist. */ -int nvmf_reconnect_params(int fd, struct nvmf_reconnect_params *rparams); +int nvmf_reconnect_params(int fd, nvlist_t **nvlp); /* * Handoff active host association to an existing host in the kernel. * This frees the qpairs (even on error). */ int nvmf_reconnect_host(int fd, struct nvmf_qpair *admin_qp, u_int num_queues, struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata); #endif /* !__LIBNVMF_H__ */ diff --git a/lib/libnvmf/nvmf_controller.c b/lib/libnvmf/nvmf_controller.c index 0e0126040ee4..971dccbe039e 100644 --- a/lib/libnvmf/nvmf_controller.c +++ b/lib/libnvmf/nvmf_controller.c @@ -1,464 +1,479 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2024 Chelsio Communications, Inc. 
* Written by: John Baldwin */ #include #include #include #include #include #include "libnvmf.h" #include "internal.h" #include "nvmft_subr.h" void nvmf_init_cqe(void *cqe, const struct nvmf_capsule *nc, uint16_t status) { struct nvme_completion *cpl = cqe; const struct nvme_command *cmd = nvmf_capsule_sqe(nc); memset(cpl, 0, sizeof(*cpl)); cpl->cid = cmd->cid; cpl->status = htole16(status); } static struct nvmf_capsule * nvmf_simple_response(const struct nvmf_capsule *nc, uint8_t sc_type, uint8_t sc_status) { struct nvme_completion cpl; uint16_t status; status = NVMEF(NVME_STATUS_SCT, sc_type) | NVMEF(NVME_STATUS_SC, sc_status); nvmf_init_cqe(&cpl, nc, status); return (nvmf_allocate_response(nc->nc_qpair, &cpl)); } int nvmf_controller_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp) { struct nvmf_capsule *nc; int error; uint8_t sc_status; *ncp = NULL; error = nvmf_receive_capsule(qp, &nc); if (error != 0) return (error); sc_status = nvmf_validate_command_capsule(nc); if (sc_status != NVME_SC_SUCCESS) { nvmf_send_generic_error(nc, sc_status); nvmf_free_capsule(nc); return (EPROTO); } *ncp = nc; return (0); } int nvmf_controller_transmit_response(struct nvmf_capsule *nc) { struct nvmf_qpair *qp = nc->nc_qpair; /* Set SQHD. */ if (qp->nq_flow_control) { qp->nq_sqhd = (qp->nq_sqhd + 1) % qp->nq_qsize; nc->nc_cqe.sqhd = htole16(qp->nq_sqhd); } else nc->nc_cqe.sqhd = 0; return (nvmf_transmit_capsule(nc)); } int nvmf_send_response(const struct nvmf_capsule *cc, const void *cqe) { struct nvmf_capsule *rc; int error; rc = nvmf_allocate_response(cc->nc_qpair, cqe); if (rc == NULL) return (ENOMEM); error = nvmf_controller_transmit_response(rc); nvmf_free_capsule(rc); return (error); } int nvmf_send_error(const struct nvmf_capsule *cc, uint8_t sc_type, uint8_t sc_status) { struct nvmf_capsule *rc; int error; rc = nvmf_simple_response(cc, sc_type, sc_status); error = nvmf_controller_transmit_response(rc); nvmf_free_capsule(rc); return (error); } int nvmf_send_generic_error(const struct nvmf_capsule *nc, uint8_t sc_status) { return (nvmf_send_error(nc, NVME_SCT_GENERIC, sc_status)); } int nvmf_send_success(const struct nvmf_capsule *nc) { return (nvmf_send_generic_error(nc, NVME_SC_SUCCESS)); } void nvmf_connect_invalid_parameters(const struct nvmf_capsule *cc, bool data, uint16_t offset) { struct nvmf_fabric_connect_rsp rsp; struct nvmf_capsule *rc; nvmf_init_cqe(&rsp, cc, NVMEF(NVME_STATUS_SCT, NVME_SCT_COMMAND_SPECIFIC) | NVMEF(NVME_STATUS_SC, NVMF_FABRIC_SC_INVALID_PARAM)); rsp.status_code_specific.invalid.ipo = htole16(offset); rsp.status_code_specific.invalid.iattr = data ? 1 : 0; rc = nvmf_allocate_response(cc->nc_qpair, &rsp); nvmf_transmit_capsule(rc); nvmf_free_capsule(rc); } struct nvmf_qpair * nvmf_accept(struct nvmf_association *na, const struct nvmf_qpair_params *params, struct nvmf_capsule **ccp, struct nvmf_fabric_connect_data *data) { static const char hostid_zero[sizeof(data->hostid)]; const struct nvmf_fabric_connect_cmd *cmd; struct nvmf_qpair *qp; struct nvmf_capsule *cc, *rc; u_int qsize; int error; uint16_t cntlid; uint8_t sc_status; qp = NULL; cc = NULL; rc = NULL; *ccp = NULL; na_clear_error(na); if (!na->na_controller) { na_error(na, "Cannot accept on a host"); goto error; } qp = nvmf_allocate_qpair(na, params); if (qp == NULL) goto error; /* Read the CONNECT capsule. 
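 * The first capsule received on a new queue pair must be a Fabrics
 * CONNECT command; the checks below validate the opcode, record
 * format, queue size, and CONNECT data before the queue pair is
 * returned to the caller.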
*/ error = nvmf_receive_capsule(qp, &cc); if (error != 0) { na_error(na, "Failed to receive CONNECT: %s", strerror(error)); goto error; } sc_status = nvmf_validate_command_capsule(cc); if (sc_status != 0) { na_error(na, "CONNECT command failed to validate: %u", sc_status); rc = nvmf_simple_response(cc, NVME_SCT_GENERIC, sc_status); goto error; } cmd = nvmf_capsule_sqe(cc); if (cmd->opcode != NVME_OPC_FABRICS_COMMANDS || cmd->fctype != NVMF_FABRIC_COMMAND_CONNECT) { na_error(na, "Invalid opcode in CONNECT (%u,%u)", cmd->opcode, cmd->fctype); rc = nvmf_simple_response(cc, NVME_SCT_GENERIC, NVME_SC_INVALID_OPCODE); goto error; } if (cmd->recfmt != htole16(0)) { na_error(na, "Unsupported CONNECT record format %u", le16toh(cmd->recfmt)); rc = nvmf_simple_response(cc, NVME_SCT_COMMAND_SPECIFIC, NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT); goto error; } qsize = le16toh(cmd->sqsize) + 1; if (cmd->qid == 0) { /* Admin queue limits. */ if (qsize < NVME_MIN_ADMIN_ENTRIES || qsize > NVME_MAX_ADMIN_ENTRIES || qsize > na->na_params.max_admin_qsize) { na_error(na, "Invalid queue size %u", qsize); nvmf_connect_invalid_parameters(cc, false, offsetof(struct nvmf_fabric_connect_cmd, sqsize)); goto error; } qp->nq_admin = true; } else { /* I/O queues not allowed for discovery. */ if (na->na_params.max_io_qsize == 0) { na_error(na, "I/O queue on discovery controller"); nvmf_connect_invalid_parameters(cc, false, offsetof(struct nvmf_fabric_connect_cmd, qid)); goto error; } /* I/O queue limits. */ if (qsize < NVME_MIN_IO_ENTRIES || qsize > NVME_MAX_IO_ENTRIES || qsize > na->na_params.max_io_qsize) { na_error(na, "Invalid queue size %u", qsize); nvmf_connect_invalid_parameters(cc, false, offsetof(struct nvmf_fabric_connect_cmd, sqsize)); goto error; } /* KATO is reserved for I/O queues. */ if (cmd->kato != 0) { na_error(na, "KeepAlive timeout specified for I/O queue"); nvmf_connect_invalid_parameters(cc, false, offsetof(struct nvmf_fabric_connect_cmd, kato)); goto error; } qp->nq_admin = false; } qp->nq_qsize = qsize; /* Fetch CONNECT data. */ if (nvmf_capsule_data_len(cc) != sizeof(*data)) { na_error(na, "Invalid data payload length for CONNECT: %zu", nvmf_capsule_data_len(cc)); nvmf_connect_invalid_parameters(cc, false, offsetof(struct nvmf_fabric_connect_cmd, sgl1)); goto error; } error = nvmf_receive_controller_data(cc, 0, data, sizeof(*data)); if (error != 0) { na_error(na, "Failed to read data for CONNECT: %s", strerror(error)); rc = nvmf_simple_response(cc, NVME_SCT_GENERIC, NVME_SC_DATA_TRANSFER_ERROR); goto error; } /* The hostid must be non-zero. */ if (memcmp(data->hostid, hostid_zero, sizeof(hostid_zero)) == 0) { na_error(na, "HostID in CONNECT data is zero"); nvmf_connect_invalid_parameters(cc, true, offsetof(struct nvmf_fabric_connect_data, hostid)); goto error; } cntlid = le16toh(data->cntlid); if (cmd->qid == 0) { if (na->na_params.dynamic_controller_model) { if (cntlid != NVMF_CNTLID_DYNAMIC) { na_error(na, "Invalid controller ID %#x", cntlid); nvmf_connect_invalid_parameters(cc, true, offsetof(struct nvmf_fabric_connect_data, cntlid)); goto error; } } else { if (cntlid > NVMF_CNTLID_STATIC_MAX && cntlid != NVMF_CNTLID_STATIC_ANY) { na_error(na, "Invalid controller ID %#x", cntlid); nvmf_connect_invalid_parameters(cc, true, offsetof(struct nvmf_fabric_connect_data, cntlid)); goto error; } } } else { /* Wildcard Controller IDs are only valid on an Admin queue. 
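 * On an I/O queue the host must supply the concrete controller ID it
 * was assigned by the admin CONNECT, so the wildcard values are
 * rejected here.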
*/ if (cntlid > NVMF_CNTLID_STATIC_MAX) { na_error(na, "Invalid controller ID %#x", cntlid); nvmf_connect_invalid_parameters(cc, true, offsetof(struct nvmf_fabric_connect_data, cntlid)); goto error; } } /* Simple validation of each NQN. */ if (!nvmf_nqn_valid(data->subnqn)) { na_error(na, "Invalid SubNQN %.*s", (int)sizeof(data->subnqn), data->subnqn); nvmf_connect_invalid_parameters(cc, true, offsetof(struct nvmf_fabric_connect_data, subnqn)); goto error; } if (!nvmf_nqn_valid(data->hostnqn)) { na_error(na, "Invalid HostNQN %.*s", (int)sizeof(data->hostnqn), data->hostnqn); nvmf_connect_invalid_parameters(cc, true, offsetof(struct nvmf_fabric_connect_data, hostnqn)); goto error; } if (na->na_params.sq_flow_control || (cmd->cattr & NVMF_CONNECT_ATTR_DISABLE_SQ_FC) == 0) qp->nq_flow_control = true; else qp->nq_flow_control = false; qp->nq_sqhd = 0; qp->nq_kato = le32toh(cmd->kato); *ccp = cc; return (qp); error: if (rc != NULL) { nvmf_transmit_capsule(rc); nvmf_free_capsule(rc); } if (cc != NULL) nvmf_free_capsule(cc); if (qp != NULL) nvmf_free_qpair(qp); return (NULL); } int nvmf_finish_accept(const struct nvmf_capsule *cc, uint16_t cntlid) { struct nvmf_fabric_connect_rsp rsp; struct nvmf_qpair *qp = cc->nc_qpair; struct nvmf_capsule *rc; int error; nvmf_init_cqe(&rsp, cc, 0); if (qp->nq_flow_control) rsp.sqhd = htole16(qp->nq_sqhd); else rsp.sqhd = htole16(0xffff); rsp.status_code_specific.success.cntlid = htole16(cntlid); rc = nvmf_allocate_response(qp, &rsp); if (rc == NULL) return (ENOMEM); error = nvmf_transmit_capsule(rc); nvmf_free_capsule(rc); if (error == 0) qp->nq_cntlid = cntlid; return (error); } uint64_t nvmf_controller_cap(struct nvmf_qpair *qp) { const struct nvmf_association *na = qp->nq_association; return (_nvmf_controller_cap(na->na_params.max_io_qsize, NVMF_CC_EN_TIMEOUT)); } bool nvmf_validate_cc(struct nvmf_qpair *qp, uint64_t cap, uint32_t old_cc, uint32_t new_cc) { const struct nvmf_association *na = qp->nq_association; return (_nvmf_validate_cc(na->na_params.max_io_qsize, cap, old_cc, new_cc)); } void nvmf_init_discovery_controller_data(struct nvmf_qpair *qp, struct nvme_controller_data *cdata) { const struct nvmf_association *na = qp->nq_association; struct utsname utsname; char *cp; memset(cdata, 0, sizeof(*cdata)); /* * 5.2 Figure 37 states model name and serial are reserved, * but Linux includes them. Don't bother with serial, but * do set model name. */ uname(&utsname); nvmf_strpad(cdata->mn, utsname.sysname, sizeof(cdata->mn)); nvmf_strpad(cdata->fr, utsname.release, sizeof(cdata->fr)); cp = memchr(cdata->fr, '-', sizeof(cdata->fr)); if (cp != NULL) memset(cp, ' ', sizeof(cdata->fr) - (cp - (char *)cdata->fr)); cdata->ctrlr_id = htole16(qp->nq_cntlid); cdata->ver = htole32(NVME_REV(1, 4)); cdata->cntrltype = 2; cdata->lpa = NVMEF(NVME_CTRLR_DATA_LPA_EXT_DATA, 1); cdata->elpe = 0; cdata->maxcmd = htole16(na->na_params.max_admin_qsize); /* Transport-specific? 
*/ cdata->sgls = htole32( NVMEF(NVME_CTRLR_DATA_SGLS_TRANSPORT_DATA_BLOCK, 1) | NVMEF(NVME_CTRLR_DATA_SGLS_ADDRESS_AS_OFFSET, 1) | NVMEF(NVME_CTRLR_DATA_SGLS_NVM_COMMAND_SET, 1)); strlcpy(cdata->subnqn, NVMF_DISCOVERY_NQN, sizeof(cdata->subnqn)); } void nvmf_init_io_controller_data(struct nvmf_qpair *qp, const char *serial, const char *subnqn, int nn, uint32_t ioccsz, struct nvme_controller_data *cdata) { const struct nvmf_association *na = qp->nq_association; struct utsname utsname; uname(&utsname); memset(cdata, 0, sizeof(*cdata)); _nvmf_init_io_controller_data(qp->nq_cntlid, na->na_params.max_io_qsize, serial, utsname.sysname, utsname.release, subnqn, nn, ioccsz, sizeof(struct nvme_completion), cdata); } uint8_t nvmf_get_log_page_id(const struct nvme_command *cmd) { assert(cmd->opc == NVME_OPC_GET_LOG_PAGE); return (le32toh(cmd->cdw10) & 0xff); } uint64_t nvmf_get_log_page_length(const struct nvme_command *cmd) { uint32_t numd; assert(cmd->opc == NVME_OPC_GET_LOG_PAGE); numd = le32toh(cmd->cdw10) >> 16 | (le32toh(cmd->cdw11) & 0xffff) << 16; return ((numd + 1) * 4); } uint64_t nvmf_get_log_page_offset(const struct nvme_command *cmd) { assert(cmd->opc == NVME_OPC_GET_LOG_PAGE); return (le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32); } int nvmf_handoff_controller_qpair(struct nvmf_qpair *qp, - struct nvmf_handoff_controller_qpair *h) + const struct nvmf_fabric_connect_cmd *cmd, + const struct nvmf_fabric_connect_data *data, struct nvmf_ioc_nv *nv) { - h->trtype = qp->nq_association->na_trtype; - return (nvmf_kernel_handoff_params(qp, &h->params)); + nvlist_t *nvl, *nvl_qp; + int error; + + error = nvmf_kernel_handoff_params(qp, &nvl_qp); + if (error) + return (error); + + nvl = nvlist_create(0); + nvlist_add_number(nvl, "trtype", qp->nq_association->na_trtype); + nvlist_move_nvlist(nvl, "params", nvl_qp); + nvlist_add_binary(nvl, "cmd", cmd, sizeof(*cmd)); + nvlist_add_binary(nvl, "data", data, sizeof(*data)); + + error = nvmf_pack_ioc_nvlist(nv, nvl); + nvlist_destroy(nvl); + return (error); } diff --git a/lib/libnvmf/nvmf_host.c b/lib/libnvmf/nvmf_host.c index a0d95470d8ee..c3668600c463 100644 --- a/lib/libnvmf/nvmf_host.c +++ b/lib/libnvmf/nvmf_host.c @@ -1,915 +1,948 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2024 Chelsio Communications, Inc. 
* Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include "libnvmf.h" #include "internal.h" static void nvmf_init_sqe(void *sqe, uint8_t opcode) { struct nvme_command *cmd = sqe; memset(cmd, 0, sizeof(*cmd)); cmd->opc = opcode; } static void nvmf_init_fabrics_sqe(void *sqe, uint8_t fctype) { struct nvmf_capsule_cmd *cmd = sqe; nvmf_init_sqe(sqe, NVME_OPC_FABRICS_COMMANDS); cmd->fctype = fctype; } struct nvmf_qpair * nvmf_connect(struct nvmf_association *na, const struct nvmf_qpair_params *params, uint16_t qid, u_int queue_size, const uint8_t hostid[16], uint16_t cntlid, const char *subnqn, const char *hostnqn, uint32_t kato) { struct nvmf_fabric_connect_cmd cmd; struct nvmf_fabric_connect_data data; const struct nvmf_fabric_connect_rsp *rsp; struct nvmf_qpair *qp; struct nvmf_capsule *cc, *rc; int error; uint16_t sqhd, status; qp = NULL; cc = NULL; rc = NULL; na_clear_error(na); if (na->na_controller) { na_error(na, "Cannot connect on a controller"); goto error; } if (params->admin != (qid == 0)) { na_error(na, "Admin queue must use Queue ID 0"); goto error; } if (qid == 0) { if (queue_size < NVME_MIN_ADMIN_ENTRIES || queue_size > NVME_MAX_ADMIN_ENTRIES) { na_error(na, "Invalid queue size %u", queue_size); goto error; } } else { if (queue_size < NVME_MIN_IO_ENTRIES || queue_size > NVME_MAX_IO_ENTRIES) { na_error(na, "Invalid queue size %u", queue_size); goto error; } /* KATO is only for Admin queues. */ if (kato != 0) { na_error(na, "Cannot set KATO on I/O queues"); goto error; } } qp = nvmf_allocate_qpair(na, params); if (qp == NULL) goto error; nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_CONNECT); cmd.recfmt = 0; cmd.qid = htole16(qid); /* N.B. sqsize is 0's based. */ cmd.sqsize = htole16(queue_size - 1); if (!na->na_params.sq_flow_control) cmd.cattr |= NVMF_CONNECT_ATTR_DISABLE_SQ_FC; cmd.kato = htole32(kato); cc = nvmf_allocate_command(qp, &cmd); if (cc == NULL) { na_error(na, "Failed to allocate command capsule: %s", strerror(errno)); goto error; } memset(&data, 0, sizeof(data)); memcpy(data.hostid, hostid, sizeof(data.hostid)); data.cntlid = htole16(cntlid); strlcpy(data.subnqn, subnqn, sizeof(data.subnqn)); strlcpy(data.hostnqn, hostnqn, sizeof(data.hostnqn)); error = nvmf_capsule_append_data(cc, &data, sizeof(data), true); if (error != 0) { na_error(na, "Failed to append data to CONNECT capsule: %s", strerror(error)); goto error; } error = nvmf_transmit_capsule(cc); if (error != 0) { na_error(na, "Failed to transmit CONNECT capsule: %s", strerror(errno)); goto error; } error = nvmf_receive_capsule(qp, &rc); if (error != 0) { na_error(na, "Failed to receive CONNECT response: %s", strerror(error)); goto error; } rsp = (const struct nvmf_fabric_connect_rsp *)&rc->nc_cqe; status = le16toh(rc->nc_cqe.status); if (status != 0) { if (NVME_STATUS_GET_SC(status) == NVMF_FABRIC_SC_INVALID_PARAM) na_error(na, "CONNECT invalid parameter IATTR: %#x IPO: %#x", rsp->status_code_specific.invalid.iattr, rsp->status_code_specific.invalid.ipo); else na_error(na, "CONNECT failed, status %#x", status); goto error; } if (rc->nc_cqe.cid != cmd.cid) { na_error(na, "Mismatched CID in CONNECT response"); goto error; } if (!rc->nc_sqhd_valid) { na_error(na, "CONNECT response without valid SQHD"); goto error; } sqhd = le16toh(rsp->sqhd); if (sqhd == 0xffff) { if (na->na_params.sq_flow_control) { na_error(na, "Controller disabled SQ flow control"); goto error; } qp->nq_flow_control = false; } else { qp->nq_flow_control = true; qp->nq_sqhd = sqhd; 
qp->nq_sqtail = sqhd; } if (rsp->status_code_specific.success.authreq) { na_error(na, "CONNECT response requests authentication\n"); goto error; } qp->nq_qsize = queue_size; qp->nq_cntlid = le16toh(rsp->status_code_specific.success.cntlid); qp->nq_kato = kato; /* XXX: Save qid in qp? */ return (qp); error: if (rc != NULL) nvmf_free_capsule(rc); if (cc != NULL) nvmf_free_capsule(cc); if (qp != NULL) nvmf_free_qpair(qp); return (NULL); } uint16_t nvmf_cntlid(struct nvmf_qpair *qp) { return (qp->nq_cntlid); } int nvmf_host_transmit_command(struct nvmf_capsule *nc) { struct nvmf_qpair *qp = nc->nc_qpair; uint16_t new_sqtail; int error; /* Fail if the queue is full. */ new_sqtail = (qp->nq_sqtail + 1) % qp->nq_qsize; if (new_sqtail == qp->nq_sqhd) return (EBUSY); nc->nc_sqe.cid = htole16(qp->nq_cid); /* 4.2 Skip CID of 0xFFFF. */ qp->nq_cid++; if (qp->nq_cid == 0xFFFF) qp->nq_cid = 0; error = nvmf_transmit_capsule(nc); if (error != 0) return (error); qp->nq_sqtail = new_sqtail; return (0); } /* Receive a single capsule and update SQ FC accounting. */ static int nvmf_host_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp) { struct nvmf_capsule *nc; int error; /* If the SQ is empty, there is no response to wait for. */ if (qp->nq_sqhd == qp->nq_sqtail) return (EWOULDBLOCK); error = nvmf_receive_capsule(qp, &nc); if (error != 0) return (error); if (qp->nq_flow_control) { if (nc->nc_sqhd_valid) qp->nq_sqhd = le16toh(nc->nc_cqe.sqhd); } else { /* * If SQ FC is disabled, just advance the head for * each response capsule received so that we track the * number of outstanding commands. */ qp->nq_sqhd = (qp->nq_sqhd + 1) % qp->nq_qsize; } *ncp = nc; return (0); } int nvmf_host_receive_response(struct nvmf_qpair *qp, struct nvmf_capsule **ncp) { struct nvmf_capsule *nc; /* Return the oldest previously received response. */ if (!TAILQ_EMPTY(&qp->nq_rx_capsules)) { nc = TAILQ_FIRST(&qp->nq_rx_capsules); TAILQ_REMOVE(&qp->nq_rx_capsules, nc, nc_link); *ncp = nc; return (0); } return (nvmf_host_receive_capsule(qp, ncp)); } int nvmf_host_wait_for_response(struct nvmf_capsule *cc, struct nvmf_capsule **rcp) { struct nvmf_qpair *qp = cc->nc_qpair; struct nvmf_capsule *rc; int error; /* Check if a response was already received. */ TAILQ_FOREACH(rc, &qp->nq_rx_capsules, nc_link) { if (rc->nc_cqe.cid == cc->nc_sqe.cid) { TAILQ_REMOVE(&qp->nq_rx_capsules, rc, nc_link); *rcp = rc; return (0); } } /* Wait for a response. 
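 * Any response that does not match this command's CID is parked on
 * nq_rx_capsules so that a later nvmf_host_receive_response() or
 * nvmf_host_wait_for_response() call can still find it.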
*/ for (;;) { error = nvmf_host_receive_capsule(qp, &rc); if (error != 0) return (error); if (rc->nc_cqe.cid != cc->nc_sqe.cid) { TAILQ_INSERT_TAIL(&qp->nq_rx_capsules, rc, nc_link); continue; } *rcp = rc; return (0); } } struct nvmf_capsule * nvmf_keepalive(struct nvmf_qpair *qp) { struct nvme_command cmd; if (!qp->nq_admin) { errno = EINVAL; return (NULL); } nvmf_init_sqe(&cmd, NVME_OPC_KEEP_ALIVE); return (nvmf_allocate_command(qp, &cmd)); } static struct nvmf_capsule * nvmf_get_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size) { struct nvmf_fabric_prop_get_cmd cmd; nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_PROPERTY_GET); switch (size) { case 4: cmd.attrib.size = NVMF_PROP_SIZE_4; break; case 8: cmd.attrib.size = NVMF_PROP_SIZE_8; break; default: errno = EINVAL; return (NULL); } cmd.ofst = htole32(offset); return (nvmf_allocate_command(qp, &cmd)); } int nvmf_read_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, uint64_t *value) { struct nvmf_capsule *cc, *rc; const struct nvmf_fabric_prop_get_rsp *rsp; uint16_t status; int error; if (!qp->nq_admin) return (EINVAL); cc = nvmf_get_property(qp, offset, size); if (cc == NULL) return (errno); error = nvmf_host_transmit_command(cc); if (error != 0) { nvmf_free_capsule(cc); return (error); } error = nvmf_host_wait_for_response(cc, &rc); nvmf_free_capsule(cc); if (error != 0) return (error); rsp = (const struct nvmf_fabric_prop_get_rsp *)&rc->nc_cqe; status = le16toh(rc->nc_cqe.status); if (status != 0) { printf("NVMF: PROPERTY_GET failed, status %#x\n", status); nvmf_free_capsule(rc); return (EIO); } if (size == 8) *value = le64toh(rsp->value.u64); else *value = le32toh(rsp->value.u32.low); nvmf_free_capsule(rc); return (0); } static struct nvmf_capsule * nvmf_set_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, uint64_t value) { struct nvmf_fabric_prop_set_cmd cmd; nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_PROPERTY_SET); switch (size) { case 4: cmd.attrib.size = NVMF_PROP_SIZE_4; cmd.value.u32.low = htole32(value); break; case 8: cmd.attrib.size = NVMF_PROP_SIZE_8; cmd.value.u64 = htole64(value); break; default: errno = EINVAL; return (NULL); } cmd.ofst = htole32(offset); return (nvmf_allocate_command(qp, &cmd)); } int nvmf_write_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, uint64_t value) { struct nvmf_capsule *cc, *rc; uint16_t status; int error; if (!qp->nq_admin) return (EINVAL); cc = nvmf_set_property(qp, offset, size, value); if (cc == NULL) return (errno); error = nvmf_host_transmit_command(cc); if (error != 0) { nvmf_free_capsule(cc); return (error); } error = nvmf_host_wait_for_response(cc, &rc); nvmf_free_capsule(cc); if (error != 0) return (error); status = le16toh(rc->nc_cqe.status); if (status != 0) { printf("NVMF: PROPERTY_SET failed, status %#x\n", status); nvmf_free_capsule(rc); return (EIO); } nvmf_free_capsule(rc); return (0); } int nvmf_hostid_from_hostuuid(uint8_t hostid[16]) { char hostuuid_str[64]; uuid_t hostuuid; size_t len; uint32_t status; len = sizeof(hostuuid_str); if (sysctlbyname("kern.hostuuid", hostuuid_str, &len, NULL, 0) != 0) return (errno); uuid_from_string(hostuuid_str, &hostuuid, &status); switch (status) { case uuid_s_ok: break; case uuid_s_no_memory: return (ENOMEM); default: return (EINVAL); } uuid_enc_le(hostid, &hostuuid); return (0); } int nvmf_nqn_from_hostuuid(char nqn[NVMF_NQN_MAX_LEN]) { char hostuuid_str[64]; size_t len; len = sizeof(hostuuid_str); if (sysctlbyname("kern.hostuuid", hostuuid_str, &len, NULL, 0) != 0) return 
(errno); strlcpy(nqn, NVMF_NQN_UUID_PRE, NVMF_NQN_MAX_LEN); strlcat(nqn, hostuuid_str, NVMF_NQN_MAX_LEN); return (0); } int nvmf_host_identify_controller(struct nvmf_qpair *qp, struct nvme_controller_data *cdata) { struct nvme_command cmd; struct nvmf_capsule *cc, *rc; int error; uint16_t status; if (!qp->nq_admin) return (EINVAL); nvmf_init_sqe(&cmd, NVME_OPC_IDENTIFY); /* 5.15.1 Use CNS of 0x01 for controller data. */ cmd.cdw10 = htole32(1); cc = nvmf_allocate_command(qp, &cmd); if (cc == NULL) return (errno); error = nvmf_capsule_append_data(cc, cdata, sizeof(*cdata), false); if (error != 0) { nvmf_free_capsule(cc); return (error); } error = nvmf_host_transmit_command(cc); if (error != 0) { nvmf_free_capsule(cc); return (error); } error = nvmf_host_wait_for_response(cc, &rc); nvmf_free_capsule(cc); if (error != 0) return (error); status = le16toh(rc->nc_cqe.status); if (status != 0) { printf("NVMF: IDENTIFY failed, status %#x\n", status); nvmf_free_capsule(rc); return (EIO); } nvmf_free_capsule(rc); return (0); } int nvmf_host_identify_namespace(struct nvmf_qpair *qp, uint32_t nsid, struct nvme_namespace_data *nsdata) { struct nvme_command cmd; struct nvmf_capsule *cc, *rc; int error; uint16_t status; if (!qp->nq_admin) return (EINVAL); nvmf_init_sqe(&cmd, NVME_OPC_IDENTIFY); /* 5.15.1 Use CNS of 0x00 for namespace data. */ cmd.cdw10 = htole32(0); cmd.nsid = htole32(nsid); cc = nvmf_allocate_command(qp, &cmd); if (cc == NULL) return (errno); error = nvmf_capsule_append_data(cc, nsdata, sizeof(*nsdata), false); if (error != 0) { nvmf_free_capsule(cc); return (error); } error = nvmf_host_transmit_command(cc); if (error != 0) { nvmf_free_capsule(cc); return (error); } error = nvmf_host_wait_for_response(cc, &rc); nvmf_free_capsule(cc); if (error != 0) return (error); status = le16toh(rc->nc_cqe.status); if (status != 0) { printf("NVMF: IDENTIFY failed, status %#x\n", status); nvmf_free_capsule(rc); return (EIO); } nvmf_free_capsule(rc); return (0); } static int nvmf_get_discovery_log_page(struct nvmf_qpair *qp, uint64_t offset, void *buf, size_t len) { struct nvme_command cmd; struct nvmf_capsule *cc, *rc; size_t numd; int error; uint16_t status; if (len % 4 != 0 || len == 0 || offset % 4 != 0) return (EINVAL); numd = (len / 4) - 1; nvmf_init_sqe(&cmd, NVME_OPC_GET_LOG_PAGE); cmd.cdw10 = htole32(numd << 16 | NVME_LOG_DISCOVERY); cmd.cdw11 = htole32(numd >> 16); cmd.cdw12 = htole32(offset); cmd.cdw13 = htole32(offset >> 32); cc = nvmf_allocate_command(qp, &cmd); if (cc == NULL) return (errno); error = nvmf_capsule_append_data(cc, buf, len, false); if (error != 0) { nvmf_free_capsule(cc); return (error); } error = nvmf_host_transmit_command(cc); if (error != 0) { nvmf_free_capsule(cc); return (error); } error = nvmf_host_wait_for_response(cc, &rc); nvmf_free_capsule(cc); if (error != 0) return (error); status = le16toh(rc->nc_cqe.status); if (NVMEV(NVME_STATUS_SC, status) == NVMF_FABRIC_SC_LOG_RESTART_DISCOVERY) { nvmf_free_capsule(rc); return (EAGAIN); } if (status != 0) { printf("NVMF: GET_LOG_PAGE failed, status %#x\n", status); nvmf_free_capsule(rc); return (EIO); } nvmf_free_capsule(rc); return (0); } int nvmf_host_fetch_discovery_log_page(struct nvmf_qpair *qp, struct nvme_discovery_log **logp) { struct nvme_discovery_log hdr, *log; size_t payload_len; int error; if (!qp->nq_admin) return (EINVAL); log = NULL; for (;;) { error = nvmf_get_discovery_log_page(qp, 0, &hdr, sizeof(hdr)); if (error != 0) { free(log); return (error); } nvme_discovery_log_swapbytes(&hdr); if (hdr.recfmt != 0) 
{ printf("NVMF: Unsupported discovery log format: %d\n", hdr.recfmt); free(log); return (EINVAL); } if (hdr.numrec > 1024) { printf("NVMF: Too many discovery log entries: %ju\n", (uintmax_t)hdr.numrec); free(log); return (EFBIG); } payload_len = sizeof(log->entries[0]) * hdr.numrec; log = reallocf(log, sizeof(*log) + payload_len); if (log == NULL) return (ENOMEM); *log = hdr; if (hdr.numrec == 0) break; error = nvmf_get_discovery_log_page(qp, sizeof(hdr), log->entries, payload_len); if (error == EAGAIN) continue; if (error != 0) { free(log); return (error); } /* Re-read the header and check the generation count. */ error = nvmf_get_discovery_log_page(qp, 0, &hdr, sizeof(hdr)); if (error != 0) { free(log); return (error); } nvme_discovery_log_swapbytes(&hdr); if (log->genctr != hdr.genctr) continue; for (u_int i = 0; i < log->numrec; i++) nvme_discovery_log_entry_swapbytes(&log->entries[i]); break; } *logp = log; return (0); } int nvmf_host_request_queues(struct nvmf_qpair *qp, u_int requested, u_int *actual) { struct nvme_command cmd; struct nvmf_capsule *cc, *rc; int error; uint16_t status; if (!qp->nq_admin || requested < 1 || requested > 65535) return (EINVAL); /* The number of queues is 0's based. */ requested--; nvmf_init_sqe(&cmd, NVME_OPC_SET_FEATURES); cmd.cdw10 = htole32(NVME_FEAT_NUMBER_OF_QUEUES); /* Same number of completion and submission queues. */ cmd.cdw11 = htole32((requested << 16) | requested); cc = nvmf_allocate_command(qp, &cmd); if (cc == NULL) return (errno); error = nvmf_host_transmit_command(cc); if (error != 0) { nvmf_free_capsule(cc); return (error); } error = nvmf_host_wait_for_response(cc, &rc); nvmf_free_capsule(cc); if (error != 0) return (error); status = le16toh(rc->nc_cqe.status); if (status != 0) { printf("NVMF: SET_FEATURES failed, status %#x\n", status); nvmf_free_capsule(rc); return (EIO); } *actual = (le32toh(rc->nc_cqe.cdw0) & 0xffff) + 1; nvmf_free_capsule(rc); return (0); } static bool is_queue_pair_idle(struct nvmf_qpair *qp) { if (qp->nq_sqhd != qp->nq_sqtail) return (false); if (!TAILQ_EMPTY(&qp->nq_rx_capsules)) return (false); return (true); } static int -prepare_queues_for_handoff(struct nvmf_handoff_host *hh, - struct nvmf_qpair *admin_qp, u_int num_queues, - struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata) +prepare_queues_for_handoff(struct nvmf_ioc_nv *nv, struct nvmf_qpair *admin_qp, + u_int num_queues, struct nvmf_qpair **io_queues, + const struct nvme_controller_data *cdata) { - struct nvmf_handoff_qpair_params *io; + nvlist_t *nvl, *nvl_qp; u_int i; int error; - memset(hh, 0, sizeof(*hh)); + if (num_queues == 0) + return (EINVAL); /* All queue pairs must be idle. */ if (!is_queue_pair_idle(admin_qp)) return (EBUSY); for (i = 0; i < num_queues; i++) { if (!is_queue_pair_idle(io_queues[i])) return (EBUSY); } + nvl = nvlist_create(0); + nvlist_add_number(nvl, "trtype", admin_qp->nq_association->na_trtype); + nvlist_add_number(nvl, "kato", admin_qp->nq_kato); + /* First, the admin queue. */ - hh->trtype = admin_qp->nq_association->na_trtype; - hh->kato = admin_qp->nq_kato; - error = nvmf_kernel_handoff_params(admin_qp, &hh->admin); - if (error) + error = nvmf_kernel_handoff_params(admin_qp, &nvl_qp); + if (error) { + nvlist_destroy(nvl); return (error); + } + nvlist_move_nvlist(nvl, "admin", nvl_qp); /* Next, the I/O queues. 
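 * Each I/O queue's transport parameters become one element of the
 * "io" nvlist array; with no separate num_io_queues field, the kernel
 * can derive the queue count from the array length.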
*/ - hh->num_io_queues = num_queues; - io = calloc(num_queues, sizeof(*io)); for (i = 0; i < num_queues; i++) { - error = nvmf_kernel_handoff_params(io_queues[i], &io[i]); + error = nvmf_kernel_handoff_params(io_queues[i], &nvl_qp); if (error) { - free(io); + nvlist_destroy(nvl); return (error); } + nvlist_append_nvlist_array(nvl, "io", nvl_qp); } - hh->io = io; - hh->cdata = cdata; - return (0); + nvlist_add_binary(nvl, "cdata", cdata, sizeof(*cdata)); + + error = nvmf_pack_ioc_nvlist(nv, nvl); + nvlist_destroy(nvl); + return (error); } int nvmf_handoff_host(struct nvmf_qpair *admin_qp, u_int num_queues, struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata) { - struct nvmf_handoff_host hh; + struct nvmf_ioc_nv nv; u_int i; int error, fd; fd = open("/dev/nvmf", O_RDWR); if (fd == -1) { error = errno; goto out; } - error = prepare_queues_for_handoff(&hh, admin_qp, num_queues, io_queues, + error = prepare_queues_for_handoff(&nv, admin_qp, num_queues, io_queues, cdata); if (error != 0) goto out; - if (ioctl(fd, NVMF_HANDOFF_HOST, &hh) == -1) + if (ioctl(fd, NVMF_HANDOFF_HOST, &nv) == -1) error = errno; - free(hh.io); + free(nv.data); out: if (fd >= 0) close(fd); for (i = 0; i < num_queues; i++) (void)nvmf_free_qpair(io_queues[i]); (void)nvmf_free_qpair(admin_qp); return (error); } int nvmf_disconnect_host(const char *host) { int error, fd; error = 0; fd = open("/dev/nvmf", O_RDWR); if (fd == -1) { error = errno; goto out; } if (ioctl(fd, NVMF_DISCONNECT_HOST, &host) == -1) error = errno; out: if (fd >= 0) close(fd); return (error); } int nvmf_disconnect_all(void) { int error, fd; error = 0; fd = open("/dev/nvmf", O_RDWR); if (fd == -1) { error = errno; goto out; } if (ioctl(fd, NVMF_DISCONNECT_ALL) == -1) error = errno; out: if (fd >= 0) close(fd); return (error); } -int -nvmf_reconnect_params(int fd, struct nvmf_reconnect_params *rparams) +static int +nvmf_read_ioc_nv(int fd, u_long com, nvlist_t **nvlp) { - if (ioctl(fd, NVMF_RECONNECT_PARAMS, rparams) == -1) + struct nvmf_ioc_nv nv; + nvlist_t *nvl; + int error; + + memset(&nv, 0, sizeof(nv)); + if (ioctl(fd, com, &nv) == -1) return (errno); + + nv.data = malloc(nv.len); + nv.size = nv.len; + if (ioctl(fd, com, &nv) == -1) { + error = errno; + free(nv.data); + return (error); + } + + nvl = nvlist_unpack(nv.data, nv.len, 0); + free(nv.data); + if (nvl == NULL) + return (EINVAL); + + *nvlp = nvl; return (0); } +int +nvmf_reconnect_params(int fd, nvlist_t **nvlp) +{ + return (nvmf_read_ioc_nv(fd, NVMF_RECONNECT_PARAMS, nvlp)); +} + int nvmf_reconnect_host(int fd, struct nvmf_qpair *admin_qp, u_int num_queues, struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata) { - struct nvmf_handoff_host hh; + struct nvmf_ioc_nv nv; u_int i; int error; - error = prepare_queues_for_handoff(&hh, admin_qp, num_queues, io_queues, + error = prepare_queues_for_handoff(&nv, admin_qp, num_queues, io_queues, cdata); if (error != 0) goto out; - if (ioctl(fd, NVMF_RECONNECT_HOST, &hh) == -1) + if (ioctl(fd, NVMF_RECONNECT_HOST, &nv) == -1) error = errno; - free(hh.io); + free(nv.data); out: for (i = 0; i < num_queues; i++) (void)nvmf_free_qpair(io_queues[i]); (void)nvmf_free_qpair(admin_qp); return (error); } diff --git a/lib/libnvmf/nvmf_tcp.c b/lib/libnvmf/nvmf_tcp.c index 264a5bb154a0..3f794b5d9750 100644 --- a/lib/libnvmf/nvmf_tcp.c +++ b/lib/libnvmf/nvmf_tcp.c @@ -1,1482 +1,1479 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022-2024 Chelsio Communications, Inc. 
* Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include #include "libnvmf.h" #include "internal.h" #include "nvmf_tcp.h" struct nvmf_tcp_qpair; struct nvmf_tcp_command_buffer { struct nvmf_tcp_qpair *qp; void *data; size_t data_len; size_t data_xfered; uint32_t data_offset; uint16_t cid; uint16_t ttag; LIST_ENTRY(nvmf_tcp_command_buffer) link; }; LIST_HEAD(nvmf_tcp_command_buffer_list, nvmf_tcp_command_buffer); struct nvmf_tcp_association { struct nvmf_association na; uint32_t ioccsz; }; struct nvmf_tcp_rxpdu { struct nvme_tcp_common_pdu_hdr *hdr; uint32_t data_len; }; struct nvmf_tcp_capsule { struct nvmf_capsule nc; struct nvmf_tcp_rxpdu rx_pdu; struct nvmf_tcp_command_buffer *cb; TAILQ_ENTRY(nvmf_tcp_capsule) link; }; struct nvmf_tcp_qpair { struct nvmf_qpair qp; int s; uint8_t txpda; uint8_t rxpda; bool header_digests; bool data_digests; uint32_t maxr2t; uint32_t maxh2cdata; uint32_t max_icd; /* Host only */ uint16_t next_ttag; /* Controller only */ struct nvmf_tcp_command_buffer_list tx_buffers; struct nvmf_tcp_command_buffer_list rx_buffers; TAILQ_HEAD(, nvmf_tcp_capsule) rx_capsules; }; #define TASSOC(nc) ((struct nvmf_tcp_association *)(na)) #define TCAP(nc) ((struct nvmf_tcp_capsule *)(nc)) #define CTCAP(nc) ((const struct nvmf_tcp_capsule *)(nc)) #define TQP(qp) ((struct nvmf_tcp_qpair *)(qp)) static const char zero_padding[NVME_TCP_PDU_PDO_MAX_OFFSET]; static uint32_t compute_digest(const void *buf, size_t len) { return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff); } static struct nvmf_tcp_command_buffer * tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, void *data, uint32_t data_offset, size_t data_len, uint16_t cid, uint16_t ttag, bool receive) { struct nvmf_tcp_command_buffer *cb; cb = malloc(sizeof(*cb)); cb->qp = qp; cb->data = data; cb->data_offset = data_offset; cb->data_len = data_len; cb->data_xfered = 0; cb->cid = cid; cb->ttag = ttag; if (receive) LIST_INSERT_HEAD(&qp->rx_buffers, cb, link); else LIST_INSERT_HEAD(&qp->tx_buffers, cb, link); return (cb); } static struct nvmf_tcp_command_buffer * tcp_find_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, bool receive) { struct nvmf_tcp_command_buffer_list *list; struct nvmf_tcp_command_buffer *cb; list = receive ? 
&qp->rx_buffers : &qp->tx_buffers; LIST_FOREACH(cb, list, link) { if (cb->cid == cid && cb->ttag == ttag) return (cb); } return (NULL); } static void tcp_purge_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, bool receive) { struct nvmf_tcp_command_buffer *cb; cb = tcp_find_command_buffer(qp, cid, ttag, receive); if (cb != NULL) LIST_REMOVE(cb, link); } static void tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb) { LIST_REMOVE(cb, link); free(cb); } static int nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, const void *pdu, size_t len) { ssize_t nwritten; const char *cp; cp = pdu; while (len != 0) { nwritten = write(qp->s, cp, len); if (nwritten < 0) return (errno); len -= nwritten; cp += nwritten; } return (0); } static int nvmf_tcp_write_pdu_iov(struct nvmf_tcp_qpair *qp, struct iovec *iov, u_int iovcnt, size_t len) { ssize_t nwritten; for (;;) { nwritten = writev(qp->s, iov, iovcnt); if (nwritten < 0) return (errno); len -= nwritten; if (len == 0) return (0); while (iov->iov_len <= (size_t)nwritten) { nwritten -= iov->iov_len; iovcnt--; iov++; } iov->iov_base = (char *)iov->iov_base + nwritten; iov->iov_len -= nwritten; } } static void nvmf_tcp_report_error(struct nvmf_association *na, struct nvmf_tcp_qpair *qp, uint16_t fes, uint32_t fei, const void *rx_pdu, size_t pdu_len, u_int hlen) { struct nvme_tcp_term_req_hdr hdr; struct iovec iov[2]; if (hlen != 0) { if (hlen > NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE) hlen = NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE; if (hlen > pdu_len) hlen = pdu_len; } memset(&hdr, 0, sizeof(hdr)); hdr.common.pdu_type = na->na_controller ? NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ; hdr.common.hlen = sizeof(hdr); hdr.common.plen = sizeof(hdr) + hlen; hdr.fes = htole16(fes); le32enc(hdr.fei, fei); iov[0].iov_base = &hdr; iov[0].iov_len = sizeof(hdr); iov[1].iov_base = __DECONST(void *, rx_pdu); iov[1].iov_len = hlen; (void)nvmf_tcp_write_pdu_iov(qp, iov, nitems(iov), sizeof(hdr) + hlen); close(qp->s); qp->s = -1; } static int nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu, size_t pdu_len) { const struct nvme_tcp_common_pdu_hdr *ch; uint32_t data_len, fei, plen; uint32_t digest, rx_digest; u_int hlen; int error; uint16_t fes; /* Determine how large of a PDU header to return for errors. */ ch = pdu->hdr; hlen = ch->hlen; plen = le32toh(ch->plen); if (hlen < sizeof(*ch) || hlen > plen) hlen = sizeof(*ch); error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_association->na_controller, qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes, &fei); if (error != 0) { if (error == ECONNRESET) { close(qp->s); qp->s = -1; } else { nvmf_tcp_report_error(qp->qp.nq_association, qp, fes, fei, ch, pdu_len, hlen); } return (error); } /* Check header digest if present. */ if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) { digest = compute_digest(ch, ch->hlen); memcpy(&rx_digest, (const char *)ch + ch->hlen, sizeof(rx_digest)); if (digest != rx_digest) { printf("NVMe/TCP: Header digest mismatch\n"); nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, ch, pdu_len, hlen); return (EBADMSG); } } /* Check data digest if present. 
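 * The DDGST is the CRC32C of the data payload (starting at the PDO)
 * and occupies the last four bytes of the PDU.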
*/ if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) { digest = compute_digest((const char *)ch + ch->pdo, data_len); memcpy(&rx_digest, (const char *)ch + plen - sizeof(rx_digest), sizeof(rx_digest)); if (digest != rx_digest) { printf("NVMe/TCP: Data digest mismatch\n"); return (EBADMSG); } } pdu->data_len = data_len; return (0); } /* * Read data from a socket, retrying until the data has been fully * read or an error occurs. */ static int nvmf_tcp_read_buffer(int s, void *buf, size_t len) { ssize_t nread; char *cp; cp = buf; while (len != 0) { nread = read(s, cp, len); if (nread < 0) return (errno); if (nread == 0) return (ECONNRESET); len -= nread; cp += nread; } return (0); } static int nvmf_tcp_read_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { struct nvme_tcp_common_pdu_hdr ch; uint32_t plen; int error; memset(pdu, 0, sizeof(*pdu)); error = nvmf_tcp_read_buffer(qp->s, &ch, sizeof(ch)); if (error != 0) return (error); plen = le32toh(ch.plen); /* * Validate a header with garbage lengths to trigger * an error message without reading more. */ if (plen < sizeof(ch) || ch.hlen > plen) { pdu->hdr = &ch; error = nvmf_tcp_validate_pdu(qp, pdu, sizeof(ch)); pdu->hdr = NULL; assert(error != 0); return (error); } /* Read the rest of the PDU. */ pdu->hdr = malloc(plen); memcpy(pdu->hdr, &ch, sizeof(ch)); error = nvmf_tcp_read_buffer(qp->s, pdu->hdr + 1, plen - sizeof(ch)); if (error != 0) return (error); error = nvmf_tcp_validate_pdu(qp, pdu, plen); if (error != 0) { free(pdu->hdr); pdu->hdr = NULL; } return (error); } static void nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu) { free(pdu->hdr); pdu->hdr = NULL; } static int nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu) { struct nvme_tcp_term_req_hdr *hdr; hdr = (void *)pdu->hdr; printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n", le16toh(hdr->fes), le32dec(hdr->fei)); nvmf_tcp_free_pdu(pdu); return (ECONNRESET); } static int nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { struct nvme_tcp_cmd *cmd; struct nvmf_capsule *nc; struct nvmf_tcp_capsule *tc; cmd = (void *)pdu->hdr; nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe); if (nc == NULL) return (ENOMEM); tc = TCAP(nc); tc->rx_pdu = *pdu; TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link); return (0); } static int nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { struct nvme_tcp_rsp *rsp; struct nvmf_capsule *nc; struct nvmf_tcp_capsule *tc; rsp = (void *)pdu->hdr; nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe); if (nc == NULL) return (ENOMEM); nc->nc_sqhd_valid = true; tc = TCAP(nc); tc->rx_pdu = *pdu; TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link); /* * Once the CQE has been received, no further transfers to the * command buffer for the associated CID can occur. */ tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, true); tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, false); return (0); } /* * Construct and send a PDU that contains an optional data payload. * This includes dealing with digests and the length fields in the * common header. 
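 * The resulting PDU layout is: CH + PSH, an optional header digest,
 * zero padding up to the transmit PDA, the data payload, and an
 * optional data digest, with plen covering the whole PDU.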
*/ static int nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen, void *data, uint32_t data_len) { struct nvme_tcp_common_pdu_hdr *ch; struct iovec iov[5]; u_int iovcnt; uint32_t header_digest, data_digest, pad, pdo, plen; plen = hlen; if (qp->header_digests) plen += sizeof(header_digest); if (data_len != 0) { pdo = roundup(plen, qp->txpda); pad = pdo - plen; plen = pdo + data_len; if (qp->data_digests) plen += sizeof(data_digest); } else { assert(data == NULL); pdo = 0; pad = 0; } ch = hdr; ch->hlen = hlen; if (qp->header_digests) ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF; if (qp->data_digests && data_len != 0) ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF; ch->pdo = pdo; ch->plen = htole32(plen); /* CH + PSH */ iov[0].iov_base = hdr; iov[0].iov_len = hlen; iovcnt = 1; /* HDGST */ if (qp->header_digests) { header_digest = compute_digest(hdr, hlen); iov[iovcnt].iov_base = &header_digest; iov[iovcnt].iov_len = sizeof(header_digest); iovcnt++; } if (pad != 0) { /* PAD */ iov[iovcnt].iov_base = __DECONST(char *, zero_padding); iov[iovcnt].iov_len = pad; iovcnt++; } if (data_len != 0) { /* DATA */ iov[iovcnt].iov_base = data; iov[iovcnt].iov_len = data_len; iovcnt++; /* DDGST */ if (qp->data_digests) { data_digest = compute_digest(data, data_len); iov[iovcnt].iov_base = &data_digest; iov[iovcnt].iov_len = sizeof(data_digest); iovcnt++; } } return (nvmf_tcp_write_pdu_iov(qp, iov, iovcnt, plen)); } static int nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { struct nvme_tcp_h2c_data_hdr *h2c; struct nvmf_tcp_command_buffer *cb; uint32_t data_len, data_offset; const char *icd; h2c = (void *)pdu->hdr; if (le32toh(h2c->datal) > qp->maxh2cdata) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0, pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } cb = tcp_find_command_buffer(qp, h2c->cccid, h2c->ttag, true); if (cb == NULL) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } data_len = le32toh(h2c->datal); if (data_len != pdu->data_len) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } data_offset = le32toh(h2c->datao); if (data_offset < cb->data_offset || data_offset + data_len > cb->data_offset + cb->data_len) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } if (data_offset != cb->data_offset + cb->data_xfered) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } if ((cb->data_xfered + data_len == cb->data_len) != ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } cb->data_xfered += data_len; data_offset -= cb->data_offset; icd = (const char *)pdu->hdr + pdu->hdr->pdo; memcpy((char *)cb->data + data_offset, icd, 
data_len); nvmf_tcp_free_pdu(pdu); return (0); } static int nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { struct nvme_tcp_c2h_data_hdr *c2h; struct nvmf_tcp_command_buffer *cb; uint32_t data_len, data_offset; const char *icd; c2h = (void *)pdu->hdr; cb = tcp_find_command_buffer(qp, c2h->cccid, 0, true); if (cb == NULL) { /* * XXX: Could be PDU sequence error if cccid is for a * command that doesn't use a command buffer. */ nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } data_len = le32toh(c2h->datal); if (data_len != pdu->data_len) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } data_offset = le32toh(c2h->datao); if (data_offset < cb->data_offset || data_offset + data_len > cb->data_offset + cb->data_len) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } if (data_offset != cb->data_offset + cb->data_xfered) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } if ((cb->data_xfered + data_len == cb->data_len) != ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } cb->data_xfered += data_len; data_offset -= cb->data_offset; icd = (const char *)pdu->hdr + pdu->hdr->pdo; memcpy((char *)cb->data + data_offset, icd, data_len); if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { struct nvme_completion cqe; struct nvmf_tcp_capsule *tc; struct nvmf_capsule *nc; memset(&cqe, 0, sizeof(cqe)); cqe.cid = cb->cid; nc = nvmf_allocate_response(&qp->qp, &cqe); if (nc == NULL) { nvmf_tcp_free_pdu(pdu); return (ENOMEM); } nc->nc_sqhd_valid = false; tc = TCAP(nc); TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link); } nvmf_tcp_free_pdu(pdu); return (0); } /* NB: cid and ttag and little-endian already. */ static int tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, uint32_t data_offset, void *buf, size_t len, bool last_pdu) { struct nvme_tcp_h2c_data_hdr h2c; memset(&h2c, 0, sizeof(h2c)); h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA; if (last_pdu) h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; h2c.cccid = cid; h2c.ttag = ttag; h2c.datao = htole32(data_offset); h2c.datal = htole32(len); return (nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), buf, len)); } /* Sends one or more H2C_DATA PDUs, subject to MAXH2CDATA. 
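The PDO/PLEN bookkeeping in nvmf_tcp_construct_pdu() above is easiest to follow with concrete numbers. pdu_layout() below is an illustrative helper, not part of the patch; it assumes a 72-byte command-capsule header (8-byte common header plus a 64-byte SQE) and the (pda + 1) * 4 encoding that the connect/accept paths apply to CPDA/HPDA:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* roundup() as in <sys/param.h>: round x up to the next multiple of y. */
#define	ROUNDUP(x, y)	((((x) + ((y) - 1)) / (y)) * (y))

static void
pdu_layout(uint32_t hlen, uint32_t txpda, bool hdgst, bool ddgst,
    uint32_t data_len)
{
	uint32_t plen, pdo, pad;

	plen = hlen + (hdgst ? 4 : 0);
	if (data_len != 0) {
		pdo = ROUNDUP(plen, txpda);
		pad = pdo - plen;
		plen = pdo + data_len + (ddgst ? 4 : 0);
	} else {
		pdo = 0;
		pad = 0;
	}
	printf("hlen=%u pdo=%u pad=%u plen=%u\n", hlen, pdo, pad, plen);
}

int
main(void)
{
	/* CPDA=1 on the peer => txpda = (1 + 1) * 4 = 8 bytes. */
	pdu_layout(72, 8, true, true, 512);	/* hlen=72 pdo=80 pad=4 plen=596 */
	return (0);
}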
*/ static int tcp_send_h2c_pdus(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, uint32_t data_offset, void *buf, size_t len, bool last_pdu) { char *p; p = buf; while (len != 0) { size_t todo; int error; todo = len; if (todo > qp->maxh2cdata) todo = qp->maxh2cdata; error = tcp_send_h2c_pdu(qp, cid, ttag, data_offset, p, todo, last_pdu && todo == len); if (error != 0) return (error); p += todo; len -= todo; } return (0); } static int nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { struct nvmf_tcp_command_buffer *cb; struct nvme_tcp_r2t_hdr *r2t; uint32_t data_len, data_offset; int error; r2t = (void *)pdu->hdr; cb = tcp_find_command_buffer(qp, r2t->cccid, 0, false); if (cb == NULL) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } data_offset = le32toh(r2t->r2to); if (data_offset != cb->data_xfered) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } /* * XXX: The spec does not specify how to handle R2T tranfers * out of range of the original command. */ data_len = le32toh(r2t->r2tl); if (data_offset + data_len > cb->data_len) { nvmf_tcp_report_error(qp->qp.nq_association, qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } cb->data_xfered += data_len; /* * Write out one or more H2C_DATA PDUs containing the * requested data. */ error = tcp_send_h2c_pdus(qp, r2t->cccid, r2t->ttag, data_offset, (char *)cb->data + data_offset, data_len, true); nvmf_tcp_free_pdu(pdu); return (error); } static int nvmf_tcp_receive_pdu(struct nvmf_tcp_qpair *qp) { struct nvmf_tcp_rxpdu pdu; int error; error = nvmf_tcp_read_pdu(qp, &pdu); if (error != 0) return (error); switch (pdu.hdr->pdu_type) { default: __unreachable(); break; case NVME_TCP_PDU_TYPE_H2C_TERM_REQ: case NVME_TCP_PDU_TYPE_C2H_TERM_REQ: return (nvmf_tcp_handle_term_req(&pdu)); case NVME_TCP_PDU_TYPE_CAPSULE_CMD: return (nvmf_tcp_save_command_capsule(qp, &pdu)); case NVME_TCP_PDU_TYPE_CAPSULE_RESP: return (nvmf_tcp_save_response_capsule(qp, &pdu)); case NVME_TCP_PDU_TYPE_H2C_DATA: return (nvmf_tcp_handle_h2c_data(qp, &pdu)); case NVME_TCP_PDU_TYPE_C2H_DATA: return (nvmf_tcp_handle_c2h_data(qp, &pdu)); case NVME_TCP_PDU_TYPE_R2T: return (nvmf_tcp_handle_r2t(qp, &pdu)); } } static bool nvmf_tcp_validate_ic_pdu(struct nvmf_association *na, struct nvmf_tcp_qpair *qp, const struct nvme_tcp_common_pdu_hdr *ch, size_t pdu_len) { const struct nvme_tcp_ic_req *pdu; uint32_t plen; u_int hlen; /* Determine how large of a PDU header to return for errors. */ hlen = ch->hlen; plen = le32toh(ch->plen); if (hlen < sizeof(*ch) || hlen > plen) hlen = sizeof(*ch); /* * Errors must be reported for the lowest incorrect field * first, so validate fields in order. */ /* Validate pdu_type. */ /* Controllers only receive PDUs with a PDU direction of 0. 
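The "PDU direction of 0" check that follows keys off the low bit of pdu_type, which NVMe/TCP uses to encode the originator. A small sketch (pdu_sent_by_host() is an illustrative name; the type values are quoted from the NVMe/TCP spec):

#include <stdbool.h>
#include <stdint.h>

/*
 * Host-originated PDU types have bit 0 clear, controller-originated
 * types have bit 0 set:
 *
 *   0x00 ICReq        host -> controller
 *   0x01 ICResp       controller -> host
 *   0x02 H2CTermReq   host -> controller
 *   0x03 C2HTermReq   controller -> host
 *   0x04 CapsuleCmd   host -> controller
 *   0x05 CapsuleResp  controller -> host
 *   0x06 H2CData      host -> controller
 *   0x07 C2HData      controller -> host
 *   0x09 R2T          controller -> host
 */
static inline bool
pdu_sent_by_host(uint8_t pdu_type)
{
	return ((pdu_type & 0x01) == 0);
}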
*/ if (na->na_controller != ((ch->pdu_type & 0x01) == 0)) { na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type); nvmf_tcp_report_error(na, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len, hlen); return (false); } switch (ch->pdu_type) { case NVME_TCP_PDU_TYPE_IC_REQ: case NVME_TCP_PDU_TYPE_IC_RESP: break; default: na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type); nvmf_tcp_report_error(na, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len, hlen); return (false); } /* Validate flags. */ if (ch->flags != 0) { na_error(na, "NVMe/TCP: Invalid PDU header flags %#x", ch->flags); nvmf_tcp_report_error(na, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1, ch, pdu_len, hlen); return (false); } /* Validate hlen. */ if (ch->hlen != 128) { na_error(na, "NVMe/TCP: Invalid PDU header length %u", ch->hlen); nvmf_tcp_report_error(na, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 2, ch, pdu_len, hlen); return (false); } /* Validate pdo. */ if (ch->pdo != 0) { na_error(na, "NVMe/TCP: Invalid PDU data offset %u", ch->pdo); nvmf_tcp_report_error(na, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 3, ch, pdu_len, hlen); return (false); } /* Validate plen. */ if (plen != 128) { na_error(na, "NVMe/TCP: Invalid PDU length %u", plen); nvmf_tcp_report_error(na, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 4, ch, pdu_len, hlen); return (false); } /* Validate fields common to both ICReq and ICResp. */ pdu = (const struct nvme_tcp_ic_req *)ch; if (le16toh(pdu->pfv) != 0) { na_error(na, "NVMe/TCP: Unsupported PDU version %u", le16toh(pdu->pfv)); nvmf_tcp_report_error(na, qp, NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER, 8, ch, pdu_len, hlen); return (false); } if (pdu->hpda > NVME_TCP_HPDA_MAX) { na_error(na, "NVMe/TCP: Unsupported PDA %u", pdu->hpda); nvmf_tcp_report_error(na, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 10, ch, pdu_len, hlen); return (false); } if (pdu->dgst.bits.reserved != 0) { na_error(na, "NVMe/TCP: Invalid digest settings"); nvmf_tcp_report_error(na, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 11, ch, pdu_len, hlen); return (false); } return (true); } static bool nvmf_tcp_read_ic_req(struct nvmf_association *na, struct nvmf_tcp_qpair *qp, struct nvme_tcp_ic_req *pdu) { int error; error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu)); if (error != 0) { na_error(na, "NVMe/TCP: Failed to read IC request: %s", strerror(error)); return (false); } return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu))); } static bool nvmf_tcp_read_ic_resp(struct nvmf_association *na, struct nvmf_tcp_qpair *qp, struct nvme_tcp_ic_resp *pdu) { int error; error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu)); if (error != 0) { na_error(na, "NVMe/TCP: Failed to read IC response: %s", strerror(error)); return (false); } return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu))); } static struct nvmf_association * tcp_allocate_association(bool controller, const struct nvmf_association_params *params) { struct nvmf_tcp_association *ta; if (controller) { /* 7.4.10.3 */ if (params->tcp.maxh2cdata < 4096 || params->tcp.maxh2cdata % 4 != 0) return (NULL); } ta = calloc(1, sizeof(*ta)); return (&ta->na); } static void tcp_update_association(struct nvmf_association *na, const struct nvme_controller_data *cdata) { struct nvmf_tcp_association *ta = TASSOC(na); ta->ioccsz = le32toh(cdata->ioccsz); } static void tcp_free_association(struct nvmf_association *na) { free(na); } static bool tcp_connect(struct nvmf_tcp_qpair *qp, struct 
nvmf_association *na, bool admin) { const struct nvmf_association_params *params = &na->na_params; struct nvmf_tcp_association *ta = TASSOC(na); struct nvme_tcp_ic_req ic_req; struct nvme_tcp_ic_resp ic_resp; uint32_t maxh2cdata; int error; if (!admin) { if (ta->ioccsz == 0) { na_error(na, "TCP I/O queues require cdata"); return (false); } if (ta->ioccsz < 4) { na_error(na, "Invalid IOCCSZ %u", ta->ioccsz); return (false); } } memset(&ic_req, 0, sizeof(ic_req)); ic_req.common.pdu_type = NVME_TCP_PDU_TYPE_IC_REQ; ic_req.common.hlen = sizeof(ic_req); ic_req.common.plen = htole32(sizeof(ic_req)); ic_req.pfv = htole16(0); ic_req.hpda = params->tcp.pda; if (params->tcp.header_digests) ic_req.dgst.bits.hdgst_enable = 1; if (params->tcp.data_digests) ic_req.dgst.bits.ddgst_enable = 1; ic_req.maxr2t = htole32(params->tcp.maxr2t); error = nvmf_tcp_write_pdu(qp, &ic_req, sizeof(ic_req)); if (error != 0) { na_error(na, "Failed to write IC request: %s", strerror(error)); return (false); } if (!nvmf_tcp_read_ic_resp(na, qp, &ic_resp)) return (false); /* Ensure the controller didn't enable digests we didn't request. */ if ((!params->tcp.header_digests && ic_resp.dgst.bits.hdgst_enable != 0) || (!params->tcp.data_digests && ic_resp.dgst.bits.ddgst_enable != 0)) { na_error(na, "Controller enabled unrequested digests"); nvmf_tcp_report_error(na, qp, NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER, 11, &ic_resp, sizeof(ic_resp), sizeof(ic_resp)); return (false); } /* * XXX: Is there an upper-bound to enforce here? Perhaps pick * some large value and report larger values as an unsupported * parameter? */ maxh2cdata = le32toh(ic_resp.maxh2cdata); if (maxh2cdata < 4096 || maxh2cdata % 4 != 0) { na_error(na, "Invalid MAXH2CDATA %u", maxh2cdata); nvmf_tcp_report_error(na, qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 12, &ic_resp, sizeof(ic_resp), sizeof(ic_resp)); return (false); } qp->rxpda = (params->tcp.pda + 1) * 4; qp->txpda = (ic_resp.cpda + 1) * 4; qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0; qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0; qp->maxr2t = params->tcp.maxr2t; qp->maxh2cdata = maxh2cdata; if (admin) /* 7.4.3 */ qp->max_icd = 8192; else qp->max_icd = (ta->ioccsz - 4) * 16; return (0); } static bool tcp_accept(struct nvmf_tcp_qpair *qp, struct nvmf_association *na) { const struct nvmf_association_params *params = &na->na_params; struct nvme_tcp_ic_req ic_req; struct nvme_tcp_ic_resp ic_resp; int error; if (!nvmf_tcp_read_ic_req(na, qp, &ic_req)) return (false); memset(&ic_resp, 0, sizeof(ic_resp)); ic_resp.common.pdu_type = NVME_TCP_PDU_TYPE_IC_RESP; ic_resp.common.hlen = sizeof(ic_req); ic_resp.common.plen = htole32(sizeof(ic_req)); ic_resp.pfv = htole16(0); ic_resp.cpda = params->tcp.pda; if (params->tcp.header_digests && ic_req.dgst.bits.hdgst_enable != 0) ic_resp.dgst.bits.hdgst_enable = 1; if (params->tcp.data_digests && ic_req.dgst.bits.ddgst_enable != 0) ic_resp.dgst.bits.ddgst_enable = 1; ic_resp.maxh2cdata = htole32(params->tcp.maxh2cdata); error = nvmf_tcp_write_pdu(qp, &ic_resp, sizeof(ic_resp)); if (error != 0) { na_error(na, "Failed to write IC response: %s", strerror(error)); return (false); } qp->rxpda = (params->tcp.pda + 1) * 4; qp->txpda = (ic_req.hpda + 1) * 4; qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0; qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0; qp->maxr2t = le32toh(ic_req.maxr2t); qp->maxh2cdata = params->tcp.maxh2cdata; qp->max_icd = 0; /* XXX */ return (0); } static struct nvmf_qpair * tcp_allocate_qpair(struct 
nvmf_association *na, const struct nvmf_qpair_params *qparams) { const struct nvmf_association_params *aparams = &na->na_params; struct nvmf_tcp_qpair *qp; int error; if (aparams->tcp.pda > NVME_TCP_CPDA_MAX) { na_error(na, "Invalid PDA"); return (NULL); } qp = calloc(1, sizeof(*qp)); qp->s = qparams->tcp.fd; LIST_INIT(&qp->rx_buffers); LIST_INIT(&qp->tx_buffers); TAILQ_INIT(&qp->rx_capsules); if (na->na_controller) error = tcp_accept(qp, na); else error = tcp_connect(qp, na, qparams->admin); if (error != 0) { free(qp); return (NULL); } return (&qp->qp); } static void tcp_free_qpair(struct nvmf_qpair *nq) { struct nvmf_tcp_qpair *qp = TQP(nq); struct nvmf_tcp_capsule *ntc, *tc; struct nvmf_tcp_command_buffer *ncb, *cb; TAILQ_FOREACH_SAFE(tc, &qp->rx_capsules, link, ntc) { TAILQ_REMOVE(&qp->rx_capsules, tc, link); nvmf_free_capsule(&tc->nc); } LIST_FOREACH_SAFE(cb, &qp->rx_buffers, link, ncb) { tcp_free_command_buffer(cb); } LIST_FOREACH_SAFE(cb, &qp->tx_buffers, link, ncb) { tcp_free_command_buffer(cb); } free(qp); } -static int -tcp_kernel_handoff_params(struct nvmf_qpair *nq, - struct nvmf_handoff_qpair_params *qparams) +static void +tcp_kernel_handoff_params(struct nvmf_qpair *nq, nvlist_t *nvl) { struct nvmf_tcp_qpair *qp = TQP(nq); - qparams->tcp.fd = qp->s; - qparams->tcp.rxpda = qp->rxpda; - qparams->tcp.txpda = qp->txpda; - qparams->tcp.header_digests = qp->header_digests; - qparams->tcp.data_digests = qp->data_digests; - qparams->tcp.maxr2t = qp->maxr2t; - qparams->tcp.maxh2cdata = qp->maxh2cdata; - qparams->tcp.max_icd = qp->max_icd; - - return (0); + nvlist_add_number(nvl, "fd", qp->s); + nvlist_add_number(nvl, "rxpda", qp->rxpda); + nvlist_add_number(nvl, "txpda", qp->txpda); + nvlist_add_bool(nvl, "header_digests", qp->header_digests); + nvlist_add_bool(nvl, "data_digests", qp->data_digests); + nvlist_add_number(nvl, "maxr2t", qp->maxr2t); + nvlist_add_number(nvl, "maxh2cdata", qp->maxh2cdata); + nvlist_add_number(nvl, "max_icd", qp->max_icd); } static struct nvmf_capsule * tcp_allocate_capsule(struct nvmf_qpair *qp __unused) { struct nvmf_tcp_capsule *nc; nc = calloc(1, sizeof(*nc)); return (&nc->nc); } static void tcp_free_capsule(struct nvmf_capsule *nc) { struct nvmf_tcp_capsule *tc = TCAP(nc); nvmf_tcp_free_pdu(&tc->rx_pdu); if (tc->cb != NULL) tcp_free_command_buffer(tc->cb); free(tc); } static int tcp_transmit_command(struct nvmf_capsule *nc) { struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); struct nvmf_tcp_capsule *tc = TCAP(nc); struct nvme_tcp_cmd cmd; struct nvme_sgl_descriptor *sgl; int error; bool use_icd; use_icd = false; if (nc->nc_data_len != 0 && nc->nc_send_data && nc->nc_data_len <= qp->max_icd) use_icd = true; memset(&cmd, 0, sizeof(cmd)); cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD; cmd.ccsqe = nc->nc_sqe; /* Populate SGL in SQE. */ sgl = &cmd.ccsqe.sgl; memset(sgl, 0, sizeof(*sgl)); sgl->address = 0; sgl->length = htole32(nc->nc_data_len); if (use_icd) { /* Use in-capsule data. */ sgl->type = NVME_SGL_TYPE_ICD; } else { /* Use a command buffer. */ sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER; } /* Send command capsule. */ error = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), use_icd ? nc->nc_data : NULL, use_icd ? nc->nc_data_len : 0); if (error != 0) return (error); /* * If data will be transferred using a command buffer, allocate a * buffer structure and queue it. 
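The core of this patch is visible in tcp_kernel_handoff_params() above: transport-specific handoff state now travels to the kernel as nvlist entries rather than a fixed struct. The sketch below builds a list with the same keys used here and by the generic nvmf_kernel_handoff_params() (rewritten later in this patch); the values are placeholders, and the program links against libnv (-lnv):

#include <sys/nv.h>
#include <err.h>
#include <stdbool.h>
#include <stdlib.h>

int
main(void)
{
	nvlist_t *nvl;
	void *packed;
	size_t size;

	nvl = nvlist_create(0);

	/* Generic queue-pair state (nvmf_kernel_handoff_params keys). */
	nvlist_add_bool(nvl, "admin", true);
	nvlist_add_bool(nvl, "sq_flow_control", false);
	nvlist_add_number(nvl, "qsize", 32);
	nvlist_add_number(nvl, "sqhd", 0);
	nvlist_add_number(nvl, "sqtail", 0);	/* host side only */

	/* TCP-specific state (tcp_kernel_handoff_params keys). */
	nvlist_add_number(nvl, "fd", 3);	/* placeholder socket fd */
	nvlist_add_number(nvl, "rxpda", 4);
	nvlist_add_number(nvl, "txpda", 4);
	nvlist_add_bool(nvl, "header_digests", false);
	nvlist_add_bool(nvl, "data_digests", false);
	nvlist_add_number(nvl, "maxr2t", 1);
	nvlist_add_number(nvl, "maxh2cdata", 256 * 1024);
	nvlist_add_number(nvl, "max_icd", 8192);

	/* libnv remembers the first allocation error; check once here. */
	if (nvlist_error(nvl) != 0)
		errc(1, nvlist_error(nvl), "building handoff nvlist");

	packed = nvlist_pack(nvl, &size);
	if (packed == NULL)
		errx(1, "nvlist_pack failed");

	free(packed);
	nvlist_destroy(nvl);
	return (0);
}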
*/ if (nc->nc_data_len != 0 && !use_icd) tc->cb = tcp_alloc_command_buffer(qp, nc->nc_data, 0, nc->nc_data_len, cmd.ccsqe.cid, 0, !nc->nc_send_data); return (0); } static int tcp_transmit_response(struct nvmf_capsule *nc) { struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); struct nvme_tcp_rsp rsp; memset(&rsp, 0, sizeof(rsp)); rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP; rsp.rccqe = nc->nc_cqe; return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0)); } static int tcp_transmit_capsule(struct nvmf_capsule *nc) { if (nc->nc_qe_len == sizeof(struct nvme_command)) return (tcp_transmit_command(nc)); else return (tcp_transmit_response(nc)); } static int tcp_receive_capsule(struct nvmf_qpair *nq, struct nvmf_capsule **ncp) { struct nvmf_tcp_qpair *qp = TQP(nq); struct nvmf_tcp_capsule *tc; int error; while (TAILQ_EMPTY(&qp->rx_capsules)) { error = nvmf_tcp_receive_pdu(qp); if (error != 0) return (error); } tc = TAILQ_FIRST(&qp->rx_capsules); TAILQ_REMOVE(&qp->rx_capsules, tc, link); *ncp = &tc->nc; return (0); } static uint8_t tcp_validate_command_capsule(const struct nvmf_capsule *nc) { const struct nvmf_tcp_capsule *tc = CTCAP(nc); const struct nvme_sgl_descriptor *sgl; assert(tc->rx_pdu.hdr != NULL); sgl = &nc->nc_sqe.sgl; switch (sgl->type) { case NVME_SGL_TYPE_ICD: if (tc->rx_pdu.data_len != le32toh(sgl->length)) { printf("NVMe/TCP: Command Capsule with mismatched ICD length\n"); return (NVME_SC_DATA_SGL_LENGTH_INVALID); } break; case NVME_SGL_TYPE_COMMAND_BUFFER: if (tc->rx_pdu.data_len != 0) { printf("NVMe/TCP: Command Buffer SGL with ICD\n"); return (NVME_SC_INVALID_FIELD); } break; default: printf("NVMe/TCP: Invalid SGL type in Command Capsule\n"); return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID); } if (sgl->address != 0) { printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n"); return (NVME_SC_SGL_OFFSET_INVALID); } return (NVME_SC_SUCCESS); } static size_t tcp_capsule_data_len(const struct nvmf_capsule *nc) { assert(nc->nc_qe_len == sizeof(struct nvme_command)); return (le32toh(nc->nc_sqe.sgl.length)); } /* NB: cid and ttag are both little-endian already. */ static int tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, uint32_t data_offset, uint32_t data_len) { struct nvme_tcp_r2t_hdr r2t; memset(&r2t, 0, sizeof(r2t)); r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T; r2t.cccid = cid; r2t.ttag = ttag; r2t.r2to = htole32(data_offset); r2t.r2tl = htole32(data_len); return (nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0)); } static int tcp_receive_r2t_data(const struct nvmf_capsule *nc, uint32_t data_offset, void *buf, size_t len) { struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); struct nvmf_tcp_command_buffer *cb; int error; uint16_t ttag; /* * Don't bother byte-swapping ttag as it is just a cookie * value returned by the other end as-is. */ ttag = qp->next_ttag++; error = tcp_send_r2t(qp, nc->nc_sqe.cid, ttag, data_offset, len); if (error != 0) return (error); cb = tcp_alloc_command_buffer(qp, buf, data_offset, len, nc->nc_sqe.cid, ttag, true); /* Parse received PDUs until the data transfer is complete. 
*/ while (cb->data_xfered < cb->data_len) { error = nvmf_tcp_receive_pdu(qp); if (error != 0) break; } tcp_free_command_buffer(cb); return (error); } static int tcp_receive_icd_data(const struct nvmf_capsule *nc, uint32_t data_offset, void *buf, size_t len) { const struct nvmf_tcp_capsule *tc = CTCAP(nc); const char *icd; icd = (const char *)tc->rx_pdu.hdr + tc->rx_pdu.hdr->pdo + data_offset; memcpy(buf, icd, len); return (0); } static int tcp_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset, void *buf, size_t len) { struct nvmf_association *na = nc->nc_qpair->nq_association; const struct nvme_sgl_descriptor *sgl; size_t data_len; if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller) return (EINVAL); sgl = &nc->nc_sqe.sgl; data_len = le32toh(sgl->length); if (data_offset + len > data_len) return (EFBIG); if (sgl->type == NVME_SGL_TYPE_ICD) return (tcp_receive_icd_data(nc, data_offset, buf, len)); else return (tcp_receive_r2t_data(nc, data_offset, buf, len)); } /* NB: cid is little-endian already. */ static int tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint32_t data_offset, const void *buf, size_t len, bool last_pdu, bool success) { struct nvme_tcp_c2h_data_hdr c2h; memset(&c2h, 0, sizeof(c2h)); c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA; if (last_pdu) c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU; if (success) c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS; c2h.cccid = cid; c2h.datao = htole32(data_offset); c2h.datal = htole32(len); return (nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h), __DECONST(void *, buf), len)); } static int tcp_send_controller_data(const struct nvmf_capsule *nc, const void *buf, size_t len) { struct nvmf_association *na = nc->nc_qpair->nq_association; struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); const struct nvme_sgl_descriptor *sgl; const char *src; size_t todo; uint32_t data_len, data_offset; int error; bool last_pdu, send_success_flag; if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller) return (EINVAL); sgl = &nc->nc_sqe.sgl; data_len = le32toh(sgl->length); if (len != data_len) { nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); return (EFBIG); } if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) { nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); return (EINVAL); } /* Use the SUCCESS flag if SQ flow control is disabled. */ send_success_flag = !qp->qp.nq_flow_control; /* * Write out one or more C2H_DATA PDUs containing the data. * Each PDU is arbitrarily capped at 256k. 
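A compact sketch of the transmit plan implemented by the loop that follows: payloads are cut into 256 KiB C2H_DATA PDUs, only the final PDU carries LAST_PDU, and the SUCCESS flag replaces an explicit CAPSULE_RESP only when SQ flow control is disabled (plan_c2h() is an illustrative name):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static void
plan_c2h(size_t len, bool sq_flow_control)
{
	size_t data_offset = 0, todo;
	bool last;

	while (len > 0) {
		todo = len > 256 * 1024 ? 256 * 1024 : len;
		last = (todo == len);
		printf("C2H_DATA datao=%zu datal=%zu%s%s\n", data_offset, todo,
		    last ? " LAST_PDU" : "",
		    (last && !sq_flow_control) ? " SUCCESS" : "");
		data_offset += todo;
		len -= todo;
	}
	if (sq_flow_control)
		printf("CAPSULE_RESP\n");
}

int
main(void)
{
	/* 600 KiB read completion with SQ flow control disabled. */
	plan_c2h(600 * 1024, false);
	return (0);
}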
*/ data_offset = 0; src = buf; while (len > 0) { if (len > 256 * 1024) { todo = 256 * 1024; last_pdu = false; } else { todo = len; last_pdu = true; } error = tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, src, todo, last_pdu, last_pdu && send_success_flag); if (error != 0) { nvmf_send_generic_error(nc, NVME_SC_TRANSIENT_TRANSPORT_ERROR); return (error); } data_offset += todo; src += todo; len -= todo; } if (!send_success_flag) nvmf_send_success(nc); return (0); } struct nvmf_transport_ops tcp_ops = { .allocate_association = tcp_allocate_association, .update_association = tcp_update_association, .free_association = tcp_free_association, .allocate_qpair = tcp_allocate_qpair, .free_qpair = tcp_free_qpair, .kernel_handoff_params = tcp_kernel_handoff_params, .allocate_capsule = tcp_allocate_capsule, .free_capsule = tcp_free_capsule, .transmit_capsule = tcp_transmit_capsule, .receive_capsule = tcp_receive_capsule, .validate_command_capsule = tcp_validate_command_capsule, .capsule_data_len = tcp_capsule_data_len, .receive_controller_data = tcp_receive_controller_data, .send_controller_data = tcp_send_controller_data, }; diff --git a/lib/libnvmf/nvmf_transport.c b/lib/libnvmf/nvmf_transport.c index 1a8505f2a993..fa3826b8c50d 100644 --- a/lib/libnvmf/nvmf_transport.c +++ b/lib/libnvmf/nvmf_transport.c @@ -1,269 +1,298 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include #include #include #include #include "libnvmf.h" #include "internal.h" struct nvmf_association * nvmf_allocate_association(enum nvmf_trtype trtype, bool controller, const struct nvmf_association_params *params) { struct nvmf_transport_ops *ops; struct nvmf_association *na; switch (trtype) { case NVMF_TRTYPE_TCP: ops = &tcp_ops; break; default: errno = EINVAL; return (NULL); } na = ops->allocate_association(controller, params); if (na == NULL) return (NULL); na->na_ops = ops; na->na_trtype = trtype; na->na_controller = controller; na->na_params = *params; na->na_last_error = NULL; refcount_init(&na->na_refs, 1); return (na); } void nvmf_update_assocation(struct nvmf_association *na, const struct nvme_controller_data *cdata) { na->na_ops->update_association(na, cdata); } void nvmf_free_association(struct nvmf_association *na) { if (refcount_release(&na->na_refs)) { free(na->na_last_error); na->na_ops->free_association(na); } } const char * nvmf_association_error(const struct nvmf_association *na) { return (na->na_last_error); } void na_clear_error(struct nvmf_association *na) { free(na->na_last_error); na->na_last_error = NULL; } void na_error(struct nvmf_association *na, const char *fmt, ...) 
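Stepping back from the transport internals, this is roughly how a host-side consumer drives the transport-independent API in nvmf_transport.c: allocate an association, then hand each connected socket to nvmf_allocate_qpair(). open_admin_queue() is an illustrative name, error handling is abbreviated, libnvmf.h is assumed to be on the include path, and the Fabrics CONNECT exchange itself is a separate host-API step omitted here:

#include <err.h>
#include <stdbool.h>
#include <string.h>

#include <libnvmf.h>

static struct nvmf_qpair *
open_admin_queue(int s)
{
	struct nvmf_association_params aparams;
	struct nvmf_qpair_params qparams;
	struct nvmf_association *na;
	struct nvmf_qpair *qp;

	memset(&aparams, 0, sizeof(aparams));
	aparams.sq_flow_control = false;
	aparams.tcp.pda = 0;
	aparams.tcp.header_digests = false;
	aparams.tcp.data_digests = false;
	aparams.tcp.maxr2t = 1;

	/* false: this is the host side of the association. */
	na = nvmf_allocate_association(NVMF_TRTYPE_TCP, false, &aparams);
	if (na == NULL)
		err(1, "nvmf_allocate_association");

	memset(&qparams, 0, sizeof(qparams));
	qparams.admin = true;
	qparams.tcp.fd = s;		/* already-connected TCP socket */
	qp = nvmf_allocate_qpair(na, &qparams);
	if (qp == NULL)
		errx(1, "%s", nvmf_association_error(na));

	/* The qpair holds its own reference on the association. */
	nvmf_free_association(na);
	return (qp);
}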
{ va_list ap; char *str; if (na->na_last_error != NULL) return; va_start(ap, fmt); vasprintf(&str, fmt, ap); va_end(ap); na->na_last_error = str; } struct nvmf_qpair * nvmf_allocate_qpair(struct nvmf_association *na, const struct nvmf_qpair_params *params) { struct nvmf_qpair *qp; na_clear_error(na); qp = na->na_ops->allocate_qpair(na, params); if (qp == NULL) return (NULL); refcount_acquire(&na->na_refs); qp->nq_association = na; qp->nq_admin = params->admin; TAILQ_INIT(&qp->nq_rx_capsules); return (qp); } void nvmf_free_qpair(struct nvmf_qpair *qp) { struct nvmf_association *na; struct nvmf_capsule *nc, *tc; TAILQ_FOREACH_SAFE(nc, &qp->nq_rx_capsules, nc_link, tc) { TAILQ_REMOVE(&qp->nq_rx_capsules, nc, nc_link); nvmf_free_capsule(nc); } na = qp->nq_association; na->na_ops->free_qpair(qp); nvmf_free_association(na); } struct nvmf_capsule * nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe) { struct nvmf_capsule *nc; nc = qp->nq_association->na_ops->allocate_capsule(qp); if (nc == NULL) return (NULL); nc->nc_qpair = qp; nc->nc_qe_len = sizeof(struct nvme_command); memcpy(&nc->nc_sqe, sqe, nc->nc_qe_len); /* 4.2 of NVMe base spec: Fabrics always uses SGL. */ nc->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT); nc->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL); return (nc); } struct nvmf_capsule * nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe) { struct nvmf_capsule *nc; nc = qp->nq_association->na_ops->allocate_capsule(qp); if (nc == NULL) return (NULL); nc->nc_qpair = qp; nc->nc_qe_len = sizeof(struct nvme_completion); memcpy(&nc->nc_cqe, cqe, nc->nc_qe_len); return (nc); } int nvmf_capsule_append_data(struct nvmf_capsule *nc, void *buf, size_t len, bool send) { if (nc->nc_qe_len == sizeof(struct nvme_completion)) return (EINVAL); if (nc->nc_data_len != 0) return (EBUSY); nc->nc_data = buf; nc->nc_data_len = len; nc->nc_send_data = send; return (0); } void nvmf_free_capsule(struct nvmf_capsule *nc) { nc->nc_qpair->nq_association->na_ops->free_capsule(nc); } int nvmf_transmit_capsule(struct nvmf_capsule *nc) { return (nc->nc_qpair->nq_association->na_ops->transmit_capsule(nc)); } int nvmf_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp) { return (qp->nq_association->na_ops->receive_capsule(qp, ncp)); } const void * nvmf_capsule_sqe(const struct nvmf_capsule *nc) { assert(nc->nc_qe_len == sizeof(struct nvme_command)); return (&nc->nc_sqe); } const void * nvmf_capsule_cqe(const struct nvmf_capsule *nc) { assert(nc->nc_qe_len == sizeof(struct nvme_completion)); return (&nc->nc_cqe); } uint8_t nvmf_validate_command_capsule(const struct nvmf_capsule *nc) { assert(nc->nc_qe_len == sizeof(struct nvme_command)); if (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) != NVME_PSDT_SGL) return (NVME_SC_INVALID_FIELD); return (nc->nc_qpair->nq_association->na_ops->validate_command_capsule(nc)); } size_t nvmf_capsule_data_len(const struct nvmf_capsule *nc) { return (nc->nc_qpair->nq_association->na_ops->capsule_data_len(nc)); } int nvmf_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset, void *buf, size_t len) { return (nc->nc_qpair->nq_association->na_ops->receive_controller_data(nc, data_offset, buf, len)); } int nvmf_send_controller_data(const struct nvmf_capsule *nc, const void *buf, size_t len) { return (nc->nc_qpair->nq_association->na_ops->send_controller_data(nc, buf, len)); } int -nvmf_kernel_handoff_params(struct nvmf_qpair *qp, - struct nvmf_handoff_qpair_params *qparams) +nvmf_kernel_handoff_params(struct nvmf_qpair *qp, nvlist_t **nvlp) { - 
memset(qparams, 0, sizeof(*qparams)); - qparams->admin = qp->nq_admin; - qparams->sq_flow_control = qp->nq_flow_control; - qparams->qsize = qp->nq_qsize; - qparams->sqhd = qp->nq_sqhd; - qparams->sqtail = qp->nq_sqtail; - return (qp->nq_association->na_ops->kernel_handoff_params(qp, qparams)); + nvlist_t *nvl; + int error; + + nvl = nvlist_create(0); + nvlist_add_bool(nvl, "admin", qp->nq_admin); + nvlist_add_bool(nvl, "sq_flow_control", qp->nq_flow_control); + nvlist_add_number(nvl, "qsize", qp->nq_qsize); + nvlist_add_number(nvl, "sqhd", qp->nq_sqhd); + if (!qp->nq_association->na_controller) + nvlist_add_number(nvl, "sqtail", qp->nq_sqtail); + qp->nq_association->na_ops->kernel_handoff_params(qp, nvl); + error = nvlist_error(nvl); + if (error != 0) { + nvlist_destroy(nvl); + return (error); + } + + *nvlp = nvl; + return (0); } const char * nvmf_transport_type(uint8_t trtype) { static _Thread_local char buf[8]; switch (trtype) { case NVMF_TRTYPE_RDMA: return ("RDMA"); case NVMF_TRTYPE_FC: return ("Fibre Channel"); case NVMF_TRTYPE_TCP: return ("TCP"); case NVMF_TRTYPE_INTRA_HOST: return ("Intra-host"); default: snprintf(buf, sizeof(buf), "0x%02x\n", trtype); return (buf); } } + +int +nvmf_pack_ioc_nvlist(struct nvmf_ioc_nv *nv, nvlist_t *nvl) +{ + int error; + + memset(nv, 0, sizeof(*nv)); + + error = nvlist_error(nvl); + if (error) + return (error); + + nv->data = nvlist_pack(nvl, &nv->size); + if (nv->data == NULL) + return (ENOMEM); + + return (0); +} diff --git a/rescue/rescue/Makefile b/rescue/rescue/Makefile index 4474a0af050f..797daf3d2f14 100644 --- a/rescue/rescue/Makefile +++ b/rescue/rescue/Makefile @@ -1,279 +1,279 @@ .include .include PACKAGE=rescue MAN= MK_SSP= no # Static-PIE is not supported so we should not be linking against _pie.a libs. # This is also needed to avoid linking against sanitizer-instrumented libraries # since MK_ASAN/MK_UBSAN will instrument the .pieo object files. MK_PIE= no NO_SHARED= yes CRUNCH_BUILDOPTS+= MK_PIE=no NO_SHARED=yes # lld >= 16 became more strict about multiply defined symbols. Since there are # many of those in crunchgen'd programs, turn off the check. .if ${LINKER_TYPE} == "lld" && ${LINKER_VERSION} >= 160000 LDFLAGS+= -Wl,--allow-multiple-definition .endif PROG= rescue BINDIR?=/rescue SCRIPTS+= dhclient_FIXED SCRIPTSNAME_dhclient_FIXED= dhclient-script dhclient_FIXED: ../../sbin/dhclient/dhclient-script sed '1s/\/bin\//\/rescue\//' ${.ALLSRC} > ${.TARGET} CLEANFILES+= dhclient_FIXED # The help which used to be here is now in mk/bsd.crunchgen.mk # Define Makefile variable RESCUE CRUNCH_BUILDOPTS+= -DRESCUE # Define compile-time RESCUE symbol when compiling components CRUNCH_BUILDOPTS+= CRUNCH_CFLAGS=-DRESCUE # An experiment that failed: try overriding bsd.lib.mk and bsd.prog.mk # rather than incorporating rescue-specific logic into standard files. #MAKEFLAGS= -m ${.CURDIR} ${.MAKEFLAGS} # Hackery: 'librescue' exists merely as a tool for appropriately # recompiling specific library entries. We _know_ they're needed, and # regular archive searching creates ugly library ordering problems. # Easiest fix: tell the linker to include them into the executable # first, so they are guaranteed to override the regular lib entries. # Note that if 'librescue' hasn't been compiled, we'll just get the # regular lib entries from libc and friends. 
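Before the build-glue hunks, one note on nvmf_pack_ioc_nvlist() above: it hands the kernel a (data, size) pair produced by nvlist_pack(). The round trip below shows what that buffer is; the receiving side presumably reconstructs it with the equivalent of nvlist_unpack(), which is an assumption of this sketch (link with -lnv):

#include <sys/nv.h>
#include <err.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	nvlist_t *nvl, *copy;
	void *data;
	size_t size;

	nvl = nvlist_create(0);
	nvlist_add_bool(nvl, "sq_flow_control", true);
	nvlist_add_number(nvl, "qsize", 128);

	/* Serialize: this is the buffer stashed in the (data, size) pair. */
	data = nvlist_pack(nvl, &size);
	if (data == NULL)
		errx(1, "nvlist_pack failed");

	/* Deserialize, as the receiving side would. */
	copy = nvlist_unpack(data, size, 0);
	if (copy == NULL)
		errx(1, "nvlist_unpack failed");
	printf("qsize %ju\n", (uintmax_t)nvlist_get_number(copy, "qsize"));

	free(data);
	nvlist_destroy(copy);
	nvlist_destroy(nvl);
	return (0);
}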
CRUNCH_LIBS+= ${.OBJDIR}/../librescue/*.o ################################################################### # Programs from stock /bin # # WARNING: Changing this list may require adjusting # /usr/include/paths.h as well! You were warned! # CRUNCH_SRCDIRS+= bin CRUNCH_PROGS_bin= cat chflags chio chmod cp date dd df echo \ ed expr getfacl hostname kenv kill ln ls mkdir mv \ pkill ps pwd realpath rm rmdir setfacl sh sleep stty \ sync test CRUNCH_LIBS+= -lcrypt -ledit -ljail -lkvm -lelf -ltermcapw -lutil -lxo CRUNCH_BUILDTOOLS+= bin/sh # Additional options for specific programs CRUNCH_ALIAS_test= [ CRUNCH_ALIAS_sh= -sh # The -sh alias shouldn't appear in /rescue as a hard link CRUNCH_SUPPRESS_LINK_-sh= 1 CRUNCH_ALIAS_ln= link CRUNCH_ALIAS_rm= unlink CRUNCH_ALIAS_ed= red CRUNCH_ALIAS_pkill= pgrep .if ${MK_TCSH} != "no" CRUNCH_PROGS_bin+= csh CRUNCH_ALIAS_csh= -csh tcsh -tcsh CRUNCH_BUILDTOOLS+= bin/csh CRUNCH_SUPPRESS_LINK_-csh= 1 CRUNCH_SUPPRESS_LINK_-tcsh= 1 .endif ################################################################### # Programs from standard /sbin # # WARNING: Changing this list may require adjusting # /usr/include/paths.h as well! You were warned! # # Note that mdmfs have their own private 'pathnames.h' # headers in addition to the standard 'paths.h' header. # CRUNCH_SRCDIRS+= sbin CRUNCH_PROGS_sbin= \ camcontrol clri devfs dmesg dump \ dumpfs dumpon fsck fsck_ffs fsck_msdosfs fsdb \ fsirand geom ifconfig init \ kldconfig kldload kldstat kldunload ldconfig \ md5 mdconfig mdmfs mknod mount mount_cd9660 \ mount_msdosfs mount_nfs mount_nullfs \ mount_udf mount_unionfs newfs \ newfs_msdos nos-tun reboot \ restore rcorder route savecore \ shutdown swapon sysctl tunefs umount .if ${MK_CCD} != "no" CRUNCH_PROGS_sbin+= ccdconfig .endif .if ${MK_INET} != "no" || ${MK_INET6} != "no" CRUNCH_PROGS_sbin+= ping .endif .if ${MK_INET6_SUPPORT} != "no" CRUNCH_ALIAS_ping= ping6 CRUNCH_PROGS_sbin+= rtsol .endif .if ${MK_IPFILTER} != "no" CRUNCH_PROGS_sbin+= ipf CRUNCH_LIBS_ipf+= ${LIBIPF} .endif .if ${MK_IPFW} != "no" CRUNCH_PROGS_sbin+= ipfw .endif .if ${MK_PF} != "no" CRUNCH_PROGS_sbin+= pfctl CRUNCH_LIBS_pfctl+= ${LIBPFCTL} ${LIBNV} .endif .if ${MK_ROUTED} != "no" CRUNCH_PROGS_sbin+= routed rtquery .endif .if ${MK_ZFS} != "no" CRUNCH_PROGS_sbin+= bectl CRUNCH_PROGS_sbin+= zfs CRUNCH_PROGS_sbin+= zfsbootcfg CRUNCH_PROGS_sbin+= zpool CRUNCH_PROGS_usr.sbin+= zdb .endif # crunchgen does not like C++ programs; this should be fixed someday # CRUNCH_PROGS+= devd CRUNCH_LIBS+= -l80211 -lalias -lcam -lncursesw -ldevstat -lipsec -llzma -CRUNCH_LIBS_camcontrol+= ${LIBNVMF} +CRUNCH_LIBS_camcontrol+= ${LIBNVMF} ${LIBNV} .if ${MK_ZFS} != "no" CRUNCH_LIBS+= -lavl -lpthread -luutil -lumem -ltpool -lspl -lrt CRUNCH_LIBS_zfs+= ${LIBBE} \ ${LIBZPOOL} \ ${LIBZFS} \ ${LIBZUTIL} \ ${LIBZFS_CORE} \ ${LIBZFSBOOTENV} \ ${LIBICP_RESCUE} \ ${LIBNVPAIR} CRUNCH_LIBS_bectl+= ${CRUNCH_LIBS_zfs} CRUNCH_LIBS_zpool+= ${CRUNCH_LIBS_zfs} CRUNCH_LIBS_zdb+= ${CRUNCH_LIBS_zfs} ${LIBZDB} CRUNCH_LIBS_zfsbootcfg+=${LIBZFSBOOTENV} \ ${LIBZPOOL} \ ${LIBZFS} \ ${LIBZUTIL} \ ${LIBZFS_CORE} \ ${LIBICP_RESCUE} \ ${LIBNVPAIR} .else # liblzma needs pthread CRUNCH_LIBS+= -lpthread .endif CRUNCH_LIBS+= -lgeom -lbsdxml -lkiconv .if ${MK_OPENSSL} == "no" CRUNCH_LIBS+= -lmd .endif CRUNCH_LIBS+= -lmt -lsbuf -lufs -lz .if ${MACHINE_CPUARCH} == "i386" CRUNCH_PROGS_sbin+= bsdlabel fdisk CRUNCH_ALIAS_bsdlabel= disklabel #CRUNCH_PROGS+= mount_smbfs #CRUNCH_LIBS+= -lsmb .endif .if ${MACHINE_CPUARCH} == "amd64" CRUNCH_PROGS_sbin+= bsdlabel fdisk 
CRUNCH_ALIAS_bsdlabel= disklabel .endif CRUNCH_SRCDIR_rtquery= ${SRCTOP}/sbin/routed/rtquery CRUNCH_SRCDIR_ipf= ${SRCTOP}/sbin/ipf/ipf .if ${MK_ZFS} != "no" CRUNCH_SRCDIR_zfs= ${SRCTOP}/cddl/sbin/zfs CRUNCH_SRCDIR_zpool= ${SRCTOP}/cddl/sbin/zpool CRUNCH_SRCDIR_zdb= ${SRCTOP}/cddl/usr.sbin/zdb .endif CRUNCH_ALIAS_reboot= fastboot halt fasthalt nextboot CRUNCH_ALIAS_restore= rrestore CRUNCH_ALIAS_dump= rdump CRUNCH_ALIAS_fsck_ffs= fsck_4.2bsd fsck_ufs CRUNCH_ALIAS_geom= glabel gpart CRUNCH_ALIAS_shutdown= poweroff # dhclient has historically been troublesome... CRUNCH_PROGS_sbin+= dhclient ################################################################## # Programs from stock /usr/bin # CRUNCH_SRCDIRS+= usr.bin CRUNCH_PROGS_usr.bin= head mt sed tail tee CRUNCH_PROGS_usr.bin+= gzip CRUNCH_ALIAS_gzip= gunzip gzcat zcat CRUNCH_PROGS_usr.bin+= bzip2 CRUNCH_ALIAS_bzip2= bunzip2 bzcat CRUNCH_LIBS+= -lbz2 CRUNCH_PROGS_usr.bin+= less CRUNCH_ALIAS_less= more CRUNCH_PROGS_usr.bin+= xz CRUNCH_ALIAS_xz= unxz lzma unlzma xzcat lzcat CRUNCH_PROGS_usr.bin+= zstd CRUNCH_ALIAS_zstd= unzstd zstdcat zstdmt CRUNCH_LIBS+= -lprivatezstd CRUNCH_PROGS_usr.bin+= fetch CRUNCH_LIBS+= -lfetch CRUNCH_PROGS_usr.bin+= tar CRUNCH_LIBS+= -larchive .if ${MK_OPENSSL} != "no" CRUNCH_LIBS+= -lssl -lcrypto .endif CRUNCH_LIBS+= -lmd .if ${MK_NETCAT} != "no" CRUNCH_PROGS_usr.bin+= nc .endif .if ${MK_VI} != "no" CRUNCH_PROGS_usr.bin+= vi CRUNCH_ALIAS_vi= ex .endif CRUNCH_PROGS_usr.bin+= id CRUNCH_ALIAS_id= groups whoami ################################################################## # Programs from stock /usr/sbin # CRUNCH_SRCDIRS+= usr.sbin CRUNCH_PROGS_usr.sbin+= chroot CRUNCH_PROGS_usr.sbin+= chown CRUNCH_ALIAS_chown= chgrp ################################################################## CRUNCH_LIBS+= ${OBJTOP}/lib/libifconfig/libifconfig.a CRUNCH_BUILDOPTS+= CRUNCH_CFLAGS+=-I${OBJTOP}/lib/libifconfig CRUNCH_LIBS_ifconfig+= ${LIBNV} CRUNCH_LIBS+= -lm .if ${MK_ISCSI} != "no" CRUNCH_PROGS_usr.bin+= iscsictl CRUNCH_PROGS_usr.sbin+= iscsid CRUNCH_LIBS+= ${OBJTOP}/lib/libiscsiutil/libiscsiutil.a CRUNCH_BUILDOPTS+= CRUNCH_CFLAGS+=-I${OBJTOP}/lib/libiscsiutil .endif .include .include diff --git a/sbin/nvmecontrol/reconnect.c b/sbin/nvmecontrol/reconnect.c index b606409eea90..4c9277bd34cb 100644 --- a/sbin/nvmecontrol/reconnect.c +++ b/sbin/nvmecontrol/reconnect.c @@ -1,170 +1,182 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ +#include #include #include #include #include #include #include #include #include "nvmecontrol.h" #include "fabrics.h" /* * See comment about other possible settings in connect.c. 
*/ static struct options { const char *dev; const char *transport; const char *address; const char *hostnqn; uint32_t kato; uint16_t num_io_queues; uint16_t queue_size; bool data_digests; bool flow_control; bool header_digests; } opt = { .dev = NULL, .transport = "tcp", .address = NULL, .hostnqn = NULL, .kato = NVMF_KATO_DEFAULT / 1000, .num_io_queues = 1, .queue_size = 0, .data_digests = false, .flow_control = false, .header_digests = false, }; static void tcp_association_params(struct nvmf_association_params *params) { params->tcp.pda = 0; params->tcp.header_digests = opt.header_digests; params->tcp.data_digests = opt.data_digests; /* XXX */ params->tcp.maxr2t = 1; } static int reconnect_nvm_controller(int fd, enum nvmf_trtype trtype, int adrfam, const char *address, const char *port) { struct nvme_controller_data cdata; struct nvmf_association_params aparams; - struct nvmf_reconnect_params rparams; + nvlist_t *rparams; struct nvmf_qpair *admin, **io; int error; error = nvmf_reconnect_params(fd, &rparams); if (error != 0) { warnc(error, "Failed to fetch reconnect parameters"); return (EX_IOERR); } + if (!nvlist_exists_number(rparams, "cntlid") || + !nvlist_exists_string(rparams, "subnqn")) { + nvlist_destroy(rparams); + warnx("Missing required reconnect parameters"); + return (EX_IOERR); + } + memset(&aparams, 0, sizeof(aparams)); aparams.sq_flow_control = opt.flow_control; switch (trtype) { case NVMF_TRTYPE_TCP: tcp_association_params(&aparams); break; default: + nvlist_destroy(rparams); warnx("Unsupported transport %s", nvmf_transport_type(trtype)); return (EX_UNAVAILABLE); } io = calloc(opt.num_io_queues, sizeof(*io)); error = connect_nvm_queues(&aparams, trtype, adrfam, address, port, - rparams.cntlid, rparams.subnqn, opt.hostnqn, opt.kato, &admin, io, - opt.num_io_queues, opt.queue_size, &cdata); + nvlist_get_number(rparams, "cntlid"), + nvlist_get_string(rparams, "subnqn"), opt.hostnqn, opt.kato, + &admin, io, opt.num_io_queues, opt.queue_size, &cdata); if (error != 0) { free(io); + nvlist_destroy(rparams); return (error); } + nvlist_destroy(rparams); error = nvmf_reconnect_host(fd, admin, opt.num_io_queues, io, &cdata); if (error != 0) { warnc(error, "Failed to handoff queues to kernel"); free(io); return (EX_IOERR); } free(io); return (0); } static void reconnect_fn(const struct cmd *f, int argc, char *argv[]) { enum nvmf_trtype trtype; const char *address, *port; char *tofree; int error, fd; if (arg_parse(argc, argv, f)) return; if (strcasecmp(opt.transport, "tcp") == 0) { trtype = NVMF_TRTYPE_TCP; } else errx(EX_USAGE, "Unsupported or invalid transport"); nvmf_parse_address(opt.address, &address, &port, &tofree); open_dev(opt.dev, &fd, 1, 1); if (port == NULL) errx(EX_USAGE, "Explicit port required"); error = reconnect_nvm_controller(fd, trtype, AF_UNSPEC, address, port); if (error != 0) exit(error); close(fd); free(tofree); } static const struct opts reconnect_opts[] = { #define OPT(l, s, t, opt, addr, desc) { l, s, t, &opt.addr, desc } OPT("transport", 't', arg_string, opt, transport, "Transport type"), OPT("nr-io-queues", 'i', arg_uint16, opt, num_io_queues, "Number of I/O queues"), OPT("queue-size", 'Q', arg_uint16, opt, queue_size, "Number of entries in each I/O queue"), OPT("keep-alive-tmo", 'k', arg_uint32, opt, kato, "Keep Alive timeout (in seconds)"), OPT("hostnqn", 'q', arg_string, opt, hostnqn, "Host NQN"), OPT("flow_control", 'F', arg_none, opt, flow_control, "Request SQ flow control"), OPT("hdr_digests", 'g', arg_none, opt, header_digests, "Enable TCP PDU header 
digests"), OPT("data_digests", 'G', arg_none, opt, data_digests, "Enable TCP PDU data digests"), { NULL, 0, arg_none, NULL, NULL } }; #undef OPT static const struct args reconnect_args[] = { { arg_string, &opt.dev, "controller-id" }, { arg_string, &opt.address, "address" }, { arg_none, NULL, NULL }, }; static struct cmd reconnect_cmd = { .name = "reconnect", .fn = reconnect_fn, .descr = "Reconnect to a fabrics controller", .ctx_size = sizeof(opt), .opts = reconnect_opts, .args = reconnect_args, }; CMD_COMMAND(reconnect_cmd); diff --git a/share/mk/src.libnames.mk b/share/mk/src.libnames.mk index 588291d8ec9c..786ad9a6f9a5 100644 --- a/share/mk/src.libnames.mk +++ b/share/mk/src.libnames.mk @@ -1,830 +1,831 @@ # # The include file define library names suitable # for INTERNALLIB and PRIVATELIB definition .if !target(____) .error src.libnames.mk cannot be included directly. .endif .if !target(____) ____: .include _PRIVATELIBS= \ atf_c \ atf_cxx \ auditd \ bsddialog \ bsdstat \ cbor \ devdctl \ event1 \ fido2 \ gmock \ gtest \ gmock_main \ gtest_main \ heimipcc \ heimipcs \ kldelf \ ldns \ sqlite3 \ ssh \ ucl \ unbound \ zstd # Let projects based on FreeBSD append to _PRIVATELIBS # by maintaining their own LOCAL_PRIVATELIBS list. _PRIVATELIBS+= ${LOCAL_PRIVATELIBS} _INTERNALLIBS= \ amu \ bsnmptools \ c_nossp_pic \ cron \ diff \ elftc \ fdt \ fifolog \ ifconfig \ ipf \ iscsiutil \ lpr \ lua \ lutok \ netbsd \ ntp \ ntpevent \ nvmf \ openbsd \ opts \ parse \ pe \ pfctl \ pmcstat \ sl \ sm \ smdb \ smutil \ telnet \ vers \ wpaap \ wpacommon \ wpacrypto \ wpadrivers \ wpaeap_common \ wpaeap_peer \ wpaeap_server \ wpaeapol_auth \ wpaeapol_supp \ wpal2_packet \ wpapasn \ wparadius \ wparsn_supp \ wpatls \ wpautils \ wpawps # Let projects based on FreeBSD append to _INTERNALLIBS # by maintaining their own LOCAL_INTERNALLIBS list. 
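Referring back to the reconnect.c hunk above: nvmf_reconnect_params() now yields an nvlist, and the new code checks that the expected keys exist before reading them because libnv's nvlist_get_*() accessors abort the process if a key of the requested type is missing. A minimal sketch of that defensive-read pattern (show_reconnect_params() is an illustrative name):

#include <sys/nv.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>

static void
show_reconnect_params(const nvlist_t *rparams)
{
	/* Existence (and type) must be verified before nvlist_get_*(). */
	if (!nvlist_exists_number(rparams, "cntlid") ||
	    !nvlist_exists_string(rparams, "subnqn"))
		errx(1, "missing required reconnect parameters");

	printf("cntlid %ju subnqn %s\n",
	    (uintmax_t)nvlist_get_number(rparams, "cntlid"),
	    nvlist_get_string(rparams, "subnqn"));
}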
_INTERNALLIBS+= ${LOCAL_INTERNALLIBS} _LIBRARIES= \ ${_PRIVATELIBS} \ ${_INTERNALLIBS} \ ${LOCAL_LIBRARIES} \ 80211 \ 9p \ alias \ archive \ asn1 \ avl \ BlocksRuntime \ be \ begemot \ bluetooth \ bsdxml \ bsm \ bsnmp \ bz2 \ c \ c_pic \ calendar \ cam \ casper \ cap_dns \ cap_fileargs \ cap_grp \ cap_net \ cap_netdb \ cap_pwd \ cap_sysctl \ cap_syslog \ com_err \ compiler_rt \ crypt \ crypto \ ctf \ cuse \ cxxrt \ devctl \ devdctl \ devinfo \ devstat \ dialog \ dl \ dpv \ dtrace \ dwarf \ edit \ efivar \ elf \ execinfo \ fetch \ figpar \ formw \ geom \ gpio \ gssapi \ gssapi_krb5 \ hdb \ heimbase \ heimntlm \ heimsqlite \ hx509 \ icp \ ipsec \ ipt \ jail \ kadm5clnt \ kadm5srv \ kafs5 \ kdc \ kiconv \ krb5 \ kvm \ l \ lzma \ m \ magic \ md \ memstat \ mp \ mt \ ncursesw \ netgraph \ netmap \ ngatm \ nv \ nvpair \ opencsd \ pam \ panel \ panelw \ pcap \ pcsclite \ pjdlog \ pmc \ proc \ procstat \ pthread \ radius \ regex \ roken \ rpcsec_gss \ rpcsvc \ rt \ rtld_db \ sbuf \ sdp \ sm \ smb \ spl \ ssl \ ssp_nonshared \ stats \ stdthreads \ supcplusplus \ sys \ sysdecode \ tacplus \ termcapw \ tinfow \ tpool \ ufs \ ugidfw \ ulog \ umem \ usb \ usbhid \ util \ uutil \ vmmapi \ wind \ wrap \ xo \ y \ ypclnt \ z \ zdb \ zfs_core \ zfs \ zfsbootenv \ zpool \ zutil .if ${MK_BLACKLIST} != "no" _LIBRARIES+= \ blacklist \ .endif .if ${MK_OFED} != "no" _LIBRARIES+= \ cxgb4 \ ibcm \ ibmad \ ibnetdisc \ ibumad \ ibverbs \ irdma \ mlx4 \ mlx5 \ rdmacm \ osmcomp \ opensm \ osmvendor .endif .if ${MK_BEARSSL} == "yes" _LIBRARIES+= \ bearssl \ secureboot \ LIBBEARSSL?= ${LIBBEARSSLDIR}/libbearssl.a LIBSECUREBOOT?= ${LIBSECUREBOOTDIR}/libsecureboot.a .endif .if ${MK_VERIEXEC} == "yes" _LIBRARIES+= veriexec LIBVERIEXEC?= ${LIBVERIEXECDIR}/libveriexec.a .endif # Each library's LIBADD needs to be duplicated here for static linkage of # 2nd+ order consumers. Auto-generating this would be better. 
_DP_80211= sbuf bsdxml _DP_9p= sbuf .if ${MK_CASPER} != "no" _DP_9p+= casper cap_pwd cap_grp .endif # XXX: Not bootstrapped so uses host version on non-FreeBSD, so don't use a # FreeBSD-specific dependency list .if ${.MAKE.OS} == "FreeBSD" || !defined(BOOTSTRAPPING) _DP_archive= z bz2 lzma bsdxml zstd .endif _DP_avl= spl _DP_bsddialog= ncursesw tinfow _DP_zstd= pthread .if ${MK_BLACKLIST} != "no" _DP_blacklist+= pthread .endif _DP_crypto= pthread # See comment by _DP_archive above .if ${.MAKE.OS} == "FreeBSD" || !defined(BOOTSTRAPPING) .if ${MK_OPENSSL} != "no" _DP_archive+= crypto .else _DP_archive+= md .endif .endif _DP_sqlite3= pthread _DP_ssl= crypto _DP_ssh= crypto crypt z .if ${MK_LDNS} != "no" _DP_ssh+= ldns .endif _DP_edit= tinfow .if ${MK_OPENSSL} != "no" _DP_bsnmp= crypto .endif _DP_geom= bsdxml sbuf _DP_cam= sbuf _DP_kldelf= elf _DP_kvm= elf _DP_casper= nv _DP_cap_dns= nv _DP_cap_fileargs= nv _DP_cap_grp= nv _DP_cap_pwd= nv _DP_cap_sysctl= nv _DP_cap_syslog= nv _DP_crypt= md .if ${MK_OFED} != "no" _DP_pcap= ibverbs mlx5 .endif _DP_pjdlog= util _DP_usb= pthread _DP_unbound= ssl crypto pthread _DP_rt= pthread .if ${MK_OPENSSL} == "no" _DP_radius= md .else _DP_radius= crypto .endif _DP_rtld_db= elf procstat _DP_procstat= kvm util elf _DP_proc= cxxrt .if ${MK_CDDL} != "no" _DP_proc+= ctf .endif _DP_proc+= elf procstat rtld_db util z _DP_mp= crypto _DP_memstat= kvm _DP_magic= z _DP_mt= sbuf bsdxml +_DP_nvmf= nv _DP_ldns= ssl crypto _DP_lua= m _DP_lutok= lua .if ${MK_OPENSSL} != "no" _DP_fetch= ssl crypto .else _DP_fetch= md .endif _DP_execinfo= elf _DP_dwarf= elf z _DP_dpv= dialog figpar util tinfow ncursesw _DP_dialog= tinfow ncursesw m _DP_cuse= pthread _DP_atf_cxx= atf_c _DP_gtest= pthread regex _DP_gmock= gtest _DP_gmock_main= gmock _DP_gtest_main= gtest _DP_devstat= kvm _DP_pam= radius tacplus md util .if ${MK_KERBEROS} != "no" _DP_pam+= krb5 .endif .if ${MK_OPENSSH} != "no" _DP_fido2+= crypto z _DP_pam+= ssh .endif .if ${MK_NIS} != "no" _DP_pam+= ypclnt .endif _DP_roken= crypt _DP_kadm5clnt= com_err krb5 roken _DP_kadm5srv= com_err hdb krb5 roken _DP_heimntlm= crypto com_err krb5 roken _DP_hx509= asn1 com_err crypto roken wind _DP_hdb= asn1 com_err krb5 roken sqlite3 _DP_asn1= com_err roken _DP_kdc= roken hdb hx509 krb5 heimntlm asn1 crypto _DP_wind= com_err roken _DP_heimbase= pthread _DP_heimipcc= heimbase roken pthread _DP_heimipcs= heimbase roken pthread _DP_kafs5= asn1 krb5 roken _DP_krb5= asn1 com_err crypt crypto hx509 roken wind heimbase heimipcc _DP_gssapi_krb5= gssapi krb5 crypto roken asn1 com_err _DP_lzma= md pthread _DP_ucl= m _DP_vmmapi= util _DP_opencsd= cxxrt _DP_ctf= spl z _DP_dtrace= ctf elf proc pthread rtld_db xo _DP_xo= util _DP_ztest= geom m nvpair umem zpool pthread avl zfs_core spl zutil zfs uutil icp # The libc dependencies are not strictly needed but are defined to make the # assert happy. _DP_c= compiler_rt sys # Use libssp_nonshared only on i386 and power*. Other archs emit direct calls # to __stack_chk_fail, not __stack_chk_fail_local provided by libssp_nonshared. .if ${MK_SSP} != "no" && \ (${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH:Mpower*} != "") _DP_c+= ssp_nonshared .endif _DP_stats= sbuf pthread _DP_stdthreads= pthread _DP_sys= compiler_rt # Use libssp_nonshared only on i386 and power*. Other archs emit direct calls # to __stack_chk_fail, not __stack_chk_fail_local provided by libssp_nonshared. 
.if ${MK_SSP} != "no" && \ (${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH:Mpower*} != "") _DP_sys+= ssp_nonshared .endif .if !defined(BOOTSTRAPPING) _DP_thr= c sys _DP_pthread= ${_DP_thr} .endif _DP_tacplus= md pam _DP_ncursesw= tinfow _DP_formw= ncursesw _DP_nvpair= spl _DP_panelw= ncursesw _DP_rpcsec_gss= gssapi _DP_smb= kiconv _DP_ulog= md _DP_fifolog= z _DP_ipf= kvm _DP_tpool= spl _DP_uutil= avl spl _DP_zfs= md pthread rt umem util uutil m avl bsdxml crypto geom nvpair \ z zfs_core zutil _DP_zfsbootenv= zfs nvpair _DP_zfs_core= nvpair spl zutil _DP_zpool= md pthread z icp spl nvpair avl umem _DP_zutil= avl geom m tpool _DP_be= zfs spl nvpair zfsbootenv _DP_netmap= _DP_ifconfig= m _DP_pfctl= nv # OFED support .if ${MK_OFED} != "no" _DP_cxgb4= ibverbs pthread _DP_ibcm= ibverbs _DP_ibmad= ibumad _DP_ibnetdisc= osmcomp ibmad ibumad _DP_ibumad= _DP_ibverbs= _DP_irdma= ibverbs pthread _DP_mlx4= ibverbs pthread _DP_mlx5= ibverbs pthread _DP_rdmacm= ibverbs _DP_osmcomp= pthread _DP_opensm= pthread _DP_osmvendor= ibumad pthread .endif # Define special cases LDADD_supcplusplus= -lsupc++ LIBATF_C= ${LIBDESTDIR}${LIBDIR_BASE}/libprivateatf-c.a LIBATF_CXX= ${LIBDESTDIR}${LIBDIR_BASE}/libprivateatf-c++.a LDADD_atf_c= -lprivateatf-c LDADD_atf_cxx= -lprivateatf-c++ LIBGMOCK= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategmock.a LIBGMOCK_MAIN= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategmock_main.a LIBGTEST= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategtest.a LIBGTEST_MAIN= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategtest_main.a LDADD_gmock= -lprivategmock LDADD_gtest= -lprivategtest LDADD_gmock_main= -lprivategmock_main LDADD_gtest_main= -lprivategtest_main .for _l in ${_PRIVATELIBS} LIB${_l:tu}?= ${LIBDESTDIR}${LIBDIR_BASE}/libprivate${_l}.a .endfor .if ${MK_PIE} != "no" PIE_SUFFIX= _pie .endif .for _l in ${_LIBRARIES} .if ${_INTERNALLIBS:M${_l}} || !defined(SYSROOT) LDADD_${_l}_L+= -L${LIB${_l:tu}DIR} .endif DPADD_${_l}?= ${LIB${_l:tu}} .if ${_PRIVATELIBS:M${_l}} LDADD_${_l}?= -lprivate${_l} .elif ${_INTERNALLIBS:M${_l}} LDADD_${_l}?= ${LDADD_${_l}_L} -l${_l:S/${PIE_SUFFIX}//}${PIE_SUFFIX} .else LDADD_${_l}?= ${LDADD_${_l}_L} -l${_l} .endif # Add in all dependencies for static linkage. # Bootstrapping from non-FreeBSD needs special handling, since it overrides # NO_SHARED back to yes despite only building static versions of bootstrap # libraries (see tools/build/mk/Makefile.boot.pre). .if defined(_DP_${_l}) && (${_INTERNALLIBS:M${_l}} || \ (defined(NO_SHARED) && ${NO_SHARED:tl} != "no") || \ (defined(BOOTSTRAPPING) && ${.MAKE.OS} != "FreeBSD")) .for _d in ${_DP_${_l}} DPADD_${_l}+= ${DPADD_${_d}} LDADD_${_l}+= ${LDADD_${_d}} .endfor .endif .endfor # These are special cases where the library is broken and anything that uses # it needs to add more dependencies. Broken usually means that it has a # cyclic dependency and cannot link its own dependencies. This is bad, please # fix the library instead. # Unless the library itself is broken then the proper place to define # dependencies is _DP_* above. # libatf-c++ exposes libatf-c abi hence we need to explicit link to atf_c for # atf_cxx DPADD_atf_cxx+= ${DPADD_atf_c} LDADD_atf_cxx+= ${LDADD_atf_c} DPADD_gmock+= ${DPADD_gtest} LDADD_gmock+= ${LDADD_gtest} DPADD_gmock_main+= ${DPADD_gmock} LDADD_gmock_main+= ${LDADD_gmock} DPADD_gtest_main+= ${DPADD_gtest} LDADD_gtest_main+= ${LDADD_gtest} # Detect LDADD/DPADD that should be LIBADD, before modifying LDADD here. 
_BADLDADD= .for _l in ${LDADD:M-l*:N-l*/*:C,^-l,,} .if ${_LIBRARIES:M${_l}} && !${_PRIVATELIBS:M${_l}} _BADLDADD+= ${_l} .endif .endfor .if !empty(_BADLDADD) .error ${.CURDIR}: These libraries should be LIBADD+=foo rather than DPADD/LDADD+=-lfoo: ${_BADLDADD} .endif .for _l in ${LIBADD} DPADD+= ${DPADD_${_l}} LDADD+= ${LDADD_${_l}} .endfor _LIB_OBJTOP?= ${OBJTOP} # INTERNALLIB definitions. LIBDIFFDIR= ${_LIB_OBJTOP}/lib/libdiff LIBDIFF?= ${LIBDIFFDIR}/libdiff${PIE_SUFFIX}.a LIBELFTCDIR= ${_LIB_OBJTOP}/lib/libelftc LIBELFTC?= ${LIBELFTCDIR}/libelftc${PIE_SUFFIX}.a LIBFDTDIR= ${_LIB_OBJTOP}/lib/libfdt LIBFDT?= ${LIBFDTDIR}/libfdt${PIE_SUFFIX}.a LIBLUADIR= ${_LIB_OBJTOP}/lib/liblua LIBLUA?= ${LIBLUADIR}/liblua${PIE_SUFFIX}.a LIBLUTOKDIR= ${_LIB_OBJTOP}/lib/liblutok LIBLUTOK?= ${LIBLUTOKDIR}/liblutok${PIE_SUFFIX}.a LIBPEDIR= ${_LIB_OBJTOP}/lib/libpe LIBPE?= ${LIBPEDIR}/libpe${PIE_SUFFIX}.a LIBOPENBSDDIR= ${_LIB_OBJTOP}/lib/libopenbsd LIBOPENBSD?= ${LIBOPENBSDDIR}/libopenbsd${PIE_SUFFIX}.a LIBSMDIR= ${_LIB_OBJTOP}/lib/libsm LIBSM?= ${LIBSMDIR}/libsm${PIE_SUFFIX}.a LIBSMDBDIR= ${_LIB_OBJTOP}/lib/libsmdb LIBSMDB?= ${LIBSMDBDIR}/libsmdb${PIE_SUFFIX}.a LIBSMUTILDIR= ${_LIB_OBJTOP}/lib/libsmutil LIBSMUTIL?= ${LIBSMUTILDIR}/libsmutil${PIE_SUFFIX}.a LIBSYSDIR= ${_LIB_OBJTOP}/lib/libsys LIBSYS?= ${LIBSYSDIR}/libsys${PIE_SUFFIX}.a LIBNETBSDDIR?= ${_LIB_OBJTOP}/lib/libnetbsd LIBNETBSD?= ${LIBNETBSDDIR}/libnetbsd${PIE_SUFFIX}.a LIBVERSDIR?= ${_LIB_OBJTOP}/kerberos5/lib/libvers LIBVERS?= ${LIBVERSDIR}/libvers${PIE_SUFFIX}.a LIBSLDIR= ${_LIB_OBJTOP}/kerberos5/lib/libsl LIBSL?= ${LIBSLDIR}/libsl${PIE_SUFFIX}.a LIBIFCONFIGDIR= ${_LIB_OBJTOP}/lib/libifconfig LIBIFCONFIG?= ${LIBIFCONFIGDIR}/libifconfig${PIE_SUFFIX}.a LIBIPFDIR= ${_LIB_OBJTOP}/sbin/ipf/libipf LIBIPF?= ${LIBIPFDIR}/libipf${PIE_SUFFIX}.a LIBNVDIR= ${_LIB_OBJTOP}/lib/libnv LIBNV?= ${LIBNVDIR}/libnv${PIE_SUFFIX}.a LIBISCSIUTILDIR= ${_LIB_OBJTOP}/lib/libiscsiutil LIBISCSIUTIL?= ${LIBISCSIUTILDIR}/libiscsiutil${PIE_SUFFIX}.a LIBNVMFDIR= ${_LIB_OBJTOP}/lib/libnvmf LIBNVMF?= ${LIBNVMFDIR}/libnvmf${PIE_SUFFIX}.a LIBTELNETDIR= ${_LIB_OBJTOP}/lib/libtelnet LIBTELNET?= ${LIBTELNETDIR}/libtelnet${PIE_SUFFIX}.a LIBCRONDIR= ${_LIB_OBJTOP}/usr.sbin/cron/lib LIBCRON?= ${LIBCRONDIR}/libcron${PIE_SUFFIX}.a LIBNTPDIR= ${_LIB_OBJTOP}/usr.sbin/ntp/libntp LIBNTP?= ${LIBNTPDIR}/libntp${PIE_SUFFIX}.a LIBNTPEVENTDIR= ${_LIB_OBJTOP}/usr.sbin/ntp/libntpevent LIBNTPEVENT?= ${LIBNTPEVENTDIR}/libntpevent${PIE_SUFFIX}.a LIBOPTSDIR= ${_LIB_OBJTOP}/usr.sbin/ntp/libopts LIBOPTS?= ${LIBOPTSDIR}/libopts${PIE_SUFFIX}.a LIBPARSEDIR= ${_LIB_OBJTOP}/usr.sbin/ntp/libparse LIBPARSE?= ${LIBPARSEDIR}/libparse${PIE_SUFFIX}.a LIBPFCTLDIR= ${_LIB_OBJTOP}/lib/libpfctl LIBPFCTL?= ${LIBPFCTLDIR}/libpfctl${PIE_SUFFIX}.a LIBLPRDIR= ${_LIB_OBJTOP}/usr.sbin/lpr/common_source LIBLPR?= ${LIBLPRDIR}/liblpr${PIE_SUFFIX}.a LIBFIFOLOGDIR= ${_LIB_OBJTOP}/usr.sbin/fifolog/lib LIBFIFOLOG?= ${LIBFIFOLOGDIR}/libfifolog${PIE_SUFFIX}.a LIBBSNMPTOOLSDIR= ${_LIB_OBJTOP}/usr.sbin/bsnmpd/tools/libbsnmptools LIBBSNMPTOOLS?= ${LIBBSNMPTOOLSDIR}/libbsnmptools${PIE_SUFFIX}.a LIBBE?= ${LIBBEDIR}/libbe${PIE_SUFFIX}.a LIBPMCSTATDIR= ${_LIB_OBJTOP}/lib/libpmcstat LIBPMCSTAT?= ${LIBPMCSTATDIR}/libpmcstat${PIE_SUFFIX}.a LIBWPAAPDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/ap LIBWPAAP?= ${LIBWPAAPDIR}/libwpaap${PIE_SUFFIX}.a LIBWPACOMMONDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/common LIBWPACOMMON?= ${LIBWPACOMMONDIR}/libwpacommon${PIE_SUFFIX}.a LIBWPACRYPTODIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/crypto LIBWPACRYPTO?= 
${LIBWPACRYPTODIR}/libwpacrypto${PIE_SUFFIX}.a LIBWPADRIVERSDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/drivers LIBWPADRIVERS?= ${LIBWPADRIVERSDIR}/libwpadrivers${PIE_SUFFIX}.a LIBWPAEAP_COMMONDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/eap_common LIBWPAEAP_COMMON?= ${LIBWPAEAP_COMMONDIR}/libwpaeap_common${PIE_SUFFIX}.a LIBWPAEAP_PEERDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/eap_peer LIBWPAEAP_PEER?= ${LIBWPAEAP_PEERDIR}/libwpaeap_peer${PIE_SUFFIX}.a LIBWPAEAP_SERVERDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/eap_server LIBWPAEAP_SERVER?= ${LIBWPAEAP_SERVERDIR}/libwpaeap_server${PIE_SUFFIX}.a LIBWPAEAPOL_AUTHDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/eapol_auth LIBWPAEAPOL_AUTH?= ${LIBWPAEAPOL_AUTHDIR}/libwpaeapol_auth${PIE_SUFFIX}.a LIBWPAEAPOL_SUPPDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/eapol_supp LIBWPAEAPOL_SUPP?= ${LIBWPAEAPOL_SUPPDIR}/libwpaeapol_supp${PIE_SUFFIX}.a LIBWPAL2_PACKETDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/l2_packet LIBWPAL2_PACKET?= ${LIBWPAL2_PACKETDIR}/libwpal2_packet${PIE_SUFFIX}.a LIBWPAPASNDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/pasn LIBWPAPASN?= ${LIBWPAPASNDIR}/libwpapasn${PIE_SUFFIX}.a LIBWPARADIUSDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/radius LIBWPARADIUS?= ${LIBWPARADIUSDIR}/libwparadius${PIE_SUFFIX}.a LIBWPARSN_SUPPDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/rsn_supp LIBWPARSN_SUPP?= ${LIBWPARSN_SUPPDIR}/libwparsn_supp${PIE_SUFFIX}.a LIBWPATLSDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/tls LIBWPATLS?= ${LIBWPATLSDIR}/libwpatls${PIE_SUFFIX}.a LIBWPAUTILSDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/utils LIBWPAUTILS?= ${LIBWPAUTILSDIR}/libwpautils${PIE_SUFFIX}.a LIBWPAWPSDIR= ${_LIB_OBJTOP}/usr.sbin/wpa/src/wps LIBWPAWPS?= ${LIBWPAWPSDIR}/libwpawps${PIE_SUFFIX}.a LIBC_NOSSP_PICDIR= ${_LIB_OBJTOP}/lib/libc LIBC_NOSSP_PIC?= ${LIBC_NOSSP_PICDIR}/libc_nossp_pic.a # Define a directory for each library. This is useful for adding -L in when # not using a --sysroot or for meta mode bootstrapping when there is no # Makefile.depend. These are sorted by directory. 
LIBAVLDIR= ${_LIB_OBJTOP}/cddl/lib/libavl LIBCTFDIR= ${_LIB_OBJTOP}/cddl/lib/libctf LIBDTRACEDIR= ${_LIB_OBJTOP}/cddl/lib/libdtrace LIBICPDIR= ${_LIB_OBJTOP}/cddl/lib/libicp LIBICP?= ${LIBICPDIR}/libicp${PIE_SUFFIX}.a LIBICP_RESCUEDIR= ${_LIB_OBJTOP}/cddl/lib/libicp_rescue LIBICP_RESCUE?= ${LIBICP_RESCUEDIR}/libicp_rescue${PIE_SUFFIX}.a LIBNVPAIRDIR= ${_LIB_OBJTOP}/cddl/lib/libnvpair LIBNVPAIR?= ${LIBNVPAIRDIR}/libnvpair${PIE_SUFFIX}.a LIBUMEMDIR= ${_LIB_OBJTOP}/cddl/lib/libumem LIBUUTILDIR= ${_LIB_OBJTOP}/cddl/lib/libuutil LIBZDBDIR= ${_LIB_OBJTOP}/cddl/lib/libzdb LIBZDB?= ${LIBZDBDIR}/libzdb${PIE_SUFFIX}.a LIBZFSDIR= ${_LIB_OBJTOP}/cddl/lib/libzfs LIBZFS?= ${LIBZFSDIR}/libzfs${PIE_SUFFIX}.a LIBZFS_COREDIR= ${_LIB_OBJTOP}/cddl/lib/libzfs_core LIBZFS_CORE?= ${LIBZFS_COREDIR}/libzfs_core${PIE_SUFFIX}.a LIBZFSBOOTENVDIR= ${_LIB_OBJTOP}/cddl/lib/libzfsbootenv LIBZFSBOOTENV?= ${LIBZFSBOOTENVDIR}/libzfsbootenv${PIE_SUFFIX}.a LIBZPOOLDIR= ${_LIB_OBJTOP}/cddl/lib/libzpool LIBZPOOL?= ${LIBZPOOLDIR}/libzpool${PIE_SUFFIX}.a LIBZUTILDIR= ${_LIB_OBJTOP}/cddl/lib/libzutil LIBZUTIL?= ${LIBZUTILDIR}/libzutil${PIE_SUFFIX}.a LIBTPOOLDIR= ${_LIB_OBJTOP}/cddl/lib/libtpool # OFED support LIBCXGB4DIR= ${_LIB_OBJTOP}/lib/ofed/libcxgb4 LIBIBCMDIR= ${_LIB_OBJTOP}/lib/ofed/libibcm LIBIBMADDIR= ${_LIB_OBJTOP}/lib/ofed/libibmad LIBIBNETDISCDIR=${_LIB_OBJTOP}/lib/ofed/libibnetdisc LIBIBUMADDIR= ${_LIB_OBJTOP}/lib/ofed/libibumad LIBIBVERBSDIR= ${_LIB_OBJTOP}/lib/ofed/libibverbs LIBIRDMADIR= ${_LIB_OBJTOP}/lib/ofed/libirdma LIBMLX4DIR= ${_LIB_OBJTOP}/lib/ofed/libmlx4 LIBMLX5DIR= ${_LIB_OBJTOP}/lib/ofed/libmlx5 LIBRDMACMDIR= ${_LIB_OBJTOP}/lib/ofed/librdmacm LIBOSMCOMPDIR= ${_LIB_OBJTOP}/lib/ofed/complib LIBOPENSMDIR= ${_LIB_OBJTOP}/lib/ofed/libopensm LIBOSMVENDORDIR=${_LIB_OBJTOP}/lib/ofed/libvendor LIBDIALOGDIR= ${_LIB_OBJTOP}/gnu/lib/libdialog LIBSSPDIR= ${_LIB_OBJTOP}/lib/libssp LIBSSP_NONSHAREDDIR= ${_LIB_OBJTOP}/lib/libssp_nonshared LIBASN1DIR= ${_LIB_OBJTOP}/kerberos5/lib/libasn1 LIBGSSAPI_KRB5DIR= ${_LIB_OBJTOP}/kerberos5/lib/libgssapi_krb5 LIBGSSAPI_NTLMDIR= ${_LIB_OBJTOP}/kerberos5/lib/libgssapi_ntlm LIBGSSAPI_SPNEGODIR= ${_LIB_OBJTOP}/kerberos5/lib/libgssapi_spnego LIBHDBDIR= ${_LIB_OBJTOP}/kerberos5/lib/libhdb LIBHEIMBASEDIR= ${_LIB_OBJTOP}/kerberos5/lib/libheimbase LIBHEIMIPCCDIR= ${_LIB_OBJTOP}/kerberos5/lib/libheimipcc LIBHEIMIPCSDIR= ${_LIB_OBJTOP}/kerberos5/lib/libheimipcs LIBHEIMNTLMDIR= ${_LIB_OBJTOP}/kerberos5/lib/libheimntlm LIBHX509DIR= ${_LIB_OBJTOP}/kerberos5/lib/libhx509 LIBKADM5CLNTDIR= ${_LIB_OBJTOP}/kerberos5/lib/libkadm5clnt LIBKADM5SRVDIR= ${_LIB_OBJTOP}/kerberos5/lib/libkadm5srv LIBKAFS5DIR= ${_LIB_OBJTOP}/kerberos5/lib/libkafs5 LIBKDCDIR= ${_LIB_OBJTOP}/kerberos5/lib/libkdc LIBKRB5DIR= ${_LIB_OBJTOP}/kerberos5/lib/libkrb5 LIBROKENDIR= ${_LIB_OBJTOP}/kerberos5/lib/libroken LIBWINDDIR= ${_LIB_OBJTOP}/kerberos5/lib/libwind LIBATF_CDIR= ${_LIB_OBJTOP}/lib/atf/libatf-c LIBATF_CXXDIR= ${_LIB_OBJTOP}/lib/atf/libatf-c++ LIBGMOCKDIR= ${_LIB_OBJTOP}/lib/googletest/gmock LIBGMOCK_MAINDIR= ${_LIB_OBJTOP}/lib/googletest/gmock_main LIBGTESTDIR= ${_LIB_OBJTOP}/lib/googletest/gtest LIBGTEST_MAINDIR= ${_LIB_OBJTOP}/lib/googletest/gtest_main LIBALIASDIR= ${_LIB_OBJTOP}/lib/libalias/libalias LIBBLACKLISTDIR= ${_LIB_OBJTOP}/lib/libblacklist LIBBLOCKSRUNTIMEDIR= ${_LIB_OBJTOP}/lib/libblocksruntime LIBBSNMPDIR= ${_LIB_OBJTOP}/lib/libbsnmp/libbsnmp LIBCASPERDIR= ${_LIB_OBJTOP}/lib/libcasper/libcasper LIBCAP_DNSDIR= ${_LIB_OBJTOP}/lib/libcasper/services/cap_dns LIBCAP_GRPDIR= 
${_LIB_OBJTOP}/lib/libcasper/services/cap_grp LIBCAP_NETDIR= ${_LIB_OBJTOP}/lib/libcasper/services/cap_net LIBCAP_PWDDIR= ${_LIB_OBJTOP}/lib/libcasper/services/cap_pwd LIBCAP_SYSCTLDIR= ${_LIB_OBJTOP}/lib/libcasper/services/cap_sysctl LIBCAP_SYSLOGDIR= ${_LIB_OBJTOP}/lib/libcasper/services/cap_syslog LIBCBORDIR= ${_LIB_OBJTOP}/lib/libcbor LIBBSDXMLDIR= ${_LIB_OBJTOP}/lib/libexpat LIBFIDO2DIR= ${_LIB_OBJTOP}/lib/libfido2 LIBKVMDIR= ${_LIB_OBJTOP}/lib/libkvm LIBPTHREADDIR= ${_LIB_OBJTOP}/lib/libthr LIBMDIR= ${_LIB_OBJTOP}/lib/msun LIBFORMWDIR= ${_LIB_OBJTOP}/lib/ncurses/form LIBMENUWDIR= ${_LIB_OBJTOP}/lib/ncurses/menu LIBNCURSESWDIR= ${_LIB_OBJTOP}/lib/ncurses/ncurses LIBTINFOWDIR= ${_LIB_OBJTOP}/lib/ncurses/tinfo LIBPANELWDIR= ${_LIB_OBJTOP}/lib/ncurses/panel LIBCRYPTODIR= ${_LIB_OBJTOP}/secure/lib/libcrypto LIBSPLDIR= ${_LIB_OBJTOP}/cddl/lib/libspl LIBSSHDIR= ${_LIB_OBJTOP}/secure/lib/libssh LIBSSLDIR= ${_LIB_OBJTOP}/secure/lib/libssl LIBTEKENDIR= ${_LIB_OBJTOP}/sys/teken/libteken LIBEGACYDIR= ${_LIB_OBJTOP}/tools/build LIBLNDIR= ${_LIB_OBJTOP}/usr.bin/lex/lib LIBTERMCAPWDIR= ${LIBTINFOWDIR} .-include # Default other library directories to lib/libNAME. .for lib in ${_LIBRARIES} LIB${lib:tu}DIR?= ${OBJTOP}/lib/lib${lib} .endfor # Validate that listed LIBADD are valid. .for _l in ${LIBADD} .if empty(_LIBRARIES:M${_l}) _BADLIBADD+= ${_l} .endif .endfor .if !empty(_BADLIBADD) .error ${.CURDIR}: Invalid LIBADD used which may need to be added to ${_this:T}: ${_BADLIBADD} .endif # Sanity check that libraries are defined here properly when building them. .if defined(LIB) && ${_LIBRARIES:M${LIB}} != "" .if !empty(LIBADD) && \ (!defined(_DP_${LIB}) || ${LIBADD:O:u} != ${_DP_${LIB}:O:u}) .error ${.CURDIR}: Missing or incorrect _DP_${LIB} entry in ${_this:T}. Should match LIBADD for ${LIB} ('${LIBADD}' vs '${_DP_${LIB}}') .endif # Note that OBJTOP is not yet defined here but for the purpose of the check # it is fine as it resolves to the SRC directory. .if !defined(LIB${LIB:tu}DIR) || !exists(${SRCTOP}/${LIB${LIB:tu}DIR:S,^${OBJTOP}/,,}) .error ${.CURDIR}: Missing or incorrect value for LIB${LIB:tu}DIR in ${_this:T}: ${LIB${LIB:tu}DIR:S,^${OBJTOP}/,,} .endif .if ${_INTERNALLIBS:M${LIB}} != "" && !defined(LIB${LIB:tu}) .error ${.CURDIR}: Missing value for LIB${LIB:tu} in ${_this:T}. Likely should be: LIB${LIB:tu}?= $${LIB${LIB:tu}DIR}/lib${LIB}.a .endif .endif .endif # !target(____) diff --git a/sys/cam/ctl/ctl_ioctl.h b/sys/cam/ctl/ctl_ioctl.h index 326e4c931f93..c7070b63eb09 100644 --- a/sys/cam/ctl/ctl_ioctl.h +++ b/sys/cam/ctl/ctl_ioctl.h @@ -1,848 +1,848 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2003 Silicon Graphics International Corp. * Copyright (c) 2011 Spectra Logic Corporation * Copyright (c) 2014-2017 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. 
* * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_ioctl.h#4 $ */ /* * CAM Target Layer ioctl interface. * * Author: Ken Merry */ #ifndef _CTL_IOCTL_H_ #define _CTL_IOCTL_H_ #ifdef ICL_KERNEL_PROXY #include #endif #include #include #include #include #define CTL_DEFAULT_DEV "/dev/cam/ctl" /* * Maximum number of targets we support. */ #define CTL_MAX_TARGETS 1 /* * Maximum target ID we support. */ #define CTL_MAX_TARGID 15 /* * Maximum number of initiators per port. */ #define CTL_MAX_INIT_PER_PORT 2048 /* Hopefully this won't conflict with new misc devices that pop up */ #define CTL_MINOR 225 typedef enum { CTL_DELAY_TYPE_NONE, CTL_DELAY_TYPE_CONT, CTL_DELAY_TYPE_ONESHOT } ctl_delay_type; typedef enum { CTL_DELAY_LOC_NONE, CTL_DELAY_LOC_DATAMOVE, CTL_DELAY_LOC_DONE, } ctl_delay_location; typedef enum { CTL_DELAY_STATUS_NONE, CTL_DELAY_STATUS_OK, CTL_DELAY_STATUS_INVALID_LUN, CTL_DELAY_STATUS_INVALID_TYPE, CTL_DELAY_STATUS_INVALID_LOC, CTL_DELAY_STATUS_NOT_IMPLEMENTED } ctl_delay_status; struct ctl_io_delay_info { uint32_t lun_id; ctl_delay_type delay_type; ctl_delay_location delay_loc; uint32_t delay_secs; ctl_delay_status status; }; typedef enum { CTL_STATS_NO_IO, CTL_STATS_READ, CTL_STATS_WRITE } ctl_stat_types; #define CTL_STATS_NUM_TYPES 3 typedef enum { CTL_SS_OK, CTL_SS_NEED_MORE_SPACE, CTL_SS_ERROR } ctl_stats_status; typedef enum { CTL_STATS_FLAG_NONE = 0x00, CTL_STATS_FLAG_TIME_VALID = 0x01 } ctl_stats_flags; struct ctl_io_stats { uint32_t item; uint64_t bytes[CTL_STATS_NUM_TYPES]; uint64_t operations[CTL_STATS_NUM_TYPES]; uint64_t dmas[CTL_STATS_NUM_TYPES]; struct bintime time[CTL_STATS_NUM_TYPES]; struct bintime dma_time[CTL_STATS_NUM_TYPES]; }; struct ctl_get_io_stats { struct ctl_io_stats *stats; /* passed to/from kernel */ size_t alloc_len; /* passed to kernel */ size_t fill_len; /* passed to userland */ int first_item; /* passed to kernel */ int num_items; /* passed to userland */ ctl_stats_status status; /* passed to userland */ ctl_stats_flags flags; /* passed to userland */ struct timespec timestamp; /* passed to userland */ }; /* * The types of errors that can be injected: * * NONE: No error specified. * ABORTED: SSD_KEY_ABORTED_COMMAND, 0x45, 0x00 * MEDIUM_ERR: Medium error, different asc/ascq depending on read/write. * UA: Unit attention. * CUSTOM: User specifies the sense data. * TYPE: Mask to use with error types. * * Flags that affect injection behavior: * CONTINUOUS: This error will stay around until explicitly cleared. * DESCRIPTOR: Use descriptor sense instead of fixed sense. 
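The delay-injection interface above (struct ctl_io_delay_info together with the CTL_DELAY_IO ioctl defined later in this header) is small enough to drive directly from userland. A minimal, untested sketch follows; the include list approximates what ctladm(8) pulls in and may need adjusting, and a kernel built without delay support answers with CTL_DELAY_STATUS_NOT_IMPLEMENTED rather than failing the ioctl.

/*
 * Sketch: request a one-shot five second delay in the "done" phase for
 * LUN 0.  CTL_DELAY_IO is _IOWR, so the kernel reports per-request
 * problems back through the status field.
 */
#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/nv.h>
#include <sys/time.h>

#include <cam/cam.h>
#include <cam/scsi/scsi_all.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl_ioctl.h>

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ctl_io_delay_info delay;
	int fd;

	fd = open(CTL_DEFAULT_DEV, O_RDWR);
	if (fd == -1)
		err(1, "open(%s)", CTL_DEFAULT_DEV);

	memset(&delay, 0, sizeof(delay));
	delay.lun_id = 0;
	delay.delay_type = CTL_DELAY_TYPE_ONESHOT;
	delay.delay_loc = CTL_DELAY_LOC_DONE;
	delay.delay_secs = 5;

	if (ioctl(fd, CTL_DELAY_IO, &delay) == -1)
		err(1, "CTL_DELAY_IO");
	if (delay.status != CTL_DELAY_STATUS_OK)
		warnx("delay not armed, status %d", delay.status);

	close(fd);
	return (0);
}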
*/ typedef enum { CTL_LUN_INJ_NONE = 0x000, CTL_LUN_INJ_ABORTED = 0x001, CTL_LUN_INJ_MEDIUM_ERR = 0x002, CTL_LUN_INJ_UA = 0x003, CTL_LUN_INJ_CUSTOM = 0x004, CTL_LUN_INJ_TYPE = 0x0ff, CTL_LUN_INJ_CONTINUOUS = 0x100, CTL_LUN_INJ_DESCRIPTOR = 0x200 } ctl_lun_error; /* * Flags to specify what type of command the given error pattern will * execute on. The first group of types can be ORed together. * * READ: Any read command. * WRITE: Any write command. * READWRITE: Any read or write command. * READCAP: Any read capacity command. * TUR: Test Unit Ready. * ANY: Any command. * MASK: Mask for basic command patterns. * * Special types: * * CMD: The CDB to act on is specified in struct ctl_error_desc_cmd. * RANGE: For read/write commands, act when the LBA is in the * specified range. */ typedef enum { CTL_LUN_PAT_NONE = 0x000, CTL_LUN_PAT_READ = 0x001, CTL_LUN_PAT_WRITE = 0x002, CTL_LUN_PAT_READWRITE = CTL_LUN_PAT_READ | CTL_LUN_PAT_WRITE, CTL_LUN_PAT_READCAP = 0x004, CTL_LUN_PAT_TUR = 0x008, CTL_LUN_PAT_ANY = 0x0ff, CTL_LUN_PAT_MASK = 0x0ff, CTL_LUN_PAT_CMD = 0x100, CTL_LUN_PAT_RANGE = 0x200 } ctl_lun_error_pattern; /* * This structure allows the user to specify a particular CDB pattern to * look for. * * cdb_pattern: Fill in the relevant bytes to look for in the CDB. * cdb_valid_bytes: Bitmask specifying valid bytes in the cdb_pattern. * flags: Specify any command flags (see ctl_io_flags) that * should be set. */ struct ctl_error_desc_cmd { uint8_t cdb_pattern[CTL_MAX_CDBLEN]; uint32_t cdb_valid_bytes; uint32_t flags; }; /* * Error injection descriptor. * * lun_id LUN to act on. * lun_error: The type of error to inject. See above for descriptions. * error_pattern: What kind of command to act on. See above. * cmd_desc: For CTL_LUN_PAT_CMD only. * lba_range: For CTL_LUN_PAT_RANGE only. * custom_sense: Specify sense. For CTL_LUN_INJ_CUSTOM only. * serial: Serial number returned by the kernel. Use for deletion. * links: Kernel use only. 
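The error-injection descriptor documented above (struct ctl_error_desc, defined just below) pairs with the CTL_ERROR_INJECT and CTL_ERROR_INJECT_DELETE ioctls near the end of this header. A hedged sketch, reusing the include set from the CTL_DELAY_IO example: it arms a persistent medium error against reads on LUN 0, then removes it using the kernel-assigned serial number.

/* Uses the same includes as the CTL_DELAY_IO sketch above. */
static void
inject_read_medium_error(int fd)
{
	struct ctl_error_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.lun_id = 0;
	/* Persistent medium error ... */
	desc.lun_error = CTL_LUN_INJ_MEDIUM_ERR | CTL_LUN_INJ_CONTINUOUS;
	/* ... triggered by any read command. */
	desc.error_pattern = CTL_LUN_PAT_READ;

	if (ioctl(fd, CTL_ERROR_INJECT, &desc) == -1)
		err(1, "CTL_ERROR_INJECT");

	/* The kernel filled in desc.serial; that is the handle for removal. */
	printf("armed error injection, serial %llu\n",
	    (unsigned long long)desc.serial);

	if (ioctl(fd, CTL_ERROR_INJECT_DELETE, &desc) == -1)
		err(1, "CTL_ERROR_INJECT_DELETE");
}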
*/ struct ctl_error_desc { uint32_t lun_id; /* To kernel */ ctl_lun_error lun_error; /* To kernel */ ctl_lun_error_pattern error_pattern; /* To kernel */ struct ctl_error_desc_cmd cmd_desc; /* To kernel */ struct ctl_lba_len lba_range; /* To kernel */ struct scsi_sense_data custom_sense; /* To kernel */ uint64_t serial; /* From kernel */ STAILQ_ENTRY(ctl_error_desc) links; /* Kernel use only */ }; typedef enum { CTL_OOA_FLAG_NONE = 0x00, CTL_OOA_FLAG_ALL_LUNS = 0x01 } ctl_ooa_flags; typedef enum { CTL_OOA_OK, CTL_OOA_NEED_MORE_SPACE, CTL_OOA_ERROR } ctl_get_ooa_status; typedef enum { CTL_OOACMD_FLAG_NONE = 0x00, CTL_OOACMD_FLAG_DMA = 0x01, CTL_OOACMD_FLAG_BLOCKED = 0x02, CTL_OOACMD_FLAG_ABORT = 0x04, CTL_OOACMD_FLAG_RTR = 0x08, CTL_OOACMD_FLAG_DMA_QUEUED = 0x10, CTL_OOACMD_FLAG_STATUS_QUEUED = 0x20, CTL_OOACMD_FLAG_STATUS_SENT = 0x40 } ctl_ooa_cmd_flags; struct ctl_ooa_entry { ctl_ooa_cmd_flags cmd_flags; uint8_t cdb[CTL_MAX_CDBLEN]; uint8_t cdb_len; uint64_t tag_num; ctl_tag_type tag_type; uint32_t lun_num; struct bintime start_bt; }; struct ctl_ooa { ctl_ooa_flags flags; /* passed to kernel */ uint64_t lun_num; /* passed to kernel */ uint32_t alloc_len; /* passed to kernel */ uint32_t alloc_num; /* passed to kernel */ struct ctl_ooa_entry *entries; /* filled in kernel */ uint32_t fill_len; /* passed to userland */ uint32_t fill_num; /* passed to userland */ uint32_t dropped_num; /* passed to userland */ struct bintime cur_bt; /* passed to userland */ ctl_get_ooa_status status; /* passed to userland */ }; typedef enum { CTL_LUN_NOSTATUS, CTL_LUN_OK, CTL_LUN_ERROR, CTL_LUN_WARNING } ctl_lun_status; #define CTL_ERROR_STR_LEN 160 typedef enum { CTL_LUNREQ_CREATE, CTL_LUNREQ_RM, CTL_LUNREQ_MODIFY, } ctl_lunreq_type; /* * The ID_REQ flag is used to say that the caller has requested a * particular LUN ID in the req_lun_id field. If we cannot allocate that * LUN ID, the ctl_add_lun() call will fail. * * The STOPPED flag tells us that the LUN should default to the powered * off state. It will return 0x04,0x02 until it is powered up. ("Logical * unit not ready, initializing command required.") * * The NO_MEDIA flag tells us that the LUN has no media inserted. * * The PRIMARY flag tells us that this LUN is registered as a Primary LUN * which is accessible via the Master shelf controller in an HA. This flag * being set indicates a Primary LUN. This flag being reset represents a * Secondary LUN controlled by the Secondary controller in an HA * configuration. Flag is applicable at this time to T_DIRECT types. * * The SERIAL_NUM flag tells us that the serial_num field is filled in and * valid for use in SCSI INQUIRY VPD page 0x80. * * The DEVID flag tells us that the device_id field is filled in and * valid for use in SCSI INQUIRY VPD page 0x83. * * The DEV_TYPE flag tells us that the device_type field is filled in. * * The EJECTED flag tells us that the removable LUN has tray open. * * The UNMAP flag tells us that this LUN supports UNMAP. * * The OFFLINE flag tells us that this LUN can not access backing store. */ typedef enum { CTL_LUN_FLAG_ID_REQ = 0x01, CTL_LUN_FLAG_STOPPED = 0x02, CTL_LUN_FLAG_NO_MEDIA = 0x04, CTL_LUN_FLAG_PRIMARY = 0x08, CTL_LUN_FLAG_SERIAL_NUM = 0x10, CTL_LUN_FLAG_DEVID = 0x20, CTL_LUN_FLAG_DEV_TYPE = 0x40, CTL_LUN_FLAG_UNMAP = 0x80, CTL_LUN_FLAG_EJECTED = 0x100, CTL_LUN_FLAG_READONLY = 0x200 } ctl_backend_lun_flags; /* * LUN creation parameters: * * flags: Various LUN flags, see above. * * device_type: The SCSI device type. e.g. 0 for Direct Access, * 3 for Processor, etc. 
Only certain backends may * support setting this field. The CTL_LUN_FLAG_DEV_TYPE * flag should be set in the flags field if the device * type is set. * * lun_size_bytes: The size of the LUN in bytes. For some backends * this is relevant (e.g. ramdisk), for others, it may * be ignored in favor of using the properties of the * backing store. If specified, this should be a * multiple of the blocksize. * * The actual size of the LUN is returned in this * field. * * blocksize_bytes: The LUN blocksize in bytes. For some backends this * is relevant, for others it may be ignored in * favor of using the properties of the backing store. * * The actual blocksize of the LUN is returned in this * field. * * req_lun_id: The requested LUN ID. The CTL_LUN_FLAG_ID_REQ flag * should be set if this is set. The request will be * granted if the LUN number is available, otherwise * the LUN addition request will fail. * * The allocated LUN number is returned in this field. * * serial_num: This is the value returned in SCSI INQUIRY VPD page * 0x80. If it is specified, the CTL_LUN_FLAG_SERIAL_NUM * flag should be set. * * The serial number value used is returned in this * field. * * device_id: This is the value returned in the T10 vendor ID * based DESIGNATOR field in the SCSI INQUIRY VPD page * 0x83 data. If it is specified, the CTL_LUN_FLAG_DEVID * flag should be set. * * The device id value used is returned in this field. */ struct ctl_lun_create_params { ctl_backend_lun_flags flags; uint8_t device_type; uint64_t lun_size_bytes; uint32_t blocksize_bytes; uint32_t req_lun_id; uint8_t serial_num[CTL_SN_LEN]; uint8_t device_id[CTL_DEVID_LEN]; }; /* * LUN removal parameters: * * lun_id: The number of the LUN to delete. This must be set. * The LUN must be backed by the given backend. */ struct ctl_lun_rm_params { uint32_t lun_id; }; /* * LUN modification parameters: * * lun_id: The number of the LUN to modify. This must be set. * The LUN must be backed by the given backend. * * lun_size_bytes: The size of the LUN in bytes. If zero, update * the size using the backing file size, if possible. */ struct ctl_lun_modify_params { uint32_t lun_id; uint64_t lun_size_bytes; }; /* * Union of request type data. Fill in the appropriate union member for * the request type. */ union ctl_lunreq_data { struct ctl_lun_create_params create; struct ctl_lun_rm_params rm; struct ctl_lun_modify_params modify; }; /* * LUN request interface: * * backend: This is required, and is NUL-terminated a string * that is the name of the backend, like "ramdisk" or * "block". * * reqtype: The type of request, CTL_LUNREQ_CREATE to create a * LUN, CTL_LUNREQ_RM to delete a LUN. * * reqdata: Request type-specific information. See the * description of individual the union members above * for more information. * * num_be_args: This is the number of backend-specific arguments * in the be_args array. * * be_args: This is an array of backend-specific arguments. * See above for a description of the fields in this * structure. * * status: Status of the LUN request. * * error_str: If the status is CTL_LUN_ERROR, this will * contain a string describing the error. * * kern_be_args: For kernel use only. 
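Since the request plumbing is defined just below, one more illustration: creating a LUN boils down to filling the create member of union ctl_lunreq_data inside a struct ctl_lun_req and issuing CTL_LUN_REQ. A hedged sketch for a 1 GiB ramdisk-backed LUN, again assuming the earlier include set; backend-specific options, when a backend needs them, travel as an nvlist packed into args/args_len and are not shown here. This is roughly what ctladm(8) does for its create subcommand, minus the option handling.

/* Uses the same includes as the CTL_DELAY_IO sketch above. */
static void
create_ramdisk_lun(int fd)
{
	struct ctl_lun_req req;

	memset(&req, 0, sizeof(req));
	strlcpy(req.backend, "ramdisk", sizeof(req.backend));
	req.reqtype = CTL_LUNREQ_CREATE;
	req.reqdata.create.lun_size_bytes = (uint64_t)1 << 30;	/* 1 GiB */
	req.reqdata.create.blocksize_bytes = 512;

	if (ioctl(fd, CTL_LUN_REQ, &req) == -1)
		err(1, "CTL_LUN_REQ");
	if (req.status != CTL_LUN_OK)
		errx(1, "LUN creation failed: %s", req.error_str);

	/* The allocated LUN number comes back in req_lun_id. */
	printf("created LUN %u\n", req.reqdata.create.req_lun_id);
}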
*/ struct ctl_lun_req { #define CTL_BE_NAME_LEN 32 char backend[CTL_BE_NAME_LEN]; ctl_lunreq_type reqtype; union ctl_lunreq_data reqdata; void * args; nvlist_t * args_nvl; #define CTL_MAX_ARGS_LEN (1024 * 1024) size_t args_len; void * result; nvlist_t * result_nvl; size_t result_len; ctl_lun_status status; char error_str[CTL_ERROR_STR_LEN]; }; /* * LUN list status: * * NONE: No status. * * OK: Request completed successfully. * * NEED_MORE_SPACE: The allocated length of the entries field is too * small for the available data. * * ERROR: An error occurred, look at the error string for a * description of the error. */ typedef enum { CTL_LUN_LIST_NONE, CTL_LUN_LIST_OK, CTL_LUN_LIST_NEED_MORE_SPACE, CTL_LUN_LIST_ERROR } ctl_lun_list_status; /* * LUN list interface * * backend_name: This is a NUL-terminated string. If the string * length is 0, then all LUNs on all backends will * be enumerated. Otherwise this is the name of the * backend to be enumerated, like "ramdisk" or "block". * * alloc_len: The length of the data buffer allocated for entries. * In order to properly size the buffer, make one call * with alloc_len set to 0, and then use the returned * dropped_len as the buffer length to allocate and * pass in on a subsequent call. * * lun_xml: XML-formatted information on the requested LUNs. * * fill_len: The amount of data filled in the storage for entries. * * status: The status of the request. See above for the * description of the values of this field. * * error_str: If the status indicates an error, this string will * be filled in to describe the error. */ struct ctl_lun_list { char backend[CTL_BE_NAME_LEN]; /* passed to kernel*/ uint32_t alloc_len; /* passed to kernel */ char *lun_xml; /* filled in kernel */ uint32_t fill_len; /* passed to userland */ ctl_lun_list_status status; /* passed to userland */ char error_str[CTL_ERROR_STR_LEN]; /* passed to userland */ }; /* * Port request interface: * * driver: This is required, and is NUL-terminated a string * that is the name of the frontend, like "iscsi" . * * reqtype: The type of request, CTL_REQ_CREATE to create a * port, CTL_REQ_REMOVE to delete a port. * * num_be_args: This is the number of frontend-specific arguments * in the be_args array. * * be_args: This is an array of frontend-specific arguments. * See above for a description of the fields in this * structure. * * status: Status of the request. * * error_str: If the status is CTL_LUN_ERROR, this will * contain a string describing the error. * * kern_be_args: For kernel use only. */ typedef enum { CTL_REQ_CREATE, CTL_REQ_REMOVE, CTL_REQ_MODIFY, } ctl_req_type; struct ctl_req { char driver[CTL_DRIVER_NAME_LEN]; ctl_req_type reqtype; void * args; nvlist_t * args_nvl; size_t args_len; void * result; nvlist_t * result_nvl; size_t result_len; ctl_lun_status status; char error_str[CTL_ERROR_STR_LEN]; }; /* * iSCSI status * * OK: Request completed successfully. * * ERROR: An error occurred, look at the error string for a * description of the error. * * CTL_ISCSI_LIST_NEED_MORE_SPACE: * User has to pass larger buffer for CTL_ISCSI_LIST ioctl. */ typedef enum { CTL_ISCSI_OK, CTL_ISCSI_ERROR, CTL_ISCSI_LIST_NEED_MORE_SPACE, CTL_ISCSI_SESSION_NOT_FOUND } ctl_iscsi_status; typedef enum { CTL_ISCSI_HANDOFF, CTL_ISCSI_LIST, CTL_ISCSI_LOGOUT, CTL_ISCSI_TERMINATE, CTL_ISCSI_LIMITS, #if defined(ICL_KERNEL_PROXY) || 1 /* * We actually need those in all cases, but leave the ICL_KERNEL_PROXY, * to remember to remove them along with rest of proxy code, eventually. 
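The buffer-sizing convention described above is easiest to see in code: probe, and grow the buffer whenever the kernel answers CTL_LUN_LIST_NEED_MORE_SPACE. A hedged sketch of fetching the LUN list XML for all backends (empty backend string), with the same include assumptions as before plus <stdlib.h>.

/* Same includes as earlier sketches, plus <stdlib.h>. */
static void
dump_lun_xml(int fd)
{
	struct ctl_lun_list list;
	char *buf;
	uint32_t len;

	len = 4096;
	for (;;) {
		buf = malloc(len);
		if (buf == NULL)
			err(1, "malloc");

		memset(&list, 0, sizeof(list));
		list.backend[0] = '\0';		/* all backends */
		list.alloc_len = len;
		list.lun_xml = buf;

		if (ioctl(fd, CTL_LUN_LIST, &list) == -1)
			err(1, "CTL_LUN_LIST");
		if (list.status != CTL_LUN_LIST_NEED_MORE_SPACE)
			break;

		/* Too small: throw it away and retry with a larger buffer. */
		free(buf);
		len *= 2;
	}
	if (list.status != CTL_LUN_LIST_OK)
		errx(1, "LUN list failed: %s", list.error_str);

	fwrite(buf, 1, list.fill_len, stdout);
	free(buf);
}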
*/ CTL_ISCSI_LISTEN, CTL_ISCSI_ACCEPT, CTL_ISCSI_SEND, CTL_ISCSI_RECEIVE, #endif } ctl_iscsi_type; typedef enum { CTL_ISCSI_DIGEST_NONE, CTL_ISCSI_DIGEST_CRC32C } ctl_iscsi_digest; #define CTL_ISCSI_NAME_LEN 224 /* 223 bytes, by RFC 3720, + '\0' */ #define CTL_ISCSI_ADDR_LEN 47 /* INET6_ADDRSTRLEN + '\0' */ #define CTL_ISCSI_ALIAS_LEN 128 /* Arbitrary. */ #define CTL_ISCSI_OFFLOAD_LEN 8 /* Arbitrary. */ struct ctl_iscsi_handoff_params { char initiator_name[CTL_ISCSI_NAME_LEN]; char initiator_addr[CTL_ISCSI_ADDR_LEN]; char initiator_alias[CTL_ISCSI_ALIAS_LEN]; uint8_t initiator_isid[6]; char target_name[CTL_ISCSI_NAME_LEN]; int socket; int portal_group_tag; /* * Connection parameters negotiated by ctld(8). */ ctl_iscsi_digest header_digest; ctl_iscsi_digest data_digest; uint32_t cmdsn; uint32_t statsn; int max_recv_data_segment_length; int max_burst_length; int first_burst_length; uint32_t immediate_data; char offload[CTL_ISCSI_OFFLOAD_LEN]; #ifdef ICL_KERNEL_PROXY int connection_id; #else int spare; #endif int max_send_data_segment_length; }; struct ctl_iscsi_list_params { uint32_t alloc_len; /* passed to kernel */ char *conn_xml; /* filled in kernel */ uint32_t fill_len; /* passed to userland */ int spare[4]; }; struct ctl_iscsi_logout_params { int connection_id; /* passed to kernel */ char initiator_name[CTL_ISCSI_NAME_LEN]; /* passed to kernel */ char initiator_addr[CTL_ISCSI_ADDR_LEN]; /* passed to kernel */ int all; /* passed to kernel */ int spare[4]; }; struct ctl_iscsi_terminate_params { int connection_id; /* passed to kernel */ char initiator_name[CTL_ISCSI_NAME_LEN]; /* passed to kernel */ char initiator_addr[CTL_ISCSI_NAME_LEN]; /* passed to kernel */ int all; /* passed to kernel */ int spare[4]; }; struct ctl_iscsi_limits_params { /* passed to kernel */ char offload[CTL_ISCSI_OFFLOAD_LEN]; int socket; /* passed to userland */ #ifdef __LP64__ int spare; #endif int max_recv_data_segment_length; int max_send_data_segment_length; int max_burst_length; int first_burst_length; }; #ifdef ICL_KERNEL_PROXY struct ctl_iscsi_listen_params { int iser; int domain; int socktype; int protocol; struct sockaddr *addr; socklen_t addrlen; int portal_id; int spare[4]; }; struct ctl_iscsi_accept_params { int connection_id; int portal_id; struct sockaddr *initiator_addr; socklen_t initiator_addrlen; int spare[4]; }; struct ctl_iscsi_send_params { int connection_id; void *bhs; size_t spare; void *spare2; size_t data_segment_len; void *data_segment; int spare3[4]; }; struct ctl_iscsi_receive_params { int connection_id; void *bhs; size_t spare; void *spare2; size_t data_segment_len; void *data_segment; int spare3[4]; }; #endif /* ICL_KERNEL_PROXY */ union ctl_iscsi_data { struct ctl_iscsi_handoff_params handoff; struct ctl_iscsi_list_params list; struct ctl_iscsi_logout_params logout; struct ctl_iscsi_terminate_params terminate; struct ctl_iscsi_limits_params limits; #ifdef ICL_KERNEL_PROXY struct ctl_iscsi_listen_params listen; struct ctl_iscsi_accept_params accept; struct ctl_iscsi_send_params send; struct ctl_iscsi_receive_params receive; #endif }; /* * iSCSI interface * * status: The status of the request. See above for the * description of the values of this field. * * error_str: If the status indicates an error, this string will * be filled in to describe the error. 
*/ struct ctl_iscsi { ctl_iscsi_type type; /* passed to kernel */ union ctl_iscsi_data data; /* passed to kernel */ ctl_iscsi_status status; /* passed to userland */ char error_str[CTL_ERROR_STR_LEN]; /* passed to userland */ }; struct ctl_lun_map { uint32_t port; uint32_t plun; uint32_t lun; }; /* * NVMe over Fabrics status * * OK: Request completed successfully. * * ERROR: An error occurred, look at the error string for a * description of the error. */ typedef enum { CTL_NVMF_OK, CTL_NVMF_ERROR, CTL_NVMF_LIST_NEED_MORE_SPACE, CTL_NVMF_ASSOCIATION_NOT_FOUND } ctl_nvmf_status; typedef enum { CTL_NVMF_HANDOFF, CTL_NVMF_LIST, CTL_NVMF_TERMINATE } ctl_nvmf_type; struct ctl_nvmf_list_params { uint32_t alloc_len; /* passed to kernel */ char *conn_xml; /* filled in kernel */ uint32_t fill_len; /* passed to userland */ int spare[4]; }; struct ctl_nvmf_terminate_params { int cntlid; /* passed to kernel */ char hostnqn[NVME_NQN_FIELD_SIZE]; /* passed to kernel */ int all; /* passed to kernel */ int spare[4]; }; union ctl_nvmf_data { - struct nvmf_handoff_controller_qpair handoff; + struct nvmf_ioc_nv handoff; struct ctl_nvmf_list_params list; struct ctl_nvmf_terminate_params terminate; }; /* * NVMe over Fabrics interface * * status: The status of the request. See above for the * description of the values of this field. * * error_str: If the status indicates an error, this string will * be filled in to describe the error. */ struct ctl_nvmf { ctl_nvmf_type type; /* passed to kernel */ union ctl_nvmf_data data; /* passed to kernel */ ctl_nvmf_status status; /* passed to userland */ char error_str[CTL_ERROR_STR_LEN]; /* passed to userland */ }; #define CTL_IO _IOWR(CTL_MINOR, 0x00, union ctl_io) #define CTL_ENABLE_PORT _IOW(CTL_MINOR, 0x04, struct ctl_port_entry) #define CTL_DISABLE_PORT _IOW(CTL_MINOR, 0x05, struct ctl_port_entry) #define CTL_DELAY_IO _IOWR(CTL_MINOR, 0x10, struct ctl_io_delay_info) #define CTL_ERROR_INJECT _IOWR(CTL_MINOR, 0x16, struct ctl_error_desc) #define CTL_GET_OOA _IOWR(CTL_MINOR, 0x18, struct ctl_ooa) #define CTL_DUMP_STRUCTS _IO(CTL_MINOR, 0x19) #define CTL_LUN_REQ _IOWR(CTL_MINOR, 0x21, struct ctl_lun_req) #define CTL_LUN_LIST _IOWR(CTL_MINOR, 0x22, struct ctl_lun_list) #define CTL_ERROR_INJECT_DELETE _IOW(CTL_MINOR, 0x23, struct ctl_error_desc) #define CTL_SET_PORT_WWNS _IOW(CTL_MINOR, 0x24, struct ctl_port_entry) #define CTL_ISCSI _IOWR(CTL_MINOR, 0x25, struct ctl_iscsi) #define CTL_PORT_REQ _IOWR(CTL_MINOR, 0x26, struct ctl_req) #define CTL_PORT_LIST _IOWR(CTL_MINOR, 0x27, struct ctl_lun_list) #define CTL_LUN_MAP _IOW(CTL_MINOR, 0x28, struct ctl_lun_map) #define CTL_GET_LUN_STATS _IOWR(CTL_MINOR, 0x29, struct ctl_get_io_stats) #define CTL_GET_PORT_STATS _IOWR(CTL_MINOR, 0x2a, struct ctl_get_io_stats) #define CTL_NVMF _IOWR(CTL_MINOR, 0x2b, struct ctl_nvmf) #endif /* _CTL_IOCTL_H_ */ /* * vim: ts=8 */ diff --git a/sys/dev/nvmf/controller/ctl_frontend_nvmf.c b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c index 8a4538e5056a..75b36b4834f5 100644 --- a/sys/dev/nvmf/controller/ctl_frontend_nvmf.c +++ b/sys/dev/nvmf/controller/ctl_frontend_nvmf.c @@ -1,1169 +1,1197 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
* Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Store pointers to the capsule and qpair in the two pointer members * of CTL_PRIV_FRONTEND. */ #define NVMFT_NC(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[0]) #define NVMFT_QP(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptrs[1]) static void nvmft_done(union ctl_io *io); static int nvmft_init(void); static int nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int flag, struct thread *td); static int nvmft_shutdown(void); static struct taskqueue *nvmft_taskq; static TAILQ_HEAD(, nvmft_port) nvmft_ports; static struct sx nvmft_ports_lock; MALLOC_DEFINE(M_NVMFT, "nvmft", "NVMe over Fabrics controller"); static struct ctl_frontend nvmft_frontend = { .name = "nvmf", .init = nvmft_init, .ioctl = nvmft_ioctl, .fe_dump = NULL, .shutdown = nvmft_shutdown, }; static void nvmft_online(void *arg) { struct nvmft_port *np = arg; sx_xlock(&np->lock); np->online = true; sx_xunlock(&np->lock); } static void nvmft_offline(void *arg) { struct nvmft_port *np = arg; struct nvmft_controller *ctrlr; sx_xlock(&np->lock); np->online = false; TAILQ_FOREACH(ctrlr, &np->controllers, link) { nvmft_printf(ctrlr, "shutting down due to port going offline\n"); nvmft_controller_error(ctrlr, NULL, ENODEV); } while (!TAILQ_EMPTY(&np->controllers)) sx_sleep(np, &np->lock, 0, "nvmfoff", 0); sx_xunlock(&np->lock); } static int nvmft_lun_enable(void *arg, int lun_id) { struct nvmft_port *np = arg; struct nvmft_controller *ctrlr; uint32_t *old_ns, *new_ns; uint32_t nsid; u_int i; if (lun_id >= le32toh(np->cdata.nn)) { printf("NVMFT: %s lun %d larger than maximum nsid %u\n", np->cdata.subnqn, lun_id, le32toh(np->cdata.nn)); return (EOPNOTSUPP); } nsid = lun_id + 1; sx_xlock(&np->lock); new_ns = mallocarray(np->num_ns + 1, sizeof(*new_ns), M_NVMFT, M_WAITOK); for (i = 0; i < np->num_ns; i++) { if (np->active_ns[i] < nsid) continue; if (np->active_ns[i] == nsid) { sx_xunlock(&np->lock); free(new_ns, M_NVMFT); printf("NVMFT: %s duplicate lun %d\n", np->cdata.subnqn, lun_id); return (EINVAL); } break; } /* Copy over IDs smaller than nsid. */ memcpy(new_ns, np->active_ns, i * sizeof(*np->active_ns)); /* Insert nsid. */ new_ns[i] = nsid; /* Copy over IDs greater than nsid. */ memcpy(new_ns + i + 1, np->active_ns + i, (np->num_ns - i) * sizeof(*np->active_ns)); np->num_ns++; old_ns = np->active_ns; np->active_ns = new_ns; TAILQ_FOREACH(ctrlr, &np->controllers, link) { nvmft_controller_lun_changed(ctrlr, lun_id); } sx_xunlock(&np->lock); free(old_ns, M_NVMFT); return (0); } static int nvmft_lun_disable(void *arg, int lun_id) { struct nvmft_port *np = arg; struct nvmft_controller *ctrlr; uint32_t nsid; u_int i; if (lun_id >= le32toh(np->cdata.nn)) return (0); nsid = lun_id + 1; sx_xlock(&np->lock); for (i = 0; i < np->num_ns; i++) { if (np->active_ns[i] == nsid) goto found; } sx_xunlock(&np->lock); printf("NVMFT: %s request to disable nonexistent lun %d\n", np->cdata.subnqn, lun_id); return (EINVAL); found: /* Move down IDs greater than nsid. */ memmove(np->active_ns + i, np->active_ns + i + 1, (np->num_ns - (i + 1)) * sizeof(*np->active_ns)); np->num_ns--; /* NB: Don't bother freeing the old active_ns array. 
*/ TAILQ_FOREACH(ctrlr, &np->controllers, link) { nvmft_controller_lun_changed(ctrlr, lun_id); } sx_xunlock(&np->lock); return (0); } void nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid, struct nvme_ns_list *nslist) { u_int i, count; sx_slock(&np->lock); count = 0; for (i = 0; i < np->num_ns; i++) { if (np->active_ns[i] <= nsid) continue; nslist->ns[count] = htole32(np->active_ns[i]); count++; if (count == nitems(nslist->ns)) break; } sx_sunlock(&np->lock); } void nvmft_dispatch_command(struct nvmft_qpair *qp, struct nvmf_capsule *nc, bool admin) { struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp); const struct nvme_command *cmd = nvmf_capsule_sqe(nc); struct nvmft_port *np = ctrlr->np; union ctl_io *io; int error; if (cmd->nsid == htole32(0)) { nvmft_send_generic_error(qp, nc, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); nvmf_free_capsule(nc); return; } mtx_lock(&ctrlr->lock); if (ctrlr->pending_commands == 0) ctrlr->start_busy = sbinuptime(); ctrlr->pending_commands++; mtx_unlock(&ctrlr->lock); io = ctl_alloc_io(np->port.ctl_pool_ref); ctl_zero_io(io); NVMFT_NC(io) = nc; NVMFT_QP(io) = qp; io->io_hdr.io_type = admin ? CTL_IO_NVME_ADMIN : CTL_IO_NVME; io->io_hdr.nexus.initid = ctrlr->cntlid; io->io_hdr.nexus.targ_port = np->port.targ_port; io->io_hdr.nexus.targ_lun = le32toh(cmd->nsid) - 1; io->nvmeio.cmd = *cmd; error = ctl_run(io); if (error != 0) { nvmft_printf(ctrlr, "ctl_run failed for command on %s: %d\n", nvmft_qpair_name(qp), error); ctl_nvme_set_generic_error(&io->nvmeio, NVME_SC_INTERNAL_DEVICE_ERROR); nvmft_done(io); nvmft_controller_error(ctrlr, qp, ENXIO); } } void nvmft_terminate_commands(struct nvmft_controller *ctrlr) { struct nvmft_port *np = ctrlr->np; union ctl_io *io; int error; mtx_lock(&ctrlr->lock); if (ctrlr->pending_commands == 0) ctrlr->start_busy = sbinuptime(); ctrlr->pending_commands++; mtx_unlock(&ctrlr->lock); io = ctl_alloc_io(np->port.ctl_pool_ref); ctl_zero_io(io); NVMFT_QP(io) = ctrlr->admin; io->io_hdr.io_type = CTL_IO_TASK; io->io_hdr.nexus.initid = ctrlr->cntlid; io->io_hdr.nexus.targ_port = np->port.targ_port; io->io_hdr.nexus.targ_lun = 0; io->taskio.tag_type = CTL_TAG_SIMPLE; /* XXX: unused? 
*/ io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; error = ctl_run(io); if (error != CTL_RETVAL_COMPLETE) { nvmft_printf(ctrlr, "failed to terminate tasks: %d\n", error); #ifdef INVARIANTS io->io_hdr.status = CTL_SUCCESS; #endif nvmft_done(io); } } static void nvmft_datamove_out_cb(void *arg, size_t xfered, int error) { struct ctl_nvmeio *ctnio = arg; if (error != 0) { ctl_nvme_set_data_transfer_error(ctnio); } else { MPASS(xfered == ctnio->kern_data_len); ctnio->kern_data_resid -= xfered; } if (ctnio->kern_sg_entries) { free(ctnio->ext_data_ptr, M_NVMFT); ctnio->ext_data_ptr = NULL; } else MPASS(ctnio->ext_data_ptr == NULL); ctl_datamove_done((union ctl_io *)ctnio, false); } static void nvmft_datamove_out(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp, struct nvmf_capsule *nc) { struct memdesc mem; int error; MPASS(ctnio->ext_data_ptr == NULL); if (ctnio->kern_sg_entries > 0) { struct ctl_sg_entry *sgl; struct bus_dma_segment *vlist; vlist = mallocarray(ctnio->kern_sg_entries, sizeof(*vlist), M_NVMFT, M_WAITOK); ctnio->ext_data_ptr = (void *)vlist; sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr; for (u_int i = 0; i < ctnio->kern_sg_entries; i++) { vlist[i].ds_addr = (uintptr_t)sgl[i].addr; vlist[i].ds_len = sgl[i].len; } mem = memdesc_vlist(vlist, ctnio->kern_sg_entries); } else mem = memdesc_vaddr(ctnio->kern_data_ptr, ctnio->kern_data_len); error = nvmf_receive_controller_data(nc, ctnio->kern_rel_offset, &mem, ctnio->kern_data_len, nvmft_datamove_out_cb, ctnio); if (error == 0) return; nvmft_printf(nvmft_qpair_ctrlr(qp), "Failed to request capsule data: %d\n", error); ctl_nvme_set_data_transfer_error(ctnio); if (ctnio->kern_sg_entries) { free(ctnio->ext_data_ptr, M_NVMFT); ctnio->ext_data_ptr = NULL; } else MPASS(ctnio->ext_data_ptr == NULL); ctl_datamove_done((union ctl_io *)ctnio, true); } static struct mbuf * nvmft_copy_data(struct ctl_nvmeio *ctnio) { struct ctl_sg_entry *sgl; struct mbuf *m0, *m; uint32_t resid, off, todo; int mlen; MPASS(ctnio->kern_data_len != 0); m0 = m_getm2(NULL, ctnio->kern_data_len, M_WAITOK, MT_DATA, 0); if (ctnio->kern_sg_entries == 0) { m_copyback(m0, 0, ctnio->kern_data_len, ctnio->kern_data_ptr); return (m0); } resid = ctnio->kern_data_len; sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr; off = 0; m = m0; mlen = M_TRAILINGSPACE(m); for (;;) { todo = MIN(mlen, sgl->len - off); memcpy(mtod(m, char *) + m->m_len, (char *)sgl->addr + off, todo); m->m_len += todo; resid -= todo; if (resid == 0) { MPASS(m->m_next == NULL); break; } off += todo; if (off == sgl->len) { sgl++; off = 0; } mlen -= todo; if (mlen == 0) { m = m->m_next; mlen = M_TRAILINGSPACE(m); } } return (m0); } static void m_free_ref_data(struct mbuf *m) { ctl_ref kern_data_ref = m->m_ext.ext_arg1; kern_data_ref(m->m_ext.ext_arg2, -1); } static struct mbuf * m_get_ref_data(struct ctl_nvmeio *ctnio, void *buf, u_int size) { struct mbuf *m; m = m_get(M_WAITOK, MT_DATA); m_extadd(m, buf, size, m_free_ref_data, ctnio->kern_data_ref, ctnio->kern_data_arg, M_RDONLY, EXT_CTL); m->m_len = size; ctnio->kern_data_ref(ctnio->kern_data_arg, 1); return (m); } static struct mbuf * nvmft_ref_data(struct ctl_nvmeio *ctnio) { struct ctl_sg_entry *sgl; struct mbuf *m0, *m; MPASS(ctnio->kern_data_len != 0); if (ctnio->kern_sg_entries == 0) return (m_get_ref_data(ctnio, ctnio->kern_data_ptr, ctnio->kern_data_len)); sgl = (struct ctl_sg_entry *)ctnio->kern_data_ptr; m0 = m_get_ref_data(ctnio, sgl[0].addr, sgl[0].len); m = m0; for (u_int i = 1; i < ctnio->kern_sg_entries; i++) { m->m_next = 
m_get_ref_data(ctnio, sgl[i].addr, sgl[i].len); m = m->m_next; } return (m0); } static void nvmft_datamove_in(struct ctl_nvmeio *ctnio, struct nvmft_qpair *qp, struct nvmf_capsule *nc) { struct mbuf *m; u_int status; if (ctnio->kern_data_ref != NULL) m = nvmft_ref_data(ctnio); else m = nvmft_copy_data(ctnio); status = nvmf_send_controller_data(nc, ctnio->kern_rel_offset, m, ctnio->kern_data_len); switch (status) { case NVMF_SUCCESS_SENT: ctnio->success_sent = true; nvmft_command_completed(qp, nc); /* FALLTHROUGH */ case NVMF_MORE: case NVME_SC_SUCCESS: break; default: ctl_nvme_set_generic_error(ctnio, status); break; } ctl_datamove_done((union ctl_io *)ctnio, true); } void nvmft_handle_datamove(union ctl_io *io) { struct nvmf_capsule *nc; struct nvmft_qpair *qp; /* Some CTL commands preemptively set a success status. */ MPASS(io->io_hdr.status == CTL_STATUS_NONE || io->io_hdr.status == CTL_SUCCESS); MPASS(!io->nvmeio.success_sent); nc = NVMFT_NC(io); qp = NVMFT_QP(io); if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) nvmft_datamove_in(&io->nvmeio, qp, nc); else nvmft_datamove_out(&io->nvmeio, qp, nc); } void nvmft_abort_datamove(union ctl_io *io) { io->io_hdr.port_status = 1; io->io_hdr.flags |= CTL_FLAG_ABORT; ctl_datamove_done(io, true); } static void nvmft_datamove(union ctl_io *io) { struct nvmft_qpair *qp; qp = NVMFT_QP(io); nvmft_qpair_datamove(qp, io); } void nvmft_enqueue_task(struct task *task) { taskqueue_enqueue(nvmft_taskq, task); } void nvmft_drain_task(struct task *task) { taskqueue_drain(nvmft_taskq, task); } static void hip_add(uint64_t pair[2], uint64_t addend) { uint64_t old, new; old = le64toh(pair[0]); new = old + addend; pair[0] = htole64(new); if (new < old) pair[1] += htole64(1); } static void nvmft_done(union ctl_io *io) { struct nvmft_controller *ctrlr; const struct nvme_command *cmd; struct nvmft_qpair *qp; struct nvmf_capsule *nc; size_t len; KASSERT(io->io_hdr.status == CTL_SUCCESS || io->io_hdr.status == CTL_NVME_ERROR, ("%s: bad status %u", __func__, io->io_hdr.status)); nc = NVMFT_NC(io); qp = NVMFT_QP(io); ctrlr = nvmft_qpair_ctrlr(qp); if (nc == NULL) { /* Completion of nvmft_terminate_commands. 
*/ goto end; } cmd = nvmf_capsule_sqe(nc); if (io->io_hdr.status == CTL_SUCCESS) len = nvmf_capsule_data_len(nc) / 512; else len = 0; switch (cmd->opc) { case NVME_OPC_WRITE: mtx_lock(&ctrlr->lock); hip_add(ctrlr->hip.host_write_commands, 1); len += ctrlr->partial_duw; if (len > 1000) hip_add(ctrlr->hip.data_units_written, len / 1000); ctrlr->partial_duw = len % 1000; mtx_unlock(&ctrlr->lock); break; case NVME_OPC_READ: case NVME_OPC_COMPARE: case NVME_OPC_VERIFY: mtx_lock(&ctrlr->lock); if (cmd->opc != NVME_OPC_VERIFY) hip_add(ctrlr->hip.host_read_commands, 1); len += ctrlr->partial_dur; if (len > 1000) hip_add(ctrlr->hip.data_units_read, len / 1000); ctrlr->partial_dur = len % 1000; mtx_unlock(&ctrlr->lock); break; } if (io->nvmeio.success_sent) { MPASS(io->io_hdr.status == CTL_SUCCESS); } else { io->nvmeio.cpl.cid = cmd->cid; nvmft_send_response(qp, &io->nvmeio.cpl); } nvmf_free_capsule(nc); end: ctl_free_io(io); mtx_lock(&ctrlr->lock); ctrlr->pending_commands--; if (ctrlr->pending_commands == 0) ctrlr->busy_total += sbinuptime() - ctrlr->start_busy; mtx_unlock(&ctrlr->lock); } static int nvmft_init(void) { int error; nvmft_taskq = taskqueue_create("nvmft", M_WAITOK, taskqueue_thread_enqueue, &nvmft_taskq); error = taskqueue_start_threads_in_proc(&nvmft_taskq, mp_ncpus, PWAIT, control_softc->ctl_proc, "nvmft"); if (error != 0) { taskqueue_free(nvmft_taskq); return (error); } TAILQ_INIT(&nvmft_ports); sx_init(&nvmft_ports_lock, "nvmft ports"); return (0); } void nvmft_port_free(struct nvmft_port *np) { KASSERT(TAILQ_EMPTY(&np->controllers), ("%s(%p): active controllers", __func__, np)); if (np->port.targ_port != -1) { if (ctl_port_deregister(&np->port) != 0) printf("%s: ctl_port_deregister() failed\n", __func__); } free(np->active_ns, M_NVMFT); clean_unrhdr(np->ids); delete_unrhdr(np->ids); sx_destroy(&np->lock); free(np, M_NVMFT); } static struct nvmft_port * nvmft_port_find(const char *subnqn) { struct nvmft_port *np; KASSERT(nvmf_nqn_valid(subnqn), ("%s: invalid nqn", __func__)); sx_assert(&nvmft_ports_lock, SA_LOCKED); TAILQ_FOREACH(np, &nvmft_ports, link) { if (strcmp(np->cdata.subnqn, subnqn) == 0) break; } return (np); } static struct nvmft_port * nvmft_port_find_by_id(int port_id) { struct nvmft_port *np; sx_assert(&nvmft_ports_lock, SA_LOCKED); TAILQ_FOREACH(np, &nvmft_ports, link) { if (np->port.targ_port == port_id) break; } return (np); } /* * Helper function to fetch a number stored as a string in an nv_list. * Returns false if the string was not a valid number. 
*/ static bool dnvlist_get_strnum(nvlist_t *nvl, const char *name, u_long default_value, u_long *value) { const char *str; char *cp; str = dnvlist_get_string(nvl, name, NULL); if (str == NULL) { *value = default_value; return (true); } if (*str == '\0') return (false); *value = strtoul(str, &cp, 0); if (*cp != '\0') return (false); return (true); } /* * NVMeoF ports support the following parameters: * * Mandatory: * * subnqn: subsystem NVMe Qualified Name * portid: integer port ID from Discovery Log Page entry * * Optional: * serial: Serial Number string * max_io_qsize: Maximum number of I/O queue entries * enable_timeout: Timeout for controller enable in milliseconds * ioccsz: Maximum command capsule size * iorcsz: Maximum response capsule size * nn: Number of namespaces */ static void nvmft_port_create(struct ctl_req *req) { struct nvmft_port *np; struct ctl_port *port; const char *serial, *subnqn; char serial_buf[NVME_SERIAL_NUMBER_LENGTH]; u_long enable_timeout, hostid, ioccsz, iorcsz, max_io_qsize, nn, portid; int error; /* Required parameters. */ subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL); if (subnqn == NULL || !nvlist_exists_string(req->args_nvl, "portid")) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Missing required argument"); return; } if (!nvmf_nqn_valid(subnqn)) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Invalid SubNQN"); return; } if (!dnvlist_get_strnum(req->args_nvl, "portid", UINT16_MAX, &portid) || portid > UINT16_MAX) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Invalid port ID"); return; } /* Optional parameters. */ if (!dnvlist_get_strnum(req->args_nvl, "max_io_qsize", NVMF_MAX_IO_ENTRIES, &max_io_qsize) || max_io_qsize < NVME_MIN_IO_ENTRIES || max_io_qsize > NVME_MAX_IO_ENTRIES) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Invalid maximum I/O queue size"); return; } if (!dnvlist_get_strnum(req->args_nvl, "enable_timeout", NVMF_CC_EN_TIMEOUT * 500, &enable_timeout) || (enable_timeout % 500) != 0 || (enable_timeout / 500) > 255) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Invalid enable timeout"); return; } if (!dnvlist_get_strnum(req->args_nvl, "ioccsz", NVMF_IOCCSZ, &ioccsz) || ioccsz < sizeof(struct nvme_command) || (ioccsz % 16) != 0) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Invalid Command Capsule size"); return; } if (!dnvlist_get_strnum(req->args_nvl, "iorcsz", NVMF_IORCSZ, &iorcsz) || iorcsz < sizeof(struct nvme_completion) || (iorcsz % 16) != 0) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Invalid Response Capsule size"); return; } if (!dnvlist_get_strnum(req->args_nvl, "nn", NVMF_NN, &nn) || nn < 1 || nn > UINT32_MAX) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Invalid number of namespaces"); return; } serial = dnvlist_get_string(req->args_nvl, "serial", NULL); if (serial == NULL) { getcredhostid(curthread->td_ucred, &hostid); nvmf_controller_serial(serial_buf, sizeof(serial_buf), hostid); serial = serial_buf; } sx_xlock(&nvmft_ports_lock); np = nvmft_port_find(subnqn); if (np != NULL) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "SubNQN \"%s\" already exists", subnqn); sx_xunlock(&nvmft_ports_lock); return; } np = malloc(sizeof(*np), M_NVMFT, M_WAITOK | M_ZERO); refcount_init(&np->refs, 1); np->max_io_qsize = 
max_io_qsize; np->cap = _nvmf_controller_cap(max_io_qsize, enable_timeout / 500); sx_init(&np->lock, "nvmft port"); np->ids = new_unrhdr(0, MIN(CTL_MAX_INIT_PER_PORT - 1, NVMF_CNTLID_STATIC_MAX), UNR_NO_MTX); TAILQ_INIT(&np->controllers); /* The controller ID is set later for individual controllers. */ _nvmf_init_io_controller_data(0, max_io_qsize, serial, ostype, osrelease, subnqn, nn, ioccsz, iorcsz, &np->cdata); np->cdata.aerl = NVMFT_NUM_AER - 1; np->cdata.oaes = htole32(NVME_ASYNC_EVENT_NS_ATTRIBUTE); np->cdata.oncs = htole16(NVMEF(NVME_CTRLR_DATA_ONCS_VERIFY, 1) | NVMEF(NVME_CTRLR_DATA_ONCS_WRZERO, 1) | NVMEF(NVME_CTRLR_DATA_ONCS_DSM, 1) | NVMEF(NVME_CTRLR_DATA_ONCS_COMPARE, 1)); np->cdata.fuses = NVMEF(NVME_CTRLR_DATA_FUSES_CNW, 1); np->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1); memcpy(np->fp.revision[0], np->cdata.fr, sizeof(np->cdata.fr)); port = &np->port; port->frontend = &nvmft_frontend; port->port_type = CTL_PORT_NVMF; port->num_requested_ctl_io = max_io_qsize; port->port_name = "nvmf"; port->physical_port = portid; port->virtual_port = 0; port->port_online = nvmft_online; port->port_offline = nvmft_offline; port->onoff_arg = np; port->lun_enable = nvmft_lun_enable; port->lun_disable = nvmft_lun_disable; port->targ_lun_arg = np; port->fe_datamove = nvmft_datamove; port->fe_done = nvmft_done; port->targ_port = -1; port->options = nvlist_clone(req->args_nvl); error = ctl_port_register(port); if (error != 0) { sx_xunlock(&nvmft_ports_lock); nvlist_destroy(port->options); nvmft_port_rele(np); req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Failed to register CTL port with error %d", error); return; } TAILQ_INSERT_TAIL(&nvmft_ports, np, link); sx_xunlock(&nvmft_ports_lock); req->status = CTL_LUN_OK; req->result_nvl = nvlist_create(0); nvlist_add_number(req->result_nvl, "port_id", port->targ_port); } static void nvmft_port_remove(struct ctl_req *req) { struct nvmft_port *np; const char *subnqn; u_long port_id; /* * ctladm port -r just provides the port_id, so permit looking * up a port either by "subnqn" or "port_id". 
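From the userland side, the parameters validated in nvmft_port_create() above arrive as a packed nvlist in the args/args_len members of struct ctl_req, submitted through the CTL_PORT_REQ ioctl against the "nvmf" frontend. A hedged sketch of the minimal creation request; the NQN value is made up, and the numeric port ID is deliberately passed as a string, which is exactly what dnvlist_get_strnum() above is there to parse. In practice ctladm(8) builds this nvlist from its command-line options.

/* Same includes as earlier sketches, plus <stdlib.h> for free(). */
static void
create_nvmf_port(int fd)
{
	struct ctl_req req;
	nvlist_t *args;
	void *packed;
	size_t packed_len;

	args = nvlist_create(0);
	/* Both values are strings; the frontend converts numbers itself. */
	nvlist_add_string(args, "subnqn",
	    "nqn.2012-06.io.example:ctl0");	/* hypothetical NQN */
	nvlist_add_string(args, "portid", "1");
	if (nvlist_error(args) != 0)
		errx(1, "failed to build argument nvlist");

	packed = nvlist_pack(args, &packed_len);
	if (packed == NULL)
		err(1, "nvlist_pack");

	memset(&req, 0, sizeof(req));
	strlcpy(req.driver, "nvmf", sizeof(req.driver));
	req.reqtype = CTL_REQ_CREATE;
	req.args = packed;
	req.args_len = packed_len;

	if (ioctl(fd, CTL_PORT_REQ, &req) == -1)
		err(1, "CTL_PORT_REQ");
	if (req.status != CTL_LUN_OK)
		errx(1, "port creation failed: %s", req.error_str);

	/*
	 * On success the frontend reports the assigned CTL port ID in a
	 * result nvlist ("port_id"); supplying result/result_len would let
	 * the kernel copy that back out (not shown here).
	 */
	free(packed);
	nvlist_destroy(args);
}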
*/ port_id = ULONG_MAX; subnqn = dnvlist_get_string(req->args_nvl, "subnqn", NULL); if (subnqn == NULL) { if (!nvlist_exists_string(req->args_nvl, "port_id")) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Missing required argument"); return; } if (!dnvlist_get_strnum(req->args_nvl, "port_id", ULONG_MAX, &port_id)) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Invalid CTL port ID"); return; } } else { if (nvlist_exists_string(req->args_nvl, "port_id")) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Ambiguous port removal request"); return; } } sx_xlock(&nvmft_ports_lock); if (subnqn != NULL) { np = nvmft_port_find(subnqn); if (np == NULL) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "SubNQN \"%s\" does not exist", subnqn); sx_xunlock(&nvmft_ports_lock); return; } } else { np = nvmft_port_find_by_id(port_id); if (np == NULL) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "CTL port %lu is not a NVMF port", port_id); sx_xunlock(&nvmft_ports_lock); return; } } TAILQ_REMOVE(&nvmft_ports, np, link); sx_xunlock(&nvmft_ports_lock); ctl_port_offline(&np->port); nvmft_port_rele(np); req->status = CTL_LUN_OK; } static void nvmft_handoff(struct ctl_nvmf *cn) { - struct nvmf_fabric_connect_cmd cmd; - struct nvmf_handoff_controller_qpair *handoff; - struct nvmf_fabric_connect_data *data; + const struct nvmf_fabric_connect_cmd *cmd; + const struct nvmf_fabric_connect_data *data; + const nvlist_t *params; struct nvmft_port *np; + nvlist_t *nvl; + size_t len; + enum nvmf_trtype trtype; int error; np = NULL; - data = NULL; - handoff = &cn->data.handoff; - error = copyin(handoff->cmd, &cmd, sizeof(cmd)); + error = nvmf_unpack_ioc_nvlist(&cn->data.handoff, &nvl); if (error != 0) { cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), - "Failed to copyin CONNECT SQE"); + "Failed to copyin and unpack handoff arguments"); return; } - data = malloc(sizeof(*data), M_NVMFT, M_WAITOK); - error = copyin(handoff->data, data, sizeof(*data)); - if (error != 0) { + if (!nvlist_exists_number(nvl, "trtype") || + !nvlist_exists_nvlist(nvl, "params") || + !nvlist_exists_binary(nvl, "cmd") || + !nvlist_exists_binary(nvl, "data")) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Handoff arguments missing required value"); + goto out; + } + + params = nvlist_get_nvlist(nvl, "params"); + if (!nvmf_validate_qpair_nvlist(params, true)) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Invalid queue pair parameters"); + goto out; + } + + cmd = nvlist_get_binary(nvl, "cmd", &len); + if (len != sizeof(*cmd)) { + cn->status = CTL_NVMF_ERROR; + snprintf(cn->error_str, sizeof(cn->error_str), + "Wrong size for CONNECT SQE"); + goto out; + } + + data = nvlist_get_binary(nvl, "data", &len); + if (len != sizeof(*data)) { cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), - "Failed to copyin CONNECT data"); + "Wrong size for CONNECT data"); goto out; } if (!nvmf_nqn_valid(data->subnqn)) { cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), "Invalid SubNQN"); goto out; } sx_slock(&nvmft_ports_lock); np = nvmft_port_find(data->subnqn); if (np == NULL) { sx_sunlock(&nvmft_ports_lock); cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), "Unknown SubNQN"); goto out; } if (!np->online) { 
sx_sunlock(&nvmft_ports_lock); cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), "CTL port offline"); np = NULL; goto out; } nvmft_port_ref(np); sx_sunlock(&nvmft_ports_lock); - if (handoff->params.admin) { - error = nvmft_handoff_admin_queue(np, handoff, &cmd, data); + trtype = nvlist_get_number(nvl, "trtype"); + if (nvlist_get_bool(params, "admin")) { + error = nvmft_handoff_admin_queue(np, trtype, params, cmd, + data); if (error != 0) { cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), "Failed to handoff admin queue: %d", error); goto out; } } else { - error = nvmft_handoff_io_queue(np, handoff, &cmd, data); + error = nvmft_handoff_io_queue(np, trtype, params, cmd, data); if (error != 0) { cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), "Failed to handoff I/O queue: %d", error); goto out; } } cn->status = CTL_NVMF_OK; out: if (np != NULL) nvmft_port_rele(np); - free(data, M_NVMFT); + nvlist_destroy(nvl); } static void nvmft_list(struct ctl_nvmf *cn) { struct ctl_nvmf_list_params *lp; struct nvmft_controller *ctrlr; struct nvmft_port *np; struct sbuf *sb; int error; lp = &cn->data.list; sb = sbuf_new(NULL, NULL, lp->alloc_len, SBUF_FIXEDLEN | SBUF_INCLUDENUL); if (sb == NULL) { cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), "Failed to allocate NVMeoF session list"); return; } sbuf_printf(sb, "\n"); sx_slock(&nvmft_ports_lock); TAILQ_FOREACH(np, &nvmft_ports, link) { sx_slock(&np->lock); TAILQ_FOREACH(ctrlr, &np->controllers, link) { sbuf_printf(sb, "" "%s" "%s" "%u" "\n", ctrlr->cntlid, ctrlr->hostnqn, np->cdata.subnqn, ctrlr->trtype); } sx_sunlock(&np->lock); } sx_sunlock(&nvmft_ports_lock); sbuf_printf(sb, "\n"); if (sbuf_finish(sb) != 0) { sbuf_delete(sb); cn->status = CTL_NVMF_LIST_NEED_MORE_SPACE; snprintf(cn->error_str, sizeof(cn->error_str), "Out of space, %d bytes is too small", lp->alloc_len); return; } error = copyout(sbuf_data(sb), lp->conn_xml, sbuf_len(sb)); if (error != 0) { sbuf_delete(sb); cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), "Failed to copyout session list: %d", error); return; } lp->fill_len = sbuf_len(sb); cn->status = CTL_NVMF_OK; sbuf_delete(sb); } static void nvmft_terminate(struct ctl_nvmf *cn) { struct ctl_nvmf_terminate_params *tp; struct nvmft_controller *ctrlr; struct nvmft_port *np; bool found, match; tp = &cn->data.terminate; found = false; sx_slock(&nvmft_ports_lock); TAILQ_FOREACH(np, &nvmft_ports, link) { sx_slock(&np->lock); TAILQ_FOREACH(ctrlr, &np->controllers, link) { if (tp->all != 0) match = true; else if (tp->cntlid != -1) match = tp->cntlid == ctrlr->cntlid; else if (tp->hostnqn[0] != '\0') match = strncmp(tp->hostnqn, ctrlr->hostnqn, sizeof(tp->hostnqn)) == 0; else match = false; if (!match) continue; nvmft_printf(ctrlr, "disconnecting due to administrative request\n"); nvmft_controller_error(ctrlr, NULL, ECONNABORTED); found = true; } sx_sunlock(&np->lock); } sx_sunlock(&nvmft_ports_lock); if (!found) { cn->status = CTL_NVMF_ASSOCIATION_NOT_FOUND; snprintf(cn->error_str, sizeof(cn->error_str), "No matching associations found"); return; } cn->status = CTL_NVMF_OK; } static int nvmft_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct ctl_nvmf *cn; struct ctl_req *req; switch (cmd) { case CTL_PORT_REQ: req = (struct ctl_req *)data; switch (req->reqtype) { case CTL_REQ_CREATE: nvmft_port_create(req); break; case CTL_REQ_REMOVE: nvmft_port_remove(req); break; default: 
req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Unsupported request type %d", req->reqtype); break; } return (0); case CTL_NVMF: cn = (struct ctl_nvmf *)data; switch (cn->type) { case CTL_NVMF_HANDOFF: nvmft_handoff(cn); break; case CTL_NVMF_LIST: nvmft_list(cn); break; case CTL_NVMF_TERMINATE: nvmft_terminate(cn); break; default: cn->status = CTL_NVMF_ERROR; snprintf(cn->error_str, sizeof(cn->error_str), "Invalid NVMeoF request type %d", cn->type); break; } return (0); default: return (ENOTTY); } } static int nvmft_shutdown(void) { /* TODO: Need to check for active controllers. */ if (!TAILQ_EMPTY(&nvmft_ports)) return (EBUSY); taskqueue_free(nvmft_taskq); sx_destroy(&nvmft_ports_lock); return (0); } CTL_FRONTEND_DECLARE(nvmft, nvmft_frontend); MODULE_DEPEND(nvmft, nvmf_transport, 1, 1, 1); diff --git a/sys/dev/nvmf/controller/nvmft_controller.c b/sys/dev/nvmf/controller/nvmft_controller.c index 3c10fea75c9d..83a156d9b92a 100644 --- a/sys/dev/nvmf/controller/nvmft_controller.c +++ b/sys/dev/nvmf/controller/nvmft_controller.c @@ -1,1140 +1,1137 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void nvmft_controller_shutdown(void *arg, int pending); static void nvmft_controller_terminate(void *arg, int pending); int nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...) { char buf[128]; struct sbuf sb; va_list ap; size_t retval; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_printf_drain, &retval); sbuf_printf(&sb, "nvmft%u: ", ctrlr->cntlid); va_start(ap, fmt); sbuf_vprintf(&sb, fmt, ap); va_end(ap); sbuf_finish(&sb); sbuf_delete(&sb); return (retval); } static struct nvmft_controller * nvmft_controller_alloc(struct nvmft_port *np, uint16_t cntlid, const struct nvmf_fabric_connect_data *data) { struct nvmft_controller *ctrlr; ctrlr = malloc(sizeof(*ctrlr), M_NVMFT, M_WAITOK | M_ZERO); ctrlr->cntlid = cntlid; nvmft_port_ref(np); TAILQ_INSERT_TAIL(&np->controllers, ctrlr, link); ctrlr->np = np; mtx_init(&ctrlr->lock, "nvmft controller", NULL, MTX_DEF); callout_init(&ctrlr->ka_timer, 1); TASK_INIT(&ctrlr->shutdown_task, 0, nvmft_controller_shutdown, ctrlr); TIMEOUT_TASK_INIT(taskqueue_thread, &ctrlr->terminate_task, 0, nvmft_controller_terminate, ctrlr); ctrlr->cdata = np->cdata; ctrlr->cdata.ctrlr_id = htole16(cntlid); memcpy(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)); memcpy(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)); ctrlr->hip.power_cycles[0] = 1; ctrlr->create_time = sbinuptime(); ctrlr->changed_ns = malloc(sizeof(*ctrlr->changed_ns), M_NVMFT, M_WAITOK | M_ZERO); return (ctrlr); } static void nvmft_controller_free(struct nvmft_controller *ctrlr) { mtx_destroy(&ctrlr->lock); MPASS(ctrlr->io_qpairs == NULL); free(ctrlr->changed_ns, M_NVMFT); free(ctrlr, M_NVMFT); } static void nvmft_keep_alive_timer(void *arg) { struct nvmft_controller *ctrlr = arg; int traffic; if (ctrlr->shutdown) return; traffic = atomic_readandclear_int(&ctrlr->ka_active_traffic); if (traffic == 0) { nvmft_printf(ctrlr, "disconnecting due to KeepAlive timeout\n"); nvmft_controller_error(ctrlr, NULL, ETIMEDOUT); return; } callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, C_HARDCLOCK); } int -nvmft_handoff_admin_queue(struct nvmft_port *np, - const struct nvmf_handoff_controller_qpair *handoff, - const struct 
nvmf_fabric_connect_cmd *cmd, +nvmft_handoff_admin_queue(struct nvmft_port *np, enum nvmf_trtype trtype, + const nvlist_t *params, const struct nvmf_fabric_connect_cmd *cmd, const struct nvmf_fabric_connect_data *data) { struct nvmft_controller *ctrlr; struct nvmft_qpair *qp; uint32_t kato; int cntlid; if (cmd->qid != htole16(0)) return (EINVAL); - qp = nvmft_qpair_init(handoff->trtype, &handoff->params, 0, - "admin queue"); + qp = nvmft_qpair_init(trtype, params, 0, "admin queue"); if (qp == NULL) { printf("NVMFT: Failed to setup admin queue from %.*s\n", (int)sizeof(data->hostnqn), data->hostnqn); return (ENXIO); } sx_xlock(&np->lock); cntlid = alloc_unr(np->ids); if (cntlid == -1) { sx_xunlock(&np->lock); printf("NVMFT: Unable to allocate controller for %.*s\n", (int)sizeof(data->hostnqn), data->hostnqn); nvmft_connect_error(qp, cmd, NVME_SCT_COMMAND_SPECIFIC, NVMF_FABRIC_SC_INVALID_HOST); nvmft_qpair_destroy(qp); return (ENOMEM); } #ifdef INVARIANTS TAILQ_FOREACH(ctrlr, &np->controllers, link) { KASSERT(ctrlr->cntlid != cntlid, ("%s: duplicate controllers with id %d", __func__, cntlid)); } #endif ctrlr = nvmft_controller_alloc(np, cntlid, data); nvmft_printf(ctrlr, "associated with %.*s\n", (int)sizeof(data->hostnqn), data->hostnqn); ctrlr->admin = qp; - ctrlr->trtype = handoff->trtype; + ctrlr->trtype = trtype; /* * The spec requires a non-zero KeepAlive timer, but allow a * zero KATO value to match Linux. */ kato = le32toh(cmd->kato); if (kato != 0) { /* * Round up to 1 second matching granularity * advertised in cdata. */ ctrlr->ka_sbt = mstosbt(roundup(kato, 1000)); callout_reset_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, nvmft_keep_alive_timer, ctrlr, C_HARDCLOCK); } nvmft_finish_accept(qp, cmd, ctrlr); sx_xunlock(&np->lock); return (0); } int -nvmft_handoff_io_queue(struct nvmft_port *np, - const struct nvmf_handoff_controller_qpair *handoff, - const struct nvmf_fabric_connect_cmd *cmd, +nvmft_handoff_io_queue(struct nvmft_port *np, enum nvmf_trtype trtype, + const nvlist_t *params, const struct nvmf_fabric_connect_cmd *cmd, const struct nvmf_fabric_connect_data *data) { struct nvmft_controller *ctrlr; struct nvmft_qpair *qp; char name[16]; uint16_t cntlid, qid; qid = le16toh(cmd->qid); if (qid == 0) return (EINVAL); cntlid = le16toh(data->cntlid); snprintf(name, sizeof(name), "I/O queue %u", qid); - qp = nvmft_qpair_init(handoff->trtype, &handoff->params, qid, name); + qp = nvmft_qpair_init(trtype, params, qid, name); if (qp == NULL) { printf("NVMFT: Failed to setup I/O queue %u from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); return (ENXIO); } sx_slock(&np->lock); TAILQ_FOREACH(ctrlr, &np->controllers, link) { if (ctrlr->cntlid == cntlid) break; } if (ctrlr == NULL) { sx_sunlock(&np->lock); printf("NVMFT: Nonexistent controller %u for I/O queue %u from %.*s\n", ctrlr->cntlid, qid, (int)sizeof(data->hostnqn), data->hostnqn); nvmft_connect_invalid_parameters(qp, cmd, true, offsetof(struct nvmf_fabric_connect_data, cntlid)); nvmft_qpair_destroy(qp); return (ENOENT); } if (memcmp(ctrlr->hostid, data->hostid, sizeof(ctrlr->hostid)) != 0) { sx_sunlock(&np->lock); nvmft_printf(ctrlr, "hostid mismatch for I/O queue %u from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); nvmft_connect_invalid_parameters(qp, cmd, true, offsetof(struct nvmf_fabric_connect_data, hostid)); nvmft_qpair_destroy(qp); return (EINVAL); } if (memcmp(ctrlr->hostnqn, data->hostnqn, sizeof(ctrlr->hostnqn)) != 0) { sx_sunlock(&np->lock); nvmft_printf(ctrlr, "hostnqn mismatch for I/O queue %u from 
%.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); nvmft_connect_invalid_parameters(qp, cmd, true, offsetof(struct nvmf_fabric_connect_data, hostnqn)); nvmft_qpair_destroy(qp); return (EINVAL); } - /* XXX: Require handoff->trtype == ctrlr->trtype? */ + /* XXX: Require trtype == ctrlr->trtype? */ mtx_lock(&ctrlr->lock); if (ctrlr->shutdown) { mtx_unlock(&ctrlr->lock); sx_sunlock(&np->lock); nvmft_printf(ctrlr, "attempt to create I/O queue %u on disabled controller from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); nvmft_connect_invalid_parameters(qp, cmd, true, offsetof(struct nvmf_fabric_connect_data, cntlid)); nvmft_qpair_destroy(qp); return (EINVAL); } if (ctrlr->num_io_queues == 0) { mtx_unlock(&ctrlr->lock); sx_sunlock(&np->lock); nvmft_printf(ctrlr, "attempt to create I/O queue %u without enabled queues from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC, NVME_SC_COMMAND_SEQUENCE_ERROR); nvmft_qpair_destroy(qp); return (EINVAL); } if (cmd->qid > ctrlr->num_io_queues) { mtx_unlock(&ctrlr->lock); sx_sunlock(&np->lock); nvmft_printf(ctrlr, "attempt to create invalid I/O queue %u from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); nvmft_connect_invalid_parameters(qp, cmd, false, offsetof(struct nvmf_fabric_connect_cmd, qid)); nvmft_qpair_destroy(qp); return (EINVAL); } if (ctrlr->io_qpairs[qid - 1].qp != NULL) { mtx_unlock(&ctrlr->lock); sx_sunlock(&np->lock); nvmft_printf(ctrlr, "attempt to re-create I/O queue %u from %.*s\n", qid, (int)sizeof(data->hostnqn), data->hostnqn); nvmft_connect_error(qp, cmd, NVME_SCT_GENERIC, NVME_SC_COMMAND_SEQUENCE_ERROR); nvmft_qpair_destroy(qp); return (EINVAL); } ctrlr->io_qpairs[qid - 1].qp = qp; mtx_unlock(&ctrlr->lock); nvmft_finish_accept(qp, cmd, ctrlr); sx_sunlock(&np->lock); return (0); } static void nvmft_controller_shutdown(void *arg, int pending) { struct nvmft_controller *ctrlr = arg; MPASS(pending == 1); /* * Shutdown all I/O queues to terminate pending datamoves and * stop receiving new commands. */ mtx_lock(&ctrlr->lock); for (u_int i = 0; i < ctrlr->num_io_queues; i++) { if (ctrlr->io_qpairs[i].qp != NULL) { ctrlr->io_qpairs[i].shutdown = true; mtx_unlock(&ctrlr->lock); nvmft_qpair_shutdown(ctrlr->io_qpairs[i].qp); mtx_lock(&ctrlr->lock); } } mtx_unlock(&ctrlr->lock); /* Terminate active CTL commands. */ nvmft_terminate_commands(ctrlr); /* Wait for all pending CTL commands to complete. */ mtx_lock(&ctrlr->lock); while (ctrlr->pending_commands != 0) mtx_sleep(&ctrlr->pending_commands, &ctrlr->lock, 0, "nvmftsh", hz / 100); mtx_unlock(&ctrlr->lock); /* Delete all of the I/O queues. */ for (u_int i = 0; i < ctrlr->num_io_queues; i++) { if (ctrlr->io_qpairs[i].qp != NULL) nvmft_qpair_destroy(ctrlr->io_qpairs[i].qp); } free(ctrlr->io_qpairs, M_NVMFT); ctrlr->io_qpairs = NULL; mtx_lock(&ctrlr->lock); ctrlr->num_io_queues = 0; /* Mark shutdown complete. */ if (NVMEV(NVME_CSTS_REG_SHST, ctrlr->csts) == NVME_SHST_OCCURRING) { ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST); ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_COMPLETE); } if (NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) == 0) { ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_RDY); ctrlr->shutdown = false; } mtx_unlock(&ctrlr->lock); /* * If the admin queue was closed while shutting down or a * fatal controller error has occurred, terminate the * association immediately, otherwise wait up to 2 minutes * (NVMe-over-Fabrics 1.1 4.6). 
*/ if (ctrlr->admin_closed || NVMEV(NVME_CSTS_REG_CFS, ctrlr->csts) != 0) nvmft_controller_terminate(ctrlr, 0); else taskqueue_enqueue_timeout(taskqueue_thread, &ctrlr->terminate_task, hz * 60 * 2); } static void nvmft_controller_terminate(void *arg, int pending) { struct nvmft_controller *ctrlr = arg; struct nvmft_port *np; bool wakeup_np; /* If the controller has been re-enabled, nothing to do. */ mtx_lock(&ctrlr->lock); if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) != 0) { mtx_unlock(&ctrlr->lock); if (ctrlr->ka_sbt != 0) callout_schedule_sbt(&ctrlr->ka_timer, ctrlr->ka_sbt, 0, C_HARDCLOCK); return; } /* Disable updates to CC while destroying admin qpair. */ ctrlr->shutdown = true; mtx_unlock(&ctrlr->lock); nvmft_qpair_destroy(ctrlr->admin); /* Remove association (CNTLID). */ np = ctrlr->np; sx_xlock(&np->lock); TAILQ_REMOVE(&np->controllers, ctrlr, link); free_unr(np->ids, ctrlr->cntlid); wakeup_np = (!np->online && TAILQ_EMPTY(&np->controllers)); sx_xunlock(&np->lock); if (wakeup_np) wakeup(np); callout_drain(&ctrlr->ka_timer); nvmft_printf(ctrlr, "association terminated\n"); nvmft_controller_free(ctrlr); nvmft_port_rele(np); } void nvmft_controller_error(struct nvmft_controller *ctrlr, struct nvmft_qpair *qp, int error) { /* * If a queue pair is closed, that isn't an error per se. * That just means additional commands cannot be received on * that queue pair. * * If the admin queue pair is closed while idle or while * shutting down, terminate the association immediately. * * If an I/O queue pair is closed, just ignore it. */ if (error == 0) { if (qp != ctrlr->admin) return; mtx_lock(&ctrlr->lock); if (ctrlr->shutdown) { ctrlr->admin_closed = true; mtx_unlock(&ctrlr->lock); return; } if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0) { MPASS(ctrlr->num_io_queues == 0); mtx_unlock(&ctrlr->lock); /* * Ok to drop lock here since ctrlr->cc can't * change if the admin queue pair has closed. * This also means no new queues can be handed * off, etc. Note that since there are no I/O * queues, only the admin queue needs to be * destroyed, so it is safe to skip * nvmft_controller_shutdown and just schedule * nvmft_controller_terminate. Note that we * cannot call nvmft_controller_terminate from * here directly as this is called from the * transport layer and freeing the admin qpair * might deadlock waiting for the current * thread to exit. */ if (taskqueue_cancel_timeout(taskqueue_thread, &ctrlr->terminate_task, NULL) == 0) taskqueue_enqueue_timeout(taskqueue_thread, &ctrlr->terminate_task, 0); return; } /* * Treat closing of the admin queue pair while enabled * as a transport error. Note that the admin queue * pair has been closed. */ ctrlr->admin_closed = true; } else mtx_lock(&ctrlr->lock); /* Ignore transport errors if we are already shutting down. */ if (ctrlr->shutdown) { mtx_unlock(&ctrlr->lock); return; } ctrlr->csts |= NVMEF(NVME_CSTS_REG_CFS, 1); ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN); ctrlr->shutdown = true; mtx_unlock(&ctrlr->lock); callout_stop(&ctrlr->ka_timer); taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task); } /* Wrapper around m_getm2 that also sets m_len in the mbufs in the chain. 
*/ static struct mbuf * m_getml(size_t len, int how) { struct mbuf *m, *n; m = m_getm2(NULL, len, how, MT_DATA, 0); if (m == NULL) return (NULL); for (n = m; len > 0; n = n->m_next) { n->m_len = M_SIZE(n); if (n->m_len >= len) { n->m_len = len; MPASS(n->m_next == NULL); } len -= n->m_len; } return (m); } static void m_zero(struct mbuf *m, u_int offset, u_int len) { u_int todo; if (len == 0) return; while (m->m_len <= offset) { offset -= m->m_len; m = m->m_next; } todo = m->m_len - offset; if (todo > len) todo = len; memset(mtodo(m, offset), 0, todo); m = m->m_next; len -= todo; while (len > 0) { todo = m->m_len; if (todo > len) todo = len; memset(mtod(m, void *), 0, todo); m = m->m_next; len -= todo; } } static void handle_get_log_page(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc, const struct nvme_command *cmd) { struct mbuf *m; uint64_t offset; uint32_t numd; size_t len, todo; u_int status; uint8_t lid; bool rae; lid = le32toh(cmd->cdw10) & 0xff; rae = (le32toh(cmd->cdw10) & (1U << 15)) != 0; numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16; offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32; if (offset % 3 != 0) { status = NVME_SC_INVALID_FIELD; goto done; } len = (numd + 1) * 4; switch (lid) { case NVME_LOG_ERROR: todo = 0; m = m_getml(len, M_WAITOK); if (todo != len) m_zero(m, todo, len - todo); status = nvmf_send_controller_data(nc, 0, m, len); MPASS(status != NVMF_MORE); break; case NVME_LOG_HEALTH_INFORMATION: { struct nvme_health_information_page hip; if (offset >= sizeof(hip)) { status = NVME_SC_INVALID_FIELD; goto done; } todo = sizeof(hip) - offset; if (todo > len) todo = len; mtx_lock(&ctrlr->lock); hip = ctrlr->hip; hip.controller_busy_time[0] = sbintime_getsec(ctrlr->busy_total) / 60; hip.power_on_hours[0] = sbintime_getsec(sbinuptime() - ctrlr->create_time) / 3600; mtx_unlock(&ctrlr->lock); m = m_getml(len, M_WAITOK); m_copyback(m, 0, todo, (char *)&hip + offset); if (todo != len) m_zero(m, todo, len - todo); status = nvmf_send_controller_data(nc, 0, m, len); MPASS(status != NVMF_MORE); break; } case NVME_LOG_FIRMWARE_SLOT: if (offset >= sizeof(ctrlr->np->fp)) { status = NVME_SC_INVALID_FIELD; goto done; } todo = sizeof(ctrlr->np->fp) - offset; if (todo > len) todo = len; m = m_getml(len, M_WAITOK); m_copyback(m, 0, todo, (char *)&ctrlr->np->fp + offset); if (todo != len) m_zero(m, todo, len - todo); status = nvmf_send_controller_data(nc, 0, m, len); MPASS(status != NVMF_MORE); break; case NVME_LOG_CHANGED_NAMESPACE: if (offset >= sizeof(*ctrlr->changed_ns)) { status = NVME_SC_INVALID_FIELD; goto done; } todo = sizeof(*ctrlr->changed_ns) - offset; if (todo > len) todo = len; m = m_getml(len, M_WAITOK); mtx_lock(&ctrlr->lock); m_copyback(m, 0, todo, (char *)ctrlr->changed_ns + offset); if (offset == 0 && len == sizeof(*ctrlr->changed_ns)) memset(ctrlr->changed_ns, 0, sizeof(*ctrlr->changed_ns)); if (!rae) ctrlr->changed_ns_reported = false; mtx_unlock(&ctrlr->lock); if (todo != len) m_zero(m, todo, len - todo); status = nvmf_send_controller_data(nc, 0, m, len); MPASS(status != NVMF_MORE); break; default: nvmft_printf(ctrlr, "Unsupported page %#x for GET_LOG_PAGE\n", lid); status = NVME_SC_INVALID_FIELD; break; } done: if (status == NVMF_SUCCESS_SENT) nvmft_command_completed(ctrlr->admin, nc); else nvmft_send_generic_error(ctrlr->admin, nc, status); nvmf_free_capsule(nc); } static void m_free_nslist(struct mbuf *m) { free(m->m_ext.ext_arg1, M_NVMFT); } static void handle_identify_command(struct nvmft_controller *ctrlr, struct 
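/*
 * Aside (not part of the patch): the inverse of the Get Log Page decode in
 * handle_get_log_page() above, shown as a self-contained sketch.  Field
 * placement mirrors the shifts used by the handler; the struct and function
 * names are local to this example, and little-endian conversion of the SQE
 * dwords is left out.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct glp_cdws {
	uint32_t cdw10, cdw11, cdw12, cdw13;
};

/* Request 'len' bytes of log page 'lid' starting at byte 'offset'. */
static struct glp_cdws
encode_get_log_page(uint8_t lid, bool rae, size_t len, uint64_t offset)
{
	struct glp_cdws c;
	uint32_t numd;

	/* Callers keep len a non-zero multiple of 4 and offset dword-aligned. */
	numd = (uint32_t)(len / 4 - 1);		/* 0-based dword count */
	c.cdw10 = lid | (rae ? 1U << 15 : 0) | (numd & 0xffff) << 16;
	c.cdw11 = numd >> 16;			/* upper half of NUMD */
	c.cdw12 = (uint32_t)offset;		/* low 32 bits of the offset */
	c.cdw13 = (uint32_t)(offset >> 32);	/* high 32 bits of the offset */
	return (c);
}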
nvmf_capsule *nc, const struct nvme_command *cmd) { struct mbuf *m; size_t data_len; u_int status; uint8_t cns; cns = le32toh(cmd->cdw10) & 0xFF; data_len = nvmf_capsule_data_len(nc); if (data_len != sizeof(ctrlr->cdata)) { nvmft_printf(ctrlr, "Invalid length %zu for IDENTIFY with CNS %#x\n", data_len, cns); nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_OPCODE); nvmf_free_capsule(nc); return; } switch (cns) { case 0: /* Namespace data. */ case 3: /* Namespace Identification Descriptor list. */ nvmft_dispatch_command(ctrlr->admin, nc, true); return; case 1: /* Controller data. */ m = m_getml(sizeof(ctrlr->cdata), M_WAITOK); m_copyback(m, 0, sizeof(ctrlr->cdata), (void *)&ctrlr->cdata); status = nvmf_send_controller_data(nc, 0, m, sizeof(ctrlr->cdata)); MPASS(status != NVMF_MORE); break; case 2: { /* Active namespace list. */ struct nvme_ns_list *nslist; uint32_t nsid; nsid = le32toh(cmd->nsid); if (nsid >= 0xfffffffe) { status = NVME_SC_INVALID_FIELD; break; } nslist = malloc(sizeof(*nslist), M_NVMFT, M_WAITOK | M_ZERO); nvmft_populate_active_nslist(ctrlr->np, nsid, nslist); m = m_get(M_WAITOK, MT_DATA); m_extadd(m, (void *)nslist, sizeof(*nslist), m_free_nslist, nslist, NULL, 0, EXT_CTL); m->m_len = sizeof(*nslist); status = nvmf_send_controller_data(nc, 0, m, m->m_len); MPASS(status != NVMF_MORE); break; } default: nvmft_printf(ctrlr, "Unsupported CNS %#x for IDENTIFY\n", cns); status = NVME_SC_INVALID_FIELD; break; } if (status == NVMF_SUCCESS_SENT) nvmft_command_completed(ctrlr->admin, nc); else nvmft_send_generic_error(ctrlr->admin, nc, status); nvmf_free_capsule(nc); } static void handle_set_features(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc, const struct nvme_command *cmd) { struct nvme_completion cqe; uint8_t fid; fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10)); switch (fid) { case NVME_FEAT_NUMBER_OF_QUEUES: { uint32_t num_queues; struct nvmft_io_qpair *io_qpairs; num_queues = le32toh(cmd->cdw11) & 0xffff; /* 5.12.1.7: 65535 is invalid. */ if (num_queues == 65535) goto error; /* Fabrics requires the same number of SQs and CQs. */ if (le32toh(cmd->cdw11) >> 16 != num_queues) goto error; /* Convert to 1's based */ num_queues++; io_qpairs = mallocarray(num_queues, sizeof(*io_qpairs), M_NVMFT, M_WAITOK | M_ZERO); mtx_lock(&ctrlr->lock); if (ctrlr->num_io_queues != 0) { mtx_unlock(&ctrlr->lock); free(io_qpairs, M_NVMFT); nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_COMMAND_SEQUENCE_ERROR); nvmf_free_capsule(nc); return; } ctrlr->num_io_queues = num_queues; ctrlr->io_qpairs = io_qpairs; mtx_unlock(&ctrlr->lock); nvmft_init_cqe(&cqe, nc, 0); cqe.cdw0 = cmd->cdw11; nvmft_send_response(ctrlr->admin, &cqe); nvmf_free_capsule(nc); return; } case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: { uint32_t aer_mask; aer_mask = le32toh(cmd->cdw11); /* Check for any reserved or unimplemented feature bits. */ if ((aer_mask & 0xffffc000) != 0) goto error; mtx_lock(&ctrlr->lock); ctrlr->aer_mask = aer_mask; mtx_unlock(&ctrlr->lock); nvmft_send_success(ctrlr->admin, nc); return; } default: nvmft_printf(ctrlr, "Unsupported feature ID %u for SET_FEATURES\n", fid); goto error; } error: nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD); nvmf_free_capsule(nc); } static bool update_cc(struct nvmft_controller *ctrlr, uint32_t new_cc, bool *need_shutdown) { struct nvmft_port *np = ctrlr->np; uint32_t changes; *need_shutdown = false; mtx_lock(&ctrlr->lock); /* Don't allow any changes while shutting down. 
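/*
 * Aside (not part of the patch): the cdw11 layout that handle_set_features()
 * above expects for the Number of Queues feature.  Both halves carry the same
 * 0-based count because Fabrics requires equal numbers of SQs and CQs; the
 * helper name is local to this sketch.
 */
#include <stdint.h>

static uint32_t
encode_num_queues(uint32_t io_queue_pairs)
{
	uint32_t zero_based = io_queue_pairs - 1;	/* 0-based; 65535 is invalid */

	/* NSQR in bits 15:0, NCQR in bits 31:16. */
	return (zero_based | zero_based << 16);
}
/* The granted count is echoed back, still 0-based, in cdw0 of the completion. */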
*/ if (ctrlr->shutdown) { mtx_unlock(&ctrlr->lock); return (false); } if (!_nvmf_validate_cc(np->max_io_qsize, np->cap, ctrlr->cc, new_cc)) { mtx_unlock(&ctrlr->lock); return (false); } changes = ctrlr->cc ^ new_cc; ctrlr->cc = new_cc; /* Handle shutdown requests. */ if (NVMEV(NVME_CC_REG_SHN, changes) != 0 && NVMEV(NVME_CC_REG_SHN, new_cc) != 0) { ctrlr->csts &= ~NVMEM(NVME_CSTS_REG_SHST); ctrlr->csts |= NVMEF(NVME_CSTS_REG_SHST, NVME_SHST_OCCURRING); ctrlr->cc &= ~NVMEM(NVME_CC_REG_EN); ctrlr->shutdown = true; *need_shutdown = true; nvmft_printf(ctrlr, "shutdown requested\n"); } if (NVMEV(NVME_CC_REG_EN, changes) != 0) { if (NVMEV(NVME_CC_REG_EN, new_cc) == 0) { /* Controller reset. */ nvmft_printf(ctrlr, "reset requested\n"); ctrlr->shutdown = true; *need_shutdown = true; } else ctrlr->csts |= NVMEF(NVME_CSTS_REG_RDY, 1); } mtx_unlock(&ctrlr->lock); return (true); } static void handle_property_get(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc, const struct nvmf_fabric_prop_get_cmd *pget) { struct nvmf_fabric_prop_get_rsp rsp; nvmft_init_cqe(&rsp, nc, 0); switch (le32toh(pget->ofst)) { case NVMF_PROP_CAP: if (pget->attrib.size != NVMF_PROP_SIZE_8) goto error; rsp.value.u64 = htole64(ctrlr->np->cap); break; case NVMF_PROP_VS: if (pget->attrib.size != NVMF_PROP_SIZE_4) goto error; rsp.value.u32.low = ctrlr->cdata.ver; break; case NVMF_PROP_CC: if (pget->attrib.size != NVMF_PROP_SIZE_4) goto error; rsp.value.u32.low = htole32(ctrlr->cc); break; case NVMF_PROP_CSTS: if (pget->attrib.size != NVMF_PROP_SIZE_4) goto error; rsp.value.u32.low = htole32(ctrlr->csts); break; default: goto error; } nvmft_send_response(ctrlr->admin, &rsp); return; error: nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD); } static void handle_property_set(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc, const struct nvmf_fabric_prop_set_cmd *pset) { bool need_shutdown; need_shutdown = false; switch (le32toh(pset->ofst)) { case NVMF_PROP_CC: if (pset->attrib.size != NVMF_PROP_SIZE_4) goto error; if (!update_cc(ctrlr, le32toh(pset->value.u32.low), &need_shutdown)) goto error; break; default: goto error; } nvmft_send_success(ctrlr->admin, nc); if (need_shutdown) { callout_stop(&ctrlr->ka_timer); taskqueue_enqueue(taskqueue_thread, &ctrlr->shutdown_task); } return; error: nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_FIELD); } static void handle_admin_fabrics_command(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc, const struct nvmf_fabric_cmd *fc) { switch (fc->fctype) { case NVMF_FABRIC_COMMAND_PROPERTY_GET: handle_property_get(ctrlr, nc, (const struct nvmf_fabric_prop_get_cmd *)fc); break; case NVMF_FABRIC_COMMAND_PROPERTY_SET: handle_property_set(ctrlr, nc, (const struct nvmf_fabric_prop_set_cmd *)fc); break; case NVMF_FABRIC_COMMAND_CONNECT: nvmft_printf(ctrlr, "CONNECT command on connected admin queue\n"); nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_COMMAND_SEQUENCE_ERROR); break; case NVMF_FABRIC_COMMAND_DISCONNECT: nvmft_printf(ctrlr, "DISCONNECT command on admin queue\n"); nvmft_send_error(ctrlr->admin, nc, NVME_SCT_COMMAND_SPECIFIC, NVMF_FABRIC_SC_INVALID_QUEUE_TYPE); break; default: nvmft_printf(ctrlr, "Unsupported fabrics command %#x\n", fc->fctype); nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_OPCODE); break; } nvmf_free_capsule(nc); } void nvmft_handle_admin_command(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc) { const struct nvme_command *cmd = nvmf_capsule_sqe(nc); /* Only permit Fabrics commands while a controller is 
disabled. */ if (NVMEV(NVME_CC_REG_EN, ctrlr->cc) == 0 && cmd->opc != NVME_OPC_FABRICS_COMMANDS) { nvmft_printf(ctrlr, "Unsupported admin opcode %#x while disabled\n", cmd->opc); nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_COMMAND_SEQUENCE_ERROR); nvmf_free_capsule(nc); return; } atomic_store_int(&ctrlr->ka_active_traffic, 1); switch (cmd->opc) { case NVME_OPC_GET_LOG_PAGE: handle_get_log_page(ctrlr, nc, cmd); break; case NVME_OPC_IDENTIFY: handle_identify_command(ctrlr, nc, cmd); break; case NVME_OPC_SET_FEATURES: handle_set_features(ctrlr, nc, cmd); break; case NVME_OPC_ASYNC_EVENT_REQUEST: mtx_lock(&ctrlr->lock); if (ctrlr->aer_pending == NVMFT_NUM_AER) { mtx_unlock(&ctrlr->lock); nvmft_send_error(ctrlr->admin, nc, NVME_SCT_COMMAND_SPECIFIC, NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); } else { /* NB: Store the CID without byte-swapping. */ ctrlr->aer_cids[ctrlr->aer_pidx] = cmd->cid; ctrlr->aer_pending++; ctrlr->aer_pidx = (ctrlr->aer_pidx + 1) % NVMFT_NUM_AER; mtx_unlock(&ctrlr->lock); } nvmf_free_capsule(nc); break; case NVME_OPC_KEEP_ALIVE: nvmft_send_success(ctrlr->admin, nc); nvmf_free_capsule(nc); break; case NVME_OPC_FABRICS_COMMANDS: handle_admin_fabrics_command(ctrlr, nc, (const struct nvmf_fabric_cmd *)cmd); break; default: nvmft_printf(ctrlr, "Unsupported admin opcode %#x\n", cmd->opc); nvmft_send_generic_error(ctrlr->admin, nc, NVME_SC_INVALID_OPCODE); nvmf_free_capsule(nc); break; } } void nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid, struct nvmf_capsule *nc) { struct nvmft_controller *ctrlr = nvmft_qpair_ctrlr(qp); const struct nvme_command *cmd = nvmf_capsule_sqe(nc); atomic_store_int(&ctrlr->ka_active_traffic, 1); switch (cmd->opc) { case NVME_OPC_FLUSH: if (cmd->nsid == htole32(0xffffffff)) { nvmft_send_generic_error(qp, nc, NVME_SC_INVALID_NAMESPACE_OR_FORMAT); nvmf_free_capsule(nc); break; } /* FALLTHROUGH */ case NVME_OPC_WRITE: case NVME_OPC_READ: case NVME_OPC_WRITE_UNCORRECTABLE: case NVME_OPC_COMPARE: case NVME_OPC_WRITE_ZEROES: case NVME_OPC_DATASET_MANAGEMENT: case NVME_OPC_VERIFY: nvmft_dispatch_command(qp, nc, false); break; default: nvmft_printf(ctrlr, "Unsupported I/O opcode %#x\n", cmd->opc); nvmft_send_generic_error(qp, nc, NVME_SC_INVALID_OPCODE); nvmf_free_capsule(nc); break; } } static void nvmft_report_aer(struct nvmft_controller *ctrlr, uint32_t aer_mask, u_int type, uint8_t info, uint8_t log_page_id) { struct nvme_completion cpl; MPASS(type <= 7); /* Drop events that are not enabled. */ mtx_lock(&ctrlr->lock); if ((ctrlr->aer_mask & aer_mask) == 0) { mtx_unlock(&ctrlr->lock); return; } /* * If there is no pending AER command, drop it. * XXX: Should we queue these? */ if (ctrlr->aer_pending == 0) { mtx_unlock(&ctrlr->lock); nvmft_printf(ctrlr, "dropping AER type %u, info %#x, page %#x\n", type, info, log_page_id); return; } memset(&cpl, 0, sizeof(cpl)); cpl.cid = ctrlr->aer_cids[ctrlr->aer_cidx]; ctrlr->aer_pending--; ctrlr->aer_cidx = (ctrlr->aer_cidx + 1) % NVMFT_NUM_AER; mtx_unlock(&ctrlr->lock); cpl.cdw0 = htole32(NVMEF(NVME_ASYNC_EVENT_TYPE, type) | NVMEF(NVME_ASYNC_EVENT_INFO, info) | NVMEF(NVME_ASYNC_EVENT_LOG_PAGE_ID, log_page_id)); nvmft_send_response(ctrlr->admin, &cpl); } void nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, int lun_id) { struct nvme_ns_list *nslist; uint32_t new_nsid, nsid; u_int i; new_nsid = lun_id + 1; mtx_lock(&ctrlr->lock); nslist = ctrlr->changed_ns; /* If the first entry is 0xffffffff, the list is already full. 
*/ if (nslist->ns[0] != 0xffffffff) { /* Find the insertion point for this namespace ID. */ for (i = 0; i < nitems(nslist->ns); i++) { nsid = le32toh(nslist->ns[i]); if (nsid == new_nsid) { /* Already reported, nothing to do. */ mtx_unlock(&ctrlr->lock); return; } if (nsid == 0 || nsid > new_nsid) break; } if (nslist->ns[nitems(nslist->ns) - 1] != htole32(0)) { /* List is full. */ memset(ctrlr->changed_ns, 0, sizeof(*ctrlr->changed_ns)); ctrlr->changed_ns->ns[0] = 0xffffffff; } else if (nslist->ns[i] == htole32(0)) { /* * Optimize case where this ID is appended to * the end. */ nslist->ns[i] = htole32(new_nsid); } else { memmove(&nslist->ns[i + 1], &nslist->ns[i], (nitems(nslist->ns) - i - 1) * sizeof(nslist->ns[0])); nslist->ns[i] = htole32(new_nsid); } } if (ctrlr->changed_ns_reported) { mtx_unlock(&ctrlr->lock); return; } ctrlr->changed_ns_reported = true; mtx_unlock(&ctrlr->lock); nvmft_report_aer(ctrlr, NVME_ASYNC_EVENT_NS_ATTRIBUTE, 0x2, 0x0, NVME_LOG_CHANGED_NAMESPACE); } diff --git a/sys/dev/nvmf/controller/nvmft_qpair.c b/sys/dev/nvmf/controller/nvmft_qpair.c index e66d98f38225..73c7bb280780 100644 --- a/sys/dev/nvmf/controller/nvmft_qpair.c +++ b/sys/dev/nvmf/controller/nvmft_qpair.c @@ -1,418 +1,415 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include #include #include #include /* * A bitmask of command ID values. This is used to detect duplicate * commands with the same ID. */ #define NUM_CIDS (UINT16_MAX + 1) BITSET_DEFINE(cidset, NUM_CIDS); struct nvmft_qpair { struct nvmft_controller *ctrlr; struct nvmf_qpair *qp; struct cidset *cids; bool admin; bool sq_flow_control; uint16_t qid; u_int qsize; uint16_t sqhd; - uint16_t sqtail; volatile u_int qp_refs; /* Internal references on 'qp'. */ struct task datamove_task; STAILQ_HEAD(, ctl_io_hdr) datamove_queue; struct mtx lock; char name[16]; }; static int _nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc, uint8_t sc_status); static void nvmft_datamove_task(void *context, int pending); static void nvmft_qpair_error(void *arg, int error) { struct nvmft_qpair *qp = arg; struct nvmft_controller *ctrlr = qp->ctrlr; /* * XXX: The Linux TCP initiator sends a RST immediately after * the FIN, so treat ECONNRESET as plain EOF to avoid spurious * errors on shutdown. */ if (error == ECONNRESET) error = 0; if (error != 0) nvmft_printf(ctrlr, "error %d on %s\n", error, qp->name); nvmft_controller_error(ctrlr, qp, error); } static void nvmft_receive_capsule(void *arg, struct nvmf_capsule *nc) { struct nvmft_qpair *qp = arg; struct nvmft_controller *ctrlr = qp->ctrlr; const struct nvme_command *cmd; uint8_t sc_status; cmd = nvmf_capsule_sqe(nc); if (ctrlr == NULL) { printf("NVMFT: %s received CID %u opcode %u on newborn queue\n", qp->name, le16toh(cmd->cid), cmd->opc); nvmf_free_capsule(nc); return; } sc_status = nvmf_validate_command_capsule(nc); if (sc_status != NVME_SC_SUCCESS) { _nvmft_send_generic_error(qp, nc, sc_status); nvmf_free_capsule(nc); return; } /* Don't bother byte-swapping CID. 
*/ if (BIT_TEST_SET_ATOMIC(NUM_CIDS, cmd->cid, qp->cids)) { _nvmft_send_generic_error(qp, nc, NVME_SC_COMMAND_ID_CONFLICT); nvmf_free_capsule(nc); return; } if (qp->admin) nvmft_handle_admin_command(ctrlr, nc); else nvmft_handle_io_command(qp, qp->qid, nc); } struct nvmft_qpair * -nvmft_qpair_init(enum nvmf_trtype trtype, - const struct nvmf_handoff_qpair_params *handoff, uint16_t qid, +nvmft_qpair_init(enum nvmf_trtype trtype, const nvlist_t *params, uint16_t qid, const char *name) { struct nvmft_qpair *qp; qp = malloc(sizeof(*qp), M_NVMFT, M_WAITOK | M_ZERO); - qp->admin = handoff->admin; - qp->sq_flow_control = handoff->sq_flow_control; - qp->qsize = handoff->qsize; + qp->admin = nvlist_get_bool(params, "admin"); + qp->sq_flow_control = nvlist_get_bool(params, "sq_flow_control"); + qp->qsize = nvlist_get_number(params, "qsize"); qp->qid = qid; - qp->sqhd = handoff->sqhd; - qp->sqtail = handoff->sqtail; + qp->sqhd = nvlist_get_number(params, "sqhd"); strlcpy(qp->name, name, sizeof(qp->name)); mtx_init(&qp->lock, "nvmft qp", NULL, MTX_DEF); qp->cids = BITSET_ALLOC(NUM_CIDS, M_NVMFT, M_WAITOK | M_ZERO); STAILQ_INIT(&qp->datamove_queue); TASK_INIT(&qp->datamove_task, 0, nvmft_datamove_task, qp); - qp->qp = nvmf_allocate_qpair(trtype, true, handoff, nvmft_qpair_error, + qp->qp = nvmf_allocate_qpair(trtype, true, params, nvmft_qpair_error, qp, nvmft_receive_capsule, qp); if (qp->qp == NULL) { mtx_destroy(&qp->lock); free(qp->cids, M_NVMFT); free(qp, M_NVMFT); return (NULL); } refcount_init(&qp->qp_refs, 1); return (qp); } void nvmft_qpair_shutdown(struct nvmft_qpair *qp) { STAILQ_HEAD(, ctl_io_hdr) datamove_queue; struct nvmf_qpair *nq; union ctl_io *io; STAILQ_INIT(&datamove_queue); mtx_lock(&qp->lock); nq = qp->qp; qp->qp = NULL; STAILQ_CONCAT(&datamove_queue, &qp->datamove_queue); mtx_unlock(&qp->lock); if (nq != NULL && refcount_release(&qp->qp_refs)) nvmf_free_qpair(nq); while (!STAILQ_EMPTY(&datamove_queue)) { io = (union ctl_io *)STAILQ_FIRST(&datamove_queue); STAILQ_REMOVE_HEAD(&datamove_queue, links); nvmft_abort_datamove(io); } nvmft_drain_task(&qp->datamove_task); } void nvmft_qpair_destroy(struct nvmft_qpair *qp) { nvmft_qpair_shutdown(qp); mtx_destroy(&qp->lock); free(qp->cids, M_NVMFT); free(qp, M_NVMFT); } struct nvmft_controller * nvmft_qpair_ctrlr(struct nvmft_qpair *qp) { return (qp->ctrlr); } uint16_t nvmft_qpair_id(struct nvmft_qpair *qp) { return (qp->qid); } const char * nvmft_qpair_name(struct nvmft_qpair *qp) { return (qp->name); } static int _nvmft_send_response(struct nvmft_qpair *qp, const void *cqe) { struct nvme_completion cpl; struct nvmf_qpair *nq; struct nvmf_capsule *rc; int error; memcpy(&cpl, cqe, sizeof(cpl)); mtx_lock(&qp->lock); nq = qp->qp; if (nq == NULL) { mtx_unlock(&qp->lock); return (ENOTCONN); } refcount_acquire(&qp->qp_refs); /* Set SQHD. */ if (qp->sq_flow_control) { qp->sqhd = (qp->sqhd + 1) % qp->qsize; cpl.sqhd = htole16(qp->sqhd); } else cpl.sqhd = 0; mtx_unlock(&qp->lock); rc = nvmf_allocate_response(nq, &cpl, M_WAITOK); error = nvmf_transmit_capsule(rc); nvmf_free_capsule(rc); if (refcount_release(&qp->qp_refs)) nvmf_free_qpair(nq); return (error); } void nvmft_command_completed(struct nvmft_qpair *qp, struct nvmf_capsule *nc) { const struct nvme_command *cmd = nvmf_capsule_sqe(nc); /* Don't bother byte-swapping CID. 
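/*
 * Aside (not part of the patch): a minimal sketch of the per-queue-pair
 * parameter nvlist consumed by nvmft_qpair_init() above.  Only the keys
 * visible in this patch are populated; the transport layer additionally
 * expects its own keys (for TCP, at least the connected socket descriptor),
 * which are assumed here and not shown.
 */
#include <sys/types.h>
#include <sys/nv.h>
#include <stdbool.h>

static nvlist_t *
make_qpair_params(bool admin, bool sq_flow_control, u_int qsize, u_int sqhd)
{
	nvlist_t *nvl;

	nvl = nvlist_create(0);
	nvlist_add_bool(nvl, "admin", admin);
	nvlist_add_bool(nvl, "sq_flow_control", sq_flow_control);
	nvlist_add_number(nvl, "qsize", qsize);
	nvlist_add_number(nvl, "sqhd", sqhd);
	return (nvl);
}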
*/ KASSERT(BIT_ISSET(NUM_CIDS, cmd->cid, qp->cids), ("%s: CID %u not busy", __func__, cmd->cid)); BIT_CLR_ATOMIC(NUM_CIDS, cmd->cid, qp->cids); } int nvmft_send_response(struct nvmft_qpair *qp, const void *cqe) { const struct nvme_completion *cpl = cqe; /* Don't bother byte-swapping CID. */ KASSERT(BIT_ISSET(NUM_CIDS, cpl->cid, qp->cids), ("%s: CID %u not busy", __func__, cpl->cid)); BIT_CLR_ATOMIC(NUM_CIDS, cpl->cid, qp->cids); return (_nvmft_send_response(qp, cqe)); } void nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status) { struct nvme_completion *cpl = cqe; const struct nvme_command *cmd = nvmf_capsule_sqe(nc); memset(cpl, 0, sizeof(*cpl)); cpl->cid = cmd->cid; cpl->status = htole16(status); } int nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc, uint8_t sc_type, uint8_t sc_status) { struct nvme_completion cpl; uint16_t status; status = NVMEF(NVME_STATUS_SCT, sc_type) | NVMEF(NVME_STATUS_SC, sc_status); nvmft_init_cqe(&cpl, nc, status); return (nvmft_send_response(qp, &cpl)); } int nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc, uint8_t sc_status) { return (nvmft_send_error(qp, nc, NVME_SCT_GENERIC, sc_status)); } /* * This version doesn't clear CID in qp->cids and is used for errors * before the CID is validated. */ static int _nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc, uint8_t sc_status) { struct nvme_completion cpl; uint16_t status; status = NVMEF(NVME_STATUS_SCT, NVME_SCT_GENERIC) | NVMEF(NVME_STATUS_SC, sc_status); nvmft_init_cqe(&cpl, nc, status); return (_nvmft_send_response(qp, &cpl)); } int nvmft_send_success(struct nvmft_qpair *qp, struct nvmf_capsule *nc) { return (nvmft_send_generic_error(qp, nc, NVME_SC_SUCCESS)); } static void nvmft_init_connect_rsp(struct nvmf_fabric_connect_rsp *rsp, const struct nvmf_fabric_connect_cmd *cmd, uint16_t status) { memset(rsp, 0, sizeof(*rsp)); rsp->cid = cmd->cid; rsp->status = htole16(status); } static int nvmft_send_connect_response(struct nvmft_qpair *qp, const struct nvmf_fabric_connect_rsp *rsp) { struct nvmf_capsule *rc; struct nvmf_qpair *nq; int error; mtx_lock(&qp->lock); nq = qp->qp; if (nq == NULL) { mtx_unlock(&qp->lock); return (ENOTCONN); } refcount_acquire(&qp->qp_refs); mtx_unlock(&qp->lock); rc = nvmf_allocate_response(qp->qp, rsp, M_WAITOK); error = nvmf_transmit_capsule(rc); nvmf_free_capsule(rc); if (refcount_release(&qp->qp_refs)) nvmf_free_qpair(nq); return (error); } void nvmft_connect_error(struct nvmft_qpair *qp, const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type, uint8_t sc_status) { struct nvmf_fabric_connect_rsp rsp; uint16_t status; status = NVMEF(NVME_STATUS_SCT, sc_type) | NVMEF(NVME_STATUS_SC, sc_status); nvmft_init_connect_rsp(&rsp, cmd, status); nvmft_send_connect_response(qp, &rsp); } void nvmft_connect_invalid_parameters(struct nvmft_qpair *qp, const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset) { struct nvmf_fabric_connect_rsp rsp; nvmft_init_connect_rsp(&rsp, cmd, NVMEF(NVME_STATUS_SCT, NVME_SCT_COMMAND_SPECIFIC) | NVMEF(NVME_STATUS_SC, NVMF_FABRIC_SC_INVALID_PARAM)); rsp.status_code_specific.invalid.ipo = htole16(offset); rsp.status_code_specific.invalid.iattr = data ? 
1 : 0; nvmft_send_connect_response(qp, &rsp); } int nvmft_finish_accept(struct nvmft_qpair *qp, const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr) { struct nvmf_fabric_connect_rsp rsp; qp->ctrlr = ctrlr; nvmft_init_connect_rsp(&rsp, cmd, 0); if (qp->sq_flow_control) rsp.sqhd = htole16(qp->sqhd); else rsp.sqhd = htole16(0xffff); rsp.status_code_specific.success.cntlid = htole16(ctrlr->cntlid); return (nvmft_send_connect_response(qp, &rsp)); } void nvmft_qpair_datamove(struct nvmft_qpair *qp, union ctl_io *io) { bool enqueue_task; mtx_lock(&qp->lock); if (qp->qp == NULL) { mtx_unlock(&qp->lock); nvmft_abort_datamove(io); return; } enqueue_task = STAILQ_EMPTY(&qp->datamove_queue); STAILQ_INSERT_TAIL(&qp->datamove_queue, &io->io_hdr, links); mtx_unlock(&qp->lock); if (enqueue_task) nvmft_enqueue_task(&qp->datamove_task); } static void nvmft_datamove_task(void *context, int pending __unused) { struct nvmft_qpair *qp = context; union ctl_io *io; bool abort; mtx_lock(&qp->lock); while (!STAILQ_EMPTY(&qp->datamove_queue)) { io = (union ctl_io *)STAILQ_FIRST(&qp->datamove_queue); STAILQ_REMOVE_HEAD(&qp->datamove_queue, links); abort = (qp->qp == NULL); mtx_unlock(&qp->lock); if (abort) nvmft_abort_datamove(io); else nvmft_handle_datamove(io); mtx_lock(&qp->lock); } mtx_unlock(&qp->lock); } diff --git a/sys/dev/nvmf/controller/nvmft_var.h b/sys/dev/nvmf/controller/nvmft_var.h index 4fda297c8a85..7a1748d5999e 100644 --- a/sys/dev/nvmf/controller/nvmft_var.h +++ b/sys/dev/nvmf/controller/nvmft_var.h @@ -1,179 +1,178 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #ifndef __NVMFT_VAR_H__ #define __NVMFT_VAR_H__ #include +#include #include #include #include #include #include #include struct nvmf_capsule; struct nvmft_controller; struct nvmft_qpair; #define NVMFT_NUM_AER 16 struct nvmft_port { TAILQ_ENTRY(nvmft_port) link; u_int refs; struct ctl_port port; struct nvme_controller_data cdata; struct nvme_firmware_page fp; uint64_t cap; uint32_t max_io_qsize; bool online; struct sx lock; struct unrhdr *ids; TAILQ_HEAD(, nvmft_controller) controllers; uint32_t *active_ns; u_int num_ns; }; struct nvmft_io_qpair { struct nvmft_qpair *qp; bool shutdown; }; struct nvmft_controller { struct nvmft_qpair *admin; struct nvmft_io_qpair *io_qpairs; u_int num_io_queues; bool shutdown; bool admin_closed; uint16_t cntlid; uint32_t cc; uint32_t csts; struct nvmft_port *np; struct mtx lock; struct nvme_controller_data cdata; struct nvme_health_information_page hip; sbintime_t create_time; sbintime_t start_busy; sbintime_t busy_total; uint16_t partial_dur; uint16_t partial_duw; uint8_t hostid[16]; uint8_t hostnqn[NVME_NQN_FIELD_SIZE]; u_int trtype; TAILQ_ENTRY(nvmft_controller) link; /* * Each queue can have at most UINT16_MAX commands, so the total * across all queues will fit in a uint32_t. */ uint32_t pending_commands; volatile int ka_active_traffic; struct callout ka_timer; sbintime_t ka_sbt; /* AER fields. */ uint32_t aer_mask; uint16_t aer_cids[NVMFT_NUM_AER]; uint8_t aer_pending; uint8_t aer_cidx; uint8_t aer_pidx; /* Changed namespace IDs. 
*/ struct nvme_ns_list *changed_ns; bool changed_ns_reported; struct task shutdown_task; struct timeout_task terminate_task; }; MALLOC_DECLARE(M_NVMFT); /* ctl_frontend_nvmf.c */ void nvmft_port_free(struct nvmft_port *np); void nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid, struct nvme_ns_list *nslist); void nvmft_dispatch_command(struct nvmft_qpair *qp, struct nvmf_capsule *nc, bool admin); void nvmft_terminate_commands(struct nvmft_controller *ctrlr); void nvmft_abort_datamove(union ctl_io *io); void nvmft_handle_datamove(union ctl_io *io); void nvmft_drain_task(struct task *task); void nvmft_enqueue_task(struct task *task); /* nvmft_controller.c */ void nvmft_controller_error(struct nvmft_controller *ctrlr, struct nvmft_qpair *qp, int error); void nvmft_controller_lun_changed(struct nvmft_controller *ctrlr, int lun_id); void nvmft_handle_admin_command(struct nvmft_controller *ctrlr, struct nvmf_capsule *nc); void nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid, struct nvmf_capsule *nc); int nvmft_handoff_admin_queue(struct nvmft_port *np, - const struct nvmf_handoff_controller_qpair *handoff, + enum nvmf_trtype trtype, const nvlist_t *params, const struct nvmf_fabric_connect_cmd *cmd, const struct nvmf_fabric_connect_data *data); -int nvmft_handoff_io_queue(struct nvmft_port *np, - const struct nvmf_handoff_controller_qpair *handoff, - const struct nvmf_fabric_connect_cmd *cmd, +int nvmft_handoff_io_queue(struct nvmft_port *np, enum nvmf_trtype trtype, + const nvlist_t *params, const struct nvmf_fabric_connect_cmd *cmd, const struct nvmf_fabric_connect_data *data); int nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...) __printflike(2, 3); /* nvmft_qpair.c */ struct nvmft_qpair *nvmft_qpair_init(enum nvmf_trtype trtype, - const struct nvmf_handoff_qpair_params *handoff, uint16_t qid, - const char *name); + const nvlist_t *params, uint16_t qid, const char *name); void nvmft_qpair_shutdown(struct nvmft_qpair *qp); void nvmft_qpair_destroy(struct nvmft_qpair *qp); struct nvmft_controller *nvmft_qpair_ctrlr(struct nvmft_qpair *qp); void nvmft_qpair_datamove(struct nvmft_qpair *qp, union ctl_io *io); uint16_t nvmft_qpair_id(struct nvmft_qpair *qp); const char *nvmft_qpair_name(struct nvmft_qpair *qp); void nvmft_command_completed(struct nvmft_qpair *qp, struct nvmf_capsule *nc); int nvmft_send_response(struct nvmft_qpair *qp, const void *cqe); void nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status); int nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc, uint8_t sc_type, uint8_t sc_status); int nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc, uint8_t sc_status); int nvmft_send_success(struct nvmft_qpair *qp, struct nvmf_capsule *nc); void nvmft_connect_error(struct nvmft_qpair *qp, const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type, uint8_t sc_status); void nvmft_connect_invalid_parameters(struct nvmft_qpair *qp, const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset); int nvmft_finish_accept(struct nvmft_qpair *qp, const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr); static __inline void nvmft_port_ref(struct nvmft_port *np) { refcount_acquire(&np->refs); } static __inline void nvmft_port_rele(struct nvmft_port *np) { if (refcount_release(&np->refs)) nvmft_port_free(np); } #endif /* !__NVMFT_VAR_H__ */ diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c index c726e36e1fae..09d5cecdfad6 100644 --- a/sys/dev/nvmf/host/nvmf.c +++ 
b/sys/dev/nvmf/host/nvmf.c @@ -1,1099 +1,1132 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include +#include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include static struct cdevsw nvmf_cdevsw; bool nvmf_fail_disconnect = false; SYSCTL_BOOL(_kern_nvmf, OID_AUTO, fail_on_disconnection, CTLFLAG_RWTUN, &nvmf_fail_disconnect, 0, "Fail I/O requests on connection failure"); MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host"); static void nvmf_disconnect_task(void *arg, int pending); static void nvmf_shutdown_pre_sync(void *arg, int howto); static void nvmf_shutdown_post_sync(void *arg, int howto); void nvmf_complete(void *arg, const struct nvme_completion *cqe) { struct nvmf_completion_status *status = arg; struct mtx *mtx; status->cqe = *cqe; mtx = mtx_pool_find(mtxpool_sleep, status); mtx_lock(mtx); status->done = true; mtx_unlock(mtx); wakeup(status); } void nvmf_io_complete(void *arg, size_t xfered, int error) { struct nvmf_completion_status *status = arg; struct mtx *mtx; status->io_error = error; mtx = mtx_pool_find(mtxpool_sleep, status); mtx_lock(mtx); status->io_done = true; mtx_unlock(mtx); wakeup(status); } void nvmf_wait_for_reply(struct nvmf_completion_status *status) { struct mtx *mtx; mtx = mtx_pool_find(mtxpool_sleep, status); mtx_lock(mtx); while (!status->done || !status->io_done) mtx_sleep(status, mtx, 0, "nvmfcmd", 0); mtx_unlock(mtx); } static int nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, uint64_t *value) { const struct nvmf_fabric_prop_get_rsp *rsp; struct nvmf_completion_status status; nvmf_status_init(&status); if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status, M_WAITOK)) return (ECONNABORTED); nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n", le16toh(status.cqe.status)); return (EIO); } rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe; if (size == 8) *value = le64toh(rsp->value.u64); else *value = le32toh(rsp->value.u32.low); return (0); } static int nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, uint64_t value) { struct nvmf_completion_status status; nvmf_status_init(&status); if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status, M_WAITOK)) return (ECONNABORTED); nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n", le16toh(status.cqe.status)); return (EIO); } return (0); } static void nvmf_shutdown_controller(struct nvmf_softc *sc) { uint64_t cc; int error; error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc); if (error != 0) { device_printf(sc->dev, "Failed to fetch CC for shutdown\n"); return; } cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL); error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc); if (error != 0) device_printf(sc->dev, "Failed to set CC to trigger shutdown\n"); } static void nvmf_check_keep_alive(void *arg) { struct nvmf_softc *sc = arg; int traffic; traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic); if (traffic == 0) { device_printf(sc->dev, "disconnecting due to KeepAlive timeout\n"); nvmf_disconnect(sc); return; } callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK); } static void nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe) { struct nvmf_softc *sc = arg; 
atomic_store_int(&sc->ka_active_rx_traffic, 1); if (cqe->status != 0) { device_printf(sc->dev, "KeepAlive response reported status %#x\n", le16toh(cqe->status)); } } static void nvmf_send_keep_alive(void *arg) { struct nvmf_softc *sc = arg; int traffic; /* * Don't bother sending a KeepAlive command if TKAS is active * and another command has been sent during the interval. */ traffic = atomic_load_int(&sc->ka_active_tx_traffic); if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete, sc, M_NOWAIT)) device_printf(sc->dev, "Failed to allocate KeepAlive command\n"); /* Clear ka_active_tx_traffic after sending the keep alive command. */ atomic_store_int(&sc->ka_active_tx_traffic, 0); callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK); } int -nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh) +nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp) { - size_t len; - u_int i; + const nvlist_t *const *io; + const nvlist_t *admin; + nvlist_t *nvl; + size_t i, num_io_queues; + uint32_t qsize; int error; - memset(ivars, 0, sizeof(*ivars)); - - if (!hh->admin.admin || hh->num_io_queues < 1) - return (EINVAL); - - ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK); - error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata)); - if (error != 0) - goto out; - nvme_controller_data_swapbytes(ivars->cdata); - - len = hh->num_io_queues * sizeof(*ivars->io_params); - ivars->io_params = malloc(len, M_NVMF, M_WAITOK); - error = copyin(hh->io, ivars->io_params, len); + error = nvmf_unpack_ioc_nvlist(nv, &nvl); if (error != 0) - goto out; - for (i = 0; i < hh->num_io_queues; i++) { - if (ivars->io_params[i].admin) { - error = EINVAL; - goto out; - } + return (error); - /* Require all I/O queues to be the same size. */ - if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) { - error = EINVAL; - goto out; - } + if (!nvlist_exists_number(nvl, "trtype") || + !nvlist_exists_nvlist(nvl, "admin") || + !nvlist_exists_nvlist_array(nvl, "io") || + !nvlist_exists_binary(nvl, "cdata")) + goto invalid; + + admin = nvlist_get_nvlist(nvl, "admin"); + if (!nvmf_validate_qpair_nvlist(admin, false)) + goto invalid; + if (!nvlist_get_bool(admin, "admin")) + goto invalid; + + io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues); + if (num_io_queues < 1) + goto invalid; + for (i = 0; i < num_io_queues; i++) { + if (!nvmf_validate_qpair_nvlist(io[i], false)) + goto invalid; } - ivars->hh = hh; - return (0); + /* Require all I/O queues to be the same size. 
*/ + qsize = nvlist_get_number(io[0], "qsize"); + for (i = 1; i < num_io_queues; i++) { + if (nvlist_get_number(io[i], "qsize") != qsize) + goto invalid; + } -out: - free(ivars->io_params, M_NVMF); - free(ivars->cdata, M_NVMF); - return (error); -} + nvlist_get_binary(nvl, "cdata", &i); + if (i != sizeof(struct nvme_controller_data)) + goto invalid; -void -nvmf_free_ivars(struct nvmf_ivars *ivars) -{ - free(ivars->io_params, M_NVMF); - free(ivars->cdata, M_NVMF); + *nvlp = nvl; + return (0); +invalid: + nvlist_destroy(nvl); + return (EINVAL); } static int nvmf_probe(device_t dev) { - struct nvmf_ivars *ivars = device_get_ivars(dev); + const nvlist_t *nvl = device_get_ivars(dev); + const struct nvme_controller_data *cdata; - if (ivars == NULL) + if (nvl == NULL) return (ENXIO); - device_set_descf(dev, "Fabrics: %.256s", ivars->cdata->subnqn); + cdata = nvlist_get_binary(nvl, "cdata", NULL); + device_set_descf(dev, "Fabrics: %.256s", cdata->subnqn); return (BUS_PROBE_DEFAULT); } static int -nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars) +nvmf_establish_connection(struct nvmf_softc *sc, const nvlist_t *nvl) { + const nvlist_t *const *io; + const nvlist_t *admin; + uint64_t kato; + size_t num_io_queues; + enum nvmf_trtype trtype; char name[16]; + trtype = nvlist_get_number(nvl, "trtype"); + admin = nvlist_get_nvlist(nvl, "admin"); + io = nvlist_get_nvlist_array(nvl, "io", &num_io_queues); + kato = dnvlist_get_number(nvl, "kato", 0); + /* Setup the admin queue. */ - sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin, - "admin queue", 0); + sc->admin = nvmf_init_qp(sc, trtype, admin, "admin queue", 0); if (sc->admin == NULL) { device_printf(sc->dev, "Failed to setup admin queue\n"); return (ENXIO); } /* Setup I/O queues. */ - sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF, + sc->io = malloc(num_io_queues * sizeof(*sc->io), M_NVMF, M_WAITOK | M_ZERO); - sc->num_io_queues = ivars->hh->num_io_queues; + sc->num_io_queues = num_io_queues; for (u_int i = 0; i < sc->num_io_queues; i++) { snprintf(name, sizeof(name), "I/O queue %u", i); - sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype, - &ivars->io_params[i], name, i); + sc->io[i] = nvmf_init_qp(sc, trtype, io[i], name, i); if (sc->io[i] == NULL) { device_printf(sc->dev, "Failed to setup I/O queue %u\n", i + 1); return (ENXIO); } } /* Start KeepAlive timers. 
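/*
 * Aside (not part of the patch): a sketch of assembling the host handoff
 * nvlist that nvmf_copyin_handoff() above validates: "trtype", an "admin"
 * queue-pair nvlist (with its "admin" flag set), an "io" array of queue-pair
 * nvlists that all share the same "qsize", the controller data blob, and an
 * optional "kato" in milliseconds.  The helper name is local to this sketch,
 * and the queue-pair nvlists are assumed to also carry their
 * transport-specific keys.
 */
#include <sys/types.h>
#include <sys/nv.h>
#include <dev/nvme/nvme.h>	/* struct nvme_controller_data */

static nvlist_t *
make_host_handoff(uint8_t trtype, nvlist_t *admin, nvlist_t **io, size_t nio,
    const struct nvme_controller_data *cdata, uint32_t kato_ms)
{
	nvlist_t *nvl;

	nvl = nvlist_create(0);
	nvlist_add_number(nvl, "trtype", trtype);
	nvlist_move_nvlist(nvl, "admin", admin);	/* nvl takes ownership */
	nvlist_add_nvlist_array(nvl, "io", (const nvlist_t * const *)io, nio);
	nvlist_add_binary(nvl, "cdata", cdata, sizeof(*cdata));
	if (kato_ms != 0)
		nvlist_add_number(nvl, "kato", kato_ms);
	return (nvl);
}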
*/ - if (ivars->hh->kato != 0) { + if (kato != 0) { sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS, sc->cdata->ctratt) != 0; - sc->ka_rx_sbt = mstosbt(ivars->hh->kato); + sc->ka_rx_sbt = mstosbt(kato); sc->ka_tx_sbt = sc->ka_rx_sbt / 2; callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, nvmf_check_keep_alive, sc, C_HARDCLOCK); callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, nvmf_send_keep_alive, sc, C_HARDCLOCK); } + memcpy(sc->cdata, nvlist_get_binary(nvl, "cdata", NULL), + sizeof(*sc->cdata)); + return (0); } typedef bool nvmf_scan_active_ns_cb(struct nvmf_softc *, uint32_t, const struct nvme_namespace_data *, void *); static bool nvmf_scan_active_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist, struct nvme_namespace_data *data, uint32_t *nsidp, nvmf_scan_active_ns_cb *cb, void *cb_arg) { struct nvmf_completion_status status; uint32_t nsid; nvmf_status_init(&status); nvmf_status_wait_io(&status); if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist, nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) { device_printf(sc->dev, "failed to send IDENTIFY active namespaces command\n"); return (false); } nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "IDENTIFY active namespaces failed, status %#x\n", le16toh(status.cqe.status)); return (false); } if (status.io_error != 0) { device_printf(sc->dev, "IDENTIFY active namespaces failed with I/O error %d\n", status.io_error); return (false); } for (u_int i = 0; i < nitems(nslist->ns); i++) { nsid = nslist->ns[i]; if (nsid == 0) { *nsidp = 0; return (true); } nvmf_status_init(&status); nvmf_status_wait_io(&status); if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) { device_printf(sc->dev, "failed to send IDENTIFY namespace %u command\n", nsid); return (false); } nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed, status %#x\n", nsid, le16toh(status.cqe.status)); return (false); } if (status.io_error != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed with I/O error %d\n", nsid, status.io_error); return (false); } nvme_namespace_data_swapbytes(data); if (!cb(sc, nsid, data, cb_arg)) return (false); } MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0); if (nsid >= NVME_GLOBAL_NAMESPACE_TAG - 1) *nsidp = 0; else *nsidp = nsid; return (true); } static bool nvmf_scan_active_namespaces(struct nvmf_softc *sc, nvmf_scan_active_ns_cb *cb, void *cb_arg) { struct nvme_namespace_data *data; struct nvme_ns_list *nslist; uint32_t nsid; bool retval; nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK); data = malloc(sizeof(*data), M_NVMF, M_WAITOK); nsid = 0; retval = true; for (;;) { if (!nvmf_scan_active_nslist(sc, nslist, data, &nsid, cb, cb_arg)) { retval = false; break; } if (nsid == 0) break; } free(data, M_NVMF); free(nslist, M_NVMF); return (retval); } static bool nvmf_add_ns(struct nvmf_softc *sc, uint32_t nsid, const struct nvme_namespace_data *data, void *arg __unused) { if (sc->ns[nsid - 1] != NULL) { device_printf(sc->dev, "duplicate namespace %u in active namespace list\n", nsid); return (false); } /* * As in nvme_ns_construct, a size of zero indicates an * invalid namespace. 
*/ if (data->nsze == 0) { device_printf(sc->dev, "ignoring active namespace %u with zero size\n", nsid); return (true); } sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); nvmf_sim_rescan_ns(sc, nsid); return (true); } static bool nvmf_add_namespaces(struct nvmf_softc *sc) { sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF, M_WAITOK | M_ZERO); return (nvmf_scan_active_namespaces(sc, nvmf_add_ns, NULL)); } static int nvmf_attach(device_t dev) { struct make_dev_args mda; struct nvmf_softc *sc = device_get_softc(dev); - struct nvmf_ivars *ivars = device_get_ivars(dev); + const nvlist_t *nvl = device_get_ivars(dev); + const nvlist_t * const *io; struct sysctl_oid *oid; uint64_t val; u_int i; int error; - if (ivars == NULL) + if (nvl == NULL) return (ENXIO); sc->dev = dev; - sc->trtype = ivars->hh->trtype; + sc->trtype = nvlist_get_number(nvl, "trtype"); callout_init(&sc->ka_rx_timer, 1); callout_init(&sc->ka_tx_timer, 1); sx_init(&sc->connection_lock, "nvmf connection"); TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc); oid = SYSCTL_ADD_NODE(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "ioq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queues"); sc->ioq_oid_list = SYSCTL_CHILDREN(oid); - /* Claim the cdata pointer from ivars. */ - sc->cdata = ivars->cdata; - ivars->cdata = NULL; + sc->cdata = malloc(sizeof(*sc->cdata), M_NVMF, M_WAITOK); nvmf_init_aer(sc); - error = nvmf_establish_connection(sc, ivars); + error = nvmf_establish_connection(sc, nvl); if (error != 0) goto out; error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap); if (error != 0) { device_printf(sc->dev, "Failed to fetch CAP\n"); error = ENXIO; goto out; } error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val); if (error != 0) { device_printf(sc->dev, "Failed to fetch VS\n"); error = ENXIO; goto out; } sc->vs = val; /* Honor MDTS if it is set. 
*/ sc->max_xfer_size = maxphys; if (sc->cdata->mdts != 0) { sc->max_xfer_size = ulmin(sc->max_xfer_size, 1 << (sc->cdata->mdts + NVME_MPS_SHIFT + NVME_CAP_HI_MPSMIN(sc->cap >> 32))); } - sc->max_pending_io = ivars->io_params[0].qsize * sc->num_io_queues; + io = nvlist_get_nvlist_array(nvl, "io", NULL); + sc->max_pending_io = nvlist_get_number(io[0], "qsize") * + sc->num_io_queues; error = nvmf_init_sim(sc); if (error != 0) goto out; error = nvmf_start_aer(sc); if (error != 0) { nvmf_destroy_sim(sc); goto out; } if (!nvmf_add_namespaces(sc)) { nvmf_destroy_sim(sc); goto out; } make_dev_args_init(&mda); mda.mda_devsw = &nvmf_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev)); if (error != 0) { nvmf_destroy_sim(sc); goto out; } sc->shutdown_pre_sync_eh = EVENTHANDLER_REGISTER(shutdown_pre_sync, nvmf_shutdown_pre_sync, sc, SHUTDOWN_PRI_FIRST); sc->shutdown_post_sync_eh = EVENTHANDLER_REGISTER(shutdown_post_sync, nvmf_shutdown_post_sync, sc, SHUTDOWN_PRI_FIRST); return (0); out: if (sc->ns != NULL) { for (i = 0; i < sc->cdata->nn; i++) { if (sc->ns[i] != NULL) nvmf_destroy_ns(sc->ns[i]); } free(sc->ns, M_NVMF); } callout_drain(&sc->ka_tx_timer); callout_drain(&sc->ka_rx_timer); if (sc->admin != NULL) nvmf_shutdown_controller(sc); for (i = 0; i < sc->num_io_queues; i++) { if (sc->io[i] != NULL) nvmf_destroy_qp(sc->io[i]); } free(sc->io, M_NVMF); if (sc->admin != NULL) nvmf_destroy_qp(sc->admin); nvmf_destroy_aer(sc); taskqueue_drain(taskqueue_thread, &sc->disconnect_task); sx_destroy(&sc->connection_lock); free(sc->cdata, M_NVMF); return (error); } void nvmf_disconnect(struct nvmf_softc *sc) { taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task); } static void nvmf_disconnect_task(void *arg, int pending __unused) { struct nvmf_softc *sc = arg; u_int i; sx_xlock(&sc->connection_lock); if (sc->admin == NULL) { /* * Ignore transport errors if there is no active * association. */ sx_xunlock(&sc->connection_lock); return; } if (sc->detaching) { if (sc->admin != NULL) { /* * This unsticks the detach process if a * transport error occurs during detach. */ nvmf_shutdown_qp(sc->admin); } sx_xunlock(&sc->connection_lock); return; } if (sc->cdev == NULL) { /* * Transport error occurred during attach (nvmf_add_namespaces). * Shutdown the admin queue. */ nvmf_shutdown_qp(sc->admin); sx_xunlock(&sc->connection_lock); return; } callout_drain(&sc->ka_tx_timer); callout_drain(&sc->ka_rx_timer); sc->ka_traffic = false; /* Quiesce namespace consumers. */ nvmf_disconnect_sim(sc); for (i = 0; i < sc->cdata->nn; i++) { if (sc->ns[i] != NULL) nvmf_disconnect_ns(sc->ns[i]); } /* Shutdown the existing qpairs. */ for (i = 0; i < sc->num_io_queues; i++) { nvmf_destroy_qp(sc->io[i]); } free(sc->io, M_NVMF); sc->io = NULL; sc->num_io_queues = 0; nvmf_destroy_qp(sc->admin); sc->admin = NULL; sx_xunlock(&sc->connection_lock); } static int -nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh) +nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv) { - struct nvmf_ivars ivars; + const struct nvme_controller_data *cdata; + nvlist_t *nvl; u_int i; int error; + error = nvmf_copyin_handoff(nv, &nvl); + if (error != 0) + return (error); + /* XXX: Should we permit changing the transport type? 
*/ - if (sc->trtype != hh->trtype) { + if (sc->trtype != nvlist_get_number(nvl, "trtype")) { device_printf(sc->dev, "transport type mismatch on reconnect\n"); return (EINVAL); } - error = nvmf_init_ivars(&ivars, hh); - if (error != 0) - return (error); - sx_xlock(&sc->connection_lock); if (sc->admin != NULL || sc->detaching) { error = EBUSY; goto out; } /* * Ensure this is for the same controller. Note that the * controller ID can vary across associations if the remote * system is using the dynamic controller model. This merely * ensures the new association is connected to the same NVMe * subsystem. */ - if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn, - sizeof(ivars.cdata->subnqn)) != 0) { + cdata = nvlist_get_binary(nvl, "cdata", NULL); + if (memcmp(sc->cdata->subnqn, cdata->subnqn, + sizeof(cdata->subnqn)) != 0) { device_printf(sc->dev, "controller subsystem NQN mismatch on reconnect\n"); error = EINVAL; goto out; } /* * XXX: Require same number and size of I/O queues so that * max_pending_io is still correct? */ - error = nvmf_establish_connection(sc, &ivars); + error = nvmf_establish_connection(sc, nvl); if (error != 0) goto out; error = nvmf_start_aer(sc); if (error != 0) goto out; device_printf(sc->dev, "established new association with %u I/O queues\n", sc->num_io_queues); /* Restart namespace consumers. */ for (i = 0; i < sc->cdata->nn; i++) { if (sc->ns[i] != NULL) nvmf_reconnect_ns(sc->ns[i]); } nvmf_reconnect_sim(sc); nvmf_rescan_all_ns(sc); out: sx_xunlock(&sc->connection_lock); - nvmf_free_ivars(&ivars); + nvlist_destroy(nvl); return (error); } static void nvmf_shutdown_pre_sync(void *arg, int howto) { struct nvmf_softc *sc = arg; if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED()) return; /* * If this association is disconnected, abort any pending * requests with an error to permit filesystems to unmount * without hanging. */ sx_xlock(&sc->connection_lock); if (sc->admin != NULL || sc->detaching) { sx_xunlock(&sc->connection_lock); return; } for (u_int i = 0; i < sc->cdata->nn; i++) { if (sc->ns[i] != NULL) nvmf_shutdown_ns(sc->ns[i]); } nvmf_shutdown_sim(sc); sx_xunlock(&sc->connection_lock); } static void nvmf_shutdown_post_sync(void *arg, int howto) { struct nvmf_softc *sc = arg; if ((howto & RB_NOSYNC) != 0 || SCHEDULER_STOPPED()) return; /* * If this association is connected, disconnect gracefully. 
*/ sx_xlock(&sc->connection_lock); if (sc->admin == NULL || sc->detaching) { sx_xunlock(&sc->connection_lock); return; } callout_drain(&sc->ka_tx_timer); callout_drain(&sc->ka_rx_timer); nvmf_shutdown_controller(sc); for (u_int i = 0; i < sc->num_io_queues; i++) { nvmf_destroy_qp(sc->io[i]); } nvmf_destroy_qp(sc->admin); sc->admin = NULL; sx_xunlock(&sc->connection_lock); } static int nvmf_detach(device_t dev) { struct nvmf_softc *sc = device_get_softc(dev); u_int i; destroy_dev(sc->cdev); sx_xlock(&sc->connection_lock); sc->detaching = true; sx_xunlock(&sc->connection_lock); EVENTHANDLER_DEREGISTER(shutdown_pre_sync, sc->shutdown_pre_sync_eh); EVENTHANDLER_DEREGISTER(shutdown_post_sync, sc->shutdown_post_sync_eh); nvmf_destroy_sim(sc); for (i = 0; i < sc->cdata->nn; i++) { if (sc->ns[i] != NULL) nvmf_destroy_ns(sc->ns[i]); } free(sc->ns, M_NVMF); callout_drain(&sc->ka_tx_timer); callout_drain(&sc->ka_rx_timer); if (sc->admin != NULL) nvmf_shutdown_controller(sc); for (i = 0; i < sc->num_io_queues; i++) { nvmf_destroy_qp(sc->io[i]); } free(sc->io, M_NVMF); taskqueue_drain(taskqueue_thread, &sc->disconnect_task); if (sc->admin != NULL) nvmf_destroy_qp(sc->admin); nvmf_destroy_aer(sc); sx_destroy(&sc->connection_lock); free(sc->cdata, M_NVMF); return (0); } static void nvmf_rescan_ns_1(struct nvmf_softc *sc, uint32_t nsid, const struct nvme_namespace_data *data) { struct nvmf_namespace *ns; /* XXX: Needs locking around sc->ns[]. */ ns = sc->ns[nsid - 1]; if (data->nsze == 0) { /* XXX: Needs locking */ if (ns != NULL) { nvmf_destroy_ns(ns); sc->ns[nsid - 1] = NULL; } } else { /* XXX: Needs locking */ if (ns == NULL) { sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); } else { if (!nvmf_update_ns(ns, data)) { nvmf_destroy_ns(ns); sc->ns[nsid - 1] = NULL; } } } nvmf_sim_rescan_ns(sc, nsid); } void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid) { struct nvmf_completion_status status; struct nvme_namespace_data *data; data = malloc(sizeof(*data), M_NVMF, M_WAITOK); nvmf_status_init(&status); nvmf_status_wait_io(&status); if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) { device_printf(sc->dev, "failed to send IDENTIFY namespace %u command\n", nsid); free(data, M_NVMF); return; } nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed, status %#x\n", nsid, le16toh(status.cqe.status)); free(data, M_NVMF); return; } if (status.io_error != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed with I/O error %d\n", nsid, status.io_error); free(data, M_NVMF); return; } nvme_namespace_data_swapbytes(data); nvmf_rescan_ns_1(sc, nsid, data); free(data, M_NVMF); } static void nvmf_purge_namespaces(struct nvmf_softc *sc, uint32_t first_nsid, uint32_t next_valid_nsid) { struct nvmf_namespace *ns; for (uint32_t nsid = first_nsid; nsid < next_valid_nsid; nsid++) { /* XXX: Needs locking around sc->ns[]. */ ns = sc->ns[nsid - 1]; if (ns != NULL) { nvmf_destroy_ns(ns); sc->ns[nsid - 1] = NULL; nvmf_sim_rescan_ns(sc, nsid); } } } static bool nvmf_rescan_ns_cb(struct nvmf_softc *sc, uint32_t nsid, const struct nvme_namespace_data *data, void *arg) { uint32_t *last_nsid = arg; /* Check for any gaps prior to this namespace. 
*/ nvmf_purge_namespaces(sc, *last_nsid + 1, nsid); *last_nsid = nsid; nvmf_rescan_ns_1(sc, nsid, data); return (true); } void nvmf_rescan_all_ns(struct nvmf_softc *sc) { uint32_t last_nsid; last_nsid = 0; if (!nvmf_scan_active_namespaces(sc, nvmf_rescan_ns_cb, &last_nsid)) return; /* * Check for any namespace devices after the last active * namespace. */ nvmf_purge_namespaces(sc, last_nsid + 1, sc->cdata->nn + 1); } int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, bool admin) { struct nvmf_completion_status status; struct nvme_command cmd; struct memdesc mem; struct nvmf_host_qpair *qp; struct nvmf_request *req; void *buf; int error; if (pt->len > sc->max_xfer_size) return (EINVAL); buf = NULL; if (pt->len != 0) { /* * XXX: Depending on the size we may want to pin the * user pages and use a memdesc with vm_page_t's * instead. */ buf = malloc(pt->len, M_NVMF, M_WAITOK); if (pt->is_read == 0) { error = copyin(pt->buf, buf, pt->len); if (error != 0) { free(buf, M_NVMF); return (error); } } else { /* Ensure no kernel data is leaked to userland. */ memset(buf, 0, pt->len); } } memset(&cmd, 0, sizeof(cmd)); cmd.opc = pt->cmd.opc; cmd.fuse = pt->cmd.fuse; cmd.nsid = pt->cmd.nsid; cmd.cdw10 = pt->cmd.cdw10; cmd.cdw11 = pt->cmd.cdw11; cmd.cdw12 = pt->cmd.cdw12; cmd.cdw13 = pt->cmd.cdw13; cmd.cdw14 = pt->cmd.cdw14; cmd.cdw15 = pt->cmd.cdw15; sx_slock(&sc->connection_lock); if (sc->admin == NULL || sc->detaching) { device_printf(sc->dev, "failed to send passthrough command\n"); error = ECONNABORTED; sx_sunlock(&sc->connection_lock); goto error; } if (admin) qp = sc->admin; else qp = nvmf_select_io_queue(sc); nvmf_status_init(&status); req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK); sx_sunlock(&sc->connection_lock); if (req == NULL) { device_printf(sc->dev, "failed to send passthrough command\n"); error = ECONNABORTED; goto error; } if (pt->len != 0) { mem = memdesc_vaddr(buf, pt->len); nvmf_capsule_append_data(req->nc, &mem, pt->len, pt->is_read == 0, nvmf_io_complete, &status); nvmf_status_wait_io(&status); } nvmf_submit_request(req); nvmf_wait_for_reply(&status); memset(&pt->cpl, 0, sizeof(pt->cpl)); pt->cpl.cdw0 = status.cqe.cdw0; pt->cpl.status = status.cqe.status; error = status.io_error; if (error == 0 && pt->len != 0 && pt->is_read != 0) error = copyout(buf, pt->buf, pt->len); error: free(buf, M_NVMF); return (error); } +static int +nvmf_reconnect_params(struct nvmf_softc *sc, struct nvmf_ioc_nv *nv) +{ + nvlist_t *nvl; + int error; + + nvl = nvlist_create(0); + + sx_slock(&sc->connection_lock); + if ((sc->cdata->fcatt & 1) == 0) + nvlist_add_number(nvl, "cntlid", NVMF_CNTLID_DYNAMIC); + else + nvlist_add_number(nvl, "cntlid", sc->cdata->ctrlr_id); + nvlist_add_stringf(nvl, "subnqn", "%.256s", sc->cdata->subnqn); + sx_sunlock(&sc->connection_lock); + + error = nvmf_pack_ioc_nvlist(nvl, nv); + nvlist_destroy(nvl); + return (error); +} + static int nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, struct thread *td) { struct nvmf_softc *sc = cdev->si_drv1; struct nvme_get_nsid *gnsid; struct nvme_pt_command *pt; - struct nvmf_reconnect_params *rp; - struct nvmf_handoff_host *hh; + struct nvmf_ioc_nv *nv; switch (cmd) { case NVME_PASSTHROUGH_CMD: pt = (struct nvme_pt_command *)arg; return (nvmf_passthrough_cmd(sc, pt, true)); case NVME_GET_NSID: gnsid = (struct nvme_get_nsid *)arg; strlcpy(gnsid->cdev, device_get_nameunit(sc->dev), sizeof(gnsid->cdev)); gnsid->nsid = 0; return (0); case NVME_GET_MAX_XFER_SIZE: *(uint64_t *)arg = 
sc->max_xfer_size; return (0); case NVMF_RECONNECT_PARAMS: - rp = (struct nvmf_reconnect_params *)arg; - if ((sc->cdata->fcatt & 1) == 0) - rp->cntlid = NVMF_CNTLID_DYNAMIC; - else - rp->cntlid = sc->cdata->ctrlr_id; - memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn)); - return (0); + nv = (struct nvmf_ioc_nv *)arg; + return (nvmf_reconnect_params(sc, nv)); case NVMF_RECONNECT_HOST: - hh = (struct nvmf_handoff_host *)arg; - return (nvmf_reconnect_host(sc, hh)); + nv = (struct nvmf_ioc_nv *)arg; + return (nvmf_reconnect_host(sc, nv)); default: return (ENOTTY); } } static struct cdevsw nvmf_cdevsw = { .d_version = D_VERSION, .d_ioctl = nvmf_ioctl }; static int nvmf_modevent(module_t mod, int what, void *arg) { switch (what) { case MOD_LOAD: return (nvmf_ctl_load()); case MOD_QUIESCE: return (0); case MOD_UNLOAD: nvmf_ctl_unload(); destroy_dev_drain(&nvmf_cdevsw); return (0); default: return (EOPNOTSUPP); } } static device_method_t nvmf_methods[] = { /* Device interface */ DEVMETHOD(device_probe, nvmf_probe), DEVMETHOD(device_attach, nvmf_attach), DEVMETHOD(device_detach, nvmf_detach), DEVMETHOD_END }; driver_t nvme_nvmf_driver = { "nvme", nvmf_methods, sizeof(struct nvmf_softc), }; DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL); MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1); diff --git a/sys/dev/nvmf/host/nvmf_ctldev.c b/sys/dev/nvmf/host/nvmf_ctldev.c index f40005a2a666..bc79bd99f639 100644 --- a/sys/dev/nvmf/host/nvmf_ctldev.c +++ b/sys/dev/nvmf/host/nvmf_ctldev.c @@ -1,159 +1,160 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include +#include #include #include #include #include static struct cdev *nvmf_cdev; static int -nvmf_handoff_host(struct nvmf_handoff_host *hh) +nvmf_handoff_host(struct nvmf_ioc_nv *nv) { - struct nvmf_ivars ivars; + nvlist_t *nvl; device_t dev; int error; - error = nvmf_init_ivars(&ivars, hh); + error = nvmf_copyin_handoff(nv, &nvl); if (error != 0) return (error); bus_topo_lock(); dev = device_add_child(root_bus, "nvme", -1); if (dev == NULL) { bus_topo_unlock(); error = ENXIO; goto out; } - device_set_ivars(dev, &ivars); + device_set_ivars(dev, nvl); error = device_probe_and_attach(dev); device_set_ivars(dev, NULL); if (error != 0) device_delete_child(root_bus, dev); bus_topo_unlock(); out: - nvmf_free_ivars(&ivars); + nvlist_destroy(nvl); return (error); } static bool nvmf_matches(device_t dev, char *name) { struct nvmf_softc *sc = device_get_softc(dev); if (strcmp(device_get_nameunit(dev), name) == 0) return (true); if (strcmp(sc->cdata->subnqn, name) == 0) return (true); return (false); } static int nvmf_disconnect_by_name(char *name) { devclass_t dc; device_t dev; int error, unit; bool found; found = false; error = 0; bus_topo_lock(); dc = devclass_find("nvme"); if (dc == NULL) goto out; for (unit = 0; unit < devclass_get_maxunit(dc); unit++) { dev = devclass_get_device(dc, unit); if (dev == NULL) continue; if (device_get_driver(dev) != &nvme_nvmf_driver) continue; if (device_get_parent(dev) != root_bus) continue; if (name != NULL && !nvmf_matches(dev, name)) continue; error = device_delete_child(root_bus, dev); if (error != 0) break; found = true; } out: bus_topo_unlock(); if (error == 0 && !found) error = ENOENT; return (error); } static int nvmf_disconnect_host(const char **namep) { char *name; int error; name = malloc(PATH_MAX, M_NVMF, M_WAITOK); error = copyinstr(*namep, name, PATH_MAX, NULL); if (error == 0) error = 
nvmf_disconnect_by_name(name); free(name, M_NVMF); return (error); } static int nvmf_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, struct thread *td) { switch (cmd) { case NVMF_HANDOFF_HOST: - return (nvmf_handoff_host((struct nvmf_handoff_host *)arg)); + return (nvmf_handoff_host((struct nvmf_ioc_nv *)arg)); case NVMF_DISCONNECT_HOST: return (nvmf_disconnect_host((const char **)arg)); case NVMF_DISCONNECT_ALL: return (nvmf_disconnect_by_name(NULL)); default: return (ENOTTY); } } static struct cdevsw nvmf_ctl_cdevsw = { .d_version = D_VERSION, .d_ioctl = nvmf_ctl_ioctl }; int nvmf_ctl_load(void) { struct make_dev_args mda; int error; make_dev_args_init(&mda); mda.mda_devsw = &nvmf_ctl_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; error = make_dev_s(&mda, &nvmf_cdev, "nvmf"); if (error != 0) nvmf_cdev = NULL; return (error); } void nvmf_ctl_unload(void) { if (nvmf_cdev != NULL) { destroy_dev(nvmf_cdev); nvmf_cdev = NULL; } } diff --git a/sys/dev/nvmf/host/nvmf_qpair.c b/sys/dev/nvmf/host/nvmf_qpair.c index b03ecfa081d3..2f511cf0406d 100644 --- a/sys/dev/nvmf/host/nvmf_qpair.c +++ b/sys/dev/nvmf/host/nvmf_qpair.c @@ -1,449 +1,452 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include #include +#include #include #include #include #include #include struct nvmf_host_command { struct nvmf_request *req; TAILQ_ENTRY(nvmf_host_command) link; uint16_t cid; }; struct nvmf_host_qpair { struct nvmf_softc *sc; struct nvmf_qpair *qp; bool sq_flow_control; bool shutting_down; u_int allocating; u_int num_commands; uint16_t sqhd; uint16_t sqtail; uint64_t submitted; struct mtx lock; TAILQ_HEAD(, nvmf_host_command) free_commands; STAILQ_HEAD(, nvmf_request) pending_requests; /* Indexed by cid. */ struct nvmf_host_command **active_commands; char name[16]; struct sysctl_ctx_list sysctl_ctx; }; struct nvmf_request * nvmf_allocate_request(struct nvmf_host_qpair *qp, void *sqe, nvmf_request_complete_t *cb, void *cb_arg, int how) { struct nvmf_request *req; struct nvmf_qpair *nq; KASSERT(how == M_WAITOK || how == M_NOWAIT, ("%s: invalid how", __func__)); req = malloc(sizeof(*req), M_NVMF, how | M_ZERO); if (req == NULL) return (NULL); mtx_lock(&qp->lock); nq = qp->qp; if (nq == NULL) { mtx_unlock(&qp->lock); free(req, M_NVMF); return (NULL); } qp->allocating++; MPASS(qp->allocating != 0); mtx_unlock(&qp->lock); req->qp = qp; req->cb = cb; req->cb_arg = cb_arg; req->nc = nvmf_allocate_command(nq, sqe, how); if (req->nc == NULL) { free(req, M_NVMF); req = NULL; } mtx_lock(&qp->lock); qp->allocating--; if (qp->allocating == 0 && qp->shutting_down) wakeup(qp); mtx_unlock(&qp->lock); return (req); } static void nvmf_abort_request(struct nvmf_request *req, uint16_t cid) { struct nvme_completion cqe; memset(&cqe, 0, sizeof(cqe)); cqe.cid = cid; cqe.status = htole16(NVMEF(NVME_STATUS_SCT, NVME_SCT_PATH_RELATED) | NVMEF(NVME_STATUS_SC, NVME_SC_COMMAND_ABORTED_BY_HOST)); req->cb(req->cb_arg, &cqe); } void nvmf_free_request(struct nvmf_request *req) { if (req->nc != NULL) nvmf_free_capsule(req->nc); free(req, M_NVMF); } static void nvmf_dispatch_command(struct nvmf_host_qpair *qp, struct nvmf_host_command *cmd) { struct nvmf_softc *sc = qp->sc; struct nvme_command *sqe; struct nvmf_capsule *nc; uint16_t new_sqtail; int error; mtx_assert(&qp->lock, MA_OWNED); qp->submitted++; /* * Update flow control tracking. This is just a sanity check. 
* Since num_commands == qsize - 1, there can never be too * many commands in flight. */ new_sqtail = (qp->sqtail + 1) % (qp->num_commands + 1); KASSERT(new_sqtail != qp->sqhd, ("%s: qp %p is full", __func__, qp)); qp->sqtail = new_sqtail; mtx_unlock(&qp->lock); nc = cmd->req->nc; sqe = nvmf_capsule_sqe(nc); /* * NB: Don't bother byte-swapping the cid so that receive * doesn't have to swap. */ sqe->cid = cmd->cid; error = nvmf_transmit_capsule(nc); if (error != 0) { device_printf(sc->dev, "failed to transmit capsule: %d, disconnecting\n", error); nvmf_disconnect(sc); return; } if (sc->ka_traffic) atomic_store_int(&sc->ka_active_tx_traffic, 1); } static void nvmf_qp_error(void *arg, int error) { struct nvmf_host_qpair *qp = arg; struct nvmf_softc *sc = qp->sc; /* Ignore simple close of queue pairs during shutdown. */ if (!(sc->detaching && error == 0)) device_printf(sc->dev, "error %d on %s, disconnecting\n", error, qp->name); nvmf_disconnect(sc); } static void nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc) { struct nvmf_host_qpair *qp = arg; struct nvmf_softc *sc = qp->sc; struct nvmf_host_command *cmd; struct nvmf_request *req; const struct nvme_completion *cqe; uint16_t cid; cqe = nvmf_capsule_cqe(nc); if (sc->ka_traffic) atomic_store_int(&sc->ka_active_rx_traffic, 1); /* * NB: Don't bother byte-swapping the cid as transmit doesn't * swap either. */ cid = cqe->cid; if (cid > qp->num_commands) { device_printf(sc->dev, "received invalid CID %u, disconnecting\n", cid); nvmf_disconnect(sc); nvmf_free_capsule(nc); return; } /* Update flow control tracking. */ mtx_lock(&qp->lock); if (qp->sq_flow_control) { if (nvmf_sqhd_valid(nc)) qp->sqhd = le16toh(cqe->sqhd); } else { /* * If SQ FC is disabled, just advance the head for * each response capsule received. */ qp->sqhd = (qp->sqhd + 1) % (qp->num_commands + 1); } /* * If the queue has been shutdown due to an error, silently * drop the response. 
*/ if (qp->qp == NULL) { device_printf(sc->dev, "received completion for CID %u on shutdown %s\n", cid, qp->name); mtx_unlock(&qp->lock); nvmf_free_capsule(nc); return; } cmd = qp->active_commands[cid]; if (cmd == NULL) { mtx_unlock(&qp->lock); device_printf(sc->dev, "received completion for inactive CID %u, disconnecting\n", cid); nvmf_disconnect(sc); nvmf_free_capsule(nc); return; } KASSERT(cmd->cid == cid, ("%s: CID mismatch", __func__)); req = cmd->req; cmd->req = NULL; if (STAILQ_EMPTY(&qp->pending_requests)) { qp->active_commands[cid] = NULL; TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link); mtx_unlock(&qp->lock); } else { cmd->req = STAILQ_FIRST(&qp->pending_requests); STAILQ_REMOVE_HEAD(&qp->pending_requests, link); nvmf_dispatch_command(qp, cmd); } req->cb(req->cb_arg, cqe); nvmf_free_capsule(nc); nvmf_free_request(req); } static void nvmf_sysctls_qp(struct nvmf_softc *sc, struct nvmf_host_qpair *qp, bool admin, u_int qid) { struct sysctl_ctx_list *ctx = &qp->sysctl_ctx; struct sysctl_oid *oid; struct sysctl_oid_list *list; char name[8]; if (admin) { oid = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "adminq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue"); } else { snprintf(name, sizeof(name), "%u", qid); oid = SYSCTL_ADD_NODE(ctx, sc->ioq_oid_list, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "I/O Queue"); } list = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "num_entries", CTLFLAG_RD, NULL, qp->num_commands + 1, "Number of entries in queue"); SYSCTL_ADD_U16(ctx, list, OID_AUTO, "sq_head", CTLFLAG_RD, &qp->sqhd, 0, "Current head of submission queue (as observed by driver)"); SYSCTL_ADD_U16(ctx, list, OID_AUTO, "sq_tail", CTLFLAG_RD, &qp->sqtail, 0, "Current tail of submission queue (as observed by driver)"); SYSCTL_ADD_U64(ctx, list, OID_AUTO, "num_cmds", CTLFLAG_RD, &qp->submitted, 0, "Number of commands submitted"); } struct nvmf_host_qpair * nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype, - struct nvmf_handoff_qpair_params *handoff, const char *name, u_int qid) + const nvlist_t *nvl, const char *name, u_int qid) { struct nvmf_host_command *cmd, *ncmd; struct nvmf_host_qpair *qp; u_int i; + bool admin; + admin = nvlist_get_bool(nvl, "admin"); qp = malloc(sizeof(*qp), M_NVMF, M_WAITOK | M_ZERO); qp->sc = sc; - qp->sq_flow_control = handoff->sq_flow_control; - qp->sqhd = handoff->sqhd; - qp->sqtail = handoff->sqtail; + qp->sq_flow_control = nvlist_get_bool(nvl, "sq_flow_control"); + qp->sqhd = nvlist_get_number(nvl, "sqhd"); + qp->sqtail = nvlist_get_number(nvl, "sqtail"); strlcpy(qp->name, name, sizeof(qp->name)); mtx_init(&qp->lock, "nvmf qp", NULL, MTX_DEF); (void)sysctl_ctx_init(&qp->sysctl_ctx); /* * Allocate a spare command slot for each pending AER command * on the admin queue. 
*/ - qp->num_commands = handoff->qsize - 1; - if (handoff->admin) + qp->num_commands = nvlist_get_number(nvl, "qsize") - 1; + if (admin) qp->num_commands += sc->num_aer; qp->active_commands = malloc(sizeof(*qp->active_commands) * qp->num_commands, M_NVMF, M_WAITOK | M_ZERO); TAILQ_INIT(&qp->free_commands); for (i = 0; i < qp->num_commands; i++) { cmd = malloc(sizeof(*cmd), M_NVMF, M_WAITOK | M_ZERO); cmd->cid = i; TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link); } STAILQ_INIT(&qp->pending_requests); - qp->qp = nvmf_allocate_qpair(trtype, false, handoff, nvmf_qp_error, - qp, nvmf_receive_capsule, qp); + qp->qp = nvmf_allocate_qpair(trtype, false, nvl, nvmf_qp_error, qp, + nvmf_receive_capsule, qp); if (qp->qp == NULL) { (void)sysctl_ctx_free(&qp->sysctl_ctx); TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) { TAILQ_REMOVE(&qp->free_commands, cmd, link); free(cmd, M_NVMF); } free(qp->active_commands, M_NVMF); mtx_destroy(&qp->lock); free(qp, M_NVMF); return (NULL); } - nvmf_sysctls_qp(sc, qp, handoff->admin, qid); + nvmf_sysctls_qp(sc, qp, admin, qid); return (qp); } void nvmf_shutdown_qp(struct nvmf_host_qpair *qp) { struct nvmf_host_command *cmd; struct nvmf_request *req; struct nvmf_qpair *nq; mtx_lock(&qp->lock); nq = qp->qp; qp->qp = NULL; if (nq == NULL) { while (qp->shutting_down) mtx_sleep(qp, &qp->lock, 0, "nvmfqpsh", 0); mtx_unlock(&qp->lock); return; } qp->shutting_down = true; while (qp->allocating != 0) mtx_sleep(qp, &qp->lock, 0, "nvmfqpqu", 0); mtx_unlock(&qp->lock); nvmf_free_qpair(nq); /* * Abort outstanding requests. Active requests will have * their I/O completions invoked and associated capsules freed * by the transport layer via nvmf_free_qpair. Pending * requests must have their I/O completion invoked via * nvmf_abort_capsule_data. */ for (u_int i = 0; i < qp->num_commands; i++) { cmd = qp->active_commands[i]; if (cmd != NULL) { if (!cmd->req->aer) printf("%s: aborted active command %p (CID %u)\n", __func__, cmd->req, cmd->cid); /* This was freed by nvmf_free_qpair. */ cmd->req->nc = NULL; nvmf_abort_request(cmd->req, cmd->cid); nvmf_free_request(cmd->req); free(cmd, M_NVMF); } } while (!STAILQ_EMPTY(&qp->pending_requests)) { req = STAILQ_FIRST(&qp->pending_requests); STAILQ_REMOVE_HEAD(&qp->pending_requests, link); if (!req->aer) printf("%s: aborted pending command %p\n", __func__, req); nvmf_abort_capsule_data(req->nc, ECONNABORTED); nvmf_abort_request(req, 0); nvmf_free_request(req); } mtx_lock(&qp->lock); qp->shutting_down = false; mtx_unlock(&qp->lock); wakeup(qp); } void nvmf_destroy_qp(struct nvmf_host_qpair *qp) { struct nvmf_host_command *cmd, *ncmd; nvmf_shutdown_qp(qp); (void)sysctl_ctx_free(&qp->sysctl_ctx); TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) { TAILQ_REMOVE(&qp->free_commands, cmd, link); free(cmd, M_NVMF); } free(qp->active_commands, M_NVMF); mtx_destroy(&qp->lock); free(qp, M_NVMF); } void nvmf_submit_request(struct nvmf_request *req) { struct nvmf_host_qpair *qp; struct nvmf_host_command *cmd; qp = req->qp; mtx_lock(&qp->lock); if (qp->qp == NULL) { mtx_unlock(&qp->lock); printf("%s: aborted pending command %p\n", __func__, req); nvmf_abort_capsule_data(req->nc, ECONNABORTED); nvmf_abort_request(req, 0); nvmf_free_request(req); return; } cmd = TAILQ_FIRST(&qp->free_commands); if (cmd == NULL) { /* * Queue this request. Will be sent after enough * in-flight requests have completed. 
*/ STAILQ_INSERT_TAIL(&qp->pending_requests, req, link); mtx_unlock(&qp->lock); return; } TAILQ_REMOVE(&qp->free_commands, cmd, link); KASSERT(qp->active_commands[cmd->cid] == NULL, ("%s: CID already busy", __func__)); qp->active_commands[cmd->cid] = cmd; cmd->req = req; nvmf_dispatch_command(qp, cmd); } diff --git a/sys/dev/nvmf/host/nvmf_var.h b/sys/dev/nvmf/host/nvmf_var.h index e9f33207fea1..14bdf4abb1bf 100644 --- a/sys/dev/nvmf/host/nvmf_var.h +++ b/sys/dev/nvmf/host/nvmf_var.h @@ -1,223 +1,216 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #ifndef __NVMF_VAR_H__ #define __NVMF_VAR_H__ #include #include #include #include +//#include #include #include #include #include #include #include struct nvmf_aer; struct nvmf_capsule; struct nvmf_host_qpair; struct nvmf_namespace; struct sysctl_oid_list; typedef void nvmf_request_complete_t(void *, const struct nvme_completion *); -struct nvmf_ivars { - struct nvmf_handoff_host *hh; - struct nvmf_handoff_qpair_params *io_params; - struct nvme_controller_data *cdata; -}; - struct nvmf_softc { device_t dev; struct nvmf_host_qpair *admin; struct nvmf_host_qpair **io; u_int num_io_queues; enum nvmf_trtype trtype; struct cam_sim *sim; struct cam_path *path; struct mtx sim_mtx; bool sim_disconnected; bool sim_shutdown; struct nvmf_namespace **ns; struct nvme_controller_data *cdata; uint64_t cap; uint32_t vs; u_int max_pending_io; u_long max_xfer_size; struct cdev *cdev; /* * Keep Alive support depends on two timers. The 'tx' timer * is responsible for sending KeepAlive commands and runs at * half the timeout interval. The 'rx' timer is responsible * for detecting an actual timeout. * * For efficient support of TKAS, the host does not reschedule * these timers every time new commands are scheduled. * Instead, the host sets the *_traffic flags when commands * are sent and received. The timeout handlers check and * clear these flags. This does mean it can take up to twice * the timeout time to detect an AWOL controller. */ bool ka_traffic; /* Using TKAS? 
*/ volatile int ka_active_tx_traffic; struct callout ka_tx_timer; sbintime_t ka_tx_sbt; volatile int ka_active_rx_traffic; struct callout ka_rx_timer; sbintime_t ka_rx_sbt; struct sx connection_lock; struct task disconnect_task; bool detaching; u_int num_aer; struct nvmf_aer *aer; struct sysctl_oid_list *ioq_oid_list; eventhandler_tag shutdown_pre_sync_eh; eventhandler_tag shutdown_post_sync_eh; }; struct nvmf_request { struct nvmf_host_qpair *qp; struct nvmf_capsule *nc; nvmf_request_complete_t *cb; void *cb_arg; bool aer; STAILQ_ENTRY(nvmf_request) link; }; struct nvmf_completion_status { struct nvme_completion cqe; bool done; bool io_done; int io_error; }; static __inline struct nvmf_host_qpair * nvmf_select_io_queue(struct nvmf_softc *sc) { u_int idx = curcpu * sc->num_io_queues / (mp_maxid + 1); return (sc->io[idx]); } static __inline bool nvmf_cqe_aborted(const struct nvme_completion *cqe) { uint16_t status; status = le16toh(cqe->status); return (NVME_STATUS_GET_SCT(status) == NVME_SCT_PATH_RELATED && NVME_STATUS_GET_SC(status) == NVME_SC_COMMAND_ABORTED_BY_HOST); } static __inline void nvmf_status_init(struct nvmf_completion_status *status) { status->done = false; status->io_done = true; status->io_error = 0; } static __inline void nvmf_status_wait_io(struct nvmf_completion_status *status) { status->io_done = false; } #ifdef DRIVER_MODULE extern driver_t nvme_nvmf_driver; #endif #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_NVMF); #endif /* If true, I/O requests will fail while the host is disconnected. */ extern bool nvmf_fail_disconnect; /* nvmf.c */ void nvmf_complete(void *arg, const struct nvme_completion *cqe); void nvmf_io_complete(void *arg, size_t xfered, int error); void nvmf_wait_for_reply(struct nvmf_completion_status *status); -int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh); -void nvmf_free_ivars(struct nvmf_ivars *ivars); +int nvmf_copyin_handoff(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp); void nvmf_disconnect(struct nvmf_softc *sc); void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid); void nvmf_rescan_all_ns(struct nvmf_softc *sc); int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, bool admin); /* nvmf_aer.c */ void nvmf_init_aer(struct nvmf_softc *sc); int nvmf_start_aer(struct nvmf_softc *sc); void nvmf_destroy_aer(struct nvmf_softc *sc); /* nvmf_cmd.c */ bool nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, nvmf_request_complete_t *cb, void *cb_arg, int how); bool nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, uint64_t value, nvmf_request_complete_t *cb, void *cb_arg, int how); bool nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb, void *cb_arg, int how); bool nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id, struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb, void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); bool nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id, struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb, void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); bool nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid, uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb, void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how); /* nvmf_ctldev.c */ int nvmf_ctl_load(void); void nvmf_ctl_unload(void); /* nvmf_ns.c */ struct nvmf_namespace *nvmf_init_ns(struct nvmf_softc *sc, 
uint32_t id, const struct nvme_namespace_data *data); void nvmf_disconnect_ns(struct nvmf_namespace *ns); void nvmf_reconnect_ns(struct nvmf_namespace *ns); void nvmf_shutdown_ns(struct nvmf_namespace *ns); void nvmf_destroy_ns(struct nvmf_namespace *ns); bool nvmf_update_ns(struct nvmf_namespace *ns, const struct nvme_namespace_data *data); /* nvmf_qpair.c */ struct nvmf_host_qpair *nvmf_init_qp(struct nvmf_softc *sc, - enum nvmf_trtype trtype, struct nvmf_handoff_qpair_params *handoff, - const char *name, u_int qid); + enum nvmf_trtype trtype, const nvlist_t *nvl, const char *name, u_int qid); void nvmf_shutdown_qp(struct nvmf_host_qpair *qp); void nvmf_destroy_qp(struct nvmf_host_qpair *qp); struct nvmf_request *nvmf_allocate_request(struct nvmf_host_qpair *qp, void *sqe, nvmf_request_complete_t *cb, void *cb_arg, int how); void nvmf_submit_request(struct nvmf_request *req); void nvmf_free_request(struct nvmf_request *req); /* nvmf_sim.c */ int nvmf_init_sim(struct nvmf_softc *sc); void nvmf_disconnect_sim(struct nvmf_softc *sc); void nvmf_reconnect_sim(struct nvmf_softc *sc); void nvmf_shutdown_sim(struct nvmf_softc *sc); void nvmf_destroy_sim(struct nvmf_softc *sc); void nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id); #endif /* !__NVMF_VAR_H__ */ diff --git a/sys/dev/nvmf/nvmf.h b/sys/dev/nvmf/nvmf.h index 1f1ecd437c7e..316bd80d4938 100644 --- a/sys/dev/nvmf/nvmf.h +++ b/sys/dev/nvmf/nvmf.h @@ -1,79 +1,101 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #ifndef __NVMF_H__ #define __NVMF_H__ #include #ifndef _KERNEL #include #endif /* * Default settings in Fabrics controllers. These match values used by the * Linux target. */ #define NVMF_MAX_IO_ENTRIES (1024) #define NVMF_CC_EN_TIMEOUT (15) /* In 500ms units */ /* Allows for a 16k data buffer + SQE */ #define NVMF_IOCCSZ (sizeof(struct nvme_command) + 16 * 1024) #define NVMF_IORCSZ (sizeof(struct nvme_completion)) #define NVMF_NN (1024) -struct nvmf_handoff_qpair_params { - bool admin; - bool sq_flow_control; - u_int qsize; - uint16_t sqhd; - uint16_t sqtail; /* host only */ - union { - struct { - int fd; - uint8_t rxpda; - uint8_t txpda; - bool header_digests; - bool data_digests; - uint32_t maxr2t; - uint32_t maxh2cdata; - uint32_t max_icd; - } tcp; - }; +/* + * (data, size) is the userspace buffer for a packed nvlist. + * + * For requests that copyout an nvlist, len is the amount of data + * copied out to *data. If size is zero, no data is copied and len is + * set to the required buffer size. 
+ */ +struct nvmf_ioc_nv { + void *data; + size_t len; + size_t size; }; -struct nvmf_handoff_host { - u_int trtype; - u_int num_io_queues; - u_int kato; - struct nvmf_handoff_qpair_params admin; - struct nvmf_handoff_qpair_params *io; - const struct nvme_controller_data *cdata; -}; +/* + * The fields in a qpair handoff nvlist are: + * + * Transport independent: + * + * bool admin + * bool sq_flow_control + * number qsize + * number sqhd + * number sqtail host only + * + * TCP transport: + * + * number fd + * number rxpda + * number txpda + * bool header_digests + * bool data_digests + * number maxr2t + * number maxh2cdata + * number max_icd + */ -struct nvmf_reconnect_params { - uint16_t cntlid; - char subnqn[256]; -}; +/* + * The fields in the nvlist for NVMF_HANDOFF_HOST and + * NVMF_RECONNECT_HOST are: + * + * number trtype + * number kato (optional) + * qpair handoff nvlist admin + * qpair handoff nvlist array io + * binary cdata struct nvme_controller_data + */ -struct nvmf_handoff_controller_qpair { - u_int trtype; - struct nvmf_handoff_qpair_params params; - const struct nvmf_fabric_connect_cmd *cmd; - const struct nvmf_fabric_connect_data *data; -}; +/* + * The fields in the nvlist for NVMF_RECONNECT_PARAMS are: + * + * number cntlid + * string subnqn + */ + +/* + * The fields in the nvlist for handing off a controller qpair are: + * + * number trtype + * qpair handoff nvlist params + * binary cmd struct nvmf_fabric_connect_cmd + * binary data struct nvmf_fabric_connect_data + */ /* Operations on /dev/nvmf */ -#define NVMF_HANDOFF_HOST _IOW('n', 200, struct nvmf_handoff_host) +#define NVMF_HANDOFF_HOST _IOW('n', 200, struct nvmf_ioc_nv) #define NVMF_DISCONNECT_HOST _IOW('n', 201, const char *) #define NVMF_DISCONNECT_ALL _IO('n', 202) /* Operations on /dev/nvmeX */ -#define NVMF_RECONNECT_PARAMS _IOR('n', 203, struct nvmf_reconnect_params) -#define NVMF_RECONNECT_HOST _IOW('n', 204, struct nvmf_handoff_host) +#define NVMF_RECONNECT_PARAMS _IOWR('n', 203, struct nvmf_ioc_nv) +#define NVMF_RECONNECT_HOST _IOW('n', 204, struct nvmf_ioc_nv) #endif /* !__NVMF_H__ */ diff --git a/sys/dev/nvmf/nvmf_tcp.c b/sys/dev/nvmf/nvmf_tcp.c index 50adbfdd2917..6ad5229f6043 100644 --- a/sys/dev/nvmf/nvmf_tcp.c +++ b/sys/dev/nvmf/nvmf_tcp.c @@ -1,1874 +1,1886 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022-2024 Chelsio Communications, Inc. 
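[Editor's note, not part of the patch] The comment blocks above replace the old fixed ioctl structures with a documented nvlist schema, so a short userspace sketch may help make the convention concrete. Everything below is illustrative only and is not taken from libnvmf or nvmecontrol: the helper names, queue sizes, keep-alive value, and the way the socket descriptors are obtained are placeholders. The nvlist keys, the struct nvmf_ioc_nv fields, and the ioctl names come from the header above; the nvlist calls are the standard nv(3)/nv(9) API.

#include <sys/ioctl.h>
#include <sys/nv.h>

#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_proto.h>

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

/* Build one qpair handoff nvlist; TCP keys other than "fd" are elided. */
static nvlist_t *
qpair_nvl(bool admin, int fd, uint64_t qsize)
{
	nvlist_t *nvl;

	nvl = nvlist_create(0);
	nvlist_add_bool(nvl, "admin", admin);
	nvlist_add_bool(nvl, "sq_flow_control", false);
	nvlist_add_number(nvl, "qsize", qsize);
	nvlist_add_number(nvl, "sqhd", 0);
	nvlist_add_number(nvl, "sqtail", 0);
	nvlist_add_number(nvl, "fd", fd);
	return (nvl);
}

/* Pack a host handoff nvlist and hand it to NVMF_HANDOFF_HOST. */
static int
handoff_example(int ctl_fd, int admin_fd, int io_fd,
    const struct nvme_controller_data *cdata)
{
	struct nvmf_ioc_nv nv;
	nvlist_t *top, *io[1];
	int error;

	top = nvlist_create(0);
	nvlist_add_number(top, "trtype", NVMF_TRTYPE_TCP);
	nvlist_add_number(top, "kato", 120000);		/* ms, placeholder */
	nvlist_move_nvlist(top, "admin", qpair_nvl(true, admin_fd, 32));
	io[0] = qpair_nvl(false, io_fd, 128);
	nvlist_add_nvlist_array(top, "io", (const nvlist_t * const *)io, 1);
	nvlist_destroy(io[0]);
	nvlist_add_binary(top, "cdata", cdata, sizeof(*cdata));

	memset(&nv, 0, sizeof(nv));
	nv.data = nvlist_pack(top, &nv.size);
	nvlist_destroy(top);
	if (nv.data == NULL)
		return (ENOMEM);
	error = ioctl(ctl_fd, NVMF_HANDOFF_HOST, &nv) == -1 ? errno : 0;
	free(nv.data);
	return (error);
}

The copyout direction follows the size-query convention described in the struct nvmf_ioc_nv comment: issue the ioctl once with size set to zero to learn the required length, then again with an allocated buffer, and unpack the result. A minimal sketch for NVMF_RECONNECT_PARAMS, under the same caveats:

/* Query reconnect parameters using the size-query convention. */
static nvlist_t *
reconnect_params_example(int dev_fd)
{
	struct nvmf_ioc_nv nv;
	nvlist_t *nvl;

	memset(&nv, 0, sizeof(nv));
	if (ioctl(dev_fd, NVMF_RECONNECT_PARAMS, &nv) == -1)
		return (NULL);
	nv.data = malloc(nv.len);
	if (nv.data == NULL)
		return (NULL);
	nv.size = nv.len;
	if (ioctl(dev_fd, NVMF_RECONNECT_PARAMS, &nv) == -1) {
		free(nv.data);
		return (NULL);
	}
	nvl = nvlist_unpack(nv.data, nv.len, 0);	/* "cntlid", "subnqn" */
	free(nv.data);
	return (nvl);
}

[End of editor's note]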
* Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include struct nvmf_tcp_capsule; struct nvmf_tcp_qpair; struct nvmf_tcp_command_buffer { struct nvmf_tcp_qpair *qp; struct nvmf_io_request io; size_t data_len; size_t data_xfered; uint32_t data_offset; u_int refs; int error; uint16_t cid; uint16_t ttag; TAILQ_ENTRY(nvmf_tcp_command_buffer) link; /* Controller only */ struct nvmf_tcp_capsule *tc; }; struct nvmf_tcp_command_buffer_list { TAILQ_HEAD(, nvmf_tcp_command_buffer) head; struct mtx lock; }; struct nvmf_tcp_qpair { struct nvmf_qpair qp; struct socket *so; volatile u_int refs; /* Every allocated capsule holds a reference */ uint8_t txpda; uint8_t rxpda; bool header_digests; bool data_digests; uint32_t maxr2t; uint32_t maxh2cdata; /* Controller only */ uint32_t max_tx_data; uint32_t max_icd; /* Host only */ uint16_t next_ttag; /* Controller only */ u_int num_ttags; /* Controller only */ u_int active_ttags; /* Controller only */ bool send_success; /* Controller only */ /* Receive state. */ struct thread *rx_thread; struct cv rx_cv; bool rx_shutdown; /* Transmit state. */ struct thread *tx_thread; struct cv tx_cv; bool tx_shutdown; struct mbufq tx_pdus; STAILQ_HEAD(, nvmf_tcp_capsule) tx_capsules; struct nvmf_tcp_command_buffer_list tx_buffers; struct nvmf_tcp_command_buffer_list rx_buffers; /* * For the controller, an RX command buffer can be in one of * two locations, all protected by the rx_buffers.lock. If a * receive request is waiting for either an R2T slot for its * command (due to exceeding MAXR2T), or a transfer tag it is * placed on the rx_buffers list. When a request is allocated * an active transfer tag, it moves to the open_ttags[] array * (indexed by the tag) until it completes. 
*/ struct nvmf_tcp_command_buffer **open_ttags; /* Controller only */ }; struct nvmf_tcp_rxpdu { struct mbuf *m; const struct nvme_tcp_common_pdu_hdr *hdr; uint32_t data_len; bool data_digest_mismatch; }; struct nvmf_tcp_capsule { struct nvmf_capsule nc; volatile u_int refs; struct nvmf_tcp_rxpdu rx_pdu; uint32_t active_r2ts; /* Controller only */ #ifdef INVARIANTS uint32_t tx_data_offset; /* Controller only */ u_int pending_r2ts; /* Controller only */ #endif STAILQ_ENTRY(nvmf_tcp_capsule) link; }; #define TCAP(nc) ((struct nvmf_tcp_capsule *)(nc)) #define TQP(qp) ((struct nvmf_tcp_qpair *)(qp)) static void tcp_release_capsule(struct nvmf_tcp_capsule *tc); static void tcp_free_qpair(struct nvmf_qpair *nq); SYSCTL_NODE(_kern_nvmf, OID_AUTO, tcp, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "TCP transport"); static u_int tcp_max_transmit_data = 256 * 1024; SYSCTL_UINT(_kern_nvmf_tcp, OID_AUTO, max_transmit_data, CTLFLAG_RWTUN, &tcp_max_transmit_data, 0, "Maximum size of data payload in a transmitted PDU"); static MALLOC_DEFINE(M_NVMF_TCP, "nvmf_tcp", "NVMe over TCP"); static int mbuf_crc32c_helper(void *arg, void *data, u_int len) { uint32_t *digestp = arg; *digestp = calculate_crc32c(*digestp, data, len); return (0); } static uint32_t mbuf_crc32c(struct mbuf *m, u_int offset, u_int len) { uint32_t digest = 0xffffffff; m_apply(m, offset, len, mbuf_crc32c_helper, &digest); digest = digest ^ 0xffffffff; return (digest); } static uint32_t compute_digest(const void *buf, size_t len) { return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff); } static struct nvmf_tcp_command_buffer * tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len, uint16_t cid) { struct nvmf_tcp_command_buffer *cb; cb = malloc(sizeof(*cb), M_NVMF_TCP, M_WAITOK); cb->qp = qp; cb->io = *io; cb->data_offset = data_offset; cb->data_len = data_len; cb->data_xfered = 0; refcount_init(&cb->refs, 1); cb->error = 0; cb->cid = cid; cb->ttag = 0; cb->tc = NULL; return (cb); } static void tcp_hold_command_buffer(struct nvmf_tcp_command_buffer *cb) { refcount_acquire(&cb->refs); } static void tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb) { nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error); if (cb->tc != NULL) tcp_release_capsule(cb->tc); free(cb, M_NVMF_TCP); } static void tcp_release_command_buffer(struct nvmf_tcp_command_buffer *cb) { if (refcount_release(&cb->refs)) tcp_free_command_buffer(cb); } static void tcp_add_command_buffer(struct nvmf_tcp_command_buffer_list *list, struct nvmf_tcp_command_buffer *cb) { mtx_assert(&list->lock, MA_OWNED); TAILQ_INSERT_HEAD(&list->head, cb, link); } static struct nvmf_tcp_command_buffer * tcp_find_command_buffer(struct nvmf_tcp_command_buffer_list *list, uint16_t cid, uint16_t ttag) { struct nvmf_tcp_command_buffer *cb; mtx_assert(&list->lock, MA_OWNED); TAILQ_FOREACH(cb, &list->head, link) { if (cb->cid == cid && cb->ttag == ttag) return (cb); } return (NULL); } static void tcp_remove_command_buffer(struct nvmf_tcp_command_buffer_list *list, struct nvmf_tcp_command_buffer *cb) { mtx_assert(&list->lock, MA_OWNED); TAILQ_REMOVE(&list->head, cb, link); } static void tcp_purge_command_buffer(struct nvmf_tcp_command_buffer_list *list, uint16_t cid, uint16_t ttag) { struct nvmf_tcp_command_buffer *cb; mtx_lock(&list->lock); cb = tcp_find_command_buffer(list, cid, ttag); if (cb != NULL) { tcp_remove_command_buffer(list, cb); mtx_unlock(&list->lock); tcp_release_command_buffer(cb); } else mtx_unlock(&list->lock); } 
static void nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, struct mbuf *m) { struct socket *so = qp->so; SOCKBUF_LOCK(&so->so_snd); mbufq_enqueue(&qp->tx_pdus, m); /* XXX: Do we need to handle sb_hiwat being wrong? */ if (sowriteable(so)) cv_signal(&qp->tx_cv); SOCKBUF_UNLOCK(&so->so_snd); } static void nvmf_tcp_report_error(struct nvmf_tcp_qpair *qp, uint16_t fes, uint32_t fei, struct mbuf *rx_pdu, u_int hlen) { struct nvme_tcp_term_req_hdr *hdr; struct mbuf *m; if (hlen != 0) { hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE); hlen = min(hlen, m_length(rx_pdu, NULL)); } m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, 0); m->m_len = sizeof(*hdr) + hlen; hdr = mtod(m, void *); memset(hdr, 0, sizeof(*hdr)); hdr->common.pdu_type = qp->qp.nq_controller ? NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ; hdr->common.hlen = sizeof(*hdr); hdr->common.plen = sizeof(*hdr) + hlen; hdr->fes = htole16(fes); le32enc(hdr->fei, fei); if (hlen != 0) m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1)); nvmf_tcp_write_pdu(qp, m); } static int nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { const struct nvme_tcp_common_pdu_hdr *ch; struct mbuf *m = pdu->m; uint32_t data_len, fei, plen; uint32_t digest, rx_digest; u_int hlen; int error; uint16_t fes; /* Determine how large of a PDU header to return for errors. */ ch = pdu->hdr; hlen = ch->hlen; plen = le32toh(ch->plen); if (hlen < sizeof(*ch) || hlen > plen) hlen = sizeof(*ch); error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller, qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes, &fei); if (error != 0) { if (error != ECONNRESET) nvmf_tcp_report_error(qp, fes, fei, m, hlen); return (error); } /* Check header digest if present. */ if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) { digest = mbuf_crc32c(m, 0, ch->hlen); m_copydata(m, ch->hlen, sizeof(rx_digest), (caddr_t)&rx_digest); if (digest != rx_digest) { printf("NVMe/TCP: Header digest mismatch\n"); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m, hlen); return (EBADMSG); } } /* Check data digest if present. 
*/ pdu->data_digest_mismatch = false; if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) { digest = mbuf_crc32c(m, ch->pdo, data_len); m_copydata(m, plen - sizeof(rx_digest), sizeof(rx_digest), (caddr_t)&rx_digest); if (digest != rx_digest) { printf("NVMe/TCP: Data digest mismatch\n"); pdu->data_digest_mismatch = true; } } pdu->data_len = data_len; return (0); } static void nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu) { m_freem(pdu->m); pdu->m = NULL; pdu->hdr = NULL; } static int nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu) { const struct nvme_tcp_term_req_hdr *hdr; hdr = (const void *)pdu->hdr; printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n", le16toh(hdr->fes), le32dec(hdr->fei)); nvmf_tcp_free_pdu(pdu); return (ECONNRESET); } static int nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { const struct nvme_tcp_cmd *cmd; struct nvmf_capsule *nc; struct nvmf_tcp_capsule *tc; cmd = (const void *)pdu->hdr; nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK); tc = TCAP(nc); tc->rx_pdu = *pdu; nvmf_capsule_received(&qp->qp, nc); return (0); } static int nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { const struct nvme_tcp_rsp *rsp; struct nvmf_capsule *nc; struct nvmf_tcp_capsule *tc; rsp = (const void *)pdu->hdr; nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe, M_WAITOK); nc->nc_sqhd_valid = true; tc = TCAP(nc); tc->rx_pdu = *pdu; /* * Once the CQE has been received, no further transfers to the * command buffer for the associated CID can occur. */ tcp_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid, 0); tcp_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid, 0); nvmf_capsule_received(&qp->qp, nc); return (0); } /* * Construct a PDU that contains an optional data payload. This * includes dealing with digests and the length fields in the common * header. */ static struct mbuf * nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen, struct mbuf *data, uint32_t data_len) { struct nvme_tcp_common_pdu_hdr *ch; struct mbuf *top; uint32_t digest, pad, pdo, plen, mlen; plen = hlen; if (qp->header_digests) plen += sizeof(digest); if (data_len != 0) { KASSERT(m_length(data, NULL) == data_len, ("length mismatch")); pdo = roundup(plen, qp->txpda); pad = pdo - plen; plen = pdo + data_len; if (qp->data_digests) plen += sizeof(digest); mlen = pdo; } else { KASSERT(data == NULL, ("payload mbuf with zero length")); pdo = 0; pad = 0; mlen = plen; } top = m_get2(mlen, M_WAITOK, MT_DATA, 0); top->m_len = mlen; ch = mtod(top, void *); memcpy(ch, hdr, hlen); ch->hlen = hlen; if (qp->header_digests) ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF; if (qp->data_digests && data_len != 0) ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF; ch->pdo = pdo; ch->plen = htole32(plen); /* HDGST */ if (qp->header_digests) { digest = compute_digest(ch, hlen); memcpy((char *)ch + hlen, &digest, sizeof(digest)); } if (pad != 0) { /* PAD */ memset((char *)ch + pdo - pad, 0, pad); } if (data_len != 0) { /* DATA */ top->m_next = data; /* DDGST */ if (qp->data_digests) { digest = mbuf_crc32c(data, 0, data_len); /* XXX: Can't use m_append as it uses M_NOWAIT. */ while (data->m_next != NULL) data = data->m_next; data->m_next = m_get(M_WAITOK, MT_DATA); data->m_next->m_len = sizeof(digest); memcpy(mtod(data->m_next, void *), &digest, sizeof(digest)); } } return (top); } /* Find the next command buffer eligible to schedule for R2T. 
*/ static struct nvmf_tcp_command_buffer * nvmf_tcp_next_r2t(struct nvmf_tcp_qpair *qp) { struct nvmf_tcp_command_buffer *cb; mtx_assert(&qp->rx_buffers.lock, MA_OWNED); MPASS(qp->active_ttags < qp->num_ttags); TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) { /* NB: maxr2t is 0's based. */ if (cb->tc->active_r2ts > qp->maxr2t) continue; #ifdef INVARIANTS cb->tc->pending_r2ts--; #endif TAILQ_REMOVE(&qp->rx_buffers.head, cb, link); return (cb); } return (NULL); } /* Allocate the next free transfer tag and assign it to cb. */ static void nvmf_tcp_allocate_ttag(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_command_buffer *cb) { uint16_t ttag; mtx_assert(&qp->rx_buffers.lock, MA_OWNED); ttag = qp->next_ttag; for (;;) { if (qp->open_ttags[ttag] == NULL) break; if (ttag == qp->num_ttags - 1) ttag = 0; else ttag++; MPASS(ttag != qp->next_ttag); } if (ttag == qp->num_ttags - 1) qp->next_ttag = 0; else qp->next_ttag = ttag + 1; cb->tc->active_r2ts++; qp->active_ttags++; qp->open_ttags[ttag] = cb; /* * Don't bother byte-swapping ttag as it is just a cookie * value returned by the other end as-is. */ cb->ttag = ttag; } /* NB: cid and ttag are both little-endian already. */ static void tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, uint32_t data_offset, uint32_t data_len) { struct nvme_tcp_r2t_hdr r2t; struct mbuf *m; memset(&r2t, 0, sizeof(r2t)); r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T; r2t.cccid = cid; r2t.ttag = ttag; r2t.r2to = htole32(data_offset); r2t.r2tl = htole32(data_len); m = nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0); nvmf_tcp_write_pdu(qp, m); } /* * Release a transfer tag and schedule another R2T. * * NB: This drops the rx_buffers.lock mutex. */ static void nvmf_tcp_send_next_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_command_buffer *cb) { struct nvmf_tcp_command_buffer *ncb; mtx_assert(&qp->rx_buffers.lock, MA_OWNED); MPASS(qp->open_ttags[cb->ttag] == cb); /* Release this transfer tag. */ qp->open_ttags[cb->ttag] = NULL; qp->active_ttags--; cb->tc->active_r2ts--; /* Schedule another R2T. */ ncb = nvmf_tcp_next_r2t(qp); if (ncb != NULL) { nvmf_tcp_allocate_ttag(qp, ncb); mtx_unlock(&qp->rx_buffers.lock); tcp_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset, ncb->data_len); } else mtx_unlock(&qp->rx_buffers.lock); } /* * Copy len bytes starting at offset skip from an mbuf chain into an * I/O buffer at destination offset io_offset. */ static void mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len, struct nvmf_io_request *io, u_int io_offset) { u_int todo; while (m->m_len <= skip) { skip -= m->m_len; m = m->m_next; } while (len != 0) { MPASS((m->m_flags & M_EXTPG) == 0); todo = min(m->m_len - skip, len); memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip)); skip = 0; io_offset += todo; len -= todo; m = m->m_next; } } static int nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { const struct nvme_tcp_h2c_data_hdr *h2c; struct nvmf_tcp_command_buffer *cb; uint32_t data_len, data_offset; uint16_t ttag; h2c = (const void *)pdu->hdr; if (le32toh(h2c->datal) > qp->maxh2cdata) { nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0, pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } /* * NB: Don't bother byte-swapping ttag as we don't byte-swap * it when sending. 
*/ ttag = h2c->ttag; if (ttag >= qp->num_ttags) { nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } mtx_lock(&qp->rx_buffers.lock); cb = qp->open_ttags[ttag]; if (cb == NULL) { mtx_unlock(&qp->rx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } MPASS(cb->ttag == ttag); /* For a data digest mismatch, fail the I/O request. */ if (pdu->data_digest_mismatch) { nvmf_tcp_send_next_r2t(qp, cb); cb->error = EINTEGRITY; tcp_release_command_buffer(cb); nvmf_tcp_free_pdu(pdu); return (0); } data_len = le32toh(h2c->datal); if (data_len != pdu->data_len) { mtx_unlock(&qp->rx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } data_offset = le32toh(h2c->datao); if (data_offset < cb->data_offset || data_offset + data_len > cb->data_offset + cb->data_len) { mtx_unlock(&qp->rx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } if (data_offset != cb->data_offset + cb->data_xfered) { mtx_unlock(&qp->rx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } if ((cb->data_xfered + data_len == cb->data_len) != ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) { mtx_unlock(&qp->rx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } cb->data_xfered += data_len; data_offset -= cb->data_offset; if (cb->data_xfered == cb->data_len) { nvmf_tcp_send_next_r2t(qp, cb); } else { tcp_hold_command_buffer(cb); mtx_unlock(&qp->rx_buffers.lock); } mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset); tcp_release_command_buffer(cb); nvmf_tcp_free_pdu(pdu); return (0); } static int nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { const struct nvme_tcp_c2h_data_hdr *c2h; struct nvmf_tcp_command_buffer *cb; uint32_t data_len, data_offset; c2h = (const void *)pdu->hdr; mtx_lock(&qp->rx_buffers.lock); cb = tcp_find_command_buffer(&qp->rx_buffers, c2h->cccid, 0); if (cb == NULL) { mtx_unlock(&qp->rx_buffers.lock); /* * XXX: Could be PDU sequence error if cccid is for a * command that doesn't use a command buffer. */ nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } /* For a data digest mismatch, fail the I/O request. 
*/ if (pdu->data_digest_mismatch) { cb->error = EINTEGRITY; tcp_remove_command_buffer(&qp->rx_buffers, cb); mtx_unlock(&qp->rx_buffers.lock); tcp_release_command_buffer(cb); nvmf_tcp_free_pdu(pdu); return (0); } data_len = le32toh(c2h->datal); if (data_len != pdu->data_len) { mtx_unlock(&qp->rx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } data_offset = le32toh(c2h->datao); if (data_offset < cb->data_offset || data_offset + data_len > cb->data_offset + cb->data_len) { mtx_unlock(&qp->rx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } if (data_offset != cb->data_offset + cb->data_xfered) { mtx_unlock(&qp->rx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } if ((cb->data_xfered + data_len == cb->data_len) != ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) { mtx_unlock(&qp->rx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } cb->data_xfered += data_len; data_offset -= cb->data_offset; if (cb->data_xfered == cb->data_len) tcp_remove_command_buffer(&qp->rx_buffers, cb); else tcp_hold_command_buffer(cb); mtx_unlock(&qp->rx_buffers.lock); mbuf_copyto_io(pdu->m, pdu->hdr->pdo, data_len, &cb->io, data_offset); tcp_release_command_buffer(cb); if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { struct nvme_completion cqe; struct nvmf_capsule *nc; memset(&cqe, 0, sizeof(cqe)); cqe.cid = c2h->cccid; nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK); nc->nc_sqhd_valid = false; nvmf_capsule_received(&qp->qp, nc); } nvmf_tcp_free_pdu(pdu); return (0); } /* Called when m_free drops refcount to 0. */ static void nvmf_tcp_mbuf_done(struct mbuf *m) { struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1; tcp_free_command_buffer(cb); } static struct mbuf * nvmf_tcp_mbuf(void *arg, int how, void *data, size_t len) { struct nvmf_tcp_command_buffer *cb = arg; struct mbuf *m; m = m_get(how, MT_DATA); m->m_flags |= M_RDONLY; m_extaddref(m, data, len, &cb->refs, nvmf_tcp_mbuf_done, cb, NULL); m->m_len = len; return (m); } static void nvmf_tcp_free_mext_pg(struct mbuf *m) { struct nvmf_tcp_command_buffer *cb = m->m_ext.ext_arg1; M_ASSERTEXTPG(m); tcp_release_command_buffer(cb); } static struct mbuf * nvmf_tcp_mext_pg(void *arg, int how) { struct nvmf_tcp_command_buffer *cb = arg; struct mbuf *m; m = mb_alloc_ext_pgs(how, nvmf_tcp_free_mext_pg, M_RDONLY); m->m_ext.ext_arg1 = cb; tcp_hold_command_buffer(cb); return (m); } /* * Return an mbuf chain for a range of data belonging to a command * buffer. * * The mbuf chain uses M_EXT mbufs which hold references on the * command buffer so that it remains "alive" until the data has been * fully transmitted. If truncate_ok is true, then the mbuf chain * might return a short chain to avoid gratuitously splitting up a * page. 
*/ static struct mbuf * nvmf_tcp_command_buffer_mbuf(struct nvmf_tcp_command_buffer *cb, uint32_t data_offset, uint32_t data_len, uint32_t *actual_len, bool can_truncate) { struct mbuf *m; size_t len; m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_tcp_mbuf, nvmf_tcp_mext_pg, cb, M_WAITOK, data_offset, data_len, &len, can_truncate); if (actual_len != NULL) *actual_len = len; return (m); } /* NB: cid and ttag and little-endian already. */ static void tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu) { struct nvme_tcp_h2c_data_hdr h2c; struct mbuf *top; memset(&h2c, 0, sizeof(h2c)); h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA; if (last_pdu) h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; h2c.cccid = cid; h2c.ttag = ttag; h2c.datao = htole32(data_offset); h2c.datal = htole32(len); top = nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), m, len); nvmf_tcp_write_pdu(qp, top); } static int nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) { const struct nvme_tcp_r2t_hdr *r2t; struct nvmf_tcp_command_buffer *cb; uint32_t data_len, data_offset; r2t = (const void *)pdu->hdr; mtx_lock(&qp->tx_buffers.lock); cb = tcp_find_command_buffer(&qp->tx_buffers, r2t->cccid, 0); if (cb == NULL) { mtx_unlock(&qp->tx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } data_offset = le32toh(r2t->r2to); if (data_offset != cb->data_xfered) { mtx_unlock(&qp->tx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } /* * XXX: The spec does not specify how to handle R2T tranfers * out of range of the original command. */ data_len = le32toh(r2t->r2tl); if (data_offset + data_len > cb->data_len) { mtx_unlock(&qp->tx_buffers.lock); nvmf_tcp_report_error(qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m, pdu->hdr->hlen); nvmf_tcp_free_pdu(pdu); return (EBADMSG); } cb->data_xfered += data_len; if (cb->data_xfered == cb->data_len) tcp_remove_command_buffer(&qp->tx_buffers, cb); else tcp_hold_command_buffer(cb); mtx_unlock(&qp->tx_buffers.lock); /* * Queue one or more H2C_DATA PDUs containing the requested * data. */ while (data_len > 0) { struct mbuf *m; uint32_t sent, todo; todo = min(data_len, qp->max_tx_data); m = nvmf_tcp_command_buffer_mbuf(cb, data_offset, todo, &sent, todo < data_len); tcp_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m, sent, sent == data_len); data_offset += sent; data_len -= sent; } tcp_release_command_buffer(cb); nvmf_tcp_free_pdu(pdu); return (0); } /* * A variant of m_pullup that uses M_WAITOK instead of failing. It * also doesn't do anything if enough bytes are already present in the * first mbuf. */ static struct mbuf * pullup_pdu_hdr(struct mbuf *m, int len) { struct mbuf *n, *p; KASSERT(len <= MCLBYTES, ("%s: len too large", __func__)); if (m->m_len >= len) return (m); n = m_get2(len, M_WAITOK, MT_DATA, 0); n->m_len = len; m_copydata(m, 0, len, mtod(n, void *)); while (m != NULL && m->m_len <= len) { p = m->m_next; len -= m->m_len; m_free(m); m = p; } if (len > 0) { m->m_data += len; m->m_len -= len; } n->m_next = m; return (n); } static int nvmf_tcp_dispatch_pdu(struct nvmf_tcp_qpair *qp, const struct nvme_tcp_common_pdu_hdr *ch, struct nvmf_tcp_rxpdu *pdu) { /* Ensure the PDU header is contiguous. 
*/ pdu->m = pullup_pdu_hdr(pdu->m, ch->hlen); pdu->hdr = mtod(pdu->m, const void *); switch (ch->pdu_type) { default: __assert_unreachable(); break; case NVME_TCP_PDU_TYPE_H2C_TERM_REQ: case NVME_TCP_PDU_TYPE_C2H_TERM_REQ: return (nvmf_tcp_handle_term_req(pdu)); case NVME_TCP_PDU_TYPE_CAPSULE_CMD: return (nvmf_tcp_save_command_capsule(qp, pdu)); case NVME_TCP_PDU_TYPE_CAPSULE_RESP: return (nvmf_tcp_save_response_capsule(qp, pdu)); case NVME_TCP_PDU_TYPE_H2C_DATA: return (nvmf_tcp_handle_h2c_data(qp, pdu)); case NVME_TCP_PDU_TYPE_C2H_DATA: return (nvmf_tcp_handle_c2h_data(qp, pdu)); case NVME_TCP_PDU_TYPE_R2T: return (nvmf_tcp_handle_r2t(qp, pdu)); } } static void nvmf_tcp_receive(void *arg) { struct nvmf_tcp_qpair *qp = arg; struct socket *so = qp->so; struct nvmf_tcp_rxpdu pdu; struct nvme_tcp_common_pdu_hdr ch; struct uio uio; struct iovec iov[1]; struct mbuf *m, *n, *tail; u_int avail, needed; int error, flags, terror; bool have_header; m = tail = NULL; have_header = false; SOCKBUF_LOCK(&so->so_rcv); while (!qp->rx_shutdown) { /* Wait until there is enough data for the next step. */ if (so->so_error != 0 || so->so_rerror != 0) { if (so->so_error != 0) error = so->so_error; else error = so->so_rerror; SOCKBUF_UNLOCK(&so->so_rcv); error: m_freem(m); nvmf_qpair_error(&qp->qp, error); SOCKBUF_LOCK(&so->so_rcv); while (!qp->rx_shutdown) cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv)); break; } avail = sbavail(&so->so_rcv); if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) { if (!have_header && avail == 0) error = 0; else error = ECONNRESET; SOCKBUF_UNLOCK(&so->so_rcv); goto error; } if (avail == 0 || (!have_header && avail < sizeof(ch))) { cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv)); continue; } SOCKBUF_UNLOCK(&so->so_rcv); if (!have_header) { KASSERT(m == NULL, ("%s: m != NULL but no header", __func__)); memset(&uio, 0, sizeof(uio)); iov[0].iov_base = &ch; iov[0].iov_len = sizeof(ch); uio.uio_iov = iov; uio.uio_iovcnt = 1; uio.uio_resid = sizeof(ch); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; flags = MSG_DONTWAIT | MSG_PEEK; error = soreceive(so, NULL, &uio, NULL, NULL, &flags); if (error != 0) goto error; KASSERT(uio.uio_resid == 0, ("%s: short CH read", __func__)); have_header = true; needed = le32toh(ch.plen); /* * Malformed PDUs will be reported as errors * by nvmf_tcp_validate_pdu. Just pass along * garbage headers if the lengths mismatch. */ if (needed < sizeof(ch) || ch.hlen > needed) needed = sizeof(ch); memset(&uio, 0, sizeof(uio)); uio.uio_resid = needed; } flags = MSG_DONTWAIT; error = soreceive(so, NULL, &uio, &n, NULL, &flags); if (error != 0) goto error; if (m == NULL) m = n; else tail->m_next = n; if (uio.uio_resid != 0) { tail = n; while (tail->m_next != NULL) tail = tail->m_next; SOCKBUF_LOCK(&so->so_rcv); continue; } #ifdef INVARIANTS tail = NULL; #endif pdu.m = m; m = NULL; pdu.hdr = &ch; error = nvmf_tcp_validate_pdu(qp, &pdu); if (error != 0) m_freem(pdu.m); else error = nvmf_tcp_dispatch_pdu(qp, &ch, &pdu); if (error != 0) { /* * If we received a termination request, close * the connection immediately. */ if (error == ECONNRESET) goto error; /* * Wait for up to 30 seconds for the socket to * be closed by the other end. 
*/ SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { terror = cv_timedwait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv), 30 * hz); if (terror == ETIMEDOUT) printf("NVMe/TCP: Timed out after sending terminate request\n"); } SOCKBUF_UNLOCK(&so->so_rcv); goto error; } have_header = false; SOCKBUF_LOCK(&so->so_rcv); } SOCKBUF_UNLOCK(&so->so_rcv); kthread_exit(); } static struct mbuf * tcp_command_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc) { struct nvmf_capsule *nc = &tc->nc; struct nvmf_tcp_command_buffer *cb; struct nvme_sgl_descriptor *sgl; struct nvme_tcp_cmd cmd; struct mbuf *top, *m; bool use_icd; use_icd = false; cb = NULL; m = NULL; if (nc->nc_data.io_len != 0) { cb = tcp_alloc_command_buffer(qp, &nc->nc_data, 0, nc->nc_data.io_len, nc->nc_sqe.cid); if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) { use_icd = true; m = nvmf_tcp_command_buffer_mbuf(cb, 0, nc->nc_data.io_len, NULL, false); cb->data_xfered = nc->nc_data.io_len; tcp_release_command_buffer(cb); } else if (nc->nc_send_data) { mtx_lock(&qp->tx_buffers.lock); tcp_add_command_buffer(&qp->tx_buffers, cb); mtx_unlock(&qp->tx_buffers.lock); } else { mtx_lock(&qp->rx_buffers.lock); tcp_add_command_buffer(&qp->rx_buffers, cb); mtx_unlock(&qp->rx_buffers.lock); } } memset(&cmd, 0, sizeof(cmd)); cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD; cmd.ccsqe = nc->nc_sqe; /* Populate SGL in SQE. */ sgl = &cmd.ccsqe.sgl; memset(sgl, 0, sizeof(*sgl)); sgl->address = 0; sgl->length = htole32(nc->nc_data.io_len); if (use_icd) { /* Use in-capsule data. */ sgl->type = NVME_SGL_TYPE_ICD; } else { /* Use a command buffer. */ sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER; } top = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ? nc->nc_data.io_len : 0); return (top); } static struct mbuf * tcp_response_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc) { struct nvmf_capsule *nc = &tc->nc; struct nvme_tcp_rsp rsp; memset(&rsp, 0, sizeof(rsp)); rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP; rsp.rccqe = nc->nc_cqe; return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0)); } static struct mbuf * capsule_to_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_capsule *tc) { if (tc->nc.nc_qe_len == sizeof(struct nvme_command)) return (tcp_command_pdu(qp, tc)); else return (tcp_response_pdu(qp, tc)); } static void nvmf_tcp_send(void *arg) { struct nvmf_tcp_qpair *qp = arg; struct nvmf_tcp_capsule *tc; struct socket *so = qp->so; struct mbuf *m, *n, *p; u_long space, tosend; int error; m = NULL; SOCKBUF_LOCK(&so->so_snd); while (!qp->tx_shutdown) { if (so->so_error != 0) { error = so->so_error; SOCKBUF_UNLOCK(&so->so_snd); error: m_freem(m); nvmf_qpair_error(&qp->qp, error); SOCKBUF_LOCK(&so->so_snd); while (!qp->tx_shutdown) cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); break; } if (m == NULL) { /* Next PDU to send. */ m = mbufq_dequeue(&qp->tx_pdus); } if (m == NULL) { if (STAILQ_EMPTY(&qp->tx_capsules)) { cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); continue; } /* Convert a capsule into a PDU. */ tc = STAILQ_FIRST(&qp->tx_capsules); STAILQ_REMOVE_HEAD(&qp->tx_capsules, link); SOCKBUF_UNLOCK(&so->so_snd); n = capsule_to_pdu(qp, tc); tcp_release_capsule(tc); SOCKBUF_LOCK(&so->so_snd); mbufq_enqueue(&qp->tx_pdus, n); continue; } /* * Wait until there is enough room to send some data. * If the socket buffer is empty, always send at least * something. 
*/ space = sbspace(&so->so_snd); if (space < m->m_len && sbused(&so->so_snd) != 0) { cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); continue; } SOCKBUF_UNLOCK(&so->so_snd); /* * If 'm' is too big, then the socket buffer must be * empty. Split 'm' to make at least some forward * progress. * * Otherwise, chain up as many pending mbufs from 'm' * that will fit. */ if (m->m_len > space) { n = m_split(m, space, M_WAITOK); } else { tosend = m->m_len; n = m->m_next; p = m; while (n != NULL && tosend + n->m_len <= space) { tosend += n->m_len; p = n; n = n->m_next; } KASSERT(p->m_next == n, ("%s: p not before n", __func__)); p->m_next = NULL; KASSERT(m_length(m, NULL) == tosend, ("%s: length mismatch", __func__)); } error = sosend(so, NULL, NULL, m, NULL, MSG_DONTWAIT, NULL); if (error != 0) { m = NULL; m_freem(n); goto error; } m = n; SOCKBUF_LOCK(&so->so_snd); } SOCKBUF_UNLOCK(&so->so_snd); kthread_exit(); } static int nvmf_soupcall_receive(struct socket *so, void *arg, int waitflag) { struct nvmf_tcp_qpair *qp = arg; if (soreadable(so)) cv_signal(&qp->rx_cv); return (SU_OK); } static int nvmf_soupcall_send(struct socket *so, void *arg, int waitflag) { struct nvmf_tcp_qpair *qp = arg; if (sowriteable(so)) cv_signal(&qp->tx_cv); return (SU_OK); } static struct nvmf_qpair * -tcp_allocate_qpair(bool controller, - const struct nvmf_handoff_qpair_params *params) +tcp_allocate_qpair(bool controller, const nvlist_t *nvl) { struct nvmf_tcp_qpair *qp; struct socket *so; struct file *fp; cap_rights_t rights; int error; - error = fget(curthread, params->tcp.fd, cap_rights_init_one(&rights, - CAP_SOCK_CLIENT), &fp); + if (!nvlist_exists_number(nvl, "fd") || + !nvlist_exists_number(nvl, "rxpda") || + !nvlist_exists_number(nvl, "txpda") || + !nvlist_exists_bool(nvl, "header_digests") || + !nvlist_exists_bool(nvl, "data_digests") || + !nvlist_exists_number(nvl, "maxr2t") || + !nvlist_exists_number(nvl, "maxh2cdata") || + !nvlist_exists_number(nvl, "max_icd")) + return (NULL); + + error = fget(curthread, nvlist_get_number(nvl, "fd"), + cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp); if (error != 0) return (NULL); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, curthread); return (NULL); } so = fp->f_data; if (so->so_type != SOCK_STREAM || so->so_proto->pr_protocol != IPPROTO_TCP) { fdrop(fp, curthread); return (NULL); } /* Claim socket from file descriptor. */ fp->f_ops = &badfileops; fp->f_data = NULL; fdrop(fp, curthread); qp = malloc(sizeof(*qp), M_NVMF_TCP, M_WAITOK | M_ZERO); qp->so = so; refcount_init(&qp->refs, 1); - qp->txpda = params->tcp.txpda; - qp->rxpda = params->tcp.rxpda; - qp->header_digests = params->tcp.header_digests; - qp->data_digests = params->tcp.data_digests; - qp->maxr2t = params->tcp.maxr2t; + qp->txpda = nvlist_get_number(nvl, "txpda"); + qp->rxpda = nvlist_get_number(nvl, "rxpda"); + qp->header_digests = nvlist_get_bool(nvl, "header_digests"); + qp->data_digests = nvlist_get_bool(nvl, "data_digests"); + qp->maxr2t = nvlist_get_number(nvl, "maxr2t"); if (controller) - qp->maxh2cdata = params->tcp.maxh2cdata; + qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata"); qp->max_tx_data = tcp_max_transmit_data; if (!controller) { - qp->max_tx_data = min(qp->max_tx_data, params->tcp.maxh2cdata); - qp->max_icd = params->tcp.max_icd; + qp->max_tx_data = min(qp->max_tx_data, + nvlist_get_number(nvl, "maxh2cdata")); + qp->max_icd = nvlist_get_number(nvl, "max_icd"); } if (controller) { /* Use the SUCCESS flag if SQ flow control is disabled. 
*/ - qp->send_success = !params->sq_flow_control; + qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control"); /* NB: maxr2t is 0's based. */ qp->num_ttags = MIN((u_int)UINT16_MAX + 1, - (uint64_t)params->qsize * ((uint64_t)qp->maxr2t + 1)); + nvlist_get_number(nvl, "qsize") * + ((uint64_t)qp->maxr2t + 1)); qp->open_ttags = mallocarray(qp->num_ttags, sizeof(*qp->open_ttags), M_NVMF_TCP, M_WAITOK | M_ZERO); } TAILQ_INIT(&qp->rx_buffers.head); TAILQ_INIT(&qp->tx_buffers.head); mtx_init(&qp->rx_buffers.lock, "nvmf/tcp rx buffers", NULL, MTX_DEF); mtx_init(&qp->tx_buffers.lock, "nvmf/tcp tx buffers", NULL, MTX_DEF); cv_init(&qp->rx_cv, "-"); cv_init(&qp->tx_cv, "-"); mbufq_init(&qp->tx_pdus, 0); STAILQ_INIT(&qp->tx_capsules); /* Register socket upcalls. */ SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, nvmf_soupcall_receive, qp); SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); soupcall_set(so, SO_SND, nvmf_soupcall_send, qp); SOCKBUF_UNLOCK(&so->so_snd); /* Spin up kthreads. */ error = kthread_add(nvmf_tcp_receive, qp, NULL, &qp->rx_thread, 0, 0, "nvmef tcp rx"); if (error != 0) { tcp_free_qpair(&qp->qp); return (NULL); } error = kthread_add(nvmf_tcp_send, qp, NULL, &qp->tx_thread, 0, 0, "nvmef tcp tx"); if (error != 0) { tcp_free_qpair(&qp->qp); return (NULL); } return (&qp->qp); } static void tcp_release_qpair(struct nvmf_tcp_qpair *qp) { if (refcount_release(&qp->refs)) free(qp, M_NVMF_TCP); } static void tcp_free_qpair(struct nvmf_qpair *nq) { struct nvmf_tcp_qpair *qp = TQP(nq); struct nvmf_tcp_command_buffer *ncb, *cb; struct nvmf_tcp_capsule *ntc, *tc; struct socket *so = qp->so; /* Shut down kthreads and clear upcalls */ SOCKBUF_LOCK(&so->so_snd); qp->tx_shutdown = true; if (qp->tx_thread != NULL) { cv_signal(&qp->tx_cv); mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0, "nvtcptx", 0); } soupcall_clear(so, SO_SND); SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); qp->rx_shutdown = true; if (qp->rx_thread != NULL) { cv_signal(&qp->rx_cv); mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0, "nvtcprx", 0); } soupcall_clear(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); STAILQ_FOREACH_SAFE(tc, &qp->tx_capsules, link, ntc) { nvmf_abort_capsule_data(&tc->nc, ECONNABORTED); tcp_release_capsule(tc); } mbufq_drain(&qp->tx_pdus); cv_destroy(&qp->tx_cv); cv_destroy(&qp->rx_cv); if (qp->open_ttags != NULL) { for (u_int i = 0; i < qp->num_ttags; i++) { cb = qp->open_ttags[i]; if (cb != NULL) { cb->tc->active_r2ts--; cb->error = ECONNABORTED; tcp_release_command_buffer(cb); } } free(qp->open_ttags, M_NVMF_TCP); } mtx_lock(&qp->rx_buffers.lock); TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) { tcp_remove_command_buffer(&qp->rx_buffers, cb); mtx_unlock(&qp->rx_buffers.lock); #ifdef INVARIANTS if (cb->tc != NULL) cb->tc->pending_r2ts--; #endif cb->error = ECONNABORTED; tcp_release_command_buffer(cb); mtx_lock(&qp->rx_buffers.lock); } mtx_destroy(&qp->rx_buffers.lock); mtx_lock(&qp->tx_buffers.lock); TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) { tcp_remove_command_buffer(&qp->tx_buffers, cb); mtx_unlock(&qp->tx_buffers.lock); cb->error = ECONNABORTED; tcp_release_command_buffer(cb); mtx_lock(&qp->tx_buffers.lock); } mtx_destroy(&qp->tx_buffers.lock); soclose(so); tcp_release_qpair(qp); } static struct nvmf_capsule * tcp_allocate_capsule(struct nvmf_qpair *nq, int how) { struct nvmf_tcp_qpair *qp = TQP(nq); struct nvmf_tcp_capsule *tc; tc = malloc(sizeof(*tc), M_NVMF_TCP, how | M_ZERO); if (tc == NULL) return (NULL); refcount_init(&tc->refs, 1); 
refcount_acquire(&qp->refs); return (&tc->nc); } static void tcp_release_capsule(struct nvmf_tcp_capsule *tc) { struct nvmf_tcp_qpair *qp = TQP(tc->nc.nc_qpair); if (!refcount_release(&tc->refs)) return; MPASS(tc->active_r2ts == 0); MPASS(tc->pending_r2ts == 0); nvmf_tcp_free_pdu(&tc->rx_pdu); free(tc, M_NVMF_TCP); tcp_release_qpair(qp); } static void tcp_free_capsule(struct nvmf_capsule *nc) { struct nvmf_tcp_capsule *tc = TCAP(nc); tcp_release_capsule(tc); } static int tcp_transmit_capsule(struct nvmf_capsule *nc) { struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); struct nvmf_tcp_capsule *tc = TCAP(nc); struct socket *so = qp->so; refcount_acquire(&tc->refs); SOCKBUF_LOCK(&so->so_snd); STAILQ_INSERT_TAIL(&qp->tx_capsules, tc, link); if (sowriteable(so)) cv_signal(&qp->tx_cv); SOCKBUF_UNLOCK(&so->so_snd); return (0); } static uint8_t tcp_validate_command_capsule(struct nvmf_capsule *nc) { struct nvmf_tcp_capsule *tc = TCAP(nc); struct nvme_sgl_descriptor *sgl; KASSERT(tc->rx_pdu.hdr != NULL, ("capsule wasn't received")); sgl = &nc->nc_sqe.sgl; switch (sgl->type) { case NVME_SGL_TYPE_ICD: if (tc->rx_pdu.data_len != le32toh(sgl->length)) { printf("NVMe/TCP: Command Capsule with mismatched ICD length\n"); return (NVME_SC_DATA_SGL_LENGTH_INVALID); } break; case NVME_SGL_TYPE_COMMAND_BUFFER: if (tc->rx_pdu.data_len != 0) { printf("NVMe/TCP: Command Buffer SGL with ICD\n"); return (NVME_SC_INVALID_FIELD); } break; default: printf("NVMe/TCP: Invalid SGL type in Command Capsule\n"); return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID); } if (sgl->address != 0) { printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n"); return (NVME_SC_SGL_OFFSET_INVALID); } return (NVME_SC_SUCCESS); } static size_t tcp_capsule_data_len(const struct nvmf_capsule *nc) { MPASS(nc->nc_qe_len == sizeof(struct nvme_command)); return (le32toh(nc->nc_sqe.sgl.length)); } static void tcp_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset, struct nvmf_io_request *io) { struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); struct nvmf_tcp_capsule *tc = TCAP(nc); struct nvmf_tcp_command_buffer *cb; cb = tcp_alloc_command_buffer(qp, io, data_offset, io->io_len, nc->nc_sqe.cid); cb->tc = tc; refcount_acquire(&tc->refs); /* * If this command has too many active R2Ts or there are no * available transfer tags, queue the request for later. * * NB: maxr2t is 0's based. 
*/ mtx_lock(&qp->rx_buffers.lock); if (tc->active_r2ts > qp->maxr2t || qp->active_ttags == qp->num_ttags) { #ifdef INVARIANTS tc->pending_r2ts++; #endif TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link); mtx_unlock(&qp->rx_buffers.lock); return; } nvmf_tcp_allocate_ttag(qp, cb); mtx_unlock(&qp->rx_buffers.lock); tcp_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len); } static void tcp_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset, struct nvmf_io_request *io) { struct nvmf_tcp_capsule *tc = TCAP(nc); mbuf_copyto_io(tc->rx_pdu.m, tc->rx_pdu.hdr->pdo + data_offset, io->io_len, io, 0); nvmf_complete_io_request(io, io->io_len, 0); } static int tcp_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct nvmf_io_request *io) { struct nvme_sgl_descriptor *sgl; size_t data_len; if (nc->nc_qe_len != sizeof(struct nvme_command) || !nc->nc_qpair->nq_controller) return (EINVAL); sgl = &nc->nc_sqe.sgl; data_len = le32toh(sgl->length); if (data_offset + io->io_len > data_len) return (EFBIG); if (sgl->type == NVME_SGL_TYPE_ICD) tcp_receive_icd_data(nc, data_offset, io); else tcp_receive_r2t_data(nc, data_offset, io); return (0); } /* NB: cid is little-endian already. */ static void tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu, bool success) { struct nvme_tcp_c2h_data_hdr c2h; struct mbuf *top; memset(&c2h, 0, sizeof(c2h)); c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA; if (last_pdu) c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU; if (success) c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS; c2h.cccid = cid; c2h.datao = htole32(data_offset); c2h.datal = htole32(len); top = nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h), m, len); nvmf_tcp_write_pdu(qp, top); } static u_int tcp_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct mbuf *m, size_t len) { struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); struct nvme_sgl_descriptor *sgl; uint32_t data_len; bool last_pdu, last_xfer; if (nc->nc_qe_len != sizeof(struct nvme_command) || !qp->qp.nq_controller) { m_freem(m); return (NVME_SC_INVALID_FIELD); } sgl = &nc->nc_sqe.sgl; data_len = le32toh(sgl->length); if (data_offset + len > data_len) { m_freem(m); return (NVME_SC_INVALID_FIELD); } last_xfer = (data_offset + len == data_len); if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) { m_freem(m); return (NVME_SC_INVALID_FIELD); } KASSERT(data_offset == TCAP(nc)->tx_data_offset, ("%s: starting data_offset %u doesn't match end of previous xfer %u", __func__, data_offset, TCAP(nc)->tx_data_offset)); /* Queue one more C2H_DATA PDUs containing the data from 'm'. 
*/ while (m != NULL) { struct mbuf *n; uint32_t todo; if (m->m_len > qp->max_tx_data) { n = m_split(m, qp->max_tx_data, M_WAITOK); todo = m->m_len; } else { struct mbuf *p; todo = m->m_len; p = m; n = p->m_next; while (n != NULL) { if (todo + n->m_len > qp->max_tx_data) { p->m_next = NULL; break; } todo += n->m_len; p = n; n = p->m_next; } MPASS(m_length(m, NULL) == todo); } last_pdu = (n == NULL && last_xfer); tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo, last_pdu, last_pdu && qp->send_success); data_offset += todo; data_len -= todo; m = n; } MPASS(data_len == 0); #ifdef INVARIANTS TCAP(nc)->tx_data_offset = data_offset; #endif if (!last_xfer) return (NVMF_MORE); else if (qp->send_success) return (NVMF_SUCCESS_SENT); else return (NVME_SC_SUCCESS); } struct nvmf_transport_ops tcp_ops = { .allocate_qpair = tcp_allocate_qpair, .free_qpair = tcp_free_qpair, .allocate_capsule = tcp_allocate_capsule, .free_capsule = tcp_free_capsule, .transmit_capsule = tcp_transmit_capsule, .validate_command_capsule = tcp_validate_command_capsule, .capsule_data_len = tcp_capsule_data_len, .receive_controller_data = tcp_receive_controller_data, .send_controller_data = tcp_send_controller_data, .trtype = NVMF_TRTYPE_TCP, .priority = 0, }; NVMF_TRANSPORT(tcp, tcp_ops); diff --git a/sys/dev/nvmf/nvmf_transport.c b/sys/dev/nvmf/nvmf_transport.c index 316d1571e61d..1d3f5ea4cf69 100644 --- a/sys/dev/nvmf/nvmf_transport.c +++ b/sys/dev/nvmf/nvmf_transport.c @@ -1,350 +1,436 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include /* Transport-independent support for fabrics queue pairs and commands. 
*/ struct nvmf_transport { struct nvmf_transport_ops *nt_ops; volatile u_int nt_active_qpairs; SLIST_ENTRY(nvmf_transport) nt_link; }; /* nvmf_transports[nvmf_trtype] is sorted by priority */ static SLIST_HEAD(, nvmf_transport) nvmf_transports[NVMF_TRTYPE_TCP + 1]; static struct sx nvmf_transports_lock; static MALLOC_DEFINE(M_NVMF_TRANSPORT, "nvmf_xport", "NVMe over Fabrics transport"); SYSCTL_NODE(_kern, OID_AUTO, nvmf, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "NVMe over Fabrics"); static bool nvmf_supported_trtype(enum nvmf_trtype trtype) { return (trtype < nitems(nvmf_transports)); } struct nvmf_qpair * nvmf_allocate_qpair(enum nvmf_trtype trtype, bool controller, - const struct nvmf_handoff_qpair_params *params, - nvmf_qpair_error_t *error_cb, void *error_cb_arg, + const nvlist_t *params, nvmf_qpair_error_t *error_cb, void *error_cb_arg, nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg) { struct nvmf_transport *nt; struct nvmf_qpair *qp; if (!nvmf_supported_trtype(trtype)) return (NULL); sx_slock(&nvmf_transports_lock); SLIST_FOREACH(nt, &nvmf_transports[trtype], nt_link) { qp = nt->nt_ops->allocate_qpair(controller, params); if (qp != NULL) { refcount_acquire(&nt->nt_active_qpairs); break; } } sx_sunlock(&nvmf_transports_lock); if (qp == NULL) return (NULL); qp->nq_transport = nt; qp->nq_ops = nt->nt_ops; qp->nq_controller = controller; qp->nq_error = error_cb; qp->nq_error_arg = error_cb_arg; qp->nq_receive = receive_cb; qp->nq_receive_arg = receive_cb_arg; - qp->nq_admin = params->admin; + qp->nq_admin = nvlist_get_bool(params, "admin"); return (qp); } void nvmf_free_qpair(struct nvmf_qpair *qp) { struct nvmf_transport *nt; nt = qp->nq_transport; qp->nq_ops->free_qpair(qp); if (refcount_release(&nt->nt_active_qpairs)) wakeup(nt); } struct nvmf_capsule * nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe, int how) { struct nvmf_capsule *nc; KASSERT(how == M_WAITOK || how == M_NOWAIT, ("%s: invalid how", __func__)); nc = qp->nq_ops->allocate_capsule(qp, how); if (nc == NULL) return (NULL); nc->nc_qpair = qp; nc->nc_qe_len = sizeof(struct nvme_command); memcpy(&nc->nc_sqe, sqe, nc->nc_qe_len); /* 4.2 of NVMe base spec: Fabrics always uses SGL. 
*/ nc->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT); nc->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL); return (nc); } struct nvmf_capsule * nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe, int how) { struct nvmf_capsule *nc; KASSERT(how == M_WAITOK || how == M_NOWAIT, ("%s: invalid how", __func__)); nc = qp->nq_ops->allocate_capsule(qp, how); if (nc == NULL) return (NULL); nc->nc_qpair = qp; nc->nc_qe_len = sizeof(struct nvme_completion); memcpy(&nc->nc_cqe, cqe, nc->nc_qe_len); return (nc); } int nvmf_capsule_append_data(struct nvmf_capsule *nc, struct memdesc *mem, size_t len, bool send, nvmf_io_complete_t *complete_cb, void *cb_arg) { if (nc->nc_data.io_len != 0) return (EBUSY); nc->nc_send_data = send; nc->nc_data.io_mem = *mem; nc->nc_data.io_len = len; nc->nc_data.io_complete = complete_cb; nc->nc_data.io_complete_arg = cb_arg; return (0); } void nvmf_free_capsule(struct nvmf_capsule *nc) { nc->nc_qpair->nq_ops->free_capsule(nc); } int nvmf_transmit_capsule(struct nvmf_capsule *nc) { return (nc->nc_qpair->nq_ops->transmit_capsule(nc)); } void nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error) { if (nc->nc_data.io_len != 0) nvmf_complete_io_request(&nc->nc_data, 0, error); } void * nvmf_capsule_sqe(struct nvmf_capsule *nc) { KASSERT(nc->nc_qe_len == sizeof(struct nvme_command), ("%s: capsule %p is not a command capsule", __func__, nc)); return (&nc->nc_sqe); } void * nvmf_capsule_cqe(struct nvmf_capsule *nc) { KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion), ("%s: capsule %p is not a response capsule", __func__, nc)); return (&nc->nc_cqe); } bool nvmf_sqhd_valid(struct nvmf_capsule *nc) { KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion), ("%s: capsule %p is not a response capsule", __func__, nc)); return (nc->nc_sqhd_valid); } uint8_t nvmf_validate_command_capsule(struct nvmf_capsule *nc) { KASSERT(nc->nc_qe_len == sizeof(struct nvme_command), ("%s: capsule %p is not a command capsule", __func__, nc)); if (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) != NVME_PSDT_SGL) return (NVME_SC_INVALID_FIELD); return (nc->nc_qpair->nq_ops->validate_command_capsule(nc)); } size_t nvmf_capsule_data_len(const struct nvmf_capsule *nc) { return (nc->nc_qpair->nq_ops->capsule_data_len(nc)); } int nvmf_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct memdesc *mem, size_t len, nvmf_io_complete_t *complete_cb, void *cb_arg) { struct nvmf_io_request io; io.io_mem = *mem; io.io_len = len; io.io_complete = complete_cb; io.io_complete_arg = cb_arg; return (nc->nc_qpair->nq_ops->receive_controller_data(nc, data_offset, &io)); } u_int nvmf_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct mbuf *m, size_t len) { MPASS(m_length(m, NULL) == len); return (nc->nc_qpair->nq_ops->send_controller_data(nc, data_offset, m, len)); } +int +nvmf_pack_ioc_nvlist(const nvlist_t *nvl, struct nvmf_ioc_nv *nv) +{ + void *packed; + int error; + + error = nvlist_error(nvl); + if (error != 0) + return (error); + + if (nv->size == 0) { + nv->len = nvlist_size(nvl); + } else { + packed = nvlist_pack(nvl, &nv->len); + if (packed == NULL) + error = ENOMEM; + else if (nv->len > nv->size) + error = EFBIG; + else + error = copyout(packed, nv->data, nv->len); + free(packed, M_NVLIST); + } + return (error); +} + +int +nvmf_unpack_ioc_nvlist(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp) +{ + void *packed; + nvlist_t *nvl; + int error; + + packed = malloc(nv->size, M_NVMF_TRANSPORT, M_WAITOK); + error = copyin(nv->data, packed, nv->size); + if (error != 0) { 
+ free(packed, M_NVMF_TRANSPORT); + return (error); + } + + nvl = nvlist_unpack(packed, nv->size, 0); + free(packed, M_NVMF_TRANSPORT); + if (nvl == NULL) + return (EINVAL); + + *nvlp = nvl; + return (0); +} + +bool +nvmf_validate_qpair_nvlist(const nvlist_t *nvl, bool controller) +{ + uint64_t value, qsize; + bool admin, valid; + + valid = true; + valid &= nvlist_exists_bool(nvl, "admin"); + valid &= nvlist_exists_bool(nvl, "sq_flow_control"); + valid &= nvlist_exists_number(nvl, "qsize"); + valid &= nvlist_exists_number(nvl, "sqhd"); + if (!controller) + valid &= nvlist_exists_number(nvl, "sqtail"); + if (!valid) + return (false); + + admin = nvlist_get_bool(nvl, "admin"); + qsize = nvlist_get_number(nvl, "qsize"); + if (admin) { + if (qsize < NVME_MIN_ADMIN_ENTRIES || + qsize > NVME_MAX_ADMIN_ENTRIES) + return (false); + } else { + if (qsize < NVME_MIN_IO_ENTRIES || qsize > NVME_MAX_IO_ENTRIES) + return (false); + } + value = nvlist_get_number(nvl, "sqhd"); + if (value > qsize - 1) + return (false); + if (!controller) { + value = nvlist_get_number(nvl, "sqtail"); + if (value > qsize - 1) + return (false); + } + + return (true); +} + int nvmf_transport_module_handler(struct module *mod, int what, void *arg) { struct nvmf_transport_ops *ops = arg; struct nvmf_transport *nt, *nt2, *prev; int error; switch (what) { case MOD_LOAD: if (!nvmf_supported_trtype(ops->trtype)) { printf("NVMF: Unsupported transport %u", ops->trtype); return (EINVAL); } nt = malloc(sizeof(*nt), M_NVMF_TRANSPORT, M_WAITOK | M_ZERO); nt->nt_ops = arg; sx_xlock(&nvmf_transports_lock); if (SLIST_EMPTY(&nvmf_transports[ops->trtype])) { SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype], nt, nt_link); } else { prev = NULL; SLIST_FOREACH(nt2, &nvmf_transports[ops->trtype], nt_link) { if (ops->priority > nt2->nt_ops->priority) break; prev = nt2; } if (prev == NULL) SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype], nt, nt_link); else SLIST_INSERT_AFTER(prev, nt, nt_link); } sx_xunlock(&nvmf_transports_lock); return (0); case MOD_QUIESCE: if (!nvmf_supported_trtype(ops->trtype)) return (0); sx_slock(&nvmf_transports_lock); SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) { if (nt->nt_ops == ops) break; } if (nt == NULL) { sx_sunlock(&nvmf_transports_lock); return (0); } if (nt->nt_active_qpairs != 0) { sx_sunlock(&nvmf_transports_lock); return (EBUSY); } sx_sunlock(&nvmf_transports_lock); return (0); case MOD_UNLOAD: if (!nvmf_supported_trtype(ops->trtype)) return (0); sx_xlock(&nvmf_transports_lock); prev = NULL; SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) { if (nt->nt_ops == ops) break; prev = nt; } if (nt == NULL) { sx_xunlock(&nvmf_transports_lock); return (0); } if (prev == NULL) SLIST_REMOVE_HEAD(&nvmf_transports[ops->trtype], nt_link); else SLIST_REMOVE_AFTER(prev, nt_link); error = 0; while (nt->nt_active_qpairs != 0 && error == 0) error = sx_sleep(nt, &nvmf_transports_lock, PCATCH, "nftunld", 0); sx_xunlock(&nvmf_transports_lock); if (error != 0) return (error); free(nt, M_NVMF_TRANSPORT); return (0); default: return (EOPNOTSUPP); } } static int nvmf_transport_modevent(module_t mod __unused, int what, void *arg __unused) { switch (what) { case MOD_LOAD: for (u_int i = 0; i < nitems(nvmf_transports); i++) SLIST_INIT(&nvmf_transports[i]); sx_init(&nvmf_transports_lock, "nvmf transports"); return (0); default: return (EOPNOTSUPP); } } static moduledata_t nvmf_transport_mod = { "nvmf_transport", nvmf_transport_modevent, 0 }; DECLARE_MODULE(nvmf_transport, nvmf_transport_mod, SI_SUB_DRIVERS, 
SI_ORDER_FIRST); MODULE_VERSION(nvmf_transport, 1); diff --git a/sys/dev/nvmf/nvmf_transport.h b/sys/dev/nvmf/nvmf_transport.h index bbd830eba576..b192baeaccc1 100644 --- a/sys/dev/nvmf/nvmf_transport.h +++ b/sys/dev/nvmf/nvmf_transport.h @@ -1,141 +1,161 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #ifndef __NVMF_TRANSPORT_H__ #define __NVMF_TRANSPORT_H__ /* * Interface used by the Fabrics host (initiator) and controller * (target) to send and receive capsules and associated data. */ +#include #include #include struct mbuf; struct memdesc; struct nvmf_capsule; struct nvmf_connection; +struct nvmf_ioc_nv; struct nvmf_qpair; -struct nvmf_handoff_qpair_params; SYSCTL_DECL(_kern_nvmf); /* * Callback to invoke when an error occurs on a qpair. The last * parameter is an error value. If the error value is zero, the qpair * has been closed at the transport level rather than a transport * error occuring. */ typedef void nvmf_qpair_error_t(void *, int); /* Callback to invoke when a capsule is received. */ typedef void nvmf_capsule_receive_t(void *, struct nvmf_capsule *); /* * Callback to invoke when an I/O request has completed. The second * parameter is the amount of data transferred. The last parameter is * an error value which is non-zero if the request did not complete * successfully. A request with an error may complete partially. */ typedef void nvmf_io_complete_t(void *, size_t, int); /* * A queue pair represents either an Admin or I/O * submission/completion queue pair. The params contains negotiated * values passed in from userland. * * Unlike libnvmf in userland, the kernel transport interface does not * have any notion of an association. Instead, qpairs are * independent. */ struct nvmf_qpair *nvmf_allocate_qpair(enum nvmf_trtype trtype, - bool controller, const struct nvmf_handoff_qpair_params *params, + bool controller, const nvlist_t *params, nvmf_qpair_error_t *error_cb, void *error_cb_arg, nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg); void nvmf_free_qpair(struct nvmf_qpair *qp); /* * Capsules are either commands (host -> controller) or responses * (controller -> host). A data buffer may be associated with a * command capsule. Transmitted data is not copied by this API but * instead must be preserved until the completion callback is invoked * to indicate capsule transmission has completed. */ struct nvmf_capsule *nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe, int how); struct nvmf_capsule *nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe, int how); void nvmf_free_capsule(struct nvmf_capsule *nc); int nvmf_capsule_append_data(struct nvmf_capsule *nc, struct memdesc *mem, size_t len, bool send, nvmf_io_complete_t *complete_cb, void *cb_arg); int nvmf_transmit_capsule(struct nvmf_capsule *nc); void nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error); void *nvmf_capsule_sqe(struct nvmf_capsule *nc); void *nvmf_capsule_cqe(struct nvmf_capsule *nc); bool nvmf_sqhd_valid(struct nvmf_capsule *nc); /* Controller-specific APIs. */ /* * A controller calls this function to check for any * transport-specific errors (invalid fields) in a received command * capsule. The callback returns a generic command status value: * NVME_SC_SUCCESS if no error is found. */ uint8_t nvmf_validate_command_capsule(struct nvmf_capsule *nc); /* * A controller calls this function to query the amount of data * associated with a command capsule. 
*/ size_t nvmf_capsule_data_len(const struct nvmf_capsule *cc); /* * A controller calls this function to receive data associated with a * command capsule (e.g. the data for a WRITE command). This can * either return in-capsule data or fetch data from the host * (e.g. using a R2T PDU over TCP). The received command capsule * should be passed in 'nc'. The received data is stored in 'mem'. * If this function returns success, then the callback will be invoked * once the operation has completed. Note that the callback might be * invoked before this function returns. */ int nvmf_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct memdesc *mem, size_t len, nvmf_io_complete_t *complete_cb, void *cb_arg); /* * A controller calls this function to send data in response to a * command prior to sending a response capsule. If an error occurs, * the function returns a generic status completion code to be sent in * the following CQE. Note that the transfer might send a subset of * the data requested by nc. If the transfer succeeds, this function * can return one of the following values: * * - NVME_SC_SUCCESS: The transfer has completed successfully and the * caller should send a success CQE in a response capsule. * * - NVMF_SUCCESS_SENT: The transfer has completed successfully and * the transport layer has sent an implicit success CQE to the * remote host (e.g. the SUCCESS flag for TCP). The caller should * not send a response capsule. * * - NVMF_MORE: The transfer has completed successfully, but the * transfer did not complete the data buffer. * * The mbuf chain in 'm' is consumed by this function even if an error * is returned. */ u_int nvmf_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct mbuf *m, size_t len); #define NVMF_SUCCESS_SENT 0x100 #define NVMF_MORE 0x101 +/* Helper APIs for nvlists used in ioctls. */ + +/* + * Pack the nvlist nvl and copyout to the buffer described by nv. + */ +int nvmf_pack_ioc_nvlist(const nvlist_t *nvl, struct nvmf_ioc_nv *nv); + +/* + * Copyin and unpack an nvlist described by nv. The unpacked nvlist + * is returned in *nvlp on success. + */ +int nvmf_unpack_ioc_nvlist(const struct nvmf_ioc_nv *nv, nvlist_t **nvlp); + +/* + * Returns true if a qpair handoff nvlist has all the required + * transport-independent values. + */ +bool nvmf_validate_qpair_nvlist(const nvlist_t *nvl, bool controller); + #endif /* !__NVMF_TRANSPORT_H__ */ diff --git a/sys/dev/nvmf/nvmf_transport_internal.h b/sys/dev/nvmf/nvmf_transport_internal.h index 0be427ee0690..eb819a5c83b9 100644 --- a/sys/dev/nvmf/nvmf_transport_internal.h +++ b/sys/dev/nvmf/nvmf_transport_internal.h @@ -1,128 +1,129 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #ifndef __NVMF_TRANSPORT_INTERNAL_H__ #define __NVMF_TRANSPORT_INTERNAL_H__ +#include #include /* * Interface between the transport-independent APIs in * nvmf_transport.c and individual transports. */ struct module; struct nvmf_io_request; struct nvmf_transport_ops { /* Queue pair management. */ struct nvmf_qpair *(*allocate_qpair)(bool controller, - const struct nvmf_handoff_qpair_params *params); + const nvlist_t *nvl); void (*free_qpair)(struct nvmf_qpair *qp); /* Capsule operations. 
*/ struct nvmf_capsule *(*allocate_capsule)(struct nvmf_qpair *qp, int how); void (*free_capsule)(struct nvmf_capsule *nc); int (*transmit_capsule)(struct nvmf_capsule *nc); uint8_t (*validate_command_capsule)(struct nvmf_capsule *nc); /* Transferring controller data. */ size_t (*capsule_data_len)(const struct nvmf_capsule *nc); int (*receive_controller_data)(struct nvmf_capsule *nc, uint32_t data_offset, struct nvmf_io_request *io); u_int (*send_controller_data)(struct nvmf_capsule *nc, uint32_t data_offset, struct mbuf *m, size_t len); enum nvmf_trtype trtype; int priority; }; /* Either an Admin or I/O Submission/Completion Queue pair. */ struct nvmf_qpair { struct nvmf_transport *nq_transport; struct nvmf_transport_ops *nq_ops; bool nq_controller; /* Callback to invoke for a received capsule. */ nvmf_capsule_receive_t *nq_receive; void *nq_receive_arg; /* Callback to invoke for an error. */ nvmf_qpair_error_t *nq_error; void *nq_error_arg; bool nq_admin; }; struct nvmf_io_request { /* * Data buffer contains io_len bytes in the backing store * described by mem. */ struct memdesc io_mem; size_t io_len; nvmf_io_complete_t *io_complete; void *io_complete_arg; }; /* * Fabrics Command and Response Capsules. The Fabrics host * (initiator) and controller (target) drivers work with capsules that * are transmitted and received by a specific transport. */ struct nvmf_capsule { struct nvmf_qpair *nc_qpair; /* Either a SQE or CQE. */ union { struct nvme_command nc_sqe; struct nvme_completion nc_cqe; }; int nc_qe_len; /* * Is SQHD in received capsule valid? False for locally- * synthesized responses. */ bool nc_sqhd_valid; bool nc_send_data; struct nvmf_io_request nc_data; }; static void __inline nvmf_qpair_error(struct nvmf_qpair *nq, int error) { nq->nq_error(nq->nq_error_arg, error); } static void __inline nvmf_capsule_received(struct nvmf_qpair *nq, struct nvmf_capsule *nc) { nq->nq_receive(nq->nq_receive_arg, nc); } static void __inline nvmf_complete_io_request(struct nvmf_io_request *io, size_t xfered, int error) { io->io_complete(io->io_complete_arg, xfered, error); } int nvmf_transport_module_handler(struct module *, int, void *); #define NVMF_TRANSPORT(name, ops) \ static moduledata_t nvmf_transport_##name##_mod = { \ "nvmf/" #name, \ nvmf_transport_module_handler, \ &(ops) \ }; \ DECLARE_MODULE(nvmf_transport_##name, nvmf_transport_##name##_mod, \ SI_SUB_DRIVERS, SI_ORDER_ANY); \ MODULE_DEPEND(nvmf_transport_##name, nvmf_transport, 1, 1, 1) #endif /* !__NVMF_TRANSPORT_INTERNAL_H__ */ diff --git a/usr.sbin/nvmfd/ctl.c b/usr.sbin/nvmfd/ctl.c index 5f01ec8e5bc8..73e90e1411bd 100644 --- a/usr.sbin/nvmfd/ctl.c +++ b/usr.sbin/nvmfd/ctl.c @@ -1,139 +1,137 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Chelsio Communications, Inc. 
* Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include #include #include #include "internal.h" static int ctl_fd = -1; static int ctl_port; static void open_ctl(void) { if (ctl_fd > 0) return; ctl_fd = open(CTL_DEFAULT_DEV, O_RDWR); if (ctl_fd == -1 && errno == ENOENT) { if (kldload("ctl") == -1) err(1, "Failed to load ctl.ko"); ctl_fd = open(CTL_DEFAULT_DEV, O_RDWR); } if (ctl_fd == -1) err(1, "Failed to open %s", CTL_DEFAULT_DEV); } void init_ctl_port(const char *subnqn, const struct nvmf_association_params *params) { char result_buf[256]; struct ctl_port_entry entry; struct ctl_req req; nvlist_t *nvl; open_ctl(); nvl = nvlist_create(0); nvlist_add_string(nvl, "subnqn", subnqn); /* XXX: Hardcoded in discovery.c */ nvlist_add_stringf(nvl, "portid", "%u", 1); nvlist_add_stringf(nvl, "max_io_qsize", "%u", params->max_io_qsize); memset(&req, 0, sizeof(req)); strlcpy(req.driver, "nvmf", sizeof(req.driver)); req.reqtype = CTL_REQ_CREATE; req.args = nvlist_pack(nvl, &req.args_len); if (req.args == NULL) errx(1, "Failed to pack nvlist for CTL_PORT/CTL_REQ_CREATE"); req.result = result_buf; req.result_len = sizeof(result_buf); if (ioctl(ctl_fd, CTL_PORT_REQ, &req) != 0) err(1, "ioctl(CTL_PORT/CTL_REQ_CREATE)"); if (req.status == CTL_LUN_ERROR) errx(1, "Failed to create CTL port: %s", req.error_str); if (req.status != CTL_LUN_OK) errx(1, "Failed to create CTL port: %d", req.status); nvlist_destroy(nvl); nvl = nvlist_unpack(result_buf, req.result_len, 0); if (nvl == NULL) errx(1, "Failed to unpack nvlist from CTL_PORT/CTL_REQ_CREATE"); ctl_port = nvlist_get_number(nvl, "port_id"); nvlist_destroy(nvl); memset(&entry, 0, sizeof(entry)); entry.targ_port = ctl_port; if (ioctl(ctl_fd, CTL_ENABLE_PORT, &entry) != 0) errx(1, "ioctl(CTL_ENABLE_PORT)"); } void shutdown_ctl_port(const char *subnqn) { struct ctl_req req; nvlist_t *nvl; open_ctl(); nvl = nvlist_create(0); nvlist_add_string(nvl, "subnqn", subnqn); memset(&req, 0, sizeof(req)); strlcpy(req.driver, "nvmf", sizeof(req.driver)); req.reqtype = CTL_REQ_REMOVE; req.args = nvlist_pack(nvl, &req.args_len); if (req.args == NULL) errx(1, "Failed to pack nvlist for CTL_PORT/CTL_REQ_REMOVE"); if (ioctl(ctl_fd, CTL_PORT_REQ, &req) != 0) err(1, "ioctl(CTL_PORT/CTL_REQ_REMOVE)"); if (req.status == CTL_LUN_ERROR) errx(1, "Failed to remove CTL port: %s", req.error_str); if (req.status != CTL_LUN_OK) errx(1, "Failed to remove CTL port: %d", req.status); nvlist_destroy(nvl); } void ctl_handoff_qpair(struct nvmf_qpair *qp, const struct nvmf_fabric_connect_cmd *cmd, const struct nvmf_fabric_connect_data *data) { struct ctl_nvmf req; int error; memset(&req, 0, sizeof(req)); req.type = CTL_NVMF_HANDOFF; - error = nvmf_handoff_controller_qpair(qp, &req.data.handoff); + error = nvmf_handoff_controller_qpair(qp, cmd, data, &req.data.handoff); if (error != 0) { warnc(error, "Failed to prepare qpair for handoff"); return; } - req.data.handoff.cmd = cmd; - req.data.handoff.data = data; if (ioctl(ctl_fd, CTL_NVMF, &req) != 0) warn("ioctl(CTL_NVMF/CTL_NVMF_HANDOFF)"); }
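
For context, the kernel helpers added in this change replace the fixed struct nvmf_handoff_qpair_params with a packed nvlist carried in struct nvmf_ioc_nv. The sketch below is illustrative only and is not part of the patch: it shows how an ioctl handler might consume such a handoff nvlist using nvmf_unpack_ioc_nvlist(), nvmf_validate_qpair_nvlist(), and nvmf_allocate_qpair() as declared above. The function name, callbacks, and error values chosen here are hypothetical; only the helper APIs and the nvlist keys they check ("admin", "sq_flow_control", "qsize", "sqhd", "sqtail", plus TCP keys such as "fd", "rxpda", and "txpda") come from the diff.

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/nv.h>

#include <dev/nvmf/nvmf_transport.h>

/*
 * Hypothetical sketch (not part of this change): unpack a qpair-handoff
 * nvlist supplied by userland, check the transport-independent keys,
 * and hand the nvlist to a transport.
 */
static int
example_handoff(struct nvmf_ioc_nv *nv, bool controller,
    nvmf_qpair_error_t *error_cb, nvmf_capsule_receive_t *receive_cb,
    void *cb_arg)
{
	struct nvmf_qpair *qp;
	nvlist_t *nvl;
	int error;

	/* Copy in and unpack the packed nvlist described by 'nv'. */
	error = nvmf_unpack_ioc_nvlist(nv, &nvl);
	if (error != 0)
		return (error);

	/* Reject handoffs missing "admin", "qsize", "sqhd", etc. */
	if (!nvmf_validate_qpair_nvlist(nvl, controller)) {
		nvlist_destroy(nvl);
		return (EINVAL);
	}

	/*
	 * The transport pulls its own keys; tcp_allocate_qpair()
	 * returns NULL if "fd", "rxpda", "txpda", and friends are
	 * missing, so a bad nvlist fails cleanly here.
	 */
	qp = nvmf_allocate_qpair(NVMF_TRTYPE_TCP, controller, nvl,
	    error_cb, cb_arg, receive_cb, cb_arg);
	nvlist_destroy(nvl);
	if (qp == NULL)
		return (ENXIO);

	/* ... record 'qp' in the caller's per-device state ... */
	return (0);
}

In the other direction, nvmf_pack_ioc_nvlist() follows a two-pass contract for returning nvlists to userland: if the caller passes nv->size == 0, only the required length is reported in nv->len; otherwise the packed nvlist is copied out to nv->data when it fits, and EFBIG is returned when it does not.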