diff --git a/lib/Makefile b/lib/Makefile --- a/lib/Makefile +++ b/lib/Makefile @@ -78,6 +78,7 @@ libnetbsd \ libnetmap \ libnv \ + libnvmf \ libopenbsd \ libpam \ libpathconv \ diff --git a/lib/libnvmf/Makefile b/lib/libnvmf/Makefile new file mode 100644 --- /dev/null +++ b/lib/libnvmf/Makefile @@ -0,0 +1,22 @@ +.PATH: ${SRCTOP}/sys/dev/nvmf/controller +.PATH: ${SRCTOP}/sys/libkern + +LIB= nvmf +INTERNALLIB= +PACKAGE= nvmf + +INCS= libnvmf.h + +SRCS= gsb_crc32.c \ + nvmf_controller.c \ + nvmf_host.c \ + nvmf_tcp.c \ + nvmf_transport.c \ + nvmft_subr.c + +CFLAGS+= -I${SRCTOP}/sys/dev/nvmf/controller +CFLAGS+= -I${SRCTOP}/sys/dev/nvmf + +.include + +CWARNFLAGS.gsb_crc32.c= -Wno-cast-align diff --git a/lib/libnvmf/internal.h b/lib/libnvmf/internal.h new file mode 100644 --- /dev/null +++ b/lib/libnvmf/internal.h @@ -0,0 +1,116 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#ifndef __LIBNVMF_INTERNAL_H__ +#define __LIBNVMF_INTERNAL_H__ + +#include + +struct nvmf_transport_ops { + /* Association management. */ + struct nvmf_association *(*allocate_association)(bool controller, + const struct nvmf_association_params *params); + void (*update_association)(struct nvmf_association *na, + const struct nvme_controller_data *cdata); + void (*free_association)(struct nvmf_association *na); + + /* Queue pair management. */ + struct nvmf_qpair *(*allocate_qpair)(struct nvmf_association *na, + const struct nvmf_qpair_params *params); + void (*free_qpair)(struct nvmf_qpair *qp); + + /* Create params for kernel handoff. */ + int (*kernel_handoff_params)(struct nvmf_qpair *qp, + struct nvmf_handoff_qpair_params *qparams); + + /* Capsule operations. */ + struct nvmf_capsule *(*allocate_capsule)(struct nvmf_qpair *qp); + void (*free_capsule)(struct nvmf_capsule *nc); + int (*transmit_capsule)(struct nvmf_capsule *nc); + int (*receive_capsule)(struct nvmf_qpair *qp, + struct nvmf_capsule **ncp); + uint8_t (*validate_command_capsule)(const struct nvmf_capsule *nc); + + /* Transferring controller data. */ + size_t (*capsule_data_len)(const struct nvmf_capsule *nc); + int (*receive_controller_data)(const struct nvmf_capsule *nc, + uint32_t data_offset, void *buf, size_t len); + int (*send_controller_data)(const struct nvmf_capsule *nc, + const void *buf, size_t len); +}; + +struct nvmf_association { + struct nvmf_transport_ops *na_ops; + enum nvmf_trtype na_trtype; + bool na_controller; + + struct nvmf_association_params na_params; + + /* Each qpair holds a reference on an association. */ + u_int na_refs; + + char *na_last_error; +}; + +struct nvmf_qpair { + struct nvmf_association *nq_association; + bool nq_admin; + + uint16_t nq_cid; /* host only */ + + /* + * Queue sizes. This assumes the same size for both the + * completion and submission queues within a pair. + */ + u_int nq_qsize; + + /* Flow control management for submission queues. */ + bool nq_flow_control; + uint16_t nq_sqhd; + uint16_t nq_sqtail; /* host only */ + + /* Value in response to/from CONNECT. */ + uint16_t nq_cntlid; + + uint32_t nq_kato; /* valid on admin queue only */ + + TAILQ_HEAD(, nvmf_capsule) nq_rx_capsules; +}; + +struct nvmf_capsule { + struct nvmf_qpair *nc_qpair; + + /* Either a SQE or CQE. */ + union { + struct nvme_command nc_sqe; + struct nvme_completion nc_cqe; + }; + int nc_qe_len; + + /* + * Is SQHD in received capsule valid? False for locally- + * synthesized responses. + */ + bool nc_sqhd_valid; + + /* Data buffer. 
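+	 * Normally filled in by nvmf_capsule_append_data(); nc_send_data
+	 * is that function's 'send' flag and selects between a buffer
+	 * that is transmitted with the capsule and one that receives
+	 * data returned for the command.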
*/ + bool nc_send_data; + void *nc_data; + size_t nc_data_len; + + TAILQ_ENTRY(nvmf_capsule) nc_link; +}; + +extern struct nvmf_transport_ops tcp_ops; + +void na_clear_error(struct nvmf_association *na); +void na_error(struct nvmf_association *na, const char *fmt, ...); + +int nvmf_kernel_handoff_params(struct nvmf_qpair *qp, + struct nvmf_handoff_qpair_params *qparams); + +#endif /* !__LIBNVMF_INTERNAL_H__ */ diff --git a/lib/libnvmf/libnvmf.h b/lib/libnvmf/libnvmf.h new file mode 100644 --- /dev/null +++ b/lib/libnvmf/libnvmf.h @@ -0,0 +1,363 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#ifndef __LIBNVMF_H__ +#define __LIBNVMF_H__ + +#include +#include +#include +#include +#include +#include + +struct nvmf_capsule; +struct nvmf_association; +struct nvmf_qpair; + +/* + * Parameters shared by all queue-pairs of an association. Note that + * this contains the requested values used to initiate transport + * negotiation. + */ +struct nvmf_association_params { + bool sq_flow_control; /* SQ flow control required. */ + bool dynamic_controller_model; /* Controller only */ + uint16_t max_admin_qsize; /* Controller only */ + uint32_t max_io_qsize; /* Controller only, 0 for discovery */ + union { + struct { + uint8_t pda; /* Tx-side PDA. */ + bool header_digests; + bool data_digests; + uint32_t maxr2t; /* Host only */ + uint32_t maxh2cdata; /* Controller only */ + } tcp; + }; +}; + +/* Parameters specific to a single queue pair of an association. */ +struct nvmf_qpair_params { + bool admin; /* Host only */ + union { + struct { + int fd; + } tcp; + }; +}; + +/* Transport-independent APIs. */ + +/* + * A host should allocate a new association for each association with + * a controller. After the admin queue has been allocated and the + * controller's data has been fetched, it should be passed to + * nvmf_update_association to update internal transport-specific + * parameters before allocating I/O queues. + * + * A controller uses a single association to manage all incoming + * queues since it is not known until after parsing the CONNECT + * command which transport queues are admin vs I/O and which + * controller they are created against. + */ +struct nvmf_association *nvmf_allocate_association(enum nvmf_trtype trtype, + bool controller, const struct nvmf_association_params *params); +void nvmf_update_assocation(struct nvmf_association *na, + const struct nvme_controller_data *cdata); +void nvmf_free_association(struct nvmf_association *na); + +/* The most recent association-wide error message. */ +const char *nvmf_association_error(const struct nvmf_association *na); + +/* + * A queue pair represents either an Admin or I/O + * submission/completion queue pair. + * + * Each open qpair holds a reference on its association. Once queue + * pairs are allocated, callers can safely free the association to + * ease bookkeeping. + * + * If nvmf_allocate_qpair fails, a detailed error message can be obtained + * from nvmf_association_error. + */ +struct nvmf_qpair *nvmf_allocate_qpair(struct nvmf_association *na, + const struct nvmf_qpair_params *params); +void nvmf_free_qpair(struct nvmf_qpair *qp); + +/* + * Capsules are either commands (host -> controller) or responses + * (controller -> host). A single data buffer segment may be + * associated with a command capsule. Transmitted data is not copied + * by this API but instead must be preserved until the capsule is + * transmitted and freed. 
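+ *
+ * A minimal host-side sketch of the intended flow (illustrative only;
+ * 'qp', 'sqe', 'buf' and 'buf_len' are assumed to have been set up by
+ * the caller for a command that sends 'buf' to the controller):
+ *
+ *	struct nvmf_capsule *cc, *rc;
+ *
+ *	cc = nvmf_allocate_command(qp, &sqe);
+ *	if (cc == NULL)
+ *		err(1, "nvmf_allocate_command");
+ *	if (nvmf_capsule_append_data(cc, buf, buf_len, true) != 0 ||
+ *	    nvmf_host_transmit_command(cc) != 0 ||
+ *	    nvmf_host_wait_for_response(cc, &rc) != 0)
+ *		errx(1, "command failed");
+ *	... examine nvmf_capsule_cqe(rc) ...
+ *	nvmf_free_capsule(rc);
+ *	nvmf_free_capsule(cc);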
+ */ +struct nvmf_capsule *nvmf_allocate_command(struct nvmf_qpair *qp, + const void *sqe); +struct nvmf_capsule *nvmf_allocate_response(struct nvmf_qpair *qp, + const void *cqe); +void nvmf_free_capsule(struct nvmf_capsule *nc); +int nvmf_capsule_append_data(struct nvmf_capsule *nc, + void *buf, size_t len, bool send); +int nvmf_transmit_capsule(struct nvmf_capsule *nc); +int nvmf_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp); +const void *nvmf_capsule_sqe(const struct nvmf_capsule *nc); +const void *nvmf_capsule_cqe(const struct nvmf_capsule *nc); + +/* Return a string name for a transport type. */ +const char *nvmf_transport_type(uint8_t trtype); + +/* Validate a NVMe Qualified Name. */ +bool nvmf_nqn_valid(const char *nqn); + +/* Controller-specific APIs. */ + +/* + * A controller calls this function to check for any + * transport-specific errors (invalid fields) in a received command + * capsule. The callback returns a generic command status value: + * NVME_SC_SUCCESS if no error is found. + */ +uint8_t nvmf_validate_command_capsule(const struct nvmf_capsule *nc); + +/* + * A controller calls this function to query the amount of data + * associated with a command capsule. + */ +size_t nvmf_capsule_data_len(const struct nvmf_capsule *cc); + +/* + * A controller calls this function to receive data associated with a + * command capsule (e.g. the data for a WRITE command). This can + * either return in-capsule data or fetch data from the host + * (e.g. using a R2T PDU over TCP). The received command capsule + * should be passed in 'nc'. The received data is stored in '*buf'. + */ +int nvmf_receive_controller_data(const struct nvmf_capsule *nc, + uint32_t data_offset, void *buf, size_t len); + +/* + * A controller calls this function to send data in response to a + * command along with a response capsule. If the data transfer + * succeeds, a success response is sent. If the data transfer fails, + * an appropriate error status capsule is sent. Regardless, a + * response capsule is always sent. + */ +int nvmf_send_controller_data(const struct nvmf_capsule *nc, + const void *buf, size_t len); + +/* + * Construct a CQE for a reply to a command capsule in 'nc' with the + * completion status 'status'. This is useful when additional CQE + * info is required beyond the completion status. + */ +void nvmf_init_cqe(void *cqe, const struct nvmf_capsule *nc, + uint16_t status); + +/* + * Construct and send a response capsule to a command capsule with + * the supplied CQE. + */ +int nvmf_send_response(const struct nvmf_capsule *nc, const void *cqe); + +/* + * Wait for a single command capsule and return it in *ncp. This can + * fail if an invalid capsule is received or an I/O error occurs. + */ +int nvmf_controller_receive_capsule(struct nvmf_qpair *qp, + struct nvmf_capsule **ncp); + +/* Send a response capsule from a controller. */ +int nvmf_controller_transmit_response(struct nvmf_capsule *nc); + +/* Construct and send an error response capsule. */ +int nvmf_send_error(const struct nvmf_capsule *cc, uint8_t sc_type, + uint8_t sc_status); + +/* + * Construct and send an error response capsule using a generic status + * code. + */ +int nvmf_send_generic_error(const struct nvmf_capsule *nc, + uint8_t sc_status); + +/* Construct and send a simple success response capsule. */ +int nvmf_send_success(const struct nvmf_capsule *nc); + +/* + * Allocate a new queue pair and wait for the CONNECT command capsule. 
+ * If this fails, a detailed error message can be obtained from + * nvmf_association_error. On success, the command capsule is saved + * in '*ccp' and the connect data is saved in 'data'. The caller + * must send an explicit response and free the the command capsule. + */ +struct nvmf_qpair *nvmf_accept(struct nvmf_association *na, + const struct nvmf_qpair_params *params, struct nvmf_capsule **ccp, + struct nvmf_fabric_connect_data *data); + +/* + * Construct and send a response capsule with the Fabrics CONNECT + * invalid parameters error status. If data is true the offset is + * relative to the CONNECT data structure, otherwise the offset is + * relative to the SQE. + */ +void nvmf_connect_invalid_parameters(const struct nvmf_capsule *cc, + bool data, uint16_t offset); + +/* Construct and send a response capsule for a successful CONNECT. */ +int nvmf_finish_accept(const struct nvmf_capsule *cc, uint16_t cntlid); + +/* Compute the initial state of CAP for a controller. */ +uint64_t nvmf_controller_cap(struct nvmf_qpair *qp); + +/* Generate a serial number string from a host ID. */ +void nvmf_controller_serial(char *buf, size_t len, u_long hostid); + +/* + * Populate an Identify Controller data structure for a Discovery + * controller. + */ +void nvmf_init_discovery_controller_data(struct nvmf_qpair *qp, + struct nvme_controller_data *cdata); + +/* + * Populate an Identify Controller data structure for an I/O + * controller. + */ +void nvmf_init_io_controller_data(struct nvmf_qpair *qp, const char *serial, + const char *subnqn, int nn, uint32_t ioccsz, + struct nvme_controller_data *cdata); + +/* + * Validate if a new value for CC is legal given the existing values of + * CAP and CC. + */ +bool nvmf_validate_cc(struct nvmf_qpair *qp, uint64_t cap, uint32_t old_cc, + uint32_t new_cc); + +/* Return the log page id (LID) of a GET_LOG_PAGE command. */ +uint8_t nvmf_get_log_page_id(const struct nvme_command *cmd); + +/* Return the requested data length of a GET_LOG_PAGE command. */ +uint64_t nvmf_get_log_page_length(const struct nvme_command *cmd); + +/* Return the requested data offset of a GET_LOG_PAGE command. */ +uint64_t nvmf_get_log_page_offset(const struct nvme_command *cmd); + +/* Prepare to handoff a controller qpair. */ +int nvmf_handoff_controller_qpair(struct nvmf_qpair *qp, + struct nvmf_handoff_controller_qpair *h); + +/* Host-specific APIs. */ + +/* + * Connect to an admin or I/O queue. If this fails, a detailed error + * message can be obtained from nvmf_association_error. + */ +struct nvmf_qpair *nvmf_connect(struct nvmf_association *na, + const struct nvmf_qpair_params *params, uint16_t qid, u_int queue_size, + const uint8_t hostid[16], uint16_t cntlid, const char *subnqn, + const char *hostnqn, uint32_t kato); + +/* Return the CNTLID for a queue returned from CONNECT. */ +uint16_t nvmf_cntlid(struct nvmf_qpair *qp); + +/* + * Send a command to the controller. This can fail with EBUSY if the + * submission queue is full. + */ +int nvmf_host_transmit_command(struct nvmf_capsule *nc); + +/* + * Wait for a response to a command. If there are no outstanding + * commands in the SQ, fails with EWOULDBLOCK. + */ +int nvmf_host_receive_response(struct nvmf_qpair *qp, + struct nvmf_capsule **rcp); + +/* + * Wait for a response to a specific command. The command must have been + * succesfully sent previously. + */ +int nvmf_host_wait_for_response(struct nvmf_capsule *cc, + struct nvmf_capsule **rcp); + +/* Build a KeepAlive command. 
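+ * The returned capsule is submitted like any other host command; a
+ * minimal sketch (assuming 'admin_qp' is a connected admin queue
+ * pair; error handling abbreviated):
+ *
+ *	struct nvmf_capsule *cc, *rc;
+ *
+ *	cc = nvmf_keepalive(admin_qp);
+ *	if (cc == NULL)
+ *		err(1, "nvmf_keepalive");
+ *	if (nvmf_host_transmit_command(cc) == 0 &&
+ *	    nvmf_host_wait_for_response(cc, &rc) == 0)
+ *		nvmf_free_capsule(rc);
+ *	nvmf_free_capsule(cc);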
*/ +struct nvmf_capsule *nvmf_keepalive(struct nvmf_qpair *qp); + +/* Read a controller property. */ +int nvmf_read_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, + uint64_t *value); + +/* Write a controller property. */ +int nvmf_write_property(struct nvmf_qpair *qp, uint32_t offset, + uint8_t size, uint64_t value); + +/* Construct a 16-byte HostId from kern.hostuuid. */ +int nvmf_hostid_from_hostuuid(uint8_t hostid[16]); + +/* Construct a NQN from kern.hostuuid. */ +int nvmf_nqn_from_hostuuid(char nqn[NVMF_NQN_MAX_LEN]); + +/* Fetch controller data via IDENTIFY. */ +int nvmf_host_identify_controller(struct nvmf_qpair *qp, + struct nvme_controller_data *data); + +/* Fetch namespace data via IDENTIFY. */ +int nvmf_host_identify_namespace(struct nvmf_qpair *qp, uint32_t nsid, + struct nvme_namespace_data *nsdata); + +/* + * Fetch discovery log page. The memory for the log page is allocated + * by malloc() and returned in *logp. The caller must free the + * memory. + */ +int nvmf_host_fetch_discovery_log_page(struct nvmf_qpair *qp, + struct nvme_discovery_log **logp); + +/* + * Request a desired number of I/O queues via SET_FEATURES. The + * number of actual I/O queues available is returned in *actual on + * success. + */ +int nvmf_host_request_queues(struct nvmf_qpair *qp, u_int requested, + u_int *actual); + +/* + * Handoff active host association to the kernel. This frees the + * qpairs (even on error). + */ +int nvmf_handoff_host(struct nvmf_qpair *admin_qp, u_int num_queues, + struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata); + +/* + * Disconnect an active host association previously handed off to the + * kernel. *name is either the name of the device (nvmeX) for this + * association or the remote subsystem NQN. + */ +int nvmf_disconnect_host(const char *host); + +/* + * Disconnect all active host associations previously handed off to + * the kernel. + */ +int nvmf_disconnect_all(void); + +/* + * Fetch reconnect parameters from an existing kernel host to use for + * establishing a new association. + */ +int nvmf_reconnect_params(int fd, struct nvmf_reconnect_params *rparams); + +/* + * Handoff active host association to an existing host in the kernel. + * This frees the qpairs (even on error). + */ +int nvmf_reconnect_host(int fd, struct nvmf_qpair *admin_qp, + u_int num_queues, struct nvmf_qpair **io_queues, + const struct nvme_controller_data *cdata); + +#endif /* !__LIBNVMF_H__ */ diff --git a/lib/libnvmf/nvmf_controller.c b/lib/libnvmf/nvmf_controller.c new file mode 100644 --- /dev/null +++ b/lib/libnvmf/nvmf_controller.c @@ -0,0 +1,463 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include + +#include "libnvmf.h" +#include "internal.h" +#include "nvmft_subr.h" + +void +nvmf_init_cqe(void *cqe, const struct nvmf_capsule *nc, uint16_t status) +{ + struct nvme_completion *cpl = cqe; + const struct nvme_command *cmd = nvmf_capsule_sqe(nc); + + memset(cpl, 0, sizeof(*cpl)); + cpl->cid = cmd->cid; + cpl->status = htole16(status); +} + +static struct nvmf_capsule * +nvmf_simple_response(const struct nvmf_capsule *nc, uint8_t sc_type, + uint8_t sc_status) +{ + struct nvme_completion cpl; + uint16_t status; + + status = NVMEF(NVME_STATUS_SCT, sc_type) | + NVMEF(NVME_STATUS_SC, sc_status); + nvmf_init_cqe(&cpl, nc, status); + return (nvmf_allocate_response(nc->nc_qpair, &cpl)); +} + +int +nvmf_controller_receive_capsule(struct nvmf_qpair *qp, + struct nvmf_capsule **ncp) +{ + struct nvmf_capsule *nc; + int error; + uint8_t sc_status; + + *ncp = NULL; + error = nvmf_receive_capsule(qp, &nc); + if (error != 0) + return (error); + + sc_status = nvmf_validate_command_capsule(nc); + if (sc_status != NVME_SC_SUCCESS) { + nvmf_send_generic_error(nc, sc_status); + nvmf_free_capsule(nc); + return (EPROTO); + } + + *ncp = nc; + return (0); +} + +int +nvmf_controller_transmit_response(struct nvmf_capsule *nc) +{ + struct nvmf_qpair *qp = nc->nc_qpair; + + /* Set SQHD. */ + if (qp->nq_flow_control) { + qp->nq_sqhd = (qp->nq_sqhd + 1) % qp->nq_qsize; + nc->nc_cqe.sqhd = htole16(qp->nq_sqhd); + } else + nc->nc_cqe.sqhd = 0; + + return (nvmf_transmit_capsule(nc)); +} + +int +nvmf_send_response(const struct nvmf_capsule *cc, const void *cqe) +{ + struct nvmf_capsule *rc; + int error; + + rc = nvmf_allocate_response(cc->nc_qpair, cqe); + if (rc == NULL) + return (ENOMEM); + error = nvmf_controller_transmit_response(rc); + nvmf_free_capsule(rc); + return (error); +} + +int +nvmf_send_error(const struct nvmf_capsule *cc, uint8_t sc_type, + uint8_t sc_status) +{ + struct nvmf_capsule *rc; + int error; + + rc = nvmf_simple_response(cc, sc_type, sc_status); + error = nvmf_controller_transmit_response(rc); + nvmf_free_capsule(rc); + return (error); +} + +int +nvmf_send_generic_error(const struct nvmf_capsule *nc, uint8_t sc_status) +{ + return (nvmf_send_error(nc, NVME_SCT_GENERIC, sc_status)); +} + +int +nvmf_send_success(const struct nvmf_capsule *nc) +{ + return (nvmf_send_generic_error(nc, NVME_SC_SUCCESS)); +} + +void +nvmf_connect_invalid_parameters(const struct nvmf_capsule *cc, bool data, + uint16_t offset) +{ + struct nvmf_fabric_connect_rsp rsp; + struct nvmf_capsule *rc; + + nvmf_init_cqe(&rsp, cc, + NVMEF(NVME_STATUS_SCT, NVME_SCT_COMMAND_SPECIFIC) | + NVMEF(NVME_STATUS_SC, NVMF_FABRIC_SC_INVALID_PARAM)); + rsp.status_code_specific.invalid.ipo = htole16(offset); + rsp.status_code_specific.invalid.iattr = data ? 
1 : 0; + rc = nvmf_allocate_response(cc->nc_qpair, &rsp); + nvmf_transmit_capsule(rc); + nvmf_free_capsule(rc); +} + +struct nvmf_qpair * +nvmf_accept(struct nvmf_association *na, const struct nvmf_qpair_params *params, + struct nvmf_capsule **ccp, struct nvmf_fabric_connect_data *data) +{ + static const char hostid_zero[sizeof(data->hostid)]; + const struct nvmf_fabric_connect_cmd *cmd; + struct nvmf_qpair *qp; + struct nvmf_capsule *cc, *rc; + u_int qsize; + int error; + uint16_t cntlid; + uint8_t sc_status; + + qp = NULL; + cc = NULL; + rc = NULL; + *ccp = NULL; + na_clear_error(na); + if (!na->na_controller) { + na_error(na, "Cannot accept on a host"); + goto error; + } + + qp = nvmf_allocate_qpair(na, params); + if (qp == NULL) + goto error; + + /* Read the CONNECT capsule. */ + error = nvmf_receive_capsule(qp, &cc); + if (error != 0) { + na_error(na, "Failed to receive CONNECT: %s", strerror(error)); + goto error; + } + + sc_status = nvmf_validate_command_capsule(cc); + if (sc_status != 0) { + na_error(na, "CONNECT command failed to validate: %u", + sc_status); + rc = nvmf_simple_response(cc, NVME_SCT_GENERIC, sc_status); + goto error; + } + + cmd = nvmf_capsule_sqe(cc); + if (cmd->opcode != NVME_OPC_FABRICS_COMMANDS || + cmd->fctype != NVMF_FABRIC_COMMAND_CONNECT) { + na_error(na, "Invalid opcode in CONNECT (%u,%u)", cmd->opcode, + cmd->fctype); + rc = nvmf_simple_response(cc, NVME_SCT_GENERIC, + NVME_SC_INVALID_OPCODE); + goto error; + } + + if (cmd->recfmt != htole16(0)) { + na_error(na, "Unsupported CONNECT record format %u", + le16toh(cmd->recfmt)); + rc = nvmf_simple_response(cc, NVME_SCT_COMMAND_SPECIFIC, + NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT); + goto error; + } + + qsize = le16toh(cmd->sqsize) + 1; + if (cmd->qid == 0) { + /* Admin queue limits. */ + if (qsize < NVME_MIN_ADMIN_ENTRIES || + qsize > NVME_MAX_ADMIN_ENTRIES || + qsize > na->na_params.max_admin_qsize) { + na_error(na, "Invalid queue size %u", qsize); + nvmf_connect_invalid_parameters(cc, false, + offsetof(struct nvmf_fabric_connect_cmd, sqsize)); + goto error; + } + qp->nq_admin = true; + } else { + /* I/O queues not allowed for discovery. */ + if (na->na_params.max_io_qsize == 0) { + na_error(na, "I/O queue on discovery controller"); + nvmf_connect_invalid_parameters(cc, false, + offsetof(struct nvmf_fabric_connect_cmd, qid)); + goto error; + } + + /* I/O queue limits. */ + if (qsize < NVME_MIN_IO_ENTRIES || + qsize > NVME_MAX_IO_ENTRIES || + qsize > na->na_params.max_io_qsize) { + na_error(na, "Invalid queue size %u", qsize); + nvmf_connect_invalid_parameters(cc, false, + offsetof(struct nvmf_fabric_connect_cmd, sqsize)); + goto error; + } + + /* KATO is reserved for I/O queues. */ + if (cmd->kato != 0) { + na_error(na, + "KeepAlive timeout specified for I/O queue"); + nvmf_connect_invalid_parameters(cc, false, + offsetof(struct nvmf_fabric_connect_cmd, kato)); + goto error; + } + qp->nq_admin = false; + } + qp->nq_qsize = qsize; + + /* Fetch CONNECT data. */ + if (nvmf_capsule_data_len(cc) != sizeof(*data)) { + na_error(na, "Invalid data payload length for CONNECT: %zu", + nvmf_capsule_data_len(cc)); + nvmf_connect_invalid_parameters(cc, false, + offsetof(struct nvmf_fabric_connect_cmd, sgl1)); + goto error; + } + + error = nvmf_receive_controller_data(cc, 0, data, sizeof(*data)); + if (error != 0) { + na_error(na, "Failed to read data for CONNECT: %s", + strerror(error)); + rc = nvmf_simple_response(cc, NVME_SCT_GENERIC, + NVME_SC_DATA_TRANSFER_ERROR); + goto error; + } + + /* The hostid must be non-zero. 
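+	 * Hosts are expected to supply their 16-byte Host Identifier
+	 * here (nvmf_hostid_from_hostuuid() can derive one from
+	 * kern.hostuuid); an all-zero value is rejected below as an
+	 * invalid CONNECT parameter.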
*/ + if (memcmp(data->hostid, hostid_zero, sizeof(hostid_zero)) == 0) { + na_error(na, "HostID in CONNECT data is zero"); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, hostid)); + goto error; + } + + cntlid = le16toh(data->cntlid); + if (cmd->qid == 0) { + if (na->na_params.dynamic_controller_model) { + if (cntlid != NVMF_CNTLID_DYNAMIC) { + na_error(na, "Invalid controller ID %#x", + cntlid); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, + cntlid)); + goto error; + } + } else { + if (cntlid > NVMF_CNTLID_STATIC_MAX && + cntlid != NVMF_CNTLID_STATIC_ANY) { + na_error(na, "Invalid controller ID %#x", + cntlid); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, + cntlid)); + goto error; + } + } + } else { + /* Wildcard Controller IDs are only valid on an Admin queue. */ + if (cntlid > NVMF_CNTLID_STATIC_MAX) { + na_error(na, "Invalid controller ID %#x", cntlid); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, cntlid)); + goto error; + } + } + + /* Simple validation of each NQN. */ + if (!nvmf_nqn_valid(data->subnqn)) { + na_error(na, "Invalid SubNQN %.*s", (int)sizeof(data->subnqn), + data->subnqn); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, subnqn)); + goto error; + } + if (!nvmf_nqn_valid(data->hostnqn)) { + na_error(na, "Invalid HostNQN %.*s", (int)sizeof(data->hostnqn), + data->hostnqn); + nvmf_connect_invalid_parameters(cc, true, + offsetof(struct nvmf_fabric_connect_data, hostnqn)); + goto error; + } + + if (na->na_params.sq_flow_control || + (cmd->cattr & NVMF_CONNECT_ATTR_DISABLE_SQ_FC) == 0) + qp->nq_flow_control = true; + else + qp->nq_flow_control = false; + qp->nq_sqhd = 0; + qp->nq_kato = le32toh(cmd->kato); + *ccp = cc; + return (qp); +error: + if (rc != NULL) { + nvmf_transmit_capsule(rc); + nvmf_free_capsule(rc); + } + if (cc != NULL) + nvmf_free_capsule(cc); + if (qp != NULL) + nvmf_free_qpair(qp); + return (NULL); +} + +int +nvmf_finish_accept(const struct nvmf_capsule *cc, uint16_t cntlid) +{ + struct nvmf_fabric_connect_rsp rsp; + struct nvmf_qpair *qp = cc->nc_qpair; + struct nvmf_capsule *rc; + int error; + + nvmf_init_cqe(&rsp, cc, 0); + if (qp->nq_flow_control) + rsp.sqhd = htole16(qp->nq_sqhd); + else + rsp.sqhd = htole16(0xffff); + rsp.status_code_specific.success.cntlid = htole16(cntlid); + rc = nvmf_allocate_response(qp, &rsp); + if (rc == NULL) + return (ENOMEM); + error = nvmf_transmit_capsule(rc); + nvmf_free_capsule(rc); + if (error == 0) + qp->nq_cntlid = cntlid; + return (error); +} + +uint64_t +nvmf_controller_cap(struct nvmf_qpair *qp) +{ + const struct nvmf_association *na = qp->nq_association; + + return (_nvmf_controller_cap(na->na_params.max_io_qsize, + NVMF_CC_EN_TIMEOUT)); +} + +bool +nvmf_validate_cc(struct nvmf_qpair *qp, uint64_t cap, uint32_t old_cc, + uint32_t new_cc) +{ + const struct nvmf_association *na = qp->nq_association; + + return (_nvmf_validate_cc(na->na_params.max_io_qsize, cap, old_cc, + new_cc)); +} + +void +nvmf_init_discovery_controller_data(struct nvmf_qpair *qp, + struct nvme_controller_data *cdata) +{ + const struct nvmf_association *na = qp->nq_association; + struct utsname utsname; + char *cp; + + memset(cdata, 0, sizeof(*cdata)); + + /* + * 5.2 Figure 37 states model name and serial are reserved, + * but Linux includes them. Don't bother with serial, but + * do set model name. 
+ */ + uname(&utsname); + nvmf_strpad(cdata->mn, utsname.sysname, sizeof(cdata->mn)); + nvmf_strpad(cdata->fr, utsname.release, sizeof(cdata->fr)); + cp = memchr(cdata->fr, '-', sizeof(cdata->fr)); + if (cp != NULL) + memset(cp, ' ', sizeof(cdata->fr) - (cp - (char *)cdata->fr)); + + cdata->ctrlr_id = htole16(qp->nq_cntlid); + cdata->ver = htole32(NVME_REV(1, 4)); + cdata->cntrltype = 2; + + cdata->lpa = NVMEF(NVME_CTRLR_DATA_LPA_EXT_DATA, 1); + cdata->elpe = 0; + + cdata->maxcmd = htole16(na->na_params.max_admin_qsize); + + /* Transport-specific? */ + cdata->sgls = htole32( + NVMEF(NVME_CTRLR_DATA_SGLS_TRANSPORT_DATA_BLOCK, 1) | + NVMEF(NVME_CTRLR_DATA_SGLS_ADDRESS_AS_OFFSET, 1) | + NVMEF(NVME_CTRLR_DATA_SGLS_NVM_COMMAND_SET, 1)); + + strlcpy(cdata->subnqn, NVMF_DISCOVERY_NQN, sizeof(cdata->subnqn)); +} + +void +nvmf_init_io_controller_data(struct nvmf_qpair *qp, const char *serial, + const char *subnqn, int nn, uint32_t ioccsz, + struct nvme_controller_data *cdata) +{ + const struct nvmf_association *na = qp->nq_association; + struct utsname utsname; + + uname(&utsname); + + _nvmf_init_io_controller_data(qp->nq_cntlid, na->na_params.max_io_qsize, + serial, utsname.sysname, utsname.release, subnqn, nn, ioccsz, + sizeof(struct nvme_completion), cdata); +} + +uint8_t +nvmf_get_log_page_id(const struct nvme_command *cmd) +{ + assert(cmd->opc == NVME_OPC_GET_LOG_PAGE); + return (le32toh(cmd->cdw10) & 0xff); +} + +uint64_t +nvmf_get_log_page_length(const struct nvme_command *cmd) +{ + uint32_t numd; + + assert(cmd->opc == NVME_OPC_GET_LOG_PAGE); + numd = le32toh(cmd->cdw10) >> 16 | (le32toh(cmd->cdw11) & 0xffff) << 16; + return ((numd + 1) * 4); +} + +uint64_t +nvmf_get_log_page_offset(const struct nvme_command *cmd) +{ + assert(cmd->opc == NVME_OPC_GET_LOG_PAGE); + return (le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32); +} + +int +nvmf_handoff_controller_qpair(struct nvmf_qpair *qp, + struct nvmf_handoff_controller_qpair *h) +{ + h->trtype = qp->nq_association->na_trtype; + return (nvmf_kernel_handoff_params(qp, &h->params)); +} diff --git a/lib/libnvmf/nvmf_host.c b/lib/libnvmf/nvmf_host.c new file mode 100644 --- /dev/null +++ b/lib/libnvmf/nvmf_host.c @@ -0,0 +1,911 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnvmf.h" +#include "internal.h" + +static void +nvmf_init_sqe(void *sqe, uint8_t opcode) +{ + struct nvme_command *cmd = sqe; + + memset(cmd, 0, sizeof(*cmd)); + cmd->opc = opcode; +} + +static void +nvmf_init_fabrics_sqe(void *sqe, uint8_t fctype) +{ + struct nvmf_capsule_cmd *cmd = sqe; + + nvmf_init_sqe(sqe, NVME_OPC_FABRICS_COMMANDS); + cmd->fctype = fctype; +} + +struct nvmf_qpair * +nvmf_connect(struct nvmf_association *na, + const struct nvmf_qpair_params *params, uint16_t qid, u_int queue_size, + const uint8_t hostid[16], uint16_t cntlid, const char *subnqn, + const char *hostnqn, uint32_t kato) +{ + struct nvmf_fabric_connect_cmd cmd; + struct nvmf_fabric_connect_data data; + const struct nvmf_fabric_connect_rsp *rsp; + struct nvmf_qpair *qp; + struct nvmf_capsule *cc, *rc; + int error; + uint16_t sqhd, status; + + qp = NULL; + cc = NULL; + rc = NULL; + na_clear_error(na); + if (na->na_controller) { + na_error(na, "Cannot connect on a controller"); + goto error; + } + + if (params->admin != (qid == 0)) { + na_error(na, "Admin queue must use Queue ID 0"); + goto error; + } + + if (qid == 0) { + if (queue_size < NVME_MIN_ADMIN_ENTRIES || + queue_size > NVME_MAX_ADMIN_ENTRIES) { + na_error(na, "Invalid queue size %u", queue_size); + goto error; + } + } else { + if (queue_size < NVME_MIN_IO_ENTRIES || + queue_size > NVME_MAX_IO_ENTRIES) { + na_error(na, "Invalid queue size %u", queue_size); + goto error; + } + + /* KATO is only for Admin queues. */ + if (kato != 0) { + na_error(na, "Cannot set KATO on I/O queues"); + goto error; + } + } + + qp = nvmf_allocate_qpair(na, params); + if (qp == NULL) + goto error; + + nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_CONNECT); + cmd.recfmt = 0; + cmd.qid = htole16(qid); + + /* N.B. sqsize is 0's based. 
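+	 * A value of queue_size - 1 therefore requests queue_size
+	 * entries.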
*/ + cmd.sqsize = htole16(queue_size - 1); + if (!na->na_params.sq_flow_control) + cmd.cattr |= NVMF_CONNECT_ATTR_DISABLE_SQ_FC; + cmd.kato = htole32(kato); + + cc = nvmf_allocate_command(qp, &cmd); + if (cc == NULL) { + na_error(na, "Failed to allocate command capsule: %s", + strerror(errno)); + goto error; + } + + memset(&data, 0, sizeof(data)); + memcpy(data.hostid, hostid, sizeof(data.hostid)); + data.cntlid = htole16(cntlid); + strlcpy(data.subnqn, subnqn, sizeof(data.subnqn)); + strlcpy(data.hostnqn, hostnqn, sizeof(data.hostnqn)); + + error = nvmf_capsule_append_data(cc, &data, sizeof(data), true); + if (error != 0) { + na_error(na, "Failed to append data to CONNECT capsule: %s", + strerror(error)); + goto error; + } + + error = nvmf_transmit_capsule(cc); + if (error != 0) { + na_error(na, "Failed to transmit CONNECT capsule: %s", + strerror(errno)); + goto error; + } + + error = nvmf_receive_capsule(qp, &rc); + if (error != 0) { + na_error(na, "Failed to receive CONNECT response: %s", + strerror(error)); + goto error; + } + + rsp = (const struct nvmf_fabric_connect_rsp *)&rc->nc_cqe; + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + if (NVME_STATUS_GET_SC(status) == NVMF_FABRIC_SC_INVALID_PARAM) + na_error(na, + "CONNECT invalid parameter IATTR: %#x IPO: %#x", + rsp->status_code_specific.invalid.iattr, + rsp->status_code_specific.invalid.ipo); + else + na_error(na, "CONNECT failed, status %#x", status); + goto error; + } + + if (rc->nc_cqe.cid != cmd.cid) { + na_error(na, "Mismatched CID in CONNECT response"); + goto error; + } + + if (!rc->nc_sqhd_valid) { + na_error(na, "CONNECT response without valid SQHD"); + goto error; + } + + sqhd = le16toh(rsp->sqhd); + if (sqhd == 0xffff) { + if (na->na_params.sq_flow_control) { + na_error(na, "Controller disabled SQ flow control"); + goto error; + } + qp->nq_flow_control = false; + } else { + qp->nq_flow_control = true; + qp->nq_sqhd = sqhd; + qp->nq_sqtail = sqhd; + } + + if (rsp->status_code_specific.success.authreq) { + na_error(na, "CONNECT response requests authentication\n"); + goto error; + } + + qp->nq_qsize = queue_size; + qp->nq_cntlid = le16toh(rsp->status_code_specific.success.cntlid); + qp->nq_kato = kato; + /* XXX: Save qid in qp? */ + return (qp); + +error: + if (rc != NULL) + nvmf_free_capsule(rc); + if (cc != NULL) + nvmf_free_capsule(cc); + if (qp != NULL) + nvmf_free_qpair(qp); + return (NULL); +} + +uint16_t +nvmf_cntlid(struct nvmf_qpair *qp) +{ + return (qp->nq_cntlid); +} + +int +nvmf_host_transmit_command(struct nvmf_capsule *nc) +{ + struct nvmf_qpair *qp = nc->nc_qpair; + uint16_t new_sqtail; + int error; + + /* Fail if the queue is full. */ + new_sqtail = (qp->nq_sqtail + 1) % qp->nq_qsize; + if (new_sqtail == qp->nq_sqhd) + return (EBUSY); + + nc->nc_sqe.cid = htole16(qp->nq_cid); + + /* 4.2 Skip CID of 0xFFFF. */ + qp->nq_cid++; + if (qp->nq_cid == 0xFFFF) + qp->nq_cid = 0; + + error = nvmf_transmit_capsule(nc); + if (error != 0) + return (error); + + qp->nq_sqtail = new_sqtail; + return (0); +} + +/* Receive a single capsule and update SQ FC accounting. */ +static int +nvmf_host_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp) +{ + struct nvmf_capsule *nc; + int error; + + /* If the SQ is empty, there is no response to wait for. 
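+	 * (nq_sqhd == nq_sqtail means no commands are outstanding.)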
*/ + if (qp->nq_sqhd == qp->nq_sqtail) + return (EWOULDBLOCK); + + error = nvmf_receive_capsule(qp, &nc); + if (error != 0) + return (error); + + if (qp->nq_flow_control) { + if (nc->nc_sqhd_valid) + qp->nq_sqhd = le16toh(nc->nc_cqe.sqhd); + } else { + /* + * If SQ FC is disabled, just advance the head for + * each response capsule received so that we track the + * number of outstanding commands. + */ + qp->nq_sqhd = (qp->nq_sqhd + 1) % qp->nq_qsize; + } + *ncp = nc; + return (0); +} + +int +nvmf_host_receive_response(struct nvmf_qpair *qp, struct nvmf_capsule **ncp) +{ + struct nvmf_capsule *nc; + + /* Return the oldest previously received response. */ + if (!TAILQ_EMPTY(&qp->nq_rx_capsules)) { + nc = TAILQ_FIRST(&qp->nq_rx_capsules); + TAILQ_REMOVE(&qp->nq_rx_capsules, nc, nc_link); + *ncp = nc; + return (0); + } + + return (nvmf_host_receive_capsule(qp, ncp)); +} + +int +nvmf_host_wait_for_response(struct nvmf_capsule *cc, + struct nvmf_capsule **rcp) +{ + struct nvmf_qpair *qp = cc->nc_qpair; + struct nvmf_capsule *rc; + int error; + + /* Check if a response was already received. */ + TAILQ_FOREACH(rc, &qp->nq_rx_capsules, nc_link) { + if (rc->nc_cqe.cid == cc->nc_sqe.cid) { + TAILQ_REMOVE(&qp->nq_rx_capsules, rc, nc_link); + *rcp = rc; + return (0); + } + } + + /* Wait for a response. */ + for (;;) { + error = nvmf_host_receive_capsule(qp, &rc); + if (error != 0) + return (error); + + if (rc->nc_cqe.cid != cc->nc_sqe.cid) { + TAILQ_INSERT_TAIL(&qp->nq_rx_capsules, rc, nc_link); + continue; + } + + *rcp = rc; + return (0); + } +} + +struct nvmf_capsule * +nvmf_keepalive(struct nvmf_qpair *qp) +{ + struct nvme_command cmd; + + if (!qp->nq_admin) { + errno = EINVAL; + return (NULL); + } + + nvmf_init_sqe(&cmd, NVME_OPC_KEEP_ALIVE); + + return (nvmf_allocate_command(qp, &cmd)); +} + +static struct nvmf_capsule * +nvmf_get_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size) +{ + struct nvmf_fabric_prop_get_cmd cmd; + + nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_PROPERTY_GET); + switch (size) { + case 4: + cmd.attrib.size = NVMF_PROP_SIZE_4; + break; + case 8: + cmd.attrib.size = NVMF_PROP_SIZE_8; + break; + default: + errno = EINVAL; + return (NULL); + } + cmd.ofst = htole32(offset); + + return (nvmf_allocate_command(qp, &cmd)); +} + +int +nvmf_read_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, + uint64_t *value) +{ + struct nvmf_capsule *cc, *rc; + const struct nvmf_fabric_prop_get_rsp *rsp; + uint16_t status; + int error; + + if (!qp->nq_admin) + return (EINVAL); + + cc = nvmf_get_property(qp, offset, size); + if (cc == NULL) + return (errno); + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + rsp = (const struct nvmf_fabric_prop_get_rsp *)&rc->nc_cqe; + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + printf("NVMF: PROPERTY_GET failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + if (size == 8) + *value = le64toh(rsp->value.u64); + else + *value = le32toh(rsp->value.u32.low); + nvmf_free_capsule(rc); + return (0); +} + +static struct nvmf_capsule * +nvmf_set_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, + uint64_t value) +{ + struct nvmf_fabric_prop_set_cmd cmd; + + nvmf_init_fabrics_sqe(&cmd, NVMF_FABRIC_COMMAND_PROPERTY_SET); + switch (size) { + case 4: + cmd.attrib.size = NVMF_PROP_SIZE_4; + cmd.value.u32.low = 
htole32(value); + break; + case 8: + cmd.attrib.size = NVMF_PROP_SIZE_8; + cmd.value.u64 = htole64(value); + break; + default: + errno = EINVAL; + return (NULL); + } + cmd.ofst = htole32(offset); + + return (nvmf_allocate_command(qp, &cmd)); +} + +int +nvmf_write_property(struct nvmf_qpair *qp, uint32_t offset, uint8_t size, + uint64_t value) +{ + struct nvmf_capsule *cc, *rc; + uint16_t status; + int error; + + if (!qp->nq_admin) + return (EINVAL); + + cc = nvmf_set_property(qp, offset, size, value); + if (cc == NULL) + return (errno); + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + printf("NVMF: PROPERTY_SET failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + nvmf_free_capsule(rc); + return (0); +} + +int +nvmf_hostid_from_hostuuid(uint8_t hostid[16]) +{ + char hostuuid_str[64]; + uuid_t hostuuid; + size_t len; + uint32_t status; + + len = sizeof(hostuuid_str); + if (sysctlbyname("kern.hostuuid", hostuuid_str, &len, NULL, 0) != 0) + return (errno); + + uuid_from_string(hostuuid_str, &hostuuid, &status); + switch (status) { + case uuid_s_ok: + break; + case uuid_s_no_memory: + return (ENOMEM); + default: + return (EINVAL); + } + + uuid_enc_le(hostid, &hostuuid); + return (0); +} + +int +nvmf_nqn_from_hostuuid(char nqn[NVMF_NQN_MAX_LEN]) +{ + char hostuuid_str[64]; + size_t len; + + len = sizeof(hostuuid_str); + if (sysctlbyname("kern.hostuuid", hostuuid_str, &len, NULL, 0) != 0) + return (errno); + + strlcpy(nqn, NVMF_NQN_UUID_PRE, NVMF_NQN_MAX_LEN); + strlcat(nqn, hostuuid_str, NVMF_NQN_MAX_LEN); + return (0); +} + +int +nvmf_host_identify_controller(struct nvmf_qpair *qp, + struct nvme_controller_data *cdata) +{ + struct nvme_command cmd; + struct nvmf_capsule *cc, *rc; + int error; + uint16_t status; + + if (!qp->nq_admin) + return (EINVAL); + + nvmf_init_sqe(&cmd, NVME_OPC_IDENTIFY); + + /* 5.15.1 Use CNS of 0x01 for controller data. */ + cmd.cdw10 = htole32(1); + + cc = nvmf_allocate_command(qp, &cmd); + if (cc == NULL) + return (errno); + + error = nvmf_capsule_append_data(cc, cdata, sizeof(*cdata), false); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + printf("NVMF: IDENTIFY failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + nvmf_free_capsule(rc); + return (0); +} + +int +nvmf_host_identify_namespace(struct nvmf_qpair *qp, uint32_t nsid, + struct nvme_namespace_data *nsdata) +{ + struct nvme_command cmd; + struct nvmf_capsule *cc, *rc; + int error; + uint16_t status; + + if (!qp->nq_admin) + return (EINVAL); + + nvmf_init_sqe(&cmd, NVME_OPC_IDENTIFY); + + /* 5.15.1 Use CNS of 0x00 for namespace data. 
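+	 * (CNS 00h returns the Identify Namespace data structure for
+	 * the namespace given in the NSID field.)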
*/ + cmd.cdw10 = htole32(0); + cmd.nsid = htole32(nsid); + + cc = nvmf_allocate_command(qp, &cmd); + if (cc == NULL) + return (errno); + + error = nvmf_capsule_append_data(cc, nsdata, sizeof(*nsdata), false); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + printf("NVMF: IDENTIFY failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + nvmf_free_capsule(rc); + return (0); +} + +static int +nvmf_get_discovery_log_page(struct nvmf_qpair *qp, uint64_t offset, void *buf, + size_t len) +{ + struct nvme_command cmd; + struct nvmf_capsule *cc, *rc; + size_t numd; + int error; + uint16_t status; + + if (len % 4 != 0 || len == 0 || offset % 4 != 0) + return (EINVAL); + + numd = (len / 4) - 1; + nvmf_init_sqe(&cmd, NVME_OPC_GET_LOG_PAGE); + cmd.cdw10 = htole32(numd << 16 | NVME_LOG_DISCOVERY); + cmd.cdw11 = htole32(numd >> 16); + cmd.cdw12 = htole32(offset); + cmd.cdw13 = htole32(offset >> 32); + + cc = nvmf_allocate_command(qp, &cmd); + if (cc == NULL) + return (errno); + + error = nvmf_capsule_append_data(cc, buf, len, false); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + status = le16toh(rc->nc_cqe.status); + if (NVMEV(NVME_STATUS_SC, status) == + NVMF_FABRIC_SC_LOG_RESTART_DISCOVERY) { + nvmf_free_capsule(rc); + return (EAGAIN); + } + if (status != 0) { + printf("NVMF: GET_LOG_PAGE failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + nvmf_free_capsule(rc); + return (0); +} + +int +nvmf_host_fetch_discovery_log_page(struct nvmf_qpair *qp, + struct nvme_discovery_log **logp) +{ + struct nvme_discovery_log hdr, *log; + size_t payload_len; + int error; + + if (!qp->nq_admin) + return (EINVAL); + + log = NULL; + for (;;) { + error = nvmf_get_discovery_log_page(qp, 0, &hdr, sizeof(hdr)); + if (error != 0) + return (error); + nvme_discovery_log_swapbytes(&hdr); + + if (hdr.recfmt != 0) { + printf("NVMF: Unsupported discovery log format: %d\n", + hdr.recfmt); + return (EINVAL); + } + + if (hdr.numrec > 1024) { + printf("NVMF: Too many discovery log entries: %ju\n", + (uintmax_t)hdr.numrec); + return (EFBIG); + } + + payload_len = sizeof(log->entries[0]) * hdr.numrec; + log = reallocf(log, sizeof(*log) + payload_len); + if (log == NULL) + return (ENOMEM); + *log = hdr; + if (hdr.numrec == 0) + break; + + error = nvmf_get_discovery_log_page(qp, sizeof(hdr), + log->entries, payload_len); + if (error == EAGAIN) + continue; + if (error != 0) { + free(log); + return (error); + } + + /* Re-read the header and check the generation count. 
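+		 * If the controller updated the log while it was being
+		 * read, the generation counter will have changed and the
+		 * entries just read may be inconsistent, so restart the
+		 * fetch.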
*/ + error = nvmf_get_discovery_log_page(qp, 0, &hdr, sizeof(hdr)); + if (error != 0) { + free(log); + return (error); + } + nvme_discovery_log_swapbytes(&hdr); + + if (log->genctr != hdr.genctr) + continue; + + for (u_int i = 0; i < log->numrec; i++) + nvme_discovery_log_entry_swapbytes(&log->entries[i]); + break; + } + *logp = log; + return (0); +} + +int +nvmf_host_request_queues(struct nvmf_qpair *qp, u_int requested, u_int *actual) +{ + struct nvme_command cmd; + struct nvmf_capsule *cc, *rc; + int error; + uint16_t status; + + if (!qp->nq_admin || requested < 1 || requested > 65535) + return (EINVAL); + + /* The number of queues is 0's based. */ + requested--; + + nvmf_init_sqe(&cmd, NVME_OPC_SET_FEATURES); + cmd.cdw10 = htole32(NVME_FEAT_NUMBER_OF_QUEUES); + + /* Same number of completion and submission queues. */ + cmd.cdw11 = htole32((requested << 16) | requested); + + cc = nvmf_allocate_command(qp, &cmd); + if (cc == NULL) + return (errno); + + error = nvmf_host_transmit_command(cc); + if (error != 0) { + nvmf_free_capsule(cc); + return (error); + } + + error = nvmf_host_wait_for_response(cc, &rc); + nvmf_free_capsule(cc); + if (error != 0) + return (error); + + status = le16toh(rc->nc_cqe.status); + if (status != 0) { + printf("NVMF: SET_FEATURES failed, status %#x\n", status); + nvmf_free_capsule(rc); + return (EIO); + } + + *actual = (le32toh(rc->nc_cqe.cdw0) & 0xffff) + 1; + nvmf_free_capsule(rc); + return (0); +} + +static bool +is_queue_pair_idle(struct nvmf_qpair *qp) +{ + if (qp->nq_sqhd != qp->nq_sqtail) + return (false); + if (!TAILQ_EMPTY(&qp->nq_rx_capsules)) + return (false); + return (true); +} + +static int +prepare_queues_for_handoff(struct nvmf_handoff_host *hh, + struct nvmf_qpair *admin_qp, u_int num_queues, + struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata) +{ + struct nvmf_handoff_qpair_params *io; + u_int i; + int error; + + memset(hh, 0, sizeof(*hh)); + + /* All queue pairs must be idle. */ + if (!is_queue_pair_idle(admin_qp)) + return (EBUSY); + for (i = 0; i < num_queues; i++) { + if (!is_queue_pair_idle(io_queues[i])) + return (EBUSY); + } + + /* First, the admin queue. */ + hh->trtype = admin_qp->nq_association->na_trtype; + hh->kato = admin_qp->nq_kato; + error = nvmf_kernel_handoff_params(admin_qp, &hh->admin); + if (error) + return (error); + + /* Next, the I/O queues. 
*/ + hh->num_io_queues = num_queues; + io = calloc(num_queues, sizeof(*io)); + for (i = 0; i < num_queues; i++) { + error = nvmf_kernel_handoff_params(io_queues[i], &io[i]); + if (error) { + free(io); + return (error); + } + } + + hh->io = io; + hh->cdata = cdata; + return (0); +} + +int +nvmf_handoff_host(struct nvmf_qpair *admin_qp, u_int num_queues, + struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata) +{ + struct nvmf_handoff_host hh; + u_int i; + int error, fd; + + fd = open("/dev/nvmf", O_RDWR); + if (fd == -1) { + error = errno; + goto out; + } + + error = prepare_queues_for_handoff(&hh, admin_qp, num_queues, io_queues, + cdata); + if (error != 0) + goto out; + + if (ioctl(fd, NVMF_HANDOFF_HOST, &hh) == -1) + error = errno; + free(hh.io); + +out: + if (fd >= 0) + close(fd); + for (i = 0; i < num_queues; i++) + (void)nvmf_free_qpair(io_queues[i]); + (void)nvmf_free_qpair(admin_qp); + return (error); +} + +int +nvmf_disconnect_host(const char *host) +{ + int error, fd; + + error = 0; + fd = open("/dev/nvmf", O_RDWR); + if (fd == -1) { + error = errno; + goto out; + } + + if (ioctl(fd, NVMF_DISCONNECT_HOST, &host) == -1) + error = errno; + +out: + if (fd >= 0) + close(fd); + return (error); +} + +int +nvmf_disconnect_all(void) +{ + int error, fd; + + error = 0; + fd = open("/dev/nvmf", O_RDWR); + if (fd == -1) { + error = errno; + goto out; + } + + if (ioctl(fd, NVMF_DISCONNECT_ALL) == -1) + error = errno; + +out: + if (fd >= 0) + close(fd); + return (error); +} + +int +nvmf_reconnect_params(int fd, struct nvmf_reconnect_params *rparams) +{ + if (ioctl(fd, NVMF_RECONNECT_PARAMS, rparams) == -1) + return (errno); + return (0); +} + +int +nvmf_reconnect_host(int fd, struct nvmf_qpair *admin_qp, u_int num_queues, + struct nvmf_qpair **io_queues, const struct nvme_controller_data *cdata) +{ + struct nvmf_handoff_host hh; + u_int i; + int error; + + error = prepare_queues_for_handoff(&hh, admin_qp, num_queues, io_queues, + cdata); + if (error != 0) + goto out; + + if (ioctl(fd, NVMF_RECONNECT_HOST, &hh) == -1) + error = errno; + free(hh.io); + +out: + for (i = 0; i < num_queues; i++) + (void)nvmf_free_qpair(io_queues[i]); + (void)nvmf_free_qpair(admin_qp); + return (error); +} diff --git a/lib/libnvmf/nvmf_tcp.c b/lib/libnvmf/nvmf_tcp.c new file mode 100644 --- /dev/null +++ b/lib/libnvmf/nvmf_tcp.c @@ -0,0 +1,1474 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022-2024 Chelsio Communications, Inc. 
+ * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnvmf.h" +#include "internal.h" +#include "nvmf_tcp.h" + +struct nvmf_tcp_qpair; + +struct nvmf_tcp_command_buffer { + struct nvmf_tcp_qpair *qp; + + void *data; + size_t data_len; + size_t data_xfered; + uint32_t data_offset; + + uint16_t cid; + uint16_t ttag; + + LIST_ENTRY(nvmf_tcp_command_buffer) link; +}; + +LIST_HEAD(nvmf_tcp_command_buffer_list, nvmf_tcp_command_buffer); + +struct nvmf_tcp_association { + struct nvmf_association na; + + uint32_t ioccsz; +}; + +struct nvmf_tcp_rxpdu { + struct nvme_tcp_common_pdu_hdr *hdr; + uint32_t data_len; +}; + +struct nvmf_tcp_capsule { + struct nvmf_capsule nc; + + struct nvmf_tcp_rxpdu rx_pdu; + struct nvmf_tcp_command_buffer *cb; + + TAILQ_ENTRY(nvmf_tcp_capsule) link; +}; + +struct nvmf_tcp_qpair { + struct nvmf_qpair qp; + int s; + + uint8_t txpda; + uint8_t rxpda; + bool header_digests; + bool data_digests; + uint32_t maxr2t; + uint32_t maxh2cdata; + uint32_t max_icd; /* Host only */ + uint16_t next_ttag; /* Controller only */ + + struct nvmf_tcp_command_buffer_list tx_buffers; + struct nvmf_tcp_command_buffer_list rx_buffers; + TAILQ_HEAD(, nvmf_tcp_capsule) rx_capsules; +}; + +#define TASSOC(nc) ((struct nvmf_tcp_association *)(na)) +#define TCAP(nc) ((struct nvmf_tcp_capsule *)(nc)) +#define CTCAP(nc) ((const struct nvmf_tcp_capsule *)(nc)) +#define TQP(qp) ((struct nvmf_tcp_qpair *)(qp)) + +static const char zero_padding[NVME_TCP_PDU_PDO_MAX_OFFSET]; + +static uint32_t +compute_digest(const void *buf, size_t len) +{ + return (calculate_crc32c(0xffffffff, buf, len) ^ 0xffffffff); +} + +static struct nvmf_tcp_command_buffer * +tcp_alloc_command_buffer(struct nvmf_tcp_qpair *qp, void *data, + uint32_t data_offset, size_t data_len, uint16_t cid, uint16_t ttag, + bool receive) +{ + struct nvmf_tcp_command_buffer *cb; + + cb = malloc(sizeof(*cb)); + cb->qp = qp; + cb->data = data; + cb->data_offset = data_offset; + cb->data_len = data_len; + cb->data_xfered = 0; + cb->cid = cid; + cb->ttag = ttag; + + if (receive) + LIST_INSERT_HEAD(&qp->rx_buffers, cb, link); + else + LIST_INSERT_HEAD(&qp->tx_buffers, cb, link); + return (cb); +} + +static struct nvmf_tcp_command_buffer * +tcp_find_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, + bool receive) +{ + struct nvmf_tcp_command_buffer_list *list; + struct nvmf_tcp_command_buffer *cb; + + list = receive ? 
&qp->rx_buffers : &qp->tx_buffers; + LIST_FOREACH(cb, list, link) { + if (cb->cid == cid && cb->ttag == ttag) + return (cb); + } + return (NULL); +} + +static void +tcp_purge_command_buffer(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, + bool receive) +{ + struct nvmf_tcp_command_buffer *cb; + + cb = tcp_find_command_buffer(qp, cid, ttag, receive); + if (cb != NULL) + LIST_REMOVE(cb, link); +} + +static void +tcp_free_command_buffer(struct nvmf_tcp_command_buffer *cb) +{ + LIST_REMOVE(cb, link); + free(cb); +} + +static int +nvmf_tcp_write_pdu(struct nvmf_tcp_qpair *qp, const void *pdu, size_t len) +{ + ssize_t nwritten; + const char *cp; + + cp = pdu; + while (len != 0) { + nwritten = write(qp->s, cp, len); + if (nwritten < 0) + return (errno); + len -= nwritten; + cp += nwritten; + } + return (0); +} + +static int +nvmf_tcp_write_pdu_iov(struct nvmf_tcp_qpair *qp, struct iovec *iov, + u_int iovcnt, size_t len) +{ + ssize_t nwritten; + + for (;;) { + nwritten = writev(qp->s, iov, iovcnt); + if (nwritten < 0) + return (errno); + + len -= nwritten; + if (len == 0) + return (0); + + while (iov->iov_len <= (size_t)nwritten) { + nwritten -= iov->iov_len; + iovcnt--; + iov++; + } + + iov->iov_base = (char *)iov->iov_base + nwritten; + iov->iov_len -= nwritten; + } +} + +static void +nvmf_tcp_report_error(struct nvmf_association *na, struct nvmf_tcp_qpair *qp, + uint16_t fes, uint32_t fei, const void *rx_pdu, size_t pdu_len, u_int hlen) +{ + struct nvme_tcp_term_req_hdr hdr; + struct iovec iov[2]; + + if (hlen != 0) { + if (hlen > NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE) + hlen = NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE; + if (hlen > pdu_len) + hlen = pdu_len; + } + + memset(&hdr, 0, sizeof(hdr)); + hdr.common.pdu_type = na->na_controller ? + NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ; + hdr.common.hlen = sizeof(hdr); + hdr.common.plen = sizeof(hdr) + hlen; + hdr.fes = htole16(fes); + le32enc(hdr.fei, fei); + iov[0].iov_base = &hdr; + iov[0].iov_len = sizeof(hdr); + iov[1].iov_base = __DECONST(void *, rx_pdu); + iov[1].iov_len = hlen; + + (void)nvmf_tcp_write_pdu_iov(qp, iov, nitems(iov), sizeof(hdr) + hlen); + close(qp->s); + qp->s = -1; +} + +static int +nvmf_tcp_validate_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu, + size_t pdu_len) +{ + const struct nvme_tcp_common_pdu_hdr *ch; + uint32_t data_len, fei, plen; + uint32_t digest, rx_digest; + u_int hlen; + int error; + uint16_t fes; + + /* Determine how large of a PDU header to return for errors. */ + ch = pdu->hdr; + hlen = ch->hlen; + plen = le32toh(ch->plen); + if (hlen < sizeof(*ch) || hlen > plen) + hlen = sizeof(*ch); + + error = nvmf_tcp_validate_pdu_header(ch, + qp->qp.nq_association->na_controller, qp->header_digests, + qp->data_digests, qp->rxpda, &data_len, &fes, &fei); + if (error != 0) { + if (error == ECONNRESET) { + close(qp->s); + qp->s = -1; + } else { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + fes, fei, ch, pdu_len, hlen); + } + return (error); + } + + /* Check header digest if present. */ + if ((ch->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0) { + digest = compute_digest(ch, ch->hlen); + memcpy(&rx_digest, (const char *)ch + ch->hlen, + sizeof(rx_digest)); + if (digest != rx_digest) { + printf("NVMe/TCP: Header digest mismatch\n"); + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, ch, + pdu_len, hlen); + return (EBADMSG); + } + } + + /* Check data digest if present. 
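+	 * The digest covers the PDU data only; on a mismatch the PDU
+	 * is rejected with EBADMSG (no termination request is sent
+	 * here).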
*/ + if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) { + digest = compute_digest((const char *)ch + ch->pdo, data_len); + memcpy(&rx_digest, (const char *)ch + plen - sizeof(rx_digest), + sizeof(rx_digest)); + if (digest != rx_digest) { + printf("NVMe/TCP: Data digest mismatch\n"); + return (EBADMSG); + } + } + + pdu->data_len = data_len; + return (0); +} + +/* + * Read data from a socket, retrying until the data has been fully + * read or an error occurs. + */ +static int +nvmf_tcp_read_buffer(int s, void *buf, size_t len) +{ + ssize_t nread; + char *cp; + + cp = buf; + while (len != 0) { + nread = read(s, cp, len); + if (nread < 0) + return (errno); + if (nread == 0) + return (ECONNRESET); + len -= nread; + cp += nread; + } + return (0); +} + +static int +nvmf_tcp_read_pdu(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) +{ + struct nvme_tcp_common_pdu_hdr ch; + uint32_t plen; + int error; + + memset(pdu, 0, sizeof(*pdu)); + error = nvmf_tcp_read_buffer(qp->s, &ch, sizeof(ch)); + if (error != 0) + return (error); + + plen = le32toh(ch.plen); + + /* + * Validate a header with garbage lengths to trigger + * an error message without reading more. + */ + if (plen < sizeof(ch) || ch.hlen > plen) { + pdu->hdr = &ch; + error = nvmf_tcp_validate_pdu(qp, pdu, sizeof(ch)); + pdu->hdr = NULL; + assert(error != 0); + return (error); + } + + /* Read the rest of the PDU. */ + pdu->hdr = malloc(plen); + memcpy(pdu->hdr, &ch, sizeof(ch)); + error = nvmf_tcp_read_buffer(qp->s, pdu->hdr + 1, plen - sizeof(ch)); + if (error != 0) + return (error); + error = nvmf_tcp_validate_pdu(qp, pdu, plen); + if (error != 0) { + free(pdu->hdr); + pdu->hdr = NULL; + } + return (error); +} + +static void +nvmf_tcp_free_pdu(struct nvmf_tcp_rxpdu *pdu) +{ + free(pdu->hdr); + pdu->hdr = NULL; +} + +static int +nvmf_tcp_handle_term_req(struct nvmf_tcp_rxpdu *pdu) +{ + struct nvme_tcp_term_req_hdr *hdr; + + hdr = (void *)pdu->hdr; + + printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n", + le16toh(hdr->fes), le32dec(hdr->fei)); + nvmf_tcp_free_pdu(pdu); + return (ECONNRESET); +} + +static int +nvmf_tcp_save_command_capsule(struct nvmf_tcp_qpair *qp, + struct nvmf_tcp_rxpdu *pdu) +{ + struct nvme_tcp_cmd *cmd; + struct nvmf_capsule *nc; + struct nvmf_tcp_capsule *tc; + + cmd = (void *)pdu->hdr; + + nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe); + if (nc == NULL) + return (ENOMEM); + + tc = TCAP(nc); + tc->rx_pdu = *pdu; + + TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link); + return (0); +} + +static int +nvmf_tcp_save_response_capsule(struct nvmf_tcp_qpair *qp, + struct nvmf_tcp_rxpdu *pdu) +{ + struct nvme_tcp_rsp *rsp; + struct nvmf_capsule *nc; + struct nvmf_tcp_capsule *tc; + + rsp = (void *)pdu->hdr; + + nc = nvmf_allocate_response(&qp->qp, &rsp->rccqe); + if (nc == NULL) + return (ENOMEM); + + nc->nc_sqhd_valid = true; + tc = TCAP(nc); + tc->rx_pdu = *pdu; + + TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link); + + /* + * Once the CQE has been received, no further transfers to the + * command buffer for the associated CID can occur. + */ + tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, true); + tcp_purge_command_buffer(qp, rsp->rccqe.cid, 0, false); + + return (0); +} + +/* + * Construct and send a PDU that contains an optional data payload. + * This includes dealing with digests and the length fields in the + * common header. 
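+ *
+ * The transmitted PDU is laid out as
+ *
+ *	CH + PSH | HDGST (optional) | PAD up to PDO | DATA | DDGST (optional)
+ *
+ * where the pad aligns the data to the peer's PDA (txpda) and the
+ * digests are included only if they were negotiated for this queue
+ * pair.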
+ */ +static int +nvmf_tcp_construct_pdu(struct nvmf_tcp_qpair *qp, void *hdr, size_t hlen, + void *data, uint32_t data_len) +{ + struct nvme_tcp_common_pdu_hdr *ch; + struct iovec iov[5]; + u_int iovcnt; + uint32_t header_digest, data_digest, pad, pdo, plen; + + plen = hlen; + if (qp->header_digests) + plen += sizeof(header_digest); + if (data_len != 0) { + pdo = roundup2(plen, qp->txpda); + pad = pdo - plen; + plen = pdo + data_len; + if (qp->data_digests) + plen += sizeof(data_digest); + } else { + assert(data == NULL); + pdo = 0; + pad = 0; + } + + ch = hdr; + ch->hlen = hlen; + if (qp->header_digests) + ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF; + if (qp->data_digests && data_len != 0) + ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF; + ch->pdo = pdo; + ch->plen = htole32(plen); + + /* CH + PSH */ + iov[0].iov_base = hdr; + iov[0].iov_len = hlen; + iovcnt = 1; + + /* HDGST */ + if (qp->header_digests) { + header_digest = compute_digest(hdr, hlen); + iov[iovcnt].iov_base = &header_digest; + iov[iovcnt].iov_len = sizeof(header_digest); + iovcnt++; + } + + if (pad != 0) { + /* PAD */ + iov[iovcnt].iov_base = __DECONST(char *, zero_padding); + iov[iovcnt].iov_len = pad; + iovcnt++; + } + + if (data_len != 0) { + /* DATA */ + iov[iovcnt].iov_base = data; + iov[iovcnt].iov_len = data_len; + iovcnt++; + + /* DDGST */ + if (qp->data_digests) { + data_digest = compute_digest(data, data_len); + iov[iovcnt].iov_base = &data_digest; + iov[iovcnt].iov_len = sizeof(data_digest); + iovcnt++; + } + } + + return (nvmf_tcp_write_pdu_iov(qp, iov, iovcnt, plen)); +} + +static int +nvmf_tcp_handle_h2c_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) +{ + struct nvme_tcp_h2c_data_hdr *h2c; + struct nvmf_tcp_command_buffer *cb; + uint32_t data_len, data_offset; + const char *icd; + + h2c = (void *)pdu->hdr; + if (le32toh(h2c->datal) > qp->maxh2cdata) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0, + pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + cb = tcp_find_command_buffer(qp, h2c->cccid, h2c->ttag, true); + if (cb == NULL) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + data_len = le32toh(h2c->datal); + if (data_len != pdu->data_len) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + data_offset = le32toh(h2c->datao); + if (data_offset < cb->data_offset || + data_offset + data_len > cb->data_offset + cb->data_len) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, + pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + if (data_offset != cb->data_offset + cb->data_xfered) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + if ((cb->data_xfered + data_len == cb->data_len) != + ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + 
NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + cb->data_xfered += data_len; + data_offset -= cb->data_offset; + icd = (const char *)pdu->hdr + pdu->hdr->pdo; + memcpy((char *)cb->data + data_offset, icd, data_len); + + nvmf_tcp_free_pdu(pdu); + return (0); +} + +static int +nvmf_tcp_handle_c2h_data(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) +{ + struct nvme_tcp_c2h_data_hdr *c2h; + struct nvmf_tcp_command_buffer *cb; + uint32_t data_len, data_offset; + const char *icd; + + c2h = (void *)pdu->hdr; + + cb = tcp_find_command_buffer(qp, c2h->cccid, 0, true); + if (cb == NULL) { + /* + * XXX: Could be PDU sequence error if cccid is for a + * command that doesn't use a command buffer. + */ + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + data_len = le32toh(c2h->datal); + if (data_len != pdu->data_len) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + data_offset = le32toh(c2h->datao); + if (data_offset < cb->data_offset || + data_offset + data_len > cb->data_offset + cb->data_len) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, + pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + if (data_offset != cb->data_offset + cb->data_xfered) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + if ((cb->data_xfered + data_len == cb->data_len) != + ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + cb->data_xfered += data_len; + data_offset -= cb->data_offset; + icd = (const char *)pdu->hdr + pdu->hdr->pdo; + memcpy((char *)cb->data + data_offset, icd, data_len); + + if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { + struct nvme_completion cqe; + struct nvmf_tcp_capsule *tc; + struct nvmf_capsule *nc; + + memset(&cqe, 0, sizeof(cqe)); + cqe.cid = cb->cid; + + nc = nvmf_allocate_response(&qp->qp, &cqe); + if (nc == NULL) { + nvmf_tcp_free_pdu(pdu); + return (ENOMEM); + } + nc->nc_sqhd_valid = false; + + tc = TCAP(nc); + TAILQ_INSERT_TAIL(&qp->rx_capsules, tc, link); + } + + nvmf_tcp_free_pdu(pdu); + return (0); +} + +/* NB: cid and ttag and little-endian already. 
*/ +static int +tcp_send_h2c_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, + uint32_t data_offset, void *buf, size_t len, bool last_pdu) +{ + struct nvme_tcp_h2c_data_hdr h2c; + + memset(&h2c, 0, sizeof(h2c)); + h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA; + if (last_pdu) + h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; + h2c.cccid = cid; + h2c.ttag = ttag; + h2c.datao = htole32(data_offset); + h2c.datal = htole32(len); + + return (nvmf_tcp_construct_pdu(qp, &h2c, sizeof(h2c), buf, len)); +} + +/* Sends one or more H2C_DATA PDUs, subject to MAXH2CDATA. */ +static int +tcp_send_h2c_pdus(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, + uint32_t data_offset, void *buf, size_t len, bool last_pdu) +{ + char *p; + + p = buf; + while (len != 0) { + size_t todo; + int error; + + todo = len; + if (todo > qp->maxh2cdata) + todo = qp->maxh2cdata; + error = tcp_send_h2c_pdu(qp, cid, ttag, data_offset, p, todo, + last_pdu && todo == len); + if (error != 0) + return (error); + p += todo; + len -= todo; + } + return (0); +} + +static int +nvmf_tcp_handle_r2t(struct nvmf_tcp_qpair *qp, struct nvmf_tcp_rxpdu *pdu) +{ + struct nvmf_tcp_command_buffer *cb; + struct nvme_tcp_r2t_hdr *r2t; + uint32_t data_len, data_offset; + int error; + + r2t = (void *)pdu->hdr; + + cb = tcp_find_command_buffer(qp, r2t->cccid, 0, false); + if (cb == NULL) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, + offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + data_offset = le32toh(r2t->r2to); + if (data_offset != cb->data_xfered) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->hdr, + le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + /* + * XXX: The spec does not specify how to handle R2T transfers + * out of range of the original command. + */ + data_len = le32toh(r2t->r2tl); + if (data_offset + data_len > cb->data_len) { + nvmf_tcp_report_error(qp->qp.nq_association, qp, + NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, + pdu->hdr, le32toh(pdu->hdr->plen), pdu->hdr->hlen); + nvmf_tcp_free_pdu(pdu); + return (EBADMSG); + } + + cb->data_xfered += data_len; + + /* + * Write out one or more H2C_DATA PDUs containing the + * requested data.
+ */ + error = tcp_send_h2c_pdus(qp, r2t->cccid, r2t->ttag, + data_offset, (char *)cb->data + data_offset, data_len, true); + + nvmf_tcp_free_pdu(pdu); + return (error); +} + +static int +nvmf_tcp_receive_pdu(struct nvmf_tcp_qpair *qp) +{ + struct nvmf_tcp_rxpdu pdu; + int error; + + error = nvmf_tcp_read_pdu(qp, &pdu); + if (error != 0) + return (error); + + switch (pdu.hdr->pdu_type) { + default: + __unreachable(); + break; + case NVME_TCP_PDU_TYPE_H2C_TERM_REQ: + case NVME_TCP_PDU_TYPE_C2H_TERM_REQ: + return (nvmf_tcp_handle_term_req(&pdu)); + case NVME_TCP_PDU_TYPE_CAPSULE_CMD: + return (nvmf_tcp_save_command_capsule(qp, &pdu)); + case NVME_TCP_PDU_TYPE_CAPSULE_RESP: + return (nvmf_tcp_save_response_capsule(qp, &pdu)); + case NVME_TCP_PDU_TYPE_H2C_DATA: + return (nvmf_tcp_handle_h2c_data(qp, &pdu)); + case NVME_TCP_PDU_TYPE_C2H_DATA: + return (nvmf_tcp_handle_c2h_data(qp, &pdu)); + case NVME_TCP_PDU_TYPE_R2T: + return (nvmf_tcp_handle_r2t(qp, &pdu)); + } +} + +static bool +nvmf_tcp_validate_ic_pdu(struct nvmf_association *na, struct nvmf_tcp_qpair *qp, + const struct nvme_tcp_common_pdu_hdr *ch, size_t pdu_len) +{ + const struct nvme_tcp_ic_req *pdu; + uint32_t plen; + u_int hlen; + + /* Determine how large of a PDU header to return for errors. */ + hlen = ch->hlen; + plen = le32toh(ch->plen); + if (hlen < sizeof(*ch) || hlen > plen) + hlen = sizeof(*ch); + + /* + * Errors must be reported for the lowest incorrect field + * first, so validate fields in order. + */ + + /* Validate pdu_type. */ + + /* Controllers only receive PDUs with a PDU direction of 0. */ + if (na->na_controller != (ch->pdu_type & 0x01) == 0) { + na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len, + hlen); + return (false); + } + + switch (ch->pdu_type) { + case NVME_TCP_PDU_TYPE_IC_REQ: + case NVME_TCP_PDU_TYPE_IC_RESP: + break; + default: + na_error(na, "NVMe/TCP: Invalid PDU type %u", ch->pdu_type); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 0, ch, pdu_len, + hlen); + return (false); + } + + /* Validate flags. */ + if (ch->flags != 0) { + na_error(na, "NVMe/TCP: Invalid PDU header flags %#x", + ch->flags); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 1, ch, pdu_len, + hlen); + return (false); + } + + /* Validate hlen. */ + if (ch->hlen != 128) { + na_error(na, "NVMe/TCP: Invalid PDU header length %u", + ch->hlen); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 2, ch, pdu_len, + hlen); + return (false); + } + + /* Validate pdo. */ + if (ch->pdo != 0) { + na_error(na, "NVMe/TCP: Invalid PDU data offset %u", ch->pdo); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 3, ch, pdu_len, + hlen); + return (false); + } + + /* Validate plen. */ + if (plen != 128) { + na_error(na, "NVMe/TCP: Invalid PDU length %u", plen); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 4, ch, pdu_len, + hlen); + return (false); + } + + /* Validate fields common to both ICReq and ICResp. 
*/ + pdu = (const struct nvme_tcp_ic_req *)ch; + if (le16toh(pdu->pfv) != 0) { + na_error(na, "NVMe/TCP: Unsupported PDU version %u", + le16toh(pdu->pfv)); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER, + 8, ch, pdu_len, hlen); + return (false); + } + + if (pdu->hpda > NVME_TCP_HPDA_MAX) { + na_error(na, "NVMe/TCP: Unsupported PDA %u", pdu->hpda); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 10, ch, pdu_len, + hlen); + return (false); + } + + if (pdu->dgst.bits.reserved != 0) { + na_error(na, "NVMe/TCP: Invalid digest settings"); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 11, ch, pdu_len, + hlen); + return (false); + } + + return (true); +} + +static bool +nvmf_tcp_read_ic_req(struct nvmf_association *na, struct nvmf_tcp_qpair *qp, + struct nvme_tcp_ic_req *pdu) +{ + int error; + + error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu)); + if (error != 0) { + na_error(na, "NVMe/TCP: Failed to read IC request: %s", + strerror(error)); + return (false); + } + + return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu))); +} + +static bool +nvmf_tcp_read_ic_resp(struct nvmf_association *na, struct nvmf_tcp_qpair *qp, + struct nvme_tcp_ic_resp *pdu) +{ + int error; + + error = nvmf_tcp_read_buffer(qp->s, pdu, sizeof(*pdu)); + if (error != 0) { + na_error(na, "NVMe/TCP: Failed to read IC response: %s", + strerror(error)); + return (false); + } + + return (nvmf_tcp_validate_ic_pdu(na, qp, &pdu->common, sizeof(*pdu))); +} + +static struct nvmf_association * +tcp_allocate_association(bool controller __unused, + const struct nvmf_association_params *params __unused) +{ + struct nvmf_tcp_association *ta; + + ta = calloc(1, sizeof(*ta)); + + return (&ta->na); +} + +static void +tcp_update_association(struct nvmf_association *na, + const struct nvme_controller_data *cdata) +{ + struct nvmf_tcp_association *ta = TASSOC(na); + + ta->ioccsz = le32toh(cdata->ioccsz); +} + +static void +tcp_free_association(struct nvmf_association *na) +{ + free(na); +} + +static bool +tcp_connect(struct nvmf_tcp_qpair *qp, struct nvmf_association *na, bool admin) +{ + const struct nvmf_association_params *params = &na->na_params; + struct nvmf_tcp_association *ta = TASSOC(na); + struct nvme_tcp_ic_req ic_req; + struct nvme_tcp_ic_resp ic_resp; + int error; + + if (!admin) { + if (ta->ioccsz == 0) { + na_error(na, "TCP I/O queues require cdata"); + return (false); + } + if (ta->ioccsz < 4) { + na_error(na, "Invalid IOCCSZ %u", ta->ioccsz); + return (false); + } + } + + memset(&ic_req, 0, sizeof(ic_req)); + ic_req.common.pdu_type = NVME_TCP_PDU_TYPE_IC_REQ; + ic_req.common.hlen = sizeof(ic_req); + ic_req.common.plen = htole32(sizeof(ic_req)); + ic_req.pfv = htole16(0); + ic_req.hpda = params->tcp.pda; + if (params->tcp.header_digests) + ic_req.dgst.bits.hdgst_enable = 1; + if (params->tcp.data_digests) + ic_req.dgst.bits.ddgst_enable = 1; + ic_req.maxr2t = htole32(params->tcp.maxr2t); + + error = nvmf_tcp_write_pdu(qp, &ic_req, sizeof(ic_req)); + if (error != 0) { + na_error(na, "Failed to write IC request: %s", strerror(error)); + return (false); + } + + if (!nvmf_tcp_read_ic_resp(na, qp, &ic_resp)) + return (false); + + /* Ensure the controller didn't enable digests we didn't request. 
*/ + if ((!params->tcp.header_digests && + ic_resp.dgst.bits.hdgst_enable != 0) || + (!params->tcp.data_digests && + ic_resp.dgst.bits.ddgst_enable != 0)) { + na_error(na, "Controller enabled unrequested digests"); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER, + 11, &ic_resp, sizeof(ic_resp), sizeof(ic_resp)); + return (false); + } + + /* + * XXX: Is there an upper-bound to enforce here? Perhaps pick + * some large value and report larger values as an unsupported + * parameter? + */ + if (le32toh(ic_resp.maxh2cdata) < 4096) { + na_error(na, "Invalid MAXH2CDATA %u", + le32toh(ic_resp.maxh2cdata)); + nvmf_tcp_report_error(na, qp, + NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, 12, &ic_resp, + sizeof(ic_resp), sizeof(ic_resp)); + return (false); + } + + qp->txpda = (params->tcp.pda + 1) * 4; + qp->rxpda = (ic_resp.cpda + 1) * 4; + qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0; + qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0; + qp->maxr2t = params->tcp.maxr2t; + qp->maxh2cdata = le32toh(ic_resp.maxh2cdata); + if (admin) + /* 7.4.3 */ + qp->max_icd = 8192; + else + qp->max_icd = (ta->ioccsz - 4) * 16; + + return (true); +} + +static bool +tcp_accept(struct nvmf_tcp_qpair *qp, struct nvmf_association *na) +{ + const struct nvmf_association_params *params = &na->na_params; + struct nvme_tcp_ic_req ic_req; + struct nvme_tcp_ic_resp ic_resp; + int error; + + if (!nvmf_tcp_read_ic_req(na, qp, &ic_req)) + return (false); + + memset(&ic_resp, 0, sizeof(ic_resp)); + ic_resp.common.pdu_type = NVME_TCP_PDU_TYPE_IC_RESP; + ic_resp.common.hlen = sizeof(ic_resp); + ic_resp.common.plen = htole32(sizeof(ic_resp)); + ic_resp.pfv = htole16(0); + ic_resp.cpda = params->tcp.pda; + if (params->tcp.header_digests && ic_req.dgst.bits.hdgst_enable != 0) + ic_resp.dgst.bits.hdgst_enable = 1; + if (params->tcp.data_digests && ic_req.dgst.bits.ddgst_enable != 0) + ic_resp.dgst.bits.ddgst_enable = 1; + ic_resp.maxh2cdata = htole32(params->tcp.maxh2cdata); + + error = nvmf_tcp_write_pdu(qp, &ic_resp, sizeof(ic_resp)); + if (error != 0) { + na_error(na, "Failed to write IC response: %s", + strerror(error)); + return (false); + } + + qp->txpda = (params->tcp.pda + 1) * 4; + qp->rxpda = (ic_req.hpda + 1) * 4; + qp->header_digests = ic_resp.dgst.bits.hdgst_enable != 0; + qp->data_digests = ic_resp.dgst.bits.ddgst_enable != 0; + qp->maxr2t = le32toh(ic_req.maxr2t); + qp->maxh2cdata = params->tcp.maxh2cdata; + qp->max_icd = 0; /* XXX */ + return (true); +} + +static struct nvmf_qpair * +tcp_allocate_qpair(struct nvmf_association *na, + const struct nvmf_qpair_params *qparams) +{ + const struct nvmf_association_params *aparams = &na->na_params; + struct nvmf_tcp_qpair *qp; + bool connected; + + if (aparams->tcp.pda > NVME_TCP_CPDA_MAX) { + na_error(na, "Invalid PDA"); + return (NULL); + } + + qp = calloc(1, sizeof(*qp)); + qp->s = qparams->tcp.fd; + LIST_INIT(&qp->rx_buffers); + LIST_INIT(&qp->tx_buffers); + TAILQ_INIT(&qp->rx_capsules); + if (na->na_controller) + connected = tcp_accept(qp, na); + else + connected = tcp_connect(qp, na, qparams->admin); + if (!connected) { + free(qp); + return (NULL); + } + + return (&qp->qp); +} + +static void +tcp_free_qpair(struct nvmf_qpair *nq) +{ + struct nvmf_tcp_qpair *qp = TQP(nq); + struct nvmf_tcp_capsule *ntc, *tc; + struct nvmf_tcp_command_buffer *ncb, *cb; + + TAILQ_FOREACH_SAFE(tc, &qp->rx_capsules, link, ntc) { + TAILQ_REMOVE(&qp->rx_capsules, tc, link); + nvmf_free_capsule(&tc->nc); + } + LIST_FOREACH_SAFE(cb, &qp->rx_buffers, link, ncb)
{ + tcp_free_command_buffer(cb); + } + LIST_FOREACH_SAFE(cb, &qp->tx_buffers, link, ncb) { + tcp_free_command_buffer(cb); + } + free(qp); +} + +static int +tcp_kernel_handoff_params(struct nvmf_qpair *nq, + struct nvmf_handoff_qpair_params *qparams) +{ + struct nvmf_tcp_qpair *qp = TQP(nq); + + qparams->tcp.fd = qp->s; + qparams->tcp.rxpda = qp->rxpda; + qparams->tcp.txpda = qp->txpda; + qparams->tcp.header_digests = qp->header_digests; + qparams->tcp.data_digests = qp->data_digests; + qparams->tcp.maxr2t = qp->maxr2t; + qparams->tcp.maxh2cdata = qp->maxh2cdata; + qparams->tcp.max_icd = qp->max_icd; + + return (0); +} + +static struct nvmf_capsule * +tcp_allocate_capsule(struct nvmf_qpair *qp __unused) +{ + struct nvmf_tcp_capsule *nc; + + nc = calloc(1, sizeof(*nc)); + return (&nc->nc); +} + +static void +tcp_free_capsule(struct nvmf_capsule *nc) +{ + struct nvmf_tcp_capsule *tc = TCAP(nc); + + nvmf_tcp_free_pdu(&tc->rx_pdu); + if (tc->cb != NULL) + tcp_free_command_buffer(tc->cb); + free(tc); +} + +static int +tcp_transmit_command(struct nvmf_capsule *nc) +{ + struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); + struct nvmf_tcp_capsule *tc = TCAP(nc); + struct nvme_tcp_cmd cmd; + struct nvme_sgl_descriptor *sgl; + int error; + bool use_icd; + + use_icd = false; + if (nc->nc_data_len != 0 && nc->nc_send_data && + nc->nc_data_len <= qp->max_icd) + use_icd = true; + + memset(&cmd, 0, sizeof(cmd)); + cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD; + cmd.ccsqe = nc->nc_sqe; + + /* Populate SGL in SQE. */ + sgl = &cmd.ccsqe.sgl; + memset(sgl, 0, sizeof(*sgl)); + sgl->address = 0; + sgl->length = htole32(nc->nc_data_len); + if (use_icd) { + /* Use in-capsule data. */ + sgl->type = NVME_SGL_TYPE_ICD; + } else { + /* Use a command buffer. */ + sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER; + } + + /* Send command capsule. */ + error = nvmf_tcp_construct_pdu(qp, &cmd, sizeof(cmd), use_icd ? + nc->nc_data : NULL, use_icd ? nc->nc_data_len : 0); + if (error != 0) + return (error); + + /* + * If data will be transferred using a command buffer, allocate a + * buffer structure and queue it. 
+ */ + if (nc->nc_data_len != 0 && !use_icd) + tc->cb = tcp_alloc_command_buffer(qp, nc->nc_data, 0, + nc->nc_data_len, cmd.ccsqe.cid, 0, !nc->nc_send_data); + + return (0); +} + +static int +tcp_transmit_response(struct nvmf_capsule *nc) +{ + struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); + struct nvme_tcp_rsp rsp; + + memset(&rsp, 0, sizeof(rsp)); + rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP; + rsp.rccqe = nc->nc_cqe; + + return (nvmf_tcp_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0)); +} + +static int +tcp_transmit_capsule(struct nvmf_capsule *nc) +{ + if (nc->nc_qe_len == sizeof(struct nvme_command)) + return (tcp_transmit_command(nc)); + else + return (tcp_transmit_response(nc)); +} + +static int +tcp_receive_capsule(struct nvmf_qpair *nq, struct nvmf_capsule **ncp) +{ + struct nvmf_tcp_qpair *qp = TQP(nq); + struct nvmf_tcp_capsule *tc; + int error; + + while (TAILQ_EMPTY(&qp->rx_capsules)) { + error = nvmf_tcp_receive_pdu(qp); + if (error != 0) + return (error); + } + tc = TAILQ_FIRST(&qp->rx_capsules); + TAILQ_REMOVE(&qp->rx_capsules, tc, link); + *ncp = &tc->nc; + return (0); +} + +static uint8_t +tcp_validate_command_capsule(const struct nvmf_capsule *nc) +{ + const struct nvmf_tcp_capsule *tc = CTCAP(nc); + const struct nvme_sgl_descriptor *sgl; + + assert(tc->rx_pdu.hdr != NULL); + + sgl = &nc->nc_sqe.sgl; + switch (sgl->type) { + case NVME_SGL_TYPE_ICD: + if (tc->rx_pdu.data_len != le32toh(sgl->length)) { + printf("NVMe/TCP: Command Capsule with mismatched ICD length\n"); + return (NVME_SC_DATA_SGL_LENGTH_INVALID); + } + break; + case NVME_SGL_TYPE_COMMAND_BUFFER: + if (tc->rx_pdu.data_len != 0) { + printf("NVMe/TCP: Command Buffer SGL with ICD\n"); + return (NVME_SC_INVALID_FIELD); + } + break; + default: + printf("NVMe/TCP: Invalid SGL type in Command Capsule\n"); + return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID); + } + + if (sgl->address != 0) { + printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n"); + return (NVME_SC_SGL_OFFSET_INVALID); + } + + return (NVME_SC_SUCCESS); +} + +static size_t +tcp_capsule_data_len(const struct nvmf_capsule *nc) +{ + assert(nc->nc_qe_len == sizeof(struct nvme_command)); + return (le32toh(nc->nc_sqe.sgl.length)); +} + +/* NB: cid and ttag are both little-endian already. */ +static int +tcp_send_r2t(struct nvmf_tcp_qpair *qp, uint16_t cid, uint16_t ttag, + uint32_t data_offset, uint32_t data_len) +{ + struct nvme_tcp_r2t_hdr r2t; + + memset(&r2t, 0, sizeof(r2t)); + r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T; + r2t.cccid = cid; + r2t.ttag = ttag; + r2t.r2to = htole32(data_offset); + r2t.r2tl = htole32(data_len); + + return (nvmf_tcp_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0)); +} + +static int +tcp_receive_r2t_data(const struct nvmf_capsule *nc, uint32_t data_offset, + void *buf, size_t len) +{ + struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); + struct nvmf_tcp_command_buffer *cb; + int error; + uint16_t ttag; + + /* + * Don't bother byte-swapping ttag as it is just a cookie + * value returned by the other end as-is. + */ + ttag = qp->next_ttag++; + + error = tcp_send_r2t(qp, nc->nc_sqe.cid, ttag, data_offset, len); + if (error != 0) + return (error); + + cb = tcp_alloc_command_buffer(qp, buf, data_offset, len, + nc->nc_sqe.cid, ttag, true); + + /* Parse received PDUs until the data transfer is complete. 
*/ + while (cb->data_xfered < cb->data_len) { + error = nvmf_tcp_receive_pdu(qp); + if (error != 0) + break; + } + tcp_free_command_buffer(cb); + return (error); +} + +static int +tcp_receive_icd_data(const struct nvmf_capsule *nc, uint32_t data_offset, + void *buf, size_t len) +{ + const struct nvmf_tcp_capsule *tc = CTCAP(nc); + const char *icd; + + icd = (const char *)tc->rx_pdu.hdr + tc->rx_pdu.hdr->pdo + data_offset; + memcpy(buf, icd, len); + return (0); +} + +static int +tcp_receive_controller_data(const struct nvmf_capsule *nc, uint32_t data_offset, + void *buf, size_t len) +{ + struct nvmf_association *na = nc->nc_qpair->nq_association; + const struct nvme_sgl_descriptor *sgl; + size_t data_len; + + if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller) + return (EINVAL); + + sgl = &nc->nc_sqe.sgl; + data_len = le32toh(sgl->length); + if (data_offset + len > data_len) + return (EFBIG); + + if (sgl->type == NVME_SGL_TYPE_ICD) + return (tcp_receive_icd_data(nc, data_offset, buf, len)); + else + return (tcp_receive_r2t_data(nc, data_offset, buf, len)); +} + +/* NB: cid is little-endian already. */ +static int +tcp_send_c2h_pdu(struct nvmf_tcp_qpair *qp, uint16_t cid, + uint32_t data_offset, const void *buf, size_t len, bool last_pdu, + bool success) +{ + struct nvme_tcp_c2h_data_hdr c2h; + + memset(&c2h, 0, sizeof(c2h)); + c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA; + if (last_pdu) + c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU; + if (success) + c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS; + c2h.cccid = cid; + c2h.datao = htole32(data_offset); + c2h.datal = htole32(len); + + return (nvmf_tcp_construct_pdu(qp, &c2h, sizeof(c2h), + __DECONST(void *, buf), len)); +} + +static int +tcp_send_controller_data(const struct nvmf_capsule *nc, const void *buf, + size_t len) +{ + struct nvmf_association *na = nc->nc_qpair->nq_association; + struct nvmf_tcp_qpair *qp = TQP(nc->nc_qpair); + const struct nvme_sgl_descriptor *sgl; + const char *src; + size_t todo; + uint32_t data_len, data_offset; + int error; + bool last_pdu, send_success_flag; + + if (nc->nc_qe_len != sizeof(struct nvme_command) || !na->na_controller) + return (EINVAL); + + sgl = &nc->nc_sqe.sgl; + data_len = le32toh(sgl->length); + if (len != data_len) { + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); + return (EFBIG); + } + + if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) { + nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD); + return (EINVAL); + } + + /* Use the SUCCESS flag if SQ flow control is disabled. */ + send_success_flag = !qp->qp.nq_flow_control; + + /* + * Write out one or more C2H_DATA PDUs containing the data. + * Each PDU is arbitrarily capped at 256k. 
+ */ + data_offset = 0; + src = buf; + while (len > 0) { + if (len > 256 * 1024) { + todo = 256 * 1024; + last_pdu = false; + } else { + todo = len; + last_pdu = true; + } + error = tcp_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, + src, todo, last_pdu, last_pdu && send_success_flag); + if (error != 0) { + nvmf_send_generic_error(nc, + NVME_SC_TRANSIENT_TRANSPORT_ERROR); + return (error); + } + data_offset += todo; + src += todo; + len -= todo; + } + if (!send_success_flag) + nvmf_send_success(nc); + return (0); +} + +struct nvmf_transport_ops tcp_ops = { + .allocate_association = tcp_allocate_association, + .update_association = tcp_update_association, + .free_association = tcp_free_association, + .allocate_qpair = tcp_allocate_qpair, + .free_qpair = tcp_free_qpair, + .kernel_handoff_params = tcp_kernel_handoff_params, + .allocate_capsule = tcp_allocate_capsule, + .free_capsule = tcp_free_capsule, + .transmit_capsule = tcp_transmit_capsule, + .receive_capsule = tcp_receive_capsule, + .validate_command_capsule = tcp_validate_command_capsule, + .capsule_data_len = tcp_capsule_data_len, + .receive_controller_data = tcp_receive_controller_data, + .send_controller_data = tcp_send_controller_data, +}; diff --git a/lib/libnvmf/nvmf_transport.c b/lib/libnvmf/nvmf_transport.c new file mode 100644 --- /dev/null +++ b/lib/libnvmf/nvmf_transport.c @@ -0,0 +1,269 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2022-2024 Chelsio Communications, Inc. + * Written by: John Baldwin + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "libnvmf.h" +#include "internal.h" + +struct nvmf_association * +nvmf_allocate_association(enum nvmf_trtype trtype, bool controller, + const struct nvmf_association_params *params) +{ + struct nvmf_transport_ops *ops; + struct nvmf_association *na; + + switch (trtype) { + case NVMF_TRTYPE_TCP: + ops = &tcp_ops; + break; + default: + errno = EINVAL; + return (NULL); + } + + na = ops->allocate_association(controller, params); + if (na == NULL) + return (NULL); + + na->na_ops = ops; + na->na_trtype = trtype; + na->na_controller = controller; + na->na_params = *params; + na->na_last_error = NULL; + refcount_init(&na->na_refs, 1); + return (na); +} + +void +nvmf_update_assocation(struct nvmf_association *na, + const struct nvme_controller_data *cdata) +{ + na->na_ops->update_association(na, cdata); +} + +void +nvmf_free_association(struct nvmf_association *na) +{ + if (refcount_release(&na->na_refs)) { + free(na->na_last_error); + na->na_ops->free_association(na); + } +} + +const char * +nvmf_association_error(const struct nvmf_association *na) +{ + return (na->na_last_error); +} + +void +na_clear_error(struct nvmf_association *na) +{ + free(na->na_last_error); + na->na_last_error = NULL; +} + +void +na_error(struct nvmf_association *na, const char *fmt, ...) 
+{ + va_list ap; + char *str; + + if (na->na_last_error != NULL) + return; + va_start(ap, fmt); + vasprintf(&str, fmt, ap); + va_end(ap); + na->na_last_error = str; +} + +struct nvmf_qpair * +nvmf_allocate_qpair(struct nvmf_association *na, + const struct nvmf_qpair_params *params) +{ + struct nvmf_qpair *qp; + + na_clear_error(na); + qp = na->na_ops->allocate_qpair(na, params); + if (qp == NULL) + return (NULL); + + refcount_acquire(&na->na_refs); + qp->nq_association = na; + qp->nq_admin = params->admin; + TAILQ_INIT(&qp->nq_rx_capsules); + return (qp); +} + +void +nvmf_free_qpair(struct nvmf_qpair *qp) +{ + struct nvmf_association *na; + struct nvmf_capsule *nc, *tc; + + TAILQ_FOREACH_SAFE(nc, &qp->nq_rx_capsules, nc_link, tc) { + TAILQ_REMOVE(&qp->nq_rx_capsules, nc, nc_link); + nvmf_free_capsule(nc); + } + na = qp->nq_association; + na->na_ops->free_qpair(qp); + nvmf_free_association(na); +} + +struct nvmf_capsule * +nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe) +{ + struct nvmf_capsule *nc; + + nc = qp->nq_association->na_ops->allocate_capsule(qp); + if (nc == NULL) + return (NULL); + + nc->nc_qpair = qp; + nc->nc_qe_len = sizeof(struct nvme_command); + memcpy(&nc->nc_sqe, sqe, nc->nc_qe_len); + + /* 4.2 of NVMe base spec: Fabrics always uses SGL. */ + nc->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT); + nc->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL); + return (nc); +} + +struct nvmf_capsule * +nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe) +{ + struct nvmf_capsule *nc; + + nc = qp->nq_association->na_ops->allocate_capsule(qp); + if (nc == NULL) + return (NULL); + + nc->nc_qpair = qp; + nc->nc_qe_len = sizeof(struct nvme_completion); + memcpy(&nc->nc_cqe, cqe, nc->nc_qe_len); + return (nc); +} + +int +nvmf_capsule_append_data(struct nvmf_capsule *nc, void *buf, size_t len, + bool send) +{ + if (nc->nc_qe_len == sizeof(struct nvme_completion)) + return (EINVAL); + if (nc->nc_data_len != 0) + return (EBUSY); + + nc->nc_data = buf; + nc->nc_data_len = len; + nc->nc_send_data = send; + return (0); +} + +void +nvmf_free_capsule(struct nvmf_capsule *nc) +{ + nc->nc_qpair->nq_association->na_ops->free_capsule(nc); +} + +int +nvmf_transmit_capsule(struct nvmf_capsule *nc) +{ + return (nc->nc_qpair->nq_association->na_ops->transmit_capsule(nc)); +} + +int +nvmf_receive_capsule(struct nvmf_qpair *qp, struct nvmf_capsule **ncp) +{ + return (qp->nq_association->na_ops->receive_capsule(qp, ncp)); +} + +const void * +nvmf_capsule_sqe(const struct nvmf_capsule *nc) +{ + assert(nc->nc_qe_len == sizeof(struct nvme_command)); + return (&nc->nc_sqe); +} + +const void * +nvmf_capsule_cqe(const struct nvmf_capsule *nc) +{ + assert(nc->nc_qe_len == sizeof(struct nvme_completion)); + return (&nc->nc_cqe); +} + +uint8_t +nvmf_validate_command_capsule(const struct nvmf_capsule *nc) +{ + assert(nc->nc_qe_len == sizeof(struct nvme_command)); + + if (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) != NVME_PSDT_SGL) + return (NVME_SC_INVALID_FIELD); + + return (nc->nc_qpair->nq_association->na_ops->validate_command_capsule(nc)); +} + +size_t +nvmf_capsule_data_len(const struct nvmf_capsule *nc) +{ + return (nc->nc_qpair->nq_association->na_ops->capsule_data_len(nc)); +} + +int +nvmf_receive_controller_data(const struct nvmf_capsule *nc, + uint32_t data_offset, void *buf, size_t len) +{ + return (nc->nc_qpair->nq_association->na_ops->receive_controller_data(nc, + data_offset, buf, len)); +} + +int +nvmf_send_controller_data(const struct nvmf_capsule *nc, const void *buf, + size_t len) +{ + 
return (nc->nc_qpair->nq_association->na_ops->send_controller_data(nc, + buf, len)); +} + +int +nvmf_kernel_handoff_params(struct nvmf_qpair *qp, + struct nvmf_handoff_qpair_params *qparams) +{ + memset(qparams, 0, sizeof(*qparams)); + qparams->admin = qp->nq_admin; + qparams->sq_flow_control = qp->nq_flow_control; + qparams->qsize = qp->nq_qsize; + qparams->sqhd = qp->nq_sqhd; + qparams->sqtail = qp->nq_sqtail; + return (qp->nq_association->na_ops->kernel_handoff_params(qp, qparams)); +} + +const char * +nvmf_transport_type(uint8_t trtype) +{ + static _Thread_local char buf[8]; + + switch (trtype) { + case NVMF_TRTYPE_RDMA: + return ("RDMA"); + case NVMF_TRTYPE_FC: + return ("Fibre Channel"); + case NVMF_TRTYPE_TCP: + return ("TCP"); + case NVMF_TRTYPE_INTRA_HOST: + return ("Intra-host"); + default: + snprintf(buf, sizeof(buf), "0x%02x", trtype); + return (buf); + } +} diff --git a/share/mk/src.libnames.mk b/share/mk/src.libnames.mk --- a/share/mk/src.libnames.mk +++ b/share/mk/src.libnames.mk @@ -56,6 +56,7 @@ netbsd \ ntp \ ntpevent \ + nvmf \ openbsd \ opts \ parse \ @@ -599,6 +600,9 @@ LIBISCSIUTILDIR= ${_LIB_OBJTOP}/lib/libiscsiutil LIBISCSIUTIL?= ${LIBISCSIUTILDIR}/libiscsiutil${PIE_SUFFIX}.a +LIBNVMFDIR= ${_LIB_OBJTOP}/lib/libnvmf +LIBNVMF?= ${LIBNVMFDIR}/libnvmf${PIE_SUFFIX}.a + LIBTELNETDIR= ${_LIB_OBJTOP}/lib/libtelnet LIBTELNET?= ${LIBTELNETDIR}/libtelnet${PIE_SUFFIX}.a
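
Reviewer note, for context only and not part of the patch: the two sketches below show one way the transport-independent API introduced by this change could be driven. They are illustrative assumptions rather than code from this diff: the helper names (connect_admin_queue, serve_queue), the chosen parameter values, the use of err(3) for error handling, and the include path for libnvmf.h are all invented for the example; the socket is assumed to already be connected to an NVMe/TCP controller port; and the Fabrics CONNECT exchange and controller data fetch (implemented in nvmf_host.c and nvmf_controller.c, not shown in this part of the diff) are omitted.

/* Host side: create a TCP association and bring up an admin queue pair. */
#include <err.h>
#include <stdbool.h>
#include <string.h>

#include <libnvmf.h>

static struct nvmf_qpair *
connect_admin_queue(int s)	/* 's' is a TCP socket already connected. */
{
	struct nvmf_association_params aparams;
	struct nvmf_qpair_params qparams;
	struct nvmf_association *na;
	struct nvmf_qpair *qp;

	/* Requested values used to initiate transport negotiation. */
	memset(&aparams, 0, sizeof(aparams));
	aparams.sq_flow_control = false;
	aparams.tcp.pda = 0;
	aparams.tcp.header_digests = true;
	aparams.tcp.data_digests = false;
	aparams.tcp.maxr2t = 1;

	na = nvmf_allocate_association(NVMF_TRTYPE_TCP, false, &aparams);
	if (na == NULL)
		err(1, "nvmf_allocate_association");

	memset(&qparams, 0, sizeof(qparams));
	qparams.admin = true;
	qparams.tcp.fd = s;

	/* Performs the ICReq/ICResp exchange before returning. */
	qp = nvmf_allocate_qpair(na, &qparams);
	if (qp == NULL)
		errx(1, "nvmf_allocate_qpair: %s", nvmf_association_error(na));

	/*
	 * The queue pair holds its own reference on the association,
	 * so the caller's reference can be dropped once queues exist.
	 * A Fabrics CONNECT command is still required before the queue
	 * is usable.
	 */
	nvmf_free_association(na);
	return (qp);
}

On the controller side, a minimal receive loop over one queue pair might look like the following sketch; a real controller would dispatch on the opcode in the received SQE and handle CONNECT first.

/* Controller side: validate and dispatch incoming command capsules. */
static void
serve_queue(struct nvmf_qpair *qp)
{
	struct nvmf_capsule *nc;
	uint8_t sc;

	while (nvmf_receive_capsule(qp, &nc) == 0) {
		sc = nvmf_validate_command_capsule(nc);
		if (sc != NVME_SC_SUCCESS) {
			/* Reject capsules with a bad SGL up front. */
			nvmf_send_generic_error(nc, sc);
			nvmf_free_capsule(nc);
			continue;
		}

		/* ... handle the command in nvmf_capsule_sqe(nc) here ... */

		nvmf_free_capsule(nc);
	}
}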