diff --git a/sys/dev/gve/gve_adminq.h b/sys/dev/gve/gve_adminq.h index bc51046a3037..531a844f7d90 100644 --- a/sys/dev/gve/gve_adminq.h +++ b/sys/dev/gve/gve_adminq.h @@ -1,457 +1,458 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _GVE_AQ_H_ #define _GVE_AQ_H_ 1 #include #include #include #include #include /* Admin queue opcodes */ enum gve_adminq_opcodes { GVE_ADMINQ_DESCRIBE_DEVICE = 0x1, GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES = 0x2, GVE_ADMINQ_REGISTER_PAGE_LIST = 0x3, GVE_ADMINQ_UNREGISTER_PAGE_LIST = 0x4, GVE_ADMINQ_CREATE_TX_QUEUE = 0x5, GVE_ADMINQ_CREATE_RX_QUEUE = 0x6, GVE_ADMINQ_DESTROY_TX_QUEUE = 0x7, GVE_ADMINQ_DESTROY_RX_QUEUE = 0x8, GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES = 0x9, GVE_ADMINQ_SET_DRIVER_PARAMETER = 0xB, GVE_ADMINQ_REPORT_STATS = 0xC, GVE_ADMINQ_REPORT_LINK_SPEED = 0xD, GVE_ADMINQ_GET_PTYPE_MAP = 0xE, GVE_ADMINQ_VERIFY_DRIVER_COMPATIBILITY = 0xF, }; /* Admin queue status codes */ enum gve_adminq_statuses { GVE_ADMINQ_COMMAND_UNSET = 0x0, GVE_ADMINQ_COMMAND_PASSED = 0x1, GVE_ADMINQ_COMMAND_ERROR_ABORTED = 0xFFFFFFF0, GVE_ADMINQ_COMMAND_ERROR_ALREADY_EXISTS = 0xFFFFFFF1, GVE_ADMINQ_COMMAND_ERROR_CANCELLED = 0xFFFFFFF2, GVE_ADMINQ_COMMAND_ERROR_DATALOSS = 0xFFFFFFF3, GVE_ADMINQ_COMMAND_ERROR_DEADLINE_EXCEEDED = 0xFFFFFFF4, GVE_ADMINQ_COMMAND_ERROR_FAILED_PRECONDITION = 0xFFFFFFF5, GVE_ADMINQ_COMMAND_ERROR_INTERNAL_ERROR = 0xFFFFFFF6, GVE_ADMINQ_COMMAND_ERROR_INVALID_ARGUMENT = 0xFFFFFFF7, GVE_ADMINQ_COMMAND_ERROR_NOT_FOUND = 0xFFFFFFF8, GVE_ADMINQ_COMMAND_ERROR_OUT_OF_RANGE = 0xFFFFFFF9, GVE_ADMINQ_COMMAND_ERROR_PERMISSION_DENIED = 0xFFFFFFFA, GVE_ADMINQ_COMMAND_ERROR_UNAUTHENTICATED = 0xFFFFFFFB, GVE_ADMINQ_COMMAND_ERROR_RESOURCE_EXHAUSTED = 0xFFFFFFFC, GVE_ADMINQ_COMMAND_ERROR_UNAVAILABLE = 0xFFFFFFFD, GVE_ADMINQ_COMMAND_ERROR_UNIMPLEMENTED = 0xFFFFFFFE, GVE_ADMINQ_COMMAND_ERROR_UNKNOWN_ERROR = 0xFFFFFFFF, }; #define GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION 1 /* * All AdminQ command structs should be naturally packed. 
The static_assert * calls make sure this is the case at compile time. */ struct gve_adminq_describe_device { __be64 device_descriptor_addr; __be32 device_descriptor_version; __be32 available_length; }; _Static_assert(sizeof(struct gve_adminq_describe_device) == 16, "gve: bad admin queue struct length"); struct gve_device_descriptor { __be64 max_registered_pages; __be16 reserved1; __be16 tx_queue_entries; __be16 rx_queue_entries; __be16 default_num_queues; __be16 mtu; __be16 counters; __be16 reserved2; __be16 rx_pages_per_qpl; uint8_t mac[ETHER_ADDR_LEN]; __be16 num_device_options; __be16 total_length; uint8_t reserved3[6]; }; _Static_assert(sizeof(struct gve_device_descriptor) == 40, "gve: bad admin queue struct length"); struct gve_device_option { __be16 option_id; __be16 option_length; __be32 required_features_mask; }; _Static_assert(sizeof(struct gve_device_option) == 8, "gve: bad admin queue struct length"); struct gve_device_option_gqi_rda { __be32 supported_features_mask; }; _Static_assert(sizeof(struct gve_device_option_gqi_rda) == 4, "gve: bad admin queue struct length"); struct gve_device_option_gqi_qpl { __be32 supported_features_mask; }; _Static_assert(sizeof(struct gve_device_option_gqi_qpl) == 4, "gve: bad admin queue struct length"); struct gve_device_option_dqo_rda { __be32 supported_features_mask; __be16 tx_comp_ring_entries; __be16 rx_buff_ring_entries; }; _Static_assert(sizeof(struct gve_device_option_dqo_rda) == 8, "gve: bad admin queue struct length"); struct gve_device_option_dqo_qpl { __be32 supported_features_mask; __be16 tx_comp_ring_entries; __be16 rx_buff_ring_entries; }; _Static_assert(sizeof(struct gve_device_option_dqo_qpl) == 8, "gve: bad admin queue struct length"); struct gve_ring_size_bound { __be16 rx; __be16 tx; }; _Static_assert(sizeof(struct gve_ring_size_bound) == 4, "gve: bad admin queue struct length"); struct gve_device_option_modify_ring { __be32 supported_features_mask; struct gve_ring_size_bound max_ring_size; struct gve_ring_size_bound min_ring_size; }; _Static_assert(sizeof(struct gve_device_option_modify_ring) == 12, "gve: bad admin queue struct length"); struct gve_device_option_jumbo_frames { __be32 supported_features_mask; __be16 max_mtu; uint8_t padding[2]; }; _Static_assert(sizeof(struct gve_device_option_jumbo_frames) == 8, "gve: bad admin queue struct length"); enum gve_dev_opt_id { GVE_DEV_OPT_ID_GQI_RAW_ADDRESSING = 0x1, GVE_DEV_OPT_ID_GQI_RDA = 0x2, GVE_DEV_OPT_ID_GQI_QPL = 0x3, GVE_DEV_OPT_ID_DQO_RDA = 0x4, GVE_DEV_OPT_ID_MODIFY_RING = 0x6, GVE_DEV_OPT_ID_DQO_QPL = 0x7, GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8, }; /* * These masks are way to predicate the use of a particular option on the driver * having particular bug fixes represented by each bit position in the mask. * Currently they are all zero because there are no known bugs preventing the * use of any option. 
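 *
 * Illustrative sketch (editor's addition, not part of the driver): before
 * enabling an option from the device descriptor, a driver would confirm that
 * none of the bits the option requires are missing from its own mask,
 * roughly:
 *
 *	bool
 *	driver_has_req_feats(const struct gve_device_option *opt,
 *	    uint32_t driver_feat_mask)
 *	{
 *		return ((be32toh(opt->required_features_mask) &
 *		    ~driver_feat_mask) == 0);
 *	}
 *
 * driver_has_req_feats() is a hypothetical name used only for this example.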
*/ enum gve_dev_opt_req_feat_mask { GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0, }; enum gve_sup_feature_mask { GVE_SUP_MODIFY_RING_MASK = 1 << 0, GVE_SUP_JUMBO_FRAMES_MASK = 1 << 2, }; #define GVE_VERSION_STR_LEN 128 enum gve_driver_capability { gve_driver_capability_gqi_qpl = 0, gve_driver_capability_gqi_rda = 1, gve_driver_capability_dqo_qpl = 2, gve_driver_capability_dqo_rda = 3, }; #define GVE_CAP1(a) BIT((int) a) #define GVE_CAP2(a) BIT(((int) a) - 64) #define GVE_CAP3(a) BIT(((int) a) - 128) #define GVE_CAP4(a) BIT(((int) a) - 192) /* * The following four defines describe 256 compatibility bits. * Only a few bits (as shown in `gve_driver_compatibility`) are currently * defined. The rest are reserved for future use. */ #define GVE_DRIVER_CAPABILITY_FLAGS1 \ (GVE_CAP1(gve_driver_capability_gqi_qpl) | \ GVE_CAP1(gve_driver_capability_dqo_qpl) | \ GVE_CAP1(gve_driver_capability_dqo_rda)) #define GVE_DRIVER_CAPABILITY_FLAGS2 0x0 #define GVE_DRIVER_CAPABILITY_FLAGS3 0x0 #define GVE_DRIVER_CAPABILITY_FLAGS4 0x0 struct gve_driver_info { uint8_t os_type; uint8_t driver_major; uint8_t driver_minor; uint8_t driver_sub; __be32 os_version_major; __be32 os_version_minor; __be32 os_version_sub; __be64 driver_capability_flags[4]; uint8_t os_version_str1[GVE_VERSION_STR_LEN]; uint8_t os_version_str2[GVE_VERSION_STR_LEN]; }; struct gve_adminq_verify_driver_compatibility { __be64 driver_info_len; __be64 driver_info_addr; }; _Static_assert(sizeof(struct gve_adminq_verify_driver_compatibility) == 16, "gve: bad admin queue struct length"); struct gve_adminq_configure_device_resources { __be64 counter_array; __be64 irq_db_addr; __be32 num_counters; __be32 num_irq_dbs; __be32 irq_db_stride; __be32 ntfy_blk_msix_base_idx; uint8_t queue_format; uint8_t padding[7]; }; _Static_assert(sizeof(struct gve_adminq_configure_device_resources) == 40, "gve: bad admin queue struct length"); struct gve_adminq_register_page_list { __be32 page_list_id; __be32 num_pages; __be64 page_address_list_addr; __be64 page_size; }; _Static_assert(sizeof(struct gve_adminq_register_page_list) == 24, "gve: bad admin queue struct length"); struct gve_adminq_unregister_page_list { __be32 page_list_id; }; _Static_assert(sizeof(struct gve_adminq_unregister_page_list) == 4, "gve: bad admin queue struct length"); struct gve_adminq_create_tx_queue { __be32 queue_id; __be32 reserved; __be64 queue_resources_addr; __be64 tx_ring_addr; __be32 queue_page_list_id; __be32 ntfy_id; __be64 tx_comp_ring_addr; __be16 tx_ring_size; __be16 tx_comp_ring_size; uint8_t padding[4]; }; _Static_assert(sizeof(struct gve_adminq_create_tx_queue) == 48, "gve: bad admin queue struct length"); #define GVE_RAW_ADDRESSING_QPL_ID 0xFFFFFFFF struct gve_adminq_create_rx_queue { __be32 queue_id; __be32 index; __be32 reserved; __be32 ntfy_id; __be64 queue_resources_addr; __be64 rx_desc_ring_addr; __be64 rx_data_ring_addr; __be32 queue_page_list_id; __be16 rx_ring_size; __be16 packet_buffer_size; __be16 rx_buff_ring_size; uint8_t enable_rsc; uint8_t padding[5]; }; _Static_assert(sizeof(struct gve_adminq_create_rx_queue) == 56, "gve: bad admin queue struct length"); /* Queue resources that are shared with the device */ struct gve_queue_resources { union { struct { __be32 db_index; /* Device -> Guest */ __be32 
counter_index; /* Device -> Guest */ }; uint8_t reserved[64]; }; }; _Static_assert(sizeof(struct gve_queue_resources) == 64, "gve: bad admin queue struct length"); struct gve_adminq_destroy_tx_queue { __be32 queue_id; }; _Static_assert(sizeof(struct gve_adminq_destroy_tx_queue) == 4, "gve: bad admin queue struct length"); struct gve_adminq_destroy_rx_queue { __be32 queue_id; }; _Static_assert(sizeof(struct gve_adminq_destroy_rx_queue) == 4, "gve: bad admin queue struct length"); /* GVE Set Driver Parameter Types */ enum gve_set_driver_param_types { GVE_SET_PARAM_MTU = 0x1, }; struct gve_adminq_set_driver_parameter { __be32 parameter_type; uint8_t reserved[4]; __be64 parameter_value; }; _Static_assert(sizeof(struct gve_adminq_set_driver_parameter) == 16, "gve: bad admin queue struct length"); struct stats { __be32 stat_name; __be32 queue_id; __be64 value; }; _Static_assert(sizeof(struct stats) == 16, "gve: bad admin queue struct length"); -/* These are control path types for PTYPE which are the same as the data path +/* + * These are control path types for PTYPE which are the same as the data path * types. */ struct gve_ptype_entry { uint8_t l3_type; uint8_t l4_type; }; struct gve_ptype_map { struct gve_ptype_entry ptypes[1 << 10]; /* PTYPES are always 10 bits. */ }; struct gve_adminq_get_ptype_map { __be64 ptype_map_len; __be64 ptype_map_addr; }; struct gve_adminq_command { __be32 opcode; __be32 status; union { struct gve_adminq_configure_device_resources configure_device_resources; struct gve_adminq_create_tx_queue create_tx_queue; struct gve_adminq_create_rx_queue create_rx_queue; struct gve_adminq_destroy_tx_queue destroy_tx_queue; struct gve_adminq_destroy_rx_queue destroy_rx_queue; struct gve_adminq_describe_device describe_device; struct gve_adminq_register_page_list reg_page_list; struct gve_adminq_unregister_page_list unreg_page_list; struct gve_adminq_set_driver_parameter set_driver_param; struct gve_adminq_verify_driver_compatibility verify_driver_compatibility; struct gve_adminq_get_ptype_map get_ptype_map; uint8_t reserved[56]; }; }; _Static_assert(sizeof(struct gve_adminq_command) == 64, "gve: bad admin queue struct length"); enum gve_l3_type { /* Must be zero so zero initialized LUT is unknown. */ GVE_L3_TYPE_UNKNOWN = 0, GVE_L3_TYPE_OTHER, GVE_L3_TYPE_IPV4, GVE_L3_TYPE_IPV6, }; enum gve_l4_type { /* Must be zero so zero initialized LUT is unknown. 
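 * (Editor's illustrative note: a lookup is a plain array index on the
 * 10-bit packet_type reported in the RX completion descriptor, e.g.
 * map->ptypes[packet_type & ((1 << 10) - 1)], so any entry the device
 * never fills in reads back as the UNKNOWN types.)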
*/ GVE_L4_TYPE_UNKNOWN = 0, GVE_L4_TYPE_OTHER, GVE_L4_TYPE_TCP, GVE_L4_TYPE_UDP, GVE_L4_TYPE_ICMP, GVE_L4_TYPE_SCTP, }; int gve_adminq_create_rx_queues(struct gve_priv *priv, uint32_t num_queues); int gve_adminq_create_tx_queues(struct gve_priv *priv, uint32_t num_queues); int gve_adminq_destroy_tx_queues(struct gve_priv *priv, uint32_t num_queues); int gve_adminq_destroy_rx_queues(struct gve_priv *priv, uint32_t num_queues); int gve_adminq_set_mtu(struct gve_priv *priv, uint32_t mtu); int gve_adminq_alloc(struct gve_priv *priv); void gve_reset_adminq(struct gve_priv *priv); int gve_adminq_describe_device(struct gve_priv *priv); int gve_adminq_configure_device_resources(struct gve_priv *priv); int gve_adminq_deconfigure_device_resources(struct gve_priv *priv); void gve_release_adminq(struct gve_priv *priv); int gve_adminq_register_page_list(struct gve_priv *priv, struct gve_queue_page_list *qpl); int gve_adminq_unregister_page_list(struct gve_priv *priv, uint32_t page_list_id); int gve_adminq_verify_driver_compatibility(struct gve_priv *priv, uint64_t driver_info_len, vm_paddr_t driver_info_addr); int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv, struct gve_ptype_lut *ptype_lut); #endif /* _GVE_AQ_H_ */ diff --git a/sys/dev/gve/gve_dqo.h b/sys/dev/gve/gve_dqo.h index 214138303a77..212bfa1a6ad3 100644 --- a/sys/dev/gve/gve_dqo.h +++ b/sys/dev/gve/gve_dqo.h @@ -1,321 +1,333 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* GVE DQO Descriptor formats */ #ifndef _GVE_DESC_DQO_H_ #define _GVE_DESC_DQO_H_ #include "gve_plat.h" #define GVE_ITR_ENABLE_BIT_DQO BIT(0) #define GVE_ITR_NO_UPDATE_DQO (3 << 3) #define GVE_ITR_INTERVAL_DQO_SHIFT 5 #define GVE_ITR_INTERVAL_DQO_MASK ((1 << 12) - 1) #define GVE_TX_IRQ_RATELIMIT_US_DQO 50 #define GVE_RX_IRQ_RATELIMIT_US_DQO 20 #define GVE_TX_MAX_HDR_SIZE_DQO 255 #define GVE_TX_MIN_TSO_MSS_DQO 88 /* * Ringing the doorbell too often can hurt performance. * * HW requires this value to be at least 8. 
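 *
 * Concretely, the RX path only rings the doorbell when the ring head is a
 * multiple of this value (see gve_rx_advance_head_dqo()), roughly:
 *
 *	if ((head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0)
 *		write head to the RX doorbell;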
*/ #define GVE_RX_BUF_THRESH_DQO 32 /* * Start dropping RX fragments if at least these many * buffers cannot be posted to the NIC. */ #define GVE_RX_DQO_MIN_PENDING_BUFS 128 #define GVE_DQ_NUM_FRAGS_IN_PAGE (PAGE_SIZE / GVE_DEFAULT_RX_BUFFER_SIZE) /* * gve_rx_qpl_buf_id_dqo's 11 bit wide buf_id field limits the total * number of pages per QPL to 2048. */ #define GVE_RX_NUM_QPL_PAGES_DQO 2048 /* 2K TX buffers for DQO-QPL */ #define GVE_TX_BUF_SHIFT_DQO 11 #define GVE_TX_BUF_SIZE_DQO BIT(GVE_TX_BUF_SHIFT_DQO) #define GVE_TX_BUFS_PER_PAGE_DQO (PAGE_SIZE >> GVE_TX_BUF_SHIFT_DQO) #define GVE_TX_NUM_QPL_PAGES_DQO 512 /* Basic TX descriptor (DTYPE 0x0C) */ struct gve_tx_pkt_desc_dqo { __le64 buf_addr; /* Must be GVE_TX_PKT_DESC_DTYPE_DQO (0xc) */ uint8_t dtype:5; /* Denotes the last descriptor of a packet. */ uint8_t end_of_packet:1; uint8_t checksum_offload_enable:1; /* If set, will generate a descriptor completion for this descriptor. */ uint8_t report_event:1; uint8_t reserved0; __le16 reserved1; /* The TX completion for this packet will contain this tag. */ __le16 compl_tag; uint16_t buf_size:14; uint16_t reserved2:2; } __packed; _Static_assert(sizeof(struct gve_tx_pkt_desc_dqo) == 16, "gve: bad dqo desc struct length"); #define GVE_TX_PKT_DESC_DTYPE_DQO 0xc /* * Maximum number of data descriptors allowed per packet, or per-TSO segment. */ #define GVE_TX_MAX_DATA_DESCS_DQO 10 #define GVE_TX_MAX_BUF_SIZE_DQO ((16 * 1024) - 1) #define GVE_TSO_MAXSIZE_DQO IP_MAXPACKET _Static_assert(GVE_TX_MAX_BUF_SIZE_DQO * GVE_TX_MAX_DATA_DESCS_DQO >= GVE_TSO_MAXSIZE_DQO, "gve: bad tso parameters"); /* * "report_event" on TX packet descriptors may only be reported on the last * descriptor of a TX packet, and they must be spaced apart with at least this * value. */ #define GVE_TX_MIN_RE_INTERVAL 32 struct gve_tx_context_cmd_dtype { uint8_t dtype:5; uint8_t tso:1; uint8_t reserved1:2; uint8_t reserved2; }; _Static_assert(sizeof(struct gve_tx_context_cmd_dtype) == 2, "gve: bad dqo desc struct length"); /* * TX Native TSO Context DTYPE (0x05) * * "flex" fields allow the driver to send additional packet context to HW. */ struct gve_tx_tso_context_desc_dqo { /* The L4 payload bytes that should be segmented. */ uint32_t tso_total_len:24; uint32_t flex10:8; /* Max segment size in TSO excluding headers. */ uint16_t mss:14; uint16_t reserved:2; uint8_t header_len; /* Header length to use for TSO offload */ uint8_t flex11; struct gve_tx_context_cmd_dtype cmd_dtype; uint8_t flex0; uint8_t flex5; uint8_t flex6; uint8_t flex7; uint8_t flex8; uint8_t flex9; } __packed; _Static_assert(sizeof(struct gve_tx_tso_context_desc_dqo) == 16, "gve: bad dqo desc struct length"); #define GVE_TX_TSO_CTX_DESC_DTYPE_DQO 0x5 /* General context descriptor for sending metadata. */ struct gve_tx_general_context_desc_dqo { uint8_t flex4; uint8_t flex5; uint8_t flex6; uint8_t flex7; uint8_t flex8; uint8_t flex9; uint8_t flex10; uint8_t flex11; struct gve_tx_context_cmd_dtype cmd_dtype; uint16_t reserved; uint8_t flex0; uint8_t flex1; uint8_t flex2; uint8_t flex3; } __packed; _Static_assert(sizeof(struct gve_tx_general_context_desc_dqo) == 16, "gve: bad dqo desc struct length"); #define GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO 0x4 /* * Logical structure of metadata which is packed into context descriptor flex * fields. */ struct gve_tx_metadata_dqo { union { struct { uint8_t version; /* * A zero value means no l4_hash was associated with the * mbuf. 
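 *
 * (Editor's note: the driver derives this from the 32-bit flowid by
 * folding and truncating it, roughly (hash ^ (hash >> 16)) & 0x7fff,
 * substituting a non-zero value if the fold happens to be zero; see
 * gve_extract_tx_metadata_dqo().)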
*/ uint16_t path_hash:15; /* * Should be set to 1 if the flow associated with the * mbuf had a rehash from the TCP stack. */ uint16_t rehash_event:1; } __packed; uint8_t bytes[12]; }; } __packed; _Static_assert(sizeof(struct gve_tx_metadata_dqo) == 12, "gve: bad dqo desc struct length"); #define GVE_TX_METADATA_VERSION_DQO 0 +/* Used to access the generation bit within a TX completion descriptor. */ +#define GVE_TX_DESC_DQO_GEN_BYTE_OFFSET 1 +#define GVE_TX_DESC_DQO_GEN_BIT_MASK 0x80 + /* TX completion descriptor */ struct gve_tx_compl_desc_dqo { - /* For types 0-4 this is the TX queue ID associated with this + /* + * For types 0-4 this is the TX queue ID associated with this * completion. */ uint16_t id:11; /* See: GVE_COMPL_TYPE_DQO* */ uint16_t type:3; uint16_t reserved0:1; /* Flipped by HW to notify the descriptor is populated. */ uint16_t generation:1; union { - /* For descriptor completions, this is the last index fetched + /* + * For descriptor completions, this is the last index fetched * by HW + 1. */ __le16 tx_head; - /* For packet completions, this is the completion tag set on the + /* + * For packet completions, this is the completion tag set on the * TX packet descriptors. */ __le16 completion_tag; }; __le32 reserved1; } __packed; _Static_assert(sizeof(struct gve_tx_compl_desc_dqo) == 8, "gve: bad dqo desc struct length"); union gve_tx_desc_dqo { struct gve_tx_pkt_desc_dqo pkt; struct gve_tx_tso_context_desc_dqo tso_ctx; struct gve_tx_general_context_desc_dqo general_ctx; }; #define GVE_COMPL_TYPE_DQO_PKT 0x2 /* Packet completion */ #define GVE_COMPL_TYPE_DQO_DESC 0x4 /* Descriptor completion */ /* Descriptor to post buffers to HW on buffer queue. */ struct gve_rx_desc_dqo { __le16 buf_id; /* ID returned in Rx completion descriptor */ __le16 reserved0; __le32 reserved1; __le64 buf_addr; /* DMA address of the buffer */ __le64 header_buf_addr; __le64 reserved2; } __packed; _Static_assert(sizeof(struct gve_rx_desc_dqo) == 32, "gve: bad dqo desc struct length"); +/* Used to access the generation bit within an RX completion descriptor. */ +#define GVE_RX_DESC_DQO_GEN_BYTE_OFFSET 5 +#define GVE_RX_DESC_DQO_GEN_BIT_MASK 0x40 + /* Descriptor for HW to notify SW of new packets received on RX queue. */ struct gve_rx_compl_desc_dqo { /* Must be 1 */ uint8_t rxdid:4; uint8_t reserved0:4; /* Packet originated from this system rather than the network. */ uint8_t loopback:1; - /* Set when IPv6 packet contains a destination options header or routing + /* + * Set when IPv6 packet contains a destination options header or routing * header. */ uint8_t ipv6_ex_add:1; /* Invalid packet was received. */ uint8_t rx_error:1; uint8_t reserved1:5; uint16_t packet_type:10; uint16_t ip_hdr_err:1; uint16_t udp_len_err:1; uint16_t raw_cs_invalid:1; uint16_t reserved2:3; uint16_t packet_len:14; /* Flipped by HW to notify the descriptor is populated. */ uint16_t generation:1; /* Should be zero. */ uint16_t buffer_queue_id:1; uint16_t header_len:10; uint16_t rsc:1; uint16_t split_header:1; uint16_t reserved3:4; uint8_t descriptor_done:1; uint8_t end_of_packet:1; uint8_t header_buffer_overflow:1; uint8_t l3_l4_processed:1; uint8_t csum_ip_err:1; uint8_t csum_l4_err:1; uint8_t csum_external_ip_err:1; uint8_t csum_external_udp_err:1; uint8_t status_error1; __le16 reserved5; __le16 buf_id; /* Buffer ID which was sent on the buffer queue. */ union { /* Packet checksum. */ __le16 raw_cs; /* Segment length for RSC packets. 
*/ __le16 rsc_seg_len; }; __le32 hash; __le32 reserved6; __le64 reserved7; } __packed; _Static_assert(sizeof(struct gve_rx_compl_desc_dqo) == 32, "gve: bad dqo desc struct length"); #endif /* _GVE_DESC_DQO_H_ */ diff --git a/sys/dev/gve/gve_rx_dqo.c b/sys/dev/gve/gve_rx_dqo.c index a499ac9d3c6a..11b2c7ea0c55 100644 --- a/sys/dev/gve/gve_rx_dqo.c +++ b/sys/dev/gve/gve_rx_dqo.c @@ -1,1021 +1,1031 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" static void gve_free_rx_mbufs_dqo(struct gve_rx_ring *rx) { struct gve_rx_buf_dqo *buf; int i; if (gve_is_qpl(rx->com.priv)) return; for (i = 0; i < rx->dqo.buf_cnt; i++) { buf = &rx->dqo.bufs[i]; if (!buf->mbuf) continue; bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap); m_freem(buf->mbuf); buf->mbuf = NULL; } } void gve_rx_free_ring_dqo(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; int j; if (rx->dqo.compl_ring != NULL) { gve_dma_free_coherent(&rx->dqo.compl_ring_mem); rx->dqo.compl_ring = NULL; } if (rx->dqo.desc_ring != NULL) { gve_dma_free_coherent(&rx->desc_ring_mem); rx->dqo.desc_ring = NULL; } if (rx->dqo.bufs != NULL) { gve_free_rx_mbufs_dqo(rx); if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) { for (j = 0; j < rx->dqo.buf_cnt; j++) if (rx->dqo.bufs[j].mapped) bus_dmamap_destroy(rx->dqo.buf_dmatag, rx->dqo.bufs[j].dmamap); } free(rx->dqo.bufs, M_GVE); rx->dqo.bufs = NULL; } if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) bus_dma_tag_destroy(rx->dqo.buf_dmatag); if (com->qpl != NULL) { gve_free_qpl(priv, com->qpl); com->qpl = NULL; } } int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; int err; int j; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_rx_desc_dqo) * priv->rx_desc_cnt, CACHE_LINE_SIZE, &rx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for rx ring %d", i); goto abort; } rx->dqo.desc_ring = rx->desc_ring_mem.cpu_addr; rx->dqo.mask = priv->rx_desc_cnt - 1; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt, CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc compl ring for rx ring %d", i); goto abort; } rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr; rx->dqo.mask = priv->rx_desc_cnt - 1; rx->dqo.buf_cnt = gve_is_qpl(priv) ? 
GVE_RX_NUM_QPL_PAGES_DQO : priv->rx_desc_cnt; rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo), M_GVE, M_WAITOK | M_ZERO); if (gve_is_qpl(priv)) { rx->com.qpl = gve_alloc_qpl(priv, i + priv->tx_cfg.max_queues, GVE_RX_NUM_QPL_PAGES_DQO, /*single_kva=*/false); if (rx->com.qpl == NULL) { device_printf(priv->dev, "Failed to alloc QPL for rx ring %d", i); err = ENOMEM; goto abort; } return (0); } err = bus_dma_tag_create( bus_get_dma_tag(priv->dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MCLBYTES, /* maxsize */ 1, /* nsegments */ MCLBYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &rx->dqo.buf_dmatag); if (err != 0) { device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); goto abort; } for (j = 0; j < rx->dqo.buf_cnt; j++) { err = bus_dmamap_create(rx->dqo.buf_dmatag, 0, &rx->dqo.bufs[j].dmamap); if (err != 0) { device_printf(priv->dev, "err in creating rx buf dmamap %d: %d", j, err); goto abort; } rx->dqo.bufs[j].mapped = true; } return (0); abort: gve_rx_free_ring_dqo(priv, i); return (err); } static void gve_rx_clear_desc_ring_dqo(struct gve_rx_ring *rx) { struct gve_ring_com *com = &rx->com; int entries; int i; entries = com->priv->rx_desc_cnt; for (i = 0; i < entries; i++) rx->dqo.desc_ring[i] = (struct gve_rx_desc_dqo){}; bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_rx_clear_compl_ring_dqo(struct gve_rx_ring *rx) { struct gve_ring_com *com = &rx->com; int i; for (i = 0; i < com->priv->rx_desc_cnt; i++) rx->dqo.compl_ring[i] = (struct gve_rx_compl_desc_dqo){}; bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, rx->dqo.compl_ring_mem.map, BUS_DMASYNC_PREWRITE); } void gve_clear_rx_ring_dqo(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; int j; rx->fill_cnt = 0; rx->cnt = 0; rx->dqo.mask = priv->rx_desc_cnt - 1; rx->dqo.head = 0; rx->dqo.tail = 0; rx->dqo.cur_gen_bit = 0; gve_rx_clear_desc_ring_dqo(rx); gve_rx_clear_compl_ring_dqo(rx); gve_free_rx_mbufs_dqo(rx); if (gve_is_qpl(priv)) { SLIST_INIT(&rx->dqo.free_bufs); STAILQ_INIT(&rx->dqo.used_bufs); for (j = 0; j < rx->dqo.buf_cnt; j++) { struct gve_rx_buf_dqo *buf = &rx->dqo.bufs[j]; vm_page_t page = rx->com.qpl->pages[buf - rx->dqo.bufs]; u_int ref_count = atomic_load_int(&page->ref_count); /* * An ifconfig down+up might see pages still in flight * from the previous innings. 
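 * A wire count of exactly 1 means only the driver's own wiring of the
 * QPL page remains; anything higher means an external mbuf built over
 * this page (see gve_rx_add_extmbuf_to_ctx()) is still sitting in some
 * socket buffer, so the buffer must wait on the used list until the
 * mbuf free routine (gve_mextadd_free()) releases that extra wiring.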
*/ if (VPRC_WIRE_COUNT(ref_count) == 1) SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); else STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, buf, stailq_entry); buf->num_nic_frags = 0; buf->next_idx = 0; } } else { SLIST_INIT(&rx->dqo.free_bufs); for (j = 0; j < rx->dqo.buf_cnt; j++) SLIST_INSERT_HEAD(&rx->dqo.free_bufs, &rx->dqo.bufs[j], slist_entry); } } int gve_rx_intr_dqo(void *arg) { struct gve_rx_ring *rx = arg; struct gve_priv *priv = rx->com.priv; struct gve_ring_com *com = &rx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); /* Interrupts are automatically masked */ taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); return (FILTER_HANDLED); } static void gve_rx_advance_head_dqo(struct gve_rx_ring *rx) { rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask; rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */ if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) { bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset, rx->dqo.head); } } static void gve_rx_post_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf) { struct gve_rx_desc_dqo *desc; bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap, BUS_DMASYNC_PREREAD); desc = &rx->dqo.desc_ring[rx->dqo.head]; desc->buf_id = htole16(buf - rx->dqo.bufs); desc->buf_addr = htole64(buf->addr); gve_rx_advance_head_dqo(rx); } static int gve_rx_post_new_mbuf_dqo(struct gve_rx_ring *rx, int how) { struct gve_rx_buf_dqo *buf; bus_dma_segment_t segs[1]; int nsegs; int err; buf = SLIST_FIRST(&rx->dqo.free_bufs); if (__predict_false(!buf)) { device_printf(rx->com.priv->dev, "Unexpected empty free bufs list\n"); return (ENOBUFS); } SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry); buf->mbuf = m_getcl(how, MT_DATA, M_PKTHDR); if (__predict_false(!buf->mbuf)) { err = ENOMEM; counter_enter(); counter_u64_add_protected(rx->stats.rx_mbuf_mclget_null, 1); counter_exit(); goto abort_with_buf; } buf->mbuf->m_len = MCLBYTES; err = bus_dmamap_load_mbuf_sg(rx->dqo.buf_dmatag, buf->dmamap, buf->mbuf, segs, &nsegs, BUS_DMA_NOWAIT); KASSERT(nsegs == 1, ("dma segs for a cluster mbuf is not 1")); if (__predict_false(err != 0)) { counter_enter(); counter_u64_add_protected(rx->stats.rx_mbuf_dmamap_err, 1); counter_exit(); goto abort_with_mbuf; } buf->addr = segs[0].ds_addr; gve_rx_post_buf_dqo(rx, buf); return (0); abort_with_mbuf: m_freem(buf->mbuf); buf->mbuf = NULL; abort_with_buf: SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); return (err); } static struct gve_dma_handle * gve_get_page_dma_handle(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf) { return (&(rx->com.qpl->dmas[buf - rx->dqo.bufs])); } static void gve_rx_post_qpl_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf, uint8_t frag_num) { struct gve_rx_desc_dqo *desc = &rx->dqo.desc_ring[rx->dqo.head]; union gve_rx_qpl_buf_id_dqo composed_id; struct gve_dma_handle *page_dma_handle; composed_id.buf_id = buf - rx->dqo.bufs; composed_id.frag_num = frag_num; desc->buf_id = htole16(composed_id.all); page_dma_handle = gve_get_page_dma_handle(rx, buf); bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, BUS_DMASYNC_PREREAD); desc->buf_addr = htole64(page_dma_handle->bus_addr + frag_num * GVE_DEFAULT_RX_BUFFER_SIZE); buf->num_nic_frags++; gve_rx_advance_head_dqo(rx); } static void gve_rx_maybe_extract_from_used_bufs(struct gve_rx_ring *rx, bool just_one) { struct gve_rx_buf_dqo *hol_blocker = NULL; struct gve_rx_buf_dqo *buf; u_int 
ref_count; vm_page_t page; while (true) { buf = STAILQ_FIRST(&rx->dqo.used_bufs); if (__predict_false(buf == NULL)) break; page = rx->com.qpl->pages[buf - rx->dqo.bufs]; ref_count = atomic_load_int(&page->ref_count); if (VPRC_WIRE_COUNT(ref_count) != 1) { /* Account for one head-of-line blocker */ if (hol_blocker != NULL) break; hol_blocker = buf; STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs, stailq_entry); continue; } STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs, stailq_entry); SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); if (just_one) break; } if (hol_blocker != NULL) STAILQ_INSERT_HEAD(&rx->dqo.used_bufs, hol_blocker, stailq_entry); } static int gve_rx_post_new_dqo_qpl_buf(struct gve_rx_ring *rx) { struct gve_rx_buf_dqo *buf; buf = SLIST_FIRST(&rx->dqo.free_bufs); if (__predict_false(buf == NULL)) { gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/true); buf = SLIST_FIRST(&rx->dqo.free_bufs); if (__predict_false(buf == NULL)) return (ENOBUFS); } gve_rx_post_qpl_buf_dqo(rx, buf, buf->next_idx); if (buf->next_idx == GVE_DQ_NUM_FRAGS_IN_PAGE - 1) buf->next_idx = 0; else buf->next_idx++; /* * We have posted all the frags in this buf to the NIC. * - buf will enter used_bufs once the last completion arrives. * - It will renter free_bufs in gve_rx_maybe_extract_from_used_bufs * when its wire count drops back to 1. */ if (buf->next_idx == 0) SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry); return (0); } static void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx, int how) { uint32_t num_pending_bufs; uint32_t num_to_post; uint32_t i; int err; num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; num_to_post = rx->dqo.mask - num_pending_bufs; for (i = 0; i < num_to_post; i++) { if (gve_is_qpl(rx->com.priv)) err = gve_rx_post_new_dqo_qpl_buf(rx); else err = gve_rx_post_new_mbuf_dqo(rx, how); if (err) break; } } void gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx) { gve_rx_post_buffers_dqo(rx, M_WAITOK); } static void gve_rx_set_hashtype_dqo(struct mbuf *mbuf, struct gve_ptype *ptype, bool *is_tcp) { switch (ptype->l3_type) { case GVE_L3_TYPE_IPV4: switch (ptype->l4_type) { case GVE_L4_TYPE_TCP: *is_tcp = true; M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4); break; case GVE_L4_TYPE_UDP: M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4); break; default: M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4); } break; case GVE_L3_TYPE_IPV6: switch (ptype->l4_type) { case GVE_L4_TYPE_TCP: *is_tcp = true; M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6); break; case GVE_L4_TYPE_UDP: M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6); break; default: M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6); } break; default: M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH); } } static void gve_rx_set_csum_flags_dqo(struct mbuf *mbuf, struct gve_rx_compl_desc_dqo *desc, struct gve_ptype *ptype) { /* HW did not identify and process L3 and L4 headers. */ if (__predict_false(!desc->l3_l4_processed)) return; if (ptype->l3_type == GVE_L3_TYPE_IPV4) { if (__predict_false(desc->csum_ip_err || desc->csum_external_ip_err)) return; } else if (ptype->l3_type == GVE_L3_TYPE_IPV6) { /* Checksum should be skipped if this flag is set. 
*/ if (__predict_false(desc->ipv6_ex_add)) return; } if (__predict_false(desc->csum_l4_err)) return; switch (ptype->l4_type) { case GVE_L4_TYPE_TCP: case GVE_L4_TYPE_UDP: case GVE_L4_TYPE_ICMP: case GVE_L4_TYPE_SCTP: mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; mbuf->m_pkthdr.csum_data = 0xffff; break; default: break; } } static void gve_rx_input_mbuf_dqo(struct gve_rx_ring *rx, struct gve_rx_compl_desc_dqo *compl_desc) { struct mbuf *mbuf = rx->ctx.mbuf_head; if_t ifp = rx->com.priv->ifp; struct gve_ptype *ptype; bool do_if_input = true; bool is_tcp = false; ptype = &rx->com.priv->ptype_lut_dqo->ptypes[compl_desc->packet_type]; gve_rx_set_hashtype_dqo(mbuf, ptype, &is_tcp); mbuf->m_pkthdr.flowid = le32toh(compl_desc->hash); gve_rx_set_csum_flags_dqo(mbuf, compl_desc, ptype); mbuf->m_pkthdr.rcvif = ifp; mbuf->m_pkthdr.len = rx->ctx.total_size; if (((if_getcapenable(rx->com.priv->ifp) & IFCAP_LRO) != 0) && is_tcp && (rx->lro.lro_cnt != 0) && (tcp_lro_rx(&rx->lro, mbuf, 0) == 0)) do_if_input = false; if (do_if_input) if_input(ifp, mbuf); counter_enter(); counter_u64_add_protected(rx->stats.rbytes, rx->ctx.total_size); counter_u64_add_protected(rx->stats.rpackets, 1); counter_exit(); rx->ctx = (struct gve_rx_ctx){}; } static int gve_rx_copybreak_dqo(struct gve_rx_ring *rx, void *va, struct gve_rx_compl_desc_dqo *compl_desc, uint16_t frag_len) { struct mbuf *mbuf; mbuf = m_get2(frag_len, M_NOWAIT, MT_DATA, M_PKTHDR); if (__predict_false(mbuf == NULL)) return (ENOMEM); counter_enter(); counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1); counter_exit(); m_copyback(mbuf, 0, frag_len, va); mbuf->m_len = frag_len; rx->ctx.mbuf_head = mbuf; rx->ctx.mbuf_tail = mbuf; rx->ctx.total_size += frag_len; gve_rx_input_mbuf_dqo(rx, compl_desc); return (0); } static void gve_rx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_compl_desc_dqo *compl_desc, int *work_done) { bool is_last_frag = compl_desc->end_of_packet != 0; struct gve_rx_ctx *ctx = &rx->ctx; struct gve_rx_buf_dqo *buf; uint32_t num_pending_bufs; uint16_t frag_len; uint16_t buf_id; int err; buf_id = le16toh(compl_desc->buf_id); if (__predict_false(buf_id >= rx->dqo.buf_cnt)) { device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n", buf_id, rx->com.id); gve_schedule_reset(priv); goto drop_frag_clear_ctx; } buf = &rx->dqo.bufs[buf_id]; if (__predict_false(buf->mbuf == NULL)) { device_printf(priv->dev, "Spurious completion for buf id %d on rxq %d, issuing reset\n", buf_id, rx->com.id); gve_schedule_reset(priv); goto drop_frag_clear_ctx; } if (__predict_false(ctx->drop_pkt)) goto drop_frag; if (__predict_false(compl_desc->rx_error)) { counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); counter_exit(); goto drop_frag; } bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap, BUS_DMASYNC_POSTREAD); frag_len = compl_desc->packet_len; if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) { err = gve_rx_copybreak_dqo(rx, mtod(buf->mbuf, char*), compl_desc, frag_len); if (__predict_false(err != 0)) goto drop_frag; (*work_done)++; gve_rx_post_buf_dqo(rx, buf); return; } /* * Although buffer completions may arrive out of order, buffer * descriptors are consumed by the NIC in order. That is, the * buffer at desc_ring[tail] might not be the buffer we got the * completion compl_ring[tail] for: but we know that desc_ring[tail] * has already been read by the NIC. 
*/ num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; /* * For every fragment received, try to post a new buffer. * * Failures are okay but only so long as the number of outstanding * buffers is above a threshold. * * Beyond that we drop new packets to reuse their buffers. * Without ensuring a minimum number of buffers for the NIC to * put packets in, we run the risk of getting the queue stuck * for good. */ err = gve_rx_post_new_mbuf_dqo(rx, M_NOWAIT); if (__predict_false(err != 0 && num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) { counter_enter(); counter_u64_add_protected( rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); counter_exit(); goto drop_frag; } buf->mbuf->m_len = frag_len; ctx->total_size += frag_len; if (ctx->mbuf_tail == NULL) { ctx->mbuf_head = buf->mbuf; ctx->mbuf_tail = buf->mbuf; } else { buf->mbuf->m_flags &= ~M_PKTHDR; ctx->mbuf_tail->m_next = buf->mbuf; ctx->mbuf_tail = buf->mbuf; } /* * Disassociate the mbuf from buf and surrender buf to the free list to * be used by a future mbuf. */ bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap); buf->mbuf = NULL; buf->addr = 0; SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); if (is_last_frag) { gve_rx_input_mbuf_dqo(rx, compl_desc); (*work_done)++; } return; drop_frag: /* Clear the earlier frags if there were any */ m_freem(ctx->mbuf_head); rx->ctx = (struct gve_rx_ctx){}; /* Drop the rest of the pkt if there are more frags */ ctx->drop_pkt = true; /* Reuse the dropped frag's buffer */ gve_rx_post_buf_dqo(rx, buf); if (is_last_frag) goto drop_frag_clear_ctx; return; drop_frag_clear_ctx: counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); counter_exit(); m_freem(ctx->mbuf_head); rx->ctx = (struct gve_rx_ctx){}; } static void * gve_get_cpu_addr_for_qpl_buf(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num) { int page_idx = buf - rx->dqo.bufs; void *va = rx->com.qpl->dmas[page_idx].cpu_addr; va = (char *)va + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE); return (va); } static int gve_rx_add_clmbuf_to_ctx(struct gve_rx_ring *rx, struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num, uint16_t frag_len) { void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num); struct mbuf *mbuf; if (ctx->mbuf_tail == NULL) { mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (mbuf == NULL) return (ENOMEM); ctx->mbuf_head = mbuf; ctx->mbuf_tail = mbuf; } else { mbuf = m_getcl(M_NOWAIT, MT_DATA, 0); if (mbuf == NULL) return (ENOMEM); ctx->mbuf_tail->m_next = mbuf; ctx->mbuf_tail = mbuf; } mbuf->m_len = frag_len; ctx->total_size += frag_len; m_copyback(mbuf, 0, frag_len, va); counter_enter(); counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1); counter_exit(); return (0); } static int gve_rx_add_extmbuf_to_ctx(struct gve_rx_ring *rx, struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num, uint16_t frag_len) { struct mbuf *mbuf; void *page_addr; vm_page_t page; int page_idx; void *va; if (ctx->mbuf_tail == NULL) { mbuf = m_gethdr(M_NOWAIT, MT_DATA); if (mbuf == NULL) return (ENOMEM); ctx->mbuf_head = mbuf; ctx->mbuf_tail = mbuf; } else { mbuf = m_get(M_NOWAIT, MT_DATA); if (mbuf == NULL) return (ENOMEM); ctx->mbuf_tail->m_next = mbuf; ctx->mbuf_tail = mbuf; } mbuf->m_len = frag_len; ctx->total_size += frag_len; page_idx = buf - rx->dqo.bufs; page = rx->com.qpl->pages[page_idx]; page_addr = rx->com.qpl->dmas[page_idx].cpu_addr; va = (char *)page_addr + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE); /* * Grab an extra ref to the page so that 
gve_mextadd_free * does not end up freeing the page while the interface exists. */ vm_page_wire(page); counter_enter(); counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1); counter_exit(); MEXTADD(mbuf, va, frag_len, gve_mextadd_free, page, page_addr, 0, EXT_NET_DRV); return (0); } static void gve_rx_dqo_qpl(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_compl_desc_dqo *compl_desc, int *work_done) { bool is_last_frag = compl_desc->end_of_packet != 0; union gve_rx_qpl_buf_id_dqo composed_id; struct gve_dma_handle *page_dma_handle; struct gve_rx_ctx *ctx = &rx->ctx; struct gve_rx_buf_dqo *buf; uint32_t num_pending_bufs; uint8_t buf_frag_num; uint16_t frag_len; uint16_t buf_id; int err; composed_id.all = le16toh(compl_desc->buf_id); buf_id = composed_id.buf_id; buf_frag_num = composed_id.frag_num; if (__predict_false(buf_id >= rx->dqo.buf_cnt)) { device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n", buf_id, rx->com.id); gve_schedule_reset(priv); goto drop_frag_clear_ctx; } buf = &rx->dqo.bufs[buf_id]; if (__predict_false(buf->num_nic_frags == 0 || buf_frag_num > GVE_DQ_NUM_FRAGS_IN_PAGE - 1)) { device_printf(priv->dev, "Spurious compl for buf id %d on rxq %d " "with buf_frag_num %d and num_nic_frags %d, issuing reset\n", buf_id, rx->com.id, buf_frag_num, buf->num_nic_frags); gve_schedule_reset(priv); goto drop_frag_clear_ctx; } buf->num_nic_frags--; if (__predict_false(ctx->drop_pkt)) goto drop_frag; if (__predict_false(compl_desc->rx_error)) { counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); counter_exit(); goto drop_frag; } page_dma_handle = gve_get_page_dma_handle(rx, buf); bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, BUS_DMASYNC_POSTREAD); frag_len = compl_desc->packet_len; if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) { void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num); err = gve_rx_copybreak_dqo(rx, va, compl_desc, frag_len); if (__predict_false(err != 0)) goto drop_frag; (*work_done)++; gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); return; } num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; err = gve_rx_post_new_dqo_qpl_buf(rx); if (__predict_false(err != 0 && num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) { /* * Resort to copying this fragment into a cluster mbuf * when the above threshold is breached and repost the * incoming buffer. If we cannot find cluster mbufs, * just drop the packet (to repost its buffer). */ err = gve_rx_add_clmbuf_to_ctx(rx, ctx, buf, buf_frag_num, frag_len); if (err != 0) { counter_enter(); counter_u64_add_protected( rx->stats.rx_dropped_pkt_buf_post_fail, 1); counter_exit(); goto drop_frag; } gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); } else { err = gve_rx_add_extmbuf_to_ctx(rx, ctx, buf, buf_frag_num, frag_len); if (__predict_false(err != 0)) { counter_enter(); counter_u64_add_protected( rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); counter_exit(); goto drop_frag; } } /* * Both the counts need to be checked. * * num_nic_frags == 0 implies no pending completions * but not all frags may have yet been posted. * * next_idx == 0 implies all frags have been posted * but there might be pending completions. 
*/ if (buf->num_nic_frags == 0 && buf->next_idx == 0) STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, buf, stailq_entry); if (is_last_frag) { gve_rx_input_mbuf_dqo(rx, compl_desc); (*work_done)++; } return; drop_frag: /* Clear the earlier frags if there were any */ m_freem(ctx->mbuf_head); rx->ctx = (struct gve_rx_ctx){}; /* Drop the rest of the pkt if there are more frags */ ctx->drop_pkt = true; /* Reuse the dropped frag's buffer */ gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); if (is_last_frag) goto drop_frag_clear_ctx; return; drop_frag_clear_ctx: counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); counter_exit(); m_freem(ctx->mbuf_head); rx->ctx = (struct gve_rx_ctx){}; } +static uint8_t +gve_rx_get_gen_bit(uint8_t *desc) +{ + uint8_t byte; + + /* + * Prevent generation bit from being read after the rest of the + * descriptor. + */ + byte = atomic_load_acq_8(desc + GVE_RX_DESC_DQO_GEN_BYTE_OFFSET); + return ((byte & GVE_RX_DESC_DQO_GEN_BIT_MASK) != 0); +} + static bool gve_rx_cleanup_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, int budget) { struct gve_rx_compl_desc_dqo *compl_desc; uint32_t work_done = 0; NET_EPOCH_ASSERT(); while (work_done < budget) { - bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, rx->dqo.compl_ring_mem.map, + bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, + rx->dqo.compl_ring_mem.map, BUS_DMASYNC_POSTREAD); compl_desc = &rx->dqo.compl_ring[rx->dqo.tail]; - if (compl_desc->generation == rx->dqo.cur_gen_bit) + if (gve_rx_get_gen_bit((uint8_t *)compl_desc) == + rx->dqo.cur_gen_bit) break; - /* - * Prevent generation bit from being read after the rest of the - * descriptor. - */ - atomic_thread_fence_acq(); rx->cnt++; rx->dqo.tail = (rx->dqo.tail + 1) & rx->dqo.mask; rx->dqo.cur_gen_bit ^= (rx->dqo.tail == 0); if (gve_is_qpl(priv)) gve_rx_dqo_qpl(priv, rx, compl_desc, &work_done); else gve_rx_dqo(priv, rx, compl_desc, &work_done); } if (work_done != 0) tcp_lro_flush_all(&rx->lro); gve_rx_post_buffers_dqo(rx, M_NOWAIT); if (gve_is_qpl(priv)) gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/false); return (work_done == budget); } void gve_rx_cleanup_tq_dqo(void *arg, int pending) { struct gve_rx_ring *rx = arg; struct gve_priv *priv = rx->com.priv; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; if (gve_rx_cleanup_dqo(priv, rx, /*budget=*/64)) { taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); return; } gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset, GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); } diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c index 7361d47b8ce6..8a1993c3e712 100644 --- a/sys/dev/gve/gve_tx_dqo.c +++ b/sys/dev/gve/gve_tx_dqo.c @@ -1,1111 +1,1120 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_inet6.h" #include "gve.h" #include "gve_dqo.h" static void gve_unmap_packet(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pending_pkt) { bus_dmamap_sync(tx->dqo.buf_dmatag, pending_pkt->dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(tx->dqo.buf_dmatag, pending_pkt->dmamap); } static void gve_clear_qpl_pending_pkt(struct gve_tx_pending_pkt_dqo *pending_pkt) { pending_pkt->qpl_buf_head = -1; pending_pkt->num_qpl_bufs = 0; } static void gve_free_tx_mbufs_dqo(struct gve_tx_ring *tx) { struct gve_tx_pending_pkt_dqo *pending_pkt; int i; for (i = 0; i < tx->dqo.num_pending_pkts; i++) { pending_pkt = &tx->dqo.pending_pkts[i]; if (!pending_pkt->mbuf) continue; if (gve_is_qpl(tx->com.priv)) gve_clear_qpl_pending_pkt(pending_pkt); else gve_unmap_packet(tx, pending_pkt); m_freem(pending_pkt->mbuf); pending_pkt->mbuf = NULL; } } void gve_tx_free_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; int j; if (tx->dqo.desc_ring != NULL) { gve_dma_free_coherent(&tx->desc_ring_mem); tx->dqo.desc_ring = NULL; } if (tx->dqo.compl_ring != NULL) { gve_dma_free_coherent(&tx->dqo.compl_ring_mem); tx->dqo.compl_ring = NULL; } if (tx->dqo.pending_pkts != NULL) { gve_free_tx_mbufs_dqo(tx); if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) { for (j = 0; j < tx->dqo.num_pending_pkts; j++) if (tx->dqo.pending_pkts[j].state != GVE_PACKET_STATE_UNALLOCATED) bus_dmamap_destroy(tx->dqo.buf_dmatag, tx->dqo.pending_pkts[j].dmamap); } free(tx->dqo.pending_pkts, M_GVE); tx->dqo.pending_pkts = NULL; } if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) bus_dma_tag_destroy(tx->dqo.buf_dmatag); if (gve_is_qpl(priv) && tx->dqo.qpl_bufs != NULL) { free(tx->dqo.qpl_bufs, M_GVE); tx->dqo.qpl_bufs = NULL; } if (com->qpl != NULL) { gve_free_qpl(priv, com->qpl); com->qpl = NULL; } } static int gve_tx_alloc_rda_fields_dqo(struct gve_tx_ring *tx) { struct gve_priv *priv = tx->com.priv; int err; int j; /* * DMA tag for mapping Tx mbufs * The maxsize, nsegments, and maxsegsize params should match * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c. 
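 *
 * For reference, that ifnet setup is expected to look roughly like the
 * following (hypothetical sketch; the authoritative calls live in
 * gve_setup_ifnet()):
 *
 *	if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO);
 *	if_sethwtsomaxsegcount(ifp, GVE_TX_MAX_DATA_DESCS_DQO);
 *	if_sethwtsomaxsegsize(ifp, GVE_TX_MAX_BUF_SIZE_DQO);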
*/ err = bus_dma_tag_create( bus_get_dma_tag(priv->dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ GVE_TSO_MAXSIZE_DQO, /* maxsize */ GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */ GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &tx->dqo.buf_dmatag); if (err != 0) { device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); return (err); } for (j = 0; j < tx->dqo.num_pending_pkts; j++) { err = bus_dmamap_create(tx->dqo.buf_dmatag, 0, &tx->dqo.pending_pkts[j].dmamap); if (err != 0) { device_printf(priv->dev, "err in creating pending pkt dmamap %d: %d", j, err); return (err); } tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; } return (0); } int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; uint16_t num_pending_pkts; int err; /* Descriptor ring */ err = gve_dma_alloc_coherent(priv, sizeof(union gve_tx_desc_dqo) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); goto abort; } tx->dqo.desc_ring = tx->desc_ring_mem.cpu_addr; /* Completion ring */ err = gve_dma_alloc_coherent(priv, sizeof(struct gve_tx_compl_desc_dqo) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->dqo.compl_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc compl ring for tx ring %d", i); goto abort; } tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr; /* * pending_pkts array * * The max number of pending packets determines the maximum number of * descriptors which maybe written to the completion queue. * * We must set the number small enough to make sure we never overrun the * completion queue. */ num_pending_pkts = priv->tx_desc_cnt; /* * Reserve space for descriptor completions, which will be reported at * most every GVE_TX_MIN_RE_INTERVAL packets. 
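 *
 * For example, with a (hypothetical) 512-entry descriptor ring and
 * GVE_TX_MIN_RE_INTERVAL of 32, this leaves 512 - 512/32 = 496 pending
 * packets, keeping the sum of packet and descriptor completions within
 * the 512-entry completion ring.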
*/ num_pending_pkts -= num_pending_pkts / GVE_TX_MIN_RE_INTERVAL; tx->dqo.num_pending_pkts = num_pending_pkts; tx->dqo.pending_pkts = malloc( sizeof(struct gve_tx_pending_pkt_dqo) * num_pending_pkts, M_GVE, M_WAITOK | M_ZERO); if (gve_is_qpl(priv)) { int qpl_buf_cnt; tx->com.qpl = gve_alloc_qpl(priv, i, GVE_TX_NUM_QPL_PAGES_DQO, /*single_kva*/false); if (tx->com.qpl == NULL) { device_printf(priv->dev, "Failed to alloc QPL for tx ring %d", i); err = ENOMEM; goto abort; } qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * tx->com.qpl->num_pages; tx->dqo.qpl_bufs = malloc( sizeof(*tx->dqo.qpl_bufs) * qpl_buf_cnt, M_GVE, M_WAITOK | M_ZERO); } else gve_tx_alloc_rda_fields_dqo(tx); return (0); abort: gve_tx_free_ring_dqo(priv, i); return (err); } static void gve_extract_tx_metadata_dqo(const struct mbuf *mbuf, struct gve_tx_metadata_dqo *metadata) { uint32_t hash = mbuf->m_pkthdr.flowid; uint16_t path_hash; metadata->version = GVE_TX_METADATA_VERSION_DQO; if (hash) { path_hash = hash ^ (hash >> 16); path_hash &= (1 << 15) - 1; if (__predict_false(path_hash == 0)) path_hash = ~path_hash; metadata->path_hash = path_hash; } } static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, uint32_t *desc_idx, uint32_t len, uint64_t addr, int16_t compl_tag, bool eop, bool csum_enabled) { while (len > 0) { struct gve_tx_pkt_desc_dqo *desc = &tx->dqo.desc_ring[*desc_idx].pkt; uint32_t cur_len = MIN(len, GVE_TX_MAX_BUF_SIZE_DQO); bool cur_eop = eop && cur_len == len; *desc = (struct gve_tx_pkt_desc_dqo){ .buf_addr = htole64(addr), .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, .end_of_packet = cur_eop, .checksum_offload_enable = csum_enabled, .compl_tag = htole16(compl_tag), .buf_size = cur_len, }; addr += cur_len; len -= cur_len; *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; } } static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, const struct mbuf *mbuf, const struct gve_tx_metadata_dqo *metadata, int header_len) { *desc = (struct gve_tx_tso_context_desc_dqo){ .header_len = header_len, .cmd_dtype = { .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, .tso = 1, }, .flex0 = metadata->bytes[0], .flex5 = metadata->bytes[5], .flex6 = metadata->bytes[6], .flex7 = metadata->bytes[7], .flex8 = metadata->bytes[8], .flex9 = metadata->bytes[9], .flex10 = metadata->bytes[10], .flex11 = metadata->bytes[11], }; desc->tso_total_len = mbuf->m_pkthdr.len - header_len; desc->mss = mbuf->m_pkthdr.tso_segsz; } static void gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, const struct gve_tx_metadata_dqo *metadata) { *desc = (struct gve_tx_general_context_desc_dqo){ .flex0 = metadata->bytes[0], .flex1 = metadata->bytes[1], .flex2 = metadata->bytes[2], .flex3 = metadata->bytes[3], .flex4 = metadata->bytes[4], .flex5 = metadata->bytes[5], .flex6 = metadata->bytes[6], .flex7 = metadata->bytes[7], .flex8 = metadata->bytes[8], .flex9 = metadata->bytes[9], .flex10 = metadata->bytes[10], .flex11 = metadata->bytes[11], .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, }; } #define PULLUP_HDR(m, len) \ do { \ if (__predict_false((m)->m_len < (len))) { \ (m) = m_pullup((m), (len)); \ if ((m) == NULL) \ return (EINVAL); \ } \ } while (0) static int gve_prep_tso(struct mbuf *mbuf, int *header_len) { uint8_t l3_off, l4_off = 0; struct ether_header *eh; struct tcphdr *th; u_short csum; PULLUP_HDR(mbuf, sizeof(*eh)); eh = mtod(mbuf, struct ether_header *); KASSERT(eh->ether_type != ETHERTYPE_VLAN, ("VLAN-tagged packets not supported")); l3_off = ETHER_HDR_LEN; #ifdef INET6 if (ntohs(eh->ether_type) == 
ETHERTYPE_IPV6) { struct ip6_hdr *ip6; PULLUP_HDR(mbuf, l3_off + sizeof(*ip6)); ip6 = (struct ip6_hdr *)(mtodo(mbuf, l3_off)); l4_off = l3_off + sizeof(struct ip6_hdr); csum = in6_cksum_pseudo(ip6, /*len=*/0, IPPROTO_TCP, /*csum=*/0); } else #endif if (ntohs(eh->ether_type) == ETHERTYPE_IP) { struct ip *ip; PULLUP_HDR(mbuf, l3_off + sizeof(*ip)); ip = (struct ip *)(mtodo(mbuf, l3_off)); l4_off = l3_off + (ip->ip_hl << 2); csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } PULLUP_HDR(mbuf, l4_off + sizeof(struct tcphdr)); th = (struct tcphdr *)(mtodo(mbuf, l4_off)); *header_len = l4_off + (th->th_off << 2); /* * Hardware requires th->th_sum to not include the TCP payload, so we * overwrite it with the pseudo-header checksum computed above. */ th->th_sum = csum; return (0); } static int gve_tx_fill_ctx_descs(struct gve_tx_ring *tx, struct mbuf *mbuf, bool is_tso, uint32_t *desc_idx) { struct gve_tx_general_context_desc_dqo *gen_desc; struct gve_tx_tso_context_desc_dqo *tso_desc; struct gve_tx_metadata_dqo metadata; int header_len; int err; metadata = (struct gve_tx_metadata_dqo){0}; gve_extract_tx_metadata_dqo(mbuf, &metadata); if (is_tso) { err = gve_prep_tso(mbuf, &header_len); if (__predict_false(err)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_tsoerr, 1); counter_exit(); return (err); } tso_desc = &tx->dqo.desc_ring[*desc_idx].tso_ctx; gve_tx_fill_tso_ctx_desc(tso_desc, mbuf, &metadata, header_len); *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; counter_enter(); counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); counter_exit(); } gen_desc = &tx->dqo.desc_ring[*desc_idx].general_ctx; gve_tx_fill_general_ctx_desc(gen_desc, &metadata); *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; return (0); } static int gve_map_mbuf_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf, bus_dmamap_t dmamap, bus_dma_segment_t *segs, int *nsegs, int attempt) { struct mbuf *m_new = NULL; int err; err = bus_dmamap_load_mbuf_sg(tx->dqo.buf_dmatag, dmamap, *mbuf, segs, nsegs, BUS_DMA_NOWAIT); switch (err) { case __predict_true(0): break; case EFBIG: if (__predict_false(attempt > 0)) goto abort; counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_collapse, 1); counter_exit(); /* Try m_collapse before m_defrag */ m_new = m_collapse(*mbuf, M_NOWAIT, GVE_TX_MAX_DATA_DESCS_DQO); if (m_new == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_defrag, 1); counter_exit(); m_new = m_defrag(*mbuf, M_NOWAIT); } if (__predict_false(m_new == NULL)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_defrag_err, 1); counter_exit(); m_freem(*mbuf); *mbuf = NULL; err = ENOMEM; goto abort; } else { *mbuf = m_new; return (gve_map_mbuf_dqo(tx, mbuf, dmamap, segs, nsegs, ++attempt)); } case ENOMEM: counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_dmamap_enomem_err, 1); counter_exit(); goto abort; default: goto abort; } return (0); abort: counter_enter(); counter_u64_add_protected(tx->stats.tx_mbuf_dmamap_err, 1); counter_exit(); return (err); } static uint32_t num_avail_desc_ring_slots(const struct gve_tx_ring *tx) { uint32_t num_used = (tx->dqo.desc_tail - tx->dqo.desc_head) & tx->dqo.desc_mask; return (tx->dqo.desc_mask - num_used); } static struct gve_tx_pending_pkt_dqo * gve_alloc_pending_packet(struct gve_tx_ring *tx) { int32_t index = tx->dqo.free_pending_pkts_csm; struct gve_tx_pending_pkt_dqo *pending_pkt; /* * No pending packets available in the consumer list, * try to steal the producer list.
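 * (The free list is split in two: a consumer list that only this allocating * path walks, and a producer list that the completion path pushes onto with * atomics. When the consumer side runs dry, the entire producer list is * claimed with a single atomic_swap_32, so neither side needs a lock.)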
*/ if (__predict_false(index == -1)) { tx->dqo.free_pending_pkts_csm = atomic_swap_32( &tx->dqo.free_pending_pkts_prd, -1); index = tx->dqo.free_pending_pkts_csm; if (__predict_false(index == -1)) return (NULL); } pending_pkt = &tx->dqo.pending_pkts[index]; /* Remove pending_pkt from the consumer list */ tx->dqo.free_pending_pkts_csm = pending_pkt->next; pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; return (pending_pkt); } static void gve_free_pending_packet(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pending_pkt) { int index = pending_pkt - tx->dqo.pending_pkts; int32_t old_head; pending_pkt->state = GVE_PACKET_STATE_FREE; /* Add pending_pkt to the producer list */ while (true) { old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd); pending_pkt->next = old_head; if (atomic_cmpset_32(&tx->dqo.free_pending_pkts_prd, old_head, index)) break; } } /* * Has the side-effect of retrieving the value of the last desc index * processed by the NIC. hw_tx_head is written to by the completions-processing * taskqueue upon receiving descriptor-completions. */ static bool gve_tx_has_desc_room_dqo(struct gve_tx_ring *tx, int needed_descs) { if (needed_descs <= num_avail_desc_ring_slots(tx)) return (true); tx->dqo.desc_head = atomic_load_acq_32(&tx->dqo.hw_tx_head); if (needed_descs > num_avail_desc_ring_slots(tx)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_descring, 1); counter_exit(); return (false); } return (true); } static void gve_tx_request_desc_compl(struct gve_tx_ring *tx, uint32_t desc_idx) { uint32_t last_report_event_interval; uint32_t last_desc_idx; last_desc_idx = (desc_idx - 1) & tx->dqo.desc_mask; last_report_event_interval = (last_desc_idx - tx->dqo.last_re_idx) & tx->dqo.desc_mask; if (__predict_false(last_report_event_interval >= GVE_TX_MIN_RE_INTERVAL)) { tx->dqo.desc_ring[last_desc_idx].pkt.report_event = true; tx->dqo.last_re_idx = last_desc_idx; } } static bool gve_tx_have_enough_qpl_bufs(struct gve_tx_ring *tx, int num_bufs) { uint32_t available = tx->dqo.qpl_bufs_produced_cached - tx->dqo.qpl_bufs_consumed; if (__predict_true(available >= num_bufs)) return (true); tx->dqo.qpl_bufs_produced_cached = atomic_load_acq_32( &tx->dqo.qpl_bufs_produced); available = tx->dqo.qpl_bufs_produced_cached - tx->dqo.qpl_bufs_consumed; if (__predict_true(available >= num_bufs)) return (true); return (false); } static int32_t gve_tx_alloc_qpl_buf(struct gve_tx_ring *tx) { int32_t buf = tx->dqo.free_qpl_bufs_csm; if (__predict_false(buf == -1)) { tx->dqo.free_qpl_bufs_csm = atomic_swap_32( &tx->dqo.free_qpl_bufs_prd, -1); buf = tx->dqo.free_qpl_bufs_csm; if (__predict_false(buf == -1)) return (-1); } tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf]; tx->dqo.qpl_bufs_consumed++; return (buf); } /* * Tx buffer i corresponds to * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO */ static void gve_tx_buf_get_addr_dqo(struct gve_tx_ring *tx, int32_t index, void **va, bus_addr_t *dma_addr) { int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) << GVE_TX_BUF_SHIFT_DQO; *va = (char *)tx->com.qpl->dmas[page_id].cpu_addr + offset; *dma_addr = tx->com.qpl->dmas[page_id].bus_addr + offset; } static struct gve_dma_handle * gve_get_page_dma_handle(struct gve_tx_ring *tx, int32_t index) { int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); return (&tx->com.qpl->dmas[page_id]); } static void
gve_tx_copy_mbuf_and_write_pkt_descs(struct gve_tx_ring *tx, struct mbuf *mbuf, struct gve_tx_pending_pkt_dqo *pkt, bool csum_enabled, int16_t completion_tag, uint32_t *desc_idx) { int32_t pkt_len = mbuf->m_pkthdr.len; struct gve_dma_handle *dma; uint32_t copy_offset = 0; int32_t prev_buf = -1; uint32_t copy_len; bus_addr_t addr; int32_t buf; void *va; MPASS(pkt->num_qpl_bufs == 0); MPASS(pkt->qpl_buf_head == -1); while (copy_offset < pkt_len) { buf = gve_tx_alloc_qpl_buf(tx); /* We already checked for availability */ MPASS(buf != -1); gve_tx_buf_get_addr_dqo(tx, buf, &va, &addr); copy_len = MIN(GVE_TX_BUF_SIZE_DQO, pkt_len - copy_offset); m_copydata(mbuf, copy_offset, copy_len, va); copy_offset += copy_len; dma = gve_get_page_dma_handle(tx, buf); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); gve_tx_fill_pkt_desc_dqo(tx, desc_idx, copy_len, addr, completion_tag, /*eop=*/copy_offset == pkt_len, csum_enabled); /* Link all the qpl bufs for a packet */ if (prev_buf == -1) pkt->qpl_buf_head = buf; else tx->dqo.qpl_bufs[prev_buf] = buf; prev_buf = buf; pkt->num_qpl_bufs++; } tx->dqo.qpl_bufs[buf] = -1; } int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf) { uint32_t desc_idx = tx->dqo.desc_tail; struct gve_tx_pending_pkt_dqo *pkt; int total_descs_needed; int16_t completion_tag; bool has_csum_flag; int csum_flags; bool is_tso; int nsegs; int err; csum_flags = mbuf->m_pkthdr.csum_flags; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); is_tso = csum_flags & CSUM_TSO; nsegs = howmany(mbuf->m_pkthdr.len, GVE_TX_BUF_SIZE_DQO); /* Check if we have enough room in the desc ring */ total_descs_needed = 1 + /* general_ctx_desc */ nsegs + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) return (ENOBUFS); if (!gve_tx_have_enough_qpl_bufs(tx, nsegs)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_qpl_bufs, 1); counter_exit(); return (ENOBUFS); } pkt = gve_alloc_pending_packet(tx); if (pkt == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_compring, 1); counter_exit(); return (ENOBUFS); } completion_tag = pkt - tx->dqo.pending_pkts; pkt->mbuf = mbuf; err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); if (err) goto abort; gve_tx_copy_mbuf_and_write_pkt_descs(tx, mbuf, pkt, has_csum_flag, completion_tag, &desc_idx); /* Remember the index of the last desc written */ tx->dqo.desc_tail = desc_idx; /* * Request a descriptor completion on the last descriptor of the * packet if we are allowed to by the HW enforced interval. */ gve_tx_request_desc_compl(tx, desc_idx); tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ return (0); abort: pkt->mbuf = NULL; gve_free_pending_packet(tx, pkt); return (err); } int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr) { bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO]; uint32_t desc_idx = tx->dqo.desc_tail; struct gve_tx_pending_pkt_dqo *pkt; struct mbuf *mbuf = *mbuf_ptr; int total_descs_needed; int16_t completion_tag; bool has_csum_flag; int csum_flags; bool is_tso; int nsegs; int err; int i; csum_flags = mbuf->m_pkthdr.csum_flags; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); is_tso = csum_flags & CSUM_TSO; /* * This mbuf might end up needing more than 1 pkt desc. * The actual number, `nsegs` is known only after the * expensive gve_map_mbuf_dqo call. 
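 * Once the mapping succeeds, the room check is repeated further down with the * actual nsegs before any descriptors are written.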
The check below * exists to fail early when the desc ring is really full. */ total_descs_needed = 1 + /* general_ctx_desc */ 1 + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) return (ENOBUFS); pkt = gve_alloc_pending_packet(tx); if (pkt == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_compring, 1); counter_exit(); return (ENOBUFS); } completion_tag = pkt - tx->dqo.pending_pkts; err = gve_map_mbuf_dqo(tx, mbuf_ptr, pkt->dmamap, segs, &nsegs, /*attempt=*/0); if (err) goto abort; mbuf = *mbuf_ptr; /* gve_map_mbuf_dqo might replace the mbuf chain */ pkt->mbuf = mbuf; total_descs_needed = 1 + /* general_ctx_desc */ nsegs + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false( !gve_tx_has_desc_room_dqo(tx, total_descs_needed))) { err = ENOBUFS; goto abort_with_dma; } err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); if (err) goto abort_with_dma; bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE); for (i = 0; i < nsegs; i++) { gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, segs[i].ds_len, segs[i].ds_addr, completion_tag, /*eop=*/i == (nsegs - 1), has_csum_flag); } /* Remember the index of the last desc written */ tx->dqo.desc_tail = desc_idx; /* * Request a descriptor completion on the last descriptor of the * packet if we are allowed to by the HW enforced interval. */ gve_tx_request_desc_compl(tx, desc_idx); tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ return (0); abort_with_dma: gve_unmap_packet(tx, pkt); abort: pkt->mbuf = NULL; gve_free_pending_packet(tx, pkt); return (err); } static void gve_reap_qpl_bufs_dqo(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pkt) { int32_t buf = pkt->qpl_buf_head; struct gve_dma_handle *dma; int32_t qpl_buf_tail; int32_t old_head; int i; for (i = 0; i < pkt->num_qpl_bufs; i++) { dma = gve_get_page_dma_handle(tx, buf); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTWRITE); qpl_buf_tail = buf; buf = tx->dqo.qpl_bufs[buf]; } MPASS(buf == -1); buf = qpl_buf_tail; while (true) { old_head = atomic_load_32(&tx->dqo.free_qpl_bufs_prd); tx->dqo.qpl_bufs[buf] = old_head; /* * The "rel" ensures that the update to dqo.free_qpl_bufs_prd * is visible only after the linked list from this pkt is * attached above to old_head. */ if (atomic_cmpset_rel_32(&tx->dqo.free_qpl_bufs_prd, old_head, pkt->qpl_buf_head)) break; } /* * The "rel" ensures that the update to dqo.qpl_bufs_produced is * visible only after the update to dqo.free_qpl_bufs_prd above. */ atomic_add_rel_32(&tx->dqo.qpl_bufs_produced, pkt->num_qpl_bufs); gve_clear_qpl_pending_pkt(pkt); } static uint64_t gve_handle_packet_completion(struct gve_priv *priv, struct gve_tx_ring *tx, uint16_t compl_tag) { struct gve_tx_pending_pkt_dqo *pending_pkt; int32_t pkt_len; if (__predict_false(compl_tag >= tx->dqo.num_pending_pkts)) { device_printf(priv->dev, "Invalid TX completion tag: %d\n", compl_tag); return (0); } pending_pkt = &tx->dqo.pending_pkts[compl_tag]; /* Packet is allocated but not pending data completion.
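 * Such a tag (e.g. one that is stale or duplicated) is logged and otherwise * ignored so that driver state is not corrupted.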
*/ if (__predict_false(pending_pkt->state != GVE_PACKET_STATE_PENDING_DATA_COMPL)) { device_printf(priv->dev, "No pending data completion: %d\n", compl_tag); return (0); } pkt_len = pending_pkt->mbuf->m_pkthdr.len; if (gve_is_qpl(priv)) gve_reap_qpl_bufs_dqo(tx, pending_pkt); else gve_unmap_packet(tx, pending_pkt); m_freem(pending_pkt->mbuf); pending_pkt->mbuf = NULL; gve_free_pending_packet(tx, pending_pkt); return (pkt_len); } int gve_tx_intr_dqo(void *arg) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; struct gve_ring_com *com = &tx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); /* Interrupts are automatically masked */ taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); return (FILTER_HANDLED); } static void gve_tx_clear_desc_ring_dqo(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; int i; for (i = 0; i < com->priv->tx_desc_cnt; i++) tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){}; bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_tx_clear_compl_ring_dqo(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; int entries; int i; entries = com->priv->tx_desc_cnt; for (i = 0; i < entries; i++) tx->dqo.compl_ring[i] = (struct gve_tx_compl_desc_dqo){}; bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, BUS_DMASYNC_PREWRITE); } void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; int j; tx->dqo.desc_head = 0; tx->dqo.desc_tail = 0; tx->dqo.desc_mask = priv->tx_desc_cnt - 1; tx->dqo.last_re_idx = 0; tx->dqo.compl_head = 0; tx->dqo.compl_mask = priv->tx_desc_cnt - 1; atomic_store_32(&tx->dqo.hw_tx_head, 0); tx->dqo.cur_gen_bit = 0; gve_free_tx_mbufs_dqo(tx); for (j = 0; j < tx->dqo.num_pending_pkts; j++) { if (gve_is_qpl(tx->com.priv)) gve_clear_qpl_pending_pkt(&tx->dqo.pending_pkts[j]); tx->dqo.pending_pkts[j].next = (j == tx->dqo.num_pending_pkts - 1) ? -1 : j + 1; tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; } tx->dqo.free_pending_pkts_csm = 0; atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1); if (gve_is_qpl(priv)) { int qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * tx->com.qpl->num_pages; for (j = 0; j < qpl_buf_cnt - 1; j++) tx->dqo.qpl_bufs[j] = j + 1; tx->dqo.qpl_bufs[j] = -1; tx->dqo.free_qpl_bufs_csm = 0; atomic_store_32(&tx->dqo.free_qpl_bufs_prd, -1); atomic_store_32(&tx->dqo.qpl_bufs_produced, qpl_buf_cnt); tx->dqo.qpl_bufs_produced_cached = qpl_buf_cnt; tx->dqo.qpl_bufs_consumed = 0; } gve_tx_clear_desc_ring_dqo(tx); gve_tx_clear_compl_ring_dqo(tx); } +static uint8_t +gve_tx_get_gen_bit(uint8_t *desc) +{ + uint8_t byte; + + /* + * Prevent generation bit from being read after the rest of the + * descriptor. 
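+ * The acquire load of the generation byte below stands in for the explicit + * atomic_thread_fence_acq() the old code issued after its plain read: loads + * that follow it in program order cannot be reordered ahead of it.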
+ */ + byte = atomic_load_acq_8(desc + GVE_TX_DESC_DQO_GEN_BYTE_OFFSET); + return ((byte & GVE_TX_DESC_DQO_GEN_BIT_MASK) != 0); +} + static bool gve_tx_cleanup_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, int budget) { struct gve_tx_compl_desc_dqo *compl_desc; uint64_t bytes_done = 0; uint64_t pkts_done = 0; uint16_t compl_tag; int work_done = 0; uint16_t tx_head; uint16_t type; while (work_done < budget) { - bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, + bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, + tx->dqo.compl_ring_mem.map, BUS_DMASYNC_POSTREAD); compl_desc = &tx->dqo.compl_ring[tx->dqo.compl_head]; - if (compl_desc->generation == tx->dqo.cur_gen_bit) + if (gve_tx_get_gen_bit((uint8_t *)compl_desc) == + tx->dqo.cur_gen_bit) break; - /* - * Prevent generation bit from being read after the rest of the - * descriptor. - */ - atomic_thread_fence_acq(); type = compl_desc->type; - if (type == GVE_COMPL_TYPE_DQO_DESC) { /* This is the last descriptor fetched by HW plus one */ tx_head = le16toh(compl_desc->tx_head); atomic_store_rel_32(&tx->dqo.hw_tx_head, tx_head); } else if (type == GVE_COMPL_TYPE_DQO_PKT) { compl_tag = le16toh(compl_desc->completion_tag); bytes_done += gve_handle_packet_completion(priv, tx, compl_tag); pkts_done++; } tx->dqo.compl_head = (tx->dqo.compl_head + 1) & tx->dqo.compl_mask; /* Flip the generation bit when we wrap around */ tx->dqo.cur_gen_bit ^= tx->dqo.compl_head == 0; work_done++; } /* * Waking the xmit taskqueue has to occur after room has been made in * the queue. */ atomic_thread_fence_seq_cst(); if (atomic_load_bool(&tx->stopped) && work_done) { atomic_store_bool(&tx->stopped, false); taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); } tx->done += work_done; /* tx->done is just a sysctl counter */ counter_enter(); counter_u64_add_protected(tx->stats.tbytes, bytes_done); counter_u64_add_protected(tx->stats.tpackets, pkts_done); counter_exit(); return (work_done == budget); } void gve_tx_cleanup_tq_dqo(void *arg, int pending) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; if (gve_tx_cleanup_dqo(priv, tx, /*budget=*/1024)) { taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); return; } gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset, GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); }