Changeset View
Standalone View
usr.sbin/bhyve/pci_nvme.c
Show First 20 Lines • Show All 56 Lines • ▼ Show 20 Lines | |||||
*/ | */ | ||||
#include <sys/cdefs.h> | #include <sys/cdefs.h> | ||||
__FBSDID("$FreeBSD$"); | __FBSDID("$FreeBSD$"); | ||||
#include <sys/errno.h> | #include <sys/errno.h> | ||||
#include <sys/types.h> | #include <sys/types.h> | ||||
#include <net/ieee_oui.h> | #include <net/ieee_oui.h> | ||||
#include <sys/time.h> | |||||
#include <assert.h> | #include <assert.h> | ||||
#include <pthread.h> | #include <pthread.h> | ||||
#include <semaphore.h> | #include <semaphore.h> | ||||
#include <stdbool.h> | #include <stdbool.h> | ||||
#include <stddef.h> | #include <stddef.h> | ||||
#include <stdint.h> | #include <stdint.h> | ||||
#include <stdio.h> | #include <stdio.h> | ||||
Show All 37 Lines | |||||
/* Note the + 1 allows for the initial descriptor to not be page aligned */ | /* Note the + 1 allows for the initial descriptor to not be page aligned */ | ||||
#define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) | #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) | ||||
#define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) | #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) | ||||
/* This is a synthetic status code to indicate there is no status */ | /* This is a synthetic status code to indicate there is no status */ | ||||
#define NVME_NO_STATUS 0xffff | #define NVME_NO_STATUS 0xffff | ||||
#define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) | #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) | ||||
#define POWER_ON_HOURS(start, now) \ | |||||
((now.tv_sec - start.tv_sec) / 3600) | |||||
/* helpers */ | /* helpers */ | ||||
/* Convert a zero-based value into a one-based value */ | /* Convert a zero-based value into a one-based value */ | ||||
#define ONE_BASED(zero) ((zero) + 1) | #define ONE_BASED(zero) ((zero) + 1) | ||||
/* Convert a one-based value into a zero-based value */ | /* Convert a one-based value into a zero-based value */ | ||||
#define ZERO_BASED(one) ((one) - 1) | #define ZERO_BASED(one) ((one) - 1) | ||||
/* Encode number of SQ's and CQ's for Set/Get Features */ | /* Encode number of SQ's and CQ's for Set/Get Features */ | ||||
▲ Show 20 Lines • Show All 110 Lines • ▼ Show 20 Lines | |||||
struct pci_nvme_softc; | struct pci_nvme_softc; | ||||
struct nvme_feature_obj; | struct nvme_feature_obj; | ||||
typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, | typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, | ||||
struct nvme_feature_obj *, | struct nvme_feature_obj *, | ||||
struct nvme_command *, | struct nvme_command *, | ||||
struct nvme_completion *); | struct nvme_completion *); | ||||
struct nvme_feature_obj { | struct nvme_feature_obj { | ||||
chuck: I think style(9) suggests this shouldn't be done:
> Avoid typedefs ending in “_t”, except as… | |||||
Done Inline Actions
Right, It didn't match the style(9). wanpengqian_gmail.com: > I think style(9) suggests this shouldn't be done:
>
> > Avoid typedefs ending in “_t”… | |||||
uint32_t cdw11; | uint32_t cdw11; | ||||
nvme_feature_cb set; | nvme_feature_cb set; | ||||
nvme_feature_cb get; | nvme_feature_cb get; | ||||
bool namespace_specific; | bool namespace_specific; | ||||
}; | }; | ||||
#define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) | #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) | ||||
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines | struct pci_nvme_softc { | ||||
/* Accounting for SMART data */ | /* Accounting for SMART data */ | ||||
__uint128_t read_data_units; | __uint128_t read_data_units; | ||||
__uint128_t write_data_units; | __uint128_t write_data_units; | ||||
__uint128_t read_commands; | __uint128_t read_commands; | ||||
__uint128_t write_commands; | __uint128_t write_commands; | ||||
uint32_t read_dunits_remainder; | uint32_t read_dunits_remainder; | ||||
uint32_t write_dunits_remainder; | uint32_t write_dunits_remainder; | ||||
struct timeval power_on_time; | |||||
STAILQ_HEAD(, pci_nvme_aer) aer_list; | STAILQ_HEAD(, pci_nvme_aer) aer_list; | ||||
uint32_t aer_count; | uint32_t aer_count; | ||||
}; | }; | ||||
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); | static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); | ||||
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); | static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); | ||||
▲ Show 20 Lines • Show All 187 Lines • ▼ Show 20 Lines | case NVME_DATASET_MANAGEMENT_ENABLE: | ||||
break; | break; | ||||
default: | default: | ||||
break; | break; | ||||
} | } | ||||
cd->fna = 0x03; | cd->fna = 0x03; | ||||
cd->power_state[0].mp = 10; | cd->power_state[0].mp = 10; | ||||
gettimeofday(&sc->power_on_time, NULL); | |||||
} | } | ||||
/* | /* | ||||
* Calculate the CRC-16 of the given buffer | * Calculate the CRC-16 of the given buffer | ||||
* See copyright attribution at top of file | * See copyright attribution at top of file | ||||
*/ | */ | ||||
static uint16_t | static uint16_t | ||||
crc16(uint16_t crc, const void *buffer, unsigned int len) | crc16(uint16_t crc, const void *buffer, unsigned int len) | ||||
▲ Show 20 Lines • Show All 77 Lines • ▼ Show 20 Lines | pci_nvme_init_nsdata(struct pci_nvme_softc *sc, | ||||
/* LBA data-sz = 2^lbads */ | /* LBA data-sz = 2^lbads */ | ||||
nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; | nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; | ||||
} | } | ||||
static void | static void | ||||
pci_nvme_init_logpages(struct pci_nvme_softc *sc) | pci_nvme_init_logpages(struct pci_nvme_softc *sc) | ||||
{ | { | ||||
__uint128_t power_cycles = 1; | |||||
Not Done Inline ActionsI'd prefer converting it to little endian as required by the nvme spec. Nevertheless, big endian architectures aren't supported by bhyve and as far as I can see, FreeBSD has no htole128 macro yet. So, I think it's fine. corvink: I'd prefer converting it to little endian as required by the nvme spec.
Nevertheless, big… | |||||
memset(&sc->err_log, 0, sizeof(sc->err_log)); | memset(&sc->err_log, 0, sizeof(sc->err_log)); | ||||
memset(&sc->health_log, 0, sizeof(sc->health_log)); | memset(&sc->health_log, 0, sizeof(sc->health_log)); | ||||
memset(&sc->fw_log, 0, sizeof(sc->fw_log)); | memset(&sc->fw_log, 0, sizeof(sc->fw_log)); | ||||
/* Set read/write remainder to round up according to spec */ | /* Set read/write remainder to round up according to spec */ | ||||
sc->read_dunits_remainder = 999; | sc->read_dunits_remainder = 999; | ||||
sc->write_dunits_remainder = 999; | sc->write_dunits_remainder = 999; | ||||
/* Set nominal Health values checked by implementations */ | /* Set nominal Health values checked by implementations */ | ||||
sc->health_log.temperature = 310; | sc->health_log.temperature = 310; | ||||
sc->health_log.available_spare = 100; | sc->health_log.available_spare = 100; | ||||
sc->health_log.available_spare_threshold = 10; | sc->health_log.available_spare_threshold = 10; | ||||
Not Done Inline ActionsCan you help me understand the use of memcpy here? Would a simple assignment not work? chuck: Can you help me understand the use of `memcpy` here? Would a simple assignment not work? | |||||
Done Inline Actions
The NVMe PowerCycles value is a 128-bit counter, but sys/dev/nvme/nvme.h define it as an array of 2 64-bit values. it can not simply be assigned in one line. So I assign it as other 128-bit values do. Now it changes to two line assignment, (Although the initial low value is enough, I prefer to do it whole.) wanpengqian_gmail.com: > Can you help me understand the use of `memcpy` here? Would a simple assignment not work?
The… | |||||
Not Done Inline ActionsBah! I forgot about that. Yes, your first approach was fine. chuck: Bah! I forgot about that. Yes, your first approach was fine. | |||||
Not Done Inline ActionsLast I checked, there was no int128_t data type on 32-bit platforms. If that's changed, I can update the few uses of 2 64-bit quantities to a single 128-bit int. imp: Last I checked, there was no int128_t data type on 32-bit platforms. If that's changed, I can… | |||||
Not Done Inline ActionsHmm, then the emulation is already in trouble as it uses __uint128_t in several other places. If this doesn't exist for 32-bit platforms, what is the portable way to do math on 16-byte values? chuck: Hmm, then the emulation is already in trouble as it uses `__uint128_t` in several other places. | |||||
Not Done Inline Actionsbhyve currently only runs on amd64 (and arm64 out of tree). If at some point a 32-bit port (armv7 is the most likely candidate) grows bhyve support, then we can worry about this. However, for now I think it's fine for bhyve to assume 64-bit architectures and __uint128_t jhb: bhyve currently only runs on amd64 (and arm64 out of tree). If at some point a 32-bit port… | |||||
Not Done Inline ActionsThe problem is with the shared nvme driver which runs on i386... imp: The problem is with the shared nvme driver which runs on i386... | |||||
Not Done Inline ActionsI think we're OK as this file isn't shared with the nvme(4) chuck: I think we're OK as this file isn't shared with the `nvme(4)` | |||||
memcpy(&sc->health_log.power_cycles, &power_cycles, | |||||
sizeof(sc->health_log.power_cycles)); | |||||
} | } | ||||
static void | static void | ||||
pci_nvme_init_features(struct pci_nvme_softc *sc) | pci_nvme_init_features(struct pci_nvme_softc *sc) | ||||
{ | { | ||||
sc->feat[0].set = nvme_feature_invalid_cb; | sc->feat[0].set = nvme_feature_invalid_cb; | ||||
sc->feat[0].get = nvme_feature_invalid_cb; | sc->feat[0].get = nvme_feature_invalid_cb; | ||||
▲ Show 20 Lines • Show All 446 Lines • ▼ Show 20 Lines | |||||
} | } | ||||
static int | static int | ||||
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, | nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, | ||||
struct nvme_completion* compl) | struct nvme_completion* compl) | ||||
{ | { | ||||
uint32_t logsize; | uint32_t logsize; | ||||
uint8_t logpage = command->cdw10 & 0xFF; | uint8_t logpage = command->cdw10 & 0xFF; | ||||
struct timeval now; | |||||
__uint128_t power_on_hours; | |||||
DPRINTF("%s log page %u len %u", __func__, logpage, logsize); | DPRINTF("%s log page %u len %u", __func__, logpage, logsize); | ||||
pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); | pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); | ||||
/* | /* | ||||
* Command specifies the number of dwords to return in fields NUMDU | * Command specifies the number of dwords to return in fields NUMDU | ||||
* and NUMDL. This is a zero-based value. | * and NUMDL. This is a zero-based value. | ||||
*/ | */ | ||||
logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; | logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; | ||||
logsize *= sizeof(uint32_t); | logsize *= sizeof(uint32_t); | ||||
switch (logpage) { | switch (logpage) { | ||||
case NVME_LOG_ERROR: | case NVME_LOG_ERROR: | ||||
nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, | nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, | ||||
command->prp2, (uint8_t *)&sc->err_log, | command->prp2, (uint8_t *)&sc->err_log, | ||||
MIN(logsize, sizeof(sc->err_log)), | MIN(logsize, sizeof(sc->err_log)), | ||||
NVME_COPY_TO_PRP); | NVME_COPY_TO_PRP); | ||||
break; | break; | ||||
case NVME_LOG_HEALTH_INFORMATION: | case NVME_LOG_HEALTH_INFORMATION: | ||||
gettimeofday(&now, NULL); | |||||
power_on_hours = POWER_ON_HOURS(sc->power_on_time, now); | |||||
memcpy(&sc->health_log.power_on_hours, &power_on_hours, | |||||
sizeof(sc->health_log.power_on_hours)); | |||||
pthread_mutex_lock(&sc->mtx); | pthread_mutex_lock(&sc->mtx); | ||||
memcpy(&sc->health_log.data_units_read, &sc->read_data_units, | memcpy(&sc->health_log.data_units_read, &sc->read_data_units, | ||||
sizeof(sc->health_log.data_units_read)); | sizeof(sc->health_log.data_units_read)); | ||||
memcpy(&sc->health_log.data_units_written, &sc->write_data_units, | memcpy(&sc->health_log.data_units_written, &sc->write_data_units, | ||||
sizeof(sc->health_log.data_units_written)); | sizeof(sc->health_log.data_units_written)); | ||||
memcpy(&sc->health_log.host_read_commands, &sc->read_commands, | memcpy(&sc->health_log.host_read_commands, &sc->read_commands, | ||||
sizeof(sc->health_log.host_read_commands)); | sizeof(sc->health_log.host_read_commands)); | ||||
memcpy(&sc->health_log.host_write_commands, &sc->write_commands, | memcpy(&sc->health_log.host_write_commands, &sc->write_commands, | ||||
sizeof(sc->health_log.host_write_commands)); | sizeof(sc->health_log.host_write_commands)); | ||||
pthread_mutex_unlock(&sc->mtx); | pthread_mutex_unlock(&sc->mtx); | ||||
Done Inline ActionsThis copy can be performed outside of the locked region. The IO completion updates the read/write counters in a different thread, and thus, the code here (and there) require a lock to maintain the values' consistencies. chuck: This copy can be performed outside of the locked region.
The IO completion updates the… | |||||
nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, | nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, | ||||
command->prp2, (uint8_t *)&sc->health_log, | command->prp2, (uint8_t *)&sc->health_log, | ||||
MIN(logsize, sizeof(sc->health_log)), | MIN(logsize, sizeof(sc->health_log)), | ||||
NVME_COPY_TO_PRP); | NVME_COPY_TO_PRP); | ||||
break; | break; | ||||
case NVME_LOG_FIRMWARE_SLOT: | case NVME_LOG_FIRMWARE_SLOT: | ||||
nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, | nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, | ||||
▲ Show 20 Lines • Show All 1,719 Lines • Show Last 20 Lines |
I think style(9) suggests this shouldn't be done:
but perhaps I'm misinterpreting it.