D5797.id15541.diff

Index: sys/ofed/drivers/infiniband/hw/mlx4/ah.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/ah.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/ah.c
@@ -38,33 +38,14 @@
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/rcupdate.h>
#include "mlx4_ib.h"
-int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr,
- u8 *mac, int *is_mcast, u8 port)
-{
- struct in6_addr in6;
-
- *is_mcast = 0;
-
- memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6);
- if (rdma_link_local_addr(&in6))
- rdma_get_ll_mac(&in6, mac);
- else if (rdma_is_multicast_addr(&in6)) {
- rdma_get_mcast_mac(&in6, mac);
- *is_mcast = 1;
- } else
- return -EINVAL;
-
- return 0;
-}
-
static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
struct mlx4_ib_ah *ah)
{
struct mlx4_dev *dev = to_mdev(pd->device)->dev;
-
ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
ah->av.ib.g_slid = ah_attr->src_path_bits;
if (ah_attr->ah_flags & IB_AH_GRH) {
@@ -96,20 +77,45 @@
struct mlx4_dev *dev = ibdev->dev;
int is_mcast = 0;
struct in6_addr in6;
- u16 vlan_tag;
+ u16 vlan_tag = 0xffff;
+ union ib_gid sgid;
+ struct ib_gid_attr gid_attr;
+ int ret;
memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6));
if (rdma_is_multicast_addr(&in6)) {
is_mcast = 1;
- resolve_mcast_mac(&in6, ah->av.eth.mac);
+ rdma_get_mcast_mac(&in6, ah->av.eth.mac);
} else {
- memcpy(ah->av.eth.mac, ah_attr->dmac, 6);
+ memcpy(ah->av.eth.mac, ah_attr->dmac, ETH_ALEN);
+ }
+ rcu_read_lock();
+ ret = ib_get_cached_gid(pd->device, ah_attr->port_num,
+ ah_attr->grh.sgid_index, &sgid, &gid_attr);
+
+ if (!ret &&
+ !memcmp(&sgid, &zgid, sizeof(sgid)))
+ ret = -EINVAL;
+ if (!ret &&
+ !gid_attr.ndev)
+ ret = -ENODEV;
+ if (!ret) {
+ memset(ah->av.eth.s_mac, 0, ETH_ALEN);
+ if (is_vlan_dev(gid_attr.ndev))
+ vlan_tag = vlan_dev_vlan_id(gid_attr.ndev);
+ memcpy(ah->av.eth.s_mac, IF_LLADDR(gid_attr.ndev), ETH_ALEN);
}
- vlan_tag = ah_attr->vlan_id;
+ rcu_read_unlock();
+ if (ret)
+ return ERR_PTR(ret);
if (vlan_tag < 0x1000)
vlan_tag |= (ah_attr->sl & 7) << 13;
ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
- ah->av.eth.gid_index = ah_attr->grh.sgid_index;
+ ah->av.eth.gid_index = mlx4_ib_gid_index_to_real_index(ibdev,
+ ah_attr->port_num,
+ ah_attr->grh.sgid_index);
+ if (ah->av.eth.gid_index < 0)
+ return ERR_PTR(ah->av.eth.gid_index);
ah->av.eth.vlan = cpu_to_be16(vlan_tag);
if (ah_attr->static_rate) {
ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
@@ -117,7 +123,9 @@
!(1 << ah->av.eth.stat_rate & dev->caps.stat_rate_support))
--ah->av.eth.stat_rate;
}
-
+ ah->av.eth.sl_tclass_flowlabel |=
+ cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+ ah_attr->grh.flow_label);
/*
* HW requires multicast LID so we just choose one.
*/
@@ -125,7 +133,7 @@
ah->av.ib.dlid = cpu_to_be16(0xc000);
memcpy(ah->av.eth.dgid, ah_attr->grh.dgid.raw, 16);
- ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 29);
+ ah->av.eth.sl_tclass_flowlabel |= cpu_to_be32(ah_attr->sl << 29);
return &ah->ibah;
}
@@ -168,9 +176,12 @@
enum rdma_link_layer ll;
memset(ah_attr, 0, sizeof *ah_attr);
- ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24;
ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num);
+ if (ll == IB_LINK_LAYER_ETHERNET)
+ ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29;
+ else
+ ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0;
if (ah->av.ib.stat_rate)
ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;
Index: sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -42,6 +42,8 @@
#include <linux/errno.h>
#include <rdma/ib_user_verbs.h>
#include <linux/delay.h>
+#include <linux/math64.h>
+#include <linux/ktime.h>
#include "mlx4_ib.h"
/*
@@ -57,15 +59,19 @@
int query_id;
struct list_head list;
int block_num;
+ ib_sa_comp_mask guid_indexes;
u8 method;
};
struct mlx4_next_alias_guid_work {
u8 port;
u8 block_num;
+ u8 method;
struct mlx4_sriov_alias_guid_info_rec_det rec_det;
};
+static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port,
+ int *resched_delay_sec);
void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num,
u8 port_num, u8 *p_data)
@@ -119,6 +125,57 @@
return IB_SA_COMP_MASK(4 + index);
}
+void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave,
+ int port, int slave_init)
+{
+ __be64 curr_guid, required_guid;
+ int record_num = slave / 8;
+ int index = slave % 8;
+ int port_index = port - 1;
+ unsigned long flags;
+ int do_work = 0;
+
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ if (dev->sriov.alias_guid.ports_guid[port_index].state_flags &
+ GUID_STATE_NEED_PORT_INIT)
+ goto unlock;
+ if (!slave_init) {
+ curr_guid = *(__be64 *)&dev->sriov.
+ alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].
+ all_recs[GUID_REC_SIZE * index];
+ if (curr_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL) ||
+ !curr_guid)
+ goto unlock;
+ required_guid = cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL);
+ } else {
+ required_guid = mlx4_get_admin_guid(dev->dev, slave, port);
+ if (required_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ goto unlock;
+ }
+ *(__be64 *)&dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].
+ all_recs[GUID_REC_SIZE * index] = required_guid;
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].guid_indexes
+ |= mlx4_ib_get_aguid_comp_mask_from_ix(index);
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].status
+ = MLX4_GUID_INFO_STATUS_IDLE;
+ /* set to run immediately */
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].time_to_run = 0;
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].
+ guids_retry_schedule[index] = 0;
+ do_work = 1;
+unlock:
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+
+ if (do_work)
+ mlx4_ib_init_alias_guid_work(dev, port_index);
+}
+
/*
* Whenever new GUID is set/unset (guid table change) create event and
* notify the relevant slave (master also should be notified).
@@ -139,10 +196,15 @@
enum slave_port_state prev_state;
__be64 tmp_cur_ag, form_cache_ag;
enum slave_port_gen_event gen_event;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec;
+ unsigned long flags;
+ __be64 required_value;
if (!mlx4_is_master(dev->dev))
return;
+ rec = &dev->sriov.alias_guid.ports_guid[port_num - 1].
+ all_rec_per_port[block_num];
guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
ports_guid[port_num - 1].
all_rec_per_port[block_num].guid_indexes);
@@ -156,7 +218,7 @@
continue;
slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ;
- if (slave_id >= dev->dev->num_slaves)
+ if (slave_id >= dev->dev->persist->num_vfs + 1)
return;
tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE];
form_cache_ag = get_cached_alias_guid(dev, port_num,
@@ -168,8 +230,27 @@
*/
if (tmp_cur_ag != form_cache_ag)
continue;
- mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ required_value = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE];
+
+ if (required_value == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ required_value = 0;
+
+ if (tmp_cur_ag == required_value) {
+ rec->guid_indexes = rec->guid_indexes &
+ ~mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ } else {
+ /* may notify port down if value is 0 */
+ if (tmp_cur_ag != MLX4_NOT_SET_GUID) {
+ spin_unlock_irqrestore(&dev->sriov.
+ alias_guid.ag_work_lock, flags);
+ continue;
+ }
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock,
+ flags);
+ mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num);
/*2 cases: Valid GUID, and Invalid Guid*/
if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/
@@ -187,13 +268,19 @@
port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE);
}
} else { /* request to invalidate GUID */
- set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
- MLX4_PORT_STATE_IB_EVENT_GID_INVALID,
- &gen_event);
- pr_debug("sending PORT DOWN event to slave: %d, port: %d\n",
- slave_id, port_num);
- mlx4_gen_port_state_change_eqe(dev->dev, slave_id, port_num,
- MLX4_PORT_CHANGE_SUBTYPE_DOWN);
+ set_and_calc_slave_port_state(dev->dev,
+ slave_id,
+ port_num,
+ MLX4_PORT_STATE_IB_EVENT_GID_INVALID,
+ &gen_event);
+ if (gen_event == SLAVE_PORT_GEN_EVENT_DOWN) {
+ pr_debug("sending PORT DOWN event to slave: %d, port: %d\n",
+ slave_id, port_num);
+ mlx4_gen_port_state_change_eqe(dev->dev,
+ slave_id,
+ port_num,
+ MLX4_PORT_CHANGE_SUBTYPE_DOWN);
+ }
}
}
}
@@ -208,6 +295,9 @@
int i;
struct mlx4_sriov_alias_guid_info_rec_det *rec;
unsigned long flags, flags1;
+ ib_sa_comp_mask declined_guid_indexes = 0;
+ ib_sa_comp_mask applied_guid_indexes = 0;
+ unsigned int resched_delay_sec = 0;
if (!context)
return;
@@ -218,9 +308,9 @@
all_rec_per_port[cb_ctx->block_num];
if (status) {
- rec->status = MLX4_GUID_INFO_STATUS_IDLE;
pr_debug("(port: %d) failed: status = %d\n",
cb_ctx->port, status);
+ rec->time_to_run = ktime_get_ns() + 1 * NSEC_PER_SEC;
goto out;
}
@@ -237,69 +327,101 @@
rec = &dev->sriov.alias_guid.ports_guid[port_index].
all_rec_per_port[guid_rec->block_num];
- rec->status = MLX4_GUID_INFO_STATUS_SET;
- rec->method = MLX4_GUID_INFO_RECORD_SET;
-
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) {
- __be64 tmp_cur_ag;
- tmp_cur_ag = *(__be64 *)&guid_rec->guid_info_list[i * GUID_REC_SIZE];
- if ((cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE)
- && (MLX4_NOT_SET_GUID == tmp_cur_ag)) {
- pr_debug("%s:Record num %d in block_num:%d "
- "was deleted by SM,ownership by %d "
- "(0 = driver, 1=sysAdmin, 2=None)\n",
- __func__, i, guid_rec->block_num,
- rec->ownership);
- rec->guid_indexes = rec->guid_indexes &
- ~mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ __be64 sm_response, required_val;
+
+ if (!(cb_ctx->guid_indexes &
+ mlx4_ib_get_aguid_comp_mask_from_ix(i)))
continue;
+ sm_response = *(__be64 *)&guid_rec->guid_info_list
+ [i * GUID_REC_SIZE];
+ required_val = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE];
+ if (cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE) {
+ if (required_val ==
+ cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ goto next_entry;
+
+ /* A new value was set till we got the response */
+ pr_debug("need to set new value %llx, record num %d, block_num:%d\n",
+ (long long)be64_to_cpu(required_val),
+ i, guid_rec->block_num);
+ goto entry_declined;
}
/* check if the SM didn't assign one of the records.
- * if it didn't, if it was not sysadmin request:
- * ask the SM to give a new GUID, (instead of the driver request).
+ * if it didn't, ask for it again.
*/
- if (tmp_cur_ag == MLX4_NOT_SET_GUID) {
- mlx4_ib_warn(&dev->ib_dev, "%s:Record num %d in "
- "block_num: %d was declined by SM, "
- "ownership by %d (0 = driver, 1=sysAdmin,"
- " 2=None)\n", __func__, i,
- guid_rec->block_num, rec->ownership);
- if (rec->ownership == MLX4_GUID_DRIVER_ASSIGN) {
- /* if it is driver assign, asks for new GUID from SM*/
- *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] =
- MLX4_NOT_SET_GUID;
-
- /* Mark the record as not assigned, and let it
- * be sent again in the next work sched.*/
- rec->status = MLX4_GUID_INFO_STATUS_IDLE;
- rec->guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
- }
+ if (sm_response == MLX4_NOT_SET_GUID) {
+ if (rec->guids_retry_schedule[i] == 0)
+ mlx4_ib_warn(&dev->ib_dev,
+ "%s:Record num %d in block_num: %d was declined by SM\n",
+ __func__, i,
+ guid_rec->block_num);
+ goto entry_declined;
} else {
/* properly assigned record. */
/* We save the GUID we just got from the SM in the
* admin_guid in order to be persistent, and in the
* request from the sm the process will ask for the same GUID */
- if (rec->ownership == MLX4_GUID_SYSADMIN_ASSIGN &&
- tmp_cur_ag != *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]) {
- /* the sysadmin assignment failed.*/
- mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set"
- " admin guid after SysAdmin "
- "configuration. "
- "Record num %d in block_num:%d "
- "was declined by SM, "
- "new val(0x%llx) was kept\n",
- __func__, i,
- guid_rec->block_num,
- (long long)be64_to_cpu(*(__be64 *) &
- rec->all_recs[i * GUID_REC_SIZE]));
+ if (required_val &&
+ sm_response != required_val) {
+ /* Warn only on first retry */
+ if (rec->guids_retry_schedule[i] == 0)
+ mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set"
+ " admin guid after SysAdmin "
+ "configuration. "
+ "Record num %d in block_num:%d "
+ "was declined by SM, "
+ "new val(0x%llx) was kept, SM returned (0x%llx)\n",
+ __func__, i,
+ guid_rec->block_num,
+ (long long)be64_to_cpu(required_val),
+ (long long)be64_to_cpu(sm_response));
+ goto entry_declined;
} else {
- memcpy(&rec->all_recs[i * GUID_REC_SIZE],
- &guid_rec->guid_info_list[i * GUID_REC_SIZE],
- GUID_REC_SIZE);
+ *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] =
+ sm_response;
+ if (required_val == 0)
+ mlx4_set_admin_guid(dev->dev,
+ sm_response,
+ (guid_rec->block_num
+ * NUM_ALIAS_GUID_IN_REC) + i,
+ cb_ctx->port);
+ goto next_entry;
}
}
+entry_declined:
+ declined_guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ rec->guids_retry_schedule[i] =
+ (rec->guids_retry_schedule[i] == 0) ? 1 :
+ min((unsigned int)60,
+ rec->guids_retry_schedule[i] * 2);
+ /* using the minimum value among all entries in that record */
+ resched_delay_sec = (resched_delay_sec == 0) ?
+ rec->guids_retry_schedule[i] :
+ min(resched_delay_sec,
+ rec->guids_retry_schedule[i]);
+ continue;
+
+next_entry:
+ rec->guids_retry_schedule[i] = 0;
}
+
+ applied_guid_indexes = cb_ctx->guid_indexes & ~declined_guid_indexes;
+ if (declined_guid_indexes ||
+ rec->guid_indexes & ~(applied_guid_indexes)) {
+ pr_debug("record=%d wasn't fully set, guid_indexes=0x%llx applied_indexes=0x%llx, declined_indexes=0x%llx\n",
+ guid_rec->block_num,
+ (long long)be64_to_cpu((__force __be64)rec->guid_indexes),
+ (long long)be64_to_cpu((__force __be64)applied_guid_indexes),
+ (long long)be64_to_cpu((__force __be64)declined_guid_indexes));
+ rec->time_to_run = ktime_get_ns() +
+ resched_delay_sec * NSEC_PER_SEC;
+ } else {
+ rec->status = MLX4_GUID_INFO_STATUS_SET;
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
/*
The func is call here to close the cases when the
sm doesn't send smp, so in the sa response the driver
@@ -311,10 +433,13 @@
out:
spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
- if (!dev->sriov.is_going_down)
+ if (!dev->sriov.is_going_down) {
+ get_low_record_time_index(dev, port_index, &resched_delay_sec);
queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq,
&dev->sriov.alias_guid.ports_guid[port_index].
- alias_guid_work, 0);
+ alias_guid_work,
+ msecs_to_jiffies(resched_delay_sec * 1000));
+ }
if (cb_ctx->sa_query) {
list_del(&cb_ctx->list);
kfree(cb_ctx);
@@ -331,9 +456,7 @@
ib_sa_comp_mask comp_mask = 0;
dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status
- = MLX4_GUID_INFO_STATUS_IDLE;
- dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].method
- = MLX4_GUID_INFO_RECORD_SET;
+ = MLX4_GUID_INFO_STATUS_SET;
/* calculate the comp_mask for that record.*/
for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
@@ -347,19 +470,21 @@
need to assign GUIDs, then don't put it up for assignment.
*/
if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val ||
- (!index && !i) ||
- MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid.
- ports_guid[port - 1].all_rec_per_port[index].ownership)
+ (!index && !i))
continue;
comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
}
dev->sriov.alias_guid.ports_guid[port - 1].
- all_rec_per_port[index].guid_indexes = comp_mask;
+ all_rec_per_port[index].guid_indexes |= comp_mask;
+ if (dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].guid_indexes)
+ dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].status = MLX4_GUID_INFO_STATUS_IDLE;
+
}
static int set_guid_rec(struct ib_device *ibdev,
- u8 port, int index,
- struct mlx4_sriov_alias_guid_info_rec_det *rec_det)
+ struct mlx4_next_alias_guid_work *rec)
{
int err;
struct mlx4_ib_dev *dev = to_mdev(ibdev);
@@ -368,6 +493,9 @@
struct ib_port_attr attr;
struct mlx4_alias_guid_work_context *callback_context;
unsigned long resched_delay, flags, flags1;
+ u8 port = rec->port + 1;
+ int index = rec->block_num;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec_det = &rec->rec_det;
struct list_head *head =
&dev->sriov.alias_guid.ports_guid[port - 1].cb_list;
@@ -394,7 +522,9 @@
callback_context->port = port;
callback_context->dev = dev;
callback_context->block_num = index;
- callback_context->method = rec_det->method;
+ callback_context->guid_indexes = rec_det->guid_indexes;
+ callback_context->method = rec->method;
+
memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec));
guid_info_rec.lid = cpu_to_be16(attr.lid);
@@ -413,7 +543,7 @@
callback_context->query_id =
ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client,
ibdev, port, &guid_info_rec,
- comp_mask, rec_det->method, 1000,
+ comp_mask, rec->method, 1000, 0,
GFP_KERNEL, aliasguid_query_handler,
callback_context,
&callback_context->sa_query);
@@ -448,6 +578,30 @@
return err;
}
+static void mlx4_ib_guid_port_init(struct mlx4_ib_dev *dev, int port)
+{
+ int j, k, entry;
+ __be64 guid;
+
+ /*Check if the SM doesn't need to assign the GUIDs*/
+ for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+ for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) {
+ entry = j * NUM_ALIAS_GUID_IN_REC + k;
+ /* no request for the 0 entry (hw guid) */
+ if (!entry || entry > dev->dev->persist->num_vfs ||
+ !mlx4_is_slave_active(dev->dev, entry))
+ continue;
+ guid = mlx4_get_admin_guid(dev->dev, entry, port);
+ *(__be64 *)&dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[j].all_recs
+ [GUID_REC_SIZE * k] = guid;
+ pr_debug("guid was set, entry=%d, val=0x%llx, port=%d\n",
+ entry,
+ (long long)be64_to_cpu(guid),
+ port);
+ }
+ }
+}
void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port)
{
int i;
@@ -457,6 +611,13 @@
spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+
+ if (dev->sriov.alias_guid.ports_guid[port - 1].state_flags &
+ GUID_STATE_NEED_PORT_INIT) {
+ mlx4_ib_guid_port_init(dev, port);
+ dev->sriov.alias_guid.ports_guid[port - 1].state_flags &=
+ (~GUID_STATE_NEED_PORT_INIT);
+ }
for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++)
invalidate_guid_record(dev, port, i);
@@ -476,60 +637,107 @@
spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
}
-/* The function returns the next record that was
- * not configured (or failed to be configured) */
-static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port,
- struct mlx4_next_alias_guid_work *rec)
+static void set_required_record(struct mlx4_ib_dev *dev, u8 port,
+ struct mlx4_next_alias_guid_work *next_rec,
+ int record_index)
{
- int j;
- unsigned long flags;
+ int i;
+ int lowset_time_entry = -1;
+ int lowest_time = 0;
+ ib_sa_comp_mask delete_guid_indexes = 0;
+ ib_sa_comp_mask set_guid_indexes = 0;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec =
+ &dev->sriov.alias_guid.ports_guid[port].
+ all_rec_per_port[record_index];
- for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
- spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
- if (dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status ==
- MLX4_GUID_INFO_STATUS_IDLE) {
- memcpy(&rec->rec_det,
- &dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j],
- sizeof (struct mlx4_sriov_alias_guid_info_rec_det));
- rec->port = port;
- rec->block_num = j;
- dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status =
- MLX4_GUID_INFO_STATUS_PENDING;
- spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
- return 0;
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ if (!(rec->guid_indexes &
+ mlx4_ib_get_aguid_comp_mask_from_ix(i)))
+ continue;
+
+ if (*(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] ==
+ cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ delete_guid_indexes |=
+ mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ else
+ set_guid_indexes |=
+ mlx4_ib_get_aguid_comp_mask_from_ix(i);
+
+ if (lowset_time_entry == -1 || rec->guids_retry_schedule[i] <=
+ lowest_time) {
+ lowset_time_entry = i;
+ lowest_time = rec->guids_retry_schedule[i];
}
- spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
}
- return -ENOENT;
+
+ memcpy(&next_rec->rec_det, rec, sizeof(*rec));
+ next_rec->port = port;
+ next_rec->block_num = record_index;
+
+ if (*(__be64 *)&rec->all_recs[lowset_time_entry * GUID_REC_SIZE] ==
+ cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) {
+ next_rec->rec_det.guid_indexes = delete_guid_indexes;
+ next_rec->method = MLX4_GUID_INFO_RECORD_DELETE;
+ } else {
+ next_rec->rec_det.guid_indexes = set_guid_indexes;
+ next_rec->method = MLX4_GUID_INFO_RECORD_SET;
+ }
}
-static void set_administratively_guid_record(struct mlx4_ib_dev *dev, int port,
- int rec_index,
- struct mlx4_sriov_alias_guid_info_rec_det *rec_det)
+/* return index of record that should be updated based on lowest
+ * rescheduled time
+ */
+static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port,
+ int *resched_delay_sec)
{
- dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].guid_indexes =
- rec_det->guid_indexes;
- memcpy(dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].all_recs,
- rec_det->all_recs, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE);
- dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].status =
- rec_det->status;
+ int record_index = -1;
+ u64 low_record_time = 0;
+ struct mlx4_sriov_alias_guid_info_rec_det rec;
+ int j;
+
+ for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+ rec = dev->sriov.alias_guid.ports_guid[port].
+ all_rec_per_port[j];
+ if (rec.status == MLX4_GUID_INFO_STATUS_IDLE &&
+ rec.guid_indexes) {
+ if (record_index == -1 ||
+ rec.time_to_run < low_record_time) {
+ record_index = j;
+ low_record_time = rec.time_to_run;
+ }
+ }
+ }
+ if (resched_delay_sec) {
+ u64 curr_time = ktime_get_ns();
+
+ *resched_delay_sec = (low_record_time < curr_time) ? 0 :
+ div_u64((low_record_time - curr_time), NSEC_PER_SEC);
+ }
+
+ return record_index;
}
-static void set_all_slaves_guids(struct mlx4_ib_dev *dev, int port)
+/* The function returns the next record that was
+ * not configured (or failed to be configured) */
+static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port,
+ struct mlx4_next_alias_guid_work *rec)
{
- int j;
- struct mlx4_sriov_alias_guid_info_rec_det rec_det ;
-
- for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT ; j++) {
- memset(rec_det.all_recs, 0, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE);
- rec_det.guid_indexes = (!j ? 0 : IB_SA_GUIDINFO_REC_GID0) |
- IB_SA_GUIDINFO_REC_GID1 | IB_SA_GUIDINFO_REC_GID2 |
- IB_SA_GUIDINFO_REC_GID3 | IB_SA_GUIDINFO_REC_GID4 |
- IB_SA_GUIDINFO_REC_GID5 | IB_SA_GUIDINFO_REC_GID6 |
- IB_SA_GUIDINFO_REC_GID7;
- rec_det.status = MLX4_GUID_INFO_STATUS_IDLE;
- set_administratively_guid_record(dev, port, j, &rec_det);
+ unsigned long flags;
+ int record_index;
+ int ret = 0;
+
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ record_index = get_low_record_time_index(dev, port, NULL);
+
+ if (record_index < 0) {
+ ret = -ENOENT;
+ goto out;
}
+
+ set_required_record(dev, port, rec, record_index);
+out:
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+ return ret;
}
static void alias_guid_work(struct work_struct *work)
@@ -559,9 +767,7 @@
goto out;
}
- set_guid_rec(&dev->ib_dev, rec->port + 1, rec->block_num,
- &rec->rec_det);
-
+ set_guid_rec(&dev->ib_dev, rec);
out:
kfree(rec);
}
@@ -576,6 +782,12 @@
spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
if (!dev->sriov.is_going_down) {
+ /* If there is a pending work item, cancel it and then queue; otherwise
+ * the new work won't run until the previous one ends, since the same
+ * work struct is used.
+ */
+ cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[port].
+ alias_guid_work);
queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq,
&dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0);
}
@@ -623,7 +835,7 @@
{
char alias_wq_name[15];
int ret = 0;
- int i, j, k;
+ int i, j;
union ib_gid gid;
if (!mlx4_is_master(dev->dev))
@@ -647,33 +859,25 @@
for (i = 0 ; i < dev->num_ports; i++) {
memset(&dev->sriov.alias_guid.ports_guid[i], 0,
sizeof (struct mlx4_sriov_alias_guid_port_rec_det));
- /*Check if the SM doesn't need to assign the GUIDs*/
+ dev->sriov.alias_guid.ports_guid[i].state_flags |=
+ GUID_STATE_NEED_PORT_INIT;
for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
- if (mlx4_ib_sm_guid_assign) {
- dev->sriov.alias_guid.ports_guid[i].
- all_rec_per_port[j].
- ownership = MLX4_GUID_DRIVER_ASSIGN;
- continue;
- }
- dev->sriov.alias_guid.ports_guid[i].all_rec_per_port[j].
- ownership = MLX4_GUID_NONE_ASSIGN;
- /*mark each val as it was deleted,
- till the sysAdmin will give it valid val*/
- for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) {
- *(__be64 *)&dev->sriov.alias_guid.ports_guid[i].
- all_rec_per_port[j].all_recs[GUID_REC_SIZE * k] =
- cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL);
- }
+ /* mark each val as it was deleted */
+ memset(dev->sriov.alias_guid.ports_guid[i].
+ all_rec_per_port[j].all_recs, 0xFF,
+ sizeof(dev->sriov.alias_guid.ports_guid[i].
+ all_rec_per_port[j].all_recs));
}
INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list);
/*prepare the records, set them to be allocated by sm*/
+ if (mlx4_ib_sm_guid_assign)
+ for (j = 1; j < NUM_ALIAS_GUID_PER_PORT; j++)
+ mlx4_set_admin_guid(dev->dev, 0, j, i + 1);
for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++)
invalidate_guid_record(dev, i + 1, j);
dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid;
dev->sriov.alias_guid.ports_guid[i].port = i;
- if (mlx4_ib_sm_guid_assign)
- set_all_slaves_guids(dev, i);
snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i);
dev->sriov.alias_guid.ports_guid[i].wq =
Index: sys/ofed/drivers/infiniband/hw/mlx4/cm.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/cm.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/cm.c
@@ -242,8 +242,7 @@
static struct id_map_entry *
id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id)
{
- int ret, id;
- static int next_id;
+ int ret;
struct id_map_entry *ent;
struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
@@ -259,25 +258,22 @@
ent->dev = to_mdev(ibdev);
INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout);
- do {
- spin_lock(&to_mdev(ibdev)->sriov.id_map_lock);
- ret = idr_get_new_above(&sriov->pv_id_table, ent,
- next_id, &id);
- if (!ret) {
- next_id = ((unsigned) id + 1) & MAX_IDR_MASK;
- ent->pv_cm_id = (u32)id;
- sl_id_map_add(ibdev, ent);
- }
+ idr_preload(GFP_KERNEL);
+ spin_lock(&to_mdev(ibdev)->sriov.id_map_lock);
- spin_unlock(&sriov->id_map_lock);
- } while (ret == -EAGAIN && idr_pre_get(&sriov->pv_id_table, GFP_KERNEL));
- /*the function idr_get_new_above can return -ENOSPC, so don't insert in that case.*/
- if (!ret) {
- spin_lock(&sriov->id_map_lock);
+ ret = idr_alloc_cyclic(&sriov->pv_id_table, ent, 0, 0, GFP_NOWAIT);
+ if (ret >= 0) {
+ ent->pv_cm_id = (u32)ret;
+ sl_id_map_add(ibdev, ent);
list_add_tail(&ent->list, &sriov->cm_list);
- spin_unlock(&sriov->id_map_lock);
- return ent;
}
+
+ spin_unlock(&sriov->id_map_lock);
+ idr_preload_end();
+
+ if (ret >= 0)
+ return ent;
+
/*error flow*/
kfree(ent);
mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret);
@@ -327,8 +323,7 @@
if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
mad->mad_hdr.attr_id == CM_REP_ATTR_ID ||
- mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID ||
- mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
sl_cm_id = get_local_comm_id(mad);
id = id_map_alloc(ibdev, slave_id, sl_cm_id);
if (IS_ERR(id)) {
@@ -361,7 +356,7 @@
}
int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
- struct ib_mad *mad, int is_eth)
+ struct ib_mad *mad)
{
u32 pv_cm_id;
struct id_map_entry *id;
@@ -370,7 +365,7 @@
mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
union ib_gid gid;
- if (is_eth)
+ if (!slave)
return 0;
gid = gid_from_req_msg(ibdev, mad);
@@ -391,7 +386,7 @@
return -ENOENT;
}
- if (!is_eth)
+ if (slave)
*slave = id->slave_id;
set_remote_comm_id(mad, id->sl_cm_id);
@@ -411,7 +406,6 @@
INIT_LIST_HEAD(&dev->sriov.cm_list);
dev->sriov.sl_id_map = RB_ROOT;
idr_init(&dev->sriov.pv_id_table);
- idr_pre_get(&dev->sriov.pv_id_table, GFP_KERNEL);
}
/* slave = -1 ==> all slaves */
Index: sys/ofed/drivers/infiniband/hw/mlx4/cq.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/cq.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/cq.c
@@ -34,6 +34,7 @@
#include <linux/mlx4/cq.h>
#include <linux/mlx4/qp.h>
#include <linux/mlx4/srq.h>
+#include <linux/mlx4/driver.h>
#include <linux/slab.h>
#include "mlx4_ib.h"
@@ -97,14 +98,11 @@
struct ib_cq_attr *cq_attr,
int cq_attr_mask)
{
- int err = 0;
struct mlx4_ib_cq *mcq = to_mcq(cq);
struct mlx4_ib_dev *dev = to_mdev(cq->device);
+ int err = 0;
if (cq_attr_mask & IB_CQ_CAP_FLAGS) {
- if (cq_attr->cq_cap_flags & IB_CQ_TIMESTAMP)
- return -ENOTSUPP;
-
if (cq_attr->cq_cap_flags & IB_CQ_IGNORE_OVERRUN) {
if (dev->dev->caps.cq_flags & MLX4_DEV_CAP_CQ_FLAG_IO)
err = mlx4_cq_ignore_overrun(dev->dev, &mcq->mcq);
@@ -127,7 +125,7 @@
int err;
err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size,
- PAGE_SIZE * 2, &buf->buf);
+ PAGE_SIZE * 2, &buf->buf, GFP_KERNEL);
if (err)
goto out;
@@ -138,7 +136,7 @@
if (err)
goto err_buf;
- err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf);
+ err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL);
if (err)
goto err_mtt;
@@ -209,6 +207,8 @@
int err;
int entries = attr->cqe;
int vector = attr->comp_vector;
+ bool user_cq = false;
+ void *buf_addr;
if (entries < 1 || entries > dev->dev->caps.max_cqes)
return ERR_PTR(-EINVAL);
@@ -227,6 +227,8 @@
cq->resize_buf = NULL;
cq->resize_umem = NULL;
cq->create_flags = attr->flags;
+ INIT_LIST_HEAD(&cq->send_qp_list);
+ INIT_LIST_HEAD(&cq->recv_qp_list);
if (context) {
struct mlx4_ib_create_cq ucmd;
@@ -236,6 +238,9 @@
goto err_cq;
}
+ buf_addr = (void *)ucmd.buf_addr;
+ user_cq = true;
+
err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem,
ucmd.buf_addr, entries);
if (err)
@@ -243,12 +248,13 @@
err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
&cq->db);
+
if (err)
goto err_mtt;
uar = &to_mucontext(context)->uar;
} else {
- err = mlx4_db_alloc(dev->dev, &cq->db, 1);
+ err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL);
if (err)
goto err_cq;
@@ -261,30 +267,40 @@
if (err)
goto err_db;
+ buf_addr = &cq->buf.buf;
+
uar = &dev->priv_uar;
}
if (dev->eq_table)
- vector = dev->eq_table[vector % ibdev->num_comp_vectors];
+ vector = mlx4_choose_vector(dev->dev, vector,
+ ibdev->num_comp_vectors);
+ cq->vector = vector;
err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
cq->db.dma, &cq->mcq, vector, 0,
- !!(cq->create_flags & IB_CQ_TIMESTAMP));
+ !!(cq->create_flags & IB_CQ_TIMESTAMP),
+ buf_addr, user_cq);
if (err)
goto err_dbmap;
- cq->mcq.comp = mlx4_ib_cq_comp;
+ cq->mcq.comp = mlx4_ib_cq_comp;
cq->mcq.event = mlx4_ib_cq_event;
if (context)
if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
err = -EFAULT;
- goto err_dbmap;
+ goto err_cq_free;
}
return &cq->ibcq;
+err_cq_free:
+ mlx4_cq_free(dev->dev, &cq->mcq);
+
err_dbmap:
+ mlx4_release_vector(dev->dev, cq->vector);
+
if (context)
mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db);
@@ -370,17 +386,15 @@
return i - cq->mcq.cons_index;
}
-static int mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
+static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
{
struct mlx4_cqe *cqe, *new_cqe;
int i;
int cqe_size = cq->buf.entry_size;
int cqe_inc = cqe_size == 64 ? 1 : 0;
- struct mlx4_cqe *start_cqe;
i = cq->mcq.cons_index;
cqe = get_cqe(cq, i & cq->ibcq.cqe);
- start_cqe = cqe;
cqe += cqe_inc;
while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
@@ -392,15 +406,9 @@
new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
(((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
cqe = get_cqe(cq, ++i & cq->ibcq.cqe);
- if (cqe == start_cqe) {
- pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", cq->mcq.cqn);
- return -ENOMEM;
- }
cqe += cqe_inc;
-
}
++cq->mcq.cons_index;
- return 0;
}
int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
@@ -411,9 +419,6 @@
int outst_cqe;
int err;
- if (dev->dev->caps.fw_ver < MLX4_FW_VER_RESIZE_CQ)
- return -ENOSYS;
-
mutex_lock(&cq->resize_mutex);
if (entries < 1 || entries > dev->dev->caps.max_cqes) {
err = -EINVAL;
@@ -439,7 +444,7 @@
/* Can't be smaller than the number of outstanding CQEs */
outst_cqe = mlx4_ib_get_outstanding_cqes(cq);
if (entries < outst_cqe + 1) {
- err = 0;
+ err = -EINVAL;
goto out;
}
@@ -470,7 +475,7 @@
spin_lock_irq(&cq->lock);
if (cq->resize_buf) {
- err = mlx4_ib_cq_resize_copy_cqes(cq);
+ mlx4_ib_cq_resize_copy_cqes(cq);
tmp_buf = cq->buf;
tmp_cqe = cq->ibcq.cqe;
cq->buf = cq->resize_buf->buf;
@@ -534,6 +539,8 @@
mlx4_db_free(dev->dev, &mcq->db);
}
+ mlx4_release_vector(dev->dev, mcq->vector);
+
kfree(mcq);
return 0;
@@ -643,6 +650,7 @@
wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid);
memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4);
memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2);
+ wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
} else {
wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32);
wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);
@@ -651,6 +659,55 @@
return 0;
}
+static void mlx4_ib_qp_sw_comp(struct mlx4_ib_qp *qp, int num_entries,
+ struct ib_wc *wc, int *npolled, int is_send)
+{
+ struct mlx4_ib_wq *wq;
+ unsigned cur;
+ int i;
+
+ wq = is_send ? &qp->sq : &qp->rq;
+ cur = wq->head - wq->tail;
+
+ if (cur == 0)
+ return;
+
+ for (i = 0; i < cur && *npolled < num_entries; i++) {
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ wc->vendor_err = MLX4_CQE_SYNDROME_WR_FLUSH_ERR;
+ wq->tail++;
+ (*npolled)++;
+ wc->qp = &qp->ibqp;
+ wc++;
+ }
+}
+
+static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries,
+ struct ib_wc *wc, int *npolled)
+{
+ struct mlx4_ib_qp *qp;
+
+ *npolled = 0;
+ /* Find uncompleted WQEs belonging to that cq and return
+ * simulated FLUSH_ERR completions
+ */
+ list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) {
+ mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 1);
+ if (*npolled >= num_entries)
+ goto out;
+ }
+
+ list_for_each_entry(qp, &cq->recv_qp_list, cq_recv_list) {
+ mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 0);
+ if (*npolled >= num_entries)
+ goto out;
+ }
+
+out:
+ return;
+}
+
static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
struct mlx4_ib_qp **cur_qp,
struct ib_wc *wc)
@@ -662,6 +719,7 @@
struct mlx4_srq *msrq = NULL;
int is_send;
int is_error;
+ int is_eth;
u32 g_mlpath_rqpn;
u16 wqe_ctr;
unsigned tail = 0;
@@ -781,7 +839,7 @@
switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
case MLX4_OPCODE_RDMA_WRITE_IMM:
wc->wc_flags |= IB_WC_WITH_IMM;
- /* fall through */
+ /* FALLTHROUGH */
case MLX4_OPCODE_RDMA_WRITE:
wc->opcode = IB_WC_RDMA_WRITE;
break;
@@ -849,34 +907,26 @@
break;
}
+ is_eth = (rdma_port_get_link_layer(wc->qp->device,
+ (*cur_qp)->port) ==
+ IB_LINK_LAYER_ETHERNET);
if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) {
if ((*cur_qp)->mlx4_ib_qp_type &
(MLX4_IB_QPT_PROXY_SMI_OWNER |
MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
- return use_tunnel_data
- (*cur_qp, cq, wc, tail, cqe,
- rdma_port_get_link_layer
- (wc->qp->device,
- (*cur_qp)->port) ==
- IB_LINK_LAYER_ETHERNET);
+ return use_tunnel_data(*cur_qp, cq, wc, tail,
+ cqe, is_eth);
}
if (timestamp_en) {
- /* currently, only CQ_CREATE_WITH_TIMESTAMPING_RAW is
- * supported. CQ_CREATE_WITH_TIMESTAMPING_SYS isn't
- * supported */
- if (cq->create_flags & IB_CQ_TIMESTAMP_TO_SYS_TIME) {
- wc->ts.timestamp = 0;
- } else {
- wc->ts.timestamp =
- ((u64)(be32_to_cpu(cqe->timestamp_16_47)
- + !cqe->timestamp_0_15) << 16)
- | be16_to_cpu(cqe->timestamp_0_15);
- wc->wc_flags |= IB_WC_WITH_TIMESTAMP;
- }
+ wc->ts.timestamp =
+ ((u64)(be32_to_cpu(cqe->timestamp_16_47)
+ + !cqe->timestamp_0_15) << 16)
+ | be16_to_cpu(cqe->timestamp_0_15);
+ wc->wc_flags |= IB_WC_WITH_TIMESTAMP;
+ wc->slid = be16_to_cpu(IB_LID_PERMISSIVE);
} else {
- wc->wc_flags |= IB_WC_WITH_SLID;
- wc->slid = be16_to_cpu(cqe->rlid);
+ wc->slid = be16_to_cpu(cqe->rlid);
}
g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn);
wc->src_qp = g_mlpath_rqpn & 0xffffff;
@@ -886,25 +936,23 @@
wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status,
cqe->checksum) ? IB_WC_IP_CSUM_OK : 0;
if (!timestamp_en) {
- if (rdma_port_get_link_layer(wc->qp->device,
- (*cur_qp)->port) ==
- IB_LINK_LAYER_ETHERNET)
+ if (is_eth) {
wc->sl = be16_to_cpu(cqe->sl_vid) >> 13;
- else
+ if (be32_to_cpu(cqe->vlan_my_qpn) &
+ MLX4_CQE_CVLAN_PRESENT_MASK) {
+ wc->vlan_id = be16_to_cpu(cqe->sl_vid) &
+ MLX4_CQE_VID_MASK;
+ } else {
+ wc->vlan_id = 0xffff;
+ }
+ memcpy(wc->smac, cqe->smac, ETH_ALEN);
+ wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
+ } else {
wc->sl = be16_to_cpu(cqe->sl_vid) >> 12;
- wc->wc_flags |= IB_WC_WITH_SL;
- }
- if ((be32_to_cpu(cqe->vlan_my_qpn) &
- MLX4_CQE_VLAN_PRESENT_MASK) && !timestamp_en) {
- wc->vlan_id = be16_to_cpu(cqe->sl_vid) &
- MLX4_CQE_VID_MASK;
- wc->wc_flags |= IB_WC_WITH_VLAN;
+ wc->vlan_id = 0xffff;
+ }
} else {
- wc->vlan_id = 0xffff;
- }
- if (!timestamp_en) {
- memcpy(wc->smac, cqe->smac, 6);
- wc->wc_flags |= IB_WC_WITH_SMAC;
+ wc->sl = IB_SL_INVALID;
}
}
@@ -918,8 +966,13 @@
unsigned long flags;
int npolled;
int err = 0;
+ struct mlx4_ib_dev *mdev = to_mdev(cq->ibcq.device);
spin_lock_irqsave(&cq->lock, flags);
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ mlx4_ib_poll_sw_comp(cq, num_entries, wc, &npolled);
+ goto out;
+ }
for (npolled = 0; npolled < num_entries; ++npolled) {
err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled);
@@ -929,6 +982,7 @@
mlx4_cq_set_ci(&cq->mcq);
+out:
spin_unlock_irqrestore(&cq->lock, flags);
if (err == 0 || err == -EAGAIN)
Index: sys/ofed/drivers/infiniband/hw/mlx4/ecn.h
===================================================================
--- /dev/null
+++ sys/ofed/drivers/infiniband/hw/mlx4/ecn.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef MLX4_ECN_H
+#define MLX4_ECN_H
+
+#define CONGESTION_NPRIOS 8
+#define MLX4_CMD_CONGESTION_CTRL_OPCODE 0x68
+#define N_LIST_INDEX 2
+#define VAL_ERROR_VALUE -1
+
+struct mlx4_ib_dev;
+
+enum congestion_control_list_index {
+ LIST_INDEX_QCN,
+ LIST_INDEX_ECN
+};
+
+enum congestion_control_opmod {
+ CONGESTION_CONTROL_GET_PARAMS,
+ CONGESTION_CONTROL_GET_STATISTICS,
+ CONGESTION_CONTROL_GET_GEN_PARAMS,
+ CONGESTION_CONTROL_GET_FLOW_STATE,
+ CONGESTION_CONTROL_SET_PARAMS,
+ CONGESTION_CONTROL_SET_GEN_PARAMS,
+ CONGESTION_CONTROL_SET_FLOW_STATE,
+ CONGESTION_CONTROL_SZ
+};
+
+enum congestion_control_algorithm {
+ CTRL_ALGO_802_1_QAU_REACTION_POINT,
+ CTRL_ALGO_R_ROCE_ECN_1_REACTION_POINT,
+ CTRL_ALGO_R_ROCE_ECN_1_NOTIFICATION_POINT,
+ CTRL_ALGO_SZ
+};
+
+struct congestion_control_inmod {
+ char clear;
+ char algorithm;
+ char priority;
+ char port;
+} __packed;
+
+
+enum congestion_control_r_roce_ecn_rp_modify_enable {
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_DCE_TCP_RTT = 12,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_DCE_TCP_G,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_RATE_TO_SET_ON_FIRST_CNP,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_MARKED_RATIO_SHIFT,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_MARKED_RATIO_MULTIPLIER,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_ALPHA_TO_RATE_COEFF,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_MAX_BYTE_RISE = 20,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_MAX_TIME_RISE,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_RPG_MIN_RATE,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_RPG_MIN_DEC_FAC,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_ALPHA_TO_RATE_SHIFT,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_RPG_HAI_RATE,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_RPG_AI_RATE,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_RPG_MAX_RATE,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_RPG_THRESHOLD,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_RPG_BYTE_RESET,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_RPG_TIME_RESET,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_DISABLE_HAI_STAGE = 49,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_CLAMP_TGT_RATE_AFTER_TIME_INC = 55,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_FORCE_RC_TOS,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_FORCE_UC_TOS,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_FORCE_UD_TOS,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_CLAMP_TGT_RATE = 59,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_FAST_RISE = 61,
+ CONG_CTRL_RROCE_RP_MODIFY_ENABLE_CNP_RECEIVE_ENABLE = 63,
+};
+
+
+enum congestion_control_r_roce_ecn_np_modify_enable {
+ CONG_CTRL_RROCE_NP_MODIFY_ENABLE_CNP_DSCP = 26,
+ CONG_CTRL_RROCE_NP_MODIFY_ENABLE_CNP_802P_PRIO,
+ CONG_CTRL_RROCE_NP_MODIFY_ENABLE_NUM_CONGESTION_CYCLES_TO_KEEP,
+ CONG_CTRL_RROCE_NP_MODIFY_ENABLE_CNP_TIMER,
+ CONG_CTRL_RROCE_NP_MODIFY_ENABLE_NUM_INJECTOR,
+ CONG_CTRL_RROCE_NP_MODIFY_ENABLE_COMPN_ECN_RATE_LIMIT = 62,
+ CONG_CTRL_RROCE_NP_MODIFY_ENABLE_ECN_RECEIVE_ENABLE,
+};
+
+enum congestion_control_r_roce_ecn_np_gen_params_modify_enable {
+ CONG_CTRL_RROCE_NP_GEN_PARAMS_MODIFY_ENABLE_CNP_OPCODE = 28,
+ CONG_CTRL_RROCE_NP_GEN_PARAMS_MODIFY_ENABLE_MIN_LOSSY_BUF_CATCHES,
+ CONG_CTRL_RROCE_NP_GEN_PARAMS_MODIFY_ENABLE_MIN_LOSSLESS_BUF_CATCHES,
+ CONG_CTRL_RROCE_NP_GEN_PARAMS_MODIFY_ENABLE_MAX_TIME_BETWEEN_CATCHES,
+ CONG_CTRL_RROCE_NP_GEN_PARAMS_MODIFY_ENABLE_ECN_EXPECT_IPV6 = 61,
+ CONG_CTRL_RROCE_NP_GEN_PARAMS_MODIFY_ENABLE_ECN_EXPECT_VLAN_TAGGED,
+ CONG_CTRL_RROCE_NP_GEN_PARAMS_MODIFY_ENABLE_ECN_CATCH_RATE_LIMIT_EN,
+};
+
+enum congestion_control_r_roce_ecn_rp_gen_params_modify_enable {
+ CONG_CTRL_RROCE_RP_GEN_PARAMS_MODIFY_ENABLE_CNP_OPCODE = 28,
+};
+
+enum congestion_control_r_roce_rp_extended_enable {
+ CONG_CTRL_R_ROCE_RP_EXTENDED_ENABLE_DISABLE_HAI_STAGE = 17,
+ CONG_CTRL_R_ROCE_RP_EXTENDED_ENABLE_CLAMP_TGT_RATE_AFTER_TIME_INC = 23,
+ CONG_CTRL_R_ROCE_RP_EXTENDED_ENABLE_FORCE_RC_TOS,
+ CONG_CTRL_R_ROCE_RP_EXTENDED_ENABLE_FORCE_UC_TOS,
+ CONG_CTRL_R_ROCE_RP_EXTENDED_ENABLE_FORCE_UD_TOS,
+ CONG_CTRL_R_ROCE_RP_EXTENDED_ENABLE_CLAMP_TGT_RATE,
+ CONG_CTRL_R_ROCE_RP_EXTENDED_ENABLE_FAST_RISE = 29,
+ CONG_CTRL_R_ROCE_RP_EXTENDED_ENABLE_CNP_RECEIVE_ENABLE = 31
+};
+
+enum congestion_control_r_roce_np_extended_enable {
+ CONG_CTRL_R_ROCE_NP_EXTENDED_ENABLE_COMP_ECN_RATE_LIMIT = 30,
+ CONG_CTRL_R_ROCE_NP_EXTENDED_ENABLE_ECN_RECEIVE_ENABLE,
+};
+
+enum congestion_control_r_roce_np_gen_params_extended_enable {
+ CONG_CTRL_R_ROCE_NP_GEN_PARAMS_EXTENDED_ENABLE_ECN_EXPECT_IPV6 = 29,
+ CONG_CTRL_R_ROCE_NP_GEN_PARAMS_EXTENDED_ENABLE_ECN_EXPECT_VLAN_TAGGED,
+ CONG_CTRL_R_ROCE_NP_GEN_PARAMS_EXTENDED_ENABLE_ECN_CATCH_RATE_LIMIT_EN,
+};
+
+struct congestion_control_mb_prio_r_roce_ecn_rp_params {
+ __be64 modify_enable;
+ __be32 reserved1;
+ __be32 extended_enable;
+ __be32 reserved2;
+ __be32 rpg_time_reset;
+ __be32 rpg_byte_reset;
+ __be32 rpg_threshold;
+ __be32 rpg_max_rate;
+ __be32 rpg_ai_rate;
+ __be32 rpg_hai_rate;
+ __be32 alpha_to_rate_shift;
+ __be32 rpg_min_dec_fac;
+ __be32 rpg_min_rate;
+ __be32 max_time_rise;
+ __be32 max_byte_rise;
+ __be32 reserved3[2];
+ __be32 alpha_to_rate_coeff;
+ __be32 marked_ratio_multiplier;
+ __be32 marked_ratio_shift;
+ __be32 rate_to_set_on_first_cnp;
+ __be32 dce_tcp_g;
+ __be32 dce_tcp_rtt;
+ __be32 reserved4[42];
+} __packed;
+
+
+struct congestion_control_mb_prio_r_roce_ecn_np_params {
+ __be64 modify_enable;
+ __be32 reserved1;
+ __be32 extended_enable;
+ __be32 num_injector;
+ __be32 cnp_timer;
+ __be32 cnp_dscp_cnp_802p_prio;
+ __be32 num_congestion_cycle_to_keep;
+ __be32 reserved2[56];
+} __packed;
+
+
+struct congestion_control_r_roce_rp_prio_statistics {
+ __be64 rppp_rp_centiseconds;
+ __be32 reserved1;
+ __be32 ignored_cnp;
+ __be32 allocated_rate_limiter;
+ __be32 estimated_average_total_limiters_rate;
+ __be32 max_active_rate_limiter_index;
+ __be32 dropped_cnps_busy_fw;
+ __be32 current_total_limiters_rate;
+ __be32 cnps_handled;
+ __be32 min_total_limiters_rate;
+ __be32 max_total_limiters_rate;
+ __be32 reserved2[4];
+} __packed;
+
+
+struct congestion_control_r_roce_rp_statistics {
+ struct congestion_control_r_roce_rp_prio_statistics
+ prio[CONGESTION_NPRIOS];
+} __packed;
+
+struct congestion_control_r_roce_np_prio_statistics {
+ __be32 reserved1[2];
+ __be64 ignored_ecn;
+ __be32 allocated_injector;
+ __be32 ecn_disables_due_to_low_buffer;
+ __be64 total_time_micro_seconds_ecn_disabled_due_to_low_buffer;
+ __be32 max_active_cnp_injector_index;
+ __be32 ecn_marked_packets_handled_successfully;
+ __be32 cnps_sent;
+ __be32 reserved2[5];
+} __packed;
+
+struct congestion_control_r_roce_np_statistics {
+ struct congestion_control_r_roce_np_prio_statistics
+ prio[CONGESTION_NPRIOS];
+} __packed;
+
+struct congestion_control_mb_r_roce_ecn_np_gen_params {
+ __be64 modify_enable;
+ __be32 reserved1;
+ __be32 extended_enable;
+ __be32 max_time_between_ecn_catches;
+ __be32 min_lossless_buffer_for_ecn_catches;
+ __be32 cnp_opcode;
+ __be32 min_lossy_buffer_for_ecn_catches;
+ __be32 reserved2[8];
+} __packed;
+
+
+struct congestion_control_statistics {
+ struct {
+ __be32 prio_space[0x10];
+ } prio[CONGESTION_NPRIOS] __packed;
+ __be32 gen[0x80];
+} __packed;
+
+union congestion_control_statistics_mailbox {
+ struct congestion_control_r_roce_rp_statistics s_rroce_rp;
+ struct congestion_control_r_roce_np_statistics s_rroce_np;
+ struct congestion_control_statistics s;
+};
+
+union congestion_control_gen_params_mailbox {
+ __be64 modify_enable;
+ struct congestion_control_mb_r_roce_ecn_np_gen_params p_rroce_np;
+ __be32 mb_size[16];
+};
+
+union congestion_control_params_mailbox {
+ __be64 modify_enable;
+ struct congestion_control_mb_prio_r_roce_ecn_rp_params p_rroce_rp;
+ struct congestion_control_mb_prio_r_roce_ecn_np_params p_rroce_np;
+};
+
+union congestion_control_out_mailbox {
+ union congestion_control_statistics_mailbox statistics_mb;
+ union congestion_control_params_mailbox params_mb;
+ union congestion_control_gen_params_mailbox gen_params_mb;
+};
+
+union congestion_control_in_mailbox {
+ union congestion_control_params_mailbox params_mb;
+ union congestion_control_gen_params_mailbox gen_params_mb;
+};
+
+struct ecn_control {
+ u8 ecn[MLX4_MAX_PORTS][CTRL_ALGO_SZ][CONGESTION_NPRIOS];
+ struct list_head (*ecn_list)[CONGESTION_NPRIOS][N_LIST_INDEX];
+};
+
+int ecn_enabled(struct mlx4_ib_dev *dev, u8 port, u8 prio);
+
+#endif
Index: sys/ofed/drivers/infiniband/hw/mlx4/ecn.c
===================================================================
--- /dev/null
+++ sys/ofed/drivers/infiniband/hw/mlx4/ecn.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/mlx4/cmd.h>
+#include "mlx4_ib.h"
+#include "ecn.h"
+
+int mlx4_congestion_control(struct mlx4_ib_dev *dev,
+ enum congestion_control_opmod opmod,
+ u8 port, u8 priority, u8 algorithm, u8 clear,
+ union congestion_control_in_mailbox *in_mb,
+ union congestion_control_out_mailbox *out_mb,
+ u32 out_mb_size, u32 in_mb_size)
+{
+ struct mlx4_cmd_mailbox *mailbox_in = NULL;
+ u64 mailbox_in_dma = 0;
+ int err = 0;
+ u32 inmod = (port + 1) | (priority << 8) |
+ (algorithm << 16) |
+ ((clear & 1) << 31);
+
+ if (in_mb != NULL) {
+ mailbox_in = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(mailbox_in))
+ return -1;
+ mailbox_in_dma = mailbox_in->dma;
+ memcpy(mailbox_in->buf, in_mb, in_mb_size);
+ }
+
+ if (out_mb != NULL) {
+ struct mlx4_cmd_mailbox *mailbox_out;
+
+ mailbox_out = mlx4_alloc_cmd_mailbox(dev->dev);
+
+ if (IS_ERR(mailbox_out)) {
+ err = -1;
+ goto out1;
+ }
+ err = mlx4_cmd_box(dev->dev, mailbox_in_dma, mailbox_out->dma,
+ inmod, opmod,
+ MLX4_CMD_CONGESTION_CTRL_OPCODE,
+ MLX4_CMD_TIME_CLASS_C,
+ MLX4_CMD_WRAPPED);
+ if (!err)
+ memcpy(out_mb, mailbox_out->buf, out_mb_size);
+
+ mlx4_free_cmd_mailbox(dev->dev, mailbox_out);
+
+ } else {
+ err = mlx4_cmd(dev->dev, mailbox_in_dma, inmod,
+ opmod, MLX4_CMD_CONGESTION_CTRL_OPCODE,
+ MLX4_CMD_TIME_CLASS_C,
+ MLX4_CMD_WRAPPED);
+ }
+out1:
+ if (mailbox_in != NULL)
+ mlx4_free_cmd_mailbox(dev->dev, mailbox_in);
+
+ return err;
+}
+
+int mlx4_congestion_control_get_params(struct mlx4_ib_dev *dev,
+ u8 port, u8 priority, u8 algorithm,
+ u8 clear,
+ union
+ congestion_control_params_mailbox
+ *out_mb) {
+ return mlx4_congestion_control(dev, CONGESTION_CONTROL_GET_PARAMS,
+ port, priority, algorithm, clear,
+ NULL,
+ (union congestion_control_out_mailbox *)
+ out_mb, sizeof(*out_mb), 0);
+}
+
+
+int mlx4_congestion_control_get_statistics(struct mlx4_ib_dev *dev,
+ u8 port, u8 algorithm,
+ u8 clear,
+ union
+ congestion_control_statistics_mailbox
+ *out_mb) {
+ return mlx4_congestion_control(dev, CONGESTION_CONTROL_GET_STATISTICS,
+ port, 0, algorithm, clear,
+ NULL,
+ (union congestion_control_out_mailbox *)
+ out_mb, sizeof(*out_mb), 0);
+}
+
+
+int mlx4_congestion_control_set_params(struct mlx4_ib_dev *dev,
+ u8 port, u8 priority, u8 algorithm,
+ u8 clear,
+ union
+ congestion_control_params_mailbox
+ *in_mb) {
+ return mlx4_congestion_control(dev, CONGESTION_CONTROL_SET_PARAMS,
+ port, priority, algorithm, clear,
+ (union congestion_control_in_mailbox *)
+ in_mb, NULL, 0, sizeof(*in_mb));
+}
+
+int mlx4_congestion_control_set_gen_params(struct mlx4_ib_dev *dev,
+ u8 algorithm,
+ union
+ congestion_control_gen_params_mailbox
+ *in_mb) {
+ return mlx4_congestion_control(dev, CONGESTION_CONTROL_SET_GEN_PARAMS,
+ 0, 0, algorithm, 0,
+ (union congestion_control_in_mailbox *)
+ in_mb, NULL, 0, sizeof(*in_mb));
+}
+
+int mlx4_congestion_control_get_gen_params(struct mlx4_ib_dev *dev,
+ u8 algorithm,
+ union
+ congestion_control_gen_params_mailbox
+ *out_mb) {
+ return mlx4_congestion_control(dev, CONGESTION_CONTROL_GET_GEN_PARAMS,
+ 0, 0, algorithm, 0,
+ NULL,
+ (union congestion_control_out_mailbox *)
+ out_mb, sizeof(*out_mb), 0);
+}
+
+int ecn_enabled(struct mlx4_ib_dev *dev, u8 port, u8 prio)
+{
+ int i;
+
+ for (i = 1; i < CTRL_ALGO_SZ; i++) {
+ if (dev->cong.ecn[port - 1][i][prio])
+ return 1;
+ }
+ return 0;
+}
Index: sys/ofed/drivers/infiniband/hw/mlx4/mad.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/mad.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/mad.c
@@ -228,6 +228,8 @@
mad->mad_hdr.method == IB_MGMT_METHOD_SET)
switch (mad->mad_hdr.attr_id) {
case IB_SMP_ATTR_PORT_INFO:
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)
+ return;
pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data;
lid = be16_to_cpu(pinfo->lid);
@@ -243,6 +245,8 @@
break;
case IB_SMP_ATTR_PKEY_TABLE:
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)
+ return;
if (!mlx4_is_mfunc(dev->dev)) {
mlx4_ib_dispatch_event(dev, port_num,
IB_EVENT_PKEY_CHANGE);
@@ -279,6 +283,8 @@
break;
case IB_SMP_ATTR_GUID_INFO:
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV)
+ return;
/* paravirtualized master's guid is guid 0 -- does not change */
if (!mlx4_is_master(dev->dev))
mlx4_ib_dispatch_event(dev, port_num,
@@ -294,6 +300,25 @@
}
break;
+ case IB_SMP_ATTR_SL_TO_VL_TABLE:
+ /* cache sl to vl mapping changes for use in
+ * filling QP1 LRH VL field when sending packets
+ */
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV &&
+ dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT)
+ return;
+ if (!mlx4_is_slave(dev->dev)) {
+ union sl2vl_tbl_to_u64 sl2vl64;
+ int jj;
+
+ for (jj = 0; jj < 8; jj++) {
+ sl2vl64.sl8[jj] = ((struct ib_smp *)mad)->data[jj];
+ pr_debug("sl2vl[%d] = %02x\n", jj, sl2vl64.sl8[jj]);
+ }
+ atomic64_set(&dev->sl2vl[port_num - 1], sl2vl64.sl64);
+ }
+ break;
+
default:
break;
}
@@ -450,6 +475,21 @@
return -EINVAL;
}
+static int get_gids_from_l3_hdr(struct ib_grh *grh, union ib_gid *sgid, union ib_gid *dgid)
+{
+ int version = ib_get_grh_header_version((void *)grh);
+ enum rdma_network_type net_type;
+
+ if (version == 4)
+ net_type = RDMA_NETWORK_IPV4;
+ else if (version == 6)
+ net_type = RDMA_NETWORK_IPV6;
+ else
+ return -EINVAL;
+
+ return ib_get_gids_from_grh(grh, net_type, sgid, dgid);
+}
+
int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
enum ib_qp_type dest_qpt, struct ib_wc *wc,
struct ib_grh *grh, struct ib_mad *mad)
@@ -478,10 +518,6 @@
if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE)
return -EAGAIN;
- /* QP0 forwarding only for Dom0 */
- if (!dest_qpt && (mlx4_master_func_num(dev->dev) != slave))
- return -EINVAL;
-
if (!dest_qpt)
tun_qp = &tun_ctx->qp[0];
else
@@ -511,7 +547,10 @@
memset(&attr, 0, sizeof attr);
attr.port_num = port;
if (is_eth) {
- memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16);
+ union ib_gid sgid;
+
+ if (get_gids_from_l3_hdr(grh, &sgid, &attr.grh.dgid))
+ return -EINVAL;
attr.ah_flags = IB_AH_GRH;
}
ah = ib_create_ah(tun_ctx->pd, &attr);
@@ -527,7 +566,7 @@
tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
spin_unlock(&tun_qp->tx_lock);
if (ret)
- goto out;
+ goto end;
tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr);
if (tun_qp->tx_ring[tun_tx_ix].ah)
@@ -550,15 +589,27 @@
if (is_eth) {
u16 vlan = 0;
+
+ /* In VGT+ mode drop mads if VLAN is not allowed */
+ ret = mlx4_vlan_blocked(dev->dev, port, slave, wc->vlan_id);
+ if (ret)
+ goto out;
+
if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan,
- NULL)) {
- if (vlan != wc->vlan_id)
- /* VST and default vlan is not the packet vlan drop the
- * packet*/
+ NULL)) {
+ /* VST mode */
+ if (vlan != wc->vlan_id) {
+ /* Packet vlan is not the VST-assigned vlan.
+ * Drop the packet.
+ */
+ ret = -EPERM;
goto out;
- else
- /* VST , remove hide the vlan from the VF */
- vlan = 0;
+ } else {
+ /* Remove the vlan tag before forwarding
+ * the packet to the VF.
+ */
+ vlan = 0xffff;
+ }
} else {
vlan = wc->vlan_id;
}
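
For readability, the VLAN handling that this hunk adds to mlx4_ib_send_to_slave() can be restated as a standalone helper. The helper name is hypothetical; the two mlx4 calls and the 0xffff "no VLAN" convention are taken from the diff itself.

    static int pick_tunnel_vlan(struct mlx4_dev *dev, u8 port, int slave,
                                u16 pkt_vlan, u16 *out_vlan)
    {
            u16 vst_vlan;

            /* VGT+: drop if the VLAN is not on the slave's allowed list */
            if (mlx4_vlan_blocked(dev, port, slave, pkt_vlan))
                    return -EPERM;
            if (mlx4_get_slave_default_vlan(dev, port, slave, &vst_vlan, NULL)) {
                    /* VST: packet VLAN must match the assigned VLAN ... */
                    if (vst_vlan != pkt_vlan)
                            return -EPERM;
                    /* ... and the tag is hidden from the VF */
                    *out_vlan = 0xffff;
            } else {
                    /* VGT: forward the packet's own VLAN unchanged */
                    *out_vlan = pkt_vlan;
            }
            return 0;
    }
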
@@ -592,9 +643,15 @@
wr.send_flags = IB_SEND_SIGNALED;
ret = ib_post_send(src_qp, &wr, &bad_wr);
+ if (!ret)
+ return 0;
out:
- if (ret)
- ib_destroy_ah(ah);
+ spin_lock(&tun_qp->tx_lock);
+ tun_qp->tx_ix_tail++;
+ spin_unlock(&tun_qp->tx_lock);
+ tun_qp->tx_ring[tun_tx_ix].ah = NULL;
+end:
+ ib_destroy_ah(ah);
return ret;
}
@@ -614,7 +671,12 @@
is_eth = 1;
if (is_eth) {
- if (!wc->wc_flags & IB_WC_GRH) {
+ union ib_gid dgid;
+ union ib_gid sgid;
+
+ if (get_gids_from_l3_hdr(grh, &sgid, &dgid))
+ return -EINVAL;
+ if (!(wc->wc_flags & IB_WC_GRH)) {
mlx4_ib_warn(ibdev, "RoCE grh not present.\n");
return -EINVAL;
}
@@ -622,7 +684,7 @@
mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n");
return -EINVAL;
}
- if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) {
+ if (mlx4_get_slave_from_roce_gid(dev->dev, port, dgid.raw, &slave)) {
mlx4_ib_warn(ibdev, "failed matching grh\n");
return -ENOENT;
}
@@ -632,7 +694,7 @@
return -ENOENT;
}
- if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad, is_eth))
+ if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad))
return 0;
err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
@@ -655,21 +717,44 @@
/* If a grh is present, we demux according to it */
if (wc->wc_flags & IB_WC_GRH) {
- slave = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id);
- if (slave < 0) {
- mlx4_ib_warn(ibdev, "failed matching grh\n");
- return -ENOENT;
+ if (grh->dgid.global.interface_id ==
+ cpu_to_be64(IB_SA_WELL_KNOWN_GUID) &&
+ grh->dgid.global.subnet_prefix ==
+ cpu_to_be64(IB_SA_WELL_KNOWN_GID_PREFIX)) {
+ slave = 0;
+ } else {
+ slave = mlx4_ib_find_real_gid(ibdev, port,
+ grh->dgid.global.interface_id);
+ if (slave < 0) {
+ mlx4_ib_warn(ibdev, "failed matching grh\n");
+ return -ENOENT;
+ }
}
}
/* Class-specific handling */
switch (mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ /* 255 indicates the dom0 */
+ if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) {
+ if (!mlx4_vf_smi_enabled(dev->dev, slave, port))
+ return -EPERM;
+ /* For a VF, drop unsolicited MADs */
+ if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) {
+ mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n",
+ slave, mad->mad_hdr.mgmt_class,
+ mad->mad_hdr.method);
+ return -EINVAL;
+ }
+ }
+ break;
case IB_MGMT_CLASS_SUBN_ADM:
if (mlx4_ib_demux_sa_handler(ibdev, port, slave,
(struct ib_sa_mad *) mad))
return 0;
break;
case IB_MGMT_CLASS_CM:
- if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad, is_eth))
+ if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad))
return 0;
break;
case IB_MGMT_CLASS_DEVICE_MGMT:
@@ -770,8 +855,7 @@
return IB_MAD_RESULT_FAILURE;
if (!out_mad->mad_hdr.status) {
- if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV))
- smp_snoop(ibdev, port_num, in_mad, prev_lid);
+ smp_snoop(ibdev, port_num, in_mad, prev_lid);
/* slaves get node desc from FW */
if (!mlx4_is_slave(to_mdev(ibdev)->dev))
node_desc_override(ibdev, out_mad);
@@ -796,49 +880,49 @@
{
struct ib_pma_portcounters *pma_cnt =
(struct ib_pma_portcounters *)counters;
- pma_cnt->port_xmit_data =
- cpu_to_be32((be64_to_cpu(cnt->counters[0].
- IfTxUnicastOctets) +
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data,
+ (be64_to_cpu(cnt->counters[0].
+ IfTxUnicastOctets) +
+ be64_to_cpu(cnt->counters[0].
+ IfTxMulticastOctets) +
+ be64_to_cpu(cnt->counters[0].
+ IfTxBroadcastOctets) +
+ be64_to_cpu(cnt->counters[0].
+ IfTxDroppedOctets)) >> 2);
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data,
+ (be64_to_cpu(cnt->counters[0].
+ IfRxUnicastOctets) +
+ be64_to_cpu(cnt->counters[0].
+ IfRxMulticastOctets) +
+ be64_to_cpu(cnt->counters[0].
+ IfRxBroadcastOctets) +
+ be64_to_cpu(cnt->counters[0].
+ IfRxNoBufferOctets) +
+ be64_to_cpu(cnt->counters[0].
+ IfRxErrorOctets)) >> 2);
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets,
be64_to_cpu(cnt->counters[0].
- IfTxMulticastOctets) +
+ IfTxUnicastFrames) +
be64_to_cpu(cnt->counters[0].
- IfTxBroadcastOctets) +
+ IfTxMulticastFrames) +
be64_to_cpu(cnt->counters[0].
- IfTxDroppedOctets)) >> 2);
- pma_cnt->port_rcv_data =
- cpu_to_be32((be64_to_cpu(cnt->counters[0].
- IfRxUnicastOctets) +
+ IfTxBroadcastFrames) +
be64_to_cpu(cnt->counters[0].
- IfRxMulticastOctets) +
+ IfTxDroppedFrames));
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets,
be64_to_cpu(cnt->counters[0].
- IfRxBroadcastOctets) +
+ IfRxUnicastFrames) +
be64_to_cpu(cnt->counters[0].
- IfRxNoBufferOctets) +
+ IfRxMulticastFrames) +
be64_to_cpu(cnt->counters[0].
- IfRxErrorOctets)) >> 2);
- pma_cnt->port_xmit_packets =
- cpu_to_be32(be64_to_cpu(cnt->counters[0].
- IfTxUnicastFrames) +
- be64_to_cpu(cnt->counters[0].
- IfTxMulticastFrames) +
- be64_to_cpu(cnt->counters[0].
- IfTxBroadcastFrames) +
- be64_to_cpu(cnt->counters[0].
- IfTxDroppedFrames));
- pma_cnt->port_rcv_packets =
- cpu_to_be32(be64_to_cpu(cnt->counters[0].
- IfRxUnicastFrames) +
- be64_to_cpu(cnt->counters[0].
- IfRxMulticastFrames) +
- be64_to_cpu(cnt->counters[0].
- IfRxBroadcastFrames) +
- be64_to_cpu(cnt->counters[0].
- IfRxNoBufferFrames) +
- be64_to_cpu(cnt->counters[0].
- IfRxErrorFrames));
- pma_cnt->port_rcv_errors = cpu_to_be32(be64_to_cpu(cnt->
- counters[0].
- IfRxErrorFrames));
+ IfRxBroadcastFrames) +
+ be64_to_cpu(cnt->counters[0].
+ IfRxNoBufferFrames) +
+ be64_to_cpu(cnt->counters[0].
+ IfRxErrorFrames));
+ ASSIGN_16BIT_COUNTER(pma_cnt->port_rcv_errors,
+ be64_to_cpu(cnt->counters[0].
+ IfRxErrorFrames));
break;
}
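
The conversion above replaces the raw cpu_to_be32() stores with ASSIGN_32BIT_COUNTER()/ASSIGN_16BIT_COUNTER(), which saturate the 64-bit HCA flow counters into the narrower big-endian PMA fields instead of silently truncating them. The >> 2 on the octet counters converts bytes to the 32-bit-word units that PortCounters PortXmitData/PortRcvData use. A sketch of what such a macro is assumed to look like (the real definition lives elsewhere in the driver and may differ in detail):

    #define ASSIGN_32BIT_COUNTER_SKETCH(counter, value)            \
            do {                                                   \
                    if ((value) > (u64)0xffffffffUL)               \
                            (counter) = cpu_to_be32(0xffffffffUL); \
                    else                                           \
                            (counter) = cpu_to_be32(value);        \
            } while (0)
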
@@ -920,16 +1004,16 @@
{
struct ib_pma_portcounters *pma_cnt =
(struct ib_pma_portcounters *) counters;
- pma_cnt->port_xmit_data =
- cpu_to_be32(be64_to_cpu(
- cnt->counters[0].IfTxOctets) >> 2);
- pma_cnt->port_rcv_data =
- cpu_to_be32(be64_to_cpu(
- cnt->counters[0].IfRxOctets) >> 2);
- pma_cnt->port_xmit_packets =
- cpu_to_be32(be64_to_cpu(cnt->counters[0].IfTxFrames));
- pma_cnt->port_rcv_packets =
- cpu_to_be32(be64_to_cpu(cnt->counters[0].IfRxFrames));
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data,
+ (be64_to_cpu(cnt->counters[0].
+ IfTxOctets) >> 2));
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data,
+ (be64_to_cpu(cnt->counters[0].
+ IfRxOctets) >> 2));
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets,
+ be64_to_cpu(cnt->counters[0].IfTxFrames));
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets,
+ be64_to_cpu(cnt->counters[0].IfRxFrames));
break;
}
case IB_PMA_PORT_COUNTERS_EXT:
@@ -1025,16 +1109,29 @@
struct ib_wc *in_wc, struct ib_grh *in_grh,
struct ib_mad *in_mad, struct ib_mad *out_mad)
{
- switch (rdma_port_get_link_layer(ibdev, port_num)) {
- case IB_LINK_LAYER_INFINIBAND:
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ enum rdma_link_layer link = rdma_port_get_link_layer(ibdev, port_num);
+
+ /* iboe_process_mad(), which uses the HCA flow counters to implement IB PMA
+ * queries, should be called only by VFs and only for that specific purpose.
+ */
+ if (link == IB_LINK_LAYER_INFINIBAND) {
+ if (mlx4_is_slave(dev->dev) &&
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT &&
+ (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS ||
+ in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT))
+ return iboe_process_mad(ibdev, mad_flags, port_num, in_wc,
+ in_grh, in_mad, out_mad);
+
return ib_process_mad(ibdev, mad_flags, port_num, in_wc,
in_grh, in_mad, out_mad);
- case IB_LINK_LAYER_ETHERNET:
- return iboe_process_mad(ibdev, mad_flags, port_num, in_wc,
- in_grh, in_mad, out_mad);
- default:
- return -EINVAL;
}
+
+ if (link == IB_LINK_LAYER_ETHERNET)
+ return iboe_process_mad(ibdev, mad_flags, port_num, in_wc,
+ in_grh, in_mad, out_mad);
+
+ return -EINVAL;
}
static void send_handler(struct ib_mad_agent *agent,
@@ -1121,6 +1218,23 @@
MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK, 0, 0);
}
}
+
+ /* Update the sl to vl table from inside client rereg
+ * only if in secure-host mode (so no snooping occurs)
+ * and the sl-to-vl change event is not generated by FW.
+ */
+ if (!mlx4_is_slave(dev->dev) &&
+ dev->dev->flags & MLX4_FLAG_SECURE_HOST &&
+ !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT)) {
+ if (mlx4_is_master(dev->dev))
+ /* already in work queue from mlx4_ib_event queueing
+ * mlx4_handle_port_mgmt_change_event, which calls
+ * this procedure. Therefore, call sl2vl_update directly.
+ */
+ mlx4_ib_sl2vl_update(dev, port_num);
+ else
+ mlx4_sched_ib_sl2vl_update_work(dev, port_num);
+ }
mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER);
}
@@ -1204,11 +1318,6 @@
u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid);
u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf;
update_sm_ah(dev, port, lid, sl);
- mlx4_ib_dispatch_event(dev, port, IB_EVENT_SM_CHANGE);
- if (mlx4_is_master(dev->dev))
- mlx4_gen_slaves_port_mgt_ev(dev->dev, port,
- changed_attr & MSTR_SM_CHANGE_MASK,
- lid, sl);
}
/* Check if it is a lid change event */
@@ -1244,6 +1353,23 @@
handle_slaves_guid_change(dev, port, tbl_block, change_bitmap);
}
break;
+
+ case MLX4_DEV_PMC_SUBTYPE_SL_TO_VL_MAP:
+ /* cache sl to vl mapping changes for use in
+ * filling QP1 LRH VL field when sending packets
+ */
+ if (!mlx4_is_slave(dev->dev)) {
+ union sl2vl_tbl_to_u64 sl2vl64;
+ int jj;
+
+ for (jj = 0; jj < 8; jj++) {
+ sl2vl64.sl8[jj] =
+ eqe->event.port_mgmt_change.params.sl2vl_tbl_change_info.sl2vl_table[jj];
+ pr_debug("sl2vl[%d] = %02x\n", jj, sl2vl64.sl8[jj]);
+ }
+ atomic64_set(&dev->sl2vl[port - 1], sl2vl64.sl64);
+ }
+ break;
default:
pr_warn("Unsupported subtype 0x%x for "
"Port Management Change event\n", eqe->subtype);
@@ -1327,7 +1453,7 @@
int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
enum ib_qp_type dest_qpt, u16 pkey_index,
u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr,
- u8 *s_mac, struct ib_mad *mad)
+ u8 *s_mac, u16 vlan_id, struct ib_mad *mad)
{
struct ib_sge list;
struct ib_send_wr wr, *bad_wr;
@@ -1349,10 +1475,6 @@
if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE)
return -EAGAIN;
- /* QP0 forwarding only for Dom0 */
- if (dest_qpt == IB_QPT_SMI && (mlx4_master_func_num(dev->dev) != slave))
- return -EINVAL;
-
if (dest_qpt == IB_QPT_SMI) {
src_qpnum = 0;
sqp = &sqp_ctx->qp[0];
@@ -1418,43 +1540,38 @@
wr.send_flags = IB_SEND_SIGNALED;
if (s_mac)
memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6);
+ if (vlan_id < 0x1000)
+ vlan_id |= (attr->sl & 7) << 13;
+ to_mah(ah)->av.eth.vlan = cpu_to_be16(vlan_id);
ret = ib_post_send(send_qp, &wr, &bad_wr);
+ if (!ret)
+ return 0;
+
+ spin_lock(&sqp->tx_lock);
+ sqp->tx_ix_tail++;
+ spin_unlock(&sqp->tx_lock);
+ sqp->tx_ring[wire_tx_ix].ah = NULL;
out:
- if (ret)
- ib_destroy_ah(ah);
+ ib_destroy_ah(ah);
return ret;
}
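
The new vlan_id handling in mlx4_ib_send_to_wire() builds an 802.1Q TCI by reusing the IB SL as the priority code point: the 12-bit VID stays in bits 11..0 and (sl & 7) lands in the PCP field, bits 15..13 (a value >= 0x1000, such as the 0xffff sentinel, means "no VLAN" and is written unchanged). For example, vlan_id = 100 (0x064) with sl = 5 yields 0x064 | (5 << 13) = 0xa064. Restated as a hypothetical helper:

    static inline __be16 build_vlan_tci(u16 vid, u8 sl)
    {
            u16 tci = vid;

            if (vid < 0x1000)               /* valid 12-bit VID */
                    tci |= (sl & 7) << 13;  /* SL -> 802.1Q PCP */
            return cpu_to_be16(tci);
    }
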
static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port)
{
- int gids;
- int vfs;
-
if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND)
return slave;
-
- gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS;
- vfs = dev->dev->num_vfs;
-
- if (slave == 0)
- return 0;
- if (slave <= gids % vfs)
- return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave - 1);
-
- return MLX4_ROCE_PF_GIDS + (gids % vfs) + ((gids / vfs) * (slave - 1));
+ return mlx4_get_base_gid_ix(dev->dev, slave, port);
}
-static int get_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port,
- struct ib_ah_attr *ah_attr)
+static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port,
+ struct ib_ah_attr *ah_attr)
{
- if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND) {
+ if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND)
ah_attr->grh.sgid_index = slave;
- return 0;
- }
- ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port);
- return 0;
+ else
+ ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port);
}
static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc)
@@ -1467,6 +1584,8 @@
struct ib_ah_attr ah_attr;
u8 *slave_id;
int slave;
+ int port;
+ u16 vlan_id;
/* Get slave that sent this packet */
if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn ||
@@ -1482,11 +1601,6 @@
"belongs to another slave\n", wc->src_qp);
return;
}
- if (slave != mlx4_master_func_num(dev->dev) && !(wc->src_qp & 0x2)) {
- mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: "
- "non-master trying to send QP0 packets\n", wc->src_qp);
- return;
- }
/* Map transaction ID */
ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map,
@@ -1514,6 +1628,12 @@
/* Class-specific handling */
switch (tunnel->mad.mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ if (slave != mlx4_master_func_num(dev->dev) &&
+ !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port))
+ return;
+ break;
case IB_MGMT_CLASS_SUBN_ADM:
if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave,
(struct ib_sa_mad *) &tunnel->mad))
@@ -1544,13 +1664,22 @@
ah.ibah.device = ctx->ib_dev;
mlx4_ib_query_ah(&ah.ibah, &ah_attr);
if (ah_attr.ah_flags & IB_AH_GRH)
- if (get_real_sgid_index(dev, slave, ctx->port, &ah_attr))
- return;
+ fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr);
+
+ port = mlx4_slave_convert_port(dev->dev, slave, ah_attr.port_num);
+ if (port < 0)
+ return;
+ ah_attr.port_num = port;
memcpy(ah_attr.dmac, tunnel->hdr.mac, 6);
- ah_attr.vlan_id = tunnel->hdr.vlan;
+ vlan_id = be16_to_cpu(tunnel->hdr.vlan);
+
+ /* In VGT+ mode, drop MADs if the VLAN is not allowed */
+ if (mlx4_vlan_blocked(dev->dev, ctx->port, slave, vlan_id))
+ return;
+
 /* if the slave has a default vlan, use it */
mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave,
- &ah_attr.vlan_id, &ah_attr.sl);
+ &vlan_id, &ah_attr.sl);
mlx4_ib_send_to_wire(dev, slave, ctx->port,
is_proxy_qp0(dev, wc->src_qp, slave) ?
@@ -1558,7 +1687,7 @@
be16_to_cpu(tunnel->hdr.pkey_index),
be32_to_cpu(tunnel->hdr.remote_qpn),
be32_to_cpu(tunnel->hdr.qkey),
- &ah_attr, wc->smac, &tunnel->mad);
+ &ah_attr, wc->smac, vlan_id, &tunnel->mad);
}
static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
@@ -1603,12 +1732,6 @@
tun_qp->ring[i].addr,
rx_buf_size,
DMA_FROM_DEVICE);
- if (unlikely(ib_dma_mapping_error(ctx->ib_dev,
- tun_qp->ring[i].map))) {
- mlx4_ib_warn(ctx->ib_dev, "ib_dma_map_single failed\n");
- kfree(tun_qp->ring[i].addr);
- goto err;
- }
}
for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
@@ -1621,12 +1744,6 @@
tun_qp->tx_ring[i].buf.addr,
tx_buf_size,
DMA_TO_DEVICE);
- if (unlikely(ib_dma_mapping_error(ctx->ib_dev,
- tun_qp->tx_ring[i].buf.map))) {
- mlx4_ib_warn(ctx->ib_dev, "ib_dma_map_single failed\n");
- kfree(tun_qp->tx_ring[i].buf.addr);
- goto tx_err;
- }
tun_qp->tx_ring[i].ah = NULL;
}
spin_lock_init(&tun_qp->tx_lock);
@@ -1783,7 +1900,8 @@
qp_init_attr.init_attr.cap.max_recv_sge = 1;
if (create_tun) {
qp_init_attr.init_attr.qp_type = IB_QPT_UD;
- qp_init_attr.init_attr.create_flags = (enum ib_qp_create_flags)MLX4_IB_SRIOV_TUNNEL_QP;
+ qp_init_attr.init_attr.create_flags =
+ (enum ib_qp_create_flags)MLX4_IB_SRIOV_TUNNEL_QP;
qp_init_attr.port = ctx->port;
qp_init_attr.slave = ctx->slave;
qp_init_attr.proxy_qp_type = qp_type;
@@ -1791,7 +1909,8 @@
IB_QP_QKEY | IB_QP_PORT;
} else {
qp_init_attr.init_attr.qp_type = qp_type;
- qp_init_attr.init_attr.create_flags = (enum ib_qp_create_flags)MLX4_IB_SRIOV_SQP;
+ qp_init_attr.init_attr.create_flags =
+ (enum ib_qp_create_flags)MLX4_IB_SRIOV_SQP;
qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY;
}
qp_init_attr.init_attr.port_num = ctx->port;
@@ -1811,7 +1930,8 @@
ret = 0;
if (create_tun)
ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave,
- ctx->port, 0xFFFF, &attr.pkey_index);
+ ctx->port, IB_DEFAULT_PKEY_FULL,
+ &attr.pkey_index);
if (ret || !create_tun)
attr.pkey_index =
to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0];
@@ -1953,9 +2073,9 @@
return -EEXIST;
ctx->state = DEMUX_PV_STATE_STARTING;
- /* have QP0 only on port owner, and only if link layer is IB */
- if (ctx->slave == mlx4_master_func_num(to_mdev(ctx->ib_dev)->dev) &&
- rdma_port_get_link_layer(ibdev, ctx->port) == IB_LINK_LAYER_INFINIBAND)
+ /* have QP0 only if link layer is IB */
+ if (rdma_port_get_link_layer(ibdev, ctx->port) ==
+ IB_LINK_LAYER_INFINIBAND)
ctx->has_smi = 1;
if (ctx->has_smi) {
@@ -2146,7 +2266,16 @@
ctx->port = port;
ctx->ib_dev = &dev->ib_dev;
- for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ for (i = 0;
+ i < min(dev->dev->caps.sqp_demux,
+ (u16)(dev->dev->persist->num_vfs + 1));
+ i++) {
+ struct mlx4_active_ports actv_ports =
+ mlx4_get_active_ports(dev->dev, i);
+
+ if (!test_bit(port - 1, actv_ports.ports))
+ continue;
+
ret = alloc_pv_object(dev, i, port, &ctx->tun[i]);
if (ret) {
ret = -ENOMEM;
@@ -2301,16 +2430,17 @@
goto demux_err;
err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1);
if (err)
- goto demux_err;
+ goto free_pv;
}
mlx4_ib_master_tunnels(dev, 1);
return 0;
+free_pv:
+ free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1);
demux_err:
- while (i > 0) {
+ while (--i >= 0) {
free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1);
mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
- --i;
}
mlx4_ib_device_unregister_sysfs(dev);
Index: sys/ofed/drivers/infiniband/hw/mlx4/main.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/main.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/main.c
@@ -34,19 +34,25 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/errno.h>
+#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/if_vlan.h>
#include <linux/fs.h>
+#include <linux/rcupdate.h>
+#include <linux/notifier.h>
+
#include <net/ipv6.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_user_verbs_exp.h>
#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
#include <linux/mlx4/driver.h>
#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
#include <linux/sched.h>
#include <linux/page.h>
#include <linux/printk.h>
@@ -56,13 +62,14 @@
#include "wc.h"
#define DRV_NAME MLX4_IB_DRV_NAME
-#define DRV_VERSION "1.0"
-#define DRV_RELDATE __DATE__
+#define DRV_VERSION "2.2-1"
+#define DRV_RELDATE "Feb 2014"
#define MLX4_IB_DRIVER_PROC_DIR_NAME "driver/mlx4_ib"
#define MLX4_IB_MRS_PROC_DIR_NAME "mrs"
#define MLX4_IB_FLOW_MAX_PRIO 0xFFF
#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF
+#define MLX4_IB_CARD_REV_A0 0xA0
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
@@ -102,13 +109,6 @@
DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
DRV_VERSION " (" DRV_RELDATE ")\n";
-struct update_gid_work {
- struct work_struct work;
- union ib_gid gids[128];
- struct mlx4_ib_dev *dev;
- int port;
-};
-
struct dev_rec {
int bus;
int dev;
@@ -123,8 +123,8 @@
static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device*,
unsigned long);
-static u8 mlx4_ib_get_dev_port(struct net_device *dev,
- struct mlx4_ib_dev *ibdev);
+static int _mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid,
+ int count);
static struct workqueue_struct *wq;
@@ -136,12 +136,11 @@
mad->method = IB_MGMT_METHOD_GET;
}
-static union ib_gid zgid;
-
static int check_flow_steering_support(struct mlx4_dev *dev)
{
int eth_num_ports = 0;
int ib_num_ports = 0;
+
int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED;
if (dmfs) {
@@ -151,16 +150,30 @@
mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
ib_num_ports++;
dmfs &= (!ib_num_ports ||
- (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) &&
+ (dev->caps.steering_attr &
+ MLX4_STEERING_ATTR_DMFS_IPOIB)) &&
(!eth_num_ports ||
- (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN));
+ (dev->caps.steering_attr &
+ MLX4_STEERING_ATTR_DMFS_EN));
if (ib_num_ports && mlx4_is_mfunc(dev)) {
+ pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n");
dmfs = 0;
}
}
return dmfs;
}
+static int num_ib_ports(struct mlx4_dev *dev)
+{
+ int ib_ports = 0;
+ int i;
+
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ ib_ports++;
+
+ return ib_ports;
+}
+
int mlx4_ib_query_device(struct ib_device *ibdev,
struct ib_device_attr *props)
{
@@ -168,6 +181,7 @@
struct ib_smp *in_mad = NULL;
struct ib_smp *out_mad = NULL;
int err = -ENOMEM;
+ int have_ib_ports;
in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
@@ -184,6 +198,8 @@
memset(props, 0, sizeof *props);
+ have_ib_ports = num_ib_ports(dev->dev);
+
props->fw_ver = dev->dev->caps.fw_ver;
props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
IB_DEVICE_PORT_ACTIVE_EVENT |
@@ -196,13 +212,15 @@
props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
- if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM)
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM && have_ib_ports)
props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT)
props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
- if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH)
+ if (dev->dev->caps.max_gso_sz &&
+ (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) &&
+ (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH))
props->device_cap_flags |= IB_DEVICE_UD_TSO;
if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY)
props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
@@ -214,16 +232,6 @@
props->device_cap_flags |= IB_DEVICE_XRC;
if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_CROSS_CHANNEL)
props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
-
- if (check_flow_steering_support(dev->dev))
- props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
-
-
- props->device_cap_flags |= IB_DEVICE_QPG;
- if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) {
- props->device_cap_flags |= IB_DEVICE_UD_RSS;
- props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz;
- }
if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW)
props->device_cap_flags |= IB_DEVICE_MEM_WINDOW;
if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
@@ -231,10 +239,18 @@
props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B;
else
props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A;
+ if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED)
+ props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
}
+ props->device_cap_flags |= IB_DEVICE_QPG;
+ if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) {
+ props->device_cap_flags |= IB_DEVICE_UD_RSS;
+ props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz;
+ }
+
props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
0xffffff;
- props->vendor_part_id = dev->dev->pdev->device;
+ props->vendor_part_id = dev->dev->persist->pdev->device;
props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32));
memcpy(&props->sys_image_guid, out_mad->data + 4, 8);
@@ -266,13 +282,11 @@
props->max_mcast_grp;
props->max_map_per_fmr = dev->dev->caps.max_fmr_maps;
props->hca_core_clock = dev->dev->caps.hca_core_clock;
+ props->timestamp_mask = 0xFFFFFFFFFFFFULL;
if (dev->dev->caps.hca_core_clock > 0)
props->comp_mask |= IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK;
- if (dev->dev->caps.cq_timestamp) {
- props->timestamp_mask = 0xFFFFFFFFFFFF;
- props->comp_mask |= IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK;
- }
-
+ props->comp_mask |= IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK;
+ props->max_ah = INT_MAX;
out:
kfree(in_mad);
kfree(out_mad);
@@ -393,8 +407,8 @@
struct net_device *ndev;
enum ib_mtu tmp;
struct mlx4_cmd_mailbox *mailbox;
- unsigned long flags;
int err = 0;
+ int is_bonded = mlx4_is_bonded(mdev->dev);
mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
if (IS_ERR(mailbox))
@@ -406,15 +420,34 @@
if (err)
goto out;
- props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ?
- IB_WIDTH_4X : IB_WIDTH_1X;
- props->active_speed = IB_SPEED_QDR;
- props->port_cap_flags = IB_PORT_CM_SUP;
- if (netw_view)
- props->gid_tbl_len = MLX4_ROCE_MAX_GIDS;
- else
- props->gid_tbl_len = mdev->dev->caps.gid_table_len[port];
+ props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ||
+ (((u8 *)mailbox->buf)[5] == 0x20 /*56Gb*/) ?
+ IB_WIDTH_4X : IB_WIDTH_1X;
+ props->active_speed = (((u8 *)mailbox->buf)[5] == 0x20 /*56Gb*/) ?
+ IB_SPEED_FDR : IB_SPEED_QDR;
+ props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS;
+
+ switch (mdev->dev->caps.roce_mode) {
+ case MLX4_ROCE_MODE_1:
+ props->port_cap_flags |= IB_PORT_ROCE;
+ break;
+ case MLX4_ROCE_MODE_1_5:
+ props->port_cap_flags |= IB_PORT_ROCE_V1_5;
+ break;
+ case MLX4_ROCE_MODE_2:
+ props->port_cap_flags |= IB_PORT_ROCE_V2;
+ break;
+ case MLX4_ROCE_MODE_1_5_PLUS_2:
+ props->port_cap_flags |= IB_PORT_ROCE_V2 | IB_PORT_ROCE_V1_5;
+ break;
+ case MLX4_ROCE_MODE_1_PLUS_2:
+ props->port_cap_flags |= IB_PORT_ROCE_V2 | IB_PORT_ROCE;
+ break;
+ default:
+ break;
+ }
+ props->gid_tbl_len = mdev->dev->caps.gid_table_len[port];
props->max_msg_sz = mdev->dev->caps.max_msg_sz;
props->pkey_tbl_len = 1;
props->max_mtu = IB_MTU_4096;
@@ -422,10 +455,17 @@
props->state = IB_PORT_DOWN;
props->phys_state = state_to_phys_state(props->state);
props->active_mtu = IB_MTU_256;
- spin_lock_irqsave(&iboe->lock, flags);
+ read_lock(&iboe->iboe_lock);
ndev = iboe->netdevs[port - 1];
+ if (ndev && is_bonded) {
+#if 0
+ rcu_read_lock(); /* required to get upper dev */
+ ndev = netdev_master_upper_dev_get_rcu(ndev);
+ rcu_read_unlock();
+#endif
+ }
if (!ndev)
- goto out_unlock;
+ goto unlock;
tmp = iboe_get_mtu(ndev->if_mtu);
props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256;
@@ -433,8 +473,8 @@
props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ?
IB_PORT_ACTIVE : IB_PORT_DOWN;
props->phys_state = state_to_phys_state(props->state);
-out_unlock:
- spin_unlock_irqrestore(&iboe->lock, flags);
+unlock:
+ read_unlock(&iboe->iboe_lock);
out:
mlx4_free_cmd_mailbox(mdev->dev, mailbox);
return err;
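
The width/speed selection above decodes byte 5 of the QUERY_PORT mailbox with two ternaries; spelled out as a hypothetical helper, the mapping it implements is 0x20 -> 4X/FDR (the 56Gb case), 0x40 -> 4X/QDR, anything else -> 1X/QDR.

    static void decode_eth_rate_sketch(u8 byte5, u8 *width, u8 *speed)
    {
            if (byte5 == 0x20) {            /* 56 Gb/s */
                    *width = IB_WIDTH_4X;
                    *speed = IB_SPEED_FDR;
            } else if (byte5 == 0x40) {     /* 40 Gb/s */
                    *width = IB_WIDTH_4X;
                    *speed = IB_SPEED_QDR;
            } else {
                    *width = IB_WIDTH_1X;
                    *speed = IB_SPEED_QDR;
            }
    }
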
@@ -517,23 +557,81 @@
return err;
}
-static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index,
- union ib_gid *gid)
+static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid)
{
- struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int ret;
+
+ if (ib_cache_use_roce_gid_cache(ibdev, port))
+ return __mlx4_ib_query_gid(ibdev, port, index, gid, 0);
- *gid = dev->iboe.gid_table[port - 1][index];
+ ret = ib_get_cached_gid(ibdev, port, index, gid, NULL);
+ if (ret == -EAGAIN) {
+ memcpy(gid, &zgid, sizeof(*gid));
+ return 0;
+ }
- return 0;
+ return ret;
}
-static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
- union ib_gid *gid)
+static int mlx4_ib_query_sl2vl(struct ib_device *ibdev, u8 port, u64 *sl2vl_tbl)
{
- if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
- return __mlx4_ib_query_gid(ibdev, port, index, gid, 0);
- else
- return iboe_query_gid(ibdev, port, index, gid);
+ union sl2vl_tbl_to_u64 sl2vl64;
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+ int err = -ENOMEM;
+ int jj;
+
+ if (mlx4_is_slave(to_mdev(ibdev)->dev)) {
+ *sl2vl_tbl = 0;
+ return 0;
+ }
+
+ in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+ out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_SL_TO_VL_TABLE;
+ in_mad->attr_mod = 0;
+
+ if (mlx4_is_mfunc(to_mdev(ibdev)->dev))
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
+ in_mad, out_mad);
+ if (err)
+ goto out;
+
+ for (jj = 0; jj < 8; jj++)
+ sl2vl64.sl8[jj] = ((struct ib_smp *)out_mad)->data[jj];
+ *sl2vl_tbl = sl2vl64.sl64;
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static void mlx4_init_sl2vl_tbl(struct mlx4_ib_dev *mdev)
+{
+ u64 sl2vl;
+ int i;
+ int err;
+
+ for (i = 1; i <= mdev->dev->caps.num_ports; i++) {
+ if (mdev->dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH)
+ continue;
+ err = mlx4_ib_query_sl2vl(&mdev->ib_dev, i, &sl2vl);
+ if (err) {
+ pr_err("Unable to get default sl to vl mapping for port %d. Using all zeroes (%d)\n",
+ i, err);
+ sl2vl = 0;
+ }
+ atomic64_set(&mdev->sl2vl[i - 1], sl2vl);
+ }
}
int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
@@ -601,7 +699,6 @@
if (IS_ERR(mailbox))
return 0;
- memset(mailbox->buf, 0, 256);
memcpy(mailbox->buf, props->node_desc, 64);
mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,
MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
@@ -611,19 +708,16 @@
return 0;
}
-static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,
- u32 cap_mask)
+static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,
+ u32 cap_mask)
{
struct mlx4_cmd_mailbox *mailbox;
int err;
- u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
if (IS_ERR(mailbox))
return PTR_ERR(mailbox);
- memset(mailbox->buf, 0, 256);
-
if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
*(u8 *) mailbox->buf = !!reset_qkey_viols << 6;
((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask);
@@ -632,8 +726,9 @@
((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask);
}
- err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT,
- MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
+ err = mlx4_cmd(dev->dev, mailbox->dma, port, MLX4_SET_PORT_IB_OPCODE,
+ MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
mlx4_free_cmd_mailbox(dev->dev, mailbox);
return err;
@@ -642,11 +737,20 @@
static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
struct ib_port_modify *props)
{
+ struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+ u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
struct ib_port_attr attr;
u32 cap_mask;
int err;
- mutex_lock(&to_mdev(ibdev)->cap_mask_mutex);
+ /* return OK if this is RoCE. CM calls ib_modify_port() regardless
+ * of whether port link layer is ETH or IB. For ETH ports, qkey
+ * violations and port capabilities are not meaningful.
+ */
+ if (is_eth)
+ return 0;
+
+ mutex_lock(&mdev->cap_mask_mutex);
err = mlx4_ib_query_port(ibdev, port, &attr);
if (err)
@@ -655,9 +759,9 @@
cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
~props->clr_port_cap_mask;
- err = mlx4_SET_PORT(to_mdev(ibdev), port,
- !!(mask & IB_PORT_RESET_QKEY_CNTR),
- cap_mask);
+ err = mlx4_ib_SET_PORT(mdev, port,
+ !!(mask & IB_PORT_RESET_QKEY_CNTR),
+ cap_mask);
out:
mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
@@ -676,9 +780,18 @@
if (!dev->ib_active)
return ERR_PTR(-EAGAIN);
+#ifdef CONFIG_INFINIBAND_WQE_FORMAT
+ /* In case udata->inlen is more than just the header, we are certain
+ * that libmlx4 already knows the MLX4_USER_DEV_CAP_WQE_FORMAT flag. */
+ if ((dev->dev->caps.userspace_caps & MLX4_USER_DEV_CAP_WQE_FORMAT) &&
+ !(udata->inlen - sizeof(struct ib_uverbs_cmd_hdr))) {
+ pr_err("User Libraries are not supporting WQE_FORMAT 1\n");
+ return ERR_PTR(-EINVAL);
+ }
+#endif
if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
resp_v3.qp_tab_size = dev->dev->caps.num_qps;
- if (mlx4_wc_enabled()) {
+ if (mlx4_arch_bf_support()) {
resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size;
resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
} else {
@@ -688,7 +801,7 @@
} else {
resp.dev_caps = dev->dev->caps.userspace_caps;
resp.qp_tab_size = dev->dev->caps.num_qps;
- if (mlx4_wc_enabled()) {
+ if (mlx4_arch_bf_support()) {
resp.bf_reg_size = dev->dev->caps.bf_reg_size;
resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
} else {
@@ -698,7 +811,7 @@
resp.cqe_size = dev->dev->caps.cqe_size;
}
- context = kmalloc(sizeof *context, GFP_KERNEL);
+ context = kzalloc(sizeof *context, GFP_KERNEL);
if (!context)
return ERR_PTR(-ENOMEM);
@@ -710,6 +823,8 @@
INIT_LIST_HEAD(&context->db_page_list);
mutex_init(&context->db_page_mutex);
+ INIT_LIST_HEAD(&context->user_uar_list);
+ mutex_init(&context->user_uar_mutex);
if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
@@ -728,8 +843,16 @@
static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
{
struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
+ struct mlx4_ib_user_uar *uar;
+ struct mlx4_ib_user_uar *tmp;
mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar);
+ list_for_each_entry_safe(uar, tmp, &context->user_uar_list, list) {
+ list_del(&uar->list);
+ mlx4_uar_free(to_mdev(ibcontext->device)->dev, &uar->uar);
+ kfree(uar);
+ }
+
kfree(context);
return 0;
@@ -737,6 +860,132 @@
/* XXX FBSD has no support for get_unmapped_area function */
#if 0
+static void mlx4_ib_vma_open(struct vm_area_struct *area)
+{
+ /* vma_open is called when a new VMA is created on top of our VMA.
+ * This is done through either mremap flow or split_vma (usually due
+ * to mlock, madvise, munmap, etc.). We do not support a clone of the
+ * vma, as this VMA is strongly hardware related. Therefore we set the
+ * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
+ * calling us again and trying to do incorrect actions. We assume that
+ * the original vma size is exactly a single page, so that there will be
+ * no "splitting" operations on it.
+ */
+ area->vm_ops = NULL;
+}
+
+static void mlx4_ib_vma_close(struct vm_area_struct *area)
+{
+ struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data;
+
+ /* It's guaranteed that all VMAs opened on a FD are closed before the
+ * file itself is closed, therefore no sync is needed with the regular
+ * closing flow (e.g. mlx4_ib_dealloc_ucontext). However, a sync is needed
+ * with accessing the vma as part of mlx4_ib_disassociate_ucontext.
+ * The close operation is usually called under mm->mmap_sem except when
+ * process is exiting. The exiting case is handled explicitly as part
+ * of mlx4_ib_disassociate_ucontext.
+ */
+ mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *)area->vm_private_data;
+
+ /* set the vma context pointer to null in the mlx4_ib driver's private
+ * data to protect against a race condition in mlx4_ib_disassociate_ucontext().
+ */
+ mlx4_ib_vma_priv_data->vma = NULL;
+}
+
+static const struct vm_operations_struct mlx4_ib_vm_ops = {
+ .open = mlx4_ib_vma_open,
+ .close = mlx4_ib_vma_close
+};
+
+static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
+{
+ int i;
+ int ret = 0;
+ struct vm_area_struct *vma;
+ struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+ struct mlx4_ib_user_uar *uar;
+
+ owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
+ if (!owning_process)
+ return;
+
+ owning_mm = get_task_mm(owning_process);
+ if (!owning_mm) {
+ pr_info("no mm, disassociate ucontext is pending task termination\n");
+ while (1) {
+ /* make sure that the task is dead before returning; this
+ * prevents a rare case of module unload racing with a
+ * call to mlx4_ib_vma_close.
+ */
+ put_task_struct(owning_process);
+ msleep(1);
+ owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID);
+ if (!owning_process || owning_process->state == TASK_DEAD) {
+ pr_info("disassociate ucontext done, task was terminated\n");
+ /* in case the task was dead, we need to release the task struct */
+ if (owning_process)
+ put_task_struct(owning_process);
+ return;
+ }
+ }
+ }
+
+ /* need to protect from a race on closing the vma as part of
+ * mlx4_ib_vma_close().
+ */
+ down_read(&owning_mm->mmap_sem);
+ for (i = 0; i < HW_BAR_COUNT; i++) {
+ vma = context->hw_bar_info[i].vma;
+ if (!vma)
+ continue;
+
+ ret = zap_vma_ptes(context->hw_bar_info[i].vma,
+ context->hw_bar_info[i].vma->vm_start,
+ PAGE_SIZE);
+ if (ret) {
+ pr_err("Error: zap_vma_ptes failed for index=%d, ret=%d\n", i, ret);
+ BUG_ON(1);
+ }
+
+ /* context is going to be destroyed; do not access ops any more */
+ context->hw_bar_info[i].vma->vm_ops = NULL;
+ }
+
+ list_for_each_entry(uar, &context->user_uar_list, list)
+ for (i = 0; i < HW_BAR_COUNT; i++) {
+ vma = uar->hw_bar_info[i].vma;
+ if (!vma)
+ continue;
+
+ ret = zap_vma_ptes(uar->hw_bar_info[i].vma,
+ uar->hw_bar_info[i].vma->vm_start,
+ PAGE_SIZE);
+ if (ret) {
+ pr_err("Error: zap_vma_ptes failed for uar_idx=%d, index=%d, ret=%d\n", uar->user_idx, i, ret);
+ BUG_ON(1);
+ }
+
+ /* context is going to be destroyed; do not access ops any more */
+ uar->hw_bar_info[i].vma->vm_ops = NULL;
+ }
+
+ up_read(&owning_mm->mmap_sem);
+ mmput(owning_mm);
+ put_task_struct(owning_process);
+}
+
+static void mlx4_ib_set_vma_data(struct vm_area_struct *vma,
+ struct mlx4_ib_vma_private_data *vma_private_data)
+{
+ vma_private_data->vma = vma;
+ vma->vm_private_data = vma_private_data;
+ vma->vm_ops = &mlx4_ib_vm_ops;
+}
+
static unsigned long mlx4_ib_get_unmapped_area(struct file *file,
unsigned long addr,
unsigned long len, unsigned long pgoff,
@@ -791,12 +1040,30 @@
}
#endif
+static void
+mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
+{
+ /* NOP */
+}
+
+static inline void
+mlx4_ib_set_vma_data(struct vm_area_struct *vma,
+ struct mlx4_ib_vma_private_data *vma_private_data)
+{
+ vma_private_data->vma = vma;
+}
+
static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
{
struct mlx4_ib_dev *dev = to_mdev(context->device);
+ struct mlx4_ib_ucontext *mucontext = to_mucontext(context);
+ int err;
/* Last 8 bits hold the command others are data per that command */
unsigned long command = vma->vm_pgoff & MLX4_IB_MMAP_CMD_MASK;
+ /* The rest of the bits hold the command parameter */
+ unsigned long parm = vma->vm_pgoff >> MLX4_IB_MMAP_CMD_BITS;
+ struct mlx4_ib_user_uar *uar;
if (command < MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) {
 /* compatibility handling for commands 0 & 1 */
@@ -804,25 +1071,43 @@
return -EINVAL;
}
if (command == MLX4_IB_MMAP_UAR_PAGE) {
+ /* We prevent double mmapping on the same context */
+ if (mucontext->hw_bar_info[HW_BAR_DB].vma != NULL)
+ return -EINVAL;
+
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
if (io_remap_pfn_range(vma, vma->vm_start,
to_mucontext(context)->uar.pfn,
PAGE_SIZE, vma->vm_page_prot))
return -EAGAIN;
+
+ mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_DB]);
+
} else if (command == MLX4_IB_MMAP_BLUE_FLAME_PAGE &&
dev->dev->caps.bf_reg_size != 0) {
- vma->vm_page_prot = pgprot_wc(vma->vm_page_prot);
+ /* We prevent double mmapping on the same context */
+ if (mucontext->hw_bar_info[HW_BAR_BF].vma != NULL)
+ return -EINVAL;
+
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
if (io_remap_pfn_range(vma, vma->vm_start,
to_mucontext(context)->uar.pfn +
dev->dev->caps.num_uars,
PAGE_SIZE, vma->vm_page_prot))
return -EAGAIN;
+
+ mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_BF]);
+
} else if (command == MLX4_IB_MMAP_GET_HW_CLOCK) {
struct mlx4_clock_params params;
int ret;
+ /* We prevent double mmapping on the same context */
+ if (mucontext->hw_bar_info[HW_BAR_CLOCK].vma != NULL)
+ return -EINVAL;
+
ret = mlx4_get_internal_clock_params(dev->dev, &params);
if (ret)
return ret;
@@ -830,11 +1115,116 @@
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
if (io_remap_pfn_range(vma, vma->vm_start,
- (pci_resource_start(dev->dev->pdev,
+ (pci_resource_start(dev->dev->persist->pdev,
params.bar) + params.offset)
>> PAGE_SHIFT,
PAGE_SIZE, vma->vm_page_prot))
return -EAGAIN;
+
+ mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_CLOCK]);
+
+#if 0
+ } else if (command == MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES ||
+ command == MLX4_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_CPU_NUMA ||
+ command == MLX4_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_DEV_NUMA) {
+ /* Getting contiguous physical pages */
+ unsigned long total_size = vma->vm_end - vma->vm_start;
+ unsigned long page_size_order = (vma->vm_pgoff) >>
+ MLX4_IB_MMAP_CMD_BITS;
+ struct ib_cmem *ib_cmem;
+ int numa_node;
+
+ if (command == MLX4_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_CPU_NUMA)
+ numa_node = -1;
+ else if (command == MLX4_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_DEV_NUMA)
+ numa_node = dev_to_node(&dev->dev->persist->pdev->dev);
+ else
+ numa_node = -1;
+
+ ib_cmem = ib_cmem_alloc_contiguous_pages(context, total_size,
+ page_size_order,
+ numa_node);
+ if (IS_ERR(ib_cmem)) {
+ err = PTR_ERR(ib_cmem);
+ return err;
+ }
+
+ err = ib_cmem_map_contiguous_pages_to_vma(ib_cmem, vma);
+ if (err) {
+ ib_cmem_release_contiguous_pages(ib_cmem);
+ return err;
+ }
+ return 0;
+#endif
+ } else if (command == MLX4_IB_EXP_MMAP_EXT_UAR_PAGE) {
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ if (parm >= MLX4_IB_MAX_CTX_UARS)
+ return -EINVAL;
+
+ /* We prevent double mmapping on the same context */
+ list_for_each_entry(uar, &mucontext->user_uar_list, list)
+ if (uar->user_idx == parm) {
+ return -EINVAL;
+ }
+
+ uar = kzalloc(sizeof(*uar), GFP_KERNEL);
+ if (!uar)
+ return -ENOMEM;
+ uar->user_idx = parm;
+
+ err = mlx4_uar_alloc(dev->dev, &uar->uar);
+ if (err) {
+ kfree(uar);
+ return -ENOMEM;
+ }
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ uar->uar.pfn,
+ PAGE_SIZE, vma->vm_page_prot)) {
+ mlx4_uar_free(dev->dev, &uar->uar);
+ kfree(uar);
+
+ return -EAGAIN;
+ }
+
+ mlx4_ib_set_vma_data(vma, &uar->hw_bar_info[HW_BAR_DB]);
+ mutex_lock(&mucontext->user_uar_mutex);
+ list_add(&uar->list, &mucontext->user_uar_list);
+ mutex_unlock(&mucontext->user_uar_mutex);
+ } else if (command == MLX4_IB_EXP_MMAP_EXT_BLUE_FLAME_PAGE) {
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ if (parm >= MLX4_IB_MAX_CTX_UARS)
+ return -EINVAL;
+
+ /*
+ * BlueFlame pages are affiliated with the UAR pages by their
+ * indexes. A QP can only use a BlueFlame page with the index
+ * equal to the QP UAR. Therefore BF may be mapped to user
+ * only after the related UAR is already mapped to the user.
+ */
+ uar = NULL;
+ list_for_each_entry(uar, &mucontext->user_uar_list, list)
+ if (uar->user_idx == parm)
+ break;
+ if (!uar || uar->user_idx != parm)
+ return -EINVAL;
+
+ /* We prevent double mmapping on the same context */
+ if (uar->hw_bar_info[HW_BAR_BF].vma)
+ return -EINVAL;
+
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ uar->uar.pfn + dev->dev->caps.num_uars,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+
+ mlx4_ib_set_vma_data(vma, &uar->hw_bar_info[HW_BAR_BF]);
} else
return -EINVAL;
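
mlx4_ib_mmap() now splits vm_pgoff into a command (low MLX4_IB_MMAP_CMD_BITS bits) and a parameter (the remaining bits), which is how the extra-UAR index reaches the kernel for MLX4_IB_EXP_MMAP_EXT_UAR_PAGE and MLX4_IB_EXP_MMAP_EXT_BLUE_FLAME_PAGE. A hypothetical userspace counterpart would build the mmap offset like this (the real encoding is owned by libmlx4; the helper name is illustrative):

    static off_t mlx4_mmap_offset_sketch(unsigned long command,
                                         unsigned long parm,
                                         unsigned long page_size,
                                         unsigned int cmd_bits)
    {
            /* vm_pgoff = offset / page_size = (parm << cmd_bits) | command */
            return (off_t)(((parm << cmd_bits) | command) * page_size);
    }
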
@@ -846,27 +1236,22 @@
{
struct mlx4_ib_dev *dev = to_mdev(context->device);
int ret;
- int offset;
switch (cmd) {
case MLX4_IOCHWCLOCKOFFSET: {
struct mlx4_clock_params params;
- int ret;
+
ret = mlx4_get_internal_clock_params(dev->dev, &params);
- if (!ret) {
- offset = params.offset % PAGE_SIZE;
- ret = put_user(offset,
- (int *)arg);
- return sizeof(int);
- } else {
+ if (!ret)
+ return __put_user(params.offset % PAGE_SIZE,
+ (int *)arg);
+ else
return ret;
- }
}
- default: {
+ default:
pr_err("mlx4_ib: invalid ioctl %u command with arg %lX\n",
cmd, arg);
- return -ENOTTY;
- }
+ ret = -EINVAL;
}
return ret;
@@ -877,21 +1262,24 @@
{
struct mlx4_ib_dev *dev = to_mdev(device);
cycle_t cycles;
+ int err = 0;
values->values_mask = 0;
if (q_values & IBV_VALUES_HW_CLOCK) {
- cycles = mlx4_read_clock(dev->dev);
- if (cycles < 0) {
- values->hwclock = cycles & CORE_CLOCK_MASK;
+ if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) {
+ cycles = mlx4_read_clock(dev->dev) & CLOCKSOURCE_MASK(48);
+ values->hwclock = cycles;
values->values_mask |= IBV_VALUES_HW_CLOCK;
+ q_values &= ~IBV_VALUES_HW_CLOCK;
+ } else {
+ err = -ENOTSUPP;
}
- q_values &= ~IBV_VALUES_HW_CLOCK;
}
- if (q_values)
- return -ENOTTY;
+ if (q_values && !err)
+ err = -ENOTSUPP;
- return 0;
+ return err;
}
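
Taken together, MLX4_IOCHWCLOCKOFFSET returns the offset of the free-running counter inside the page mapped with MLX4_IB_MMAP_GET_HW_CLOCK, and mlx4_ib_query_values() advertises that the counter is 48 bits wide (CLOCKSOURCE_MASK(48)). A userspace sketch of a reader is below; it is illustrative only and glosses over the device byte order and the torn-read handling that a production reader of the two 32-bit halves needs.

    #include <stdint.h>
    #include <string.h>

    static uint64_t read_hw_clock_sketch(const void *clock_page,
                                         int offset_in_page)
    {
            uint64_t cycles;

            memcpy(&cycles, (const char *)clock_page + offset_in_page,
                   sizeof(cycles));
            return cycles & ((1ULL << 48) - 1);     /* 48-bit cycle counter */
    }
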
static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
@@ -1006,25 +1394,20 @@
int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
union ib_gid *gid)
{
- u8 mac[6];
struct net_device *ndev;
int ret = 0;
if (!mqp->port)
return 0;
- spin_lock(&mdev->iboe.lock);
+ read_lock(&mdev->iboe.iboe_lock);
ndev = mdev->iboe.netdevs[mqp->port - 1];
if (ndev)
dev_hold(ndev);
- spin_unlock(&mdev->iboe.lock);
+ read_unlock(&mdev->iboe.iboe_lock);
if (ndev) {
- rdma_get_mcast_mac((struct in6_addr *)gid, mac);
- rtnl_lock();
- dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac, 6, 0);
ret = 1;
- rtnl_unlock();
dev_put(ndev);
}
@@ -1033,13 +1416,15 @@
struct mlx4_ib_steering {
struct list_head list;
- u64 reg_id;
+ struct mlx4_flow_reg_id reg_id;
union ib_gid gid;
};
static int parse_flow_attr(struct mlx4_dev *dev,
+ u32 qp_num,
union ib_flow_spec *ib_spec,
- struct _rule_hw *mlx4_spec)
+ struct _rule_hw *mlx4_spec,
+ struct mlx4_ib_hw_flow *hwflow)
{
enum mlx4_net_trans_rule_id type;
@@ -1052,8 +1437,23 @@
ETH_ALEN);
mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag;
mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag;
+ if (!is_multicast_ether_addr(mlx4_spec->eth.dst_mac) &&
+ !is_broadcast_ether_addr(mlx4_spec->eth.dst_mac)) {
+ hwflow->dst_mac = mlx4_mac_to_u64(mlx4_spec->eth.dst_mac)
+ & mlx4_mac_to_u64(mlx4_spec->eth.dst_mac_msk);
+ if (hwflow->dst_mac) {
+ int ret = mlx4_register_mac(dev, hwflow->port,
+ hwflow->dst_mac);
+
+ if (ret < 0) {
+ pr_warn("mlx4: can't register MAC %pM when registering flow\n",
+ ib_spec->eth.val.dst_mac);
+ hwflow->dst_mac = 0;
+ return ret;
+ }
+ }
+ }
break;
-
case IB_FLOW_SPEC_IB:
type = MLX4_NET_TRANS_RULE_ID_IB;
mlx4_spec->ib.l3_qpn = ib_spec->ib.val.l3_type_qpn;
@@ -1063,6 +1463,7 @@
ib_spec->ib.mask.dst_gid, 16);
break;
+
case IB_FLOW_SPEC_IPV4:
type = MLX4_NET_TRANS_RULE_ID_IPV4;
mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip;
@@ -1077,105 +1478,260 @@
MLX4_NET_TRANS_RULE_ID_TCP :
MLX4_NET_TRANS_RULE_ID_UDP;
mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port;
- mlx4_spec->tcp_udp.dst_port_msk =
- ib_spec->tcp_udp.mask.dst_port;
+ mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port;
mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port;
- mlx4_spec->tcp_udp.src_port_msk =
- ib_spec->tcp_udp.mask.src_port;
+ mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port;
break;
default:
return -EINVAL;
}
- if (map_sw_to_hw_steering_id(dev, type) < 0 ||
- hw_rule_sz(dev, type) < 0)
+ if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 ||
+ mlx4_hw_rule_sz(dev, type) < 0)
return -EINVAL;
- mlx4_spec->id = cpu_to_be16(map_sw_to_hw_steering_id(dev, type));
- mlx4_spec->size = hw_rule_sz(dev, type) >> 2;
- return hw_rule_sz(dev, type);
+ mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type));
+ mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2;
+ return mlx4_hw_rule_sz(dev, type);
}
-static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
- int domain,
- enum mlx4_net_trans_promisc_mode flow_type,
- u64 *reg_id)
+struct default_rules {
+ __u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ __u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ __u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ __u8 link_layer;
+};
+static const struct default_rules default_table[] = {
+ {
+ .mandatory_fields = {IB_FLOW_SPEC_IPV4},
+ .mandatory_not_fields = {IB_FLOW_SPEC_ETH},
+ .rules_create_list = {IB_FLOW_SPEC_IB},
+ .link_layer = IB_LINK_LAYER_INFINIBAND
+ }
+};
+
+static int __mlx4_ib_default_rules_match(struct ib_qp *qp,
+ struct ib_flow_attr *flow_attr)
{
- int ret, i;
- int size = 0;
+ int i, j, k;
void *ib_flow;
- struct mlx4_ib_dev *mdev = to_mdev(qp->device);
- struct mlx4_cmd_mailbox *mailbox;
- struct mlx4_net_trans_rule_hw_ctrl *ctrl;
- size_t rule_size = sizeof(struct mlx4_net_trans_rule_hw_ctrl) +
- (sizeof(struct _rule_hw) * flow_attr->num_of_specs);
+ const struct default_rules *pdefault_rules = default_table;
+ u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port);
- static const u16 __mlx4_domain[] = {
- [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS,
- [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL,
- [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS,
- [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC,
- };
+ for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) {
+ __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ memset(&field_types, 0, sizeof(field_types));
- if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) {
- pr_err("Invalid priority value.\n");
- return -EINVAL;
- }
- if (domain >= IB_FLOW_DOMAIN_NUM) {
- pr_err("Invalid domain value.\n");
- return -EINVAL;
- }
- if (map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0)
- return -EINVAL;
+ if (link_layer != pdefault_rules->link_layer)
+ continue;
- mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
- if (IS_ERR(mailbox))
+ ib_flow = flow_attr + 1;
+ /* we assume the specs are sorted */
+ for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS &&
+ j < flow_attr->num_of_specs; k++) {
+ union ib_flow_spec *current_flow =
+ (union ib_flow_spec *)ib_flow;
+
+ /* same layer but different type */
+ if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) ==
+ (pdefault_rules->mandatory_fields[k] &
+ IB_FLOW_SPEC_LAYER_MASK)) &&
+ (current_flow->type !=
+ pdefault_rules->mandatory_fields[k]))
+ goto out;
+
+ /* same layer, try match next one */
+ if (current_flow->type ==
+ pdefault_rules->mandatory_fields[k]) {
+ j++;
+ ib_flow +=
+ ((union ib_flow_spec *)ib_flow)->size;
+ }
+ }
+
+ ib_flow = flow_attr + 1;
+ for (j = 0; j < flow_attr->num_of_specs;
+ j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size)
+ for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++)
+ /* same layer and same type */
+ if (((union ib_flow_spec *)ib_flow)->type ==
+ pdefault_rules->mandatory_not_fields[k])
+ goto out;
+
+ return i;
+ }
+out:
+ return -1;
+}
+
+static int __mlx4_ib_create_default_rules(
+ struct mlx4_ib_dev *mdev,
+ struct ib_qp *qp,
+ const struct default_rules *pdefault_rules,
+ struct _rule_hw *mlx4_spec,
+ struct mlx4_ib_hw_flow *hwflow) {
+ int size = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) {
+ int ret;
+ union ib_flow_spec ib_spec;
+ switch (pdefault_rules->rules_create_list[i]) {
+ case 0:
+ /* no rule */
+ continue;
+ case IB_FLOW_SPEC_IB:
+ memset(&ib_spec, 0, sizeof(ib_spec));
+ ib_spec.type = IB_FLOW_SPEC_IB;
+ ib_spec.size = sizeof(struct ib_flow_spec_ib);
+
+ break;
+ default:
+ /* invalid rule */
+ return -EINVAL;
+ }
+ /* We must put an empty rule here; the qpn is ignored */
+ ret = parse_flow_attr(mdev->dev, 0, &ib_spec,
+ mlx4_spec, hwflow);
+ if (ret < 0) {
+ pr_info("invalid parsing\n");
+ return -EINVAL;
+ }
+
+ mlx4_spec = (void *)mlx4_spec + ret;
+ size += ret;
+ }
+ return size;
+}
+
+static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+ int domain,
+ enum mlx4_net_trans_promisc_mode flow_type,
+ struct mlx4_ib_hw_flow *hwflow)
+{
+ int ret, i;
+ int size = 0;
+ void *ib_flow;
+ struct mlx4_ib_dev *mdev = to_mdev(qp->device);
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+ int default_flow;
+
+ static const u16 __mlx4_domain[] = {
+ [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS,
+ [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL,
+ [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS,
+ [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC,
+ };
+
+ if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) {
+ pr_err("Invalid priority value %d\n", flow_attr->priority);
+ return -EINVAL;
+ }
+
+ if (domain >= IB_FLOW_DOMAIN_NUM) {
+ pr_err("Invalid domain value %d\n", domain);
+ return -EINVAL;
+ }
+
+ if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0)
+ return -EINVAL;
+
+ mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+ if (IS_ERR(mailbox))
return PTR_ERR(mailbox);
- memset(mailbox->buf, 0, rule_size);
ctrl = mailbox->buf;
ctrl->prio = cpu_to_be16(__mlx4_domain[domain] |
flow_attr->priority);
- ctrl->type = map_sw_to_hw_steering_mode(mdev->dev, flow_type);
+ ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type);
ctrl->port = flow_attr->port;
ctrl->qpn = cpu_to_be32(qp->qp_num);
-
- if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK)
- ctrl->flags = (1 << 3);
+ hwflow->port = flow_attr->port;
ib_flow = flow_attr + 1;
size += sizeof(struct mlx4_net_trans_rule_hw_ctrl);
+ /* Add default flows */
+ default_flow = __mlx4_ib_default_rules_match(qp, flow_attr);
+ if (default_flow >= 0) {
+ ret = __mlx4_ib_create_default_rules(
+ mdev, qp,
+ default_table + default_flow,
+ mailbox->buf + size, hwflow);
+ if (ret < 0)
+ goto free_mailbox;
+
+ size += ret;
+ }
for (i = 0; i < flow_attr->num_of_specs; i++) {
- ret = parse_flow_attr(mdev->dev, ib_flow, mailbox->buf + size);
- if (ret < 0) {
- mlx4_free_cmd_mailbox(mdev->dev, mailbox);
- return -EINVAL;
- }
- ib_flow += ((union ib_flow_spec *)ib_flow)->size;
+ ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow,
+ mailbox->buf + size, hwflow);
+ if (ret < 0)
+ goto unregister_mac;
+
+ ib_flow += ((union ib_flow_spec *) ib_flow)->size;
size += ret;
}
- ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0,
+ ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, &hwflow->reg_id, size >> 2, 0,
MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
- MLX4_CMD_NATIVE);
+ MLX4_CMD_WRAPPED);
if (ret == -ENOMEM)
pr_err("mcg table is full. Fail to register network rule.\n");
else if (ret == -ENXIO)
pr_err("Device managed flow steering is disabled. Fail to register network rule.\n");
else if (ret)
pr_err("Invalid argumant. Fail to register network rule.\n");
+
+unregister_mac:
+ if (ret && hwflow->dst_mac) {
+ mlx4_unregister_mac(mdev->dev, hwflow->port, hwflow->dst_mac);
+ hwflow->dst_mac = 0;
+ }
+free_mailbox:
mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+
return ret;
}
-static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id)
+static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev,
+ const struct mlx4_ib_hw_flow *hwflow)
{
int err;
- err = mlx4_cmd(dev, reg_id, 0, 0,
+
+ if (hwflow->dst_mac)
+ mlx4_unregister_mac(dev, hwflow->port, hwflow->dst_mac);
+
+ err = mlx4_cmd(dev, hwflow->reg_id, 0, 0,
MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
- MLX4_CMD_NATIVE);
+ MLX4_CMD_WRAPPED);
if (err)
pr_err("Fail to detach network rule. registration id = 0x%llx\n",
- (unsigned long long)reg_id);
+ (long long)hwflow->reg_id);
+ return err;
+}
+
+static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+ struct mlx4_ib_hw_flow *hwflow)
+{
+ void *ib_flow;
+ union ib_flow_spec *ib_spec;
+ struct mlx4_dev *dev = to_mdev(qp->device)->dev;
+ int err = 0;
+
+ if (dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN ||
+ dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC)
+ return 0; /* do nothing */
+
+ ib_flow = flow_attr + 1;
+ ib_spec = (union ib_flow_spec *)ib_flow;
+
+ if (ib_spec->type != IB_FLOW_SPEC_ETH || flow_attr->num_of_specs != 1)
+ return 0; /* do nothing */
+
+ err = mlx4_tunnel_steer_add(to_mdev(qp->device)->dev, ib_spec->eth.val.dst_mac,
+ flow_attr->port, qp->qp_num,
+ MLX4_DOMAIN_UVERBS | (flow_attr->priority & 0xff),
+ &hwflow->reg_id);
return err;
}
@@ -1183,13 +1739,15 @@
struct ib_flow_attr *flow_attr,
int domain)
{
- int err = 0, i = 0;
+ int err = 0, i = 0, j = 0;
struct mlx4_ib_flow *mflow;
enum mlx4_net_trans_promisc_mode type[2];
+ struct mlx4_dev *dev = (to_mdev(qp->device))->dev;
+ int is_bonded = mlx4_is_bonded(dev);
memset(type, 0, sizeof(type));
- mflow = kzalloc(sizeof(struct mlx4_ib_flow), GFP_KERNEL);
+ mflow = kzalloc(sizeof(*mflow), GFP_KERNEL);
if (!mflow) {
err = -ENOMEM;
goto err_free;
@@ -1220,14 +1778,58 @@
while (i < ARRAY_SIZE(type) && type[i]) {
err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i],
- &mflow->reg_id[i]);
+ &mflow->reg_id[i].flow);
+ if (err)
+ goto err_create_flow;
+ i++;
+ if (is_bonded) {
+ /* Application always sees one port so the mirror rule
+ * must be on port #2
+ */
+ flow_attr->port = 2;
+ err = __mlx4_ib_create_flow(qp, flow_attr,
+ domain, type[j],
+ &mflow->reg_id[j].mirror);
+ flow_attr->port = 1;
+ if (err)
+ goto err_create_flow;
+ j++;
+ }
+
+ }
+
+ if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) {
+ err = mlx4_ib_tunnel_steer_add(qp, flow_attr,
+ &mflow->reg_id[i].flow);
if (err)
- goto err_free;
+ goto err_create_flow;
i++;
+ if (is_bonded) {
+ flow_attr->port = 2;
+ err = mlx4_ib_tunnel_steer_add(qp, flow_attr,
+ &mflow->reg_id[j].mirror);
+ flow_attr->port = 1;
+ if (err)
+ goto err_create_flow;
+ j++;
+ }
+ /* function to create mirror rule */
}
return &mflow->ibflow;
+err_create_flow:
+ while (i) {
+ (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev,
+ &mflow->reg_id[i].flow);
+ i--;
+ }
+
+ while (j) {
+ (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev,
+ &mflow->reg_id[j].mirror);
+ j--;
+ }
err_free:
kfree(mflow);
return ERR_PTR(err);
@@ -1240,10 +1842,16 @@
struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device);
struct mlx4_ib_flow *mflow = to_mflow(flow_id);
- while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) {
- err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]);
+ while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i].flow.reg_id) {
+ err = __mlx4_ib_destroy_flow(mdev->dev, &mflow->reg_id[i].flow);
if (err)
ret = err;
+ if (mflow->reg_id[i].mirror.reg_id) {
+ err = __mlx4_ib_destroy_flow(mdev->dev,
+ &mflow->reg_id[i].mirror);
+ if (err)
+ ret = err;
+ }
i++;
}
@@ -1251,6 +1859,85 @@
return ret;
}
+static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ int err = -ENODEV;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ int is_over_ip = mlx4_roce_is_over_ip(mdev->dev->caps.ud_gid_type);
+ enum mlx4_protocol prot =
+ (ibqp->qp_type == IB_QPT_RAW_PACKET) ? MLX4_PROT_ETH :
+ (gid->raw[1] == 0x0e && is_over_ip) ? MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6;
+ DECLARE_BITMAP(ports, MLX4_MAX_PORTS);
+ int i = 0;
+
+ if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 &&
+ ibqp->qp_type == IB_QPT_RAW_PACKET)
+ gid->raw[5] = mqp->port;
+
+ if (mdev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED) {
+ bitmap_fill(ports, mdev->dev->caps.num_ports);
+ } else {
+ if ((mqp->port >= 0) &&
+ (mqp->port <= mdev->dev->caps.num_ports)) {
+ bitmap_zero(ports, mdev->dev->caps.num_ports);
+ set_bit(0, ports);
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ for (; i < mdev->dev->caps.num_ports; i++) {
+ struct mlx4_flow_reg_id reg_id = {.flow = {.dst_mac = 0, .port = i + 1}};
+ struct mlx4_ib_steering *ib_steering = NULL;
+
+ if (!test_bit(i, ports))
+ continue;
+ if (mdev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED) {
+ ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL);
+ if (!ib_steering)
+ goto err_add;
+ }
+
+ err = mlx4_multicast_attach(mdev->dev, &mqp->mqp,
+ gid->raw, i + 1,
+ !!(mqp->flags &
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
+ prot,
+ &reg_id.flow.reg_id);
+ if (err) {
+ kfree(ib_steering);
+ goto err_add;
+ }
+
+ err = add_gid_entry(ibqp, gid);
+ if (err) {
+ mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ prot, reg_id.flow.reg_id);
+ kfree(ib_steering);
+ goto err_add;
+ }
+
+ if (ib_steering) {
+ memcpy(ib_steering->gid.raw, gid->raw, 16);
+ ib_steering->reg_id = reg_id;
+ mutex_lock(&mqp->mutex);
+ list_add(&ib_steering->list, &mqp->steering_rules);
+ mutex_unlock(&mqp->mutex);
+ }
+ }
+
+ return 0;
+
+err_add:
+ if (i > 0)
+ _mlx4_ib_mcg_detach(ibqp, gid, lid, i);
+
+ return err;
+}
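/*
 * Editor's illustrative sketch (not part of this patch): mlx4_ib_mcg_attach()
 * above and _mlx4_ib_mcg_detach() below derive the steering protocol from the
 * QP type, the GID and the RoCE transport mode with the same expression.  A
 * hypothetical helper capturing that shared rule could look like this; the
 * function name is an assumption for illustration only.
 */
static enum mlx4_protocol mcg_steering_protocol(const struct ib_qp *ibqp,
						const union ib_gid *gid,
						int is_over_ip)
{
	if (ibqp->qp_type == IB_QPT_RAW_PACKET)
		return MLX4_PROT_ETH;
	/* Same GID byte test as used in the attach/detach paths. */
	if (gid->raw[1] == 0x0e && is_over_ip)
		return MLX4_PROT_IB_IPV4;
	return MLX4_PROT_IB_IPV6;
}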
+
static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw)
{
struct mlx4_ib_gid_entry *ge;
@@ -1267,7 +1954,6 @@
return ret;
}
-
static int del_gid_entry(struct ib_qp *ibqp, union ib_gid *gid)
{
struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
@@ -1279,27 +1965,52 @@
mutex_lock(&mqp->mutex);
ge = find_gid_entry(mqp, gid->raw);
if (ge) {
- spin_lock(&mdev->iboe.lock);
+ read_lock(&mdev->iboe.iboe_lock);
ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL;
if (ndev)
dev_hold(ndev);
- spin_unlock(&mdev->iboe.lock);
+ read_unlock(&mdev->iboe.iboe_lock);
rdma_get_mcast_mac((struct in6_addr *)gid, mac);
if (ndev) {
rtnl_lock();
- dev_mc_delete(mdev->iboe.netdevs[ge->port - 1], mac, 6, 0);
+ dev_mc_del(ndev, mac);
rtnl_unlock();
dev_put(ndev);
}
list_del(&ge->list);
kfree(ge);
- } else
+ } else {
pr_warn("could not find mgid entry\n");
+ }
mutex_unlock(&mqp->mutex);
return ge != NULL ? 0 : -EINVAL;
}
+static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ int count = (mdev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED) ?
+ mdev->dev->caps.num_ports : 1;
+
+ return _mlx4_ib_mcg_detach(ibqp, gid, lid, count);
+}
+
+static int __mlx4_ib_mcg_detach(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+ union ib_gid *gid, enum mlx4_protocol prot, u64 reg_id)
+{
+ int err = 0;
+
+ err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, prot, reg_id);
+
+ if (err)
+ return err;
+
+ err = del_gid_entry(&mqp->ibqp, gid);
+ return err;
+}
+
static int _mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid,
int count)
{
@@ -1308,6 +2019,13 @@
struct mlx4_ib_qp *mqp = to_mqp(ibqp);
u64 reg_id = 0;
int record_err = 0;
+ int is_over_ip = mlx4_roce_is_over_ip(mdev->dev->caps.ud_gid_type);
+ enum mlx4_protocol prot =
+ (ibqp->qp_type == IB_QPT_RAW_PACKET) ? MLX4_PROT_ETH :
+ (gid->raw[1] == 0x0e && is_over_ip) ? MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6;
+
+ if (count == 0)
+ return 0;
if (mdev->dev->caps.steering_mode ==
MLX4_STEERING_MODE_DEVICE_MANAGED) {
@@ -1321,30 +2039,19 @@
if (memcmp(ib_steering->gid.raw, gid->raw, 16))
continue;
- if (--count < 0)
- break;
-
list_del(&ib_steering->list);
list_add(&ib_steering->list, &temp);
+ if (--count <= 0)
+ break;
}
mutex_unlock(&mqp->mutex);
list_for_each_entry_safe(ib_steering, tmp, &temp,
list) {
- reg_id = ib_steering->reg_id;
-
- err = mlx4_multicast_detach(mdev->dev, &mqp->mqp,
- gid->raw,
- (ibqp->qp_type == IB_QPT_RAW_PACKET) ?
- MLX4_PROT_ETH : MLX4_PROT_IB_IPV6,
- reg_id);
- if (err) {
- record_err = record_err ?: err;
- continue;
- }
-
- err = del_gid_entry(ibqp, gid);
+ reg_id = ib_steering->reg_id.flow.reg_id;
+ err = __mlx4_ib_mcg_detach(mdev, mqp,
+ gid, prot, reg_id);
if (err) {
- record_err = record_err ?: err;
+ record_err = record_err ?: err;
continue;
}
@@ -1352,9 +2059,7 @@
kfree(ib_steering);
}
mutex_lock(&mqp->mutex);
- list_for_each_entry(ib_steering, &temp, list) {
- list_add(&ib_steering->list, &mqp->steering_rules);
- }
+ list_splice(&temp, &mqp->steering_rules);
mutex_unlock(&mqp->mutex);
if (count) {
pr_warn("Couldn't release all reg_ids for mgid. Steering rule is left attached\n");
@@ -1366,15 +2071,8 @@
ibqp->qp_type == IB_QPT_RAW_PACKET)
gid->raw[5] = mqp->port;
- err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
- (ibqp->qp_type == IB_QPT_RAW_PACKET) ?
- MLX4_PROT_ETH : MLX4_PROT_IB_IPV6,
- reg_id);
- if (err)
- return err;
-
- err = del_gid_entry(ibqp, gid);
-
+ err = __mlx4_ib_mcg_detach(mdev, mqp,
+ gid, prot, reg_id);
if (err)
return err;
}
@@ -1382,91 +2080,6 @@
return record_err;
}
-static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
- struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
- int count = (mdev->dev->caps.steering_mode ==
- MLX4_STEERING_MODE_DEVICE_MANAGED) ?
- mdev->dev->caps.num_ports : 1;
-
- return _mlx4_ib_mcg_detach(ibqp, gid, lid, count);
-}
-
-static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
-{
- int err = -ENODEV;
- struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
- struct mlx4_ib_qp *mqp = to_mqp(ibqp);
- DECLARE_BITMAP(ports, MLX4_MAX_PORTS);
- int i = 0;
-
- if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 &&
- ibqp->qp_type == IB_QPT_RAW_PACKET)
- gid->raw[5] = mqp->port;
-
- if (mdev->dev->caps.steering_mode ==
- MLX4_STEERING_MODE_DEVICE_MANAGED) {
- bitmap_fill(ports, mdev->dev->caps.num_ports);
- } else {
- if (mqp->port <= mdev->dev->caps.num_ports) {
- bitmap_zero(ports, mdev->dev->caps.num_ports);
- set_bit(0, ports);
- } else {
- return -EINVAL;
- }
- }
-
- for (; i < mdev->dev->caps.num_ports; i++) {
- u64 reg_id;
- struct mlx4_ib_steering *ib_steering = NULL;
- if (!test_bit(i, ports))
- continue;
- if (mdev->dev->caps.steering_mode ==
- MLX4_STEERING_MODE_DEVICE_MANAGED) {
- ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL);
- if (!ib_steering)
- goto err_add;
- }
-
- err = mlx4_multicast_attach(mdev->dev, &mqp->mqp,
- gid->raw, i + 1,
- !!(mqp->flags &
- MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
- (ibqp->qp_type == IB_QPT_RAW_PACKET) ?
- MLX4_PROT_ETH : MLX4_PROT_IB_IPV6,
- &reg_id);
- if (err) {
- kfree(ib_steering);
- goto err_add;
- }
-
- err = add_gid_entry(ibqp, gid);
- if (err) {
- mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
- MLX4_PROT_IB_IPV6, reg_id);
- kfree(ib_steering);
- goto err_add;
- }
-
- if (ib_steering) {
- memcpy(ib_steering->gid.raw, gid->raw, 16);
- mutex_lock(&mqp->mutex);
- list_add(&ib_steering->list, &mqp->steering_rules);
- mutex_unlock(&mqp->mutex);
- ib_steering->reg_id = reg_id;
- }
- }
-
-
- return 0;
-
-err_add:
- if (i > 0)
- _mlx4_ib_mcg_detach(ibqp, gid, lid, i);
-
- return err;
-}
-
static int init_node_data(struct mlx4_ib_dev *dev)
{
struct ib_smp *in_mad = NULL;
@@ -1510,7 +2123,7 @@
{
struct mlx4_ib_dev *dev =
container_of(device, struct mlx4_ib_dev, ib_dev.dev);
- return sprintf(buf, "MT%d\n", dev->dev->pdev->device);
+ return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device);
}
static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
@@ -1540,84 +2153,72 @@
dev->dev->board_id);
}
-static ssize_t show_vsd(struct device *device, struct device_attribute *attr,
- char *buf)
-{
- struct mlx4_ib_dev *dev =
- container_of(device, struct mlx4_ib_dev, ib_dev.dev);
- ssize_t len = MLX4_VSD_LEN;
-
- if (dev->dev->vsd_vendor_id == PCI_VENDOR_ID_MELLANOX)
- len = sprintf(buf, "%.*s\n", MLX4_VSD_LEN, dev->dev->vsd);
- else
- memcpy(buf, dev->dev->vsd, MLX4_VSD_LEN);
-
- return len;
-}
-
static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
-static DEVICE_ATTR(vsd, S_IRUGO, show_vsd, NULL);
static struct device_attribute *mlx4_class_attributes[] = {
&dev_attr_hw_rev,
&dev_attr_fw_ver,
&dev_attr_hca_type,
- &dev_attr_board_id,
- &dev_attr_vsd
-};
-
-static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev, u8 port)
-{
- memcpy(eui, IF_LLADDR(dev), 3);
- memcpy(eui + 5, IF_LLADDR(dev) + 3, 3);
- if (vlan_id < 0x1000) {
- eui[3] = vlan_id >> 8;
- eui[4] = vlan_id & 0xff;
- } else {
- eui[3] = 0xff;
- eui[4] = 0xfe;
- }
- eui[0] ^= 2;
-}
+ &dev_attr_board_id
+};
-static void update_gids_task(struct work_struct *work)
+#define MLX4_IB_INVALID_MAC ((u64)-1)
+static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
+ struct net_device *dev,
+ int port)
{
- struct update_gid_work *gw = container_of(work, struct update_gid_work, work);
- struct mlx4_cmd_mailbox *mailbox;
- union ib_gid *gids;
- int err;
- struct mlx4_dev *dev = gw->dev->dev;
+ u64 new_smac = 0;
+ u64 release_mac = MLX4_IB_INVALID_MAC;
+ struct mlx4_ib_qp *qp;
+ new_smac = mlx4_mac_to_u64(IF_LLADDR(dev));
- mailbox = mlx4_alloc_cmd_mailbox(dev);
- if (IS_ERR(mailbox)) {
- pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox));
- goto free;
- }
+ atomic64_set(&ibdev->iboe.mac[port - 1], new_smac);
- gids = mailbox->buf;
- memcpy(gids, gw->gids, sizeof gw->gids);
+ /* no need to update QP1 and register the MAC in non-SRIOV */
+ if (!mlx4_is_mfunc(ibdev->dev))
+ return;
- if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) ==
- IB_LINK_LAYER_ETHERNET) {
- err = mlx4_cmd(dev, mailbox->dma,
- MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
- 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
- MLX4_CMD_WRAPPED);
+ mutex_lock(&ibdev->qp1_proxy_lock[port - 1]);
+ qp = ibdev->qp1_proxy[port - 1];
+ if (qp) {
+ int new_smac_index;
+ u64 old_smac;
+ struct mlx4_update_qp_params update_params;
- if (err)
- pr_warn("set port command failed\n");
- else
- mlx4_ib_dispatch_event(gw->dev, gw->port,
- IB_EVENT_GID_CHANGE);
+ mutex_lock(&qp->mutex);
+ old_smac = qp->pri.smac;
+ if (new_smac == old_smac)
+ goto unlock;
+
+ new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac);
+
+ if (new_smac_index < 0)
+ goto unlock;
+
+ update_params.smac_index = new_smac_index;
+ if (mlx4_update_qp(ibdev->dev, qp->mqp.qpn, MLX4_UPDATE_QP_SMAC,
+ &update_params)) {
+ release_mac = new_smac;
+ goto unlock;
+ }
+ /* if old port was zero, no mac was yet registered for this QP */
+ if (qp->pri.smac_port)
+ release_mac = old_smac;
+ qp->pri.smac = new_smac;
+ qp->pri.smac_port = port;
+ qp->pri.smac_index = new_smac_index;
}
- mlx4_free_cmd_mailbox(dev, mailbox);
-free:
- kfree(gw);
+unlock:
+ if (release_mac != MLX4_IB_INVALID_MAC)
+ mlx4_unregister_mac(ibdev->dev, port, release_mac);
+ if (qp)
+ mutex_unlock(&qp->mutex);
+ mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]);
}
static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num)
@@ -1626,323 +2227,204 @@
return mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num);
}
-static void reset_gids_task(struct work_struct *work)
+static inline enum mlx4_roce_gid_type ib_gid_type_to_mlx4_gid_type(enum ib_gid_type gid_type)
{
- struct update_gid_work *gw =
- container_of(work, struct update_gid_work, work);
- struct mlx4_cmd_mailbox *mailbox;
- union ib_gid *gids;
- int err;
- struct mlx4_dev *dev = gw->dev->dev;
-
- mailbox = mlx4_alloc_cmd_mailbox(dev);
- if (IS_ERR(mailbox)) {
- pr_warn("reset gid table failed\n");
- goto free;
- }
-
- gids = mailbox->buf;
- memcpy(gids, gw->gids, sizeof(gw->gids));
-
- if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 1) ==
- IB_LINK_LAYER_ETHERNET &&
- dev->caps.num_ports > 0) {
- err = mlx4_cmd(dev, mailbox->dma,
- MLX4_SET_PORT_GID_TABLE << 8 | 1,
- 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
- MLX4_CMD_WRAPPED);
- if (err)
- pr_warn("set port 1 command failed\n");
- }
-
- if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 2) ==
- IB_LINK_LAYER_ETHERNET &&
- dev->caps.num_ports > 1) {
- err = mlx4_cmd(dev, mailbox->dma,
- MLX4_SET_PORT_GID_TABLE << 8 | 2,
- 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
- MLX4_CMD_WRAPPED);
- if (err)
- pr_warn("set port 2 command failed\n");
+ switch (gid_type) {
+ case IB_GID_TYPE_IB:
+ return MLX4_ROCE_GID_TYPE_V1;
+ case IB_GID_TYPE_ROCE_V1_5:
+ return MLX4_ROCE_GID_TYPE_V1_5;
+ case IB_GID_TYPE_ROCE_V2:
+ return MLX4_ROCE_GID_TYPE_V2;
+ default:
+ return (enum mlx4_roce_gid_type)IB_GID_TYPE_SIZE;
}
-
- mlx4_free_cmd_mailbox(dev, mailbox);
-free:
- kfree(gw);
}
-static int update_gid_table(struct mlx4_ib_dev *dev, int port,
- union ib_gid *gid, int clear, int default_gid)
+static int mlx4_ib_modify_gid(struct ib_device *device,
+ u8 port_num, unsigned int index,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ void **context)
{
- struct update_gid_work *work;
+ struct mlx4_ib_dev *ibdev = to_mdev(device);
+ struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+ struct mlx4_port_gid_table *port_gid_table;
+ int findex = -1, found = -1;
+ int ret = 0;
+ int clear = !memcmp(&zgid, gid, sizeof(*gid));
+ int hw_update = 0;
int i;
- int need_update = 0;
- int free = -1;
- int found = -1;
- int max_gids;
- int start_index = !default_gid;
-
- max_gids = dev->dev->caps.gid_table_len[port];
- for (i = start_index; i < max_gids; ++i) {
- if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid,
- sizeof(*gid)))
- found = i;
-
- if (clear) {
- if (found >= 0) {
- need_update = 1;
- dev->iboe.gid_table[port - 1][found] = zgid;
- break;
- }
- } else {
- if (found >= 0)
- break;
-
- if (free < 0 &&
- !memcmp(&dev->iboe.gid_table[port - 1][i],
- &zgid, sizeof(*gid)))
- free = i;
- }
- }
+ struct mlx4_roce_addr_table *addr_table;
- if (found == -1 && !clear && free < 0) {
- pr_err("GID table of port %d is full. Can't add "GID_PRINT_FMT"\n",
- port, GID_PRINT_ARGS(gid));
- return -ENOMEM;
- }
- if (found == -1 && clear) {
- pr_err(GID_PRINT_FMT" is not in GID table of port %d\n", GID_PRINT_ARGS(gid), port);
+ if (ib_cache_use_roce_gid_cache(device, port_num))
return -EINVAL;
- }
- if (found == -1 && !clear && free >= 0) {
- dev->iboe.gid_table[port - 1][free] = *gid;
- need_update = 1;
- }
-
- if (!need_update)
- return 0;
-
- work = kzalloc(sizeof *work, GFP_ATOMIC);
- if (!work)
- return -ENOMEM;
-
- memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids));
- INIT_WORK(&work->work, update_gids_task);
- work->port = port;
- work->dev = dev;
- queue_work(wq, &work->work);
-
- return 0;
-}
-
-static int reset_gid_table(struct mlx4_ib_dev *dev)
-{
- struct update_gid_work *work;
+ if (port_num > MLX4_MAX_PORTS)
+ return -EINVAL;
- work = kzalloc(sizeof(*work), GFP_ATOMIC);
- if (!work)
- return -ENOMEM;
+ if (!context)
+ return -EINVAL;
- memset(dev->iboe.gid_table, 0, sizeof(dev->iboe.gid_table));
- memset(work->gids, 0, sizeof(work->gids));
- INIT_WORK(&work->work, reset_gids_task);
- work->dev = dev;
- queue_work(wq, &work->work);
- return 0;
-}
+ write_lock(&iboe->iboe_lock);
+ port_gid_table = &iboe->gid_table[port_num - 1];
-/* XXX BOND Related - stub (no support for these flags in FBSD)*/
-static inline int netif_is_bond_master(struct net_device *dev)
-{
-#if 0
- return (dev->flags & IFF_MASTER) && (dev->priv_flags & IFF_BONDING);
-#endif
- return 0;
-}
+ if (clear) {
+ struct gid_cache_context *ctx = *context;
-static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid, u8 port)
-{
- gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
- mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev, port);
-}
+ if (ctx) {
+ ctx->refcount--;
+ if (!ctx->refcount) {
+ unsigned int index = ctx->real_index;
-static u8 mlx4_ib_get_dev_port(struct net_device *dev, struct mlx4_ib_dev *ibdev)
-{
- u8 port = 0;
- struct mlx4_ib_iboe *iboe;
- struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ?
- rdma_vlan_dev_real_dev(dev) : dev;
+ memcpy(&port_gid_table->gids[index].gid, &zgid, sizeof(*gid));
+ kfree(port_gid_table->gids[index].ctx);
+ port_gid_table->gids[index].ctx = NULL;
+ hw_update = 1;
+ }
+ }
+ } else {
+ for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
+ if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid))) {
+ found = (port_gid_table->gids[i].gid_type == attr->gid_type) ? i : -1;
+ if (found >= 0)
+ break;
+ }
+ if (findex < 0 && !memcmp(&port_gid_table->gids[i].gid, &zgid, sizeof(*gid)))
+ findex = i; /* HW has space */
+ }
- iboe = &ibdev->iboe;
+ if (found < 0) {
+ if (findex < 0) {
+ ret = -ENOSPC;
+ } else {
+ port_gid_table->gids[findex].ctx = kmalloc(sizeof(*port_gid_table->gids[findex].ctx), GFP_ATOMIC);
+ if (!port_gid_table->gids[findex].ctx) {
+ ret = -ENOMEM;
+ } else {
+ *context = port_gid_table->gids[findex].ctx;
+ memcpy(&port_gid_table->gids[findex].gid, gid, sizeof(*gid));
+ port_gid_table->gids[findex].gid_type = attr->gid_type;
+ port_gid_table->gids[findex].ctx->real_index = findex;
+ port_gid_table->gids[findex].ctx->refcount = 1;
+ hw_update = 1;
+ }
+ }
+ } else {
+ struct gid_cache_context *ctx = port_gid_table->gids[found].ctx;
+ *context = ctx;
+ ctx->refcount++;
+ }
+ }
+ if (!ret && hw_update) {
+ addr_table = kmalloc(sizeof(*addr_table), GFP_ATOMIC);
+ if (!addr_table) {
+ ret = -ENOMEM;
+ } else {
+ for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) {
+ memcpy(addr_table->addr[i].gid,
+ &port_gid_table->gids[i].gid,
+ sizeof(union ib_gid));
+ addr_table->addr[i].type = ib_gid_type_to_mlx4_gid_type(
+ port_gid_table->gids[i].gid_type);
+ }
+ }
+ }
+ write_unlock(&iboe->iboe_lock);
- for (port = 1; port <= MLX4_MAX_PORTS; ++port)
- if ((netif_is_bond_master(real_dev) && (real_dev == iboe->masters[port - 1])) ||
- (!netif_is_bond_master(real_dev) && (real_dev == iboe->netdevs[port - 1])))
- break;
+ if (!ret && hw_update) {
+ ret = mlx4_update_roce_addr_table(ibdev->dev, port_num, addr_table, MLX4_CMD_WRAPPED);
+ kfree(addr_table);
+ }
- return port > MLX4_MAX_PORTS ? 0 : port;
+ return ret;
}
-static void mlx4_ib_get_dev_addr(struct net_device *dev, struct mlx4_ib_dev *ibdev, u8 port)
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+ u8 port_num, int index)
{
- struct ifaddr *ifa;
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- struct inet6_dev *in6_dev;
- union ib_gid *pgid;
- struct inet6_ifaddr *ifp;
-#endif
+ struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+ struct gid_cache_context *ctx = NULL;
union ib_gid gid;
+ struct mlx4_port_gid_table *port_gid_table;
+ int real_index = -EINVAL;
+ int i;
+ int ret;
+ struct ib_gid_attr attr;
+ if (port_num > MLX4_MAX_PORTS)
+ return -EINVAL;
- if ((port == 0) || (port > MLX4_MAX_PORTS))
- return;
-
- /* IPv4 gids */
- TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) {
- if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET){
- ipv6_addr_set_v4mapped(
- ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr.s_addr,
- (struct in6_addr *)&gid);
- update_gid_table(ibdev, port, &gid, 0, 0);
- }
-
- }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
- /* IPv6 gids */
- in6_dev = in6_dev_get(dev);
- if (in6_dev) {
- read_lock_bh(&in6_dev->lock);
- list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
- pgid = (union ib_gid *)&ifp->addr;
- update_gid_table(ibdev, port, pgid, 0, 0);
- }
- read_unlock_bh(&in6_dev->lock);
- in6_dev_put(in6_dev);
- }
-#endif
-}
+ if (ib_cache_use_roce_gid_cache(&ibdev->ib_dev, port_num))
+ return index;
-static void mlx4_set_default_gid(struct mlx4_ib_dev *ibdev,
- struct net_device *dev, u8 port)
-{
- union ib_gid gid;
- mlx4_make_default_gid(dev, &gid, port);
- update_gid_table(ibdev, port, &gid, 0, 1);
-}
+ ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, &attr);
+ if (ret)
+ return ret;
-static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev)
-{
- struct net_device *dev;
+ if (!memcmp(&gid, &zgid, sizeof(gid)))
+ return -EINVAL;
- if (reset_gid_table(ibdev))
- return -1;
+ read_lock(&iboe->iboe_lock);
+ port_gid_table = &iboe->gid_table[port_num - 1];
- IFNET_RLOCK_NOSLEEP();
- TAILQ_FOREACH(dev, &V_ifnet, if_link) {
- u8 port = mlx4_ib_get_dev_port(dev, ibdev);
- if (port) {
- if (!rdma_vlan_dev_real_dev(dev) &&
- !netif_is_bond_master(dev))
- mlx4_set_default_gid(ibdev, dev, port);
- mlx4_ib_get_dev_addr(dev, ibdev, port);
+ for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
+ if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid)) &&
+ (attr.gid_type == port_gid_table->gids[i].gid_type)) {
+ ctx = port_gid_table->gids[i].ctx;
+ break;
}
- }
-
- IFNET_RUNLOCK_NOSLEEP();
-
- return 0;
+ if (ctx)
+ real_index = ctx->real_index;
+ read_unlock(&iboe->iboe_lock);
+ return real_index;
}
static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
- struct net_device *dev, unsigned long event)
+ struct net_device *dev,
+ unsigned long event)
{
struct mlx4_ib_iboe *iboe;
+ int update_qps_port = -1;
int port;
- int init = 0;
- unsigned long flags;
iboe = &ibdev->iboe;
- spin_lock_irqsave(&iboe->lock, flags);
+ write_lock(&iboe->iboe_lock);
mlx4_foreach_ib_transport_port(port, ibdev->dev) {
- struct net_device *old_netdev = iboe->netdevs[port - 1];
-/* XXX BOND related */
-#if 0
- struct net_device *old_master = iboe->masters[port - 1];
-#endif
- iboe->masters[port - 1] = NULL;
+
iboe->netdevs[port - 1] =
mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port);
-
- if (old_netdev != iboe->netdevs[port - 1])
- init = 1;
if (dev == iboe->netdevs[port - 1] &&
- event == NETDEV_CHANGEADDR)
- init = 1;
-/* XXX BOND related */
-#if 0
- if (iboe->netdevs[port - 1] && netif_is_bond_slave(iboe->netdevs[port - 1]))
- iboe->masters[port - 1] = iboe->netdevs[port - 1]->master;
+ (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER ||
+ event == NETDEV_UP || event == NETDEV_CHANGE))
+ update_qps_port = port;
- /* if bonding is used it is possible that we add it to masters only after
- IP address is assigned to the net bonding interface */
- if (old_master != iboe->masters[port - 1])
- init = 1;
-#endif
}
- spin_unlock_irqrestore(&iboe->lock, flags);
+ write_unlock(&iboe->iboe_lock);
- if (init)
- if (mlx4_ib_init_gid_table(ibdev))
- pr_warn("Fail to reset gid table\n");
+ if (update_qps_port > 0)
+ mlx4_ib_update_qps(ibdev, dev, update_qps_port);
}
-static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event,
- void *ptr)
+static int mlx4_ib_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
{
- struct net_device *dev = ptr;
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct mlx4_ib_dev *ibdev;
- ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb);
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+ ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb);
mlx4_ib_scan_netdevs(ibdev, dev, event);
return NOTIFY_DONE;
}
-/* This function initializes the gid table only if the event_netdev real device is an iboe
- * device, will be invoked by the inet/inet6 events */
-static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event,
- void *ptr)
-{
- struct net_device *event_netdev = ptr;
- struct mlx4_ib_dev *ibdev;
- struct mlx4_ib_iboe *ibdev_iboe;
- int port = 0;
-
- ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet);
-
- struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ?
- rdma_vlan_dev_real_dev(event_netdev) :
- event_netdev;
-
- ibdev_iboe = &ibdev->iboe;
-
- port = mlx4_ib_get_dev_port(real_dev, ibdev);
-
- /* Perform init_gid_table if the event real_dev is the net_device which represents this port,
- * otherwise this event is not related and would be ignored.*/
- if(port && (real_dev == ibdev_iboe->netdevs[port - 1]))
- if (mlx4_ib_init_gid_table(ibdev))
- pr_warn("Fail to reset gid table\n");
-
- return NOTIFY_DONE;
-}
-
-
static void init_pkeys(struct mlx4_ib_dev *ibdev)
{
int port;
@@ -1950,7 +2432,8 @@
int i;
if (mlx4_is_master(ibdev->dev)) {
- for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) {
+ for (slave = 0; slave <= ibdev->dev->persist->num_vfs;
+ ++slave) {
for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) {
for (i = 0;
i < ibdev->dev->phys_caps.pkey_phys_table_len[port];
@@ -1975,84 +2458,87 @@
}
}
-static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
-{
- char name[32];
- int eq_per_port = 0;
- int added_eqs = 0;
- int total_eqs = 0;
- int i, j, eq;
-
- /* Legacy mode or comp_pool is not large enough */
- if (dev->caps.comp_pool == 0 ||
- dev->caps.num_ports > dev->caps.comp_pool)
- return;
-
- eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/
- dev->caps.num_ports);
- /* Init eq table */
- added_eqs = 0;
- mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
- added_eqs += eq_per_port;
+#define MLX4_IB_EQ_NAME_PRIORITY 1
+static void mlx4_ib_eq_cb(unsigned vector, u32 uuid, void *data)
+{
+ int err;
+ struct mlx4_ib_eq_table_entry *entry = data;
+
+ if (MLX4_EQ_UUID_TO_ID(uuid) == MLX4_EQ_ID_IB) {
+ struct mlx4_ib_dev *ibdev = entry->ibdev;
+
+ if (uuid == MLX4_EQ_ID_TO_UUID(MLX4_EQ_ID_IB, entry->port,
+ entry - ibdev->eq_table)) {
+ err = mlx4_rename_eq(ibdev->dev, entry->port, vector,
+ MLX4_IB_EQ_NAME_PRIORITY, "%s-%d",
+ ibdev->ib_dev.name,
+ (unsigned)(entry - ibdev->eq_table));
+ if (err)
+ dev_warn(&ibdev->dev->persist->pdev->dev,
+ "Failed to rename EQ, continuing with default name\n");
+ }
+ }
+}
- total_eqs = dev->caps.num_comp_vectors + added_eqs;
+static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
+{
+ int i, j, eq = 0, total_eqs = 0;
- ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL);
+ ibdev->eq_table = kcalloc(dev->caps.num_comp_vectors,
+ sizeof(ibdev->eq_table[0]), GFP_KERNEL);
if (!ibdev->eq_table)
return;
- ibdev->eq_added = added_eqs;
-
- eq = 0;
- mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) {
- for (j = 0; j < eq_per_port; j++) {
- sprintf(name, "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j,
- pci_get_domain(dev->pdev->dev.bsddev),
- pci_get_bus(dev->pdev->dev.bsddev),
- PCI_SLOT(dev->pdev->devfn),
- PCI_FUNC(dev->pdev->devfn));
-
+ for (i = 1; i <= dev->caps.num_ports; i++) {
+ for (j = 0; j < mlx4_get_eqs_per_port(dev, i);
+ j++, total_eqs++) {
+ if (i > 1 && mlx4_is_eq_shared(dev, total_eqs))
+ continue;
/* Set IRQ for specific name (per ring) */
- if (mlx4_assign_eq(dev, name,
- &ibdev->eq_table[eq])) {
- /* Use legacy (same as mlx4_en driver) */
- pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq);
- ibdev->eq_table[eq] =
- (eq % dev->caps.num_comp_vectors);
- }
- eq++;
+ ibdev->eq_table[eq].vector = total_eqs;
+ ibdev->eq_table[eq].ibdev = ibdev;
+ ibdev->eq_table[eq].port = i;
+ if (!mlx4_assign_eq(dev, i,
+ MLX4_EQ_ID_TO_UUID(MLX4_EQ_ID_IB,
+ i, eq),
+ mlx4_ib_eq_cb,
+ &ibdev->eq_table[eq],
+ &ibdev->eq_table[eq].vector))
+ eq++;
+ else
+ ibdev->eq_table[eq].vector = -1;
}
}
- /* Fill the reset of the vector with legacy EQ */
- for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++)
- ibdev->eq_table[eq++] = i;
+ for (i = eq; i < dev->caps.num_comp_vectors;
+ ibdev->eq_table[i++].vector = -1)
+ ;
/* Advertise the new number of EQs to clients */
- ibdev->ib_dev.num_comp_vectors = total_eqs;
+ ibdev->ib_dev.num_comp_vectors = eq;
}
static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
{
int i;
+ int total_eqs = ibdev->ib_dev.num_comp_vectors;
- /* no additional eqs were added */
+ /* no eqs were allocated */
if (!ibdev->eq_table)
return;
/* Reset the advertised EQ number */
- ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
+ ibdev->ib_dev.num_comp_vectors = 0;
- /* Free only the added eqs */
- for (i = 0; i < ibdev->eq_added; i++) {
- /* Don't free legacy eqs if used */
- if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors)
- continue;
- mlx4_release_eq(dev, ibdev->eq_table[i]);
- }
+ for (i = 0; i < total_eqs; i++)
+ mlx4_release_eq(dev, MLX4_EQ_ID_TO_UUID(MLX4_EQ_ID_IB,
+ ibdev->eq_table[i].port,
+ i),
+ ibdev->eq_table[i].vector);
kfree(ibdev->eq_table);
+ ibdev->eq_table = NULL;
}
/*
@@ -2211,7 +2697,7 @@
return -1;
if (!dev)
return -1;
- if (mlx4_get_val(dev_assign_str.tbl, dev->pdev, 0, &val))
+ if (mlx4_get_val(dev_assign_str.tbl, dev->persist->pdev, 0, &val))
return -1;
if (val != DEFAULT_TBL_VAL) {
@@ -2231,14 +2717,17 @@
static void *mlx4_ib_add(struct mlx4_dev *dev)
{
struct mlx4_ib_dev *ibdev;
- int num_ports = 0;
+ int num_ports;
int i, j;
int err;
struct mlx4_ib_iboe *iboe;
+ int ib_num_ports = 0;
+ int num_req_counters;
int dev_idx;
- pr_info_once("%s", mlx4_ib_version);
+ pr_info_once("%s", mlx4_ib_version);
+ num_ports = 0;
mlx4_foreach_ib_transport_port(i, dev)
num_ports++;
@@ -2248,7 +2737,8 @@
ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
if (!ibdev) {
- dev_err(&dev->pdev->dev, "Device struct alloc failed\n");
+ dev_err(&dev->persist->pdev->dev,
+ "Device struct alloc failed\n");
return NULL;
}
@@ -2269,20 +2759,21 @@
MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);
ibdev->dev = dev;
+ ibdev->bond_next_port = 0;
dev_idx = mlx4_ib_dev_idx(dev);
if (dev_idx >= 0)
sprintf(ibdev->ib_dev.name, "mlx4_%d", dev_idx);
else
strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
-
ibdev->ib_dev.owner = THIS_MODULE;
ibdev->ib_dev.node_type = RDMA_NODE_IB_CA;
ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey;
ibdev->num_ports = num_ports;
- ibdev->ib_dev.phys_port_cnt = ibdev->num_ports;
+ ibdev->ib_dev.phys_port_cnt = mlx4_is_bonded(dev) ?
+ 1 : ibdev->num_ports;
ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
- ibdev->ib_dev.dma_device = &dev->pdev->dev;
+ ibdev->ib_dev.dma_device = &dev->persist->pdev->dev;
if (dev->caps.userspace_caps)
ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
@@ -2296,6 +2787,7 @@
(1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
(1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
(1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_REREG_MR) |
(1ull << IB_USER_VERBS_CMD_DEREG_MR) |
(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
(1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
@@ -2352,16 +2844,19 @@
ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq;
ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr;
ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr;
+ ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr;
ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr;
+ ibdev->ib_dev.query_values = mlx4_ib_query_values;
ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr;
+ ibdev->ib_dev.ioctl = mlx4_ib_ioctl;
ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list;
ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach;
ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach;
ibdev->ib_dev.process_mad = mlx4_ib_process_mad;
ibdev->ib_dev.get_netdev = mlx4_ib_get_netdev;
- ibdev->ib_dev.ioctl = mlx4_ib_ioctl;
- ibdev->ib_dev.query_values = mlx4_ib_query_values;
+ ibdev->ib_dev.modify_gid = mlx4_ib_modify_gid;
+ ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext;
if (!mlx4_is_slave(ibdev->dev)) {
ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
@@ -2370,7 +2865,8 @@
ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc;
}
- if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) {
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
+ dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw;
ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw;
ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw;
@@ -2392,59 +2888,74 @@
* Set experimental data
*/
ibdev->ib_dev.uverbs_exp_cmd_mask =
+ (1ull << IB_USER_VERBS_EXP_CMD_MODIFY_QP) |
(1ull << IB_USER_VERBS_EXP_CMD_CREATE_QP) |
(1ull << IB_USER_VERBS_EXP_CMD_MODIFY_CQ) |
(1ull << IB_USER_VERBS_EXP_CMD_QUERY_DEVICE) |
- (1ull << IB_USER_VERBS_EXP_CMD_CREATE_CQ);
+ (1ull << IB_USER_VERBS_EXP_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_EXP_CMD_REREG_MR) |
+ (1ull << IB_USER_VERBS_EXP_CMD_CREATE_FLOW);
ibdev->ib_dev.exp_create_qp = mlx4_ib_exp_create_qp;
ibdev->ib_dev.exp_query_device = mlx4_ib_exp_query_device;
+ ibdev->ib_dev.exp_rereg_user_mr = mlx4_ib_exp_rereg_user_mr;
+
if (check_flow_steering_support(dev)) {
+ ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED;
+ ibdev->ib_dev.create_flow = mlx4_ib_create_flow;
+ ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow;
+
ibdev->ib_dev.uverbs_ex_cmd_mask |=
(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
- ibdev->ib_dev.create_flow = mlx4_ib_create_flow;
- ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow;
- } else {
- pr_debug("Device managed flow steering is unavailable for this configuration.\n");
}
- /*
- * End of experimental data
- */
mlx4_ib_alloc_eqs(dev, ibdev);
- spin_lock_init(&iboe->lock);
+ rwlock_init(&iboe->iboe_lock);
if (init_node_data(ibdev))
goto err_map;
-
- for (i = 0; i < ibdev->num_ports; ++i) {
- if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
- IB_LINK_LAYER_ETHERNET) {
- if (mlx4_is_slave(dev)) {
+ mlx4_init_sl2vl_tbl(ibdev);
+
+ num_req_counters = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports;
+ for (i = 0; i < num_req_counters; ++i) {
+ mutex_init(&ibdev->qp1_proxy_lock[i]);
+ if (mlx4_is_slave(dev)) {
+ /* the slave asks for another counter index if one exists (RoCE, VMA, DPDK) */
+ if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
+ IB_LINK_LAYER_ETHERNET) {
ibdev->counters[i].status = mlx4_counter_alloc(ibdev->dev,
i + 1,
&ibdev->counters[i].counter_index);
- } else {/* allocating the PF IB default counter indices reserved in mlx4_init_counters_table */
- ibdev->counters[i].counter_index = ((i + 1) << 1) - 1;
+ } else {
+ ibdev->counters[i].counter_index = ibdev->dev->caps.def_counter_index[i];
ibdev->counters[i].status = 0;
}
-
- dev_info(&dev->pdev->dev,
- "%s: allocated counter index %d for port %d\n",
- __func__, ibdev->counters[i].counter_index, i+1);
- } else {
- ibdev->counters[i].counter_index = MLX4_SINK_COUNTER_INDEX;
- ibdev->counters[i].status = -ENOSPC;
+ } else {/* allocating the PF IB default counter indices reserved in mlx4_init_counters_table */
+ ibdev->counters[i].counter_index = ((i + 1) << 1) - 1;
+ ibdev->counters[i].status = 0;
}
+
+ dev_info(&dev->persist->pdev->dev,
+ "%s: allocated counter index %d for port %d\n",
+ __func__, ibdev->counters[i].counter_index, i+1);
}
+ if (mlx4_is_bonded(dev))
+ for (i = 1; i < ibdev->num_ports; ++i)
+ ibdev->counters[i] = ibdev->counters[0];
+
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ ib_num_ports++;
spin_lock_init(&ibdev->sm_lock);
mutex_init(&ibdev->cap_mask_mutex);
+ INIT_LIST_HEAD(&ibdev->qp_list);
+ spin_lock_init(&ibdev->reset_flow_resource_lock);
- if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED &&
+ if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED &&
!mlx4_is_mfunc(dev)) {
- ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS;
+ ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS > (dev->caps.num_qps >> 1) ?
+ (dev->caps.num_qps >> 1) : MLX4_IB_UC_MAX_NUM_QPS;
err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count,
MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0);
if (err)
@@ -2455,53 +2966,67 @@
sizeof(long),
GFP_KERNEL);
if (!ibdev->ib_uc_qpns_bitmap) {
- dev_err(&dev->pdev->dev, "bit map alloc failed\n");
+ dev_err(&dev->persist->pdev->dev,
+ "bit map alloc failed\n");
goto err_steer_qp_release;
}
- bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count);
-
- err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(dev, ibdev->steer_qpn_base,
- ibdev->steer_qpn_base + ibdev->steer_qpn_count - 1);
- if (err)
- goto err_steer_free_bitmap;
+ if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB) {
+ bitmap_zero(ibdev->ib_uc_qpns_bitmap,
+ ibdev->steer_qpn_count);
+ err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(
+ dev, ibdev->steer_qpn_base,
+ ibdev->steer_qpn_base +
+ ibdev->steer_qpn_count - 1);
+ if (err)
+ goto err_steer_free_bitmap;
+ } else {
+ bitmap_fill(ibdev->ib_uc_qpns_bitmap,
+ ibdev->steer_qpn_count);
+ }
}
+ memset(iboe->gid_table, 0, sizeof(struct mlx4_port_gid_table) * MLX4_MAX_PORTS);
+ for (j = 1; j <= ibdev->dev->caps.num_ports; j++)
+ atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]);
+
if (ib_register_device(&ibdev->ib_dev, NULL))
goto err_steer_free_bitmap;
+ for (j = 0; j < ibdev->ib_dev.num_comp_vectors; j++)
+ if (mlx4_rename_eq(dev, ibdev->eq_table[j].port,
+ ibdev->eq_table[j].vector,
+ MLX4_IB_EQ_NAME_PRIORITY,
+ "%s-%d", ibdev->ib_dev.name,
+ ibdev->eq_table[j].vector))
+ dev_warn(&dev->persist->pdev->dev,
+ "Failed to rename EQ %d, continuing with default name\n",
+ j);
+
if (mlx4_ib_mad_init(ibdev))
goto err_reg;
if (mlx4_ib_init_sriov(ibdev))
goto err_mad;
- if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) {
+ if (dev->caps.roce_mode != MLX4_ROCE_MODE_INVALID) {
if (!iboe->nb.notifier_call) {
iboe->nb.notifier_call = mlx4_ib_netdev_event;
err = register_netdevice_notifier(&iboe->nb);
if (err) {
iboe->nb.notifier_call = NULL;
- goto err_notify;
- }
- }
- if (!iboe->nb_inet.notifier_call) {
- iboe->nb_inet.notifier_call = mlx4_ib_inet_event;
- err = register_inetaddr_notifier(&iboe->nb_inet);
- if (err) {
- iboe->nb_inet.notifier_call = NULL;
- goto err_notify;
+ goto err_notif;
}
}
- mlx4_ib_scan_netdevs(ibdev, NULL, 0);
}
+
for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
if (device_create_file(&ibdev->ib_dev.dev,
mlx4_class_attributes[j]))
- goto err_notify;
+ goto err_notif;
}
if (sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group))
- goto err_notify;
+ goto err_notif;
ibdev->ib_active = true;
@@ -2519,22 +3044,12 @@
}
return ibdev;
-err_notify:
- for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
- device_remove_file(&ibdev->ib_dev.dev,
- mlx4_class_attributes[j]);
- }
-
+err_notif:
if (ibdev->iboe.nb.notifier_call) {
if (unregister_netdevice_notifier(&ibdev->iboe.nb))
pr_warn("failure unregistering notifier\n");
ibdev->iboe.nb.notifier_call = NULL;
}
- if (ibdev->iboe.nb_inet.notifier_call) {
- if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
- pr_warn("failure unregistering notifier\n");
- ibdev->iboe.nb_inet.notifier_call = NULL;
- }
flush_workqueue(wq);
mlx4_ib_close_sriov(ibdev);
@@ -2549,22 +3064,21 @@
kfree(ibdev->ib_uc_qpns_bitmap);
err_steer_qp_release:
- if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED)
+ if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED)
mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
- ibdev->steer_qpn_count);
+ ibdev->steer_qpn_count);
err_counter:
- for (; i; --i) {
- if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i) ==
+ for (i = 0; i < num_req_counters; i++) {
+ if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
IB_LINK_LAYER_ETHERNET) {
mlx4_counter_free(ibdev->dev,
- i,
- ibdev->counters[i - 1].counter_index);
+ i + 1,
+ ibdev->counters[i].counter_index);
}
}
err_map:
iounmap(ibdev->priv_uar.map);
- mlx4_ib_free_eqs(dev, ibdev);
err_uar:
mlx4_uar_free(dev, &ibdev->priv_uar);
@@ -2597,13 +3111,14 @@
void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count)
{
if (!qpn ||
- dev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED)
+ dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED)
return;
BUG_ON(qpn < dev->steer_qpn_base);
bitmap_release_region(dev->ib_uc_qpns_bitmap,
- qpn - dev->steer_qpn_base, get_count_order(count));
+ qpn - dev->steer_qpn_base,
+ get_count_order(count));
}
int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
@@ -2626,15 +3141,15 @@
ib_spec = (struct ib_flow_spec_ib *)(flow + 1);
ib_spec->type = IB_FLOW_SPEC_IB;
ib_spec->size = sizeof(struct ib_flow_spec_ib);
- ib_spec->val.l3_type_qpn = mqp->ibqp.qp_num;
- ib_spec->mask.l3_type_qpn = MLX4_IB_FLOW_QPN_MASK;
-
+ ib_spec->val.l3_type_qpn =
+ cpu_to_be32(mqp->ibqp.qp_num & MLX4_IB_FLOW_QPN_MASK);
+ ib_spec->mask.l3_type_qpn = cpu_to_be32(MLX4_IB_FLOW_QPN_MASK);
err = __mlx4_ib_create_flow(&mqp->ibqp, flow,
IB_FLOW_DOMAIN_NIC,
MLX4_FS_REGULAR,
- &mqp->reg_id);
+ &mqp->flow);
} else {
- err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id);
+ err = __mlx4_ib_destroy_flow(mdev->dev, &mqp->flow);
}
kfree(flow);
return err;
@@ -2643,25 +3158,15 @@
static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
{
struct mlx4_ib_dev *ibdev = ibdev_ptr;
- int p, j;
+ int p;
int dev_idx, ret;
- if (ibdev->iboe.nb_inet.notifier_call) {
- if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
- pr_warn("failure unregistering notifier\n");
- ibdev->iboe.nb_inet.notifier_call = NULL;
- }
+ ibdev->ib_active = false;
+ flush_workqueue(wq);
mlx4_ib_close_sriov(ibdev);
sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group);
mlx4_ib_mad_cleanup(ibdev);
-
- for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
- device_remove_file(&ibdev->ib_dev.dev,
- mlx4_class_attributes[j]);
- }
-
-
dev_idx = -1;
if (dr_active && !(ibdev->dev->flags & MLX4_FLAG_DEV_NUM_STR)) {
ret = sscanf(ibdev->ib_dev.name, "mlx4_%d", &dev_idx);
@@ -2675,19 +3180,19 @@
spin_unlock(&dev_num_str_lock);
}
- if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
- mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
- ibdev->steer_qpn_count);
- kfree(ibdev->ib_uc_qpns_bitmap);
- }
-
if (ibdev->iboe.nb.notifier_call) {
if (unregister_netdevice_notifier(&ibdev->iboe.nb))
pr_warn("failure unregistering notifier\n");
ibdev->iboe.nb.notifier_call = NULL;
}
- iounmap(ibdev->priv_uar.map);
+ if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+ mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
+ ibdev->steer_qpn_count);
+ kfree(ibdev->ib_uc_qpns_bitmap);
+ }
+
+ iounmap(ibdev->priv_uar.map);
for (p = 0; p < ibdev->num_ports; ++p) {
if (mlx4_ib_port_link_layer(&ibdev->ib_dev, p + 1) ==
IB_LINK_LAYER_ETHERNET) {
@@ -2713,17 +3218,24 @@
struct mlx4_dev *dev = ibdev->dev;
int i;
unsigned long flags;
+ struct mlx4_active_ports actv_ports;
+ unsigned int ports;
+ unsigned int first_port;
if (!mlx4_is_master(dev))
return;
- dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC);
+ actv_ports = mlx4_get_active_ports(dev, slave);
+ ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+ first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports);
+
+ dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC);
if (!dm) {
pr_err("failed to allocate memory for tunneling qp update\n");
goto out;
}
- for (i = 0; i < dev->caps.num_ports; i++) {
+ for (i = 0; i < ports; i++) {
dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC);
if (!dm[i]) {
pr_err("failed to allocate memory for tunneling qp update work struct\n");
@@ -2735,9 +3247,9 @@
}
}
/* initialize or tear down tunnel QPs for the slave */
- for (i = 0; i < dev->caps.num_ports; i++) {
+ for (i = 0; i < ports; i++) {
INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work);
- dm[i]->port = i + 1;
+ dm[i]->port = first_port + i + 1;
dm[i]->slave = slave;
dm[i]->do_init = do_init;
dm[i]->dev = ibdev;
@@ -2747,11 +3259,149 @@
spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags);
}
out:
- if (dm)
- kfree(dm);
+ kfree(dm);
return;
}
+static void mlx4_ib_handle_catas_error(struct mlx4_ib_dev *ibdev)
+{
+ struct mlx4_ib_qp *mqp;
+ unsigned long flags_qp;
+ unsigned long flags_cq;
+ struct mlx4_ib_cq *send_mcq, *recv_mcq;
+ struct list_head cq_notify_list;
+ struct mlx4_cq *mcq;
+ unsigned long flags;
+
+ pr_warn("mlx4_ib_handle_catas_error was started\n");
+ INIT_LIST_HEAD(&cq_notify_list);
+
+ /* Go over the qp list residing on that ibdev, sync with create/destroy qp. */
+ spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
+
+ list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
+ spin_lock_irqsave(&mqp->sq.lock, flags_qp);
+ if (mqp->sq.tail != mqp->sq.head) {
+ send_mcq = to_mcq(mqp->ibqp.send_cq);
+ spin_lock_irqsave(&send_mcq->lock, flags_cq);
+ if (send_mcq->mcq.comp &&
+ mqp->ibqp.send_cq->comp_handler) {
+ if (!send_mcq->mcq.reset_notify_added) {
+ send_mcq->mcq.reset_notify_added = 1;
+ list_add_tail(&send_mcq->mcq.reset_notify,
+ &cq_notify_list);
+ }
+ }
+ spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
+ }
+ spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
+ /* Now, handle the QP's receive queue */
+ spin_lock_irqsave(&mqp->rq.lock, flags_qp);
+ /* no handling is needed for SRQ */
+ if (!mqp->ibqp.srq) {
+ if (mqp->rq.tail != mqp->rq.head) {
+ recv_mcq = to_mcq(mqp->ibqp.recv_cq);
+ spin_lock_irqsave(&recv_mcq->lock, flags_cq);
+ if (recv_mcq->mcq.comp &&
+ mqp->ibqp.recv_cq->comp_handler) {
+ if (!recv_mcq->mcq.reset_notify_added) {
+ recv_mcq->mcq.reset_notify_added = 1;
+ list_add_tail(&recv_mcq->mcq.reset_notify,
+ &cq_notify_list);
+ }
+ }
+ spin_unlock_irqrestore(&recv_mcq->lock,
+ flags_cq);
+ }
+ }
+ spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
+ }
+
+ list_for_each_entry(mcq, &cq_notify_list, reset_notify) {
+ mcq->comp(mcq);
+ }
+ spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
+ pr_warn("mlx4_ib_handle_catas_error ended\n");
+}
+
+static void handle_bonded_port_state_event(struct work_struct *work)
+{
+ struct ib_event_work *ew =
+ container_of(work, struct ib_event_work, work);
+ struct mlx4_ib_dev *ibdev = ew->ib_dev;
+ enum ib_port_state bonded_port_state = IB_PORT_NOP;
+ int i;
+ struct ib_event ibev;
+
+ kfree(ew);
+
+ read_lock(&ibdev->iboe.iboe_lock);
+ for (i = 0; i < MLX4_MAX_PORTS; ++i) {
+ struct net_device *curr_netdev = ibdev->iboe.netdevs[i];
+ enum ib_port_state curr_port_state;
+
+ if (!curr_netdev)
+ continue;
+
+ curr_port_state =
+ (netif_running(curr_netdev) &&
+ netif_carrier_ok(curr_netdev)) ?
+ IB_PORT_ACTIVE : IB_PORT_DOWN;
+
+ bonded_port_state = (bonded_port_state != IB_PORT_ACTIVE) ?
+ curr_port_state : IB_PORT_ACTIVE;
+ }
+ read_unlock(&ibdev->iboe.iboe_lock);
+
+ ibev.device = &ibdev->ib_dev;
+ ibev.element.port_num = 1;
+ ibev.event = (bonded_port_state == IB_PORT_ACTIVE) ?
+ IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+
+ ib_dispatch_event(&ibev);
+}
+
+void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port)
+{
+ u64 sl2vl;
+ int err;
+
+ err = mlx4_ib_query_sl2vl(&mdev->ib_dev, port, &sl2vl);
+ if (err) {
+ pr_err("Unable to get current sl to vl mapping for port %d. Using all zeroes (%d)\n",
+ port, err);
+ sl2vl = 0;
+ }
+ atomic64_set(&mdev->sl2vl[port - 1], sl2vl);
+}
+
+static void ib_sl2vl_update_work(struct work_struct *work)
+{
+ struct ib_event_work *ew = container_of(work, struct ib_event_work, work);
+ struct mlx4_ib_dev *mdev = ew->ib_dev;
+ int port = ew->port;
+
+ mlx4_ib_sl2vl_update(mdev, port);
+
+ kfree(ew);
+}
+
+void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev,
+ int port)
+{
+ struct ib_event_work *ew;
+
+ ew = kmalloc(sizeof(*ew), GFP_ATOMIC);
+ if (ew) {
+ INIT_WORK(&ew->work, ib_sl2vl_update_work);
+ ew->port = port;
+ ew->ib_dev = ibdev;
+ queue_work(wq, &ew->work);
+ } else {
+ pr_err("failed to allocate memory for sl2vl update work\n");
+ }
+}
+
static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
enum mlx4_dev_event event, unsigned long param)
{
@@ -2761,6 +3411,18 @@
struct ib_event_work *ew;
int p = 0;
+ if (mlx4_is_bonded(dev) &&
+ ((event == MLX4_DEV_EVENT_PORT_UP) ||
+ (event == MLX4_DEV_EVENT_PORT_DOWN))) {
+ ew = kmalloc(sizeof(*ew), GFP_ATOMIC);
+ if (!ew)
+ return;
+ INIT_WORK(&ew->work, handle_bonded_port_state_event);
+ ew->ib_dev = ibdev;
+ queue_work(wq, &ew->work);
+ return;
+ }
+
if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE)
eqe = (struct mlx4_eqe *)param;
else
@@ -2770,27 +3432,28 @@
case MLX4_DEV_EVENT_PORT_UP:
if (p > ibdev->num_ports)
return;
- if (mlx4_is_master(dev) &&
+ if (!mlx4_is_slave(dev) &&
rdma_port_get_link_layer(&ibdev->ib_dev, p) ==
IB_LINK_LAYER_INFINIBAND) {
- mlx4_ib_invalidate_all_guid_record(ibdev, p);
+ if (mlx4_is_master(dev))
+ mlx4_ib_invalidate_all_guid_record(ibdev, p);
+ if (ibdev->dev->flags & MLX4_FLAG_SECURE_HOST &&
+ !(ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT))
+ mlx4_sched_ib_sl2vl_update_work(ibdev, p);
}
- mlx4_ib_info((struct ib_device *) ibdev_ptr,
- "Port %d logical link is up\n", p);
ibev.event = IB_EVENT_PORT_ACTIVE;
break;
case MLX4_DEV_EVENT_PORT_DOWN:
if (p > ibdev->num_ports)
return;
- mlx4_ib_info((struct ib_device *) ibdev_ptr,
- "Port %d logical link is down\n", p);
ibev.event = IB_EVENT_PORT_ERR;
break;
case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
ibdev->ib_active = false;
ibev.event = IB_EVENT_DEVICE_FATAL;
+ mlx4_ib_handle_catas_error(ibdev);
break;
case MLX4_DEV_EVENT_PORT_MGMT_CHANGE:
@@ -2813,9 +3476,31 @@
case MLX4_DEV_EVENT_SLAVE_INIT:
/* here, p is the slave id */
do_slave_init(ibdev, p, 1);
+ if (mlx4_is_master(dev)) {
+ int i;
+
+ for (i = 1; i <= ibdev->num_ports; i++) {
+ if (rdma_port_get_link_layer(&ibdev->ib_dev, i)
+ == IB_LINK_LAYER_INFINIBAND)
+ mlx4_ib_slave_alias_guid_event(ibdev,
+ p, i,
+ 1);
+ }
+ }
return;
case MLX4_DEV_EVENT_SLAVE_SHUTDOWN:
+ if (mlx4_is_master(dev)) {
+ int i;
+
+ for (i = 1; i <= ibdev->num_ports; i++) {
+ if (rdma_port_get_link_layer(&ibdev->ib_dev, i)
+ == IB_LINK_LAYER_INFINIBAND)
+ mlx4_ib_slave_alias_guid_event(ibdev,
+ p, i,
+ 0);
+ }
+ }
/* here, p is the slave id */
do_slave_init(ibdev, p, 0);
return;
@@ -2825,7 +3510,7 @@
}
ibev.device = ibdev_ptr;
- ibev.element.port_num = (u8) p;
+ ibev.element.port_num = mlx4_is_bonded(ibdev->dev) ? 1 : (u8)p;
ib_dispatch_event(&ibev);
}
@@ -2834,7 +3519,8 @@
.add = mlx4_ib_add,
.remove = mlx4_ib_remove,
.event = mlx4_ib_event,
- .protocol = MLX4_PROT_IB_IPV6
+ .protocol = MLX4_PROT_IB_IPV6,
+ .flags = MLX4_INTFF_BONDING
};
static int __init mlx4_ib_init(void)
@@ -2845,9 +3531,10 @@
if (!wq)
return -ENOMEM;
+
err = mlx4_ib_mcg_init();
if (err)
- goto clean_proc;
+ goto clean_wq;
init_dev_assign();
@@ -2860,7 +3547,7 @@
clean_mcg:
mlx4_ib_mcg_destroy();
-clean_proc:
+clean_wq:
destroy_workqueue(wq);
return err;
}
Index: sys/ofed/drivers/infiniband/hw/mlx4/mcg.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/mcg.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/mcg.c
@@ -206,17 +206,19 @@
{
struct mlx4_ib_dev *dev = ctx->dev;
struct ib_ah_attr ah_attr;
+ unsigned long flags;
- spin_lock(&dev->sm_lock);
+ spin_lock_irqsave(&dev->sm_lock, flags);
if (!dev->sm_ah[ctx->port - 1]) {
/* port is not yet Active, sm_ah not ready */
- spin_unlock(&dev->sm_lock);
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
return -EAGAIN;
}
mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
- spin_unlock(&dev->sm_lock);
- return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev), ctx->port,
- IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, 0, mad);
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+ return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev),
+ ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY,
+ &ah_attr, NULL, 0xffff, mad);
}
static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx,
Index: sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.h
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.h
+++ sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.h
@@ -42,5 +42,7 @@
struct ib_udata *udata);
int mlx4_ib_exp_query_device(struct ib_device *ibdev,
struct ib_exp_device_attr *props);
-
+int mlx4_ib_exp_rereg_user_mr(struct ib_mr *mr, int flags,
+ u64 start, u64 length, u64 virt_addr,
+ int mr_access_flags, struct ib_pd *pd);
#endif /* MLX4_EXP_H */
Index: sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/mlx4_exp.c
@@ -34,6 +34,7 @@
#include "mlx4_ib.h"
#include "mlx4_exp.h"
#include <linux/mlx4/qp.h>
+#include <linux/mlx4/driver.h>
int mlx4_ib_exp_query_device(struct ib_device *ibdev,
struct ib_exp_device_attr *props)
@@ -56,6 +57,24 @@
IB_DEVICE_UD_RSS |
IB_DEVICE_UD_TSS);
+ if (base->device_cap_flags & IB_DEVICE_ROCE_MODE_1_5)
+ props->device_cap_flags2 |= IB_EXP_DEVICE_ROCE_MODE_1_5;
+ if (base->device_cap_flags & IB_DEVICE_ROCE_MODE_2)
+ props->device_cap_flags2 |= IB_EXP_DEVICE_ROCE_MODE_2;
+
+ if (dev->dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_NONE)
+ props->device_cap_flags2 |= IB_EXP_DEVICE_VXLAN_SUPPORT;
+
+ /* Only ConnectX-3 Pro reports csum for now. Can add ConnectX-3 later */
+ if (dev->dev->caps.rx_checksum_flags_port[1] &
+ MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP)
+ props->device_cap_flags2 |= (IB_EXP_DEVICE_RX_CSUM_IP_PKT |
+ IB_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT);
+ if (dev->dev->caps.rx_checksum_flags_port[2] &
+ MLX4_RX_CSUM_MODE_IP_OK_IP_NON_TCP_UDP)
+ props->device_cap_flags2 |= (IB_EXP_DEVICE_RX_CSUM_IP_PKT |
+ IB_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT);
+
if (base->max_rss_tbl_sz > 0) {
props->max_rss_tbl_sz = base->max_rss_tbl_sz;
props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_RSS_TBL_SZ;
@@ -64,8 +83,21 @@
props->exp_comp_mask &= ~IB_EXP_DEVICE_ATTR_RSS_TBL_SZ;
}
- if (props->device_cap_flags2)
- props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2;
+ props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2;
+
+ props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_EXT_ATOMIC_ARGS;
+ props->atomic_arg_sizes = 1 << 3;
+ props->max_fa_bit_boudary = 64;
+ props->log_max_atomic_inline_arg = 3;
+ props->device_cap_flags2 |= IB_EXP_DEVICE_EXT_ATOMICS;
+
+ props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN;
+ props->max_ctx_res_domain = MLX4_IB_MAX_CTX_UARS * dev->dev->caps.bf_regs_per_page;
+
+ props->exp_comp_mask |= IB_EXP_DEVICE_ATTR_MAX_DEVICE_CTX;
+
+ /* mlx4_core uses 1 UAR */
+ props->max_device_ctx = dev->dev->caps.num_uars - dev->dev->caps.reserved_uars - 1;
return ret;
}
@@ -83,6 +115,10 @@
int use_inlr;
struct mlx4_ib_dev *dev;
+ if ((init_attr->create_flags & IB_QP_CREATE_ATOMIC_BE_REPLY) &&
+ mlx4_is_little_endian())
+ return ERR_PTR(-EINVAL);
+
if (init_attr->max_inl_recv && !udata)
return ERR_PTR(-EINVAL);
Index: sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ sys/ofed/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -41,6 +41,8 @@
#include <linux/rbtree.h>
#include <linux/notifier.h>
+#include <asm/atomic64.h>
+
#include <rdma/ib_verbs.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_mad.h>
@@ -59,9 +61,6 @@
#define mlx4_ib_warn(ibdev, format, arg...) \
dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg)
-#define mlx4_ib_info(ibdev, format, arg...) \
- dev_info((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg)
-
enum {
MLX4_IB_SQ_MIN_WQE_SHIFT = 6,
MLX4_IB_MAX_HEADROOM = 2048
@@ -77,15 +76,43 @@
#define MLX4_IB_UC_STEER_QPN_ALIGN 1
#define MLX4_IB_UC_MAX_NUM_QPS (256 * 1024)
+enum hw_bar_type {
+ HW_BAR_BF,
+ HW_BAR_DB,
+ HW_BAR_CLOCK,
+ HW_BAR_COUNT
+};
+
+struct mlx4_ib_vma_private_data {
+ struct vm_area_struct *vma;
+};
+
#define MLX4_IB_MMAP_CMD_MASK 0xFF
#define MLX4_IB_MMAP_CMD_BITS 8
+/*
+ * Set MLX4_IB_MAX_CTX_UARS to 256; each UAR is related to 8 BFs.
+ * This provides a maximum of 256 * 8 = 2048 BFs.
+ * To gain performance we may need a BF per core, which means we can support
+ * up to 2048 cores with a dedicated BF per context.
+ */
+#define MLX4_IB_MAX_CTX_UARS 256
+
+struct mlx4_ib_user_uar {
+ struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT];
+ struct mlx4_uar uar;
+ int user_idx;
+ struct list_head list;
+};
struct mlx4_ib_ucontext {
struct ib_ucontext ibucontext;
struct mlx4_uar uar;
struct list_head db_page_list;
struct mutex db_page_mutex;
+ struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT];
+ struct list_head user_uar_list;
+ struct mutex user_uar_mutex;
};
struct mlx4_ib_pd {
@@ -114,6 +141,8 @@
struct mlx4_shared_mr_info {
int mr_id;
struct ib_umem *umem;
+ long long counter;
+ int counter_used;
};
struct mlx4_ib_cq {
@@ -127,6 +156,10 @@
struct ib_umem *umem;
struct ib_umem *resize_umem;
int create_flags;
+ /* List of QPs that this CQ serves. */
+ struct list_head send_qp_list;
+ struct list_head recv_qp_list;
+ int vector;
};
struct mlx4_ib_mr {
@@ -154,10 +187,24 @@
struct mlx4_fmr mfmr;
};
+#define MAX_REGS_PER_FLOW 2
+
+struct mlx4_ib_hw_flow {
+ u64 reg_id;
+ /* dst_mac is used to register the MAC if L2 Ethernet is used */
+ u64 dst_mac;
+ u8 port;
+};
+
+struct mlx4_flow_reg_id {
+ struct mlx4_ib_hw_flow flow;
+ struct mlx4_ib_hw_flow mirror;
+};
+
struct mlx4_ib_flow {
struct ib_flow ibflow;
/* translating DMFS verbs sniffer rule to FW API requires two reg IDs */
- u64 reg_id[2];
+ struct mlx4_flow_reg_id reg_id[MAX_REGS_PER_FLOW];
};
struct mlx4_ib_wq {
@@ -172,6 +219,10 @@
unsigned tail;
};
+enum {
+ MLX4_IB_QP_CREATE_ROCE_V2_GSI = IB_QP_CREATE_RESERVED_START
+};
+
enum mlx4_ib_qp_flags {
MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO,
MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
@@ -179,6 +230,10 @@
MLX4_IB_QP_CAP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND,
MLX4_IB_QP_CAP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV,
MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP,
+ MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO,
+
+ /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */
+ MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI,
MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30,
MLX4_IB_SRIOV_SQP = 1 << 31,
};
@@ -195,6 +250,12 @@
MLX4_IB_MMAP_BLUE_FLAME_PAGE = 1,
MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES = 2,
MLX4_IB_MMAP_GET_HW_CLOCK = 3,
+
+ /* Use EXP mmap commands until they are pushed upstream */
+ MLX4_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_CPU_NUMA = 0xFC,
+ MLX4_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_DEV_NUMA = 0xFD,
+ MLX4_IB_EXP_MMAP_EXT_UAR_PAGE = 0xFE,
+ MLX4_IB_EXP_MMAP_EXT_BLUE_FLAME_PAGE = 0xFF,
};
enum mlx4_ib_qp_type {
@@ -296,6 +357,9 @@
u32 tss_child_count;
u32 rss_child_count;
u32 qpg_tss_mask_sz;
+ struct mlx4_ib_dev *dev;
+ unsigned long flags;
+ struct kref refcount;
};
struct mlx4_ib_qp {
@@ -306,7 +370,7 @@
struct mlx4_db db;
struct mlx4_ib_wq rq;
- u32 doorbell_qpn;
+ __be32 doorbell_qpn;
__be32 sq_signal_bits;
unsigned sq_next_wqe;
int sq_max_wqes_per_wr;
@@ -334,15 +398,16 @@
struct mlx4_ib_buf *sqp_proxy_rcv;
struct mlx4_roce_smac_vlan_info pri;
struct mlx4_roce_smac_vlan_info alt;
- struct list_head rules_list;
- u64 reg_id;
int max_inline_data;
struct mlx4_bf bf;
-
+ struct mlx4_ib_hw_flow flow;
/*
* Experimental data
*/
int max_inlr_data;
+ struct list_head qps_list;
+ struct list_head cq_recv_list;
+ struct list_head cq_send_list;
};
struct mlx4_ib_srq {
@@ -385,14 +450,9 @@
enum mlx4_guid_alias_rec_status {
MLX4_GUID_INFO_STATUS_IDLE,
MLX4_GUID_INFO_STATUS_SET,
- MLX4_GUID_INFO_STATUS_PENDING,
};
-enum mlx4_guid_alias_rec_ownership {
- MLX4_GUID_DRIVER_ASSIGN,
- MLX4_GUID_SYSADMIN_ASSIGN,
- MLX4_GUID_NONE_ASSIGN, /*init state of each record*/
-};
+#define GUID_STATE_NEED_PORT_INIT 0x01
enum mlx4_guid_alias_rec_method {
MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET,
@@ -403,8 +463,8 @@
u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC];
ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/
enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/
- u8 method; /*set or delete*/
- enum mlx4_guid_alias_rec_ownership ownership; /*indicates who assign that alias_guid record*/
+ unsigned int guids_retry_schedule[NUM_ALIAS_GUID_IN_REC];
+ u64 time_to_run;
};
struct mlx4_sriov_alias_guid_port_rec_det {
@@ -412,6 +472,7 @@
struct workqueue_struct *wq;
struct delayed_work alias_guid_work;
u8 port;
+ u32 state_flags;
struct mlx4_sriov_alias_guid *parent;
struct list_head cb_list;
};
@@ -503,13 +564,27 @@
struct idr pv_id_table;
};
+struct gid_cache_context {
+ int real_index;
+ int refcount;
+};
+
+struct gid_entry {
+ union ib_gid gid;
+ enum ib_gid_type gid_type;
+ struct gid_cache_context *ctx;
+};
+
+struct mlx4_port_gid_table {
+ struct gid_entry gids[MLX4_MAX_PORT_GIDS];
+};
+
struct mlx4_ib_iboe {
- spinlock_t lock;
+ rwlock_t iboe_lock; /* guard from concurrent access to data in this struct */
struct net_device *netdevs[MLX4_MAX_PORTS];
- struct net_device *masters[MLX4_MAX_PORTS];
- struct notifier_block nb;
- struct notifier_block nb_inet;
- union ib_gid gid_table[MLX4_MAX_PORTS][128];
+ atomic64_t mac[MLX4_MAX_PORTS];
+ struct notifier_block nb;
+ struct mlx4_port_gid_table gid_table[MLX4_MAX_PORTS];
};
struct pkey_mgt {
@@ -553,6 +628,12 @@
int status;
};
+struct mlx4_ib_eq_table_entry {
+ int vector;
+ int port;
+ struct mlx4_ib_dev *ibdev;
+};
+
struct mlx4_ib_dev {
struct ib_device ib_dev;
struct mlx4_dev *dev;
@@ -564,14 +645,14 @@
struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2];
struct ib_ah *sm_ah[MLX4_MAX_PORTS];
spinlock_t sm_lock;
+ atomic64_t sl2vl[MLX4_MAX_PORTS];
struct mlx4_ib_sriov sriov;
struct mutex cap_mask_mutex;
bool ib_active;
struct mlx4_ib_iboe iboe;
struct mlx4_ib_counter counters[MLX4_MAX_PORTS];
- int *eq_table;
- int eq_added;
+ struct mlx4_ib_eq_table_entry *eq_table;
struct kobject *iov_parent;
struct kobject *ports_parent;
struct kobject *dev_ports_parent[MLX4_MFUNC_MAX];
@@ -580,12 +661,21 @@
unsigned long *ib_uc_qpns_bitmap;
int steer_qpn_count;
int steer_qpn_base;
+ int steering_support;
+ struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS];
+ /* lock when destroying qp1_proxy and getting netdev events */
+ struct mutex qp1_proxy_lock[MLX4_MAX_PORTS];
+ u8 bond_next_port;
+ /* protect resources needed as part of reset flow */
+ spinlock_t reset_flow_resource_lock;
+ struct list_head qp_list;
};
struct ib_event_work {
struct work_struct work;
struct mlx4_ib_dev *ib_dev;
struct mlx4_eqe ib_eqe;
+ int port;
};
struct mlx4_ib_qp_tunnel_init_attr {
@@ -675,6 +765,13 @@
return container_of(ibah, struct mlx4_ib_ah, ibah);
}
+static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev)
+{
+ dev->bond_next_port = (dev->bond_next_port + 1) % dev->num_ports;
+
+ return dev->bond_next_port + 1;
+}
+
int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev);
void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev);
@@ -774,12 +871,12 @@
int mlx4_ib_query_if_stat(struct mlx4_ib_dev *dev, u32 counter_index,
union mlx4_counter *counter, u8 clear);
-static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
+static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
{
u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3;
if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET)
- return 1;
+ return true;
return !!(ah->av.ib.g_slid & 0x80);
}
@@ -815,13 +912,16 @@
int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
enum ib_qp_type qpt, struct ib_wc *wc,
struct ib_grh *grh, struct ib_mad *mad);
+
int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
- u32 qkey, struct ib_ah_attr *attr, u8 *s_mac, struct ib_mad *mad);
+ u32 qkey, struct ib_ah_attr *attr, u8 *s_mac,
+ u16 vlan_id, struct ib_mad *mad);
+
__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx);
int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
- struct ib_mad *mad, int is_eth);
+ struct ib_mad *mad);
int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
struct ib_mad *mad);
@@ -848,6 +948,8 @@
void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
struct attribute *attr);
ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index);
+void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave,
+ int port, int slave_init);
int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ;
@@ -861,5 +963,16 @@
int is_attach);
int mlx4_ib_query_device(struct ib_device *ibdev,
struct ib_device_attr *props);
+int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
+ u64 start, u64 length, u64 virt_addr,
+ int mr_access_flags, struct ib_pd *pd,
+ struct ib_udata *udata);
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+ u8 port_num, int index);
+
+void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev,
+ int port);
+
+void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port);
#endif /* MLX4_IB_H */
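
As a side note on mlx4_ib_bond_next_port() added above: it keeps a 0-based cursor in dev->bond_next_port and returns a 1-based port number. A small user-space sketch of the same arithmetic (a two-port device is assumed for the example):

#include <stdio.h>

struct bond_state {
	unsigned char next;	/* mirrors dev->bond_next_port (0-based cursor) */
	int num_ports;
};

static unsigned char bond_next_port(struct bond_state *s)
{
	s->next = (s->next + 1) % s->num_ports;

	return s->next + 1;	/* IB port numbers are 1-based */
}

int main(void)
{
	struct bond_state s = { 0, 2 };
	int i;

	for (i = 0; i < 4; i++)
		printf("%u ", bond_next_port(&s));	/* prints "2 1 2 1" */
	printf("\n");
	return 0;
}
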
Index: sys/ofed/drivers/infiniband/hw/mlx4/mr.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/mr.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/mr.c
@@ -35,7 +35,13 @@
#include <linux/module.h>
#include <linux/sched.h>
+#include <asm/atomic64.h>
+
#include "mlx4_ib.h"
+#include "mlx4_exp.h"
+
+atomic64_t shared_mr_count = ATOMIC_INIT(0);
+static void free_smr_info(struct mlx4_ib_mr *mr);
static u32 convert_access(int acc)
{
@@ -43,7 +49,7 @@
(acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) |
(acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) |
(acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) |
- (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) |
+ (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) |
MLX4_PERM_LOCAL_READ;
}
/* No suuport for Shared MR feature */
@@ -70,9 +76,8 @@
static int shared_mr_mmap(struct file *filep, struct vm_area_struct *vma)
{
- struct proc_dir_entry *pde = PDE(filep->f_path.dentry->d_inode);
struct mlx4_shared_mr_info *smr_info =
- (struct mlx4_shared_mr_info *)pde->data;
+ (struct mlx4_shared_mr_info *)PDE_DATA(filep->f_path.dentry->d_inode);
/* Prevent any mapping not on start of area */
if (vma->vm_pgoff != 0)
@@ -102,6 +107,15 @@
}
#endif
+static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type)
+{
+ switch (type) {
+ case IB_MW_TYPE_1: return MLX4_MW_TYPE_1;
+ case IB_MW_TYPE_2: return MLX4_MW_TYPE_2;
+ default: return -1;
+ }
+}
+
struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
struct mlx4_ib_mr *mr;
@@ -430,18 +444,20 @@
}
-/* No suuport for Shared MR */
+/* No support for Shared MR */
#if 0
static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id)
{
struct proc_dir_entry *mr_proc_entry;
mode_t mode = S_IFREG;
- char name_buff[16];
+ char name_buff[128];
+ kuid_t uid;
+ kgid_t gid;
mode |= convert_shared_access(access_flags);
sprintf(name_buff, "%X", mr_id);
- mr->smr_info = kmalloc(sizeof(struct mlx4_shared_mr_info), GFP_KERNEL);
+ mr->smr_info = kzalloc(sizeof(struct mlx4_shared_mr_info), GFP_KERNEL);
mr->smr_info->mr_id = mr_id;
mr->smr_info->umem = mr->umem;
@@ -456,8 +472,27 @@
return -ENODEV;
}
- current_uid_gid(&(mr_proc_entry->uid), &(mr_proc_entry->gid));
- mr_proc_entry->size = mr->umem->length;
+ current_uid_gid(&uid, &gid);
+ proc_set_user(mr_proc_entry, uid, gid);
+ proc_set_size(mr_proc_entry, mr->umem->length);
+
+ /* now create an extra entry with a unique suffix counter */
+ mr->smr_info->counter = atomic64_inc_return(&shared_mr_count);
+ sprintf(name_buff, "%X.%lld", mr_id, mr->smr_info->counter);
+ mr_proc_entry = proc_create_data(name_buff, mode,
+ mlx4_mrs_dir_entry,
+ &shared_mr_proc_ops,
+ mr->smr_info);
+ if (!mr_proc_entry) {
+ pr_err("prepare_shared_mr failed via proc for %s\n", name_buff);
+ free_smr_info(mr);
+ return -ENODEV;
+ }
+
+ mr->smr_info->counter_used = 1;
+ proc_set_user(mr_proc_entry, uid, gid);
+ proc_set_size(mr_proc_entry, mr->umem->length);
+
return 0;
}
@@ -474,28 +509,13 @@
IB_ACCESS_SHARED_MR_OTHER_WRITE));
}
+#endif
static void free_smr_info(struct mlx4_ib_mr *mr)
{
- /* When master/parent shared mr is dereged there is
- no ability to share this mr any more - its mr_id will be
- returned to the kernel as part of ib_uverbs_dereg_mr
- and may be allocated again as part of other reg_mr.
- */
- char name_buff[16];
-
- sprintf(name_buff, "%X", mr->smr_info->mr_id);
- /* Remove proc entry is checking internally that no operation
- was strated on that proc fs file and if in the middle
- current process will wait till end of operation.
- That's why no sync mechanism is needed when we release
- below the shared umem.
- */
- remove_proc_entry(name_buff, mlx4_mrs_dir_entry);
kfree(mr->smr_info);
mr->smr_info = NULL;
}
-#endif
static void mlx4_invalidate_umem(void *invalidation_cookie,
struct ib_umem *umem,
@@ -560,13 +580,14 @@
goto err_mr;
mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
-/* No suuport for Shared MR */
+/* No support for Shared MR */
#if 0
+ atomic_set(&mr->invalidated, 0);
/* Check whether MR should be shared */
if (is_shared_mr(access_flags)) {
/* start address and length must be aligned to page size in order
to map a full page and preventing leakage of data */
- if (mr->umem->offset || (length & ~PAGE_MASK)) {
+ if (ib_umem_offset(mr->umem) || (length & ~PAGE_MASK)) {
err = -EINVAL;
goto err_mr;
}
@@ -591,11 +612,10 @@
mlx4_invalidate_umem, mr);
}
- atomic_set(&mr->invalidated, 0);
return &mr->ibmr;
err_smr:
-/* No suuport for Shared MR */
+/* No support for Shared MR */
#if 0
if (mr->smr_info)
free_smr_info(mr);
@@ -612,13 +632,196 @@
return ERR_PTR(err);
}
+int mlx4_ib_exp_rereg_user_mr(struct ib_mr *mr, int flags,
+ u64 start, u64 length, u64 virt_addr,
+ int mr_access_flags, struct ib_pd *pd)
+{
+ struct mlx4_ib_dev *dev = to_mdev(mr->device);
+ struct mlx4_ib_mr *mmr = to_mmr(mr);
+ struct mlx4_mpt_entry *mpt_entry;
+ struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
+ int err;
+
+ /* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs, */
+ /* we assume that the calls can't run concurrently. Otherwise, a */
+ /* race exists. */
+ err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
+
+ if (err)
+ return err;
+
+ if (flags & IB_EXP_MR_REREG_PD) {
+ err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry,
+ to_mpd(pd)->pdn);
+
+ if (err)
+ goto release_mpt_entry;
+ }
+
+ if (flags & IB_EXP_MR_REREG_ACCESS) {
+ err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry,
+ convert_access(mr_access_flags));
+
+ if (err)
+ goto release_mpt_entry;
+ }
+
+ if (flags & IB_EXP_MR_REREG_TRANS) {
+ int shift;
+ int err;
+ int n;
+
+ /* Peer memory isn't supported */
+ if (NULL != mmr->umem->ib_peer_mem) {
+ err = -ENOTSUPP;
+ goto release_mpt_entry;
+ }
+
+ /* Per the IB spec, when using rereg with translation, a shared MR
+ * becomes a non-shared MR
+ */
+ if (mmr->smr_info) {
+ free_smr_info(mmr);
+ mmr->smr_info = NULL;
+ }
+
+ mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+ ib_umem_release(mmr->umem);
+ mmr->umem = ib_umem_get_ex(mr->uobject->context, start, length,
+ mr_access_flags |
+ IB_ACCESS_LOCAL_WRITE,
+ 0, 1);
+ if (IS_ERR(mmr->umem)) {
+ err = PTR_ERR(mmr->umem);
+ mmr->umem = NULL;
+ goto release_mpt_entry;
+ }
+ n = ib_umem_page_count(mmr->umem);
+ shift = mlx4_ib_umem_calc_optimal_mtt_size(mmr->umem, start,
+ &n);
+
+ mmr->mmr.iova = virt_addr;
+ mmr->mmr.size = length;
+ err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
+ virt_addr, length, n, shift,
+ *pmpt_entry);
+ if (err) {
+ ib_umem_release(mmr->umem);
+ goto release_mpt_entry;
+ }
+
+ err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem);
+ if (err) {
+ mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+ ib_umem_release(mmr->umem);
+ goto release_mpt_entry;
+ }
+ }
+
+
+ /* If we couldn't transfer the MR to the HCA, just remember to */
+ /* return a failure, but dereg_mr should free resources. */
+ err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
+
+release_mpt_entry:
+ mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
+
+ return err;
+}
+
+int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
+ u64 start, u64 length, u64 virt_addr,
+ int mr_access_flags, struct ib_pd *pd,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(mr->device);
+ struct mlx4_ib_mr *mmr = to_mmr(mr);
+ struct mlx4_mpt_entry *mpt_entry;
+ struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
+ int err;
+
+ /* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
+ * we assume that the calls can't run concurrently. Otherwise, a
+ * race exists.
+ */
+ err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
+
+ if (err)
+ return err;
+
+ if (flags & IB_MR_REREG_PD) {
+ err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry,
+ to_mpd(pd)->pdn);
+
+ if (err)
+ goto release_mpt_entry;
+ }
+
+ if (flags & IB_MR_REREG_ACCESS) {
+ err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry,
+ convert_access(mr_access_flags));
+
+ if (err)
+ goto release_mpt_entry;
+ }
+
+ if (flags & IB_MR_REREG_TRANS) {
+ int shift;
+ int n;
+
+ mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+ ib_umem_release(mmr->umem);
+ mmr->umem = ib_umem_get(mr->uobject->context, start, length,
+ mr_access_flags |
+ IB_ACCESS_LOCAL_WRITE,
+ 0);
+ if (IS_ERR(mmr->umem)) {
+ err = PTR_ERR(mmr->umem);
+ /* Prevent mlx4_ib_dereg_mr from freeing an invalid pointer */
+ mmr->umem = NULL;
+ goto release_mpt_entry;
+ }
+ n = ib_umem_page_count(mmr->umem);
+ shift = ilog2(mmr->umem->page_size);
+
+ err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
+ virt_addr, length, n, shift,
+ *pmpt_entry);
+ if (err) {
+ ib_umem_release(mmr->umem);
+ goto release_mpt_entry;
+ }
+ mmr->mmr.iova = virt_addr;
+ mmr->mmr.size = length;
+
+ err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem);
+ if (err) {
+ mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+ ib_umem_release(mmr->umem);
+ goto release_mpt_entry;
+ }
+ }
+
+ /* If we couldn't transfer the MR to the HCA, just remember to
+ * return a failure. But dereg_mr will free the resources.
+ */
+ err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
+ if (!err && flags & IB_MR_REREG_ACCESS)
+ mmr->mmr.access = mr_access_flags;
+
+release_mpt_entry:
+ mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
+
+ return err;
+}
+
int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
{
struct mlx4_ib_mr *mr = to_mmr(ibmr);
struct ib_umem *umem = mr->umem;
int ret;
-/* No suuport for Shared MR */
+/* No support for Shared MR */
#if 0
if (mr->smr_info)
free_smr_info(mr);
@@ -659,7 +862,8 @@
if (!mw)
return ERR_PTR(-ENOMEM);
- err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, (enum mlx4_mw_type)type, &mw->mmw);
+ err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn,
+ to_mlx4_type(type), &mw->mmw);
if (err)
goto err_free;
@@ -763,7 +967,8 @@
if (!mfrpl->ibfrpl.page_list)
goto err_free;
- mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev,
+ mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->persist->
+ pdev->dev,
size, &mfrpl->map,
GFP_KERNEL);
if (!mfrpl->mapped_page_list)
@@ -785,7 +990,8 @@
struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
int size = page_list->max_page_list_len * sizeof (u64);
- dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list,
+ dma_free_coherent(&dev->dev->persist->pdev->dev, size,
+ mfrpl->mapped_page_list,
mfrpl->map);
kfree(mfrpl->ibfrpl.page_list);
kfree(mfrpl);
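
For orientation, the control flow of mlx4_ib_rereg_user_mr() added above follows a get/modify/put pattern around the MPT entry: fetch the entry, apply only the requested changes, write it back, and always release it. A reduced sketch of that shape (all helpers here are illustrative stubs, not driver functions):

enum { REREG_PD = 1 << 0, REREG_ACCESS = 1 << 1, REREG_TRANS = 1 << 2 };

static int get_entry(void)           { return 0; }	/* stands in for mlx4_mr_hw_get_mpt()   */
static int change_pd(void)           { return 0; }	/* stands in for mlx4_mr_hw_change_pd() */
static int change_access(void)       { return 0; }
static int rebuild_translation(void) { return 0; }	/* re-pin the memory and rewrite the MTT */
static int write_entry(void)         { return 0; }	/* stands in for mlx4_mr_hw_write_mpt() */
static void put_entry(void)          { }		/* stands in for mlx4_mr_hw_put_mpt()   */

static int rereg_sketch(int flags)
{
	int err = get_entry();

	if (err)
		return err;
	if (flags & REREG_PD)
		err = change_pd();
	if (!err && (flags & REREG_ACCESS))
		err = change_access();
	if (!err && (flags & REREG_TRANS))
		err = rebuild_translation();
	if (!err)
		err = write_entry();

	put_entry();	/* always runs, mirroring the release_mpt_entry label */
	return err;
}
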
Index: sys/ofed/drivers/infiniband/hw/mlx4/qp.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/qp.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/qp.c
@@ -35,12 +35,15 @@
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/bitops.h>
+#include <linux/rcupdate.h>
+#include <linux/etherdevice.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_pack.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_mad.h>
+#include <linux/mlx4/cmd.h>
#include <linux/mlx4/qp.h>
#include <linux/mlx4/driver.h>
#include <linux/io.h>
@@ -48,7 +51,14 @@
#include "mlx4_ib.h"
#include "user.h"
-#define asm __asm
+#define TRAFFIC_CLASS_MASK(mib_dev, port) \
+ ((rdma_port_get_link_layer(&(mib_dev)->ib_dev, (port)) == \
+ IB_LINK_LAYER_ETHERNET) ? 3 : 0)
+
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq,
+ struct mlx4_ib_cq *recv_cq);
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
+ struct mlx4_ib_cq *recv_cq);
enum {
MLX4_IB_ACK_REQ_FREQ = 8,
@@ -83,6 +93,7 @@
u32 send_psn;
struct ib_ud_header ud_header;
u8 header_buf[MLX4_IB_UD_HEADER_SIZE];
+ struct ib_qp *roce_v2_gsi;
};
enum {
@@ -95,6 +106,10 @@
MLX4_RAW_QP_MSGMAX = 31,
};
+#ifndef ETH_ALEN
+#define ETH_ALEN 6
+#endif
+
static const __be32 mlx4_ib_opcode[] = {
[IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND),
[IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO),
@@ -109,22 +124,49 @@
[IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR),
[IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
- [IB_WR_BIND_MW] = cpu_to_be32(
- MLX4_OPCODE_BIND_MW),
+ [IB_WR_BIND_MW] = cpu_to_be32(MLX4_OPCODE_BIND_MW),
};
#ifndef wc_wmb
#if defined(__i386__)
- #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+ #define wc_wmb() __asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
#elif defined(__x86_64__)
- #define wc_wmb() asm volatile("sfence" ::: "memory")
+ #define wc_wmb() __asm volatile("sfence" ::: "memory")
#elif defined(__ia64__)
- #define wc_wmb() asm volatile("fwb" ::: "memory")
+ #define wc_wmb() __asm volatile("fwb" ::: "memory")
#else
#define wc_wmb() wmb()
#endif
#endif
+static inline void copy_lso_header(__be32 *dst, void *src, int hdr_sz,
+ __be32 owner_bit)
+{
+/* In WQE_FORMAT = 1 we need to split segments larger than 64 bytes;
+ * in this case: 64 - sizeof(lso->mss_hdr_size) = 60.
+ */
+#ifdef CONFIG_INFINIBAND_WQE_FORMAT
+ if (likely(hdr_sz > 60)) {
+ memcpy(dst, src, 60);
+ /* write the rest of the header, leaving 4 bytes
+ * for the inline header
+ */
+ memcpy((dst + 16), src + 60,
+ hdr_sz - 60);
+ /* make sure we write the rest of the segment before
+ * setting the ownership bit for HW
+ */
+ wmb();
+ *(dst + 15) =
+ cpu_to_be32((MLX4_INLINE_SEG) |
+ (hdr_sz - 60)) |
+ owner_bit;
+ } else
+#endif
+ memcpy(dst, src, hdr_sz);
+}
+
static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
{
return container_of(mqp, struct mlx4_ib_sqp, qp);
@@ -161,7 +203,10 @@
}
}
}
- return proxy_sqp;
+ if (proxy_sqp)
+ return 1;
+
+ return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP);
}
/* used for INIT/CLOSE port logic */
@@ -203,6 +248,7 @@
return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
}
+#ifndef CONFIG_INFINIBAND_WQE_FORMAT
/*
* Stamp a SQ WQE so that it is invalid if prefetched by marking the
* first four bytes of every 64 byte chunk with
@@ -241,6 +287,7 @@
}
}
}
+#endif
static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
{
@@ -248,6 +295,9 @@
struct mlx4_wqe_inline_seg *inl;
void *wqe;
int s;
+ int inl_size;
+ __be32 owner_bit = n & qp->sq.wqe_cnt ?
+ cpu_to_be32(MLX4_WQE_CTRL_OWN) : 0;
ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
s = sizeof(struct mlx4_wqe_ctrl_seg);
@@ -262,8 +312,24 @@
/* Pad the remainder of the WQE with an inline data segment. */
if (size > s) {
- inl = wqe + s;
- inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+ inl_size = size - s;
+
+ if (s < 64) {
+ inl = wqe + s;
+ inl->byte_count = SET_BYTE_COUNT((1 << 31) | (64 - s -
+ sizeof(*inl)));
+ inl_size -= 64 - s;
+ }
+
+ inl = wqe + 64;
+
+ while (inl_size > 0) {
+ inl->byte_count = SET_BYTE_COUNT((1 << 31) | (64 -
+ sizeof(*inl)));
+ inl_size -= 64;
+ inl += 64 / sizeof(*inl);
+ }
+
}
ctrl->srcrb_flags = 0;
ctrl->fence_size = size / 16;
@@ -274,15 +340,18 @@
wmb();
ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
- (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
-
+ owner_bit;
+ /* HA: check whether we need to take care of stamp_send_wqe */
+#ifndef CONFIG_INFINIBAND_WQE_FORMAT
stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
+#endif
}
/* Post NOP WQE to prevent wrap-around in the middle of WR */
static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
{
unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+
if (unlikely(s < qp->sq_max_wqes_per_wr)) {
post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
ind += s;
@@ -364,7 +433,7 @@
sizeof (struct mlx4_wqe_raddr_seg);
case MLX4_IB_QPT_RC:
return sizeof (struct mlx4_wqe_ctrl_seg) +
- sizeof (struct mlx4_wqe_masked_atomic_seg) +
+ sizeof(struct mlx4_wqe_masked_atomic_seg) +
sizeof (struct mlx4_wqe_raddr_seg);
case MLX4_IB_QPT_SMI:
case MLX4_IB_QPT_GSI:
@@ -391,9 +460,6 @@
return -EINVAL;
if (!has_rq) {
- if (cap->max_recv_wr)
- return -EINVAL;
-
qp->rq.wqe_cnt = qp->rq.max_gs = 0;
} else {
/* HW requires >= 1 RQ entry with >= 1 gather entry */
@@ -495,7 +561,9 @@
* We need to leave 2 KB + 1 WR of headroom in the SQ to
* allow HW to prefetch.
*/
+#ifndef CONFIG_INFINIBAND_WQE_FORMAT
qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+#endif
qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
qp->sq_max_wqes_per_wr +
qp->sq_spare_wqes);
@@ -608,13 +676,32 @@
kfree(qp->sqp_proxy_rcv);
}
+static int qp_has_rq(struct ib_qp_init_attr *attr)
+{
+ if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
+ return 0;
+
+ return !attr->srq;
+}
+
+static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn)
+{
+ int i;
+ for (i = 0; i < dev->caps.num_ports; i++) {
+ if (qpn == dev->caps.qp0_proxy[i])
+ return !!dev->caps.qp0_qkey[i];
+ }
+ return 0;
+}
+
static int init_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp,
struct ib_qp_init_attr *attr, int *qpn)
{
struct mlx4_ib_qpg_data *qpg_data;
int tss_num, rss_num;
int tss_align_num, rss_align_num;
- int tss_base, rss_base = 0;
+ int tss_base;
+ int rss_base = 0;
int err;
/* Parent is part of the TSS range (in SW TSS ARP is sent via parent) */
@@ -633,15 +720,18 @@
attr->parent_attrib.rss_child_count = rss_align_num;
}
- qpg_data = kzalloc(sizeof *qpg_data, GFP_KERNEL);
+ qpg_data = kzalloc(sizeof(*qpg_data), GFP_KERNEL);
if (!qpg_data)
return -ENOMEM;
if(pqp->flags & MLX4_IB_QP_NETIF)
err = mlx4_ib_steer_qp_alloc(dev, tss_align_num, &tss_base);
else
- err = mlx4_qp_reserve_range(dev->dev, tss_align_num,
- tss_align_num, &tss_base, MLX4_RESERVE_BF_QP);
+ err = mlx4_qp_reserve_range(
+ dev->dev, tss_align_num,
+ tss_align_num, &tss_base, MLX4_RESERVE_ETH_BF_QP |
+ ((attr->qp_type == IB_QPT_RAW_PACKET) ?
+ MLX4_RESERVE_A0_QP : 0));
if (err)
goto err1;
@@ -677,8 +767,12 @@
qpg_data->qpg_tss_mask_sz = ilog2(tss_align_num);
qpg_data->tss_qpn_base = tss_base;
qpg_data->rss_qpn_base = rss_base;
+ qpg_data->dev = dev;
+ if (pqp->flags & MLX4_IB_QP_NETIF)
+ qpg_data->flags |= MLX4_IB_QP_NETIF;
pqp->qpg_data = qpg_data;
+ kref_init(&pqp->qpg_data->refcount);
*qpn = tss_base;
return 0;
@@ -701,30 +795,36 @@
return err;
}
-static void free_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp)
+static void qpg_release(struct kref *ref)
{
- struct mlx4_ib_qpg_data *qpg_data = pqp->qpg_data;
+ struct mlx4_ib_qpg_data *qpg_data;
int align_num;
+ qpg_data = container_of(ref, struct mlx4_ib_qpg_data, refcount);
if (qpg_data->tss_child_count > 1)
kfree(qpg_data->tss_bitmap);
align_num = roundup_pow_of_two(1 + qpg_data->tss_child_count);
- if(pqp->flags & MLX4_IB_QP_NETIF)
- mlx4_ib_steer_qp_free(dev, qpg_data->tss_qpn_base, align_num);
+ if (qpg_data->flags & MLX4_IB_QP_NETIF)
+ mlx4_ib_steer_qp_free(qpg_data->dev, qpg_data->tss_qpn_base, align_num);
else
- mlx4_qp_release_range(dev->dev, qpg_data->tss_qpn_base, align_num);
+ mlx4_qp_release_range(qpg_data->dev->dev, qpg_data->tss_qpn_base, align_num);
if (qpg_data->rss_child_count > 1) {
kfree(qpg_data->rss_bitmap);
align_num = roundup_pow_of_two(qpg_data->rss_child_count);
- mlx4_qp_release_range(dev->dev, qpg_data->rss_qpn_base,
- align_num);
+ mlx4_qp_release_range(qpg_data->dev->dev, qpg_data->rss_qpn_base,
+ align_num);
}
kfree(qpg_data);
}
+static void free_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp)
+{
+ kref_put(&pqp->qpg_data->refcount, qpg_release);
+}
+
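The QP-group bookkeeping above now ties the qpg_data lifetime to a kref: the parent holds the initial reference, each child QP takes one on allocation and drops it on free, and qpg_release() runs only for the last put. A stand-alone sketch of that pattern with a plain counter (illustration only, not the kernel kref API):

#include <stdio.h>
#include <stdlib.h>

struct shared_block {
	int refcount;
	/* shared QP-group state would live here */
};

static struct shared_block *block_create(void)	/* parent: like kref_init() */
{
	struct shared_block *b = calloc(1, sizeof(*b));

	if (b)
		b->refcount = 1;
	return b;
}

static void block_get(struct shared_block *b)	/* child attach: like kref_get() */
{
	b->refcount++;
}

static void block_put(struct shared_block *b)	/* detach: like kref_put() */
{
	if (--b->refcount == 0) {
		printf("last reference dropped, releasing\n");	/* like qpg_release() */
		free(b);
	}
}

int main(void)
{
	struct shared_block *b = block_create();	/* parent reference */

	block_get(b);	/* one child attaches */
	block_put(b);	/* child detaches */
	block_put(b);	/* parent detaches; the block is released here */
	return 0;
}
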
static int alloc_qpg_qpn(struct ib_qp_init_attr *init_attr,
struct mlx4_ib_qp *pqp, int *qpn)
{
@@ -763,6 +863,7 @@
}
pqp->qpg_data = qpg_data;
+ kref_get(&qpg_data->refcount);
*qpn = idx;
return 0;
@@ -777,10 +878,12 @@
/* Do range check */
qpn -= qpg_data->tss_qpn_base;
set_bit(qpn, qpg_data->tss_bitmap);
+ kref_put(&qpg_data->refcount, qpg_release);
break;
case IB_QPG_CHILD_RX:
qpn -= qpg_data->rss_qpn_base;
set_bit(qpn, qpg_data->rss_bitmap);
+ kref_put(&qpg_data->refcount, qpg_release);
break;
default:
/* error */
@@ -801,12 +904,16 @@
* VLAN insertion. */
if (attr->qp_type == IB_QPT_RAW_PACKET) {
err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn,
- MLX4_RESERVE_BF_QP);
+ (attr->cap.max_send_wr ?
+ MLX4_RESERVE_ETH_BF_QP : 0) |
+ (attr->cap.max_recv_wr ?
+ MLX4_RESERVE_A0_QP : 0));
} else {
- if(qp->flags & MLX4_IB_QP_NETIF)
+ if (qp->flags & MLX4_IB_QP_NETIF)
err = mlx4_ib_steer_qp_alloc(dev, 1, qpn);
else
- err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, 0);
+ err = mlx4_qp_reserve_range(dev->dev, 1, 1,
+ qpn, 0);
}
break;
case IB_QPG_PARENT:
@@ -861,15 +968,34 @@
free_qpn_common(dev, qp, qp->qpg_type, qp->mqp.qpn);
}
+static struct mlx4_uar *find_user_uar(struct mlx4_ib_ucontext *uctx, unsigned long uar_virt_add)
+{
+ struct mlx4_ib_user_uar *uar;
+
+ mutex_lock(&uctx->user_uar_mutex);
+ list_for_each_entry(uar, &uctx->user_uar_list, list)
+ if (uar->hw_bar_info[HW_BAR_DB].vma &&
+ uar->hw_bar_info[HW_BAR_DB].vma->vm_start == uar_virt_add) {
+ mutex_unlock(&uctx->user_uar_mutex);
+ return &uar->uar;
+ }
+ mutex_unlock(&uctx->user_uar_mutex);
+
+ return NULL;
+}
+
static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
struct ib_qp_init_attr *init_attr,
- struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp)
+ struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp,
+ gfp_t gfp)
{
int qpn;
int err;
struct mlx4_ib_sqp *sqp;
struct mlx4_ib_qp *qp;
enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
+ struct mlx4_ib_cq *mcq;
+ unsigned long flags;
/* When tunneling special qps, we use a plain UD qp */
if (sqpn) {
@@ -878,10 +1004,13 @@
!(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) {
if (init_attr->qp_type == IB_QPT_GSI)
qp_type = MLX4_IB_QPT_PROXY_GSI;
- else if (mlx4_is_master(dev->dev))
- qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER;
- else
- qp_type = MLX4_IB_QPT_PROXY_SMI;
+ else {
+ if (mlx4_is_master(dev->dev) ||
+ qp0_enabled_vf(dev->dev, sqpn))
+ qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER;
+ else
+ qp_type = MLX4_IB_QPT_PROXY_SMI;
+ }
}
qpn = sqpn;
/* add extra sg entry for tunneling */
@@ -896,7 +1025,9 @@
return -EINVAL;
if (tnl_init->proxy_qp_type == IB_QPT_GSI)
qp_type = MLX4_IB_QPT_TUN_GSI;
- else if (tnl_init->slave == mlx4_master_func_num(dev->dev))
+ else if (tnl_init->slave == mlx4_master_func_num(dev->dev) ||
+ mlx4_vf_smi_enabled(dev->dev, tnl_init->slave,
+ tnl_init->port))
qp_type = MLX4_IB_QPT_TUN_SMI_OWNER;
else
qp_type = MLX4_IB_QPT_TUN_SMI;
@@ -911,16 +1042,19 @@
if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI ||
(qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER |
MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) {
- sqp = kzalloc(sizeof (struct mlx4_ib_sqp), GFP_KERNEL);
+ sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp);
if (!sqp)
return -ENOMEM;
qp = &sqp->qp;
- qp->pri.vid = qp->alt.vid = 0xFFFF;
+ qp->pri.vid = 0xFFFF;
+ qp->alt.vid = 0xFFFF;
+ sqp->roce_v2_gsi = NULL;
} else {
- qp = kzalloc(sizeof (struct mlx4_ib_qp), GFP_KERNEL);
+ qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp);
if (!qp)
return -ENOMEM;
- qp->pri.vid = qp->alt.vid = 0xFFFF;
+ qp->pri.vid = 0xFFFF;
+ qp->alt.vid = 0xFFFF;
}
} else
qp = *caller_qp;
@@ -949,22 +1083,28 @@
spin_lock_init(&qp->rq.lock);
INIT_LIST_HEAD(&qp->gid_list);
INIT_LIST_HEAD(&qp->steering_rules);
- INIT_LIST_HEAD(&qp->rules_list);
qp->state = IB_QPS_RESET;
if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
- err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, mlx4_ib_qp_has_rq(init_attr), qp);
+ err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp);
if (err)
goto err;
if (pd->uobject) {
- struct mlx4_ib_create_qp ucmd;
+ struct mlx4_exp_ib_create_qp ucmd;
+ int ucmd_size;
int shift;
int n;
- if (!udata || ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
+ if (udata->src == IB_UDATA_EXP_CMD)
+ ucmd_size = sizeof(ucmd);
+ else
+ ucmd_size = sizeof(ucmd.base);
+
+ memset(&ucmd, 0, sizeof(ucmd));
+ if (ib_copy_from_udata(&ucmd, udata, ucmd_size)) {
err = -EFAULT;
goto err;
}
@@ -978,13 +1118,13 @@
if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV)
qp->flags |= MLX4_IB_QP_CAP_MANAGED_RECV;
- qp->sq_no_prefetch = ucmd.sq_no_prefetch;
+ qp->sq_no_prefetch = ucmd.base.sq_no_prefetch;
- err = set_user_sq_size(dev, qp, &ucmd);
+ err = set_user_sq_size(dev, qp, &ucmd.base);
if (err)
goto err;
- qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+ qp->umem = ib_umem_get(pd->uobject->context, ucmd.base.buf_addr,
qp->buf_size, 0, 0);
if (IS_ERR(qp->umem)) {
err = PTR_ERR(qp->umem);
@@ -1002,12 +1142,25 @@
if (err)
goto err_mtt;
- if (mlx4_ib_qp_has_rq(init_attr)) {
+ if (qp_has_rq(init_attr)) {
err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
- ucmd.db_addr, &qp->db);
+ ucmd.base.db_addr, &qp->db);
if (err)
goto err_mtt;
}
+
+ if (udata->inlen > offsetof(struct mlx4_exp_ib_create_qp, uar_virt_add) &&
+ ucmd.uar_virt_add) {
+ qp->bf.uar = find_user_uar(to_mucontext(pd->uobject->context),
+ (unsigned long)ucmd.uar_virt_add);
+ if (!qp->bf.uar) {
+ pr_debug("failed to find user UAR with virt address 0x%llx", (long long)ucmd.uar_virt_add);
+ err = -EINVAL;
+ goto err_mtt;
+ }
+ } else {
+ qp->bf.uar = &to_mucontext(pd->uobject->context)->uar;
+ }
} else {
qp->sq_no_prefetch = 0;
@@ -1015,8 +1168,8 @@
if (err)
goto err;
- if (mlx4_ib_qp_has_rq(init_attr)) {
- err = mlx4_db_alloc(dev->dev, &qp->db, 0);
+ if (qp_has_rq(init_attr)) {
+ err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp);
if (err)
goto err;
@@ -1024,16 +1177,16 @@
}
if (qp->max_inline_data) {
- err = mlx4_bf_alloc(dev->dev, &qp->bf, 0);
+ err = mlx4_bf_alloc(dev->dev, &qp->bf, dev->dev->numa_node);
if (err) {
- pr_debug("failed to allocate blue flame"
- " register (%d)", err);
+ pr_debug("failed to allocate blue flame register (%d)",
+ err);
qp->bf.uar = &dev->priv_uar;
}
} else
qp->bf.uar = &dev->priv_uar;
- if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) {
+ if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) {
err = -ENOMEM;
goto err_db;
}
@@ -1043,13 +1196,12 @@
if (err)
goto err_buf;
- err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
+ err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp);
if (err)
goto err_mtt;
- qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL);
- qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL);
-
+ qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp);
+ qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp);
if (!qp->sq.wrid || !qp->rq.wrid) {
err = -ENOMEM;
goto err_wrid;
@@ -1070,7 +1222,7 @@
goto err_proxy;
}
- err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
+ err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp);
if (err)
goto err_qpn;
@@ -1082,11 +1234,29 @@
* shifting) for send doorbell. Precompute this value to save
* a little bit when posting sends.
*/
- qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+ qp->doorbell_qpn = cpu_to_be32((u32)qp->mqp.qpn << 8);
qp->mqp.event = mlx4_ib_qp_event;
if (!*caller_qp)
*caller_qp = qp;
+
+ spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+ mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq));
+ /* Maintain device to QPs access, needed for further handling
+ * via reset flow
+ */
+ list_add_tail(&qp->qps_list, &dev->qp_list);
+ /* Maintain CQ to QPs access, needed for further handling
+ * via reset flow
+ */
+ mcq = to_mcq(init_attr->send_cq);
+ list_add_tail(&qp->cq_send_list, &mcq->send_qp_list);
+ mcq = to_mcq(init_attr->recv_cq);
+ list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list);
+ mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq));
+ spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
return 0;
err_qpn:
@@ -1097,7 +1267,7 @@
free_proxy_bufs(pd->device, qp);
err_wrid:
if (pd->uobject) {
- if (mlx4_ib_qp_has_rq(init_attr))
+ if (qp_has_rq(init_attr))
mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
} else {
kfree(qp->sq.wrid);
@@ -1114,10 +1284,10 @@
mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
err_db:
- if (!pd->uobject && mlx4_ib_qp_has_rq(init_attr))
+ if (!pd->uobject && qp_has_rq(init_attr))
mlx4_db_free(dev->dev, &qp->db);
- if (qp->max_inline_data)
+ if (!pd->uobject && qp->max_inline_data)
mlx4_bf_free(dev->dev, &qp->bf);
err:
@@ -1144,13 +1314,13 @@
__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
{
if (send_cq == recv_cq) {
- spin_lock_irq(&send_cq->lock);
+ spin_lock(&send_cq->lock);
__acquire(&recv_cq->lock);
} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
- spin_lock_irq(&send_cq->lock);
+ spin_lock(&send_cq->lock);
spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
} else {
- spin_lock_irq(&recv_cq->lock);
+ spin_lock(&recv_cq->lock);
spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
}
}
@@ -1160,13 +1330,13 @@
{
if (send_cq == recv_cq) {
__release(&recv_cq->lock);
- spin_unlock_irq(&send_cq->lock);
+ spin_unlock(&send_cq->lock);
} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
spin_unlock(&recv_cq->lock);
- spin_unlock_irq(&send_cq->lock);
+ spin_unlock(&send_cq->lock);
} else {
spin_unlock(&send_cq->lock);
- spin_unlock_irq(&recv_cq->lock);
+ spin_unlock(&recv_cq->lock);
}
}
@@ -1211,15 +1381,17 @@
int is_user)
{
struct mlx4_ib_cq *send_cq, *recv_cq;
+ unsigned long flags;
if (qp->state != IB_QPS_RESET) {
if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
pr_warn("modify QP %06x to RESET failed.\n",
qp->mqp.qpn);
- if (qp->pri.smac) {
+ if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
qp->pri.smac = 0;
+ qp->pri.smac_port = 0;
}
if (qp->alt.smac) {
mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
@@ -1241,8 +1413,13 @@
get_cqs(qp, &send_cq, &recv_cq);
+ spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
mlx4_ib_lock_cqs(send_cq, recv_cq);
+ /* del from lists under both locks above to protect reset flow paths */
+ list_del(&qp->qps_list);
+ list_del(&qp->cq_send_list);
+ list_del(&qp->cq_recv_list);
if (!is_user) {
__mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
@@ -1253,6 +1430,7 @@
mlx4_qp_remove(dev->dev, &qp->mqp);
mlx4_ib_unlock_cqs(send_cq, recv_cq);
+ spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
mlx4_qp_free(dev->dev, &qp->mqp);
@@ -1283,9 +1461,17 @@
del_gid_entries(qp);
}
-static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
+static int get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
{
/* Native or PPF */
+ if ((!mlx4_is_mfunc(dev->dev) || mlx4_is_master(dev->dev)) &&
+ attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) {
+ int sqpn;
+ int res = mlx4_qp_reserve_range(dev->dev, 1, 1, &sqpn, 0);
+
+ return res ? -abs(res) : sqpn;
+ }
+
if (!mlx4_is_mfunc(dev->dev) ||
(mlx4_is_master(dev->dev) &&
attr->create_flags & MLX4_IB_SRIOV_SQP)) {
@@ -1293,6 +1479,7 @@
(attr->qp_type == IB_QPT_SMI ? 0 : 2) +
attr->port_num - 1;
}
+
/* PF or VF -- creating proxies */
if (attr->qp_type == IB_QPT_SMI)
return dev->dev->caps.qp0_proxy[attr->port_num - 1];
@@ -1316,7 +1503,7 @@
if (attr->parent_attrib.rss_child_count == 1)
return -EINVAL; /* Doesn't make sense */
if ((attr->parent_attrib.tss_child_count == 0) &&
- (attr->parent_attrib.rss_child_count == 0))
+ (attr->parent_attrib.rss_child_count == 0))
/* Should be called with IP_QPG_NONE */
return -EINVAL;
if (attr->parent_attrib.rss_child_count > 1) {
@@ -1347,46 +1534,18 @@
return 0;
}
-#define RESERVED_FLAGS_MASK ((((unsigned int)IB_QP_CREATE_RESERVED_END - 1) | IB_QP_CREATE_RESERVED_END) \
- & ~(IB_QP_CREATE_RESERVED_START - 1))
-
-static enum mlx4_ib_qp_flags to_mlx4_ib_qp_flags(enum ib_qp_create_flags ib_qp_flags)
-{
- enum mlx4_ib_qp_flags mlx4_ib_qp_flags = 0;
-
- if (ib_qp_flags & IB_QP_CREATE_IPOIB_UD_LSO)
- mlx4_ib_qp_flags |= MLX4_IB_QP_LSO;
-
- if (ib_qp_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
- mlx4_ib_qp_flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
-
- if (ib_qp_flags & IB_QP_CREATE_NETIF_QP)
- mlx4_ib_qp_flags |= MLX4_IB_QP_NETIF;
-
- if (ib_qp_flags & IB_QP_CREATE_CROSS_CHANNEL)
- mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_CROSS_CHANNEL;
-
- if (ib_qp_flags & IB_QP_CREATE_MANAGED_SEND)
- mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_MANAGED_SEND;
-
- if (ib_qp_flags & IB_QP_CREATE_MANAGED_RECV)
- mlx4_ib_qp_flags |= MLX4_IB_QP_CAP_MANAGED_RECV;
-
- /* reserved flags */
- mlx4_ib_qp_flags |= (ib_qp_flags & RESERVED_FLAGS_MASK);
-
- return mlx4_ib_qp_flags;
-}
-
-struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
- struct ib_qp_init_attr *init_attr,
- struct ib_udata *udata)
+static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
{
struct mlx4_ib_qp *qp = NULL;
int err;
u16 xrcdn = 0;
- enum mlx4_ib_qp_flags mlx4_qp_flags = to_mlx4_ib_qp_flags(init_attr->create_flags);
struct ib_device *device;
+ gfp_t gfp;
+
+ gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ?
+ GFP_NOIO : GFP_KERNEL;
/* see ib_core::ib_create_qp same handling */
device = pd ? pd->device : init_attr->xrcd->device;
@@ -1394,21 +1553,24 @@
* We only support LSO, vendor flag1, and multicast loopback blocking,
* and only for kernel UD QPs.
*/
- if (mlx4_qp_flags & ~(MLX4_IB_QP_LSO |
+ if (init_attr->create_flags & ~(MLX4_IB_QP_LSO |
MLX4_IB_QP_CAP_CROSS_CHANNEL |
MLX4_IB_QP_CAP_MANAGED_SEND |
MLX4_IB_QP_CAP_MANAGED_RECV |
MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK |
- MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP |
- MLX4_IB_QP_NETIF))
+ MLX4_IB_SRIOV_TUNNEL_QP |
+ MLX4_IB_SRIOV_SQP |
+ MLX4_IB_QP_NETIF |
+ MLX4_IB_QP_CREATE_ROCE_V2_GSI |
+ MLX4_IB_QP_CREATE_USE_GFP_NOIO))
return ERR_PTR(-EINVAL);
if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
- if (init_attr->qp_type != IB_QPT_UD)
- return ERR_PTR(-EINVAL);
+ if (init_attr->qp_type != IB_QPT_UD)
+ return ERR_PTR(-EINVAL);
}
- if ((mlx4_qp_flags &
+ if ((init_attr->create_flags &
(MLX4_IB_QP_CAP_CROSS_CHANNEL |
MLX4_IB_QP_CAP_MANAGED_SEND |
MLX4_IB_QP_CAP_MANAGED_RECV)) &&
@@ -1419,15 +1581,22 @@
return ERR_PTR(-EINVAL);
}
- if ((init_attr->create_flags &
+ if (init_attr->create_flags &
~(IB_QP_CREATE_CROSS_CHANNEL |
IB_QP_CREATE_MANAGED_SEND |
- IB_QP_CREATE_MANAGED_RECV)) &&
- (((mlx4_qp_flags & ~MLX4_IB_SRIOV_SQP) &&
- init_attr->qp_type != IB_QPT_UD) ||
- ((mlx4_qp_flags & MLX4_IB_SRIOV_SQP) &&
- init_attr->qp_type > IB_QPT_GSI)))
- return ERR_PTR(-EINVAL);
+ IB_QP_CREATE_MANAGED_RECV)) {
+ /* userspace is not allowed to set create flags */
+ if (udata)
+ return ERR_PTR(-EINVAL);
+
+ if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO) &&
+ init_attr->qp_type != IB_QPT_UD) &&
+ (init_attr->create_flags & MLX4_IB_SRIOV_SQP &&
+ init_attr->qp_type > IB_QPT_GSI) &&
+ (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI &&
+ init_attr->qp_type != IB_QPT_GSI))
+ return ERR_PTR(-EINVAL);
+ }
err = check_qpg_attr(to_mdev(device), init_attr);
if (err)
@@ -1447,14 +1616,16 @@
case IB_QPT_RC:
case IB_QPT_UC:
case IB_QPT_RAW_PACKET:
- qp = kzalloc(sizeof *qp, GFP_KERNEL);
+ qp = kzalloc(sizeof *qp, gfp);
if (!qp)
return ERR_PTR(-ENOMEM);
- qp->pri.vid = qp->alt.vid = 0xFFFF;
+ qp->pri.vid = 0xFFFF;
+ qp->alt.vid = 0xFFFF;
/* fall through */
case IB_QPT_UD:
{
- err = create_qp_common(to_mdev(device), pd, init_attr, udata, 0, &qp);
+ err = create_qp_common(to_mdev(device), pd, init_attr,
+ udata, 0, &qp, gfp);
if (err) {
kfree(qp);
return ERR_PTR(err);
@@ -1468,19 +1639,25 @@
case IB_QPT_SMI:
case IB_QPT_GSI:
{
+ int sqpn;
+
/* Userspace is not allowed to create special QPs: */
if (udata)
return ERR_PTR(-EINVAL);
+ sqpn = get_sqp_num(to_mdev(device), init_attr);
+
+ if (sqpn < 0)
+ return ERR_PTR(sqpn);
err = create_qp_common(to_mdev(device), pd, init_attr, udata,
- get_sqp_num(to_mdev(device), init_attr),
- &qp);
+ sqpn,
+ &qp, gfp);
if (err)
return ERR_PTR(err);
qp->port = init_attr->port_num;
- qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
-
+ qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 :
+ init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1;
break;
}
default:
@@ -1491,7 +1668,45 @@
return &qp->ibqp;
}
-int mlx4_ib_destroy_qp(struct ib_qp *qp)
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata) {
+ struct ib_device *device = pd ? pd->device : init_attr->xrcd->device;
+ struct ib_qp *ibqp;
+ struct mlx4_ib_dev *dev = to_mdev(device);
+
+ ibqp = _mlx4_ib_create_qp(pd, init_attr, udata);
+
+ if (!mlx4_is_slave(dev->dev) &&
+ !IS_ERR_OR_NULL(ibqp) &&
+ (init_attr->qp_type == IB_QPT_GSI) &&
+ !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) {
+ struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp)));
+ int is_eth = rdma_port_get_link_layer(device, init_attr->port_num) ==
+ IB_LINK_LAYER_ETHERNET;
+
+ if (is_eth &&
+ ((dev->dev->caps.roce_mode == MLX4_ROCE_MODE_1_5_PLUS_2) ||
+ (dev->dev->caps.roce_mode == MLX4_ROCE_MODE_2) ||
+ (dev->dev->caps.roce_mode == MLX4_ROCE_MODE_1_PLUS_2))) {
+ init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI;
+ sqp->roce_v2_gsi = ib_create_qp(pd, init_attr);
+
+ if (IS_ERR_OR_NULL(sqp->roce_v2_gsi)) {
+ pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi));
+ sqp->roce_v2_gsi = NULL;
+ } else {
+ sqp = to_msqp(to_mqp(sqp->roce_v2_gsi));
+ sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP;
+ }
+
+ init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI;
+ }
+ }
+ return ibqp;
+}
+
+static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
{
struct mlx4_ib_dev *dev = to_mdev(qp->device);
struct mlx4_ib_qp *mqp = to_mqp(qp);
@@ -1500,6 +1715,12 @@
if (is_qp0(dev, mqp))
mlx4_CLOSE_PORT(dev->dev, mqp->port);
+ if (dev->qp1_proxy[mqp->port - 1] == mqp) {
+ mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]);
+ dev->qp1_proxy[mqp->port - 1] = NULL;
+ mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]);
+ }
+
pd = get_pd(mqp);
destroy_qp_common(dev, mqp, !!pd->ibpd.uobject);
@@ -1511,6 +1732,20 @@
return 0;
}
+int mlx4_ib_destroy_qp(struct ib_qp *qp)
+{
+ struct mlx4_ib_qp *mqp = to_mqp(qp);
+
+ if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+ struct mlx4_ib_sqp *sqp = to_msqp(mqp);
+
+ if (sqp->roce_v2_gsi)
+ ib_destroy_qp(sqp->roce_v2_gsi);
+ }
+
+ return _mlx4_ib_destroy_qp(qp);
+}
+
static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
{
switch (type) {
@@ -1581,55 +1816,45 @@
path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
}
-static int ib_rate_to_mlx4(struct mlx4_ib_dev *dev, u8 rate)
-{
- if (rate == IB_RATE_PORT_CURRENT) {
- return 0;
- } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) {
- return -EINVAL;
- } else {
- while (rate != IB_RATE_2_5_GBPS &&
- !(1 << (rate + MLX4_STAT_RATE_OFFSET) &
- dev->dev->caps.stat_rate_support))
- --rate;
- }
-
- return rate + MLX4_STAT_RATE_OFFSET;
-}
-
-static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
- u8 *smac, u16 vlan_id, struct mlx4_ib_qp *qp,
- struct mlx4_qp_path *path, u8 port, int is_primary)
+static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
+ u64 smac, u16 vlan_tag, struct mlx4_qp_path *path,
+ struct mlx4_roce_smac_vlan_info *smac_info, u8 port)
{
int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) ==
IB_LINK_LAYER_ETHERNET;
- u16 vlan_tag;
int vidx;
int smac_index;
int err;
- u64 u64_mac;
- struct mlx4_roce_smac_vlan_info *smac_info;
+
path->grh_mylmc = ah->src_path_bits & 0x7f;
path->rlid = cpu_to_be16(ah->dlid);
-
- err = ib_rate_to_mlx4(dev, ah->static_rate);
- if (err < 0)
- return err;
- path->static_rate = err;
+ if (ah->static_rate) {
+ path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
+ while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
+ --path->static_rate;
+ } else
+ path->static_rate = 0;
if (ah->ah_flags & IB_AH_GRH) {
- if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
+ int real_sgid_index = mlx4_ib_gid_index_to_real_index(dev,
+ port,
+ ah->grh.sgid_index);
+
+ if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) {
pr_err("sgid_index (%u) too large. max is %d\n",
- ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
+ real_sgid_index, dev->dev->caps.gid_table_len[port] - 1);
return -1;
}
path->grh_mylmc |= 1 << 7;
- path->mgid_index = ah->grh.sgid_index;
+ path->mgid_index = real_sgid_index;
path->hop_limit = ah->grh.hop_limit;
path->tclass_flowlabel =
- cpu_to_be32((ah->grh.traffic_class << 20) |
+ cpu_to_be32(
+ ((ah->grh.traffic_class &
+ ~TRAFFIC_CLASS_MASK(dev, port)) << 20) |
(ah->grh.flow_label));
memcpy(path->rgid, ah->grh.dgid.raw, 16);
}
@@ -1641,12 +1866,7 @@
path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
((port - 1) << 6) | ((ah->sl & 7) << 3);
- if (is_primary)
- smac_info = &qp->pri;
- else
- smac_info = &qp->alt;
-
- vlan_tag = vlan_id;
+ path->feup |= MLX4_FEUP_FORCE_ETH_UP;
if (vlan_tag < 0x1000) {
if (smac_info->vid < 0x1000) {
/* both valid vlan ids */
@@ -1660,10 +1880,8 @@
smac_info->candidate_vlan_port = port;
smac_info->update_vid = 1;
path->vlan_index = vidx;
- path->fl = 1 << 6;
} else {
path->vlan_index = smac_info->vlan_index;
- path->fl = 1 << 6;
}
} else {
/* no current vlan tag in qp */
@@ -1675,8 +1893,9 @@
smac_info->candidate_vlan_port = port;
smac_info->update_vid = 1;
path->vlan_index = vidx;
- path->fl = 1 << 6;
}
+ path->feup |= MLX4_FVL_FORCE_ETH_VLAN;
+ path->fl = 1 << 6;
} else {
/* have current vlan tag. unregister it at modify-qp success */
if (smac_info->vid < 0x1000) {
@@ -1685,38 +1904,62 @@
}
}
-
/* get smac_index for RoCE use.
* If no smac was yet assigned, register one.
* If one was already assigned, but the new mac differs,
* unregister the old one and register the new one.
*/
- u64_mac = mlx4_mac_to_u64(smac);
-
- if (!smac_info->smac || smac_info->smac != u64_mac) {
+ if ((!smac_info->smac && !smac_info->smac_port) ||
+ smac_info->smac != smac) {
/* register candidate now, unreg if needed, after success */
- smac_index = mlx4_register_mac(dev->dev, port, u64_mac);
+ smac_index = mlx4_register_mac(dev->dev, port, smac);
if (smac_index >= 0) {
smac_info->candidate_smac_index = smac_index;
- smac_info->candidate_smac = u64_mac;
+ smac_info->candidate_smac = smac;
smac_info->candidate_smac_port = port;
- } else
+ } else {
return -EINVAL;
- } else
+ }
+ } else {
smac_index = smac_info->smac_index;
+ }
memcpy(path->dmac, ah->dmac, 6);
path->ackto = MLX4_IB_LINK_TYPE_ETH;
/* put MAC table smac index for IBoE */
- path->grh_mylmc = (u8) (smac_index) | 0x80 ;
-
- } else
+ path->grh_mylmc = (u8) (smac_index) | 0x80;
+ } else {
path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
((port - 1) << 6) | ((ah->sl & 0xf) << 2);
+ }
return 0;
}
+static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp,
+ enum ib_qp_attr_mask qp_attr_mask,
+ struct mlx4_ib_qp *mqp,
+ struct mlx4_qp_path *path, u8 port,
+ u16 vlan_id, u8 *smac)
+{
+ return _mlx4_set_path(dev, &qp->ah_attr,
+ mlx4_mac_to_u64(smac),
+ vlan_id,
+ path, &mqp->pri, port);
+}
+
+static int mlx4_set_alt_path(struct mlx4_ib_dev *dev,
+ const struct ib_qp_attr *qp,
+ enum ib_qp_attr_mask qp_attr_mask,
+ struct mlx4_ib_qp *mqp,
+ struct mlx4_qp_path *path, u8 port)
+{
+ return _mlx4_set_path(dev, &qp->alt_ah_attr,
+ 0,
+ 0xffff,
+ path, &mqp->alt, port);
+}
+
static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
{
struct mlx4_ib_gid_entry *ge, *tmp;
@@ -1729,35 +1972,94 @@
}
}
-static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, const u8 *smac,
+static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev,
+ struct mlx4_ib_qp *qp,
struct mlx4_qp_context *context)
{
- struct net_device *ndev;
u64 u64_mac;
int smac_index;
-
- ndev = dev->iboe.netdevs[qp->port - 1];
- if (ndev) {
- smac = IF_LLADDR(ndev);
- u64_mac = mlx4_mac_to_u64(smac);
- } else {
- u64_mac = dev->dev->caps.def_mac[qp->port];
- }
+ u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]);
context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6);
- if (!qp->pri.smac) {
+ if (!qp->pri.smac && !qp->pri.smac_port) {
smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac);
if (smac_index >= 0) {
qp->pri.candidate_smac_index = smac_index;
qp->pri.candidate_smac = u64_mac;
qp->pri.candidate_smac_port = qp->port;
context->pri_path.grh_mylmc = 0x80 | (u8) smac_index;
- } else
+ } else {
return -ENOENT;
+ }
}
return 0;
}
+
+enum {
+ MLX4_QPC_ROCE_MODE_1 = 0,
+ MLX4_QPC_ROCE_MODE_2 = 2,
+ MLX4_QPC_ROCE_MODE_1_5 = 1,
+ MLX4_QPC_ROCE_MODE_MAX = 0xff
+};
+
+static enum ib_gid_type mlx4_gid_type_to_ib_gid_type(enum mlx4_roce_gid_type mlx4_gid_type)
+{
+ switch (mlx4_gid_type) {
+ case MLX4_ROCE_GID_TYPE_V1:
+ return IB_GID_TYPE_IB;
+ case MLX4_ROCE_GID_TYPE_V1_5:
+ return IB_GID_TYPE_ROCE_V1_5;
+ case MLX4_ROCE_GID_TYPE_V2:
+ return IB_GID_TYPE_ROCE_V2;
+ default:
+ return IB_GID_TYPE_SIZE;
+ }
+}
+
+static u8 gid_type_to_qpc(enum ib_gid_type gid_type)
+{
+ switch (gid_type) {
+ case IB_GID_TYPE_IB:
+ return MLX4_QPC_ROCE_MODE_1;
+ case IB_GID_TYPE_ROCE_V2:
+ return MLX4_QPC_ROCE_MODE_2;
+ case IB_GID_TYPE_ROCE_V1_5:
+ return MLX4_QPC_ROCE_MODE_1_5;
+ default:
+ return MLX4_QPC_ROCE_MODE_MAX;
+ }
+}
+
+#ifdef CONFIG_INFINIBAND_WQE_FORMAT
+static inline void initialize_wq_buf(struct mlx4_ib_qp *qp)
+{
+ __be32 *wqe;
+ int i;
+ int wq_size;
+
+ wq_size = (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ wqe = get_send_wqe(qp, 0);
+ for (i = 0; i < wq_size; i += 64, wqe += 16)
+ *wqe = cpu_to_be32(1 << 30);
+}
+#else
+static inline void initialize_wq_buf(struct mlx4_ib_qp *qp)
+{
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ int i;
+
+ for (i = 0; i < qp->sq.wqe_cnt; ++i) {
+ ctrl = get_send_wqe(qp, i);
+ ctrl->owner_opcode = cpu_to_be32(1 << 31);
+ if (qp->sq_max_wqes_per_wr == 1)
+ ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
+
+ stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
+ }
+}
+#endif
+
static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
const struct ib_qp_attr *attr, int attr_mask,
enum ib_qp_state cur_state, enum ib_qp_state new_state)
@@ -1771,7 +2073,12 @@
int sqd_event;
int steer_qp = 0;
int err = -EINVAL;
- int is_eth = -1;
+
+ /* APM is not supported under RoCE */
+ if (attr_mask & IB_QP_ALT_PATH &&
+ rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+ IB_LINK_LAYER_ETHERNET)
+ return -ENOTSUPP;
context = kzalloc(sizeof *context, GFP_KERNEL);
if (!context)
@@ -1831,13 +2138,12 @@
if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
context->xrcd = cpu_to_be32((u32) qp->xrcdn);
- context->param3 |= cpu_to_be32(1 << 30);
+ if (ibqp->qp_type == IB_QPT_RAW_PACKET)
+ context->param3 |= cpu_to_be32(1 << 30);
}
- if (qp->ibqp.uobject)
- context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
- else
- context->usr_page = cpu_to_be32(qp->bf.uar->index);
+ context->usr_page =
+ cpu_to_be32(mlx4_to_hw_uar_index(dev->dev, qp->bf.uar->index));
if (attr_mask & IB_QP_DEST_QPN)
context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
@@ -1864,6 +2170,26 @@
mlx4_ib_steer_qp_reg(dev, qp, 1);
steer_qp = 1;
}
+
+ if (ibqp->qp_type == IB_QPT_UD) {
+ enum ib_gid_type ud_gid_type =
+ mlx4_gid_type_to_ib_gid_type(dev->dev->caps.ud_gid_type);
+ u8 qpc_roce_mode = gid_type_to_qpc(ud_gid_type);
+
+ context->rlkey_roce_mode |= (qpc_roce_mode << 6);
+ }
+ if (ibqp->qp_type == IB_QPT_GSI) {
+ enum ib_gid_type main_gsi_gid_type =
+ (dev->dev->caps.roce_mode == MLX4_ROCE_MODE_1 ||
+ dev->dev->caps.roce_mode == MLX4_ROCE_MODE_1_PLUS_2) ?
+ IB_GID_TYPE_IB : IB_GID_TYPE_ROCE_V1_5;
+
+ enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ?
+ IB_GID_TYPE_ROCE_V2 : main_gsi_gid_type;
+ u8 qpc_roce_mode = gid_type_to_qpc(gid_type);
+
+ context->rlkey_roce_mode |= (qpc_roce_mode << 6);
+ }
}
if (attr_mask & IB_QP_PKEY_INDEX) {
@@ -1873,17 +2199,52 @@
optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
}
- if ((attr_mask & IB_QP_AV) && (ibqp->qp_type != IB_QPT_RAW_PACKET)) {
- if (mlx4_set_path(dev, &attr->ah_attr, (u8 *)attr->smac,
- attr_mask & IB_QP_VID ?
- attr->vlan_id : 0xffff ,
- qp, &context->pri_path,
- attr_mask & IB_QP_PORT ?
- attr->port_num : qp->port, 1))
+ if (attr_mask & IB_QP_AV) {
+ u8 port_num = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr = {.gid_type = IB_GID_TYPE_IB};
+ u16 vlan = 0xffff;
+ u8 smac[ETH_ALEN];
+ int status = 0;
+ int is_eth = rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+ IB_LINK_LAYER_ETHERNET;
+
+ if (is_eth && attr->ah_attr.ah_flags & IB_AH_GRH) {
+ int index = attr->ah_attr.grh.sgid_index;
+ if (mlx4_is_bonded(dev->dev))
+ port_num = 1;
+ rcu_read_lock();
+ status = ib_get_cached_gid(ibqp->device, port_num,
+ index, &gid, &gid_attr);
+ if (!status && !(memcmp(&gid, &zgid, sizeof(gid))))
+ status = -ENONET;
+ if (!status) {
+ vlan = rdma_vlan_dev_vlan_id(gid_attr.ndev);
+ memcpy(smac, IF_LLADDR(gid_attr.ndev), ETH_ALEN);
+ }
+ rcu_read_unlock();
+ }
+ if (status)
+ goto out;
+
+ if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path,
+ port_num, vlan, smac))
goto out;
+ if (is_eth && gid_attr.gid_type == IB_GID_TYPE_ROCE_V2)
+ context->pri_path.hop_limit = IPV6_DEFAULT_HOPLIMIT;
+
optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
MLX4_QP_OPTPAR_SCHED_QUEUE);
+
+ if (is_eth && (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) {
+ u8 qpc_roce_mode = gid_type_to_qpc(gid_attr.gid_type);
+
+ if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_MAX)
+ goto out;
+ context->rlkey_roce_mode |= (qpc_roce_mode << 6);
+ }
+
}
if (attr_mask & IB_QP_TIMEOUT) {
@@ -1900,16 +2261,13 @@
dev->dev->caps.pkey_table_len[attr->alt_port_num])
goto out;
- if (mlx4_set_path(dev, &attr->alt_ah_attr, (u8 *)attr->smac,
- attr_mask & IB_QP_ALT_VID ?
- attr->alt_vlan_id : 0xffff,
- qp, &context->alt_path,
- attr->alt_port_num, 0))
+ if (mlx4_set_alt_path(dev, attr, attr_mask, qp,
+ &context->alt_path,
+ attr->alt_port_num))
goto out;
context->alt_path.pkey_index = attr->alt_pkey_index;
context->alt_path.ackto = attr->alt_timeout << 3;
- context->alt_path.counter_index = dev->counters[attr->alt_port_num - 1].counter_index;
optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
}
@@ -1956,18 +2314,6 @@
optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
}
- if (attr_mask & IB_M_EXT_CLASS_1)
- context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER);
-
- /* for now we enable also sqe on send */
- if (attr_mask & IB_M_EXT_CLASS_2) {
- context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_SQ);
- context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER);
- }
-
- if (attr_mask & IB_M_EXT_CLASS_3)
- context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_RQ);
-
if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
context->params2 |= (qp->flags & MLX4_IB_QP_CAP_CROSS_CHANNEL ?
cpu_to_be32(MLX4_QP_BIT_COLL_MASTER) : 0);
@@ -2031,15 +2377,8 @@
context->pri_path.fl = 0x80;
context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
}
- if (ibqp->qp_type == IB_QPT_RAW_PACKET &&
- (attr_mask & IB_QP_AV)) {
- context->pri_path.sched_queue |=
- ((attr->ah_attr.sl & 0xf) << 3);
- context->pri_path.feup = 1 << 6;
- }
- is_eth = rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
- IB_LINK_LAYER_ETHERNET;
- if (is_eth) {
+ if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+ IB_LINK_LAYER_ETHERNET) {
if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI ||
qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI)
context->pri_path.feup = 1 << 7; /* don't fsm */
@@ -2047,18 +2386,35 @@
if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD ||
qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI ||
qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) {
- err = handle_eth_ud_smac_index(dev, qp, (const u8 *)attr->smac, context);
+ err = handle_eth_ud_smac_index(dev, qp, context);
if (err)
return -EINVAL;
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
+ dev->qp1_proxy[qp->port - 1] = qp;
}
}
}
- if (ibqp->qp_type == IB_QPT_UD)
- if (is_eth && (new_state == IB_QPS_RTR)) {
+ if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
+ context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
+ MLX4_IB_LINK_TYPE_ETH;
+ if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+ /* set QP to receive both tunneled & non-tunneled packets */
+ if (!(context->flags & cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET)))
+ context->srqn = cpu_to_be32(7 << 28);
+ }
+ }
+
+ if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) {
+ int is_eth = rdma_port_get_link_layer(
+ &dev->ib_dev, qp->port) ==
+ IB_LINK_LAYER_ETHERNET;
+ if (is_eth) {
context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH;
optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH;
}
+ }
+
if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
@@ -2067,14 +2423,14 @@
sqd_event = 0;
if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
- context->rlkey |= (1 << 4);
+ context->rlkey_roce_mode |= (1 << 4);
if ((attr_mask & IB_QP_GROUP_RSS) &&
- (qp->qpg_data->rss_child_count > 1)) {
+ (qp->qpg_data->rss_child_count > 1)) {
struct mlx4_ib_qpg_data *qpg_data = qp->qpg_data;
void *rss_context_base = &context->pri_path;
struct mlx4_rss_context *rss_context =
- (struct mlx4_rss_context *) (rss_context_base
+ (struct mlx4_rss_context *)(rss_context_base
+ MLX4_RSS_OFFSET_IN_QPC_PRI_PATH);
context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET);
@@ -2099,11 +2455,11 @@
0xA74499AD, 0x593D56D9, 0xF3253C06, 0x2ADC1FFC};
rss_context->hash_fn = MLX4_RSS_HASH_TOP;
memcpy(rss_context->rss_key, rsskey,
- sizeof(rss_context->rss_key));
+ sizeof(rss_context->rss_key));
} else {
rss_context->hash_fn = MLX4_RSS_HASH_XOR;
memset(rss_context->rss_key, 0,
- sizeof(rss_context->rss_key));
+ sizeof(rss_context->rss_key));
}
}
/*
@@ -2112,24 +2468,9 @@
* headroom is stamped so that the hardware doesn't start
* processing stale work requests.
*/
- if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
- struct mlx4_wqe_ctrl_seg *ctrl;
- int i;
-
- for (i = 0; i < qp->sq.wqe_cnt; ++i) {
- ctrl = get_send_wqe(qp, i);
- ctrl->owner_opcode = cpu_to_be32(1 << 31);
- if (qp->sq_max_wqes_per_wr == 1)
- ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
-
- stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
- }
- }
-
- if ((qp->port && rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
- IB_LINK_LAYER_ETHERNET) && (qp->ibqp.qp_type == IB_QPT_RAW_PACKET))
- context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
- MLX4_IB_LINK_TYPE_ETH;
+ if (!ibqp->uobject && cur_state == IB_QPS_RESET &&
+ new_state == IB_QPS_INIT)
+ initialize_wq_buf(qp);
err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
to_mlx4_state(new_state), context, optpar,
@@ -2153,29 +2494,6 @@
if (is_sqp(dev, qp))
store_sqp_attrs(to_msqp(qp), attr, attr_mask);
- /* Set 'ignore_cq_overrun' bits for collectives offload */
- if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
- if (attr_mask & (IB_M_EXT_CLASS_2 | IB_M_EXT_CLASS_3)) {
- err = mlx4_ib_ignore_overrun_cq(ibqp->send_cq);
- if (err) {
- pr_err("Failed to set ignore CQ "
- "overrun for QP 0x%x's send CQ\n",
- ibqp->qp_num);
- goto out;
- }
-
- if (ibqp->recv_cq != ibqp->send_cq) {
- err = mlx4_ib_ignore_overrun_cq(ibqp->recv_cq);
- if (err) {
- pr_err("Failed to set ignore "
- "CQ overrun for QP 0x%x's recv "
- "CQ\n", ibqp->qp_num);
- goto out;
- }
- }
- }
- }
-
/*
* If we moved QP0 to RTR, bring the IB link up; if we moved
* QP0 to RESET or ERROR, bring the link back down.
@@ -2215,9 +2533,10 @@
qp->qpg_type == IB_QPG_PARENT))
mlx4_ib_steer_qp_reg(dev, qp, 0);
}
- if (qp->pri.smac) {
+ if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
qp->pri.smac = 0;
+ qp->pri.smac_port = 0;
}
if (qp->alt.smac) {
mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
@@ -2237,42 +2556,38 @@
qp->alt.update_vid = 0;
}
}
-
out:
if (err && steer_qp)
mlx4_ib_steer_qp_reg(dev, qp, 0);
kfree(context);
- if (qp->pri.candidate_smac) {
- if (err)
+ if (qp->pri.candidate_smac ||
+ (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) {
+ if (err) {
mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac);
- else {
- if (qp->pri.smac) {
+ } else {
+ if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port))
mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
- }
qp->pri.smac = qp->pri.candidate_smac;
qp->pri.smac_index = qp->pri.candidate_smac_index;
qp->pri.smac_port = qp->pri.candidate_smac_port;
-
}
qp->pri.candidate_smac = 0;
qp->pri.candidate_smac_index = 0;
qp->pri.candidate_smac_port = 0;
}
if (qp->alt.candidate_smac) {
- if (err)
- mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->pri.candidate_smac);
- else {
- if (qp->pri.smac) {
+ if (err) {
+ mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac);
+ } else {
+ if (qp->alt.smac)
mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
- }
qp->alt.smac = qp->alt.candidate_smac;
qp->alt.smac_index = qp->alt.candidate_smac_index;
qp->alt.smac_port = qp->alt.candidate_smac_port;
-
}
- qp->pri.candidate_smac = 0;
- qp->pri.candidate_smac_index = 0;
- qp->pri.candidate_smac_port = 0;
+ qp->alt.candidate_smac = 0;
+ qp->alt.candidate_smac_index = 0;
+ qp->alt.candidate_smac_port = 0;
}
if (qp->pri.update_vid) {
@@ -2312,15 +2627,14 @@
return err;
}
-int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
- int attr_mask, struct ib_udata *udata)
+static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
{
struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
struct mlx4_ib_qp *qp = to_mqp(ibqp);
enum ib_qp_state cur_state, new_state;
int err = -EINVAL;
int ll;
-
mutex_lock(&qp->mutex);
cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
@@ -2334,7 +2648,7 @@
}
if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
- attr_mask & ~IB_M_QP_MOD_VEND_MASK, ll)) {
+ attr_mask, ll)) {
pr_debug("qpn 0x%x: invalid attribute mask specified "
"for transition %d to %d. qp_type %d,"
" attr_mask 0x%x\n",
@@ -2343,10 +2657,25 @@
goto out;
}
- if ((attr_mask & IB_M_QP_MOD_VEND_MASK) && !dev->dev->caps.sync_qp) {
- pr_err("extended verbs are not supported by %s\n",
- dev->ib_dev.name);
- goto out;
+ if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) {
+ if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) {
+ if ((ibqp->qp_type == IB_QPT_RC) ||
+ (ibqp->qp_type == IB_QPT_UD) ||
+ (ibqp->qp_type == IB_QPT_UC) ||
+ (ibqp->qp_type == IB_QPT_RAW_PACKET) ||
+ (ibqp->qp_type == IB_QPT_XRC_INI)) {
+ if (attr_mask & IB_QP_FLOW_ENTROPY)
+ attr->port_num = (attr->flow_entropy % dev->num_ports) + 1;
+ else
+ attr->port_num =
+ mlx4_ib_bond_next_port(dev);
+
+ }
+ } else {
+ /* no sense in changing port_num
+ * when ports are bonded */
+ attr_mask &= ~IB_QP_PORT;
+ }
}
if ((attr_mask & IB_QP_PORT) &&
@@ -2358,6 +2687,11 @@
goto out;
}
+ if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) &&
+ (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) !=
+ IB_LINK_LAYER_ETHERNET))
+ goto out;
+
if (attr_mask & IB_QP_PKEY_INDEX) {
int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) {
@@ -2394,11 +2728,46 @@
err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+ if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT))
+ attr->port_num = 1;
+
out:
mutex_unlock(&qp->mutex);
return err;
}
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ int ret;
+
+ ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata);
+
+ if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+ struct mlx4_ib_sqp *sqp = to_msqp(mqp);
+
+ if (sqp->roce_v2_gsi)
+ ret = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask);
+ if (ret)
+ pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n", ret);
+ }
+ return ret;
+}
+
+static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey)
+{
+ int i;
+ for (i = 0; i < dev->caps.num_ports; i++) {
+ if (qpn == dev->caps.qp0_proxy[i] ||
+ qpn == dev->caps.qp0_tunnel[i]) {
+ *qkey = dev->caps.qp0_qkey[i];
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
struct ib_send_wr *wr,
void *wqe, unsigned *mlx_seg_len)
@@ -2428,7 +2797,7 @@
if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
send_size += sizeof (struct mlx4_ib_tunnel_header);
- ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header);
+ ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header);
if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
sqp->ud_header.lrh.service_level =
@@ -2456,8 +2825,13 @@
cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]);
sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
- if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
- return -EINVAL;
+ if (mlx4_is_master(mdev->dev)) {
+ if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
+ return -EINVAL;
+ } else {
+ if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
+ return -EINVAL;
+ }
sqp->ud_header.deth.qkey = cpu_to_be32(qkey);
sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn);
@@ -2475,11 +2849,11 @@
spc = MLX4_INLINE_ALIGN -
((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
if (header_size <= spc) {
- inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+ inl->byte_count = cpu_to_be32((1U << 31) | header_size);
memcpy(inl + 1, sqp->header_buf, header_size);
i = 1;
} else {
- inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ inl->byte_count = cpu_to_be32((1U << 31) | spc);
memcpy(inl + 1, sqp->header_buf, spc);
inl = (void *) (inl + 1) + spc;
@@ -2498,7 +2872,7 @@
* of 16 mod 64.
*/
wmb();
- inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+ inl->byte_count = cpu_to_be32((1U << 31) | (header_size - spc));
i = 2;
}
@@ -2507,10 +2881,28 @@
return 0;
}
+static u8 sl_to_vl(struct mlx4_ib_dev *dev, u8 sl, int port_num)
+{
+ union sl2vl_tbl_to_u64 tmp_vltab;
+ u8 vl;
+
+ if (sl > 15)
+ return 0xf;
+ tmp_vltab.sl64 = atomic64_read(&dev->sl2vl[port_num - 1]);
+ vl = tmp_vltab.sl8[sl >> 1];
+ if (sl & 1)
+ vl &= 0x0f;
+ else
+ vl >>= 4;
+ return vl;
+}
+
+#define MLX4_ROCEV2_QP1_SPORT 0xC000
static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
- void *wqe, unsigned *mlx_seg_len)
+ void *wqe, unsigned *mlx_seg_len, __be32 owner_bit)
{
struct ib_device *ib_dev = sqp->qp.ibqp.device;
+ struct mlx4_ib_dev *mibdev = to_mdev(ib_dev);
struct mlx4_wqe_mlx_seg *mlx = wqe;
struct mlx4_wqe_ctrl_seg *ctrl = wqe;
struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
@@ -2521,11 +2913,21 @@
int header_size;
int spc;
int i;
- int is_eth;
- int is_vlan = 0;
- int is_grh;
- u16 uninitialized_var(vlan);
int err = 0;
+ u16 vlan = 0xffff;
+ bool is_eth;
+ bool is_vlan = false;
+ bool is_grh;
+ bool is_udp = false;
+ int ip_version = 0;
+ u8 ecn =
+#ifdef CONFIG_MLX4_IB_DEBUG_FS
+ ecn_enabled(mibdev, sqp->qp.port,
+ be32_to_cpu(ah->av.eth.sl_tclass_flowlabel) >> 29) ?
+ INET_ECN_ECT_0 : INET_ECN_NOT_ECT;
+#else
+ 0 /* INET_ECN_NOT_ECT */;
+#endif
send_size = 0;
for (i = 0; i < wr->num_sge; ++i)
@@ -2534,29 +2936,48 @@
is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
is_grh = mlx4_ib_ah_grh_present(ah);
if (is_eth) {
+ struct ib_gid_attr gid_attr;
+
if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+ enum mlx4_roce_gid_type gid_type;
+
/* When multi-function is enabled, the ib_core gid
* indexes don't necessarily match the hw ones, so
* we must use our own cache */
err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev,
be32_to_cpu(ah->av.ib.port_pd) >> 24,
- ah->av.ib.gid_index, &sgid.raw[0]);
- if (err)
- return err;
+ ah->av.ib.gid_index, &sgid.raw[0], &gid_type);
+ gid_attr.gid_type = mlx4_gid_type_to_ib_gid_type(gid_type);
} else {
- err = ib_get_cached_gid(ib_dev,
+ err = ib_get_cached_gid(sqp->qp.ibqp.device,
be32_to_cpu(ah->av.ib.port_pd) >> 24,
- ah->av.ib.gid_index, &sgid);
- if (err)
- return err;
+ ah->av.ib.gid_index, &sgid, &gid_attr);
+ if (!err && !memcmp(&sgid, &zgid, sizeof(sgid)))
+ err = -ENOENT;
+ }
+
+ if (!err) {
+ is_udp = (gid_attr.gid_type == IB_GID_TYPE_ROCE_V2) ? true : false;
+ if (gid_attr.gid_type != IB_GID_TYPE_IB) {
+ is_grh = false;
+ if (ipv6_addr_v4mapped((struct in6_addr *)&sgid))
+ ip_version = 4;
+ else
+ ip_version = 6;
+ }
+ } else {
+ return err;
}
- if (is_eth && ah->av.eth.vlan != 0xffff) {
- vlan = cpu_to_be16(ah->av.eth.vlan) & 0x0fff;
+ if (ah->av.eth.vlan != cpu_to_be16(0xffff)) {
+ vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff;
is_vlan = 1;
}
}
- ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
+ err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh,
+ ip_version, is_udp, 0, &sqp->ud_header);
+ if (err)
+ return err;
if (!is_eth) {
sqp->ud_header.lrh.service_level =
@@ -2565,35 +2986,63 @@
sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
}
- if (is_grh) {
+ if (is_grh || (ip_version == 6)) {
sqp->ud_header.grh.traffic_class =
- (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+ ((be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) &
+ ~TRAFFIC_CLASS_MASK(mibdev, sqp->qp.port)) |
+ (ecn & TRAFFIC_CLASS_MASK(mibdev, sqp->qp.port));
sqp->ud_header.grh.flow_label =
ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
- sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit;
- if (is_eth)
+
+ sqp->ud_header.grh.hop_limit = ip_version ? IPV6_DEFAULT_HOPLIMIT : 1;
+ if (is_eth) {
memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16);
- else {
- if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
- /* When multi-function is enabled, the ib_core gid
- * indexes don't necessarily match the hw ones, so
- * we must use our own cache */
- sqp->ud_header.grh.source_gid.global.subnet_prefix =
- to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
- subnet_prefix;
- sqp->ud_header.grh.source_gid.global.interface_id =
- to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
- guid_cache[ah->av.ib.gid_index];
- } else
- ib_get_cached_gid(ib_dev,
- be32_to_cpu(ah->av.ib.port_pd) >> 24,
- ah->av.ib.gid_index,
- &sqp->ud_header.grh.source_gid);
- }
+ sqp->ud_header.grh.next_header = is_grh ?
+ 0x1b : (is_udp ? IPPROTO_UDP : mibdev->dev->caps.rr_proto);
+ } else {
+ sqp->ud_header.grh.next_header = 0x1b;
+ if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+ /* When multi-function is enabled, the ib_core gid
+ * indexes don't necessarily match the hw ones, so
+ * we must use our own cache */
+ sqp->ud_header.grh.source_gid.global.subnet_prefix =
+ to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+ subnet_prefix;
+ sqp->ud_header.grh.source_gid.global.interface_id =
+ to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+ guid_cache[ah->av.ib.gid_index];
+ } else
+ ib_get_cached_gid(ib_dev,
+ be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index,
+ &sqp->ud_header.grh.source_gid, NULL);
+ }
memcpy(sqp->ud_header.grh.destination_gid.raw,
ah->av.ib.dgid, 16);
}
+ if (ip_version == 4) {
+ sqp->ud_header.ip4.tos =
+ ((be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) &
+ ~TRAFFIC_CLASS_MASK(mibdev, sqp->qp.port)) |
+ (ecn & TRAFFIC_CLASS_MASK(mibdev, sqp->qp.port));
+ sqp->ud_header.ip4.id = 0;
+ sqp->ud_header.ip4.frag_off = htons(IP_DF);
+ sqp->ud_header.ip4.ttl = IPV6_DEFAULT_HOPLIMIT;
+
+ memcpy(&sqp->ud_header.ip4.saddr,
+ sgid.raw + 12, 4);
+ memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4);
+ sqp->ud_header.ip4.protocol = is_udp ? IPPROTO_UDP : mibdev->dev->caps.rr_proto;
+ sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header);
+ }
+
+ if (is_udp) {
+ sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT);
+ sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT);
+ sqp->ud_header.udp.csum = 0;
+ }
+
mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
if (!is_eth) {
@@ -2621,35 +3070,37 @@
}
if (is_eth) {
- u8 *smac;
struct in6_addr in6;
-
+ u16 ether_type;
u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
+ ether_type = (!ip_version) ? MLX4_IB_IBOE_ETHERTYPE :
+ (ip_version == 4 ? ETHERTYPE_IP : ETHERTYPE_IPV6);
+
mlx->sched_prio = cpu_to_be16(pcp);
+ ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac);
memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
- /* FIXME: cache smac value? */
memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
memcpy(&in6, sgid.raw, sizeof(in6));
- if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev))
- smac = IF_LLADDR(to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]);
- else
- smac = ah->av.eth.s_mac; /* use the src mac of the tunnel */
- memcpy(sqp->ud_header.eth.smac_h, smac, 6);
if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
if (!is_vlan) {
- sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+ sqp->ud_header.eth.type = cpu_to_be16(ether_type);
} else {
- sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+ sqp->ud_header.vlan.type = cpu_to_be16(ether_type);
sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
}
} else {
- sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
+ sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 :
+ sl_to_vl(mibdev,
+ sqp->ud_header.lrh.service_level,
+ sqp->qp.port);
+ if (sqp->qp.ibqp.qp_num && sqp->ud_header.lrh.virtual_lane == 15)
+ return -EINVAL;
if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
}
@@ -2689,11 +3140,11 @@
spc = MLX4_INLINE_ALIGN -
((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
if (header_size <= spc) {
- inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+ inl->byte_count = SET_BYTE_COUNT(1 << 31 | header_size);
memcpy(inl + 1, sqp->header_buf, header_size);
i = 1;
} else {
- inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ inl->byte_count = SET_BYTE_COUNT(1 << 31 | spc);
memcpy(inl + 1, sqp->header_buf, spc);
inl = (void *) (inl + 1) + spc;
@@ -2712,7 +3163,7 @@
* of 16 mod 64.
*/
wmb();
- inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+ inl->byte_count = SET_BYTE_COUNT(1 << 31 | (header_size - spc));
i = 2;
}
@@ -2791,12 +3242,8 @@
static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
{
+ memset(iseg, 0, sizeof(*iseg));
iseg->mem_key = cpu_to_be32(rkey);
-
- iseg->reserved1 = 0;
- iseg->reserved2 = 0;
- iseg->reserved3[0] = 0;
- iseg->reserved3[1] = 0;
}
static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
@@ -2843,7 +3290,8 @@
static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,
struct mlx4_wqe_datagram_seg *dseg,
- struct ib_send_wr *wr, enum ib_qp_type qpt)
+ struct ib_send_wr *wr,
+ enum mlx4_ib_qp_type qpt)
{
union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av;
struct mlx4_av sqp_av = {0};
@@ -2856,8 +3304,10 @@
cpu_to_be32(0xf0000000);
memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av));
- /* This function used only for sending on QP1 proxies */
- dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]);
+ if (qpt == MLX4_IB_QPT_PROXY_GSI)
+ dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]);
+ else
+ dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]);
/* Use QKEY from the QP context, which is set by master */
dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY);
}
@@ -2875,24 +3325,24 @@
hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index);
hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
memcpy(hdr.mac, ah->av.eth.mac, 6);
- hdr.vlan = cpu_to_be16(ah->av.eth.vlan);
+ hdr.vlan = ah->av.eth.vlan;
spc = MLX4_INLINE_ALIGN -
((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
if (sizeof (hdr) <= spc) {
memcpy(inl + 1, &hdr, sizeof (hdr));
wmb();
- inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr));
+ inl->byte_count = cpu_to_be32((1U << 31) | (u32)sizeof(hdr));
i = 1;
} else {
memcpy(inl + 1, &hdr, spc);
wmb();
- inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ inl->byte_count = cpu_to_be32((1U << 31) | spc);
inl = (void *) (inl + 1) + spc;
memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc);
wmb();
- inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc));
+ inl->byte_count = cpu_to_be32((1U << 31) | (u32)(sizeof (hdr) - spc));
i = 2;
}
@@ -2917,10 +3367,11 @@
*/
wmb();
- iseg->byte_count = cpu_to_be32((1 << 31) | 4);
+ iseg->byte_count = cpu_to_be32((1U << 31) | 4);
}
-static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg,
+ __be32 owner_bit)
{
dseg->lkey = cpu_to_be32(sg->lkey);
dseg->addr = cpu_to_be64(sg->addr);
@@ -2934,8 +3385,7 @@
* stale data, and end up sending the wrong data.
*/
wmb();
-
- dseg->byte_count = cpu_to_be32(sg->length);
+ dseg->byte_count = SET_BYTE_COUNT(sg->length);
}
static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
@@ -2947,21 +3397,18 @@
static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
- __be32 *lso_hdr_sz, __be32 *blh)
+ u32 *lso_hdr_sz, __be32 *blh)
{
- unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
+ unsigned halign = GET_LSO_SEG_SIZE(wr->wr.ud.hlen);
if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE))
- *blh = cpu_to_be32(1 << 6);
+ *blh = cpu_to_be32(MLX4_WQE_CTRL_RR);
if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
wr->num_sge > qp->sq.max_gs - (halign >> 4)))
return -EINVAL;
- memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
-
- *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
- wr->wr.ud.hlen);
+ *lso_hdr_sz = wr->wr.ud.mss << 16 | wr->wr.ud.hlen;
*lso_seg_len = halign;
return 0;
}
@@ -2985,11 +3432,11 @@
{
struct mlx4_wqe_inline_seg *inl = wqe;
memset(wqe, 0, 16);
- inl->byte_count = cpu_to_be32(1 << 31);
+ inl->byte_count = cpu_to_be32(1U << 31);
}
static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr,
- void *wqe, int *sz)
+ void *wqe, int *sz, __be32 owner_bit)
{
struct mlx4_wqe_inline_seg *seg;
void *addr;
@@ -3012,7 +3459,7 @@
if (inl > qp->max_inline_data) {
inl = 0;
- return -1;
+ return -ENOMEM;
}
while (len >= MLX4_INLINE_ALIGN - off) {
@@ -3023,7 +3470,7 @@
addr += to_copy;
seg_len += to_copy;
wmb(); /* see comment below */
- seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+ seg->byte_count = SET_BYTE_COUNT(MLX4_INLINE_SEG | seg_len);
seg_len = 0;
seg = wqe;
wqe += sizeof *seg;
@@ -3051,7 +3498,7 @@
* data.
*/
wmb();
- seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+ seg->byte_count = SET_BYTE_COUNT(MLX4_INLINE_SEG | seg_len);
}
*sz = (inl + num_seg * sizeof *seg + 15) / 16;
@@ -3081,21 +3528,62 @@
int nreq;
int err = 0;
unsigned ind;
+#ifndef CONFIG_INFINIBAND_WQE_FORMAT
int uninitialized_var(stamp);
+#endif
int uninitialized_var(size);
unsigned uninitialized_var(seglen);
__be32 dummy;
- __be32 *lso_wqe;
- __be32 uninitialized_var(lso_hdr_sz);
+ struct mlx4_wqe_lso_seg *lso_wqe;
+ u32 lso_hdr_sz = 0;
__be32 blh;
+ __be32 owner_bit;
int i;
int inl = 0;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+ struct mlx4_ib_sqp *sqp = to_msqp(qp);
+
+ if (sqp->roce_v2_gsi) {
+ struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+ struct ib_gid_attr gid_attr;
+ union ib_gid gid;
+
+ if (mlx4_is_mfunc(mdev->dev)) {
+ enum mlx4_roce_gid_type gid_type;
+ union ib_gid sgid;
+
+ /* When multi-function is enabled, the ib_core gid
+ * indexes don't necessarily match the hw ones, so
+ * we must use our own cache */
+ err = mlx4_get_roce_gid_from_slave(mdev->dev,
+ be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index, &sgid.raw[0], &gid_type);
+ gid_attr.gid_type = mlx4_gid_type_to_ib_gid_type(gid_type);
+ } else {
+ ib_get_cached_gid(ibqp->device,
+ be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index, &gid, &gid_attr);
+ }
+ qp = (gid_attr.gid_type == IB_GID_TYPE_ROCE_V2) ?
+ to_mqp(sqp->roce_v2_gsi) : qp;
+ }
+ }
+
spin_lock_irqsave(&qp->sq.lock, flags);
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ err = -EIO;
+ *bad_wr = wr;
+ nreq = 0;
+ goto out;
+ }
ind = qp->sq_next_wqe;
for (nreq = 0; wr; ++nreq, wr = wr->next) {
- lso_wqe = &dummy;
+ lso_wqe = (struct mlx4_wqe_lso_seg *)(&dummy);
+ lso_hdr_sz = 0;
blh = 0;
if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
@@ -3113,6 +3601,8 @@
ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
*((u32 *) (&ctrl->vlan_tag)) = 0;
qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+ owner_bit = ind & qp->sq.wqe_cnt ?
+ cpu_to_be32(MLX4_WQE_CTRL_OWN) : 0;
ctrl->srcrb_flags =
(wr->send_flags & IB_SEND_SIGNALED ?
@@ -3192,6 +3682,7 @@
set_bind_seg(wqe, wr);
wqe += sizeof(struct mlx4_wqe_bind_seg);
size += sizeof(struct mlx4_wqe_bind_seg) / 16;
+ break;
default:
/* No extra segments required for sends */
break;
@@ -3227,18 +3718,13 @@
*bad_wr = wr;
goto out;
}
- lso_wqe = (__be32 *) wqe;
+ lso_wqe = wqe;
wqe += seglen;
size += seglen / 16;
}
break;
case MLX4_IB_QPT_PROXY_SMI_OWNER:
- if (unlikely(!mlx4_is_master(to_mdev(ibqp->device)->dev))) {
- err = -ENOSYS;
- *bad_wr = wr;
- goto out;
- }
err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);
if (unlikely(err)) {
*bad_wr = wr;
@@ -3255,16 +3741,13 @@
size += seglen / 16;
break;
case MLX4_IB_QPT_PROXY_SMI:
- /* don't allow QP0 sends on guests */
- err = -ENOSYS;
- *bad_wr = wr;
- goto out;
case MLX4_IB_QPT_PROXY_GSI:
/* If we are tunneling special qps, this is a UD qp.
* In this case we first add a UD segment targeting
* the tunnel qp, and then add a header with address
* information */
- set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, ibqp->qp_type);
+ set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr,
+ qp->mlx4_ib_qp_type);
wqe += sizeof (struct mlx4_wqe_datagram_seg);
size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
build_tunnel_header(wr, wqe, &seglen);
@@ -3274,7 +3757,8 @@
case MLX4_IB_QPT_SMI:
case MLX4_IB_QPT_GSI:
- err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
+ err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen,
+ owner_bit);
if (unlikely(err)) {
*bad_wr = wr;
goto out;
@@ -3307,16 +3791,18 @@
if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
int sz;
- err = lay_inline_data(qp, wr, wqe, &sz);
- if (!err) {
- inl = 1;
- size += sz;
+ err = lay_inline_data(qp, wr, wqe, &sz, owner_bit);
+ if (err) {
+ *bad_wr = wr;
+ goto out;
}
+ inl = 1;
+ size += sz;
} else {
size += wr->num_sge *
(sizeof(struct mlx4_wqe_data_seg) / 16);
for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
- set_data_seg(dseg, wr->sg_list + i);
+ set_data_seg(dseg, wr->sg_list + i, owner_bit);
}
/*
@@ -3325,7 +3811,15 @@
* are written.
*/
wmb();
- *lso_wqe = lso_hdr_sz;
+ lso_wqe->mss_hdr_size = SET_LSO_MSS(lso_hdr_sz);
+
+ if (lso_hdr_sz) {
+ copy_lso_header((__be32 *)lso_wqe->header,
+ wr->wr.ud.header, wr->wr.ud.hlen,
+ owner_bit);
+ }
+
+
ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
MLX4_WQE_CTRL_FENCE : 0) | size;
@@ -3342,10 +3836,12 @@
goto out;
}
- ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
- (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;
+ ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | owner_bit |
+ blh;
+#ifndef CONFIG_INFINIBAND_WQE_FORMAT
stamp = ind + qp->sq_spare_wqes;
+#endif
ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
/*
@@ -3358,7 +3854,9 @@
* in the middle of WR).
*/
if (wr->next) {
+#ifndef CONFIG_INFINIBAND_WQE_FORMAT
stamp_send_wqe(qp, stamp, size * 16);
+#endif
ind = pad_wraparound(qp, ind);
}
}
@@ -3369,7 +3867,7 @@
/* We set above doorbell_qpn bits to 0 as part of vlan
* tag initialization, so |= should be correct.
*/
- *(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
+ *(__be32 *)(&ctrl->vlan_tag) |= qp->doorbell_qpn;
/*
* Make sure that descriptor is written to memory
* before writing to BlueFlame page.
@@ -3393,7 +3891,8 @@
*/
wmb();
- writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL);
+ __raw_writel((__force u32)qp->doorbell_qpn,
+ qp->bf.uar->map + MLX4_SEND_DOORBELL);
/*
* Make sure doorbells don't leak out of SQ spinlock
@@ -3404,7 +3903,9 @@
}
if (likely(nreq)) {
+#ifndef CONFIG_INFINIBAND_WQE_FORMAT
stamp_send_wqe(qp, stamp, size * 16);
+#endif
ind = pad_wraparound(qp, ind);
qp->sq_next_wqe = ind;
}
@@ -3425,10 +3926,18 @@
int ind;
int max_gs;
int i;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
max_gs = qp->rq.max_gs;
spin_lock_irqsave(&qp->rq.lock, flags);
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ err = -EIO;
+ *bad_wr = wr;
+ nreq = 0;
+ goto out;
+ }
+
ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
for (nreq = 0; wr; ++nreq, wr = wr->next) {
@@ -3584,7 +4093,7 @@
goto done;
}
- err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
+ err = mlx4_qp_query(dev->dev, &qp->mqp, &context, MLX4_CMD_WRAPPED);
if (err) {
err = -EINVAL;
goto out;
@@ -3675,9 +4184,10 @@
if (qp->flags & MLX4_IB_QP_CAP_MANAGED_RECV)
qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV;
- qp_init_attr->qpg_type = ibqp->qpg_type;
- if (ibqp->qpg_type == IB_QPG_PARENT)
- qp_init_attr->cap.qpg_tss_mask_sz = qp->qpg_data->qpg_tss_mask_sz;
+ qp_init_attr->qpg_type = qp->qpg_type;
+ if (qp->qpg_type == IB_QPG_PARENT)
+ qp_init_attr->cap.qpg_tss_mask_sz =
+ qp->qpg_data->qpg_tss_mask_sz;
else
qp_init_attr->cap.qpg_tss_mask_sz = 0;
@@ -3685,3 +4195,4 @@
mutex_unlock(&qp->mutex);
return err;
}
+
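
A minimal standalone sketch of the rlkey_roce_mode packing used by the qp.c hunks above: a GID type is mapped to a QPC RoCE mode, MLX4_QPC_ROCE_MODE_MAX is treated as "unsupported", and the selected mode is OR'ed into bits [7:6] of the QP context byte that also carries the reserved-lkey bit (1 << 4). Everything suffixed _model, the enum values, and the IB / RoCE-v2 cases are illustrative assumptions; only the v1.5 case, the MAX default, and the << 6 packing are taken from the diff.

/*
 * Standalone model, not driver code: shows how a GID type selects a
 * QPC RoCE mode and where that mode lands in rlkey_roce_mode.
 */
#include <stdint.h>
#include <stdio.h>

enum ib_gid_type_model {		/* stand-ins for IB_GID_TYPE_* */
	GID_TYPE_IB,
	GID_TYPE_ROCE_V1_5,
	GID_TYPE_ROCE_V2,
};

enum qpc_roce_mode_model {		/* stand-ins for MLX4_QPC_ROCE_MODE_* */
	QPC_ROCE_MODE_1,
	QPC_ROCE_MODE_1_5,
	QPC_ROCE_MODE_2,
	QPC_ROCE_MODE_MAX,		/* "no valid mode" sentinel, as in the diff */
};

static enum qpc_roce_mode_model gid_type_to_qpc_model(enum ib_gid_type_model t)
{
	switch (t) {
	case GID_TYPE_IB:	return QPC_ROCE_MODE_1;	/* assumed by analogy */
	case GID_TYPE_ROCE_V1_5: return QPC_ROCE_MODE_1_5;	/* shown in the hunk */
	case GID_TYPE_ROCE_V2:	return QPC_ROCE_MODE_2;	/* assumed by analogy */
	default:		return QPC_ROCE_MODE_MAX;
	}
}

int main(void)
{
	/* bit 4 is the reserved-lkey bit the modify-QP path sets for kernel QPs */
	uint8_t rlkey_roce_mode = 1 << 4;
	enum qpc_roce_mode_model mode = gid_type_to_qpc_model(GID_TYPE_ROCE_V2);

	if (mode == QPC_ROCE_MODE_MAX) {
		fprintf(stderr, "unsupported GID type for this QP\n");
		return 1;
	}
	rlkey_roce_mode |= (uint8_t)(mode << 6);	/* RoCE mode lives in bits [7:6] */
	printf("rlkey_roce_mode = 0x%02x\n", (unsigned)rlkey_roce_mode);
	return 0;
}
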
Index: sys/ofed/drivers/infiniband/hw/mlx4/srq.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/srq.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/srq.c
@@ -134,13 +134,14 @@
if (err)
goto err_mtt;
} else {
- err = mlx4_db_alloc(dev->dev, &srq->db, 0);
+ err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL);
if (err)
goto err_srq;
*srq->db.db = 0;
- if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf)) {
+ if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf,
+ GFP_KERNEL)) {
err = -ENOMEM;
goto err_db;
}
@@ -165,7 +166,7 @@
if (err)
goto err_buf;
- err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf);
+ err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL);
if (err)
goto err_mtt;
@@ -315,8 +316,15 @@
int err = 0;
int nreq;
int i;
+ struct mlx4_ib_dev *mdev = to_mdev(ibsrq->device);
spin_lock_irqsave(&srq->lock, flags);
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ err = -EIO;
+ *bad_wr = wr;
+ nreq = 0;
+ goto out;
+ }
for (nreq = 0; wr; ++nreq, wr = wr->next) {
if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
@@ -361,6 +369,7 @@
*srq->db.db = cpu_to_be32(srq->wqe_ctr);
}
+out:
spin_unlock_irqrestore(&srq->lock, flags);
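
A minimal standalone sketch of the early-bail pattern that srq.c (and the qp.c post paths above) now apply when the device is in an internal-error state: the check runs under the queue lock, the call fails with -EIO, the rejected WR is reported through *bad_wr, and the shared out: label releases the lock. The fake_* types, the pthread mutex standing in for spin_lock_irqsave, and the device-state flag value are assumptions made so the model compiles on its own.

/* Standalone model of the internal-error bail-out added to the post paths. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define DEVICE_STATE_INTERNAL_ERROR 0x1		/* stand-in flag value */

struct fake_wr { int id; struct fake_wr *next; };
struct fake_srq { pthread_mutex_t lock; unsigned device_state; };

static int post_recv_model(struct fake_srq *srq, struct fake_wr *wr,
			   struct fake_wr **bad_wr)
{
	int err = 0;

	pthread_mutex_lock(&srq->lock);
	if (srq->device_state & DEVICE_STATE_INTERNAL_ERROR) {
		err = -EIO;
		*bad_wr = wr;		/* caller learns which WR was rejected */
		goto out;
	}
	for (; wr; wr = wr->next)
		;			/* real code would build receive WQEs here */
out:
	pthread_mutex_unlock(&srq->lock);
	return err;
}

int main(void)
{
	struct fake_srq srq = { PTHREAD_MUTEX_INITIALIZER,
				DEVICE_STATE_INTERNAL_ERROR };
	struct fake_wr wr = { 1, NULL }, *bad = NULL;

	printf("post_recv_model -> %d (bad wr id %d)\n",
	       post_recv_model(&srq, &wr, &bad), bad ? bad->id : -1);
	return 0;
}
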
Index: sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c
+++ sys/ofed/drivers/infiniband/hw/mlx4/sysfs.c
@@ -46,21 +46,17 @@
static ssize_t show_admin_alias_guid(struct device *dev,
struct device_attribute *attr, char *buf)
{
- int record_num;/*0-15*/
- int guid_index_in_rec; /*0 - 7*/
struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
struct mlx4_ib_dev *mdev = port->dev;
+ __be64 sysadmin_ag_val;
- record_num = mlx4_ib_iov_dentry->entry_num / 8 ;
- guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8 ;
+ sysadmin_ag_val = mlx4_get_admin_guid(mdev->dev,
+ mlx4_ib_iov_dentry->entry_num,
+ port->num);
- return sprintf(buf, "%llx\n",
- (long long)be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid.
- ports_guid[port->num - 1].
- all_rec_per_port[record_num].
- all_recs[8 * guid_index_in_rec]));
+ return sprintf(buf, "%llx\n", (long long)be64_to_cpu(sysadmin_ag_val));
}
/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID.
@@ -80,6 +76,7 @@
struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
struct mlx4_ib_dev *mdev = port->dev;
u64 sysadmin_ag_val;
+ unsigned long flags;
record_num = mlx4_ib_iov_dentry->entry_num / 8;
guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8;
@@ -87,6 +84,7 @@
pr_err("GUID 0 block 0 is RO\n");
return count;
}
+ spin_lock_irqsave(&mdev->sriov.alias_guid.ag_work_lock, flags);
sscanf(buf, "%llx", &sysadmin_ag_val);
*(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1].
all_rec_per_port[record_num].
@@ -96,33 +94,15 @@
/* Change the state to be pending for update */
mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status
= MLX4_GUID_INFO_STATUS_IDLE ;
-
- mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method
- = MLX4_GUID_INFO_RECORD_SET;
-
- switch (sysadmin_ag_val) {
- case MLX4_GUID_FOR_DELETE_VAL:
- mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method
- = MLX4_GUID_INFO_RECORD_DELETE;
- mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
- = MLX4_GUID_SYSADMIN_ASSIGN;
- break;
- /* The sysadmin requests the SM to re-assign */
- case MLX4_NOT_SET_GUID:
- mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
- = MLX4_GUID_DRIVER_ASSIGN;
- break;
- /* The sysadmin requests a specific value.*/
- default:
- mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership
- = MLX4_GUID_SYSADMIN_ASSIGN;
- break;
- }
+ mlx4_set_admin_guid(mdev->dev, cpu_to_be64(sysadmin_ag_val),
+ mlx4_ib_iov_dentry->entry_num,
+ port->num);
/* set the record index */
mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes
- = mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec);
+ |= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec);
+ spin_unlock_irqrestore(&mdev->sriov.alias_guid.ag_work_lock, flags);
mlx4_ib_init_alias_guid_work(mdev, port->num - 1);
return count;
@@ -375,7 +355,7 @@
char base_name[9];
/* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */
- strlcpy(name, pci_name(dev->dev->pdev), max);
+ strlcpy(name, pci_name(dev->dev->persist->pdev), max);
strncpy(base_name, name, 8); /*till xxxx:yy:*/
base_name[8] = '\0';
/* with no ARI only 3 last bits are used so when the fn is higher than 8
@@ -389,8 +369,10 @@
struct mlx4_ib_dev *dev;
struct attribute_group pkey_group;
struct attribute_group gid_group;
- u8 port_num;
+ struct device_attribute enable_smi_admin;
+ struct device_attribute smi_enabled;
int slave;
+ u8 port_num;
};
@@ -558,13 +540,106 @@
return NULL;
}
+static ssize_t sysfs_show_smi_enabled(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mlx4_port *p =
+ container_of(attr, struct mlx4_port, smi_enabled);
+ ssize_t len = 0;
+
+ if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num))
+ len = sprintf(buf, "%d\n", 1);
+ else
+ len = sprintf(buf, "%d\n", 0);
+
+ return len;
+}
+
+static ssize_t sysfs_show_enable_smi_admin(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_port *p =
+ container_of(attr, struct mlx4_port, enable_smi_admin);
+ ssize_t len = 0;
+
+ if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num))
+ len = sprintf(buf, "%d\n", 1);
+ else
+ len = sprintf(buf, "%d\n", 0);
+
+ return len;
+}
+
+static ssize_t sysfs_store_enable_smi_admin(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mlx4_port *p =
+ container_of(attr, struct mlx4_port, enable_smi_admin);
+ int enable;
+
+ if (sscanf(buf, "%i", &enable) != 1 ||
+ enable < 0 || enable > 1)
+ return -EINVAL;
+
+ if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable))
+ return -EINVAL;
+ return count;
+}
+
+static int add_vf_smi_entries(struct mlx4_port *p)
+{
+ int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) ==
+ IB_LINK_LAYER_ETHERNET;
+ int ret;
+
+ /* do not display entries if eth transport, or if master */
+ if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev))
+ return 0;
+
+ sysfs_attr_init(&p->smi_enabled.attr);
+ p->smi_enabled.show = sysfs_show_smi_enabled;
+ p->smi_enabled.store = NULL;
+ p->smi_enabled.attr.name = "smi_enabled";
+ p->smi_enabled.attr.mode = 0444;
+ ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr);
+ if (ret) {
+ pr_err("failed to create smi_enabled\n");
+ return ret;
+ }
+
+ sysfs_attr_init(&p->enable_smi_admin.attr);
+ p->enable_smi_admin.show = sysfs_show_enable_smi_admin;
+ p->enable_smi_admin.store = sysfs_store_enable_smi_admin;
+ p->enable_smi_admin.attr.name = "enable_smi_admin";
+ p->enable_smi_admin.attr.mode = 0644;
+ ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr);
+ if (ret) {
+ pr_err("failed to create enable_smi_admin\n");
+ sysfs_remove_file(&p->kobj, &p->smi_enabled.attr);
+ return ret;
+ }
+ return 0;
+}
+
+static void remove_vf_smi_entries(struct mlx4_port *p)
+{
+ int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) ==
+ IB_LINK_LAYER_ETHERNET;
+
+ if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev))
+ return;
+
+ sysfs_remove_file(&p->kobj, &p->smi_enabled.attr);
+ sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr);
+}
+
static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
{
struct mlx4_port *p;
int i;
int ret;
- int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port_num) ==
- IB_LINK_LAYER_ETHERNET;
p = kzalloc(sizeof *p, GFP_KERNEL);
if (!p)
@@ -581,16 +656,13 @@
goto err_alloc;
p->pkey_group.name = "pkey_idx";
- if (is_eth)
- p->pkey_group.attrs =
- alloc_group_attrs(show_port_pkey, NULL,
- dev->dev->caps.pkey_table_len[port_num]);
- else
- p->pkey_group.attrs =
- alloc_group_attrs(show_port_pkey, store_port_pkey,
- dev->dev->caps.pkey_table_len[port_num]);
- if (!p->pkey_group.attrs)
+ p->pkey_group.attrs =
+ alloc_group_attrs(show_port_pkey, store_port_pkey,
+ dev->dev->caps.pkey_table_len[port_num]);
+ if (!p->pkey_group.attrs) {
+ ret = -ENOMEM;
goto err_alloc;
+ }
ret = sysfs_create_group(&p->kobj, &p->pkey_group);
if (ret)
@@ -598,13 +670,19 @@
p->gid_group.name = "gid_idx";
p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1);
- if (!p->gid_group.attrs)
+ if (!p->gid_group.attrs) {
+ ret = -ENOMEM;
goto err_free_pkey;
+ }
ret = sysfs_create_group(&p->kobj, &p->gid_group);
if (ret)
goto err_free_gid;
+ ret = add_vf_smi_entries(p);
+ if (ret)
+ goto err_free_gid;
+
list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]);
return 0;
@@ -630,6 +708,7 @@
int port;
struct kobject *p, *t;
struct mlx4_port *mport;
+ struct mlx4_active_ports actv_ports;
get_name(dev, name, slave, sizeof name);
@@ -652,7 +731,11 @@
goto err_ports;
}
+ actv_ports = mlx4_get_active_ports(dev->dev, slave);
+
for (port = 1; port <= dev->dev->caps.num_ports; ++port) {
+ if (!test_bit(port - 1, actv_ports.ports))
+ continue;
err = add_port(dev, port, slave);
if (err)
goto err_add;
@@ -667,6 +750,7 @@
mport = container_of(p, struct mlx4_port, kobj);
sysfs_remove_group(p, &mport->pkey_group);
sysfs_remove_group(p, &mport->gid_group);
+ remove_vf_smi_entries(mport);
kobject_put(p);
}
kobject_put(dev->dev_ports_parent[slave]);
@@ -688,7 +772,7 @@
if (!mlx4_is_master(device->dev))
return 0;
- for (i = 0; i <= device->dev->num_vfs; ++i)
+ for (i = 0; i <= device->dev->persist->num_vfs; ++i)
register_one_pkey_tree(device, i);
return 0;
@@ -703,7 +787,7 @@
if (!mlx4_is_master(device->dev))
return;
- for (slave = device->dev->num_vfs; slave >= 0; --slave) {
+ for (slave = device->dev->persist->num_vfs; slave >= 0; --slave) {
list_for_each_entry_safe(p, t,
&device->pkeys.pkey_port_list[slave],
entry) {
@@ -711,6 +795,7 @@
port = container_of(p, struct mlx4_port, kobj);
sysfs_remove_group(p, &port->pkey_group);
sysfs_remove_group(p, &port->gid_group);
+ remove_vf_smi_entries(port);
kobject_put(p);
kobject_put(device->dev_ports_parent[slave]);
}
@@ -739,7 +824,7 @@
dev->ports_parent =
kobject_create_and_add("ports",
kobject_get(dev->iov_parent));
- if (!dev->iov_parent) {
+ if (!dev->ports_parent) {
ret = -ENOMEM;
goto err_ports;
}
Index: sys/ofed/drivers/infiniband/hw/mlx4/user.h
===================================================================
--- sys/ofed/drivers/infiniband/hw/mlx4/user.h
+++ sys/ofed/drivers/infiniband/hw/mlx4/user.h
@@ -104,4 +104,9 @@
__u8 reserved[5];
};
+struct mlx4_exp_ib_create_qp {
+ struct mlx4_ib_create_qp base;
+ __u64 uar_virt_add;
+};
+
#endif /* MLX4_IB_USER_H */
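
A minimal standalone sketch of why the new struct mlx4_exp_ib_create_qp embeds the existing create-QP command as its first member: copying min(inlen, sizeof(extended)) bytes keeps an old userspace that only sends the base layout working, while uar_virt_add is honoured only when the caller actually provided it. The base-struct field names and the parse helper below are illustrative assumptions, not the uverbs code path.

/* Standalone model of size-based ABI extension for the create-QP command. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct base_create_qp {			/* stand-in for mlx4_ib_create_qp */
	uint64_t buf_addr;
	uint64_t db_addr;
	uint8_t  log_sq_bb_count;
	uint8_t  log_sq_stride;
	uint8_t  sq_no_prefetch;
	uint8_t  reserved[5];
};

struct exp_create_qp {			/* mirrors mlx4_exp_ib_create_qp */
	struct base_create_qp base;
	uint64_t uar_virt_add;
};

static void parse_create_qp(const void *udata, size_t inlen)
{
	struct exp_create_qp cmd;
	size_t n = inlen < sizeof(cmd) ? inlen : sizeof(cmd);

	memset(&cmd, 0, sizeof(cmd));
	memcpy(&cmd, udata, n);		/* the kernel would use ib_copy_from_udata() */

	printf("base.buf_addr=0x%llx, uar_virt_add %s\n",
	       (unsigned long long)cmd.base.buf_addr,
	       inlen >= sizeof(cmd) ? "supplied" : "absent (old userspace)");
}

int main(void)
{
	struct base_create_qp old_cmd = { .buf_addr = 0x1000 };
	struct exp_create_qp  new_cmd = { .base = { .buf_addr = 0x2000 },
					  .uar_virt_add = 0xdeadbeef };

	parse_create_qp(&old_cmd, sizeof(old_cmd));	/* legacy consumer */
	parse_create_qp(&new_cmd, sizeof(new_cmd));	/* extended consumer */
	return 0;
}
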
