Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -4186,6 +4186,12 @@ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/peer_mem.c optional ofed \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/roce_gid_mgmt.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" +ofed/drivers/infiniband/core/roce_gid_cache.c optional ofed \ + no-depend \ + compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/sa_query.c optional ofed \ compile-with "${OFED_C} -I$S/ofed/drivers/infiniband/core/" ofed/drivers/infiniband/core/smi.c optional ofed \ Index: sys/modules/ibcore/Makefile =================================================================== --- sys/modules/ibcore/Makefile +++ sys/modules/ibcore/Makefile @@ -6,6 +6,7 @@ agent.c multicast.c smi.c ud_header.c uverbs_main.c \ mad.c peer_mem.c umem.c uverbs_marshall.c \ cache.c device.c packer.c sysfs.c user_mad.c verbs.c \ + roce_gid_mgmt.c roce_gid_cache.c \ cm.c fmr_pool.c mad_rmpp.c ucm.c cma.c \ vnode_if.h device_if.h bus_if.h pci_if.h \ opt_inet.h opt_inet6.h Index: sys/ofed/drivers/infiniband/core/addr.c =================================================================== --- sys/ofed/drivers/infiniband/core/addr.c +++ sys/ofed/drivers/infiniband/core/addr.c @@ -42,13 +42,11 @@ #include #include #include +#include #include #include - -MODULE_AUTHOR("Sean Hefty"); -MODULE_DESCRIPTION("IB Address Translation"); -MODULE_LICENSE("Dual BSD/GPL"); +/* IB Address Translation */ struct addr_req { struct list_head list; @@ -70,6 +68,21 @@ static struct delayed_work work; static struct workqueue_struct *addr_wq; +int rdma_addr_size(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + case AF_INET6: + return sizeof(struct sockaddr_in6); + case AF_IB: + return sizeof(struct sockaddr_ib); + default: + return 0; + } +} +EXPORT_SYMBOL(rdma_addr_size); + static struct rdma_addr_client self; void rdma_addr_register_client(struct rdma_addr_client *client) { @@ -92,7 +105,7 @@ EXPORT_SYMBOL(rdma_addr_unregister_client); int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct ifnet *dev, - const unsigned char *dst_dev_addr) + const unsigned char *dst_dev_addr) { if (dev->if_type == IFT_INFINIBAND) dev_addr->dev_type = ARPHRD_INFINIBAND; @@ -110,75 +123,54 @@ } EXPORT_SYMBOL(rdma_copy_addr); -#define SCOPE_ID_CACHE(_scope_id, _addr6) do { \ - (_addr6)->sin6_addr.s6_addr[3] = (_scope_id); \ - (_addr6)->sin6_scope_id = 0; } while (0) - -#define SCOPE_ID_RESTORE(_scope_id, _addr6) do { \ - (_addr6)->sin6_scope_id = (_scope_id); \ - (_addr6)->sin6_addr.s6_addr[3] = 0; } while (0) - int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, u16 *vlan_id) { - struct net_device *dev; - int ret = -EADDRNOTAVAIL; + struct net_device *dev = NULL; + int ret; - if (dev_addr->bound_dev_if) { + if (dev_addr->bound_dev_if > 0) { dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (!dev) - return -ENODEV; - ret = rdma_copy_addr(dev_addr, dev, NULL); - dev_put(dev); - return ret; - } - - switch (addr->sa_family) { - case AF_INET: - dev = ip_dev_find(&init_net, - ((struct sockaddr_in *) addr)->sin_addr.s_addr); - - if (!dev) - return ret; - - ret = rdma_copy_addr(dev_addr, dev, NULL); - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); - dev_put(dev); - break; - 
-#if defined(INET6) - case AF_INET6: - { - struct sockaddr_in6 *sin6; + } else { + switch (addr->sa_family) { +#ifdef INET + case AF_INET: + dev = ip_dev_find(&init_net, + ((struct sockaddr_in *) addr)->sin_addr.s_addr); + break; +#endif +#ifdef INET6 + case AF_INET6: { + struct sockaddr_in6 sin6 = { + .sin6_len = sizeof(sin6), + .sin6_family = AF_INET6, + .sin6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr, + .sin6_scope_id = ((struct sockaddr_in6 *)addr)->sin6_scope_id, + }; struct ifaddr *ifa; - in_port_t port; - uint32_t scope_id; - sin6 = (struct sockaddr_in6 *)addr; - port = sin6->sin6_port; - sin6->sin6_port = 0; - scope_id = sin6->sin6_scope_id; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - SCOPE_ID_CACHE(scope_id, sin6); - ifa = ifa_ifwithaddr(addr); - sin6->sin6_port = port; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - SCOPE_ID_RESTORE(scope_id, sin6); - if (ifa == NULL) { - ret = -ENODEV; + sa6_embedscope(&sin6, 0); + ifa = ifa_ifwithaddr((const struct sockaddr *)&sin6); + if (ifa == NULL) break; - } - ret = rdma_copy_addr(dev_addr, ifa->ifa_ifp, NULL); - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(ifa->ifa_ifp); + dev = ifa->ifa_ifp; + if (dev != NULL) + if_ref(dev); ifa_free(ifa); break; } #endif - default: - break; + default: + break; + } } + if (dev == NULL) + return -ENODEV; + + ret = rdma_copy_addr(dev_addr, dev, NULL); + if (vlan_id) + *vlan_id = rdma_vlan_dev_vlan_id(dev); + dev_put(dev); return ret; } EXPORT_SYMBOL(rdma_translate_ip); @@ -223,9 +215,6 @@ #if defined(INET) || defined(INET6) in_port_t port; #endif -#ifdef INET6 - uint32_t scope_id; -#endif u_char edst[MAX_ADDR_LEN]; int multi; int bcast; @@ -244,9 +233,9 @@ ifa = NULL; ifp = NULL; memset(edst, 0, sizeof(edst)); -#ifdef INET6 - scope_id = -1U; -#endif + + if (dst_in->sa_family != src_in->sa_family) + return -EINVAL; switch (dst_in->sa_family) { #ifdef INET @@ -289,23 +278,13 @@ sin6 = (struct sockaddr_in6 *)dst_in; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) multi = 1; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { - /* - * The IB address comparison fails if the - * scope ID is set and not part of the addr: - */ - scope_id = sin6->sin6_scope_id; - if (scope_id < 256) - SCOPE_ID_CACHE(scope_id, sin6); - } + sa6_embedscope(sin6, 0); + sin6 = (struct sockaddr_in6 *)src_in; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { port = sin6->sin6_port; sin6->sin6_port = 0; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { - if (scope_id < 256) - SCOPE_ID_CACHE(scope_id, sin6); - } + sa6_embedscope(sin6, 0); /* * If we have a source address to use look it @@ -396,12 +375,16 @@ case AF_INET: error = arpresolve(ifp, is_gw, NULL, is_gw ? rte->rt_gateway : dst_in, edst, NULL, NULL); + if (error == 0 && is_gw != 0) + addr->network = RDMA_NETWORK_IPV4; break; #endif #ifdef INET6 case AF_INET6: error = nd6_resolve(ifp, is_gw, NULL, is_gw ? 
rte->rt_gateway : dst_in, edst, NULL, NULL); + if (error == 0 && is_gw != 0) + addr->network = RDMA_NETWORK_IPV6; break; #endif default: @@ -414,18 +397,19 @@ if (error == 0) error = -rdma_copy_addr(addr, ifp, edst); if (error == 0) - memcpy(src_in, ifa->ifa_addr, ip_addr_size(ifa->ifa_addr)); + memcpy(src_in, ifa->ifa_addr, rdma_addr_size(ifa->ifa_addr)); + if (error == EWOULDBLOCK || error == EAGAIN) + error = ENODATA; + switch (src_in->sa_family) { #ifdef INET6 - if (scope_id < 256) { - sin6 = (struct sockaddr_in6 *)src_in; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) - SCOPE_ID_RESTORE(scope_id, sin6); - sin6 = (struct sockaddr_in6 *)dst_in; - SCOPE_ID_RESTORE(scope_id, sin6); - } + case AF_INET6: + sa6_recoverscope((struct sockaddr_in6 *)src_in); + sa6_recoverscope((struct sockaddr_in6 *)dst_in); + break; #endif - if (error == EWOULDBLOCK) - error = ENODATA; + default: + break; + } return -error; } @@ -460,7 +444,7 @@ list_for_each_entry_safe(req, temp_req, &done_list, list) { list_del(&req->list); req->callback(req->status, (struct sockaddr *) &req->src_addr, - req->addr, req->context); + req->addr, req->context); put_client(req->client); kfree(req); } @@ -490,12 +474,12 @@ goto err; } - memcpy(src_in, src_addr, ip_addr_size(src_addr)); + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); } else { src_in->sa_family = dst_addr->sa_family; } - memcpy(dst_in, dst_addr, ip_addr_size(dst_addr)); + memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr)); req->addr = addr; req->callback = callback; req->context = context; @@ -569,7 +553,6 @@ struct sockaddr_in6 _sockaddr_in6; } sgid_addr, dgid_addr; - ret = rdma_gid2ip(&sgid_addr._sockaddr, sgid, scope_id); if (ret) return ret; @@ -579,6 +562,7 @@ return ret; memset(&dev_addr, 0, sizeof(dev_addr)); + dev_addr.bound_dev_if = scope_id; /* XXX scope_id overlaps with if_index */ ctx.addr = &dev_addr; init_completion(&ctx.comp); @@ -602,14 +586,17 @@ u32 rdma_get_ipv6_scope_id(struct ib_device *ib, u8 port_num) { -#ifdef INET6 struct ifnet *ifp; + if (ib->get_netdev == NULL) return (-1U); ifp = ib->get_netdev(ib, port_num); if (ifp == NULL) return (-1U); +#if defined(INET6) return (in6_getscopezone(ifp, IPV6_ADDR_SCOPE_LINKLOCAL)); +#elif defined(INET) + return (ifp->if_index); #else return (-1U); #endif @@ -644,7 +631,7 @@ { if (event == NETEVENT_NEIGH_UPDATE) { set_timeout(jiffies); - } + } return 0; } Index: sys/ofed/drivers/infiniband/core/agent.c =================================================================== --- sys/ofed/drivers/infiniband/core/agent.c +++ sys/ofed/drivers/infiniband/core/agent.c @@ -94,14 +94,14 @@ port_priv = ib_get_agent_port(device, port_num); if (!port_priv) { - printk(KERN_ERR SPFX "Unable to find port agent\n"); + dev_err(&device->dev, "Unable to find port agent\n"); return; } agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { - printk(KERN_ERR SPFX "ib_create_ah_from_wc error %ld\n", + dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n", PTR_ERR(ah)); return; } @@ -110,7 +110,7 @@ IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_KERNEL); if (IS_ERR(send_buf)) { - printk(KERN_ERR SPFX "ib_create_send_mad error\n"); + dev_err(&device->dev, "ib_create_send_mad error\n"); goto err1; } @@ -125,7 +125,7 @@ } if (ib_post_send_mad(send_buf, NULL)) { - printk(KERN_ERR SPFX "ib_post_send_mad error\n"); + dev_err(&device->dev, "ib_post_send_mad error\n"); goto err2; } return; @@ -151,7 +151,7 @@ /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, 
GFP_KERNEL); if (!port_priv) { - printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n"); + dev_err(&device->dev, "No memory for ib_agent_port_private\n"); ret = -ENOMEM; goto error1; } @@ -202,7 +202,7 @@ port_priv = __ib_get_agent_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); - printk(KERN_ERR SPFX "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del(&port_priv->port_list); Index: sys/ofed/drivers/infiniband/core/cache.c =================================================================== --- sys/ofed/drivers/infiniband/core/cache.c +++ sys/ofed/drivers/infiniband/core/cache.c @@ -42,6 +42,8 @@ #include "core_priv.h" +#define __IB_ONLY + struct ib_pkey_cache { int table_len; u16 table[0]; @@ -69,72 +71,239 @@ 0 : device->phys_port_cnt; } -int ib_get_cached_gid(struct ib_device *device, - u8 port_num, - int index, - union ib_gid *gid) +static int __IB_ONLY __ib_get_cached_gid(struct ib_device *device, + u8 port_num, + int index, + union ib_gid *gid) { struct ib_gid_cache *cache; unsigned long flags; - int ret = -EINVAL; + int ret = -ENOENT; if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; + if (!device->cache.gid_cache) + return -ENOENT; read_lock_irqsave(&device->cache.lock, flags); - if (device->cache.gid_cache) { - cache = device->cache.gid_cache[port_num - start_port(device)]; - - if (cache && index >= 0 && index < cache->table_len) { - *gid = cache->table[index]; - ret = 0; - } + cache = device->cache.gid_cache[port_num - start_port(device)]; + if (cache && index >= 0 && index < cache->table_len) { + *gid = cache->table[index]; + ret = 0; } read_unlock_irqrestore(&device->cache.lock, flags); + return ret; +} + +int ib_cache_use_roce_gid_cache(struct ib_device *device, u8 port_num) +{ + if (rdma_port_get_link_layer(device, port_num) == + IB_LINK_LAYER_ETHERNET) { + if (device->cache.roce_gid_cache) + return 0; + else + return -EAGAIN; + } + + return -EINVAL; +} +EXPORT_SYMBOL(ib_cache_use_roce_gid_cache); + +int ib_get_cached_gid(struct ib_device *device, + u8 port_num, + int index, + union ib_gid *gid, + struct ib_gid_attr *attr) +{ + int ret; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + ret = ib_cache_use_roce_gid_cache(device, port_num); + if (!ret) + return roce_gid_cache_get_gid(device, port_num, index, gid, + attr); + + if (ret == -EAGAIN) + return ret; + + ret = __ib_get_cached_gid(device, port_num, index, gid); + + if (!ret && attr) { + memset(attr, 0, sizeof(*attr)); + attr->gid_type = IB_GID_TYPE_IB; + } return ret; } EXPORT_SYMBOL(ib_get_cached_gid); -int ib_find_cached_gid(struct ib_device *device, - union ib_gid *gid, - u8 *port_num, - u16 *index) +static int __IB_ONLY ___ib_find_cached_gid_by_port(struct ib_device *device, + u8 port_num, + const union ib_gid *gid, + u16 *index) { struct ib_gid_cache *cache; + u8 p = port_num - start_port(device); + int i; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + if (!ib_cache_use_roce_gid_cache(device, port_num)) + return -ENOSYS; + if (!device->cache.gid_cache) + return -ENOENT; + + cache = device->cache.gid_cache[p]; + if (!cache) + return -ENOENT; + + for (i = 0; i < cache->table_len; ++i) { + if (!memcmp(gid, &cache->table[i], sizeof(*gid))) { + if (index) + *index = i; + return 0; + } + } + + return -ENOENT; +} + +static int __IB_ONLY __ib_find_cached_gid_by_port(struct 
ib_device *device, + u8 port_num, + union ib_gid *gid, + u16 *index) +{ unsigned long flags; - int p, i; + u16 found_index; + int ret; + + if (index) + *index = -1; + + read_lock_irqsave(&device->cache.lock, flags); + + ret = ___ib_find_cached_gid_by_port(device, port_num, gid, + &found_index); + + read_unlock_irqrestore(&device->cache.lock, flags); + + if (!ret && index) + *index = found_index; + + return ret; +} + +static int __IB_ONLY __ib_find_cached_gid(struct ib_device *device, + union ib_gid *gid, + u8 *port_num, + u16 *index) +{ + unsigned long flags; + u16 found_index; + int p; int ret = -ENOENT; - *port_num = -1; + if (port_num) + *port_num = -1; if (index) *index = -1; read_lock_irqsave(&device->cache.lock, flags); - if (!device->cache.gid_cache) - goto out; - for (p = 0; p <= end_port(device) - start_port(device); ++p) { - cache = device->cache.gid_cache[p]; - if (!cache) - continue; - for (i = 0; i < cache->table_len; ++i) { - if (!memcmp(gid, &cache->table[i], sizeof *gid)) { - *port_num = p + start_port(device); - if (index) - *index = i; - ret = 0; - goto out; - } + + for (p = start_port(device); p <= end_port(device); ++p) { + if (!___ib_find_cached_gid_by_port(device, p, gid, + &found_index)) { + if (port_num) + *port_num = p; + ret = 0; + break; } } -out: + read_unlock_irqrestore(&device->cache.lock, flags); + + if (!ret && index) + *index = found_index; + + return ret; +} + +int ib_find_cached_gid(struct ib_device *device, + union ib_gid *gid, + enum ib_gid_type gid_type, + struct net *net, + int if_index, + u8 *port_num, + u16 *index) +{ + int ret = -ENOENT; + + /* Look for a RoCE device with the specified GID. */ + if (device->cache.roce_gid_cache) + ret = roce_gid_cache_find_gid(device, gid, gid_type, net, + if_index, port_num, index); + + /* If no RoCE devices with the specified GID, look for IB device. */ + if (ret && gid_type == IB_GID_TYPE_IB) + ret = __ib_find_cached_gid(device, gid, port_num, index); + return ret; } EXPORT_SYMBOL(ib_find_cached_gid); +int ib_find_cached_gid_by_port(struct ib_device *device, + union ib_gid *gid, + enum ib_gid_type gid_type, + u8 port_num, + struct net *net, + int if_index, + u16 *index) +{ + int ret = -ENOENT; + + /* Look for a RoCE device with the specified GID. */ + if (!ib_cache_use_roce_gid_cache(device, port_num)) + return roce_gid_cache_find_gid_by_port(device, gid, gid_type, + port_num, net, if_index, + index); + + /* If no RoCE devices with the specified GID, look for IB device. */ + if (gid_type == IB_GID_TYPE_IB) + ret = __ib_find_cached_gid_by_port(device, port_num, + gid, index); + + return ret; +} +EXPORT_SYMBOL(ib_find_cached_gid_by_port); + +int ib_find_gid_by_filter(struct ib_device *device, + union ib_gid *gid, + u8 port_num, + bool (*filter)(const union ib_gid *gid, + const struct ib_gid_attr *, + void *), + void *context, u16 *index) +{ + /* Look for a RoCE device with the specified GID. */ + if (!ib_cache_use_roce_gid_cache(device, port_num)) + return roce_gid_cache_find_gid_by_filter(device, gid, + port_num, filter, + context, index); + + /* Only RoCE GID cache supports filter function */ + if (filter) + return -ENOSYS; + + /* If no RoCE devices with the specified GID, look for IB device. 
*/ + return __ib_find_cached_gid_by_port(device, port_num, + gid, index); +} +EXPORT_SYMBOL(ib_find_gid_by_filter); + int ib_get_cached_pkey(struct ib_device *device, u8 port_num, int index, @@ -142,24 +311,23 @@ { struct ib_pkey_cache *cache; unsigned long flags; - int ret = -EINVAL; + int ret = -ENOENT; if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; - read_lock_irqsave(&device->cache.lock, flags); + if (!device->cache.pkey_cache) + return -ENOENT; - if (device->cache.pkey_cache) { - cache = device->cache.pkey_cache[port_num - start_port(device)]; + read_lock_irqsave(&device->cache.lock, flags); - if (cache && index >= 0 && index < cache->table_len) { - *pkey = cache->table[index]; - ret = 0; - } + cache = device->cache.pkey_cache[port_num - start_port(device)]; + if (cache && index >= 0 && index < cache->table_len) { + *pkey = cache->table[index]; + ret = 0; } read_unlock_irqrestore(&device->cache.lock, flags); - return ret; } EXPORT_SYMBOL(ib_get_cached_pkey); @@ -178,17 +346,17 @@ if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; - *index = -1; + if (!device->cache.pkey_cache) + return -ENOENT; read_lock_irqsave(&device->cache.lock, flags); - if (!device->cache.pkey_cache) - goto out; - cache = device->cache.pkey_cache[port_num - start_port(device)]; if (!cache) goto out; + *index = -1; + for (i = 0; i < cache->table_len; ++i) if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { if (cache->table[i] & 0x8000) { @@ -203,6 +371,7 @@ *index = partial_ix; ret = 0; } + out: read_unlock_irqrestore(&device->cache.lock, flags); return ret; @@ -222,17 +391,17 @@ if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; - *index = -1; + if (!device->cache.pkey_cache) + return -ENOENT; read_lock_irqsave(&device->cache.lock, flags); - if (!device->cache.pkey_cache) - goto out; - cache = device->cache.pkey_cache[port_num - start_port(device)]; if (!cache) goto out; + *index = -1; + for (i = 0; i < cache->table_len; ++i) if (cache->table[i] == pkey) { *index = i; @@ -250,7 +419,7 @@ u8 *lmc) { unsigned long flags; - int ret = -EINVAL; + int ret = -ENOENT; if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; @@ -271,9 +440,15 @@ { struct ib_port_attr *tprops = NULL; struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; - struct ib_gid_cache *gid_cache = NULL, *old_gid_cache; + struct ib_gid_cache *gid_cache = NULL, *old_gid_cache = NULL; int i; int ret; + bool use_roce_gid_cache = + !ib_cache_use_roce_gid_cache(device, + port); + + if (port < start_port(device) || port > end_port(device)) + return; if (!(device->cache.pkey_cache && device->cache.gid_cache && device->cache.lmc_cache)) @@ -297,12 +472,14 @@ pkey_cache->table_len = tprops->pkey_tbl_len; - gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len * - sizeof *gid_cache->table, GFP_KERNEL); - if (!gid_cache) - goto err; + if (!use_roce_gid_cache) { + gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len * + sizeof(*gid_cache->table), GFP_KERNEL); + if (!gid_cache) + goto err; - gid_cache->table_len = tprops->gid_tbl_len; + gid_cache->table_len = tprops->gid_tbl_len; + } for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); @@ -313,22 +490,28 @@ } } - for (i = 0; i < gid_cache->table_len; ++i) { - ret = ib_query_gid(device, port, i, gid_cache->table + i); - if (ret) { - printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", - ret, device->name, 
i); - goto err; + if (!use_roce_gid_cache) { + for (i = 0; i < gid_cache->table_len; ++i) { + ret = ib_query_gid(device, port, i, + gid_cache->table + i, NULL); + if (ret) { + printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", + ret, device->name, i); + goto err; + } } } write_lock_irq(&device->cache.lock); old_pkey_cache = device->cache.pkey_cache[port - start_port(device)]; - old_gid_cache = device->cache.gid_cache [port - start_port(device)]; + if (!use_roce_gid_cache) + old_gid_cache = + device->cache.gid_cache[port - start_port(device)]; device->cache.pkey_cache[port - start_port(device)] = pkey_cache; - device->cache.gid_cache [port - start_port(device)] = gid_cache; + if (!use_roce_gid_cache) + device->cache.gid_cache[port - start_port(device)] = gid_cache; device->cache.lmc_cache[port - start_port(device)] = tprops->lmc; Index: sys/ofed/drivers/infiniband/core/cm.c =================================================================== --- sys/ofed/drivers/infiniband/core/cm.c +++ sys/ofed/drivers/infiniband/core/cm.c @@ -56,14 +56,7 @@ #include #include "cm_msgs.h" -MODULE_AUTHOR("Sean Hefty"); -MODULE_DESCRIPTION("InfiniBand CM"); -MODULE_LICENSE("Dual BSD/GPL"); - -#ifdef pr_fmt -#undef pr_fmt -#endif -#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ +/* InfiniBand CM */ static void cm_add_one(struct ib_device *device); static void cm_remove_one(struct ib_device *device); @@ -88,6 +81,8 @@ __be32 random_id_operand; struct list_head timewait_list; struct workqueue_struct *wq; + /* sync on cm change port state */ + spinlock_t state_lock; } cm; /* Counter indexes ordered by attribute ID */ @@ -169,6 +164,7 @@ struct ib_mad_agent *mad_agent; struct kobject port_obj; u8 port_num; + struct list_head cm_priv_list; struct cm_counter_group counter_group[CM_COUNTER_GROUPS]; }; @@ -177,6 +173,7 @@ struct ib_device *ib_device; struct device *device; u8 ack_delay; + int going_down; struct cm_port *port[0]; }; @@ -186,8 +183,6 @@ struct ib_ah_attr ah_attr; u16 pkey_index; u8 timeout; - u8 valid; - u8 smac[ETH_ALEN]; }; struct cm_work { @@ -248,6 +243,10 @@ u8 service_timeout; u8 target_ack_delay; + struct list_head list; + /* indicates that the send port mad was unregistered. */ + int send_port_not_ready; + struct list_head work_list; atomic_t work_count; }; @@ -266,11 +265,32 @@ struct ib_mad_agent *mad_agent; struct ib_mad_send_buf *m; struct ib_ah *ah; + int ret = 0; + unsigned long flags, flags2; + + /* don't let the port to be released till the agent is down */ + spin_lock_irqsave(&cm.state_lock, flags2); + spin_lock_irqsave(&cm.lock, flags); + if (cm_id_priv->send_port_not_ready) { + pr_info("%s: not valid CM id\n", __func__); + ret = -ENODEV; + spin_unlock_irqrestore(&cm.lock, flags); + goto out; + } + spin_unlock_irqrestore(&cm.lock, flags); + /* make sure the port didn't release the mad yet.*/ mad_agent = cm_id_priv->av.port->mad_agent; + if (mad_agent == NULL) { + pr_info("%s: not valid MAD agent\n", __func__); + ret = -ENODEV; + goto out; + } ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr); - if (IS_ERR(ah)) - return PTR_ERR(ah); + if (IS_ERR(ah)){ + ret = PTR_ERR(ah); + goto out; + } m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn, cm_id_priv->av.pkey_index, @@ -278,7 +298,8 @@ GFP_ATOMIC); if (IS_ERR(m)) { ib_destroy_ah(ah); - return PTR_ERR(m); + ret = PTR_ERR(m); + goto out; } /* Timeout set by caller if response is expected. 
*/ @@ -288,7 +309,9 @@ atomic_inc(&cm_id_priv->refcount); m->context[0] = cm_id_priv; *msg = m; - return 0; +out: + spin_unlock_irqrestore(&cm.state_lock, flags2); + return ret; } static int cm_alloc_response_msg(struct cm_port *port, @@ -357,24 +380,8 @@ grh, &av->ah_attr); } -int ib_update_cm_av(struct ib_cm_id *id, const u8 *smac, const u8 *alt_smac) -{ - struct cm_id_private *cm_id_priv; - - cm_id_priv = container_of(id, struct cm_id_private, id); - - if (smac != NULL) - memcpy(cm_id_priv->av.smac, smac, sizeof(cm_id_priv->av.smac)); - - if (alt_smac != NULL) - memcpy(cm_id_priv->alt_av.smac, alt_smac, - sizeof(cm_id_priv->alt_av.smac)); - - return 0; -} -EXPORT_SYMBOL(ib_update_cm_av); - -static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) +static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av, + struct cm_id_private *cm_id_priv) { struct cm_device *cm_dev; struct cm_port *port = NULL; @@ -385,7 +392,8 @@ read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, - &p, NULL)) { + path->gid_type, path->net, + path->ifindex, &p, NULL)) { port = cm_dev->port[p-1]; break; } @@ -404,9 +412,10 @@ ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, &av->ah_attr); av->timeout = path->packet_life_time + 1; - memcpy(av->smac, path->smac, sizeof(av->smac)); - av->valid = 1; + spin_lock_irqsave(&cm.lock, flags); + list_add_tail(&cm_id_priv->list, &port->cm_priv_list); + spin_unlock_irqrestore(&cm.lock, flags); return 0; } @@ -746,6 +755,7 @@ spin_lock_init(&cm_id_priv->lock); init_completion(&cm_id_priv->comp); INIT_LIST_HEAD(&cm_id_priv->work_list); + INIT_LIST_HEAD(&cm_id_priv->list); atomic_set(&cm_id_priv->work_count, -1); atomic_set(&cm_id_priv->refcount, 1); return &cm_id_priv->id; @@ -831,6 +841,13 @@ { int wait_time; unsigned long flags; + struct cm_device *cm_dev; + + cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); + if (!cm_dev) { + pr_err("%s Not exists such cm_dev\n", __func__); + return; + } spin_lock_irqsave(&cm.lock, flags); cm_cleanup_timewait(cm_id_priv->timewait_info); @@ -844,8 +861,14 @@ */ cm_id_priv->id.state = IB_CM_TIMEWAIT; wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); - queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, - msecs_to_jiffies(wait_time)); + + /* Check if the device started its remove_one */ + spin_lock_irqsave(&cm.lock, flags); + if (!cm_dev->going_down) + queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, + msecs_to_jiffies(wait_time)); + spin_unlock_irqrestore(&cm.lock, flags); + cm_id_priv->timewait_info = NULL; } @@ -939,6 +962,11 @@ break; } + spin_lock_irq(&cm.lock); + if (!list_empty(&cm_id_priv->list) && (!cm_id_priv->send_port_not_ready)) + list_del(&cm_id_priv->list); + spin_unlock_irq(&cm.lock); + cm_free_id(cm_id->local_id); cm_deref_id(cm_id_priv); wait_for_completion(&cm_id_priv->comp); @@ -1058,7 +1086,7 @@ cm_req_set_resp_res(req_msg, param->responder_resources); cm_req_set_retry_count(req_msg, param->retry_count); cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count); - cm_req_set_srq(req_msg, param->srq); + cm_req_set_srq(req_msg, param->srq); } if (pri_path->hop_limit <= 1) { @@ -1161,14 +1189,14 @@ return (PTR_ERR(cm_id_priv->timewait_info)); } - ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av); + ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av, cm_id_priv); if (!ret && param->alternate_path) { ret = 
cm_init_av_by_path(param->alternate_path, - &cm_id_priv->alt_av); + &cm_id_priv->alt_av, cm_id_priv); } if (ret) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); - goto error1; + goto error1; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -1254,6 +1282,7 @@ return ret; } +#if 0 static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid, __be32 local_qpn, __be32 remote_qpn) { @@ -1261,6 +1290,7 @@ ((local_ca_guid == remote_ca_guid) && (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn)))); } +#endif static void cm_format_paths_from_req(struct cm_req_msg *req_msg, struct ib_sa_path_rec *primary_path, @@ -1553,6 +1583,8 @@ struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; + union ib_gid gid; + struct ib_gid_attr gid_attr; int ret; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1592,20 +1624,27 @@ cm_process_routed_req(req_msg, work->mad_recv_wc->wc); cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); - /* Workarround: path in req_msg doesn't contain MAC, take it from wc */ - memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, 6); - work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id; - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); + ret = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, + cm_id_priv->av.ah_attr.grh.sgid_index, + &gid, &gid_attr); + if (!ret) { + work->path[0].gid_type = gid_attr.gid_type; + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, cm_id_priv); + } if (ret) { ib_get_cached_gid(work->port->cm_dev->ib_device, - work->port->port_num, 0, &work->path[0].sgid); + work->port->port_num, 0, &work->path[0].sgid, + &gid_attr); + work->path[0].gid_type = gid_attr.gid_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); goto rejected; } if (req_msg->alt_local_lid) { - ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av); + ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, cm_id_priv); if (ret) { ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, @@ -1687,7 +1726,6 @@ spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { - pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto out; } @@ -1754,7 +1792,6 @@ spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { - pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto error; } @@ -1859,7 +1896,6 @@ cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); if (!cm_id_priv) { cm_dup_rep_handler(work); - pr_debug("no cm_id_priv\n"); return -EINVAL; } @@ -1873,7 +1909,6 @@ default: spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state); goto error; } @@ -1887,7 +1922,6 @@ spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; - pr_debug("Failed to insert remote id\n"); goto error; } /* Check for a stale connection. 
*/ @@ -1901,7 +1935,6 @@ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; - pr_debug("Stale connection.\n"); goto error; } spin_unlock(&cm.lock); @@ -2042,7 +2075,6 @@ cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED) { - pr_debug("cm_id->state: %d\n", cm_id->state); ret = -EINVAL; goto out; } @@ -2112,7 +2144,6 @@ if (cm_id->state != IB_CM_DREQ_RCVD) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); - pr_debug("cm_id->state(%d) != IB_CM_DREQ_RCVD\n", cm_id->state); return -EINVAL; } @@ -2178,7 +2209,6 @@ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); - pr_debug("no cm_id_priv\n"); return -EINVAL; } @@ -2219,7 +2249,6 @@ counter[CM_DREQ_COUNTER]); goto unlock; default: - pr_debug("cm_id_priv->id.state: %d\n", cm_id_priv->id.state); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; @@ -2323,7 +2352,6 @@ cm_enter_timewait(cm_id_priv); break; default: - pr_debug("cm_id->state: 0x%x\n", cm_id->state); ret = -EINVAL; goto out; } @@ -2428,13 +2456,12 @@ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT) ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - cm_enter_timewait(cm_id_priv); - break; + cm_enter_timewait(cm_id_priv); + break; } /* fall through */ default: spin_unlock_irq(&cm_id_priv->lock); - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; goto out; } @@ -2497,7 +2524,6 @@ break; } default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; goto error1; } @@ -2599,7 +2625,6 @@ counter[CM_MRA_COUNTER]); /* fall through */ default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); goto out; } @@ -2674,7 +2699,7 @@ goto out; } - ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av); + ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av, cm_id_priv); if (ret) goto out; cm_id_priv->alt_av.timeout = @@ -2786,7 +2811,7 @@ cm_init_av_for_response(work->port, work->mad_recv_wc->wc, work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); - if (cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av)) + if (cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, cm_id_priv)) goto unlock; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) @@ -2979,10 +3004,7 @@ return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - - spin_lock_irqsave(&cm_id_priv->lock, flags); - - ret = cm_init_av_by_path(param->path, &cm_id_priv->av); + ret = cm_init_av_by_path(param->path, &cm_id_priv->av, cm_id_priv); if (ret) goto out; @@ -2999,19 +3021,21 @@ msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT; + spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state == IB_CM_IDLE) ret = ib_post_send_mad(msg, NULL); else ret = -EINVAL; if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); goto out; } cm_id->state = IB_CM_SIDR_REQ_SENT; cm_id_priv->msg = msg; -out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); +out: return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_req); @@ -3339,7 +3363,6 @@ ret = cm_timewait_handler(work); break; default: - pr_debug("work->cm_event.event: 0x%x\n", work->cm_event.event); ret = -EINVAL; break; } @@ -3353,6 +3376,14 @@ struct cm_work *work; unsigned long flags; int ret = 0; + struct cm_device *cm_dev; + int going_down = 0; + + cm_dev = 
ib_get_client_data(cm_id->device, &cm_client); + if (!cm_dev) { + pr_err("%s: No such cm_dev\n", __func__); + return -ENODEV; + } work = kmalloc(sizeof *work, GFP_ATOMIC); if (!work) @@ -3370,7 +3401,6 @@ ret = -EISCONN; break; default: - pr_debug("cm_id->state: 0x%x\n", cm_id->state); ret = -EINVAL; break; } @@ -3392,7 +3422,19 @@ work->remote_id = cm_id->remote_id; work->mad_recv_wc = NULL; work->cm_event.event = IB_CM_USER_ESTABLISHED; - queue_delayed_work(cm.wq, &work->work, 0); + + /* Check if the device started its remove_one */ + spin_lock_irqsave(&cm.lock, flags); + if (!cm_dev->going_down) + queue_delayed_work(cm.wq, &work->work, 0); + else + going_down = 1; + spin_unlock_irqrestore(&cm.lock, flags); + + if (going_down) { + kfree(work); + return -ENODEV; + } out: return ret; } @@ -3443,6 +3485,7 @@ enum ib_cm_event_type event; u16 attr_id; int paths = 0; + int going_down = 0; switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) { case CM_REQ_ATTR_ID: @@ -3501,7 +3544,19 @@ work->cm_event.event = event; work->mad_recv_wc = mad_recv_wc; work->port = port; - queue_delayed_work(cm.wq, &work->work, 0); + + /* Check if the device started its remove_one */ + spin_lock_irq(&cm.lock); + if (!port->cm_dev->going_down) + queue_delayed_work(cm.wq, &work->work, 0); + else + going_down = 1; + spin_unlock_irq(&cm.lock); + + if (going_down) { + kfree(work); + ib_free_recv_mad(mad_recv_wc); + } } static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, @@ -3533,7 +3588,6 @@ ret = 0; break; default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } @@ -3560,31 +3614,6 @@ *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN; qp_attr->ah_attr = cm_id_priv->av.ah_attr; - if (!cm_id_priv->av.valid) - return -EINVAL; - if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) { - qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id; - *qp_attr_mask |= IB_QP_VID; - } - if (!is_zero_ether_addr(cm_id_priv->av.smac)) { - memcpy(qp_attr->smac, cm_id_priv->av.smac, - sizeof(qp_attr->smac)); - *qp_attr_mask |= IB_QP_SMAC; - } - if (cm_id_priv->alt_av.valid) { - if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) { - qp_attr->alt_vlan_id = - cm_id_priv->alt_av.ah_attr.vlan_id; - *qp_attr_mask |= IB_QP_ALT_VID; - } - if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) { - memcpy(qp_attr->alt_smac, - cm_id_priv->alt_av.smac, - sizeof(qp_attr->alt_smac)); - *qp_attr_mask |= IB_QP_ALT_SMAC; - } - } - qp_attr->path_mtu = cm_id_priv->path_mtu; qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn); qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn); @@ -3606,7 +3635,6 @@ ret = 0; break; default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } @@ -3666,7 +3694,6 @@ ret = 0; break; default: - pr_debug("cm_id_priv->id.state: 0x%x\n", cm_id_priv->id.state); ret = -EINVAL; break; } @@ -3693,7 +3720,6 @@ ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask); break; default: - pr_debug("qp_attr->qp_state: 0x%x\n", qp_attr->qp_state); ret = -EINVAL; break; } @@ -3806,7 +3832,7 @@ struct cm_port *port; struct ib_mad_reg_req reg_req = { .mgmt_class = IB_MGMT_CLASS_CM, - .mgmt_class_version = IB_CM_CLASS_VERSION + .mgmt_class_version = IB_CM_CLASS_VERSION, }; struct ib_port_modify port_modify = { .set_port_cap_mask = IB_PORT_CM_SUP @@ -3825,7 +3851,7 @@ cm_dev->ib_device = ib_device; cm_get_ack_delay(cm_dev); - + cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, 
"%s", ib_device->name); @@ -3844,6 +3870,8 @@ port->cm_dev = cm_dev; port->port_num = i; + INIT_LIST_HEAD(&port->cm_priv_list); + ret = cm_create_port_fs(port); if (ret) goto error1; @@ -3895,6 +3923,8 @@ }; unsigned long flags; int i; + struct cm_id_private *cm_id_priv; + struct ib_mad_agent *cur_mad_agent; cm_dev = ib_get_client_data(ib_device, &cm_client); if (!cm_dev) @@ -3904,11 +3934,31 @@ list_del(&cm_dev->list); write_unlock_irqrestore(&cm.device_lock, flags); + spin_lock_irq(&cm.lock); + cm_dev->going_down = 1; + spin_unlock_irq(&cm.lock); + for (i = 1; i <= ib_device->phys_port_cnt; i++) { port = cm_dev->port[i-1]; ib_modify_port(ib_device, port->port_num, 0, &port_modify); - ib_unregister_mad_agent(port->mad_agent); + /* mark all the cm_id's as not valid */ + spin_lock_irq(&cm.lock); + list_for_each_entry(cm_id_priv, &port->cm_priv_list, list) + cm_id_priv->send_port_not_ready = 1; + spin_unlock_irq(&cm.lock); + + /* + * We flush the queue here after the going_down set, this + * verify that no new works will be queued in the recv handler, + * after that we can call the unregister_mad_agent + */ flush_workqueue(cm.wq); + /* don't free mad_agent if it been used now.*/ + spin_lock_irq(&cm.state_lock); + cur_mad_agent = port->mad_agent; + port->mad_agent = NULL; + spin_unlock_irq(&cm.state_lock); + ib_unregister_mad_agent(cur_mad_agent); cm_remove_port_fs(port); } device_unregister(cm_dev->device); @@ -3923,6 +3973,7 @@ INIT_LIST_HEAD(&cm.device_list); rwlock_init(&cm.device_lock); spin_lock_init(&cm.lock); + spin_lock_init(&cm.state_lock); cm.listen_service_table = RB_ROOT; cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID); cm.remote_id_table = RB_ROOT; Index: sys/ofed/drivers/infiniband/core/cma.c =================================================================== --- sys/ofed/drivers/infiniband/core/cma.c +++ sys/ofed/drivers/infiniband/core/cma.c @@ -3,7 +3,6 @@ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. - * Copyright (c) 2016 Chelsio Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -51,16 +50,18 @@ #include #include +#include + #include #include +#include #include #include #include #include +#include "core_priv.h" -MODULE_AUTHOR("Sean Hefty"); -MODULE_DESCRIPTION("Generic RDMA CM Agent"); -MODULE_LICENSE("Dual BSD/GPL"); +/* Generic RDMA CM Agent */ #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 @@ -78,7 +79,7 @@ static int unify_tcp_port_space = 1; module_param(unify_tcp_port_space, int, 0644); MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port " - "space allocation (default=1)"); + "space allocation (default=1)"); static int debug_level = 0; #define cma_pr(level, priv, format, arg...) 
\ @@ -123,7 +124,6 @@ static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; -static struct workqueue_struct *cma_free_wq; static DEFINE_IDR(sdp_ps); static DEFINE_IDR(tcp_ps); static DEFINE_IDR(udp_ps); @@ -136,6 +136,8 @@ struct completion comp; atomic_t refcount; struct list_head id_list; + struct sysctl_ctx_list sysctl_ctx; + enum ib_gid_type default_gid_type; }; struct rdma_bind_list { @@ -148,6 +150,42 @@ CMA_OPTION_AFONLY, }; +void cma_ref_dev(struct cma_device *cma_dev) +{ + atomic_inc(&cma_dev->refcount); +} + +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie) +{ + struct cma_device *cma_dev; + struct cma_device *found_cma_dev = NULL; + + mutex_lock(&lock); + + list_for_each_entry(cma_dev, &dev_list, list) + if (filter(cma_dev->device, cookie)) { + found_cma_dev = cma_dev; + break; + } + + if (found_cma_dev) + cma_ref_dev(found_cma_dev); + mutex_unlock(&lock); + return found_cma_dev; +} + +enum ib_gid_type cma_get_default_gid_type(struct cma_device *cma_dev) +{ + return cma_dev->default_gid_type; +} + +void cma_set_default_gid_type(struct cma_device *cma_dev, + enum ib_gid_type default_gid_type) +{ + cma_dev->default_gid_type = default_gid_type; +} + /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. @@ -168,13 +206,11 @@ int internal_id; enum rdma_cm_state state; spinlock_t lock; - spinlock_t cm_lock; struct mutex qp_mutex; struct completion comp; atomic_t refcount; struct mutex handler_mutex; - struct work_struct work; /* garbage coll */ int backlog; int timeout_ms; @@ -194,10 +230,8 @@ u8 tos; u8 reuseaddr; u8 afonly; + enum ib_gid_type gid_type; int qp_timeout; - /* cache for mc record params */ - struct ib_sa_mcmember_rec rec; - int is_valid_rec; }; struct cma_multicast { @@ -209,6 +243,7 @@ void *context; struct sockaddr_storage addr; struct kref mcref; + bool igmp_joined; }; struct cma_work { @@ -222,7 +257,6 @@ struct cma_ndev_work { struct work_struct work; struct rdma_id_private *id; - struct rdma_cm_event event; }; struct iboe_mcast_work { @@ -313,6 +347,28 @@ hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } +/* +static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) +{ + struct in_device *in_dev = NULL; + + if (ndev) { + rtnl_lock(); + in_dev = __in_dev_get_rtnl(ndev); + if (in_dev) { + if (join) + ip_mc_inc_group(in_dev, + *(__be32 *)(mgid->raw+12)); + else + ip_mc_dec_group(in_dev, + *(__be32 *)(mgid->raw+12)); + } + rtnl_unlock(); + } + return (in_dev) ? 
0 : -ENODEV; +} +*/ + static inline u8 sdp_get_majv(u8 sdp_version) { return sdp_version >> 4; @@ -331,15 +387,16 @@ static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { - atomic_inc(&cma_dev->refcount); + cma_ref_dev(cma_dev); id_priv->cma_dev = cma_dev; + id_priv->gid_type = cma_dev->default_gid_type; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } -static inline void cma_deref_dev(struct cma_device *cma_dev) +void cma_deref_dev(struct cma_device *cma_dev) { if (atomic_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); @@ -362,16 +419,40 @@ mutex_unlock(&lock); } -static int cma_set_qkey(struct rdma_id_private *id_priv) +static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *) &id_priv->id.route.addr.src_addr; +} + +static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) +{ + return (struct sockaddr *) &id_priv->id.route.addr.dst_addr; +} + +static inline unsigned short cma_family(struct rdma_id_private *id_priv) +{ + return id_priv->id.route.addr.src_addr.ss_family; +} + +static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) { struct ib_sa_mcmember_rec rec; int ret = 0; - if (id_priv->qkey) + if (id_priv->qkey) { + if (qkey && id_priv->qkey != qkey) + return -EINVAL; + return 0; + } + + if (qkey) { + id_priv->qkey = qkey; return 0; + } switch (id_priv->id.ps) { case RDMA_PS_UDP: + case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: @@ -400,7 +481,7 @@ return 1; for (i = 0; i < props.gid_tbl_len; ++i) { - err = ib_query_gid(device, port_num, i, &tmp); + err = ib_query_gid(device, port_num, i, &tmp, NULL); if (err) return 1; if (!memcmp(&tmp, gid, sizeof tmp)) @@ -479,6 +560,27 @@ } EXPORT_SYMBOL(rdma_find_cmid_laddr); +static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) +{ + dev_addr->dev_type = ARPHRD_INFINIBAND; + rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); + ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); +} + +static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) +{ + int ret; + + if (addr->sa_family != AF_IB) { + ret = rdma_translate_ip(addr, dev_addr, NULL); + } else { + cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); + ret = 0; + } + + return ret; +} + static int cma_acquire_dev(struct rdma_id_private *id_priv, struct rdma_id_private *listen_id_priv) { @@ -486,7 +588,7 @@ struct cma_device *cma_dev; union ib_gid gid, iboe_gid; int ret = -ENODEV; - u8 port, found_port; + u8 port; enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? 
IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; @@ -505,16 +607,28 @@ listen_id_priv->id.port_num) == dev_ll) { cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; - if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, - &found_port, NULL); - else - ret = ib_find_cached_gid(cma_dev->device, &gid, - &found_port, NULL); + if (rdma_node_get_transport(cma_dev->device->node_type) == + RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(cma_dev->device, port) == + IB_LINK_LAYER_ETHERNET) { + int if_index = + id_priv->id.route.addr.dev_addr.bound_dev_if; + + ret = ib_find_cached_gid_by_port(cma_dev->device, + &iboe_gid, + cma_dev->default_gid_type, + port, + &init_net, + if_index, + NULL); + } else { + ret = ib_find_cached_gid_by_port(cma_dev->device, &gid, + IB_GID_TYPE_IB, port, + NULL, 0, NULL); + } - if (!ret && (port == found_port)) { - id_priv->id.port_num = found_port; + if (!ret) { + id_priv->id.port_num = port; goto out; } } @@ -524,18 +638,36 @@ listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; - if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { - if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL); - else - ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL); - - if (!ret && (port == found_port)) { + if (rdma_port_get_link_layer(cma_dev->device, port) == + dev_ll) { + if (rdma_node_get_transport(cma_dev->device->node_type) == + RDMA_TRANSPORT_IB && + rdma_port_get_link_layer(cma_dev->device, port) == + IB_LINK_LAYER_ETHERNET) { + int if_index = + id_priv->id.route.addr.dev_addr.bound_dev_if; + + ret = ib_find_cached_gid_by_port(cma_dev->device, + &iboe_gid, + cma_dev->default_gid_type, + port, + &init_net, + if_index, + NULL); + } else { + ret = ib_find_cached_gid_by_port(cma_dev->device, + &gid, + IB_GID_TYPE_IB, + port, + NULL, + 0, + NULL); + } + + if (!ret) { id_priv->id.port_num = port; goto out; - } else if (ret == 1) - break; + } } } } @@ -548,6 +680,62 @@ return ret; } +/* + * Select the source IB device and address to reach the destination IB address. 
+ */ +static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) +{ + struct cma_device *cma_dev, *cur_dev; + struct sockaddr_ib *addr; + union ib_gid gid, sgid, *dgid; + u16 pkey, index; + u8 p; + int i; + + cma_dev = NULL; + addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); + dgid = (union ib_gid *) &addr->sib_addr; + pkey = ntohs(addr->sib_pkey); + + list_for_each_entry(cur_dev, &dev_list, list) { + if (rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB) + continue; + + for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) + continue; + + for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, + &gid, NULL); + i++) { + if (!memcmp(&gid, dgid, sizeof(gid))) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + goto found; + } + + if (!cma_dev && (gid.global.subnet_prefix == + dgid->global.subnet_prefix)) { + cma_dev = cur_dev; + sgid = gid; + id_priv->id.port_num = p; + } + } + } + } + + if (!cma_dev) + return -ENODEV; + +found: + cma_attach_to_dev(id_priv, cma_dev); + addr = (struct sockaddr_ib *) cma_src_addr(id_priv); + memcpy(&addr->sib_addr, &sgid, sizeof sgid); + cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); + return 0; +} + static void cma_deref_id(struct rdma_id_private *id_priv) { if (atomic_dec_and_test(&id_priv->refcount)) @@ -582,7 +770,6 @@ id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; spin_lock_init(&id_priv->lock); - spin_lock_init(&id_priv->cm_lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); atomic_set(&id_priv->refcount, 1); @@ -705,24 +892,12 @@ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; + ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, - qp_attr.ah_attr.grh.sgid_index, &sgid); + qp_attr.ah_attr.grh.sgid_index, &sgid, NULL); if (ret) goto out; - if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) - == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) - == IB_LINK_LAYER_ETHERNET) { - u32 scope_id = rdma_get_ipv6_scope_id(id_priv->id.device, - id_priv->id.port_num); - - ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL, - scope_id); - if (ret) - goto out; - } - if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); @@ -802,7 +977,7 @@ *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, 0); if (ret) return ret; @@ -829,11 +1004,11 @@ else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); + if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; break; case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; @@ -852,32 +1027,37 @@ static inline int cma_zero_addr(struct sockaddr *addr) { - struct in6_addr *ip6; - - if (addr->sa_family == AF_INET) - return ipv4_is_zeronet( - ((struct sockaddr_in *)addr)->sin_addr.s_addr); - else { - ip6 = &((struct sockaddr_in6 *) addr)->sin6_addr; - return (ip6->s6_addr32[0] | ip6->s6_addr32[1] | - ip6->s6_addr32[2] | ip6->s6_addr32[3]) == 0; + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); + case AF_INET6: + return IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *) addr)->sin6_addr); + case 
AF_IB: + return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; } } static inline int cma_loopback_addr(struct sockaddr *addr) { - if (addr->sa_family == AF_INET) - return ipv4_is_loopback( - ((struct sockaddr_in *) addr)->sin_addr.s_addr); - else - return ipv6_addr_loopback( - &((struct sockaddr_in6 *) addr)->sin6_addr); + switch (addr->sa_family) { + case AF_INET: + return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr); + case AF_INET6: + return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr); + case AF_IB: + return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr); + default: + return 0; + } } static inline int cma_any_addr(struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } + int rdma_cma_any_addr(struct sockaddr *addr) { @@ -894,18 +1074,31 @@ case AF_INET: return ((struct sockaddr_in *) src)->sin_addr.s_addr != ((struct sockaddr_in *) dst)->sin_addr.s_addr; - default: + case AF_INET6: return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, &((struct sockaddr_in6 *) dst)->sin6_addr); + default: + return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, + &((struct sockaddr_ib *) dst)->sib_addr); } } -static inline __be16 cma_port(struct sockaddr *addr) +static __be16 cma_port(struct sockaddr *addr) { - if (addr->sa_family == AF_INET) + struct sockaddr_ib *sib; + + switch (addr->sa_family) { + case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; - else + case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + return htons((u16) (be64_to_cpu(sib->sib_sid) & + be64_to_cpu(sib->sib_sid_mask))); + default: + return 0; + } } static inline int cma_any_port(struct sockaddr *addr) @@ -913,11 +1106,11 @@ return !cma_port(addr); } -static int cma_get_net_info(void *hdr, enum rdma_port_space ps, +static int cma_get_net_info(void *hdr, struct rdma_cm_id *listen_id, u8 *ip_ver, __be16 *port, union cma_ip_addr **src, union cma_ip_addr **dst) { - switch (ps) { + switch (listen_id->ps) { case RDMA_PS_SDP: if (sdp_get_majv(((struct sdp_hh *) hdr)->sdp_version) != SDP_MAJ_VERSION) @@ -932,10 +1125,19 @@ if (((struct cma_hdr *) hdr)->cma_version != CMA_VERSION) return -EINVAL; - *ip_ver = cma_get_ip_ver(hdr); - *port = ((struct cma_hdr *) hdr)->port; - *src = &((struct cma_hdr *) hdr)->src_addr; - *dst = &((struct cma_hdr *) hdr)->dst_addr; + if (listen_id->route.addr.src_addr.ss_family != AF_IB) { + *ip_ver = cma_get_ip_ver(hdr); + *port = ((struct cma_hdr *)hdr)->port; + *src = &((struct cma_hdr *)hdr)->src_addr; + *dst = &((struct cma_hdr *)hdr)->dst_addr; + } else { + memset(ip_ver, 0, sizeof(*ip_ver)); + memset(port, 0, sizeof(*port)); + memset(src, 0, sizeof(*src)); + memset(dst, 0, sizeof(*dst)); + + return 0; + } break; } @@ -944,14 +1146,54 @@ return 0; } -static void cma_save_net_info(struct rdma_addr *addr, - struct rdma_addr *listen_addr, +static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, + struct ib_sa_path_rec *path) +{ + struct sockaddr_ib *listen_ib, *ib; + + listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; + ib = (struct sockaddr_ib *) &id->route.addr.src_addr; + ib->sib_family = listen_ib->sib_family; + if (path) { + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->sgid, 16); + } else { + ib->sib_pkey = listen_ib->sib_pkey; + ib->sib_flowinfo = listen_ib->sib_flowinfo; + ib->sib_addr = listen_ib->sib_addr; + 
} + + ib->sib_sid = listen_ib->sib_sid; + ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); + ib->sib_scope_id = listen_ib->sib_scope_id; + + if (path) { + ib = (struct sockaddr_ib *) &id->route.addr.dst_addr; + ib->sib_family = listen_ib->sib_family; + ib->sib_pkey = path->pkey; + ib->sib_flowinfo = path->flow_label; + memcpy(&ib->sib_addr, &path->dgid, 16); + } +} + +static void cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, + struct ib_cm_event *ib_event, + struct rdma_addr *addr, struct rdma_addr *listen_addr, u8 ip_ver, __be16 port, union cma_ip_addr *src, union cma_ip_addr *dst) { struct sockaddr_in *listen4, *ip4; struct sockaddr_in6 *listen6, *ip6; + if (listen_id->route.addr.src_addr.ss_family == AF_IB) { + if (ib_event->event == IB_CM_REQ_RECEIVED) + cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path); + else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) + cma_save_ib_info(id, listen_id, NULL); + return; + } + switch (ip_ver) { case 4: listen4 = (struct sockaddr_in *) &listen_addr->src_addr; @@ -988,14 +1230,12 @@ } } -static inline int cma_user_data_offset(enum rdma_port_space ps) +static inline int cma_user_data_offset(struct rdma_id_private *id_priv) { - switch (ps) { - case RDMA_PS_SDP: + if (id_priv->id.ps == RDMA_PS_SDP) return 0; - default: - return sizeof(struct cma_hdr); - } + else + return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); } static void cma_cancel_route(struct rdma_id_private *id_priv) @@ -1046,8 +1286,7 @@ cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: - if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr) - && !id_priv->cma_dev) + if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: @@ -1057,22 +1296,20 @@ static void cma_release_port(struct rdma_id_private *id_priv) { - struct rdma_bind_list *bind_list; + struct rdma_bind_list *bind_list = id_priv->bind_list; - mutex_lock(&lock); - bind_list = id_priv->bind_list; - if (!bind_list) { - mutex_unlock(&lock); + if (!bind_list) return; - } + + mutex_lock(&lock); hlist_del(&id_priv->node); - id_priv->bind_list = NULL; if (hlist_empty(&bind_list->owners)) { idr_remove(bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); - if (id_priv->sock) + + if ((id_priv->sock!= NULL) && (id_priv->sock->so_count > 0)) sock_release(id_priv->sock); } @@ -1090,6 +1327,22 @@ kfree(mc); break; case IB_LINK_LAYER_ETHERNET: +/* + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + } +*/ kref_put(&mc->mcref, release_mc); break; default: @@ -1097,26 +1350,11 @@ } } } -static void __rdma_free(struct work_struct *work) -{ - struct rdma_id_private *id_priv; - id_priv = container_of(work, struct rdma_id_private, work); - - wait_for_completion(&id_priv->comp); - - if (id_priv->internal_id) - cma_deref_id(id_priv->id.context); - - kfree(id_priv->id.route.path_rec); - kfree(id_priv); -} void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; enum rdma_cm_state state; - unsigned long flags; - struct ib_cm_id *ib; id_priv = container_of(id, struct rdma_id_private, id); state = cma_exch(id_priv, RDMA_CM_DESTROYING); @@ -1132,17 +1370,10 @@ if (id_priv->cma_dev) { switch 
(rdma_node_get_transport(id_priv->id.device->node_type)) { case RDMA_TRANSPORT_IB: - spin_lock_irqsave(&id_priv->cm_lock, flags); - if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) { - ib = id_priv->cm_id.ib; - id_priv->cm_id.ib = NULL; - spin_unlock_irqrestore(&id_priv->cm_lock, flags); - ib_destroy_cm_id(ib); - } else - spin_unlock_irqrestore(&id_priv->cm_lock, flags); + if (id_priv->cm_id.ib) + ib_destroy_cm_id(id_priv->cm_id.ib); break; case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); break; @@ -1155,8 +1386,13 @@ cma_release_port(id_priv); cma_deref_id(id_priv); - INIT_WORK(&id_priv->work, __rdma_free); - queue_work(cma_free_wq, &id_priv->work); + wait_for_completion(&id_priv->comp); + + if (id_priv->internal_id) + cma_deref_id(id_priv->id.context); + + kfree(id_priv->id.route.path_rec); + kfree(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); @@ -1221,6 +1457,7 @@ (ib_event->event == IB_CM_TIMEWAIT_EXIT && cma_disable_callback(id_priv, RDMA_CM_DISCONNECT))) return 0; + memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_REQ_ERROR: @@ -1298,7 +1535,7 @@ u8 ip_ver; int ret; - if (cma_get_net_info(ib_event->private_data, listen_id->ps, + if (cma_get_net_info(ib_event->private_data, listen_id, &ip_ver, &port, &src, &dst)) return NULL; @@ -1307,7 +1544,8 @@ if (IS_ERR(id)) return NULL; - cma_save_net_info(&id->route.addr, &listen_id->route.addr, + id_priv = container_of(id, struct rdma_id_private, id); + cma_save_net_info(id, listen_id, ib_event, &id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); rt = &id->route; @@ -1321,19 +1559,17 @@ if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; - if (cma_any_addr((struct sockaddr *) &rt->addr.src_addr)) { + if (cma_any_addr(cma_src_addr(id_priv))) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else { - ret = rdma_translate_ip((struct sockaddr *) &rt->addr.src_addr, - &rt->addr.dev_addr, NULL); + ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); if (ret) goto err; } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); - id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; @@ -1357,22 +1593,21 @@ if (IS_ERR(id)) return NULL; - - if (cma_get_net_info(ib_event->private_data, listen_id->ps, + if (cma_get_net_info(ib_event->private_data, listen_id, &ip_ver, &port, &src, &dst)) goto err; - cma_save_net_info(&id->route.addr, &listen_id->route.addr, + id_priv = container_of(id, struct rdma_id_private, id); + + cma_save_net_info(id, listen_id, ib_event, &id->route.addr, &listen_id->route.addr, ip_ver, port, src, dst); if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { - ret = rdma_translate_ip((struct sockaddr *) &id->route.addr.src_addr, - &id->route.addr.dev_addr, NULL); + ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); if (ret) goto err; } - id_priv = container_of(id, struct rdma_id_private, id); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: @@ -1409,16 +1644,6 @@ struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; int offset, ret; - u8 smac[ETH_ALEN]; - u8 alt_smac[ETH_ALEN]; - u8 *psmac = smac; - u8 *palt_smac = alt_smac; - int is_iboe = ((rdma_node_get_transport(cm_id->device->node_type) == - RDMA_TRANSPORT_IB) && - 
(rdma_port_get_link_layer(cm_id->device, - ib_event->param.req_rcvd.port) == - IB_LINK_LAYER_ETHERNET)); - int is_sidr = 0; listen_id = cm_id->context; if (!cma_check_req_qp_type(&listen_id->id, ib_event)) @@ -1428,10 +1653,9 @@ return -ECONNABORTED; memset(&event, 0, sizeof event); - offset = cma_user_data_offset(listen_id->id.ps); + offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { - is_sidr = 1; conn_id = cma_new_udp_id(&listen_id->id, ib_event); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = @@ -1463,37 +1687,18 @@ ret = conn_id->id.event_handler(&conn_id->id, &event); if (ret) goto err3; - - if (is_iboe && !is_sidr) { - u32 scope_id = rdma_get_ipv6_scope_id(cm_id->device, - ib_event->param.req_rcvd.port); - - if (ib_event->param.req_rcvd.primary_path != NULL) - rdma_addr_find_smac_by_sgid( - &ib_event->param.req_rcvd.primary_path->sgid, - psmac, NULL, scope_id); - else - psmac = NULL; - if (ib_event->param.req_rcvd.alternate_path != NULL) - rdma_addr_find_smac_by_sgid( - &ib_event->param.req_rcvd.alternate_path->sgid, - palt_smac, NULL, scope_id); - else - palt_smac = NULL; - } - /* - * Acquire mutex to prevent user executing rdma_destroy_id() - * while we're accessing the cm_id. - */ - mutex_lock(&lock); - if (is_iboe && !is_sidr) - ib_update_cm_av(cm_id, psmac, palt_smac); - if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) { + /* + * Acquire mutex to prevent user executing rdma_destroy_id() + * while we're accessing the cm_id. + */ + mutex_lock(&lock); + if (cma_comp(conn_id, RDMA_CM_CONNECT) && + (conn_id->id.qp_type != IB_QPT_UD)) { cma_dbg(container_of(&conn_id->id, struct rdma_id_private, id), "sending MRA\n"); - ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } - mutex_unlock(&lock); - mutex_unlock(&conn_id->handler_mutex); + mutex_unlock(&lock); + mutex_unlock(&conn_id->handler_mutex); mutex_unlock(&listen_id->handler_mutex); cma_deref_id(conn_id); return 0; @@ -1512,10 +1717,14 @@ return ret; } -static __be64 cma_get_service_id(enum rdma_port_space ps, struct sockaddr *addr) +__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { - return cpu_to_be64(((u64)ps << 16) + be16_to_cpu(cma_port(addr))); + if (addr->sa_family == AF_IB) + return ((struct sockaddr_ib *) addr)->sib_sid; + + return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } +EXPORT_SYMBOL(rdma_get_service_id); static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, struct ib_cm_compare_data *compare) @@ -1558,7 +1767,7 @@ if (!cma_any_addr(addr)) { sdp_data->dst_addr.ip6 = ip6_addr; memset(&sdp_mask->dst_addr.ip6, 0xFF, - sizeof(sdp_mask->dst_addr.ip6)); + sizeof(sdp_mask->dst_addr.ip6)); } } else { cma_set_ip_ver(cma_data, 6); @@ -1566,7 +1775,7 @@ if (!cma_any_addr(addr)) { cma_data->dst_addr.ip6 = ip6_addr; memset(&cma_mask->dst_addr.ip6, 0xFF, - sizeof(cma_mask->dst_addr.ip6)); + sizeof cma_mask->dst_addr.ip6); } } break; @@ -1579,8 +1788,9 @@ { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event; - struct sockaddr_in *sin; int ret = 0; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) return 0; @@ -1591,11 +1801,11 @@ event.event = RDMA_CM_EVENT_DISCONNECTED; break; 
case IW_CM_EVENT_CONNECT_REPLY: - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - *sin = iw_event->local_addr; - sin = (struct sockaddr_in *) &id_priv->id.route.addr.dst_addr; - *sin = iw_event->remote_addr; - switch ((int)iw_event->status) { + memcpy(cma_src_addr(id_priv), laddr, + rdma_addr_size(laddr)); + memcpy(cma_dst_addr(id_priv), raddr, + rdma_addr_size(raddr)); + switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; @@ -1644,11 +1854,11 @@ { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; - struct sockaddr_in *sin; - struct net_device *dev = NULL; struct rdma_cm_event event; int ret; struct ib_device_attr attr; + struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; + struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; listen_id = cm_id->context; if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) @@ -1666,14 +1876,7 @@ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; - dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr); - if (!dev) { - ret = -EADDRNOTAVAIL; - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; - } - ret = rdma_copy_addr(&conn_id->id.route.addr.dev_addr, dev, NULL); + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); @@ -1691,10 +1894,8 @@ cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; - sin = (struct sockaddr_in *) &new_cm_id->route.addr.src_addr; - *sin = iw_event->local_addr; - sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr; - *sin = iw_event->remote_addr; + memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); + memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); ret = ib_query_device(conn_id->id.device, &attr); if (ret) { @@ -1730,8 +1931,6 @@ cma_deref_id(conn_id); out: - if (dev) - dev_put(dev); mutex_unlock(&listen_id->handler_mutex); return ret; } @@ -1750,8 +1949,8 @@ id_priv->cm_id.ib = id; - addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - svc_id = cma_get_service_id(id_priv->id.ps, addr); + addr = cma_src_addr(id_priv); + svc_id = rdma_get_service_id(&id_priv->id, addr); if (cma_any_addr(addr) && !id_priv->afonly) ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); else { @@ -1770,20 +1969,19 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; - struct sockaddr_in *sin; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, - id_priv->sock, - iw_conn_req_handler, - id_priv); + id_priv->sock, + iw_conn_req_handler, + id_priv); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.iw = id; - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - id_priv->cm_id.iw->local_addr = *sin; + memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); @@ -1812,6 +2010,10 @@ struct rdma_cm_id *id; int ret; + if (cma_family(id_priv) == AF_IB && + rdma_node_get_transport(cma_dev->device->node_type) != RDMA_TRANSPORT_IB) + return; + id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type); if (IS_ERR(id)) @@ -1821,8 +2023,8 @@ dev_id_priv->state = RDMA_CM_ADDR_BOUND; dev_id_priv->sock = id_priv->sock; - memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, - ip_addr_size((struct sockaddr 
*) &id_priv->id.route.addr.src_addr)); + memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); @@ -1888,36 +2090,44 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct cma_work *work) { - struct rdma_addr *addr = &id_priv->id.route.addr; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct ib_sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; + struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); - rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); - rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); - path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); + rdma_addr_get_sgid(dev_addr, &path_rec.sgid); + rdma_addr_get_dgid(dev_addr, &path_rec.dgid); + path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; - path_rec.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &addr->dst_addr); + path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; - if (addr->src_addr.ss_family == AF_INET) { + switch (cma_family(id_priv)) { + case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; - } else { - sin6 = (struct sockaddr_in6 *) &addr->src_addr; + break; + case AF_INET6: + sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; + case AF_IB: + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); + comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; + break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, - comp_mask, timeout_ms, + comp_mask, timeout_ms, 0, GFP_KERNEL, cma_query_handler, work, &id_priv->query); @@ -1946,10 +2156,48 @@ kfree(work); } -static void cma_ndev_work_handler(struct work_struct *_work) +static int cma_remove_id_dev(struct rdma_id_private *id_priv) +{ + struct rdma_cm_event event; + enum rdma_cm_state state; + int ret = 0; + + /* Record that we want to remove the device */ + state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); + if (state == RDMA_CM_DESTROYING) + return 0; + + cma_cancel_operation(id_priv, state); + mutex_lock(&id_priv->handler_mutex); + + /* Check for destruction from another callback. 
*/ + if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) + goto out; + + memset(&event, 0, sizeof(event)); + event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; + ret = id_priv->id.event_handler(&id_priv->id, &event); +out: + mutex_unlock(&id_priv->handler_mutex); + return ret; +} + + +static void cma_ndev_device_remove_work_handler(struct work_struct *_work) +{ + struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); + struct rdma_id_private *id_priv = work->id; + + cma_remove_id_dev(id_priv); + cma_deref_id(id_priv); +} + +/* Used for BONDING +static void cma_ndev_addr_change_work_handler(struct work_struct *_work) { struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work); struct rdma_id_private *id_priv = work->id; + struct rdma_cm_event event; int destroy = 0; mutex_lock(&id_priv->handler_mutex); @@ -1957,7 +2205,9 @@ id_priv->state == RDMA_CM_DEVICE_REMOVAL) goto out; - if (id_priv->id.event_handler(&id_priv->id, &work->event)) { + memset(&event, 0, sizeof(event)); + event.event = RDMA_CM_EVENT_ADDR_CHANGE; + if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); destroy = 1; } @@ -1969,6 +2219,7 @@ rdma_destroy_id(&id_priv->id); kfree(work); } +*/ static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) { @@ -2053,20 +2304,37 @@ return def_prec2sl & 7; } -static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) +/* eb072c4b8da0ba87bc870c7911aae180bae34d4a +static int iboe_tos_to_sl(struct net_device *ndev, int tos) { - struct rdma_route *route = &id_priv->id.route; - struct rdma_addr *addr = &route->addr; - struct cma_work *work; + int prio; + struct net_device *dev; + + prio = rt_tos2priority(tos); + dev = ndev->priv_flags & IFF_802_1Q_VLAN ? 
+ vlan_dev_real_dev(ndev) : ndev; + + if (netdev_get_num_tc(dev)) + return netdev_get_prio_tc_map(dev, prio); + +#if IS_ENABLED(CONFIG_VLAN_8021Q) + if (ndev->priv_flags & IFF_802_1Q_VLAN) + return (vlan_dev_get_egress_qos_mask(ndev, prio) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; +#endif + return 0; +} +*/ + +static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + struct rdma_addr *addr = &route->addr; + struct cma_work *work; int ret; - struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; - struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; struct net_device *ndev = NULL; - if (src_addr->sin_family != dst_addr->sin_family) - return -EINVAL; - work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; @@ -2082,35 +2350,42 @@ route->num_paths = 1; - if (addr->dev_addr.bound_dev_if) + if (addr->dev_addr.bound_dev_if) { ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + route->path_rec->net = &init_net; + route->path_rec->ifindex = addr->dev_addr.bound_dev_if; + route->path_rec->gid_type = id_priv->gid_type; + } if (!ndev) { ret = -ENODEV; goto err2; } - route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev); memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); - memcpy(route->path_rec->smac, IF_LLADDR(ndev), ndev->if_addrlen); - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); - route->path_rec->hop_limit = 1; + /* Use the hint from IP Stack to select GID Type */ + if (addr->dev_addr.network != RDMA_NETWORK_IB) { + route->path_rec->hop_limit = IPV6_DEFAULT_HOPLIMIT; + } else { + route->path_rec->hop_limit = 1; + } route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; + //route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos); /* eb072c4b8da0ba87bc870c7911aae180bae34d4a */ route->path_rec->sl = tos_to_sl(id_priv->tos); - route->path_rec->mtu = iboe_get_mtu(ndev->if_mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + route->path_rec->traffic_class = id_priv->tos; if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; @@ -2157,7 +2432,6 @@ } break; case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: ret = cma_resolve_iw_route(id_priv, timeout_ms); break; default: @@ -2175,6 +2449,23 @@ } EXPORT_SYMBOL(rdma_resolve_route); +static void cma_set_loopback(struct sockaddr *addr) +{ + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + break; + case AF_INET6: + ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, + 0, 0, 0, htonl(1)); + break; + default: + ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, + 0, 0, 0, htonl(1)); + break; + } +} + int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type) { /* APM is not supported yet */ @@ -2184,29 +2475,41 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) { - struct cma_device *cma_dev; + struct cma_device *cma_dev, *cur_dev; struct ib_port_attr port_attr; union ib_gid gid; u16 pkey; int ret; u8 p; + cma_dev = NULL; mutex_lock(&lock); - if (list_empty(&dev_list)) { + list_for_each_entry(cur_dev, &dev_list, list) { + if 
(cma_family(id_priv) == AF_IB && + rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB) + continue; + + if (!cma_dev) + cma_dev = cur_dev; + + for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { + if (!ib_query_port(cur_dev->device, p, &port_attr) && + port_attr.state == IB_PORT_ACTIVE) { + cma_dev = cur_dev; + goto port_found; + } + } + } + + if (!cma_dev) { ret = -ENODEV; goto out; } - list_for_each_entry(cma_dev, &dev_list, list) - for (p = 1; p <= cma_dev->device->phys_port_cnt; ++p) - if (!ib_query_port(cma_dev->device, p, &port_attr) && - port_attr.state == IB_PORT_ACTIVE) - goto port_found; p = 1; - cma_dev = list_entry(dev_list.next, struct cma_device, list); port_found: - ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid); + ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid, NULL); if (ret) goto out; @@ -2222,6 +2525,7 @@ ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); + cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; @@ -2239,8 +2543,7 @@ RDMA_CM_ADDR_RESOLVED)) goto out; - memcpy(&id_priv->id.route.addr.src_addr, src_addr, - ip_addr_size(src_addr)); + memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) status = cma_acquire_dev(id_priv, NULL); @@ -2268,7 +2571,6 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; - struct sockaddr *src, *dst; union ib_gid gid; int ret; @@ -2285,18 +2587,6 @@ rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - src = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - if (cma_zero_addr(src)) { - dst = (struct sockaddr *) &id_priv->id.route.addr.dst_addr; - if ((src->sa_family = dst->sa_family) == AF_INET) { - ((struct sockaddr_in *)src)->sin_addr = - ((struct sockaddr_in *)dst)->sin_addr; - } else { - ((struct sockaddr_in6 *)src)->sin6_addr = - ((struct sockaddr_in6 *)dst)->sin6_addr; - } - } - work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; @@ -2309,15 +2599,23 @@ return ret; } -static int cma_resolve_scif(struct rdma_id_private *id_priv) +static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) { struct cma_work *work; + int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; - /* we probably can leave it empty here */ + if (!id_priv->cma_dev) { + ret = cma_resolve_ib_dev(id_priv); + if (ret) + goto err; + } + + rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) + &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); @@ -2326,6 +2624,9 @@ work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; +err: + kfree(work); + return ret; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, @@ -2334,12 +2635,13 @@ if (!src_addr || !src_addr->sa_family) { src_addr = (struct sockaddr *) &id->route.addr.src_addr; src_addr->sa_family = dst_addr->sa_family; -#ifdef INET6 if (dst_addr->sa_family == AF_INET6) { ((struct sockaddr_in6 *) src_addr)->sin6_scope_id = ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; + } else if (dst_addr->sa_family == AF_IB) { + ((struct sockaddr_ib *) src_addr)->sib_pkey = + ((struct sockaddr_ib *) dst_addr)->sib_pkey; } -#endif } if (!cma_any_addr(src_addr)) return rdma_bind_addr(id, src_addr); @@ -2352,6 
+2654,7 @@ #ifdef INET6 struct sockaddr_in6 in6; #endif + struct sockaddr_ib ib; } addr; #endif @@ -2359,19 +2662,25 @@ #ifdef INET case AF_INET: memset(&addr.in, 0, sizeof(addr.in)); - addr.in.sin_family = dst_addr->sa_family; + addr.in.sin_family = AF_INET; addr.in.sin_len = sizeof(addr.in); return rdma_bind_addr(id, (struct sockaddr *)&addr.in); #endif #ifdef INET6 case AF_INET6: memset(&addr.in6, 0, sizeof(addr.in6)); - addr.in6.sin6_family = dst_addr->sa_family; + addr.in6.sin6_family = AF_INET6; addr.in6.sin6_len = sizeof(addr.in6); addr.in6.sin6_scope_id = ((struct sockaddr_in6 *)dst_addr)->sin6_scope_id; return rdma_bind_addr(id, (struct sockaddr *)&addr.in6); #endif + case AF_IB: + memset(&addr.ib, 0, sizeof(addr.ib)); + addr.ib.sib_family = AF_IB; + addr.ib.sib_pkey = + ((struct sockaddr_ib *)dst_addr)->sib_pkey; + return rdma_bind_addr(id, (struct sockaddr *)&addr.ib); default: return -EINVAL; } @@ -2391,20 +2700,25 @@ return ret; } + if (cma_family(id_priv) != dst_addr->sa_family) + return -EINVAL; + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); - memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr)); - if (cma_any_addr(dst_addr)) + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); + if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); - else if (id_priv->id.device && - rdma_node_get_transport(id_priv->id.device->node_type) == RDMA_TRANSPORT_SCIF) - ret = cma_resolve_scif(id_priv); - else - ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr, - dst_addr, &id->route.addr.dev_addr, - timeout_ms, addr_handler, id_priv); + } else { + if (dst_addr->sa_family == AF_IB) { + ret = cma_resolve_ib_addr(id_priv); + } else { + ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), + dst_addr, &id->route.addr.dev_addr, + timeout_ms, addr_handler, id_priv); + } + } if (ret) goto err; @@ -2424,7 +2738,7 @@ id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); - if (id_priv->state == RDMA_CM_IDLE) { + if (reuse || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { @@ -2458,10 +2772,29 @@ static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { - struct sockaddr_in *sin; + struct sockaddr *addr; + struct sockaddr_ib *sib; + u64 sid, mask; + __be16 port; + + addr = cma_src_addr(id_priv); + port = htons(bind_list->port); - sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; - sin->sin_port = htons(bind_list->port); + switch (addr->sa_family) { + case AF_INET: + ((struct sockaddr_in *) addr)->sin_port = port; + break; + case AF_INET6: + ((struct sockaddr_in6 *) addr)->sin6_port = port; + break; + case AF_IB: + sib = (struct sockaddr_ib *) addr; + sid = be64_to_cpu(sib->sib_sid); + mask = be64_to_cpu(sib->sib_sid_mask); + sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); + sib->sib_sid_mask = cpu_to_be64(~0ULL); + break; + } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } @@ -2520,7 +2853,7 @@ last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; - } + } if (--remaining) { rover++; if ((rover < low) || (rover > high)) @@ -2542,7 +2875,7 @@ struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; - addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; + addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) 
continue; @@ -2551,7 +2884,7 @@ cur_id->reuseaddr) continue; - cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; + cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; @@ -2571,7 +2904,7 @@ unsigned short snum; int ret; - snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + snum = ntohs(cma_port(cma_src_addr(id_priv))); bind_list = idr_find(ps, snum); if (!bind_list) { @@ -2608,7 +2941,7 @@ #ifdef __linux__ ret = sock->ops->bind(sock, (struct sockaddr *) &id_priv->id.route.addr.src_addr, - ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + rdma_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); #else ret = -sobind(sock, (struct sockaddr *)&id_priv->id.route.addr.src_addr, @@ -2619,7 +2952,7 @@ return ret; } - size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); + size = rdma_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); ret = sock_getname(sock, (struct sockaddr *) &id_priv->id.route.addr.src_addr, &size, 0); @@ -2632,43 +2965,81 @@ return 0; } -static int cma_get_port(struct rdma_id_private *id_priv) +static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv) { - struct idr *ps; int ret; switch (id_priv->id.ps) { case RDMA_PS_SDP: - ps = &sdp_ps; - break; + return &sdp_ps; case RDMA_PS_TCP: - ps = &tcp_ps; if (unify_tcp_port_space) { ret = cma_get_tcp_port(id_priv); if (ret) - goto out; + return NULL; } - break; + return &tcp_ps; case RDMA_PS_UDP: - ps = &udp_ps; - break; + return &udp_ps; case RDMA_PS_IPOIB: - ps = &ipoib_ps; - break; + return &ipoib_ps; case RDMA_PS_IB: - ps = &ib_ps; - break; + return &ib_ps; default: - return -EPROTONOSUPPORT; + return NULL; } +} + +static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv) +{ + struct idr *ps = NULL; + struct sockaddr_ib *sib; + u64 sid_ps, mask, sid; + + sib = (struct sockaddr_ib *) cma_src_addr(id_priv); + mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; + sid = be64_to_cpu(sib->sib_sid) & mask; + + if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { + sid_ps = RDMA_IB_IP_PS_IB; + ps = &ib_ps; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && + (sid == (RDMA_IB_IP_PS_TCP & mask))) { + sid_ps = RDMA_IB_IP_PS_TCP; + ps = &tcp_ps; + } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && + (sid == (RDMA_IB_IP_PS_UDP & mask))) { + sid_ps = RDMA_IB_IP_PS_UDP; + ps = &udp_ps; + } + + if (ps) { + sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); + sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | + be64_to_cpu(sib->sib_sid_mask)); + } + return ps; +} + +static int cma_get_port(struct rdma_id_private *id_priv) +{ + struct idr *ps; + int ret; + + if (cma_family(id_priv) != AF_IB) + ps = cma_select_inet_ps(id_priv); + else + ps = cma_select_ib_ps(id_priv); + if (!ps) + return -EPROTONOSUPPORT; mutex_lock(&lock); - if (cma_any_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)) + if (cma_any_port(cma_src_addr(id_priv))) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); -out: + return ret; } @@ -2676,17 +3047,20 @@ struct sockaddr *addr) { #if defined(INET6) - struct sockaddr_in6 *sin6; + struct sockaddr_in6 sin6; if (addr->sa_family != AF_INET6) return 0; - sin6 = (struct sockaddr_in6 *) addr; - if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) && - 
!sin6->sin6_scope_id) - return -EINVAL; + sin6 = *(struct sockaddr_in6 *)addr; - dev_addr->bound_dev_if = sin6->sin6_scope_id; + if (!(IN6_IS_SCOPE_LINKLOCAL(&sin6.sin6_addr))) + return 0; + + if (sa6_recoverscope(&sin6) || sin6.sin6_scope_id == 0) + return -EINVAL; + + dev_addr->bound_dev_if = sin6.sin6_scope_id; #endif return 0; } @@ -2698,8 +3072,8 @@ id_priv = container_of(id, struct rdma_id_private, id); if (id_priv->state == RDMA_CM_IDLE) { - ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; - ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); + id->route.addr.src_addr.ss_family = AF_INET; + ret = rdma_bind_addr(id, cma_src_addr(id_priv)); if (ret) return ret; } @@ -2722,7 +3096,6 @@ goto err; break; case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; @@ -2746,12 +3119,11 @@ { struct rdma_id_private *id_priv; int ret; -#if defined(INET6) int ipv6only; size_t var_size = sizeof(int); -#endif - if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && + addr->sa_family != AF_IB) return -EAFNOSUPPORT; id_priv = container_of(id, struct rdma_id_private, id); @@ -2762,9 +3134,9 @@ if (ret) goto err1; - memcpy(&id->route.addr.src_addr, addr, ip_addr_size(addr)); + memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { - ret = rdma_translate_ip(addr, &id->route.addr.dev_addr, NULL); + ret = cma_translate_addr(addr, &id->route.addr.dev_addr); if (ret) goto err1; @@ -2778,8 +3150,9 @@ id_priv->afonly = 1; #if defined(INET6) else if (addr->sa_family == AF_INET6) - id_priv->afonly = kernel_sysctlbyname(&thread0, "net.inet6.ip6.v6only", - &ipv6only, &var_size, NULL, 0, NULL, 0); + id_priv->afonly = kernel_sysctlbyname(&thread0, + "net.inet6.ip6.v6only", &ipv6only, &var_size, + NULL, 0, NULL, 0); #endif } ret = cma_get_port(id_priv); @@ -2796,20 +3169,20 @@ } EXPORT_SYMBOL(rdma_bind_addr); -static int cma_format_hdr(void *hdr, enum rdma_port_space ps, - struct rdma_route *route) +static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) { struct cma_hdr *cma_hdr; struct sdp_hh *sdp_hdr; - if (route->addr.src_addr.ss_family == AF_INET) { + cma_hdr = hdr; + cma_hdr->cma_version = CMA_VERSION; + if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; - src4 = (struct sockaddr_in *) &route->addr.src_addr; - dst4 = (struct sockaddr_in *) &route->addr.dst_addr; + src4 = (struct sockaddr_in *) cma_src_addr(id_priv); + dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); - switch (ps) { - case RDMA_PS_SDP: + if (id_priv->id.ps == RDMA_PS_SDP) { sdp_hdr = hdr; if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; @@ -2817,24 +3190,19 @@ sdp_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; sdp_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; sdp_hdr->port = src4->sin_port; - break; - default: - cma_hdr = hdr; - cma_hdr->cma_version = CMA_VERSION; + } else { cma_set_ip_ver(cma_hdr, 4); cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; cma_hdr->port = src4->sin_port; - break; } - } else { + } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; - src6 = (struct sockaddr_in6 *) &route->addr.src_addr; - dst6 = (struct sockaddr_in6 *) &route->addr.dst_addr; + src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); + dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); - switch (ps) { - case 
RDMA_PS_SDP: + if (id_priv->id.ps == RDMA_PS_SDP) { sdp_hdr = hdr; if (sdp_get_majv(sdp_hdr->sdp_version) != SDP_MAJ_VERSION) return -EINVAL; @@ -2842,15 +3210,11 @@ sdp_hdr->src_addr.ip6 = src6->sin6_addr; sdp_hdr->dst_addr.ip6 = dst6->sin6_addr; sdp_hdr->port = src6->sin6_port; - break; - default: - cma_hdr = hdr; - cma_hdr->cma_version = CMA_VERSION; + } else { cma_set_ip_ver(cma_hdr, 6); cma_hdr->src_addr.ip6 = src6->sin6_addr; cma_hdr->dst_addr.ip6 = dst6->sin6_addr; cma_hdr->port = src6->sin6_port; - break; } } return 0; @@ -2881,15 +3245,10 @@ event.status = ib_event->param.sidr_rep_rcvd.status; break; } - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { event.event = RDMA_CM_EVENT_ADDR_ERROR; - event.status = -EINVAL; - break; - } - if (id_priv->qkey != rep->qkey) { - event.event = RDMA_CM_EVENT_UNREACHABLE; - event.status = -EINVAL; + event.status = ret; break; } ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, @@ -2924,27 +3283,34 @@ struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; - struct rdma_route *route; struct ib_cm_id *id; - int ret; + void *private_data; + int offset, ret; - req.private_data_len = sizeof(struct cma_hdr) + - conn_param->private_data_len; + memset(&req, 0, sizeof req); + offset = cma_user_data_offset(id_priv); + req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; - req.private_data = kzalloc(req.private_data_len, GFP_ATOMIC); - if (!req.private_data) - return -ENOMEM; + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } if (conn_param->private_data && conn_param->private_data_len) - memcpy((void *) req.private_data + sizeof(struct cma_hdr), - conn_param->private_data, conn_param->private_data_len); + memcpy(private_data + offset, conn_param->private_data, + conn_param->private_data_len); - route = &id_priv->id.route; - ret = cma_format_hdr((void *) req.private_data, id_priv->id.ps, route); - if (ret) - goto out; + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); @@ -2954,9 +3320,8 @@ } id_priv->cm_id.ib = id; - req.path = route->path_rec; - req.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &route->addr.dst_addr); + req.path = id_priv->id.route.path_rec; + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.timeout_ms = 1 << (cma_response_timeout - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; @@ -2967,7 +3332,7 @@ id_priv->cm_id.ib = NULL; } out: - kfree(req.private_data); + kfree(private_data); return ret; } @@ -2981,14 +3346,18 @@ int offset, ret; memset(&req, 0, sizeof req); - offset = cma_user_data_offset(id_priv->id.ps); + offset = cma_user_data_offset(id_priv); req.private_data_len = offset + conn_param->private_data_len; if (req.private_data_len < conn_param->private_data_len) return -EINVAL; - private_data = kzalloc(req.private_data_len, GFP_ATOMIC); - if (!private_data) - return -ENOMEM; + if (req.private_data_len) { + private_data = kzalloc(req.private_data_len, GFP_ATOMIC); + if (!private_data) + return -ENOMEM; + } else { + private_data = NULL; + } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, @@ -3002,17 +3371,18 
@@ id_priv->cm_id.ib = id; route = &id_priv->id.route; - ret = cma_format_hdr(private_data, id_priv->id.ps, route); - if (ret) - goto out; - req.private_data = private_data; + if (private_data) { + ret = cma_format_hdr(private_data, id_priv); + if (ret) + goto out; + req.private_data = private_data; + } req.primary_path = &route->path_rec[0]; if (route->num_paths == 2) req.alternate_path = &route->path_rec[1]; - req.service_id = cma_get_service_id(id_priv->id.ps, - (struct sockaddr *) &route->addr.dst_addr); + req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; @@ -3021,8 +3391,8 @@ req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); - req.remote_cm_response_timeout = cma_response_timeout; - req.local_cm_response_timeout = cma_response_timeout; + req.remote_cm_response_timeout = cma_response_timeout; + req.local_cm_response_timeout = cma_response_timeout; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; @@ -3042,32 +3412,30 @@ struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; - struct sockaddr_in* sin; int ret; struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, id_priv->sock, - cma_iw_handler, id_priv); + cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); id_priv->cm_id.iw = cm_id; - sin = (struct sockaddr_in*) &id_priv->id.route.addr.src_addr; - cm_id->local_addr = *sin; - - sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr; - cm_id->remote_addr = *sin; + memcpy(&cm_id->local_addr, cma_src_addr(id_priv), + rdma_addr_size(cma_src_addr(id_priv))); + memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), + rdma_addr_size(cma_dst_addr(id_priv))); ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { - iw_param.ord = conn_param->initiator_depth; - iw_param.ird = conn_param->responder_resources; - iw_param.private_data = conn_param->private_data; - iw_param.private_data_len = conn_param->private_data_len; + iw_param.ord = conn_param->initiator_depth; + iw_param.ird = conn_param->responder_resources; + iw_param.private_data = conn_param->private_data; + iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); @@ -3104,7 +3472,6 @@ ret = cma_connect_ib(id_priv, conn_param); break; case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: ret = cma_connect_iw(id_priv, conn_param); break; default: @@ -3146,6 +3513,7 @@ rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 
1 : 0; + cma_dbg(id_priv, "sending REP\n"); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: @@ -3178,7 +3546,7 @@ } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, - enum ib_cm_sidr_status status, + enum ib_cm_sidr_status status, u32 qkey, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; @@ -3187,7 +3555,7 @@ memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { - ret = cma_set_qkey(id_priv); + ret = cma_set_qkey(id_priv, qkey); if (ret) return ret; rep.qp_num = id_priv->qp_num; @@ -3208,6 +3576,7 @@ id_priv = container_of(id, struct rdma_id_private, id); id_priv->owner = curthread->td_proc->p_pid; + if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; @@ -3220,21 +3589,21 @@ case RDMA_TRANSPORT_IB: if (id->qp_type == IB_QPT_UD) { if (conn_param) - ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, - conn_param->private_data, - conn_param->private_data_len); + ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, + conn_param->qkey, + conn_param->private_data, + conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, - NULL, 0); + 0, NULL, 0); } else { if (conn_param) - ret = cma_accept_ib(id_priv, conn_param); - else - ret = cma_rep_recv(id_priv); + ret = cma_accept_ib(id_priv, conn_param); + else + ret = cma_rep_recv(id_priv); } break; case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: ret = cma_accept_iw(id_priv, conn_param); break; default: @@ -3287,7 +3656,7 @@ switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: if (id->qp_type == IB_QPT_UD) - ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, + ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); else { cma_dbg(id_priv, "sending REJ\n"); @@ -3297,7 +3666,6 @@ } break; case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); break; @@ -3324,14 +3692,12 @@ if (ret) goto out; /* Initiate or respond to a disconnect. 
*/ - cma_dbg(id_priv, "sending DREQ\n"); if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { cma_dbg(id_priv, "sending DREP\n"); ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); } break; case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); break; default: @@ -3348,17 +3714,15 @@ struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; struct rdma_cm_event event; - struct rdma_dev_addr *dev_addr; int ret; - struct net_device *ndev = NULL; - u16 vlan; id_priv = mc->id_priv; - dev_addr = &id_priv->id.route.addr.dev_addr; if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) && cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED)) return 0; + if (!status) + status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); mutex_lock(&id_priv->qp_mutex); if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, @@ -3368,32 +3732,16 @@ memset(&event, 0, sizeof event); event.status = status; event.param.ud.private_data = mc->context; - ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (!ndev) { - status = -ENODEV; - } else { - vlan = rdma_vlan_dev_vlan_id(ndev); - dev_put(ndev); - } if (!status) { event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, &event.param.ud.ah_attr); - event.param.ud.ah_attr.vlan_id = vlan; event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); - } else { + } else event.event = RDMA_CM_EVENT_MULTICAST_ERROR; - /* mark that the cached record is no longer valid */ - if (status != -ENETRESET && status != -EAGAIN) { - spin_lock(&id_priv->lock); - id_priv->is_valid_rec = 0; - spin_unlock(&id_priv->lock); - } - } - ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { cma_exch(id_priv, RDMA_CM_DESTROYING); @@ -3412,24 +3760,22 @@ unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; -#if defined(INET6) struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; -#endif if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); -#if defined(INET6) } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. 
*/ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); + } else if (addr->sa_family == AF_IB) { + memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); -#endif } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) @@ -3444,30 +3790,20 @@ struct ib_sa_mcmember_rec rec; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; - int ret = 0; + int ret; - ib_addr_get_mgid(dev_addr, &id_priv->rec.mgid); + ib_addr_get_mgid(dev_addr, &rec.mgid); + ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, + &rec.mgid, &rec); + if (ret) + return ret; - /* cache ipoib bc record */ - spin_lock(&id_priv->lock); - if (!id_priv->is_valid_rec) - ret = ib_sa_get_mcmember_rec(id_priv->id.device, - id_priv->id.port_num, - &id_priv->rec.mgid, - &id_priv->rec); - if (ret) { - id_priv->is_valid_rec = 0; - spin_unlock(&id_priv->lock); + ret = cma_set_qkey(id_priv, 0); + if (ret) return ret; - } else { - rec = id_priv->rec; - id_priv->is_valid_rec = 1; - } - spin_unlock(&id_priv->lock); cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); - if (id_priv->id.ps == RDMA_PS_UDP) - rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); + rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = 1; @@ -3489,7 +3825,7 @@ id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); - return PTR_RET(mc->multicast.ib); + return PTR_ERR_OR_ZERO(mc->multicast.ib); } static void iboe_mcast_work_handler(struct work_struct *work) @@ -3535,7 +3871,7 @@ { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - int err; + int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; @@ -3567,13 +3903,30 @@ mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->if_mtu); + mc->multicast.ib->rec.ifindex = dev_addr->bound_dev_if; + mc->multicast.ib->rec.net = &init_net; + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &mc->multicast.ib->rec.port_gid); + + if (addr->sa_family == AF_INET) { + mc->multicast.ib->rec.gid_type = + id_priv->cma_dev->default_gid_type; + if ((mc->multicast.ib->rec.gid_type == IB_GID_TYPE_ROCE_V2) || + (mc->multicast.ib->rec.gid_type == IB_GID_TYPE_ROCE_V1_5)) +// err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, true); + if (!err) { + mc->igmp_joined = true; + mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + } + } else { + mc->multicast.ib->rec.gid_type = IB_GID_TYPE_IB; + } dev_put(ndev); - if (!mc->multicast.ib->rec.mtu) { + if (err || !mc->multicast.ib->rec.mtu) { err = -EINVAL; goto out2; } - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, - &mc->multicast.ib->rec.port_gid); + work->id = id_priv; work->mc = mc; INIT_WORK(&work->work, iboe_mcast_work_handler); @@ -3605,10 +3958,10 @@ if (!mc) return -ENOMEM; - memcpy(&mc->addr, addr, ip_addr_size(addr)); + memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; - + mc->igmp_joined = false; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); 
spin_unlock(&id_priv->lock); @@ -3650,7 +4003,7 @@ id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { - if (!memcmp(&mc->addr, addr, ip_addr_size(addr))) { + if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) { list_del(&mc->list); spin_unlock_irq(&id_priv->lock); @@ -3665,6 +4018,23 @@ kfree(mc); break; case IB_LINK_LAYER_ETHERNET: +/* + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = &id->route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + mc->igmp_joined = false; + } +*/ kref_put(&mc->mcref, release_mc); break; default: @@ -3678,28 +4048,69 @@ } EXPORT_SYMBOL(rdma_leave_multicast); -static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) +static int cma_netdev_change(struct net_device *ndev, unsigned long event, + struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; struct cma_ndev_work *work; + enum rdma_link_layer dev_ll; + struct net_device *bounded_dev; + work_func_t work_func; dev_addr = &id_priv->id.route.addr.dev_addr; - if ((dev_addr->bound_dev_if == ndev->if_index) && - memcmp(dev_addr->src_dev_addr, IF_LLADDR(ndev), ndev->if_addrlen)) { - printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n", - ndev->if_xname, &id_priv->id); - work = kzalloc(sizeof *work, GFP_KERNEL); - if (!work) - return -ENOMEM; + switch (event) { +/* BONDING related + case NETDEV_BONDING_FAILOVER: + if (!(ndev->flags & IFF_MASTER) || + !(ndev->priv_flags & IFF_BONDING)) + return 0; + if (dev_addr->bound_dev_if != ndev->ifindex) + return 0; + if (!memcmp(dev_addr->src_dev_addr, + ndev->dev_addr, ndev->addr_len)) + return 0; + work_func = cma_ndev_addr_change_work_handler; + pr_info("RDMA CM addr change for %s used by id %p\n", + ndev->name, &id_priv->id); + break; +*/ + case NETDEV_UNREGISTER: + dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? 
+ IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; + if (dev_ll != IB_LINK_LAYER_ETHERNET) + return 0; - INIT_WORK(&work->work, cma_ndev_work_handler); - work->id = id_priv; - work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; - atomic_inc(&id_priv->refcount); - queue_work(cma_wq, &work->work); + if (dev_addr->bound_dev_if == ndev->if_index) { + work_func = cma_ndev_device_remove_work_handler; + break; + } + + bounded_dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + if (!bounded_dev) + return 0; + +/* BONDING + if (!netdev_has_upper_dev(ndev, bounded_dev)) + return 0; +*/ + + work_func = cma_ndev_device_remove_work_handler; + break; + + default: + return 0; } + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return -ENOMEM; + + INIT_WORK(&work->work, work_func); + work->id = id_priv; + atomic_inc(&id_priv->refcount); + queue_work(cma_wq, &work->work); + return 0; } @@ -3711,24 +4122,21 @@ struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; -/* BONDING related, commented out until the bonding is resolved */ -#if 0 +/* BONDING related if (dev_net(ndev) != &init_net) return NOTIFY_DONE; - if (event != NETDEV_BONDING_FAILOVER) + if (event != NETDEV_BONDING_FAILOVER && + event != NETDEV_UNREGISTER) return NOTIFY_DONE; - - if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) - return NOTIFY_DONE; -#endif +*/ if (event != NETDEV_DOWN && event != NETDEV_UNREGISTER) return NOTIFY_DONE; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) list_for_each_entry(id_priv, &cma_dev->id_list, list) { - ret = cma_netdev_change(ndev, id_priv); + ret = cma_netdev_change(ndev, event, id_priv); if (ret) goto out; } @@ -3742,6 +4150,32 @@ .notifier_call = cma_netdev_callback }; +static int +sysctl_cma_default_roce_mode(SYSCTL_HANDLER_ARGS) +{ + struct cma_device *cma_dev = arg1; + char buf[64]; + int error; + + strlcpy(buf, roce_gid_cache_type_str( + cma_get_default_gid_type(cma_dev)), sizeof(buf)); + + error = sysctl_handle_string(oidp, buf, sizeof(buf), req); + if (error != 0 || req->newptr == NULL) + goto done; + + error = roce_gid_cache_parse_gid_str(buf); + if (error < 0) { + error = EINVAL; + goto done; + } + + cma_set_default_gid_type(cma_dev, error); + error = 0; +done: + return (error); +} + static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; @@ -3751,7 +4185,10 @@ if (!cma_dev) return; + sysctl_ctx_init(&cma_dev->sysctl_ctx); + cma_dev->device = device; + cma_dev->default_gid_type = IB_GID_TYPE_IB; init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); @@ -3763,32 +4200,12 @@ list_for_each_entry(id_priv, &listen_any_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); -} - -static int cma_remove_id_dev(struct rdma_id_private *id_priv) -{ - struct rdma_cm_event event; - enum rdma_cm_state state; - int ret = 0; - - /* Record that we want to remove the device */ - state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); - if (state == RDMA_CM_DESTROYING) - return 0; - - cma_cancel_operation(id_priv, state); - mutex_lock(&id_priv->handler_mutex); - /* Check for destruction from another callback. 
*/ - if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) - goto out; - - memset(&event, 0, sizeof event); - event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; - ret = id_priv->id.event_handler(&id_priv->id, &event); -out: - mutex_unlock(&id_priv->handler_mutex); - return ret; + (void) SYSCTL_ADD_PROC(&cma_dev->sysctl_ctx, + SYSCTL_CHILDREN(device->ports_parent->parent->oidp), + OID_AUTO, "default_roce_mode", + CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + cma_dev, 0, &sysctl_cma_default_roce_mode, "A", "Default ROCE mode"); } static void cma_process_remove(struct cma_device *cma_dev) @@ -3832,21 +4249,18 @@ mutex_unlock(&lock); cma_process_remove(cma_dev); + sysctl_ctx_free(&cma_dev->sysctl_ctx); kfree(cma_dev); } static int __init cma_init(void) { - int ret = -ENOMEM; + int ret; cma_wq = create_singlethread_workqueue("rdma_cm"); if (!cma_wq) return -ENOMEM; - cma_free_wq = create_singlethread_workqueue("rdma_cm_fr"); - if (!cma_free_wq) - goto err1; - ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); register_netdevice_notifier(&cma_nb); @@ -3861,9 +4275,6 @@ unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); - - destroy_workqueue(cma_free_wq); -err1: destroy_workqueue(cma_wq); return ret; } @@ -3874,8 +4285,6 @@ unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); - flush_workqueue(cma_free_wq); - destroy_workqueue(cma_free_wq); destroy_workqueue(cma_wq); idr_destroy(&sdp_ps); idr_destroy(&tcp_ps); Index: sys/ofed/drivers/infiniband/core/core_priv.h =================================================================== --- sys/ofed/drivers/infiniband/core/core_priv.h +++ sys/ofed/drivers/infiniband/core/core_priv.h @@ -35,12 +35,25 @@ #include #include +#include #include +struct cma_device; +typedef bool (*cma_device_filter)(struct ib_device *, void *); +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie); +enum ib_gid_type cma_get_default_gid_type(struct cma_device *cma_dev); +void cma_set_default_gid_type(struct cma_device *cma_dev, + enum ib_gid_type default_gid_type); +void cma_ref_dev(struct cma_device *cma_dev); +void cma_deref_dev(struct cma_device *cma_dev); + +extern struct workqueue_struct *roce_gid_mgmt_wq; + int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, - u8, struct kobject *)); + u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); int ib_sysfs_setup(void); @@ -49,4 +62,82 @@ int ib_cache_setup(void); void ib_cache_cleanup(void); +int ib_resolve_eth_dmac(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask); + +typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, + struct net_device *idev, void *cookie); + +typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port, + struct net_device *idev, void *cookie); + +struct roce_netdev_list { + struct list_head list; + struct net_device *ndev; +}; + +void ib_dev_roce_ports_of_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie); +void ib_enum_roce_ports_of_netdev(roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie); + +const char *roce_gid_cache_type_str(enum ib_gid_type gid_type); +int roce_gid_cache_parse_gid_str(const char *buf); + +int roce_gid_cache_get_gid(struct ib_device *ib_dev, u8 port, int index, + union 
ib_gid *gid, struct ib_gid_attr *attr); + +int roce_gid_cache_find_gid(struct ib_device *ib_dev, union ib_gid *gid, + enum ib_gid_type gid_type, struct net *net, + int if_index, u8 *port, u16 *index); + +int roce_gid_cache_find_gid_by_port(struct ib_device *ib_dev, union ib_gid *gid, + enum ib_gid_type gid_type, u8 port, + struct net *net, int if_index, u16 *index); + +int roce_gid_cache_find_gid_by_filter(struct ib_device *ib_dev, + union ib_gid *gid, + u8 port, + bool (*filter)(const union ib_gid *gid, + const struct ib_gid_attr *, + void *), + void *context, + u16 *index); + +int roce_gid_cache_is_active(struct ib_device *ib_dev, u8 port); + +enum roce_gid_cache_default_mode { + ROCE_GID_CACHE_DEFAULT_MODE_SET, + ROCE_GID_CACHE_DEFAULT_MODE_DELETE +}; + +int roce_gid_cache_set_default_gid(struct ib_device *ib_dev, u8 port, + struct net_device *ndev, + unsigned long gid_type_mask, + enum roce_gid_cache_default_mode mode); + +int roce_gid_cache_setup(void); +void roce_gid_cache_cleanup(void); + +int roce_add_gid(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int roce_del_gid(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr); + +int roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *ndev); + +int roce_gid_mgmt_init(void); +void roce_gid_mgmt_cleanup(void); + +int roce_rescan_device(struct ib_device *ib_dev); +int roce_sync_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct list_head *list); + #endif /* _CORE_PRIV_H */ Index: sys/ofed/drivers/infiniband/core/device.c =================================================================== --- sys/ofed/drivers/infiniband/core/device.c +++ sys/ofed/drivers/infiniband/core/device.c @@ -32,17 +32,18 @@ */ #include +#include #include #include #include #include #include +#include +#include #include "core_priv.h" -MODULE_AUTHOR("Roland Dreier"); -MODULE_DESCRIPTION("core kernel InfiniBand API"); -MODULE_LICENSE("Dual BSD/GPL"); +/* core kernel InfiniBand API */ struct ib_client_data { struct list_head list; @@ -173,14 +174,9 @@ */ struct ib_device *ib_alloc_device(size_t size) { - struct ib_device *dev; - BUG_ON(size < sizeof (struct ib_device)); - dev = kzalloc(size, GFP_KERNEL); - spin_lock_init(&dev->cmd_perf_lock); - - return dev; + return kzalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(ib_alloc_device); @@ -264,6 +260,39 @@ return ret; } +static void ib_device_complete_cb(struct kref *kref) +{ + struct ib_device *device = container_of(kref, struct ib_device, + refcount); + + if (device->reg_state >= IB_DEV_UNREGISTERING) + complete(&device->free); +} + +/** + * ib_device_hold - increase the reference count of device + * @device: ib device to prevent from being free'd + * + * Prevent the device from being free'd. + */ +void ib_device_hold(struct ib_device *device) +{ + kref_get(&device->refcount); +} +EXPORT_SYMBOL(ib_device_hold); + +/** + * ib_device_put - decrease the reference count of device + * @device: allows this device to be free'd + * + * Puts the ib_device and allows it to be free'd. 
+ */ +int ib_device_put(struct ib_device *device) +{ + return kref_put(&device->refcount, ib_device_complete_cb); +} +EXPORT_SYMBOL(ib_device_put); + /** * ib_register_device - Register an IB device with IB core * @device:Device to register @@ -315,6 +344,9 @@ list_add_tail(&device->core_list, &device_list); + kref_init(&device->refcount); + init_completion(&device->free); + device->reg_state = IB_DEV_REGISTERED; { @@ -345,6 +377,8 @@ mutex_lock(&device_mutex); + device->reg_state = IB_DEV_UNREGISTERING; + list_for_each_entry_reverse(client, &client_list, list) if (client->remove) client->remove(device); @@ -358,6 +392,9 @@ ib_device_unregister_sysfs(device); + ib_device_put(device); + wait_for_completion(&device->free); + spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) kfree(context); @@ -581,6 +618,7 @@ if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; + memset(port_attr, 0, sizeof(*port_attr)); return device->query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); @@ -591,17 +629,104 @@ * @port_num:Port number to query * @index:GID table index to query * @gid:Returned GID + * @attr: Returned GID's attribute (only in RoCE) * * ib_query_gid() fetches the specified GID table entry. */ int ib_query_gid(struct ib_device *device, - u8 port_num, int index, union ib_gid *gid) + u8 port_num, int index, union ib_gid *gid, + struct ib_gid_attr *attr) { + if (!ib_cache_use_roce_gid_cache(device, port_num)) + return roce_gid_cache_get_gid(device, port_num, index, gid, + attr); + + if (attr) + return -EINVAL; + return device->query_gid(device, port_num, index, gid); } EXPORT_SYMBOL(ib_query_gid); /** + * ib_dev_roce_ports_of_netdev - enumerate RoCE ports of ibdev in + * respect of netdev + * @ib_dev : IB device we want to query + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all of the physical RoCE ports of ib_dev RoCE ports + * which are relaying Ethernet packets to a specific + * (possibly virtual) netdevice according to filter. + */ +void ib_dev_roce_ports_of_netdev(struct ib_device *ib_dev, + roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie) +{ + u8 port; + + if (ib_dev->modify_gid) + for (port = start_port(ib_dev); port <= end_port(ib_dev); + port++) + if (ib_dev->get_link_layer(ib_dev, port) == + IB_LINK_LAYER_ETHERNET) { + struct net_device *idev = NULL; + + rcu_read_lock(); + if (ib_dev->get_netdev) + idev = ib_dev->get_netdev(ib_dev, port); + + /* + if (idev && + idev->reg_state >= NETREG_UNREGISTERED) + idev = NULL; + */ + + if (idev) + dev_hold(idev); + + rcu_read_unlock(); + + if (filter(ib_dev, port, idev, filter_cookie)) + cb(ib_dev, port, idev, cookie); + + if (idev) + dev_put(idev); + } +} + +/** + * ib_enum_roce_ports_of_netdev - enumerate RoCE ports of a netdev + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all of the physical RoCE ports which are relaying + * Ethernet packets to a specific (possibly virtual) netdevice + * according to filter. 
+ */ +void ib_enum_roce_ports_of_netdev(roce_netdev_filter filter, + void *filter_cookie, + roce_netdev_callback cb, + void *cookie) +{ + struct ib_device *dev; + + mutex_lock(&device_mutex); + + list_for_each_entry(dev, &device_list, core_list) + ib_dev_roce_ports_of_netdev(dev, filter, filter_cookie, cb, + cookie); + + mutex_unlock(&device_mutex); +} + +/** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query @@ -669,19 +794,32 @@ * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: Type of GID. + * @net: The namespace to search this GID in (RoCE only). + * Valid only if if_index != 0. + * @if_index: The if_index assigned with this GID (RoCE only). * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - u8 *port_num, u16 *index) + enum ib_gid_type gid_type, struct net *net, + int if_index, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; + if (device->cache.roce_gid_cache && + !roce_gid_cache_find_gid(device, gid, gid_type, net, if_index, + port_num, index)) + return 0; + for (port = start_port(device); port <= end_port(device); ++port) { + if (!ib_cache_use_roce_gid_cache(device, port)) + continue; + for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) { - ret = ib_query_gid(device, port, i, &tmp_gid); + ret = ib_query_gid(device, port, i, &tmp_gid, NULL); if (ret) return ret; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { @@ -750,6 +888,8 @@ goto err; } + roce_gid_cache_setup(); + ret = ib_cache_setup(); if (ret) { printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); @@ -768,6 +908,7 @@ static void __exit ib_core_cleanup(void) { + roce_gid_cache_cleanup(); ib_cache_cleanup(); ib_sysfs_cleanup(); /* Make sure that any pending umem accounting work is done. 
*/ Index: sys/ofed/drivers/infiniband/core/iwcm.c =================================================================== --- sys/ofed/drivers/infiniband/core/iwcm.c +++ sys/ofed/drivers/infiniband/core/iwcm.c @@ -59,9 +59,7 @@ #include "iwcm.h" -MODULE_AUTHOR("Tom Tucker"); -MODULE_DESCRIPTION("iWARP CM"); -MODULE_LICENSE("Dual BSD/GPL"); +/* iWARP CM */ static struct workqueue_struct *iwcm_wq; struct iwcm_work { Index: sys/ofed/drivers/infiniband/core/mad.c =================================================================== --- sys/ofed/drivers/infiniband/core/mad.c +++ sys/ofed/drivers/infiniband/core/mad.c @@ -47,10 +47,7 @@ #include "smi.h" #include "agent.h" -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("kernel IB MAD API"); -MODULE_AUTHOR("Hal Rosenstock"); -MODULE_AUTHOR("Sean Hefty"); +/* kernel IB MAD API */ static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; @@ -65,7 +62,6 @@ static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; - /* * Timeout FIFO (tf) param */ @@ -106,7 +102,6 @@ static int send_sa_cc_mad(struct ib_mad_send_wr_private *mad_send_wr, u32 timeout_ms, u32 retries_left); - /* * Timeout FIFO functions - implements FIFO with timeout mechanism */ @@ -223,7 +218,7 @@ INIT_LIST_HEAD(&tf->fifo_head); init_timer(&tf->timer); INIT_WORK(&tf->work, timeout_handler_task); - tf->timer.data = (unsigned long) tf; + tf->timer.data = (unsigned long)tf; tf->timer.function = activate_timeout_handler_task; tf->timer.expires = jiffies; tf->fifo_size = fifo_size; @@ -295,7 +290,6 @@ unsigned long flags; unsigned long time_left; struct tf_entry *tmp, *tmp1; - bool found = false; spin_lock_irqsave(&tf->lists_lock, flags); if (list_empty(&tf->fifo_head)) { @@ -304,13 +298,11 @@ } list_for_each_entry(tmp, &tf->fifo_head, fifo_list) { - if (!tmp->canceled) { - found = true; + if (!tmp->canceled) break; - } } - if (!found) { + if (tmp->canceled) { spin_unlock_irqrestore(&tf->lists_lock, flags); return NULL; } @@ -330,7 +322,7 @@ spin_unlock_irqrestore(&tf->lists_lock, flags); time_left = tmp->exp_time - jiffies; - if ((long) time_left <= 0) + if ((long)time_left <= 0) time_left = 0; *time_left_ms = jiffies_to_msecs(time_left); @@ -1056,7 +1048,7 @@ */ cancel_mads(mad_agent_priv); port_priv = mad_agent_priv->qp_info->port_priv; - cancel_delayed_work_sync(&mad_agent_priv->timed_work); + cancel_delayed_work(&mad_agent_priv->timed_work); spin_lock_irqsave(&port_priv->reg_lock, flags); remove_mad_reg_req(mad_agent_priv); @@ -1100,18 +1092,18 @@ struct ib_mad_snoop_private *mad_snoop_priv; if (!IS_ERR(mad_agent)) { - /* If the TID is zero, the agent can only snoop. */ - if (mad_agent->hi_tid) { - mad_agent_priv = container_of(mad_agent, + /* If the TID is zero, the agent can only snoop. 
*/ + if (mad_agent->hi_tid) { + mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, agent); - unregister_mad_agent(mad_agent_priv); - } else { - mad_snoop_priv = container_of(mad_agent, + unregister_mad_agent(mad_agent_priv); + } else { + mad_snoop_priv = container_of(mad_agent, struct ib_mad_snoop_private, agent); - unregister_mad_snoop(mad_snoop_priv); - } + unregister_mad_snoop(mad_snoop_priv); + } } return 0; @@ -1238,7 +1230,7 @@ if (smi_handle_dr_smp_send(smp, device->node_type, port_num) == IB_SMI_DISCARD) { ret = -EINVAL; - printk(KERN_ERR PFX "Invalid directed route\n"); + dev_err(&device->dev, "Invalid directed route\n"); goto out; } @@ -1250,7 +1242,7 @@ local = kmalloc(sizeof *local, GFP_ATOMIC); if (!local) { ret = -ENOMEM; - printk(KERN_ERR PFX "No memory for ib_mad_local_private\n"); + dev_err(&device->dev, "No memory for ib_mad_local_private\n"); goto out; } local->mad_priv = NULL; @@ -1258,7 +1250,7 @@ mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC); if (!mad_priv) { ret = -ENOMEM; - printk(KERN_ERR PFX "No memory for local response MAD\n"); + dev_err(&device->dev, "No memory for local response MAD\n"); kfree(local); goto out; } @@ -1369,9 +1361,9 @@ for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask); if (!seg) { - printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem " - "alloc failed for len %zd, gfp %#x\n", - sizeof (*seg) + seg_size, gfp_mask); + dev_err(&send_buf->mad_agent->device->dev, + "alloc_send_rmpp_segs: RMPP mem alloc failed for len %zd, gfp %#x\n", + sizeof (*seg) + seg_size, gfp_mask); free_send_rmpp_list(send_wr); return -ENOMEM; } @@ -1557,17 +1549,18 @@ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) return -ENOMEM; + mad_send_wr->header_mapping = sge[0].addr; + sge[1].addr = ib_dma_map_single(mad_agent->device, ib_get_payload(mad_send_wr), sge[1].length, DMA_TO_DEVICE); - if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) { - ret = -ENOMEM; - goto dma1_err; + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[0].length, DMA_TO_DEVICE); + return -ENOMEM; } - - mad_send_wr->header_mapping = sge[0].addr; mad_send_wr->payload_mapping = sge[1].addr; spin_lock_irqsave(&qp_info->send_queue.lock, flags); @@ -1585,17 +1578,14 @@ list_add_tail(&mad_send_wr->mad_list.list, list); } spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); - - if (!ret) - return 0; - + if (ret) { ib_dma_unmap_single(mad_agent->device, mad_send_wr->header_mapping, - sge[1].length, DMA_TO_DEVICE); -dma1_err: + sge[0].length, DMA_TO_DEVICE); ib_dma_unmap_single(mad_agent->device, mad_send_wr->payload_mapping, - sge[0].length, DMA_TO_DEVICE); + sge[1].length, DMA_TO_DEVICE); + } return ret; } @@ -1617,6 +1607,11 @@ /* Reference MAD agent until send completes */ atomic_inc(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); + if (mad_agent_priv->send_list_closed) { + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + deref_mad_agent(mad_agent_priv); + return -EIO; + } list_add_tail(&mad_send_wr->agent_list, &mad_agent_priv->send_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); @@ -1627,7 +1622,7 @@ spin_lock_irqsave(&mad_agent_priv->lock, flags); list_del(&mad_send_wr->agent_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - atomic_dec(&mad_agent_priv->refcount); + deref_mad_agent(mad_agent_priv); } return ret; @@ -1702,29 +1697,29 @@ if (ret < 0) goto error; } else { - /* Reference MAD 
agent until send completes */ - atomic_inc(&mad_agent_priv->refcount); - spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_add_tail(&mad_send_wr->agent_list, - &mad_agent_priv->send_list); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - - if (mad_agent_priv->agent.rmpp_version) { - ret = ib_send_rmpp_mad(mad_send_wr); - if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) - ret = ib_send_mad(mad_send_wr); - } else - ret = ib_send_mad(mad_send_wr); - if (ret < 0) { - /* Fail send request */ + /* Reference MAD agent until send completes */ + atomic_inc(&mad_agent_priv->refcount); spin_lock_irqsave(&mad_agent_priv->lock, flags); - list_del(&mad_send_wr->agent_list); + list_add_tail(&mad_send_wr->agent_list, + &mad_agent_priv->send_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - atomic_dec(&mad_agent_priv->refcount); - goto error; + + if (mad_agent_priv->agent.rmpp_version) { + ret = ib_send_rmpp_mad(mad_send_wr); + if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) + ret = ib_send_mad(mad_send_wr); + } else + ret = ib_send_mad(mad_send_wr); + if (ret < 0) { + /* Fail send request */ + spin_lock_irqsave(&mad_agent_priv->lock, flags); + list_del(&mad_send_wr->agent_list); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + atomic_dec(&mad_agent_priv->refcount); + goto error; + } } } - } return 0; error: if (bad_send_buf) @@ -1774,7 +1769,8 @@ int ib_process_mad_wc(struct ib_mad_agent *mad_agent, struct ib_wc *wc) { - printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n"); + dev_err(&mad_agent->device->dev, + "ib_process_mad_wc() not implemented yet\n"); return 0; } EXPORT_SYMBOL(ib_process_mad_wc); @@ -1786,7 +1782,7 @@ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { if ((*method)->agent[i]) { - printk(KERN_ERR PFX "Method %d already in use\n", i); + pr_err("Method %d already in use\n", i); return -EINVAL; } } @@ -1798,8 +1794,7 @@ /* Allocate management method table */ *method = kzalloc(sizeof **method, GFP_ATOMIC); if (!*method) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_method_table\n"); + pr_err("No memory for ib_mad_mgmt_method_table\n"); return -ENOMEM; } @@ -1894,8 +1889,8 @@ /* Allocate management class table for "new" class version */ *class = kzalloc(sizeof **class, GFP_ATOMIC); if (!*class) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_class_table\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_class_table\n"); ret = -ENOMEM; goto error1; } @@ -1961,8 +1956,8 @@ /* Allocate mgmt vendor class table for "new" class version */ vendor = kzalloc(sizeof *vendor, GFP_ATOMIC); if (!vendor) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_vendor_class_table\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_vendor_class_table\n"); goto error1; } @@ -1972,8 +1967,8 @@ /* Allocate table for this management vendor class */ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC); if (!vendor_class) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_vendor_class\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_vendor_class\n"); goto error2; } @@ -2004,7 +1999,7 @@ goto check_in_use; } } - printk(KERN_ERR PFX "All OUI slots in use\n"); + dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n"); goto error3; check_in_use: @@ -2215,9 +2210,9 @@ if (mad_agent->agent.recv_handler) atomic_inc(&mad_agent->refcount); else { - printk(KERN_NOTICE PFX "No receive handler for client " - "%p on port %d\n", - &mad_agent->agent, 
port_priv->port_num); + dev_info(&port_priv->device->dev, + "No receive handler for client %p on port %d\n", + &mad_agent->agent, port_priv->port_num); mad_agent = NULL; } } @@ -2233,8 +2228,8 @@ /* Make sure MAD base version is understood */ if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) { - printk(KERN_ERR PFX "MAD received with unsupported base " - "version %d\n", mad->mad_hdr.base_version); + pr_err("MAD received with unsupported base version %d\n", + mad->mad_hdr.base_version); goto out; } @@ -2309,7 +2304,7 @@ ((1 << lmc) - 1))); } else { if (ib_get_cached_gid(device, port_num, - attr.grh.sgid_index, &sgid)) + attr.grh.sgid_index, &sgid, NULL)) return 0; return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, 16); @@ -2486,8 +2481,8 @@ response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); if (!response) { - printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory " - "for response buffer\n"); + dev_err(&port_priv->device->dev, + "ib_mad_recv_done_handler no memory for response buffer\n"); goto out; } @@ -2754,7 +2749,8 @@ ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr, &bad_send_wr); if (ret) { - printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret); + dev_err(&port_priv->device->dev, + "ib_post_send failed: %d\n", ret); mad_send_wr = queued_send_wr; wc->status = IB_WC_LOC_QP_OP_ERR; goto retry; @@ -2826,8 +2822,9 @@ IB_QP_STATE | IB_QP_CUR_STATE); kfree(attr); if (ret) - printk(KERN_ERR PFX "mad_error_handler - " - "ib_modify_qp to RTS : %d\n", ret); + dev_err(&port_priv->device->dev, + "mad_error_handler - ib_modify_qp to RTS : %d\n", + ret); else mark_sends_for_retry(qp_info); } @@ -2835,6 +2832,10 @@ } } +enum { + IB_MAD_COMP_HANDLER_QUOTA = 100 +}; + /* * IB MAD completion callback */ @@ -2842,10 +2843,9 @@ { struct ib_mad_port_private *port_priv; struct ib_wc wc; + int quota = IB_MAD_COMP_HANDLER_QUOTA; port_priv = container_of(work, struct ib_mad_port_private, work); - ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); - while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) { if (wc.status == IB_WC_SUCCESS) { switch (wc.opcode) { @@ -2861,7 +2861,13 @@ } } else mad_error_handler(port_priv, &wc); + if (!quota--) { + if (!queue_work(port_priv->wq, &port_priv->work)) + pr_warn("%s-%d: fatal: failed to requeue work\n", __func__, __LINE__); + return; + } } + ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); } static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) @@ -2875,6 +2881,7 @@ cancel_sa_cc_mads(mad_agent_priv); spin_lock_irqsave(&mad_agent_priv->lock, flags); + mad_agent_priv->send_list_closed = 1; list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, &mad_agent_priv->send_list, agent_list) { if (mad_send_wr->status == IB_WC_SUCCESS) { @@ -2995,7 +3002,8 @@ if (local->mad_priv) { recv_mad_agent = local->recv_mad_agent; if (!recv_mad_agent) { - printk(KERN_ERR PFX "No receive MAD agent for local completion\n"); + dev_err(&mad_agent_priv->agent.device->dev, + "No receive MAD agent for local completion\n"); free_mad = 1; goto local_send_completion; } @@ -3178,7 +3186,8 @@ } else { mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); if (!mad_priv) { - printk(KERN_ERR PFX "No memory for receive buffer\n"); + dev_err(&qp_info->port_priv->device->dev, + "No memory for receive buffer\n"); ret = -ENOMEM; break; } @@ -3191,11 +3200,8 @@ if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, sg_list.addr))) { ret = -ENOMEM; - kmem_cache_free(ib_mad_cache, mad_priv); - printk(KERN_ERR PFX "ib_dma_map_single failed\n"); break; } - 
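/*
 * An illustrative sketch of the bounded completion-queue polling pattern
 * applied by the IB MAD completion callback above: handle at most a fixed
 * quota of work completions per pass, requeue the work item when the quota
 * runs out, and re-arm the CQ notification only once the queue is drained.
 * The type example_port and the helpers poll_one_completion(),
 * handle_completion(), requeue_work() and rearm_cq_notification() are
 * hypothetical placeholders for the ib_poll_cq()/queue_work() calls used in
 * mad.c, not code from this patch.
 */
static void
bounded_cq_work(struct example_port *port)
{
	struct ib_wc wc;
	int quota = 100;	/* mirrors IB_MAD_COMP_HANDLER_QUOTA */

	while (poll_one_completion(port, &wc) == 1) {
		handle_completion(port, &wc);
		if (--quota == 0) {
			/* Too much work for one pass; run again later. */
			requeue_work(port);
			return;
		}
	}
	/* Queue drained; request an interrupt for the next completion. */
	rearm_cq_notification(port);
}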
mad_priv->header.mapping = sg_list.addr; recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; @@ -3217,7 +3223,8 @@ sizeof mad_priv->header, DMA_FROM_DEVICE); kmem_cache_free(ib_mad_cache, mad_priv); - printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); + dev_err(&qp_info->port_priv->device->dev, + "ib_post_recv failed: %d\n", ret); break; } } while (post); @@ -3269,16 +3276,17 @@ int ret, i; struct ib_qp_attr *attr; struct ib_qp *qp; - u16 pkey_index = 0; + u16 pkey_index; attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) { - printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n"); + dev_err(&port_priv->device->dev, + "Couldn't kmalloc ib_qp_attr\n"); return -ENOMEM; } ret = ib_find_pkey(port_priv->device, port_priv->port_num, - 0xFFFF, &pkey_index); + IB_DEFAULT_PKEY_FULL, &pkey_index); if (ret) pkey_index = 0; @@ -3297,16 +3305,18 @@ ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "INIT: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to INIT: %d\n", + i, ret); goto out; } attr->qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, attr, IB_QP_STATE); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "RTR: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTR: %d\n", + i, ret); goto out; } @@ -3314,16 +3324,18 @@ attr->sq_psn = IB_MAD_SEND_Q_PSN; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "RTS: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTS: %d\n", + i, ret); goto out; } } ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); if (ret) { - printk(KERN_ERR PFX "Failed to request completion " - "notification: %d\n", ret); + dev_err(&port_priv->device->dev, + "Failed to request completion notification: %d\n", + ret); goto out; } @@ -3333,7 +3345,8 @@ ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL); if (ret) { - printk(KERN_ERR PFX "Couldn't post receive WRs\n"); + dev_err(&port_priv->device->dev, + "Couldn't post receive WRs\n"); goto out; } } @@ -3347,7 +3360,8 @@ struct ib_mad_qp_info *qp_info = qp_context; /* It's worse than that! He's dead, Jim! 
*/ - printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n", + dev_err(&qp_info->port_priv->device->dev, + "Fatal error (%d) on MAD QP (%d)\n", event->event, qp_info->qp->qp_num); } @@ -3393,8 +3407,9 @@ qp_init_attr.event_handler = qp_event_handler; qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr); if (IS_ERR(qp_info->qp)) { - printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n", - get_spl_qp_index(qp_type)); + dev_err(&qp_info->port_priv->device->dev, + "Couldn't create ib_mad QP%d\n", + get_spl_qp_index(qp_type)); ret = PTR_ERR(qp_info->qp); goto error; } @@ -3432,7 +3447,7 @@ /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { - printk(KERN_ERR PFX "No memory for ib_mad_port_private\n"); + dev_err(&device->dev, "No memory for ib_mad_port_private\n"); return -ENOMEM; } @@ -3452,21 +3467,21 @@ ib_mad_thread_completion_handler, NULL, port_priv, cq_size, 0); if (IS_ERR(port_priv->cq)) { - printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n"); + dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); goto error3; } port_priv->pd = ib_alloc_pd(device); if (IS_ERR(port_priv->pd)) { - printk(KERN_ERR PFX "Couldn't create ib_mad PD\n"); + dev_err(&device->dev, "Couldn't create ib_mad PD\n"); ret = PTR_ERR(port_priv->pd); goto error4; } port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(port_priv->mr)) { - printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n"); + dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n"); ret = PTR_ERR(port_priv->mr); goto error5; } @@ -3491,14 +3506,13 @@ if (sa_cc_init(&port_priv->sa_cc)) goto error9; - spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_mad_port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); ret = ib_mad_port_start(port_priv); if (ret) { - printk(KERN_ERR PFX "Couldn't start port\n"); + dev_err(&device->dev, "Couldn't start port\n"); goto error10; } @@ -3509,9 +3523,9 @@ list_del_init(&port_priv->port_list); spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); - destroy_workqueue(port_priv->wq); -error9: sa_cc_destroy(&port_priv->sa_cc); +error9: + destroy_workqueue(port_priv->wq); error8: destroy_mad_qp(&port_priv->qp_info[1]); error7: @@ -3544,7 +3558,7 @@ port_priv = __ib_get_mad_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); - printk(KERN_ERR PFX "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del_init(&port_priv->port_list); @@ -3583,14 +3597,12 @@ for (i = start; i <= end; i++) { if (ib_mad_port_open(device, i)) { - printk(KERN_ERR PFX "Couldn't open %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't open port %d\n", i); goto error; } if (ib_agent_port_open(device, i)) { - printk(KERN_ERR PFX "Couldn't open %s port %d " - "for agents\n", - device->name, i); + dev_err(&device->dev, + "Couldn't open port %d for agents\n", i); goto error_agent; } } @@ -3598,20 +3610,17 @@ error_agent: if (ib_mad_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't close port %d\n", i); error: i--; while (i >= start) { if (ib_agent_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d " - "for agents\n", - device->name, i); + dev_err(&device->dev, + "Couldn't close port %d for agents\n", i); if (ib_mad_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close 
%s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't close port %d\n", i); i--; } } @@ -3632,12 +3641,12 @@ } for (i = 0; i < num_ports; i++, cur_port++) { if (ib_agent_port_close(device, cur_port)) - printk(KERN_ERR PFX "Couldn't close %s port %d " - "for agents\n", - device->name, cur_port); + dev_err(&device->dev, + "Couldn't close port %d for agents\n", + cur_port); if (ib_mad_port_close(device, cur_port)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, cur_port); + dev_err(&device->dev, "Couldn't close port %d\n", + cur_port); } } @@ -3663,7 +3672,7 @@ SLAB_HWCACHE_ALIGN, NULL); if (!ib_mad_cache) { - printk(KERN_ERR PFX "Couldn't create ib_mad cache\n"); + pr_err("Couldn't create ib_mad cache\n"); ret = -ENOMEM; goto error1; } @@ -3671,7 +3680,7 @@ INIT_LIST_HEAD(&ib_mad_port_list); if (ib_register_client(&mad_client)) { - printk(KERN_ERR PFX "Couldn't register ib_mad client\n"); + pr_err("Couldn't register ib_mad client\n"); ret = -EINVAL; goto error2; } Index: sys/ofed/drivers/infiniband/core/mad_priv.h =================================================================== --- sys/ofed/drivers/infiniband/core/mad_priv.h +++ sys/ofed/drivers/infiniband/core/mad_priv.h @@ -42,14 +42,11 @@ #include #include - -#define PFX "ib_mad: " - #define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */ /* QP and CQ parameters */ #define IB_MAD_QP_SEND_SIZE 128 -#define IB_MAD_QP_RECV_SIZE 512 +#define IB_MAD_QP_RECV_SIZE 4096 #define IB_MAD_QP_MIN_SIZE 64 #define IB_MAD_QP_MAX_SIZE 8192 #define IB_MAD_SEND_REQ_MAX_SG 2 @@ -110,6 +107,7 @@ atomic_t refcount; struct completion comp; + int send_list_closed; }; struct ib_mad_snoop_private { Index: sys/ofed/drivers/infiniband/core/multicast.c =================================================================== --- sys/ofed/drivers/infiniband/core/multicast.c +++ sys/ofed/drivers/infiniband/core/multicast.c @@ -40,23 +40,11 @@ #include #include #include -#include #include #include #include "sa.h" -static int mcast_leave_retries = 3; - -/*static const struct kernel_param_ops retry_ops = { - .set = param_set_int, - .get = param_get_int, -}; - -module_param_cb(mcast_leave_retries, &retry_ops, &mcast_leave_retries, 0644); -MODULE_PARM_DESC(mcast_leave_retries, "Number of retries for multicast leave " - "requests before giving up (default: 3)"); -*/ static void mcast_add_one(struct ib_device *device); static void mcast_remove_one(struct ib_device *device); @@ -308,8 +296,8 @@ if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, - IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, - src->mtu, dst->mtu)) + IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, + src->mtu, dst->mtu)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && src->traffic_class != dst->traffic_class) @@ -317,14 +305,14 @@ if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) return -EINVAL; if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, - IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, - src->rate, dst->rate)) + IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, + src->rate, dst->rate)) return -EINVAL; if (check_selector(comp_mask, - IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, - IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, - dst->packet_life_time_selector, - src->packet_life_time, dst->packet_life_time)) + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + dst->packet_life_time_selector, + 
src->packet_life_time, dst->packet_life_time)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl) return -EINVAL; @@ -352,7 +340,7 @@ port->port_num, IB_MGMT_METHOD_SET, &member->multicast.rec, member->multicast.comp_mask, - 3000, GFP_KERNEL, join_handler, group, + 1000, 3, GFP_KERNEL, join_handler, group, &group->query); if (ret >= 0) { group->query_id = ret; @@ -376,7 +364,7 @@ IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE, - 3000, GFP_KERNEL, leave_handler, + 1000, 3, GFP_KERNEL, leave_handler, group, &group->query); if (ret >= 0) { group->query_id = ret; @@ -540,17 +528,22 @@ if (status) process_join_error(group, status); else { + int mgids_changed, is_mgid0; ib_find_pkey(group->port->dev->device, group->port->port_num, be16_to_cpu(rec->pkey), &pkey_index); spin_lock_irq(&group->port->lock); - group->rec = *rec; if (group->state == MCAST_BUSY && group->pkey_index == MCAST_INVALID_PKEY_INDEX) group->pkey_index = pkey_index; - if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) { + mgids_changed = memcmp(&rec->mgid, &group->rec.mgid, + sizeof(group->rec.mgid)); + group->rec = *rec; + if (mgids_changed) { rb_erase(&group->node, &group->port->table); - mcast_insert(group->port, group, 1); + is_mgid0 = !memcmp(&mgid0, &group->rec.mgid, + sizeof(mgid0)); + mcast_insert(group->port, group, is_mgid0); } spin_unlock_irq(&group->port->lock); } @@ -565,12 +558,8 @@ if (status && group->retries > 0 && !send_leave(group, group->leave_state)) group->retries--; - else { - if (status && group->retries <= 0) - printk(KERN_WARNING "reached max retry count. " - "status=%d. Giving up\n", status); + else mcast_work_handler(&group->work); - } } static struct mcast_group *acquire_group(struct mcast_port *port, @@ -593,7 +582,7 @@ if (!group) return NULL; - group->retries = mcast_leave_retries; + group->retries = 3; group->port = port; group->rec.mgid = *mgid; group->pkey_index = MCAST_INVALID_PKEY_INDEX; @@ -743,7 +732,22 @@ u16 gid_index; u8 p; - ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index); + switch (rdma_port_get_link_layer(device, port_num)) { + case IB_LINK_LAYER_ETHERNET: + ret = ib_find_cached_gid_by_port(device, &rec->port_gid, + rec->gid_type, port_num, + rec->net, rec->ifindex, + &gid_index); + break; + case IB_LINK_LAYER_INFINIBAND: + ret = ib_find_cached_gid(device, &rec->port_gid, + IB_GID_TYPE_IB, NULL, 0, &p, + &gid_index); + break; + default: + ret = -EINVAL; + } + if (ret) return ret; Index: sys/ofed/drivers/infiniband/core/peer_mem.c =================================================================== --- sys/ofed/drivers/infiniband/core/peer_mem.c +++ sys/ofed/drivers/infiniband/core/peer_mem.c @@ -224,11 +224,15 @@ } /* access to that peer client is under its lock - no extra lock is needed */ -unsigned long ib_peer_insert_context(struct ib_peer_memory_client *ib_peer_client, - void *context) +int ib_peer_insert_context(struct ib_peer_memory_client *ib_peer_client, + void *context, + unsigned long *context_ticket) { struct core_ticket *core_ticket = kzalloc(sizeof(*core_ticket), GFP_KERNEL); + if (!core_ticket) + return -ENOMEM; + ib_peer_client->last_ticket++; core_ticket->context = context; core_ticket->key = ib_peer_client->last_ticket; @@ -236,7 +240,8 @@ list_add_tail(&core_ticket->ticket_list, &ib_peer_client->core_ticket_list); - return core_ticket->key; + *context_ticket = core_ticket->key; + return 0; } int ib_peer_remove_context(struct ib_peer_memory_client *ib_peer_client, Index: 
sys/ofed/drivers/infiniband/core/roce_gid_cache.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/roce_gid_cache.c @@ -0,0 +1,814 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "core_priv.h" + +union ib_gid zgid; +EXPORT_SYMBOL_GPL(zgid); + +static const struct ib_gid_attr zattr; + +enum gid_attr_find_mask { + GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 0, + GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, +}; + +static const char * const gid_type_str[] = { + [IB_GID_TYPE_IB] = "IB/RoCE V1", + [IB_GID_TYPE_ROCE_V2] = "RoCE V2", + [IB_GID_TYPE_ROCE_V1_5] = "RoCE V1.5", +}; + +static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) +{ + if (dev->if_addrlen != ETH_ALEN) + return -1; + memcpy(eui, IF_LLADDR(dev), 3); + memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); + + /* NOTE: The scope ID is added by the GID to IP conversion */ + + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + return 0; +} + +static inline int start_port(struct ib_device *ib_dev) +{ + return (ib_dev->node_type == RDMA_NODE_IB_SWITCH) ? 
0 : 1; +} + +struct dev_put_rcu { + struct rcu_head rcu; + struct net_device *ndev; +}; + +const char *roce_gid_cache_type_str(enum ib_gid_type gid_type) +{ + if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) + return gid_type_str[gid_type]; + + return "Invalid GID type"; +} +EXPORT_SYMBOL_GPL(roce_gid_cache_type_str); + +int roce_gid_cache_parse_gid_str(const char *buf) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) + if (gid_type_str[i] && !strcmp(buf, gid_type_str[i])) + return i; + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(roce_gid_cache_parse_gid_str); + +static void put_ndev(struct rcu_head *rcu) +{ + struct dev_put_rcu *put_rcu = + container_of(rcu, struct dev_put_rcu, rcu); + + dev_put(put_rcu->ndev); + kfree(put_rcu); +} + +static int write_gid(struct ib_device *ib_dev, u8 port, + struct ib_roce_gid_cache *cache, int ix, + const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + unsigned int orig_seq; + int ret; + struct dev_put_rcu *put_rcu; + struct net_device *old_net_dev; + + orig_seq = cache->data_vec[ix].seq; + cache->data_vec[ix].seq = -1; + /* Ensure that all readers will see invalid sequence + * identifier before starting the actual GID update. + */ + wmb(); + + ret = ib_dev->modify_gid(ib_dev, port, ix, gid, attr, + &cache->data_vec[ix].context); + + old_net_dev = cache->data_vec[ix].attr.ndev; + if (old_net_dev && old_net_dev != attr->ndev) { + put_rcu = kmalloc(sizeof(*put_rcu), GFP_KERNEL); + if (put_rcu) { + put_rcu->ndev = old_net_dev; + call_rcu(&put_rcu->rcu, put_ndev); + } else { + pr_warn("roce_gid_cache: can't allocate rcu context, using synchronize\n"); + synchronize_rcu(); + dev_put(old_net_dev); + } + } + /* if modify_gid failed, just delete the old gid */ + if (ret || !memcmp(gid, &zgid, sizeof(*gid))) { + gid = &zgid; + attr = &zattr; + cache->data_vec[ix].context = NULL; + } + memcpy(&cache->data_vec[ix].gid, gid, sizeof(*gid)); + memcpy(&cache->data_vec[ix].attr, attr, sizeof(*attr)); + if (cache->data_vec[ix].attr.ndev && + cache->data_vec[ix].attr.ndev != old_net_dev) + dev_hold(cache->data_vec[ix].attr.ndev); + + /* Ensure that all cached gid data updating is finished before + * marking the entry as available. + */ + wmb(); + + if (++orig_seq == (unsigned int)-1) + orig_seq = 0; + ACCESS_ONCE(cache->data_vec[ix].seq) = orig_seq; + + if (!ret) { + struct ib_event event; + + event.device = ib_dev; + event.element.port_num = port; + event.event = IB_EVENT_GID_CHANGE; + + ib_dispatch_event(&event); + } + return ret; +} + +static int find_gid(struct ib_roce_gid_cache *cache, union ib_gid *gid, + const struct ib_gid_attr *val, unsigned long mask) +{ + int i; + unsigned int orig_seq; + + for (i = 0; i < cache->sz; i++) { + struct ib_gid_attr *attr = &cache->data_vec[i].attr; + + orig_seq = cache->data_vec[i].seq; + if (orig_seq == -1U) + continue; + /* Make sure the sequence number we remeber was read + * before the gid cache entry content is read. + */ + rmb(); + + if (mask & GID_ATTR_FIND_MASK_GID_TYPE && + attr->gid_type != val->gid_type) + continue; + + if (memcmp(gid, &cache->data_vec[i].gid, sizeof(*gid))) + continue; + + if (mask & GID_ATTR_FIND_MASK_NETDEV && + attr->ndev != val->ndev) + continue; + + /* We have a match, verify that the data we + * compared is valid. Make sure that the + * sequence number we read is the last to be + * read. + */ + rmb(); + if (orig_seq == ACCESS_ONCE(cache->data_vec[i].seq)) + return i; + /* The sequence number changed under our feet, + * the GID entry is invalid. 
Continue to the + * next entry. + */ + } + + return -1; +} + +static void make_default_gid(struct net_device *dev, union ib_gid *gid) +{ + gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); + addrconf_ifid_eui48(&gid->raw[8], dev); +} + +int roce_add_gid(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct ib_roce_gid_cache *cache; + int ix; + int ret = 0; + struct net_device *idev; + + if (!ib_dev->cache.roce_gid_cache) + return -ENOSYS; + + cache = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)]; + + if (!cache || !cache->active) + return -ENOSYS; + + if (!memcmp(gid, &zgid, sizeof(*gid))) + return -EINVAL; + + if (ib_dev->get_netdev) { + rcu_read_lock(); + idev = ib_dev->get_netdev(ib_dev, port); + if (idev && attr->ndev != idev) { + union ib_gid default_gid; + + /* Adding default GIDs is not permitted */ + make_default_gid(idev, &default_gid); + if (!memcmp(gid, &default_gid, sizeof(*gid))) { + rcu_read_unlock(); + return -EPERM; + } + } + rcu_read_unlock(); + } + + mutex_lock(&cache->lock); + + ix = find_gid(cache, gid, attr, GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV); + if (ix >= 0) + goto out_unlock; + + ix = find_gid(cache, &zgid, NULL, 0); + if (ix < 0) { + ret = -ENOSPC; + goto out_unlock; + } + + write_gid(ib_dev, port, cache, ix, gid, attr); + +out_unlock: + mutex_unlock(&cache->lock); + return ret; +} + +int roce_del_gid(struct ib_device *ib_dev, u8 port, + union ib_gid *gid, struct ib_gid_attr *attr) +{ + struct ib_roce_gid_cache *cache; + union ib_gid default_gid; + int ix; + + if (!ib_dev->cache.roce_gid_cache) + return 0; + + cache = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)]; + + if (!cache || !cache->active) + return -ENOSYS; + + if (attr->ndev) { + /* Deleting default GIDs is not permitted */ + make_default_gid(attr->ndev, &default_gid); + if (!memcmp(gid, &default_gid, sizeof(*gid))) + return -EPERM; + } + + mutex_lock(&cache->lock); + + ix = find_gid(cache, gid, attr, + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV); + if (ix < 0) + goto out_unlock; + + write_gid(ib_dev, port, cache, ix, &zgid, &zattr); + +out_unlock: + mutex_unlock(&cache->lock); + return 0; +} + +int roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct net_device *ndev) +{ + struct ib_roce_gid_cache *cache; + int ix; + + if (!ib_dev->cache.roce_gid_cache) + return 0; + + cache = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)]; + + if (!cache || !cache->active) + return -ENOSYS; + + mutex_lock(&cache->lock); + + for (ix = 0; ix < cache->sz; ix++) + if (cache->data_vec[ix].attr.ndev == ndev) + write_gid(ib_dev, port, cache, ix, &zgid, &zattr); + + mutex_unlock(&cache->lock); + return 0; +} + +int roce_sync_all_netdev_gids(struct ib_device *ib_dev, u8 port, + struct list_head *list) +{ + struct ib_roce_gid_cache *cache; + int ix; + + if (!ib_dev->cache.roce_gid_cache) + return 0; + + cache = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)]; + + if (!cache || !cache->active) + return -ENOSYS; + + mutex_lock(&cache->lock); + + for (ix = 0; ix < cache->sz; ix++) { + bool found = false; + struct roce_netdev_list *entry; + + list_for_each_entry(entry, list, list) { + if (cache->data_vec[ix].attr.ndev == entry->ndev) { + found = true; + break; + } + } + if (!found) + write_gid(ib_dev, port, cache, ix, &zgid, &zattr); + } + + mutex_unlock(&cache->lock); + return 0; +} + +int roce_gid_cache_get_gid(struct ib_device *ib_dev, u8 port, int index, + union ib_gid *gid, struct 
ib_gid_attr *attr) +{ + struct ib_roce_gid_cache *cache; + union ib_gid local_gid; + struct ib_gid_attr local_attr; + unsigned int orig_seq; + + if (!ib_dev->cache.roce_gid_cache) + return -EINVAL; + + cache = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)]; + + if (!cache || !cache->active) + return -ENOSYS; + + if (index < 0 || index >= cache->sz) + return -EINVAL; + + orig_seq = ACCESS_ONCE(cache->data_vec[index].seq); + /* Make sure we read the sequence number before copying the + * gid to local storage. */ + rmb(); + + memcpy(&local_gid, &cache->data_vec[index].gid, sizeof(local_gid)); + memcpy(&local_attr, &cache->data_vec[index].attr, sizeof(local_attr)); + /* Ensure the local copy completed reading before verifying + * the new sequence number. */ + rmb(); + + if (orig_seq == -1 || + orig_seq != ACCESS_ONCE(cache->data_vec[index].seq)) + return -EAGAIN; + + memcpy(gid, &local_gid, sizeof(*gid)); + if (attr) + memcpy(attr, &local_attr, sizeof(*attr)); + return 0; +} + +static int _roce_gid_cache_find_gid(struct ib_device *ib_dev, union ib_gid *gid, + const struct ib_gid_attr *val, + unsigned long mask, + u8 *port, u16 *index) +{ + struct ib_roce_gid_cache *cache; + u8 p; + int local_index; + + if (!ib_dev->cache.roce_gid_cache) + return -ENOENT; + + for (p = 0; p < ib_dev->phys_port_cnt; p++) { + if (rdma_port_get_link_layer(ib_dev, p + start_port(ib_dev)) != + IB_LINK_LAYER_ETHERNET) + continue; + cache = ib_dev->cache.roce_gid_cache[p]; + if (!cache || !cache->active) + continue; + local_index = find_gid(cache, gid, val, mask); + if (local_index >= 0) { + if (index) + *index = local_index; + if (port) + *port = p + start_port(ib_dev); + return 0; + } + } + + return -ENOENT; +} + +static int get_netdev_from_ifindex(struct net *net, int if_index, + struct ib_gid_attr *gid_attr_val) +{ + if (if_index && net) { + rcu_read_lock(); + gid_attr_val->ndev = dev_get_by_index(net, if_index); + rcu_read_unlock(); + if (gid_attr_val->ndev) + return GID_ATTR_FIND_MASK_NETDEV; + } + return 0; +} + +int roce_gid_cache_find_gid(struct ib_device *ib_dev, union ib_gid *gid, + enum ib_gid_type gid_type, struct net *net, + int if_index, u8 *port, u16 *index) +{ + unsigned long mask = GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr gid_attr_val = {.gid_type = gid_type}; + + mask |= get_netdev_from_ifindex(net, if_index, &gid_attr_val); + + return _roce_gid_cache_find_gid(ib_dev, gid, &gid_attr_val, + mask, port, index); +} + +int roce_gid_cache_find_gid_by_port(struct ib_device *ib_dev, union ib_gid *gid, + enum ib_gid_type gid_type, u8 port, + struct net *net, int if_index, u16 *index) +{ + int local_index; + struct ib_roce_gid_cache *cache; + unsigned long mask = GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr val = {.gid_type = gid_type}; + + if (!ib_dev->cache.roce_gid_cache || port < start_port(ib_dev) || + port >= (start_port(ib_dev) + ib_dev->phys_port_cnt)) + return -ENOENT; + + cache = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)]; + if (!cache || !cache->active) + return -ENOENT; + + mask |= get_netdev_from_ifindex(net, if_index, &val); + + local_index = find_gid(cache, gid, &val, mask); + if (local_index >= 0) { + if (index) + *index = local_index; + return 0; + } + + return -ENOENT; +} + +int roce_gid_cache_find_gid_by_filter(struct ib_device *ib_dev, + union ib_gid *gid, + u8 port, + bool (*filter)(const union ib_gid *, + const struct ib_gid_attr *, + void *), + void *context, + u16 *index) +{ + struct ib_roce_gid_cache *cache; + unsigned int i; + bool found = false; + 
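/*
 * An illustrative sketch of the sequence-validated, lock-free read used by
 * roce_gid_cache_get_gid() and find_gid() above: a writer sets the per-entry
 * sequence number to -1 while updating and bumps it afterwards; a reader
 * copies the entry between two reads of the sequence number and only trusts
 * the copy when the number was valid and unchanged.  The type example_entry
 * and the helper read_entry_consistent() are hypothetical stand-ins for the
 * cache's data_vec entries, not code from this patch.
 */
struct example_entry {
	unsigned int	seq;	/* -1U while a writer is updating the entry */
	union ib_gid	gid;
};

static int
read_entry_consistent(const struct example_entry *e, union ib_gid *out)
{
	unsigned int seq;

	seq = ACCESS_ONCE(e->seq);
	if (seq == -1U)
		return (-EAGAIN);
	rmb();			/* read seq before the entry contents */
	*out = e->gid;
	rmb();			/* finish the copy before re-checking seq */
	if (seq != ACCESS_ONCE(e->seq))
		return (-EAGAIN);	/* raced with a writer; caller retries */
	return (0);
}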
+ if (!ib_dev->cache.roce_gid_cache) + return -ENOSYS; + + if (port < start_port(ib_dev) || + port > start_port(ib_dev) + ib_dev->phys_port_cnt || + rdma_port_get_link_layer(ib_dev, port) != + IB_LINK_LAYER_ETHERNET) + return -ENOSYS; + + cache = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)]; + + if (!cache || !cache->active) + return -ENOENT; + + for (i = 0; i < cache->sz; i++) { + unsigned int orig_seq; + struct ib_gid_attr attr; + + orig_seq = cache->data_vec[i].seq; + if (orig_seq == -1) + continue; + /* Make sure the sequence number we remeber was read + * before the gid cache entry content is read. + */ + rmb(); + + if (memcmp(gid, &cache->data_vec[i].gid, sizeof(*gid))) + continue; + + memcpy(&attr, &cache->data_vec[i].attr, sizeof(attr)); + + rcu_read_lock(); + + /* Make sure we finished reading the attribute */ + rmb(); + if (orig_seq == ACCESS_ONCE(cache->data_vec[i].seq)) + if (!filter || filter(gid, &attr, context)) + found = true; + + rcu_read_unlock(); + + if (found) + break; + } + + if (!found) + return -ENOENT; + + if (index) + *index = i; + return 0; +} + +static struct ib_roce_gid_cache *alloc_roce_gid_cache(int sz) +{ + struct ib_roce_gid_cache *cache = + kzalloc(sizeof(struct ib_roce_gid_cache), GFP_KERNEL); + if (!cache) + return NULL; + + cache->data_vec = kcalloc(sz, sizeof(*cache->data_vec), GFP_KERNEL); + if (!cache->data_vec) + goto err_free_cache; + + mutex_init(&cache->lock); + + cache->sz = sz; + + return cache; + +err_free_cache: + kfree(cache); + return NULL; +} + +static void free_roce_gid_cache(struct ib_device *ib_dev, u8 port) +{ + int i; + struct ib_roce_gid_cache *cache = ib_dev->cache.roce_gid_cache[port - 1]; + + if (!cache) + return; + + for (i = 0; i < cache->sz; ++i) { + if (memcmp(&cache->data_vec[i].gid, &zgid, + sizeof(cache->data_vec[i].gid))) + write_gid(ib_dev, port, cache, i, &zgid, &zattr); + } + kfree(cache->data_vec); + kfree(cache); +} + +static void set_roce_gid_cache_active(struct ib_roce_gid_cache *cache, + int active) +{ + if (!cache) + return; + + cache->active = active; +} + +int roce_gid_cache_set_default_gid(struct ib_device *ib_dev, u8 port, + struct net_device *ndev, + unsigned long gid_type_mask, + enum roce_gid_cache_default_mode mode) +{ + union ib_gid gid; + struct ib_gid_attr gid_attr; + struct ib_roce_gid_cache *cache; + unsigned int gid_type; + unsigned int gid_index = 0; + + cache = ib_dev->cache.roce_gid_cache[port - 1]; + + if (!cache) + goto done; + + make_default_gid(ndev, &gid); + memset(&gid_attr, 0, sizeof(gid_attr)); + gid_attr.ndev = ndev; + for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { + union ib_gid current_gid; + struct ib_gid_attr current_gid_attr; + + if (1UL << gid_type & ~gid_type_mask) + continue; + + gid_attr.gid_type = gid_type; + + if (!roce_gid_cache_get_gid(ib_dev, port, gid_index, + ¤t_gid, ¤t_gid_attr) && + !memcmp(&gid, ¤t_gid, sizeof(gid)) && + !memcmp(&gid_attr, ¤t_gid_attr, sizeof(gid_attr)) && + mode == ROCE_GID_CACHE_DEFAULT_MODE_SET) { + ++gid_index; /* XXX bugfix */ + continue; + } + + mutex_lock(&cache->lock); + if (write_gid(ib_dev, port, cache, gid_index, &zgid, &zattr)) { + pr_warn("roce_gid_cache: can't delete index %d for default gid %pI6\n", + gid_index, gid.raw); + mutex_unlock(&cache->lock); + ++gid_index; + continue; + } + + if (mode == ROCE_GID_CACHE_DEFAULT_MODE_SET && + write_gid(ib_dev, port, cache, gid_index, &gid, &gid_attr)) + pr_warn("roce_gid_cache: unable to add default gid %pI6\n", + gid.raw); + + mutex_unlock(&cache->lock); + ++gid_index; + 
} +done: + return (gid_index); +} + +static int roce_gid_cache_setup_one(struct ib_device *ib_dev) +{ + u8 port; + int err = 0; + + if (!ib_dev->modify_gid) + return -ENOSYS; + + ib_dev->cache.roce_gid_cache = + kcalloc(ib_dev->phys_port_cnt, + sizeof(*ib_dev->cache.roce_gid_cache), GFP_KERNEL); + + if (!ib_dev->cache.roce_gid_cache) { + pr_warn("failed to allocate roce addr cache for %s\n", + ib_dev->name); + return -ENOMEM; + } + + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + if (rdma_port_get_link_layer(ib_dev, port + start_port(ib_dev)) + != IB_LINK_LAYER_ETHERNET) + continue; + ib_dev->cache.roce_gid_cache[port] = + alloc_roce_gid_cache(ib_dev->gid_tbl_len[port]); + if (!ib_dev->cache.roce_gid_cache[port]) { + err = -ENOMEM; + goto rollback_cache_setup; + } + } + return 0; + +rollback_cache_setup: + for (port = 1; port <= ib_dev->phys_port_cnt; port++) + free_roce_gid_cache(ib_dev, port); + + kfree(ib_dev->cache.roce_gid_cache); + ib_dev->cache.roce_gid_cache = NULL; + return err; +} + +static void roce_gid_cache_cleanup_one(struct ib_device *ib_dev) +{ + u8 port; + + if (!ib_dev->cache.roce_gid_cache) + return; + + for (port = 1; port <= ib_dev->phys_port_cnt; port++) + free_roce_gid_cache(ib_dev, port); + + kfree(ib_dev->cache.roce_gid_cache); + ib_dev->cache.roce_gid_cache = NULL; +} + +static void roce_gid_cache_set_active_state(struct ib_device *ib_dev, + int active) +{ + u8 port; + + if (!ib_dev->cache.roce_gid_cache) + return; + + for (port = 0; port < ib_dev->phys_port_cnt; port++) + set_roce_gid_cache_active(ib_dev->cache.roce_gid_cache[port], + active); +} + +int roce_gid_cache_is_active(struct ib_device *ib_dev, u8 port) +{ + return ib_dev->cache.roce_gid_cache && + ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)]->active; +} + +static void roce_gid_cache_client_setup_one(struct ib_device *ib_dev) +{ + if (!roce_gid_cache_setup_one(ib_dev)) { + roce_gid_cache_set_active_state(ib_dev, 1); + if (roce_rescan_device(ib_dev)) { + roce_gid_cache_set_active_state(ib_dev, 0); + roce_gid_cache_cleanup_one(ib_dev); + } + } +} + +static void roce_gid_cache_client_cleanup_work_handler(struct work_struct *work) +{ + struct ib_cache *ib_cache = container_of(work, struct ib_cache, + roce_gid_cache_cleanup_work); + struct ib_device *ib_dev = container_of(ib_cache, struct ib_device, + cache); + + /* Make sure no gid update task is still referencing this device */ + flush_workqueue(roce_gid_mgmt_wq); + + /* No need to flush the system wq, even though we use it in + * roce_rescan_device because we are guaranteed to run this + * on the system_wq after roce_rescan_device. 
+ */ + + roce_gid_cache_cleanup_one(ib_dev); + ib_device_put(ib_dev); +} + +static void roce_gid_cache_client_cleanup_one_work(struct ib_device *ib_dev) +{ + ib_device_hold(ib_dev); + INIT_WORK(&ib_dev->cache.roce_gid_cache_cleanup_work, + roce_gid_cache_client_cleanup_work_handler); + schedule_work(&ib_dev->cache.roce_gid_cache_cleanup_work); +} + +static void roce_gid_cache_client_cleanup_one(struct ib_device *ib_dev) +{ + roce_gid_cache_set_active_state(ib_dev, 0); + roce_gid_cache_client_cleanup_one_work(ib_dev); +} + +static struct ib_client cache_client = { + .name = "roce_gid_cache", + .add = roce_gid_cache_client_setup_one, + .remove = roce_gid_cache_client_cleanup_one +}; + +int __init roce_gid_cache_setup(void) +{ + roce_gid_mgmt_init(); + + return ib_register_client(&cache_client); +} + +void __exit roce_gid_cache_cleanup(void) +{ + ib_unregister_client(&cache_client); + + roce_gid_mgmt_cleanup(); + + rcu_barrier(); +} Index: sys/ofed/drivers/infiniband/core/roce_gid_mgmt.c =================================================================== --- /dev/null +++ sys/ofed/drivers/infiniband/core/roce_gid_mgmt.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "core_priv.h" + +#include +#include +#include + +#include +#include + +#include + +struct workqueue_struct *roce_gid_mgmt_wq; + +enum gid_op_type { + GID_DEL = 0, + GID_ADD +}; + +struct roce_gid_scan_event_work { + struct work_struct work; + struct net_device *ndev; +}; + +struct roce_rescan_work { + struct work_struct work; + struct ib_device *ib_dev; +}; + +static const struct { + int flag_mask; + enum ib_gid_type gid_type; +} PORT_CAP_TO_GID_TYPE[] = { + {IB_PORT_ROCE_V2, IB_GID_TYPE_ROCE_V2}, + {IB_PORT_ROCE, IB_GID_TYPE_IB}, + {IB_PORT_ROCE_V1_5, IB_GID_TYPE_ROCE_V1_5}, +}; + +#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) + +static unsigned long gid_type_mask_support(struct ib_device *ib_dev, u8 port) +{ + struct ib_port_attr pattr; + int i; + int err; + unsigned int ret_flags = 0; + + err = ib_query_port(ib_dev, port, &pattr); + if (err) { + pr_warn("update_gid: ib_query_port() failed for %s, %d\n", + ib_dev->name, err); + return 0; + } + + for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) + if (pattr.port_cap_flags & PORT_CAP_TO_GID_TYPE[i].flag_mask) + ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; + + return ret_flags; +} + +static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, + u8 port, union ib_gid *gid, struct net_device *ndev) +{ + int i; + unsigned long gid_type_mask = gid_type_mask_support(ib_dev, port); + struct ib_gid_attr gid_attr; + + memset(&gid_attr, 0, sizeof(gid_attr)); + gid_attr.ndev = ndev; + + for (i = 0; i < IB_GID_TYPE_SIZE; i++) { + if ((1UL << i) & gid_type_mask) { + gid_attr.gid_type = i; + switch (gid_op) { + case GID_ADD: + roce_add_gid(ib_dev, port, + gid, &gid_attr); + break; + case GID_DEL: + roce_del_gid(ib_dev, port, + gid, &gid_attr); + break; + default: + break; + } + } + } +} + +static int +roce_gid_match_netdev(struct ib_device *ib_dev, u8 port, + struct net_device *idev, void *cookie) +{ + struct net_device *ndev = (struct net_device *)cookie; + if (idev == NULL) + return (0); + return (ndev == idev); +} + +static int +roce_gid_match_all(struct ib_device *ib_dev, u8 port, + struct net_device *idev, void *cookie) +{ + if (idev == NULL) + return (0); + return (1); +} + +static int +roce_gid_enum_netdev_default(struct ib_device *ib_dev, + u8 port, struct net_device *idev) +{ + unsigned long gid_type_mask; + + gid_type_mask = gid_type_mask_support(ib_dev, port); + + return roce_gid_cache_set_default_gid(ib_dev, port, idev, gid_type_mask, + ROCE_GID_CACHE_DEFAULT_MODE_SET); +} + +#define ETH_IPOIB_DRV_NAME "ib" + +static inline int +is_eth_ipoib_intf(struct net_device *dev) +{ + if (strncmp(dev->if_dname, ETH_IPOIB_DRV_NAME, 32)) + return 0; + return 1; +} + +static void +roce_gid_update_addr_callback(struct ib_device *device, u8 port, + struct net_device *ndev, void *cookie) +{ + struct ipx_entry { + TAILQ_ENTRY(ipx_entry) entry; + union ipx_addr { + struct sockaddr sa[0]; + struct sockaddr_in v4; + struct sockaddr_in6 v6; + } ipx_addr; + }; + struct ipx_entry *entry; + struct net_device *idev; + struct ifaddr *ifa; + union ib_gid gid; + int default_gids; + u32 scope_id; + u16 index_num; + int i; + + TAILQ_HEAD(, ipx_entry) ipx_head; + + TAILQ_INIT(&ipx_head); + + /* make sure default GIDs are in */ + default_gids = roce_gid_enum_netdev_default(device, port, ndev); + + scope_id = rdma_get_ipv6_scope_id(device, port); + + IFNET_RLOCK(); + TAILQ_FOREACH(idev, &V_ifnet, if_link) { + if (idev != ndev) { + if (idev->if_type != IFT_L2VLAN) + continue; + if (ndev != rdma_vlan_dev_real_dev(idev)) + 
continue; + } + + /* clone address information for IPv4 and IPv6 */ + IF_ADDR_RLOCK(idev); +#if defined(INET) + TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL || + ifa->ifa_addr->sa_family != AF_INET) + continue; + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + pr_warn("roce_gid_update_addr_callback: " + "couldn't allocate entry for IPv4 update\n"); + continue; + } + entry->ipx_addr.v4 = *((struct sockaddr_in *)ifa->ifa_addr); + TAILQ_INSERT_TAIL(&ipx_head, entry, entry); + } +#endif +#if defined(INET6) + TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL || + ifa->ifa_addr->sa_family != AF_INET6) + continue; + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + pr_warn("roce_gid_update_addr_callback: " + "couldn't allocate entry for IPv6 update\n"); + continue; + } + entry->ipx_addr.v6 = *((struct sockaddr_in6 *)ifa->ifa_addr); + sa6_recoverscope(&entry->ipx_addr.v6); + TAILQ_INSERT_TAIL(&ipx_head, entry, entry); + } +#endif + IF_ADDR_RUNLOCK(idev); + } + IFNET_RUNLOCK(); + + /* add missing GIDs, if any */ + TAILQ_FOREACH(entry, &ipx_head, entry) { + unsigned long gid_type_mask = gid_type_mask_support(device, port); + + if (rdma_ip2gid(&entry->ipx_addr.sa[0], &gid) != 0) + continue; + + for (i = 0; i < IB_GID_TYPE_SIZE; i++) { + if (!((1UL << i) & gid_type_mask)) + continue; + /* check if entry found */ + if (ib_find_cached_gid_by_port(device, &gid, i, + port, &init_net, ndev->if_index, &index_num) == 0) + break; + } + if (i != IB_GID_TYPE_SIZE) + continue; + /* add new GID */ + update_gid(GID_ADD, device, port, &gid, ndev); + } + + /* remove stale GIDs, if any */ + for (i = default_gids; ib_get_cached_gid(device, port, i, &gid, NULL) == 0; i++) { + union ipx_addr ipx; + + /* don't delete empty entries */ + if (memcmp(&gid, &zgid, sizeof(zgid)) == 0) + continue; + + /* zero default */ + memset(&ipx, 0, sizeof(ipx)); + + if (rdma_gid2ip(&ipx.sa[0], &gid, scope_id) != 0) + continue; + + TAILQ_FOREACH(entry, &ipx_head, entry) { + if (memcmp(&entry->ipx_addr, &ipx, sizeof(ipx)) == 0) + break; + } + /* check if entry found */ + if (entry != NULL) + continue; + /* remove GID */ + update_gid(GID_DEL, device, port, &gid, ndev); + } + + while ((entry = TAILQ_FIRST(&ipx_head))) { + TAILQ_REMOVE(&ipx_head, entry, entry); + kfree(entry); + } +} + +static void +roce_gid_queue_scan_event_handler(struct work_struct *_work) +{ + struct roce_gid_scan_event_work *work = + container_of(_work, struct roce_gid_scan_event_work, work); + + ib_enum_roce_ports_of_netdev(roce_gid_match_netdev, work->ndev, + roce_gid_update_addr_callback, NULL); + + dev_put(work->ndev); + kfree(work); +} + +static void +roce_gid_queue_scan_event(struct net_device *ndev) +{ + struct roce_gid_scan_event_work *work; + +retry: + if (is_eth_ipoib_intf(ndev)) + return; + + if (ndev->if_type != IFT_ETHER) { + if (ndev->if_type == IFT_L2VLAN) { + ndev = rdma_vlan_dev_real_dev(ndev); + if (ndev != NULL) + goto retry; + } + return; + } + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n"); + return; + } + + INIT_WORK(&work->work, roce_gid_queue_scan_event_handler); + dev_hold(ndev); + + work->ndev = ndev; + + queue_work(roce_gid_mgmt_wq, &work->work); +} + +static int +inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *ndev = ptr; + + switch (event) { + case NETDEV_REGISTER: + case NETDEV_UNREGISTER: + case 
NETDEV_CHANGEADDR: + case NETDEV_CHANGEIFADDR: + roce_gid_queue_scan_event(ndev); + break; + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block nb_inetaddr = { + .notifier_call = inetaddr_event +}; + +static void +roce_rescan_device_handler(struct work_struct *_work) +{ + struct roce_rescan_work *work = + container_of(_work, struct roce_rescan_work, work); + + ib_dev_roce_ports_of_netdev(work->ib_dev, roce_gid_match_all, NULL, + roce_gid_update_addr_callback, NULL); + kfree(work); +} + +/* Caller must flush system workqueue before removing the ib_device */ +int roce_rescan_device(struct ib_device *ib_dev) +{ + struct roce_rescan_work *work = kmalloc(sizeof(*work), GFP_KERNEL); + + if (!work) + return -ENOMEM; + + work->ib_dev = ib_dev; + INIT_WORK(&work->work, roce_rescan_device_handler); + queue_work(roce_gid_mgmt_wq, &work->work); + + return 0; +} + +int __init roce_gid_mgmt_init(void) +{ + roce_gid_mgmt_wq = alloc_ordered_workqueue("roce_gid_mgmt_wq", 0); + + if (!roce_gid_mgmt_wq) { + pr_warn("roce_gid_mgmt: can't allocate work queue\n"); + return -ENOMEM; + } + + register_inetaddr_notifier(&nb_inetaddr); + + /* We rely on the netdevice notifier to enumerate all + * existing devices in the system. Register to this notifier + * last to make sure we will not miss any IP add/del + * callbacks. + */ + register_netdevice_notifier(&nb_inetaddr); + + return 0; +} + +void __exit roce_gid_mgmt_cleanup(void) +{ + unregister_inetaddr_notifier(&nb_inetaddr); + unregister_netdevice_notifier(&nb_inetaddr); + /* Ensure all gid deletion tasks complete before we go down, + * to avoid any reference to freed memory. By the time + * ib-core is removed, all physical devices have been removed, + * so no issue with remaining hardware contexts.
+ */ + synchronize_rcu(); + drain_workqueue(roce_gid_mgmt_wq); + destroy_workqueue(roce_gid_mgmt_wq); +} Index: sys/ofed/drivers/infiniband/core/sa.h =================================================================== --- sys/ofed/drivers/infiniband/core/sa.h +++ sys/ofed/drivers/infiniband/core/sa.h @@ -53,7 +53,7 @@ u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + int timeout_ms, int retries, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), Index: sys/ofed/drivers/infiniband/core/sa_query.c =================================================================== --- sys/ofed/drivers/infiniband/core/sa_query.c +++ sys/ofed/drivers/infiniband/core/sa_query.c @@ -41,14 +41,12 @@ #include #include #include - +#include #include #include #include "sa.h" -MODULE_AUTHOR("Roland Dreier"); -MODULE_DESCRIPTION("InfiniBand subnet administration query support"); -MODULE_LICENSE("Dual BSD/GPL"); +/* InfiniBand subnet administration query support */ struct ib_sa_sm_ah { struct ib_ah *ah; @@ -419,6 +417,11 @@ ah_attr.dlid = port_attr.sm_lid; ah_attr.sl = port_attr.sm_sl; ah_attr.port_num = port->port_num; + if (port_attr.grh_required) { + ah_attr.ah_flags = IB_AH_GRH; + ah_attr.grh.dgid.global.subnet_prefix = cpu_to_be64(IB_SA_WELL_KNOWN_GID_PREFIX); + ah_attr.grh.dgid.global.interface_id = cpu_to_be64(IB_SA_WELL_KNOWN_GUID); + } new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr); if (IS_ERR(new_ah->ah)) { @@ -545,7 +548,8 @@ ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; - ret = ib_find_cached_gid(device, &rec->sgid, &port_num, + ret = ib_find_cached_gid(device, &rec->sgid, rec->gid_type, + rec->net, rec->ifindex, &port_num, &gid_index); if (ret) return ret; @@ -556,13 +560,8 @@ ah_attr->grh.traffic_class = rec->traffic_class; } if (force_grh) { - memcpy(ah_attr->dmac, rec->dmac, 6); - ah_attr->vlan_id = rec->vlan_id; - } else { - memset(ah_attr->dmac, 0, 6); - ah_attr->vlan_id = 0xffff; + memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); } - return 0; } EXPORT_SYMBOL(ib_init_ah_from_path); @@ -616,7 +615,7 @@ spin_unlock_irqrestore(&tid_lock, flags); } -static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) +static int send_mad(struct ib_sa_query *query, int timeout_ms, int retries, gfp_t gfp_mask) { unsigned long flags; int ret, id; @@ -633,6 +632,7 @@ return ret; query->mad_buf->timeout_ms = timeout_ms; + query->mad_buf->retries = retries; query->mad_buf->context[0] = query; query->id = id; @@ -657,6 +657,12 @@ } EXPORT_SYMBOL(ib_sa_unpack_path); +void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute) +{ + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute); +} +EXPORT_SYMBOL(ib_sa_pack_path); + static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) @@ -669,10 +675,10 @@ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), mad->data, &rec); - rec.vlan_id = 0xffff; + rec.net = NULL; + rec.ifindex = 0; + rec.gid_type = IB_GID_TYPE_IB; memset(rec.dmac, 0, ETH_ALEN); - memset(rec.smac, 0, ETH_ALEN); - query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); @@ -683,7 +689,6 @@ kfree(container_of(sa_query, struct ib_sa_path_query, sa_query)); } - /** * ib_sa_path_rec_get - Start a Path get query * @client:SA client @@ -692,6 +697,7 @@ * @rec:Path Record to send in query * @comp_mask:component mask to send in query * @timeout_ms:time to wait for response + 
* @retries:retries to send for response * @gfp_mask:GFP mask to use for internal allocations * @callback:function called when query completes, times out or is * canceled @@ -710,15 +716,15 @@ * the query. */ int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct ib_sa_path_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_path_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query) + struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, int retries, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_path_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) { struct ib_sa_path_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); @@ -760,7 +766,7 @@ *sa_query = &query->sa_query; - ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + ret = send_mad(&query->sa_query, timeout_ms, retries, gfp_mask); if (ret < 0) goto err2; @@ -808,6 +814,7 @@ * @rec:Service Record to send in request * @comp_mask:component mask to send in request * @timeout_ms:time to wait for response + * @retries:retries to send for response * @gfp_mask:GFP mask to use for internal allocations * @callback:function called when request completes, times out or is * canceled @@ -830,7 +837,7 @@ struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + int timeout_ms, int retries, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, void *context), @@ -883,7 +890,7 @@ *sa_query = &query->sa_query; - ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + ret = send_mad(&query->sa_query, timeout_ms, retries, gfp_mask); if (ret < 0) goto err2; @@ -927,7 +934,7 @@ u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + int timeout_ms, int retries, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), @@ -975,7 +982,7 @@ *sa_query = &query->sa_query; - ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + ret = send_mad(&query->sa_query, timeout_ms, retries, gfp_mask); if (ret < 0) goto err2; @@ -993,8 +1000,8 @@ /* Support GuidInfoRecord */ static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, - int status, - struct ib_sa_mad *mad) + int status, + struct ib_sa_mad *mad) { struct ib_sa_guidinfo_query *query = container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); @@ -1018,7 +1025,7 @@ struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, + int timeout_ms, int retries, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), @@ -1027,7 +1034,7 @@ { struct ib_sa_guidinfo_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); - struct ib_sa_port *port; + struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; @@ -1048,15 +1055,15 @@ if (!query) return -ENOMEM; - query->sa_query.port = port; + query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; - query->callback = callback; - query->context = context; + query->callback = callback; + query->context = context; mad = 
query->sa_query.mad_buf->mad; init_mad(mad, agent); @@ -1073,7 +1080,7 @@ *sa_query = &query->sa_query; - ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + ret = send_mad(&query->sa_query, timeout_ms, retries, gfp_mask); if (ret < 0) goto err2; @@ -1196,7 +1203,7 @@ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); if (ib_register_event_handler(&sa_dev->event_handler)) - goto reg_err; + goto err; for (i = 0; i <= e - s; ++i) if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) @@ -1204,14 +1211,10 @@ return; -reg_err: - ib_set_client_data(device, &sa_client, NULL); - i = e - s; err: - for (; i >= 0; --i) - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND && - !IS_ERR(sa_dev->port[i].agent)) - ib_unregister_mad_agent(sa_dev->port[i].agent); + while (--i >= 0) + if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) + ib_unregister_mad_agent(sa_dev->port[i].agent); kfree(sa_dev); Index: sys/ofed/drivers/infiniband/core/sysfs.c =================================================================== --- sys/ofed/drivers/infiniband/core/sysfs.c +++ sys/ofed/drivers/infiniband/core/sysfs.c @@ -36,15 +36,26 @@ #include #include +#include +#include #include #include #include #include +struct ib_port; + +struct gid_attr_group { + struct ib_port *port; + struct kobject kobj; + struct attribute_group ndev; + struct attribute_group type; +}; struct ib_port { struct kobject kobj; struct ib_device *ibdev; + struct gid_attr_group *gid_attr_group; struct attribute_group gid_group; struct attribute_group pkey_group; u8 port_num; @@ -86,6 +97,24 @@ .show = port_attr_show }; +static ssize_t gid_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct gid_attr_group, + kobj)->port; + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static const struct sysfs_ops gid_attr_sysfs_ops = { + .show = gid_attr_show +}; + static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, char *buf) { @@ -188,6 +217,9 @@ if (ret) return ret; + if (!attr.active_speed && !attr.active_width) + return sprintf(buf, "0 GB/sec\n"); + ib_active_speed_enum_to_rate(attr.active_speed, &rate, &speed); @@ -232,8 +264,6 @@ return sprintf(buf, "%s\n", "InfiniBand"); case IB_LINK_LAYER_ETHERNET: return sprintf(buf, "%s\n", "Ethernet"); - case IB_LINK_LAYER_SCIF: - return sprintf(buf, "%s\n", "SCIF"); default: return sprintf(buf, "%s\n", "Unknown"); } @@ -262,6 +292,46 @@ NULL }; +static size_t print_ndev(struct ib_gid_attr *gid_attr, char *buf) +{ + if (!gid_attr->ndev) + return -EINVAL; + + return sprintf(buf, "%s\n", gid_attr->ndev->if_xname); +} + +static size_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf) +{ + return sprintf(buf, "%s\n", roce_gid_cache_type_str(gid_attr->gid_type)); +} + +static ssize_t _show_port_gid_attr(struct ib_port *p, + struct port_attribute *attr, + char *buf, + size_t (*print)(struct ib_gid_attr *gid_attr, + char *buf)) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid gid; + struct ib_gid_attr gid_attr; + ssize_t ret; + va_list args; + + rcu_read_lock(); + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, + &gid_attr); + if (ret) + goto err; + + ret = print(&gid_attr, buf); + +err: + va_end(args); + rcu_read_unlock(); + return ret; 
+} + static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, char *buf) { @@ -270,13 +340,26 @@ union ib_gid gid; ssize_t ret; - ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid); + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, NULL); if (ret) return ret; return sprintf(buf, GID_PRINT_FMT"\n",GID_PRINT_ARGS(gid.raw)); } +static ssize_t show_port_gid_attr_ndev(struct ib_port *p, + struct port_attribute *attr, char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_ndev); +} + +static ssize_t show_port_gid_attr_gid_type(struct ib_port *p, + struct port_attribute *attr, + char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_gid_type); +} + static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, char *buf) { @@ -292,124 +375,125 @@ return sprintf(buf, "0x%04x\n", pkey); } +#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ +struct port_table_attribute port_pma_attr_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ +} + static ssize_t get_pma_counters(struct ib_port *p, struct port_attribute *attr, - char *buf, int c_ext) + char *buf, int c_ext) { - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - int offset = tab_attr->index & 0xffff; - int width = (tab_attr->index >> 16) & 0xff; - struct ib_mad *in_mad = NULL; - struct ib_mad *out_mad = NULL; - ssize_t ret; - - if (!p->ibdev->process_mad) - return -ENXIO; - - in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); - out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); - if (!in_mad || !out_mad) { - ret = -ENOMEM; - goto out; - } + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int offset = tab_attr->index & 0xffff; + int width = (tab_attr->index >> 16) & 0xff; + struct ib_mad *in_mad = NULL; + struct ib_mad *out_mad = NULL; + ssize_t ret; - in_mad->mad_hdr.base_version = 1; - in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; - in_mad->mad_hdr.class_version = 1; - in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; - if (c_ext) - in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS_EXT; - else - in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS; - - in_mad->data[41] = p->port_num; /* PortSelect field */ - - if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, - p->port_num, NULL, NULL, in_mad, out_mad) & - (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != - (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { - ret = -EINVAL; - goto out; - } + if (!p->ibdev->process_mad) + return -ENOSYS; - switch (width) { - case 4: - ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >> - (4 - (offset % 8))) & 0xf); - break; - case 8: - ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]); - break; - case 16: - ret = sprintf(buf, "%u\n", - be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8))); - break; - case 32: - ret = sprintf(buf, "%u\n", - be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); - break; - case 64: - ret = sprintf(buf, "%llu\n", - (unsigned long long)be64_to_cpup((__be64 *)(out_mad->data + 40 + offset / 8))); - break; - default: - ret = 0; - } + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) { + ret = -ENOMEM; + goto out; + } -out: - kfree(in_mad); - kfree(out_mad); + in_mad->mad_hdr.base_version = 1; + in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; + 
in_mad->mad_hdr.class_version = 1; + in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; + if (c_ext) + in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS_EXT; + else + in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS; + + in_mad->data[41] = p->port_num; /* PortSelect field */ + + if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, + p->port_num, NULL, NULL, in_mad, out_mad) & + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != + (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { + ret = -EINVAL; + goto out; + } - return ret; -} + switch (width) { + case 4: + ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >> + (4 - (offset % 8))) & 0xf); + break; + case 8: + ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]); + break; + case 16: + ret = sprintf(buf, "%u\n", + be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8))); + break; + case 32: + ret = sprintf(buf, "%u\n", + be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); + break; + case 64: + ret = sprintf(buf, "%lu\n", + be64_to_cpup((__be64 *)(out_mad->data + 40 + + offset / 8))); + break; + default: + ret = 0; + } + +out: + kfree(in_mad); + kfree(out_mad); -#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ -struct port_table_attribute port_pma_attr_##_name = { \ - .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ + return ret; } static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, - char *buf) + char *buf) { - return get_pma_counters(p, attr, buf, 0); + return get_pma_counters(p, attr, buf, 0); } -static PORT_PMA_ATTR(symbol_error , 0, 16, 32); -static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); -static PORT_PMA_ATTR(link_downed , 2, 8, 56); -static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); +static PORT_PMA_ATTR(symbol_error , 0, 16, 32); +static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); +static PORT_PMA_ATTR(link_downed , 2, 8, 56); +static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80); static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96); -static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); +static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128); -static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); +static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152); static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156); -static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); -static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); -static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); -static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); -static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); +static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); +static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); +static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); +static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); static struct attribute *pma_attrs[] = { - &port_pma_attr_symbol_error.attr.attr, - &port_pma_attr_link_error_recovery.attr.attr, - &port_pma_attr_link_downed.attr.attr, - &port_pma_attr_port_rcv_errors.attr.attr, - &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, - &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, - &port_pma_attr_port_xmit_discards.attr.attr, - &port_pma_attr_port_xmit_constraint_errors.attr.attr, - 
&port_pma_attr_port_rcv_constraint_errors.attr.attr, - &port_pma_attr_local_link_integrity_errors.attr.attr, - &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, - &port_pma_attr_VL15_dropped.attr.attr, - &port_pma_attr_port_xmit_data.attr.attr, - &port_pma_attr_port_rcv_data.attr.attr, - &port_pma_attr_port_xmit_packets.attr.attr, - &port_pma_attr_port_rcv_packets.attr.attr, - NULL + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_port_xmit_data.attr.attr, + &port_pma_attr_port_rcv_data.attr.attr, + &port_pma_attr_port_xmit_packets.attr.attr, + &port_pma_attr_port_rcv_packets.attr.attr, + NULL }; static struct attribute_group pma_group = { @@ -417,42 +501,42 @@ .attrs = pma_attrs }; -#define PORT_PMA_ATTR_EXT(_name, _counter, _width, _offset) \ -struct port_table_attribute port_pma_attr_ext_##_name = { \ - .attr = __ATTR(_name, S_IRUGO, show_pma_counter_ext, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ +#define PORT_PMA_ATTR_EXT(_name, _counter, _width, _offset) \ +struct port_table_attribute port_pma_attr_ext_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter_ext, NULL), \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ } static ssize_t show_pma_counter_ext(struct ib_port *p, - struct port_attribute *attr, char *buf) + struct port_attribute *attr, char *buf) { - return get_pma_counters(p, attr, buf, 1); + return get_pma_counters(p, attr, buf, 1); } -static PORT_PMA_ATTR_EXT(port_xmit_data_64 , 0, 64, 64); -static PORT_PMA_ATTR_EXT(port_rcv_data_64 , 0, 64, 128); -static PORT_PMA_ATTR_EXT(port_xmit_packets_64 , 0, 64, 192); -static PORT_PMA_ATTR_EXT(port_rcv_packets_64 , 0, 64, 256); -static PORT_PMA_ATTR_EXT(port_unicast_xmit_packets , 0, 64, 320); -static PORT_PMA_ATTR_EXT(port_unicast_rcv_packets , 0, 64, 384); -static PORT_PMA_ATTR_EXT(port_multicast_xmit_packets , 0, 64, 448); -static PORT_PMA_ATTR_EXT(port_multicast_rcv_packets , 0, 64, 512); +static PORT_PMA_ATTR_EXT(port_xmit_data_64 , 0, 64, 64); +static PORT_PMA_ATTR_EXT(port_rcv_data_64 , 0, 64, 128); +static PORT_PMA_ATTR_EXT(port_xmit_packets_64 , 0, 64, 192); +static PORT_PMA_ATTR_EXT(port_rcv_packets_64 , 0, 64, 256); +static PORT_PMA_ATTR_EXT(port_unicast_xmit_packets , 0, 64, 320); +static PORT_PMA_ATTR_EXT(port_unicast_rcv_packets , 0, 64, 384); +static PORT_PMA_ATTR_EXT(port_multicast_xmit_packets , 0, 64, 448); +static PORT_PMA_ATTR_EXT(port_multicast_rcv_packets , 0, 64, 512); static struct attribute *pma_attrs_ext[] = { - &port_pma_attr_ext_port_xmit_data_64.attr.attr, - &port_pma_attr_ext_port_rcv_data_64.attr.attr, - &port_pma_attr_ext_port_xmit_packets_64.attr.attr, - &port_pma_attr_ext_port_rcv_packets_64.attr.attr, - &port_pma_attr_ext_port_unicast_xmit_packets.attr.attr, - &port_pma_attr_ext_port_unicast_rcv_packets.attr.attr, - &port_pma_attr_ext_port_multicast_xmit_packets.attr.attr, - &port_pma_attr_ext_port_multicast_rcv_packets.attr.attr, - NULL + 
&port_pma_attr_ext_port_xmit_data_64.attr.attr, + &port_pma_attr_ext_port_rcv_data_64.attr.attr, + &port_pma_attr_ext_port_xmit_packets_64.attr.attr, + &port_pma_attr_ext_port_rcv_packets_64.attr.attr, + &port_pma_attr_ext_port_unicast_xmit_packets.attr.attr, + &port_pma_attr_ext_port_unicast_rcv_packets.attr.attr, + &port_pma_attr_ext_port_multicast_xmit_packets.attr.attr, + &port_pma_attr_ext_port_multicast_rcv_packets.attr.attr, + NULL }; static struct attribute_group pma_ext_group = { - .name = "counters_ext", - .attrs = pma_attrs_ext + .name = "counters_ext", + .attrs = pma_attrs_ext }; static void ib_port_release(struct kobject *kobj) @@ -461,25 +545,58 @@ struct attribute *a; int i; - for (i = 0; (a = p->gid_group.attrs[i]); ++i) - kfree(a); + if (p->gid_group.attrs) { + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); - kfree(p->gid_group.attrs); + kfree(p->gid_group.attrs); + } - for (i = 0; (a = p->pkey_group.attrs[i]); ++i) - kfree(a); + if (p->pkey_group.attrs) { + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); - kfree(p->pkey_group.attrs); + kfree(p->pkey_group.attrs); + } kfree(p); } +static void ib_port_gid_attr_release(struct kobject *kobj) +{ + struct gid_attr_group *g = container_of(kobj, struct gid_attr_group, + kobj); + struct attribute *a; + int i; + + if (g->ndev.attrs) { + for (i = 0; (a = g->ndev.attrs[i]); ++i) + kfree(a); + + kfree(g->ndev.attrs); + } + + if (g->type.attrs) { + for (i = 0; (a = g->type.attrs[i]); ++i) + kfree(a); + + kfree(g->type.attrs); + } + + kfree(g); +} + static struct kobj_type port_type = { .release = ib_port_release, .sysfs_ops = &port_sysfs_ops, .default_attrs = port_default_attrs }; +static struct kobj_type gid_attr_type = { + .sysfs_ops = &gid_attr_sysfs_ops, + .release = ib_port_gid_attr_release +}; + static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -549,8 +666,8 @@ } static int add_port(struct ib_device *device, int port_num, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) + int (*port_callback)(struct ib_device *, + u8, struct kobject *)) { struct ib_port *p; struct ib_port_attr attr; @@ -571,56 +688,123 @@ ret = kobject_init_and_add(&p->kobj, &port_type, device->ports_parent, "%d", port_num); - if (ret) + if (ret) { + kfree(p); + return ret; + } + + p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL); + if (!p->gid_attr_group) { + ret = -ENOMEM; + goto err_put; + } + + p->gid_attr_group->port = p; + ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type, + &p->kobj, "gid_attrs"); + if (ret) { + kfree(p->gid_attr_group); goto err_put; + } ret = sysfs_create_group(&p->kobj, &pma_group); if (ret) - goto err_put; + goto err_put_gid_attrs; - ret = sysfs_create_group(&p->kobj, &pma_ext_group); - if (ret) - goto err_remove_pma; + ret = sysfs_create_group(&p->kobj, &pma_ext_group); + if (ret) + goto err_remove_pma; p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); - if (!p->gid_group.attrs) + if (!p->gid_group.attrs) { + ret = -ENOMEM; goto err_remove_pma_ext; + } ret = sysfs_create_group(&p->kobj, &p->gid_group); if (ret) goto err_free_gid; + p->gid_attr_group->ndev.name = "ndevs"; + p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev, + attr.gid_tbl_len); + if (!p->gid_attr_group->ndev.attrs) { + ret = -ENOMEM; + goto err_remove_gid; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); 
+ if (ret) + goto err_free_gid_ndev; + + p->gid_attr_group->type.name = "types"; + p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type, + attr.gid_tbl_len); + if (!p->gid_attr_group->type.attrs) { + ret = -ENOMEM; + goto err_remove_gid_ndev; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + if (ret) + goto err_free_gid_type; + p->pkey_group.name = "pkeys"; p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, attr.pkey_tbl_len); - if (!p->pkey_group.attrs) - goto err_remove_gid; + if (!p->pkey_group.attrs) { + ret = -ENOMEM; + goto err_remove_gid_type; + } ret = sysfs_create_group(&p->kobj, &p->pkey_group); if (ret) goto err_free_pkey; - if (port_callback) { - ret = port_callback(device, port_num, &p->kobj); - if (ret) - goto err_remove_pkey; - } + if (port_callback) { + ret = port_callback(device, port_num, &p->kobj); + if (ret) + goto err_remove_pkey; + } list_add_tail(&p->kobj.entry, &device->port_list); -#ifdef __linux__ - kobject_uevent(&p->kobj, KOBJ_ADD); -#endif + return 0; err_remove_pkey: - sysfs_remove_group(&p->kobj, &p->pkey_group); + sysfs_remove_group(&p->kobj, &p->pkey_group); err_free_pkey: for (i = 0; i < attr.pkey_tbl_len; ++i) kfree(p->pkey_group.attrs[i]); kfree(p->pkey_group.attrs); + p->pkey_group.attrs = NULL; + +err_remove_gid_type: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + +err_free_gid_type: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->type.attrs[i]); + + kfree(p->gid_attr_group->type.attrs); + p->gid_attr_group->type.attrs = NULL; + +err_remove_gid_ndev: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + +err_free_gid_ndev: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->ndev.attrs[i]); + + kfree(p->gid_attr_group->ndev.attrs); + p->gid_attr_group->ndev.attrs = NULL; err_remove_gid: sysfs_remove_group(&p->kobj, &p->gid_group); @@ -630,16 +814,19 @@ kfree(p->gid_group.attrs[i]); kfree(p->gid_group.attrs); + p->gid_group.attrs = NULL; err_remove_pma_ext: - sysfs_remove_group(&p->kobj, &pma_ext_group); + sysfs_remove_group(&p->kobj, &pma_ext_group); err_remove_pma: sysfs_remove_group(&p->kobj, &pma_group); +err_put_gid_attrs: + kobject_put(&p->gid_attr_group->kobj); + err_put: - kobject_put(device->ports_parent); - kfree(p); + kobject_put(&p->kobj); return ret; } @@ -653,7 +840,6 @@ case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); - case RDMA_NODE_MIC: return sprintf(buf, "%d: MIC\n", dev->node_type); default: return sprintf(buf, "%d: \n", dev->node_type); } } @@ -675,7 +861,7 @@ be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]), be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3])); } - + static ssize_t show_node_guid(struct device *device, struct device_attribute *attr, char *buf) { @@ -715,83 +901,21 @@ return count; } -static ssize_t show_cmd_perf(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%d\n", dev->cmd_perf); -} - -static ssize_t set_cmd_perf(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - u32 val; - - if (sscanf(buf, "0x%x", &val) != 1) - return -EINVAL; - - dev->cmd_perf = 
val; - - return count; -} - -static ssize_t show_cmd_avg(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%llu\n", (unsigned long long)dev->cmd_avg); -} - -static ssize_t set_cmd_avg(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - spin_lock(&dev->cmd_perf_lock); - dev->cmd_avg = 0; - dev->cmd_n = 0; - spin_unlock(&dev->cmd_perf_lock); - - return count; -} - -static ssize_t show_cmd_n(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - - return sprintf(buf, "%d\n", dev->cmd_n); -} - static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); -static DEVICE_ATTR(cmd_perf, S_IRUGO | S_IWUSR, show_cmd_perf, set_cmd_perf); -static DEVICE_ATTR(cmd_avg, S_IRUGO | S_IWUSR, show_cmd_avg, set_cmd_avg); -static DEVICE_ATTR(cmd_n, S_IRUGO, show_cmd_n, NULL); static struct device_attribute *ib_class_attributes[] = { &dev_attr_node_type, &dev_attr_sys_image_guid, &dev_attr_node_guid, - &dev_attr_node_desc, - &dev_attr_cmd_perf, - &dev_attr_cmd_avg, - &dev_attr_cmd_n, + &dev_attr_node_desc }; static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, -#ifdef __linux__ - .dev_uevent = ib_device_uevent, -#endif }; /* Show a given an attribute in the statistics group */ @@ -908,6 +1032,28 @@ .attrs = iw_proto_stats_attrs, }; +static void free_port_list_attributes(struct ib_device *device) +{ + struct kobject *p, *t; + + list_for_each_entry_safe(p, t, &device->port_list, entry) { + struct ib_port *port = container_of(p, struct ib_port, kobj); + list_del(&p->entry); + sysfs_remove_group(p, &pma_group); + sysfs_remove_group(p, &pma_ext_group); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->ndev); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->type); + kobject_put(&port->gid_attr_group->kobj); + kobject_put(p); + } + + kobject_put(device->ports_parent); +} + int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)) @@ -918,8 +1064,8 @@ class_dev->class = &ib_class; class_dev->parent = device->dma_device; - dev_set_name(class_dev, device->name); - dev_set_drvdata(class_dev, device); + dev_set_name(class_dev, "%s", device->name); + dev_set_drvdata(class_dev, device); INIT_LIST_HEAD(&device->port_list); @@ -933,8 +1079,9 @@ goto err_unregister; } - device->ports_parent = kobject_create_and_add("ports",&class_dev->kobj); - if (!device->ports_parent) { + device->ports_parent = kobject_create_and_add("ports", + &class_dev->kobj); + if (!device->ports_parent) { ret = -ENOMEM; goto err_put; } @@ -960,21 +1107,7 @@ return 0; err_put: - { - struct kobject *p, *t; - struct ib_port *port; - - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); 
- } - } - - kobject_put(&class_dev->kobj); + free_port_list_attributes(device); err_unregister: @@ -990,28 +1123,18 @@ void ib_device_unregister_sysfs(struct ib_device *device) { + /* Hold kobject until ib_dealloc_device() */ + struct kobject *kobj_dev = kobject_get(&device->dev.kobj); int i; - struct kobject *p, *t; - struct ib_port *port; - struct device *class_dev = &device->dev; - /* Hold kobject until ib_dealloc_device() */ - kobject_get(&device->dev.kobj); + if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) + sysfs_remove_group(kobj_dev, &iw_stats_group); - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - device_remove_file(class_dev, ib_class_attributes[i]); - } + free_port_list_attributes(device); - list_for_each_entry_safe(p, t, &device->port_list, entry) { - list_del(&p->entry); - port = container_of(p, struct ib_port, kobj); - sysfs_remove_group(p, &pma_group); - sysfs_remove_group(p, &port->pkey_group); - sysfs_remove_group(p, &port->gid_group); - kobject_put(p); - } + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) + device_remove_file(&device->dev, ib_class_attributes[i]); - kobject_put(device->ports_parent); device_unregister(&device->dev); } Index: sys/ofed/drivers/infiniband/core/ucm.c =================================================================== --- sys/ofed/drivers/infiniband/core/ucm.c +++ sys/ofed/drivers/infiniband/core/ucm.c @@ -32,7 +32,7 @@ */ #include -#include + #include #include #include #include @@ -51,9 +51,7 @@ #include #include -MODULE_AUTHOR("Libor Michalek"); -MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access"); -MODULE_LICENSE("Dual BSD/GPL"); +/* InfiniBand userspace Connection Manager access */ struct ib_ucm_device { int devnum; @@ -120,6 +118,9 @@ static DEFINE_IDR(ctx_id_table); static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES); +static dev_t overflow_maj; +static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); + static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id) { struct ib_ucm_context *ctx; @@ -1214,17 +1215,17 @@ ucm_dev = container_of(dev, struct ib_ucm_device, dev); cdev_del(&ucm_dev->cdev); if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) - clear_bit(ucm_dev->devnum, dev_map); + clear_bit(ucm_dev->devnum, dev_map); else - clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map); + clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map); kfree(ucm_dev); } static const struct file_operations ucm_fops = { - .owner = THIS_MODULE, - .open = ib_ucm_open, + .owner = THIS_MODULE, + .open = ib_ucm_open, .release = ib_ucm_close, - .write = ib_ucm_write, + .write = ib_ucm_write, .poll = ib_ucm_poll, .llseek = no_llseek, }; @@ -1239,8 +1240,6 @@ } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); -static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static int find_overflow_devnum(void) { int ret; @@ -1281,7 +1280,7 @@ if (devnum >= IB_UCM_MAX_DEVICES) { devnum = find_overflow_devnum(); if (devnum < 0) - goto err; + goto err; ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES; base = devnum + overflow_maj; @@ -1337,7 +1336,7 @@ static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", IB_USER_CM_ABI_VERSION); + return sprintf(buf, "%d\n", IB_USER_CM_ABI_VERSION); } static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); Index: sys/ofed/drivers/infiniband/core/ucma.c =================================================================== --- 
sys/ofed/drivers/infiniband/core/ucma.c +++ sys/ofed/drivers/infiniband/core/ucma.c @@ -48,10 +48,10 @@ #include #include #include +#include +#include -MODULE_AUTHOR("Sean Hefty"); -MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); -MODULE_LICENSE("Dual BSD/GPL"); +/* RDMA Userspace Connection Manager Access */ static unsigned int max_backlog = 1024; @@ -61,6 +61,7 @@ struct list_head ctx_list; struct list_head event_list; wait_queue_head_t poll_wait; + struct workqueue_struct *close_wq; }; struct ucma_context { @@ -76,6 +77,9 @@ struct list_head list; struct list_head mc_list; + int closing; + int destroying; + struct work_struct close_work; }; struct ucma_multicast { @@ -94,6 +98,7 @@ struct list_head list; struct rdma_cm_id *cm_id; struct rdma_ucm_event_resp resp; + struct work_struct close_work; }; static DEFINE_MUTEX(mut); @@ -119,8 +124,12 @@ mutex_lock(&mut); ctx = _ucma_find_context(id, file); - if (!IS_ERR(ctx)) - atomic_inc(&ctx->ref); + if (!IS_ERR(ctx)) { + if (ctx->closing) + ctx = ERR_PTR(-EIO); + else + atomic_inc(&ctx->ref); + } mutex_unlock(&mut); return ctx; } @@ -131,6 +140,34 @@ complete(&ctx->comp); } +static void ucma_close_event_id(struct work_struct *work) +{ + struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work); + + rdma_destroy_id(uevent_close->cm_id); + kfree(uevent_close); +} + +static void ucma_close_id(struct work_struct *work) +{ + struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); + + /* Fence to ensure that ctx->closing is seen by any + * ucma_get_ctx callers still running. + */ + mutex_lock(&mut); + mutex_unlock(&mut); + + /* Once all inflight tasks are finished, we close all underlying + * resources. The context stays alive until it is explicitly + * destroyed by its creator. + */ + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + /* No new events will be generated after destroying the id. */ + rdma_destroy_id(ctx->cm_id); +} + static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) { struct ucma_context *ctx; @@ -140,6 +177,7 @@ if (!ctx) return NULL; + INIT_WORK(&ctx->close_work, ucma_close_id); atomic_set(&ctx->ref, 1); init_completion(&ctx->comp); INIT_LIST_HEAD(&ctx->mc_list); @@ -245,6 +283,42 @@ } } +/* Called with file->mut locked for the relevant context. */ +static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) +{ + struct ucma_context *ctx = cm_id->context; + struct ucma_event *con_req_eve; + int event_found = 0; + + if (ctx->destroying) + return; + + /* Only if the context points to this cm_id does it own it, and only + * then can the context be queued to be closed. Otherwise the cm_id + * is an inflight one that is still part of this context's event + * list, pending to be detached and reattached to its new context by + * ucma_get_event; that case is handled separately below.
+ */ + if (ctx->cm_id == cm_id) { + ctx->closing = 1; + queue_work(ctx->file->close_wq, &ctx->close_work); + return; + } + + list_for_each_entry(con_req_eve, &ctx->file->event_list, list) { + if (con_req_eve->cm_id == cm_id && + con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { + list_del(&con_req_eve->list); + INIT_WORK(&con_req_eve->close_work, ucma_close_event_id); + queue_work(ctx->file->close_wq, &con_req_eve->close_work); + event_found = 1; + break; + } + } + if (!event_found) + printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n"); +} + static int ucma_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -274,13 +348,18 @@ goto out; } ctx->backlog--; - } else if (!ctx->uid) { + } else if (!ctx->uid || ctx->cm_id != cm_id) { /* * We ignore events for new connections until userspace has set * their context. This can only happen if an error occurs on a * new connection before the user accepts it. This is okay, - * since the accept will just fail later. + * since the accept will just fail later. However, we do need + * to release the underlying HW resources in case of a device + * removal event. */ + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ucma_removal_event_handler(cm_id); + kfree(uevent); goto out; } @@ -289,6 +368,8 @@ wake_up_interruptible(&ctx->file->poll_wait); if (ctx->file->filp) selwakeup(&ctx->file->filp->f_selinfo); + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ucma_removal_event_handler(cm_id); out: mutex_unlock(&ctx->file->mut); return ret; @@ -372,7 +453,7 @@ } static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, - int in_len, int out_len) + int in_len, int out_len) { struct rdma_ucm_create_id cmd; struct rdma_ucm_create_id_resp resp; @@ -402,6 +483,7 @@ ret = PTR_ERR(ctx->cm_id); goto err1; } + ctx->cm_id->ucontext = ctx; resp.id = ctx->id; @@ -449,9 +531,15 @@ } /* - * We cannot hold file->mut when calling rdma_destroy_id() or we can - * deadlock. We also acquire file->mut in ucma_event_handler(), and - * rdma_destroy_id() will wait until all callbacks have completed. + * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At + * this point, no new events will be reported from the hardware. However, we + * still need to cleanup the UCMA context for this ID. Specifically, there + * might be events that have not yet been consumed by the user space software. + * These might include pending connect requests which we have not completed + * processing. We cannot call rdma_destroy_id while holding the lock of the + * context (file->mut), as it might cause a deadlock. We therefore extract all + * relevant events from the context pending events list while holding the + * mutex. After that we release them as needed. */ static int ucma_free_ctx(struct ucma_context *ctx) { @@ -459,8 +547,6 @@ struct ucma_event *uevent, *tmp; LIST_HEAD(list); - /* No new events will be generated after destroying the id. 
*/ - rdma_destroy_id(ctx->cm_id); ucma_cleanup_multicast(ctx); @@ -508,10 +594,20 @@ if (IS_ERR(ctx)) return PTR_ERR(ctx); - ucma_put_ctx(ctx); - wait_for_completion(&ctx->comp); - resp.events_reported = ucma_free_ctx(ctx); + mutex_lock(&ctx->file->mut); + ctx->destroying = 1; + mutex_unlock(&ctx->file->mut); + + flush_workqueue(ctx->file->close_wq); + /* At this point it's guaranteed that there is no inflight + * closing task */ + if (!ctx->closing) { + ucma_put_ctx(ctx); + wait_for_completion(&ctx->comp); + rdma_destroy_id(ctx->cm_id); + } + resp.events_reported = ucma_free_ctx(ctx); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; @@ -519,10 +615,10 @@ return ret; } -static ssize_t ucma_bind_addr(struct ucma_file *file, const char __user *inbuf, +static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_bind_addr cmd; + struct rdma_ucm_bind_ip cmd; struct ucma_context *ctx; int ret; @@ -538,24 +634,75 @@ return ret; } +static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_bind cmd; + struct sockaddr *addr; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + addr = (struct sockaddr *) &cmd.addr; + if (cmd.reserved || !cmd.addr_size || (cmd.addr_size != rdma_addr_size(addr))) + return -EINVAL; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_bind_addr(ctx->cm_id, addr); + ucma_put_ctx(ctx); + return ret; +} + +static ssize_t ucma_resolve_ip(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_resolve_ip cmd; + struct ucma_context *ctx; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, + (struct sockaddr *) &cmd.dst_addr, + cmd.timeout_ms); + ucma_put_ctx(ctx); + return ret; +} + static ssize_t ucma_resolve_addr(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_addr cmd; + struct sockaddr *src, *dst; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + src = (struct sockaddr *) &cmd.src_addr; + dst = (struct sockaddr *) &cmd.dst_addr; + if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size(src))) || + !cmd.dst_size || (cmd.dst_size != rdma_addr_size(dst))) + return -EINVAL; + ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, - (struct sockaddr *) &cmd.dst_addr, - cmd.timeout_ms); + ret = rdma_resolve_addr(ctx->cm_id, src, dst, cmd.timeout_ms); ucma_put_ctx(ctx); return ret; } @@ -648,7 +795,7 @@ const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_query_route cmd; + struct rdma_ucm_query cmd; struct rdma_ucm_query_route_resp resp; struct ucma_context *ctx; struct sockaddr *addr; @@ -708,7 +855,288 @@ return ret; } -static void ucma_copy_conn_param(struct rdma_conn_param *dst, +static void ucma_query_device_addr(struct rdma_cm_id *cm_id, + struct rdma_ucm_query_addr_resp *resp) +{ + if (!cm_id->device) + return; + + resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->port_num = cm_id->port_num; + resp->pkey = (__force __u16) cpu_to_be16( + 
ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); +} + +static ssize_t ucma_query_addr(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; + resp.src_size = rdma_addr_size(addr); + memcpy(&resp.src_addr, addr, resp.src_size); + + addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; + resp.dst_size = rdma_addr_size(addr); + memcpy(&resp.dst_addr, addr, resp.dst_size); + + ucma_query_device_addr(ctx->cm_id, &resp); + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query_path(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_path_resp *resp; + int i, ret = 0; + + if (out_len < sizeof(*resp)) + return -ENOSPC; + + resp = kzalloc(out_len, GFP_KERNEL); + if (!resp) + return -ENOMEM; + + resp->num_paths = ctx->cm_id->route.num_paths; + for (i = 0, out_len -= sizeof(*resp); + i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); + i++, out_len -= sizeof(struct ib_path_rec_data)) { + + resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | + IB_PATH_BIDIRECTIONAL; + ib_sa_pack_path(&ctx->cm_id->route.path_rec[i], + &resp->path_data[i].path_rec); + } + + if (copy_to_user(response, resp, + sizeof(*resp) + (i * sizeof(struct ib_path_rec_data)))) + ret = -EFAULT; + + kfree(resp); + return ret; +} + +static ssize_t ucma_query_gid(struct ucma_context *ctx, + void __user *response, int out_len) +{ + struct rdma_ucm_query_addr_resp resp; + struct sockaddr_ib *addr; + int ret = 0; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + memset(&resp, 0, sizeof resp); + + ucma_query_device_addr(ctx->cm_id, &resp); + + addr = (struct sockaddr_ib *) &resp.src_addr; + resp.src_size = sizeof(*addr); + if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr, + (union ib_gid *) &addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.src_addr); + } + + addr = (struct sockaddr_ib *) &resp.dst_addr; + resp.dst_size = sizeof(*addr); + if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) { + memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size); + } else { + addr->sib_family = AF_IB; + addr->sib_pkey = (__force __be16) resp.pkey; + rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr, + (union ib_gid *) &addr->sib_addr); + addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) + &ctx->cm_id->route.addr.dst_addr); + } + + if (copy_to_user(response, &resp, sizeof(resp))) + ret = -EFAULT; + + return ret; +} + +static ssize_t ucma_query(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_query cmd; + struct ucma_context *ctx; + void __user *response; + int ret; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + response = (void __user *)(unsigned long) cmd.response; + ctx = ucma_get_ctx(file, cmd.id); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + switch (cmd.option) { + case RDMA_USER_CM_QUERY_ADDR: + ret = ucma_query_addr(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_PATH: + ret = 
ucma_query_path(ctx, response, out_len); + break; + case RDMA_USER_CM_QUERY_GID: + ret = ucma_query_gid(ctx, response, out_len); + break; + default: + ret = -ENOSYS; + break; + } + + ucma_put_ctx(ctx); + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + return ret; +} + +static void ucma_copy_conn_param(struct rdma_cm_id *id, + struct rdma_conn_param *dst, struct rdma_ucm_conn_param *src) { dst->private_data = src->private_data; @@ -720,6 +1148,7 @@ dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num; + dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; } static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, @@ -740,7 +1169,7 @@ if (IS_ERR(ctx)) return PTR_ERR(ctx); - ucma_copy_conn_param(&conn_param, &cmd.conn_param); + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); ret = rdma_connect(ctx->cm_id, &conn_param); ucma_put_ctx(ctx); return ret; @@ -783,7 +1212,7 @@ return PTR_ERR(ctx); if (cmd.conn_param.valid) { - ucma_copy_conn_param(&conn_param, &cmd.conn_param); + ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); mutex_lock(&file->mut); ret = rdma_accept(ctx->cm_id, &conn_param); if (!ret) @@ -924,6 +1353,8 @@ if (!optlen) return -EINVAL; + memset(&sa_path, 0, sizeof(sa_path)); + ib_sa_unpack_path(path_data->path_rec, &sa_path); ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); if (ret) @@ -937,7 +1368,7 @@ static int ucma_set_option_ib(struct ucma_context *ctx, int optname, void *optval, size_t optlen) { - int ret = 0; + int ret = -ENOSYS; switch (optname) { case RDMA_OPTION_IB_PATH: @@ -1035,23 +1466,23 @@ return ret; } -static ssize_t ucma_join_multicast(struct ucma_file *file, - const char __user *inbuf, - int in_len, int out_len) +static ssize_t ucma_process_join(struct ucma_file *file, + struct rdma_ucm_join_mcast *cmd, int out_len) { - struct rdma_ucm_join_mcast cmd; struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; struct ucma_multicast *mc; + struct sockaddr *addr; int ret; if (out_len < sizeof(resp)) return -ENOSPC; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; + addr = (struct sockaddr *) &cmd->addr; + if (cmd->reserved || !cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr))) + return -EINVAL; - ctx = ucma_get_ctx(file, cmd.id); + ctx = ucma_get_ctx(file, cmd->id); if (IS_ERR(ctx)) return PTR_ERR(ctx); @@ -1062,14 +1493,14 @@ goto err1; } - mc->uid = cmd.uid; - memcpy(&mc->addr, &cmd.addr, sizeof cmd.addr); + mc->uid = cmd->uid; + memcpy(&mc->addr, addr, cmd->addr_size); ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc); if (ret) goto err2; resp.id = mc->id; - if (copy_to_user((void __user *)(unsigned long)cmd.response, + if (copy_to_user((void __user *)(unsigned long) cmd->response, &resp, sizeof(resp))) { ret = -EFAULT; goto err3; @@ -1094,6 +1525,38 @@ return ret; } +static ssize_t ucma_join_ip_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_ip_mcast cmd; + struct rdma_ucm_join_mcast join_cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + join_cmd.response = cmd.response; + join_cmd.uid = cmd.uid; + join_cmd.id = cmd.id; + join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr); + 
join_cmd.reserved = 0; + memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); + + return ucma_process_join(file, &join_cmd, out_len); +} + +static ssize_t ucma_join_multicast(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) +{ + struct rdma_ucm_join_mcast cmd; + + if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + return -EFAULT; + + return ucma_process_join(file, &cmd, out_len); +} + static ssize_t ucma_leave_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -1236,25 +1699,29 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) = { - [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, - [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, - [RDMA_USER_CM_CMD_BIND_ADDR] = ucma_bind_addr, - [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, - [RDMA_USER_CM_CMD_RESOLVE_ROUTE]= ucma_resolve_route, - [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, - [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, - [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, - [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, - [RDMA_USER_CM_CMD_REJECT] = ucma_reject, - [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, - [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, - [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, - [RDMA_USER_CM_CMD_GET_OPTION] = NULL, - [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, - [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, - [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast, - [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, - [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id + [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, + [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, + [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip, + [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip, + [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route, + [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, + [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, + [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, + [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, + [RDMA_USER_CM_CMD_REJECT] = ucma_reject, + [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, + [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, + [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, + [RDMA_USER_CM_CMD_GET_OPTION] = NULL, + [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, + [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, + [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast, + [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, + [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id, + [RDMA_USER_CM_CMD_QUERY] = ucma_query, + [RDMA_USER_CM_CMD_BIND] = ucma_bind, + [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, + [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast }; static ssize_t ucma_write(struct file *filp, const char __user *buf, @@ -1319,6 +1786,7 @@ INIT_LIST_HEAD(&file->ctx_list); init_waitqueue_head(&file->poll_wait); mutex_init(&file->mut); + file->close_wq = create_singlethread_workqueue("ucma_close_id"); filp->private_data = file; file->filp = filp; @@ -1333,16 +1801,28 @@ mutex_lock(&file->mut); list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { + ctx->destroying = 1; mutex_unlock(&file->mut); mutex_lock(&mut); idr_remove(&ctx_idr, ctx->id); mutex_unlock(&mut); + flush_workqueue(file->close_wq); + /* At that step once ctx was marked as destroying and workqueue + * was flushed we are safe from any inflights handlers that + * might put other closing task. 
+ */ + if (!ctx->closing) + /* rdma_destroy_id ensures that no event handlers are + * inflight for that id before releasing it. + */ + rdma_destroy_id(ctx->cm_id); ucma_free_ctx(ctx); mutex_lock(&file->mut); } mutex_unlock(&file->mut); + destroy_workqueue(file->close_wq); kfree(file); return 0; } @@ -1371,11 +1851,11 @@ }; static struct miscdevice ucma_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "rdma_cm", + .minor = MISC_DYNAMIC_MINOR, + .name = "rdma_cm", .nodename = "infiniband/rdma_cm", .mode = 0666, - .fops = &ucma_fops, + .fops = &ucma_fops, }; static ssize_t show_abi_version(struct device *dev, @@ -1399,7 +1879,6 @@ printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n"); goto err1; } - return 0; err1: misc_deregister(&ucma_misc); Index: sys/ofed/drivers/infiniband/core/ud_header.c =================================================================== --- sys/ofed/drivers/infiniband/core/ud_header.c +++ sys/ofed/drivers/infiniband/core/ud_header.c @@ -35,6 +35,9 @@ #include #include #include +#include + +#include #include @@ -116,6 +119,68 @@ .size_bits = 16 } }; +static const struct ib_field ip4_table[] = { + { STRUCT_FIELD(ip4, ver_len), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, tos), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, tot_len), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, id), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, frag_off), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, ttl), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, protocol), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, check), + .offset_words = 2, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, saddr), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(ip4, daddr), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 32 } +}; + +static const struct ib_field udp_table[] = { + { STRUCT_FIELD(udp, sport), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, dport), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(udp, length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, csum), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -213,6 +278,26 @@ .size_bits = 24 } }; +u16 ib_ud_ip4_csum(struct ib_ud_header *header) +{ + struct ip iph; + + iph.ip_hl = 5; + iph.ip_v = 4; + iph.ip_tos = header->ip4.tos; + iph.ip_len = header->ip4.tot_len; + iph.ip_id = header->ip4.id; + iph.ip_off = header->ip4.frag_off; + iph.ip_ttl = header->ip4.ttl; + iph.ip_p = header->ip4.protocol; + iph.ip_sum = 0; + iph.ip_src.s_addr = header->ip4.saddr; + iph.ip_dst.s_addr = header->ip4.daddr; + + return in_cksum_hdr(&iph); +} +EXPORT_SYMBOL(ib_ud_ip4_csum); + /** * ib_ud_header_init - Initialize UD header structure * @payload_bytes:Length of packet payload * @eth_present: specify if Eth header is present * @vlan_present: packet is tagged vlan * @grh_present:GRH flag (if non-zero, GRH will be included) + * @ip_version:IP version (if non-zero, IP header, V4 or V6, will be included) + * @udp_present:UDP flag (if non-zero, UDP header will be included) + * @immediate_present: specify if
immediate data is present * @header:Structure to initialize */ -void ib_ud_header_init(int payload_bytes, - int lrh_present, - int eth_present, - int vlan_present, - int grh_present, - int immediate_present, - struct ib_ud_header *header) +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header) { + int ipv4_present; + int ipv6_present; + + grh_present = grh_present && !ip_version; memset(header, 0, sizeof *header); + /* + * UDP header without IP header doesn't make sense + */ + if (udp_present && ip_version != 4 && ip_version != 6) + return -EINVAL; + + ipv4_present = (ip_version == 4); + ipv6_present = (ip_version == 6); if (lrh_present) { - u16 packet_length = 0; + u16 packet_length; header->lrh.link_version = 0; header->lrh.link_next_header = @@ -250,18 +351,37 @@ } if (vlan_present) - header->eth.type = cpu_to_be16(ETH_P_8021Q); + header->eth.type = cpu_to_be16(ETH_P_8021Q); + + if (ipv6_present || grh_present) { + header->grh.ip_version = 6; + header->grh.payload_length = + cpu_to_be16((IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4 + /* ICRC */ + 3) & ~3); /* round up */ + } + + if (ipv4_present) { + int udp_bytes = udp_present ? IB_UDP_BYTES : 0; - if (grh_present) { - header->grh.ip_version = 6; - header->grh.payload_length = - cpu_to_be16((IB_BTH_BYTES + + header->ip4.ver_len = 0x45; /* version 4, 5 words */ + header->ip4.tot_len = + cpu_to_be16(IB_IP4_BYTES + + udp_bytes + + IB_BTH_BYTES + IB_DETH_BYTES + payload_bytes + - 4 + /* ICRC */ - 3) & ~3); /* round up */ - header->grh.next_header = 0x1b; + 4); /* ICRC */ } + if (udp_present && ip_version) + header->udp.length = + cpu_to_be16(IB_UDP_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ if (immediate_present) header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; @@ -273,8 +393,11 @@ header->lrh_present = lrh_present; header->eth_present = eth_present; header->vlan_present = vlan_present; - header->grh_present = grh_present; + header->grh_present = grh_present || ipv6_present; + header->ipv4_present = ipv4_present; + header->udp_present = udp_present; header->immediate_present = immediate_present; + return 0; } EXPORT_SYMBOL(ib_ud_header_init); @@ -311,6 +434,16 @@ &header->grh, buf + len); len += IB_GRH_BYTES; } + if (header->ipv4_present) { + ib_pack(ip4_table, ARRAY_SIZE(ip4_table), + &header->ip4, buf + len); + len += IB_IP4_BYTES; + } + if (header->udp_present) { + ib_pack(udp_table, ARRAY_SIZE(udp_table), + &header->udp, buf + len); + len += IB_UDP_BYTES; + } ib_pack(bth_table, ARRAY_SIZE(bth_table), &header->bth, buf + len); Index: sys/ofed/drivers/infiniband/core/umem.c =================================================================== --- sys/ofed/drivers/infiniband/core/umem.c +++ sys/ofed/drivers/infiniband/core/umem.c @@ -39,7 +39,7 @@ #include #include #include -#include + #include #include #include @@ -48,9 +48,6 @@ #define IB_UMEM_MAX_PAGE_CHUNK (PAGE_SIZE / sizeof (struct page *)) -static int allow_weak_ordering; -module_param_named(weak_ordering, allow_weak_ordering, int, 0444); -MODULE_PARM_DESC(weak_ordering, "Allow weak ordering for data registered memory"); static struct ib_umem *peer_umem_get(struct ib_peer_memory_client *ib_peer_mem, struct ib_umem *umem, unsigned long addr, @@ -65,15 +62,17 @@ invalidation_ctx = kzalloc(sizeof(*invalidation_ctx), GFP_KERNEL); if (!invalidation_ctx) { ret = -ENOMEM; - goto 
out; + goto end; } umem->invalidation_ctx = invalidation_ctx; invalidation_ctx->umem = umem; mutex_lock(&ib_peer_mem->lock); - invalidation_ctx->context_ticket = - ib_peer_insert_context(ib_peer_mem, invalidation_ctx); + ret = ib_peer_insert_context(ib_peer_mem, invalidation_ctx, + &invalidation_ctx->context_ticket); /* unlock before calling get pages to prevent a dead-lock from the callback */ mutex_unlock(&ib_peer_mem->lock); + if (ret) + goto end; } ret = peer_mem->get_pages(addr, umem->length, umem->writable, 1, @@ -121,15 +120,17 @@ if (invalidation_ctx) { ib_peer_remove_context(ib_peer_mem, invalidation_ctx->context_ticket); mutex_unlock(&umem->ib_peer_mem->lock); - kfree(invalidation_ctx); } +end: + if (invalidation_ctx) + kfree(invalidation_ctx); + ib_put_peer_client(ib_peer_mem, umem->peer_mem_client_context, umem->peer_mem_srcu_key); kfree(umem); return ERR_PTR(ret); } - static void peer_umem_release(struct ib_umem *umem) { struct ib_peer_memory_client *ib_peer_mem = umem->ib_peer_mem; @@ -195,23 +196,26 @@ object = NULL; if (umem->nmap > 0) ib_dma_unmap_sg(dev, umem->sg_head.sgl, - umem->nmap, - DMA_BIDIRECTIONAL); + umem->nmap, + DMA_BIDIRECTIONAL); + for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { page = sg_page(sg); - if (umem->writable && dirty) { - if (object && object != page->object) - VM_OBJECT_WUNLOCK(object); - if (object != page->object) { - object = page->object; - VM_OBJECT_WLOCK(object); - } - vm_page_dirty(page); + if (umem->writable && dirty) { + if (object && object != page->object) + VM_OBJECT_WUNLOCK(object); + if (object != page->object) { + object = page->object; + VM_OBJECT_WLOCK(object); } + vm_page_dirty(page); } + } + sg_free_table(&umem->sg_head); if (object) VM_OBJECT_WUNLOCK(object); + return; } @@ -229,8 +233,13 @@ return; } EXPORT_SYMBOL(ib_umem_activate_invalidation_notifier); + /** * ib_umem_get - Pin and DMA map userspace memory. + * + * If access flags indicate ODP memory, avoid pinning. Instead, stores + * the mm for future page fault handling in conjunction with MMU notifiers. + * * @context: userspace context to pin memory for * @addr: userspace virtual address to start at * @size: length of region to pin @@ -241,13 +250,12 @@ size_t size, int access, int dmasync, int invalidation_supported) { - struct ib_umem *umem; - struct proc *proc; + struct proc *proc; pmap_t pmap; - vm_offset_t end, last, start; - vm_size_t npages; - int error; + vm_offset_t end, last, start; + vm_size_t npages; + int error; int ret; int ents; int i; @@ -259,6 +267,9 @@ if (error) return ERR_PTR(-error); + if (!size) + return ERR_PTR(-EINVAL); + last = addr + size; start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. */ @@ -267,26 +278,35 @@ npages = atop(end - start); if (npages > vm_page_max_wired) return ERR_PTR(-ENOMEM); + + /* + * If the combination of the addr and size requested for this memory + * region causes an integer overflow, return error. 
+ */ + if (((addr + size) < addr) || + PAGE_ALIGN(addr + size) < (addr + size)) + return ERR_PTR(-EINVAL); + umem = kzalloc(sizeof *umem, GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); proc = curthread->td_proc; PROC_LOCK(proc); if (ptoa(npages + - pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > - lim_cur_proc(proc, RLIMIT_MEMLOCK)) { - PROC_UNLOCK(proc); - kfree(umem); - return ERR_PTR(-ENOMEM); + pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > + lim_cur_proc(proc, RLIMIT_MEMLOCK)) { + PROC_UNLOCK(proc); + kfree(umem); + return ERR_PTR(-ENOMEM); } - PROC_UNLOCK(proc); + PROC_UNLOCK(proc); if (npages + vm_cnt.v_wire_count > vm_page_max_wired) { kfree(umem); return ERR_PTR(-EAGAIN); } error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, - VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES | - (umem->writable ? VM_MAP_WIRE_WRITE : 0)); + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES | + (umem->writable ? VM_MAP_WIRE_WRITE : 0)); if (error != KERN_SUCCESS) { kfree(umem); return ERR_PTR(-ENOMEM); @@ -294,38 +314,40 @@ umem->context = context; umem->length = size; - umem->offset = addr & ~PAGE_MASK; + umem->offset = addr & ~PAGE_MASK; umem->page_size = PAGE_SIZE; - umem->start = addr; /* - * We ask for writable memory if any access flags other than - * "remote read" are set. "Local write" and "remote write" + * We ask for writable memory if any of the following + * access flags are set. "Local write" and "remote write" * obviously require write access. "Remote atomic" can do * things like fetch and add, which will modify memory, and * "MW bind" can change permissions by binding a window. */ - umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); + umem->writable = !!(access & + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); if (invalidation_supported || context->peer_mem_private_data) { struct ib_peer_memory_client *peer_mem_client; peer_mem_client = ib_get_peer_client(context, addr, size, - &umem->peer_mem_client_context, - &umem->peer_mem_srcu_key); + &umem->peer_mem_client_context, + &umem->peer_mem_srcu_key); if (peer_mem_client) return peer_umem_get(peer_mem_client, umem, addr, - dmasync, invalidation_supported); + dmasync, invalidation_supported); } umem->hugetlb = 0; pmap = vm_map_pmap(&proc->p_vmspace->vm_map); - if (npages == 0) { + + if (npages == 0 || npages > UINT_MAX) { ret = -EINVAL; - goto out; - } + goto out; + } ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); if (ret) @@ -348,7 +370,7 @@ goto out; } sg_set_page(sg, PHYS_TO_VM_PAGE(pa), - PAGE_SIZE, 0); + PAGE_SIZE, 0); npages--; start += PAGE_SIZE; } @@ -360,17 +382,17 @@ umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, umem->npages, - DMA_BIDIRECTIONAL, - &attrs); + DMA_BIDIRECTIONAL, + &attrs); if (umem->nmap != umem->npages) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; + } out: if (ret < 0) { if (need_release) - __ib_umem_release(context->device, umem, 0); + __ib_umem_release(context->device, umem, 0); kfree(umem); } @@ -417,12 +439,11 @@ addr = umem->start; size = umem->length; last = addr + size; - start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ + start = addr & PAGE_MASK; /* Use the linux PAGE_MASK definition. */ end = roundup2(last, PAGE_SIZE); /* Use PAGE_MASK safe operation. 
*/ vm_map_unwire(&curthread->td_proc->p_vmspace->vm_map, start, end, - VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); kfree(umem); - } EXPORT_SYMBOL(ib_umem_release); Index: sys/ofed/drivers/infiniband/core/user_mad.c =================================================================== --- sys/ofed/drivers/infiniband/core/user_mad.c +++ sys/ofed/drivers/infiniband/core/user_mad.c @@ -33,6 +33,8 @@ * SOFTWARE. */ +#define pr_fmt(fmt) "user_mad: " fmt + #include #include #include @@ -52,16 +54,22 @@ #include #include -MODULE_AUTHOR("Roland Dreier"); -MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); -MODULE_LICENSE("Dual BSD/GPL"); +/* InfiniBand userspace MAD packet access */ + +/* Need to change it to use TUNNABLE */ +static int enable_rx_threshold; +module_param(enable_rx_threshold, int, 0444); +MODULE_PARM_DESC(enable_rx_threshold, "Enable threshold for receive queue if non-zero (default=0)"); enum { IB_UMAD_MAX_PORTS = 64, IB_UMAD_MAX_AGENTS = 32, IB_UMAD_MAJOR = 231, - IB_UMAD_MINOR_BASE = 0 + IB_UMAD_MINOR_BASE = 0, + + IB_UMAD_RX_THRESHOLD = 10000, + IB_UMAD_RX_MANAGER_THRESHOLD = 100000 }; /* @@ -79,10 +87,10 @@ */ struct ib_umad_port { - struct cdev *cdev; + struct cdev cdev; struct device *dev; - struct cdev *sm_cdev; + struct cdev sm_cdev; struct device *sm_dev; struct semaphore sm_sem; @@ -93,20 +101,25 @@ struct ib_umad_device *umad_dev; int dev_num; u8 port_num; - struct list_head port_lst; }; struct ib_umad_device { int start_port, end_port; - struct kref ref; + struct kobject kobj; struct ib_umad_port port[0]; }; +struct counted_list { + struct list_head list; + int count; + int threshold; +}; + struct ib_umad_file { struct mutex mutex; struct ib_umad_port *port; struct file *filp; - struct list_head recv_list; + struct counted_list recv_list; struct list_head send_list; struct list_head port_list; spinlock_t send_lock; @@ -131,85 +144,21 @@ static DEFINE_SPINLOCK(port_lock); static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); -static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device); -static DEFINE_SPINLOCK(ports_list_lock); -static struct list_head ports_list; - - -static void remove_ports(struct kref *ref) -{ - int i; - struct ib_umad_port *p, *p1; - struct ib_umad_device *dev = - container_of(ref, struct ib_umad_device, ref); - - for (i = 0; i <= dev->end_port - dev->start_port; ++i) { - struct ib_umad_port *port = &dev->port[i]; - - list_for_each_entry_safe(p, p1, &ports_list, port_lst) - if (p == port) { - list_del(&p->port_lst); - break; - } - } -} - -static void put_umad_dev(struct kref *ref) +static void ib_umad_release_dev(struct kobject *kobj) { - int ret, i; struct ib_umad_device *dev = - container_of(ref, struct ib_umad_device, ref); + container_of(kobj, struct ib_umad_device, kobj); - spin_lock(&ports_list_lock); - ret = (kref_put(ref, remove_ports)); - spin_unlock(&ports_list_lock); - if (ret) { - for (i = 0; i <= dev->end_port - dev->start_port; ++i) { - if (dev->port[i].dev_num < IB_UMAD_MAX_PORTS) - clear_bit(dev->port[i].dev_num, dev_map); - else - clear_bit(dev->port[i].dev_num - IB_UMAD_MAX_PORTS, overflow_map); - cdev_del(dev->port[i].cdev); - cdev_del(dev->port[i].sm_cdev); - } kfree(dev); - } -} - -static void release_port(struct ib_umad_port *port) -{ - put_umad_dev(&port->umad_dev->ref); -} - - -static struct ib_umad_port *get_port(struct cdev *cdev) -{ - struct ib_umad_port *port; - - 
spin_lock(&ports_list_lock); - list_for_each_entry(port, &ports_list, port_lst) { - if (port->cdev == cdev || port->sm_cdev == cdev) { - kref_get(&port->umad_dev->ref); - spin_unlock(&ports_list_lock); - - return port; - } - } - spin_unlock(&ports_list_lock); - - return NULL; } -static void insert_port(struct ib_umad_port *port) -{ - spin_lock(&ports_list_lock); - list_add(&port->port_lst, &ports_list); - spin_unlock(&ports_list_lock); -} +static struct kobj_type ib_umad_dev_ktype = { + .release = ib_umad_release_dev, +}; static int hdr_size(struct ib_umad_file *file) { @@ -235,7 +184,7 @@ packet->mad.hdr.id < IB_UMAD_MAX_AGENTS; packet->mad.hdr.id++) if (agent == __get_agent(file, packet->mad.hdr.id)) { - list_add_tail(&packet->list, &file->recv_list); + list_add_tail(&packet->list, &file->recv_list.list); selwakeup(&file->filp->f_selinfo); wake_up_interruptible(&file->recv_wait); ret = 0; @@ -274,15 +223,56 @@ kfree(packet); } +static int get_mads_count(int packet_length, int hdr_len) +{ + int seg_len, data_len, mads_count; + + data_len = packet_length - hdr_len; + seg_len = sizeof(struct ib_mad) - hdr_len; + mads_count = (data_len - 1) / seg_len + 1; + + return mads_count; +} + +static int is_mad_rmpp(struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_rmpp_mad *rmpp_mad; + + rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad; + if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) && + (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { + return 1; + } + return 0; +} + static void recv_handler(struct ib_mad_agent *agent, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_umad_file *file = agent->context; struct ib_umad_packet *packet; + int mgmt_class; + int data_offset; + int drop = 0; + int mad_is_rmpp; if (mad_recv_wc->wc->status != IB_WC_SUCCESS) goto err1; + mad_is_rmpp = is_mad_rmpp(mad_recv_wc); + if (!agent->rmpp_version && mad_is_rmpp) + goto err1; + + mutex_lock(&file->mutex); +/*For now we accept all RMPPs packets, even though we crossed the threshold*/ + if (enable_rx_threshold && + !mad_is_rmpp && file->recv_list.count >= file->recv_list.threshold) + drop = 1; + mutex_unlock(&file->mutex); + + if (drop) + goto err1; + packet = kzalloc(sizeof *packet, GFP_KERNEL); if (!packet) goto err1; @@ -314,6 +304,13 @@ if (queue_packet(file, agent, packet)) goto err2; + + mgmt_class = mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class; + data_offset = ib_get_mad_data_offset(mgmt_class); + mutex_lock(&file->mutex); + file->recv_list.count += get_mads_count(packet->length, data_offset); + mutex_unlock(&file->mutex); + return; err2: @@ -403,20 +400,20 @@ mutex_lock(&file->mutex); - while (list_empty(&file->recv_list)) { + while (list_empty(&file->recv_list.list)) { mutex_unlock(&file->mutex); if (filp->f_flags & O_NONBLOCK) return -EAGAIN; if (wait_event_interruptible(file->recv_wait, - !list_empty(&file->recv_list))) + !list_empty(&file->recv_list.list))) return -ERESTARTSYS; mutex_lock(&file->mutex); } - packet = list_entry(file->recv_list.next, struct ib_umad_packet, list); + packet = list_entry(file->recv_list.list.next, struct ib_umad_packet, list); list_del(&packet->list); mutex_unlock(&file->mutex); @@ -429,11 +426,21 @@ if (ret < 0) { /* Requeue packet */ mutex_lock(&file->mutex); - list_add(&packet->list, &file->recv_list); + list_add(&packet->list, &file->recv_list.list); mutex_unlock(&file->mutex); } else { - if (packet->recv_wc) + if (packet->recv_wc) { + int mgmt_class; + int data_offset; + + mgmt_class = packet->recv_wc->recv_buf.mad->mad_hdr.mgmt_class; 
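+			/* Mirror the accounting done in recv_handler(): subtract the same number of MAD segments that get_mads_count() added when this packet was queued, so recv_list.count only counts data still waiting to be read. */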
+ data_offset = ib_get_mad_data_offset(mgmt_class); + mutex_lock(&file->mutex); + file->recv_list.count -= get_mads_count(packet->length, + data_offset); + mutex_unlock(&file->mutex); ib_free_recv_mad(packet->recv_wc); + } kfree(packet); } return ret; @@ -557,8 +564,8 @@ ah_attr.ah_flags = IB_AH_GRH; memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16); ah_attr.grh.sgid_index = packet->mad.hdr.gid_index; - ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label); - ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit; + ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label); + ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit; ah_attr.grh.traffic_class = packet->mad.hdr.traffic_class; } @@ -589,9 +596,9 @@ goto err_ah; } - packet->msg->ah = ah; + packet->msg->ah = ah; packet->msg->timeout_ms = packet->mad.hdr.timeout_ms; - packet->msg->retries = packet->mad.hdr.retries; + packet->msg->retries = packet->mad.hdr.retries; packet->msg->context[0] = packet; /* Copy MAD header. Any RMPP header is already in place. */ @@ -661,12 +668,29 @@ poll_wait(filp, &file->recv_wait, wait); - if (!list_empty(&file->recv_list)) + if (!list_empty(&file->recv_list.list)) mask |= POLLIN | POLLRDNORM; return mask; } +static void update_mgmt_threshold(struct ib_umad_file *file, struct ib_mad_reg_req req) +{ + int i; + + /*Update managers' class rx threshold*/ + for_each_set_bit(i, req.method_mask, IB_MGMT_MAX_METHODS) { + if (i == IB_MGMT_METHOD_GET || + i == IB_MGMT_METHOD_SET || + i == IB_MGMT_METHOD_REPORT || + i == IB_MGMT_METHOD_TRAP) { + file->recv_list.threshold = + IB_UMAD_RX_MANAGER_THRESHOLD; + break; + } + } +} + static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, int compat_method_mask) { @@ -680,6 +704,8 @@ mutex_lock(&file->mutex); if (!file->port->ib_dev) { + dev_info(file->port->dev, + "ib_umad_reg_agent: invalid device\n"); ret = -EPIPE; goto out; } @@ -690,6 +716,9 @@ } if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_info(file->port->dev, + "ib_umad_reg_agent: invalid QPN %d specified\n", + ureq.qpn); ret = -EINVAL; goto out; } @@ -698,11 +727,15 @@ if (!__get_agent(file, agent_id)) goto found; + dev_info(file->port->dev, + "ib_umad_reg_agent: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); ret = -ENOMEM; goto out; found: if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); req.mgmt_class = ureq.mgmt_class; req.mgmt_class_version = ureq.mgmt_class_version; memcpy(req.oui, ureq.oui, sizeof req.oui); @@ -717,6 +750,8 @@ } else memcpy(req.method_mask, ureq.method_mask, sizeof req.method_mask); + + update_mgmt_threshold(file, req); } agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, @@ -739,10 +774,11 @@ if (!file->already_used) { file->already_used = 1; if (!file->use_pkey_index) { - printk(KERN_WARNING "user_mad: process %s did not enable " - "P_Key index support.\n", curthread->td_proc->p_comm); - printk(KERN_WARNING "user_mad: Documentation/infiniband/user_mad.txt " - "has info on the new ABI.\n"); + dev_warn(file->port->dev, + "process %s did not enable P_Key index support.\n", + curthread->td_proc->p_comm); + dev_warn(file->port->dev, + " Documentation/infiniband/user_mad.txt has info on the new ABI.\n"); } } @@ -805,6 +841,33 @@ return ret; } +static long ib_umad_update_threshold(struct ib_umad_file *file, void __user + *arg) +{ + struct ib_user_mad_thresh_req ureq; + int ret = 0; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (!file->port->ib_dev) { + ret = -EPIPE; + goto out; + } + + if 
(copy_from_user(&ureq, arg, sizeof(ureq))) { + ret = -EFAULT; + goto out; + } + + file->recv_list.threshold = ureq.threshold; +out: + mutex_unlock(&file->mutex); + mutex_unlock(&file->port->file_mutex); + + return ret; +} + static long ib_umad_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -815,6 +878,8 @@ return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_UPDATE_THRESHOLD: + return ib_umad_update_threshold(filp->private_data, (void __user *)arg); default: return -ENOIOCTLCMD; } @@ -831,12 +896,21 @@ return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg)); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_UPDATE_THRESHOLD: + return ib_umad_update_threshold(filp->private_data, compat_ptr(arg)); default: return -ENOIOCTLCMD; } } #endif +static void init_recv_list(struct counted_list *recv_list) +{ + INIT_LIST_HEAD(&recv_list->list); + recv_list->count = 0; + recv_list->threshold = IB_UMAD_RX_THRESHOLD; +} + /* * ib_umad_open() does not need the BKL: * @@ -850,30 +924,23 @@ { struct ib_umad_port *port; struct ib_umad_file *file; - int ret; + int ret = -ENXIO; - port = get_port(inode->i_cdev->si_drv1); - if (!port) - return -ENXIO; + port = container_of(inode->i_cdev->si_drv1, struct ib_umad_port, cdev); mutex_lock(&port->file_mutex); - if (!port->ib_dev) { - release_port(port); - ret = -ENXIO; + if (!port->ib_dev) goto out; - } + ret = -ENOMEM; file = kzalloc(sizeof *file, GFP_KERNEL); - if (!file) { - release_port(port); - ret = -ENOMEM; + if (!file) goto out; - } mutex_init(&file->mutex); spin_lock_init(&file->send_lock); - INIT_LIST_HEAD(&file->recv_list); + init_recv_list(&file->recv_list); INIT_LIST_HEAD(&file->send_list); init_waitqueue_head(&file->recv_wait); @@ -884,6 +951,13 @@ list_add_tail(&file->port_list, &port->file_list); ret = nonseekable_open(inode, filp); + if (ret) { + list_del(&file->port_list); + kfree(file); + goto out; + } + + kobject_get(&port->umad_dev->kobj); out: mutex_unlock(&port->file_mutex); @@ -893,7 +967,7 @@ static int ib_umad_close(struct inode *inode, struct file *filp) { struct ib_umad_file *file = filp->private_data; - struct ib_umad_port *port = file->port; + struct ib_umad_device *dev = file->port->umad_dev; struct ib_umad_packet *packet, *tmp; int already_dead; int i; @@ -904,7 +978,7 @@ already_dead = file->agents_dead; file->agents_dead = 1; - list_for_each_entry_safe(packet, tmp, &file->recv_list, list) { + list_for_each_entry_safe(packet, tmp, &file->recv_list.list, list) { if (packet->recv_wc) ib_free_recv_mad(packet->recv_wc); kfree(packet); @@ -922,21 +996,21 @@ mutex_unlock(&file->port->file_mutex); kfree(file); - release_port(port); + kobject_put(&dev->kobj); return 0; } static const struct file_operations umad_fops = { - .owner = THIS_MODULE, - .read = ib_umad_read, - .write = ib_umad_write, - .poll = ib_umad_poll, + .owner = THIS_MODULE, + .read = ib_umad_read, + .write = ib_umad_write, + .poll = ib_umad_poll, .unlocked_ioctl = ib_umad_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = ib_umad_compat_ioctl, + .compat_ioctl = ib_umad_compat_ioctl, #endif - .open = ib_umad_open, + .open = ib_umad_open, .release = ib_umad_close, .llseek = no_llseek, }; @@ -949,9 +1023,7 @@ }; int ret; - port = get_port(inode->i_cdev->si_drv1); - if (!port) - return -ENXIO; + port = container_of(inode->i_cdev->si_drv1, struct ib_umad_port, sm_cdev); if (filp->f_flags & 
O_NONBLOCK) { if (down_trylock(&port->sm_sem)) { @@ -966,17 +1038,27 @@ } ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); - if (ret) { - up(&port->sm_sem); - goto fail; - } + if (ret) + goto err_up_sem; filp->private_data = port; - return nonseekable_open(inode, filp); + ret = nonseekable_open(inode, filp); + if (ret) + goto err_clr_sm_cap; + + kobject_get(&port->umad_dev->kobj); + + return 0; + +err_clr_sm_cap: + swap(props.set_port_cap_mask, props.clr_port_cap_mask); + ib_modify_port(port->ib_dev, port->port_num, 0, &props); + +err_up_sem: + up(&port->sm_sem); fail: - release_port(port); return ret; } @@ -995,14 +1077,14 @@ up(&port->sm_sem); - release_port(port); + kobject_put(&port->umad_dev->kobj); return ret; } static const struct file_operations umad_sm_fops = { - .owner = THIS_MODULE, - .open = ib_umad_sm_open, + .owner = THIS_MODULE, + .open = ib_umad_sm_open, .release = ib_umad_sm_close, .llseek = no_llseek, }; @@ -1039,12 +1121,13 @@ static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION); + return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION); } static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); static dev_t overflow_maj; -static int find_overflow_devnum(void) +static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); +static int find_overflow_devnum(struct ib_device *device) { int ret; @@ -1052,7 +1135,8 @@ ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2, "infiniband_mad"); if (ret) { - printk(KERN_ERR "user_mad: couldn't register dynamic device number\n"); + dev_err(&device->dev, + "couldn't register dynamic device number\n"); return ret; } } @@ -1065,6 +1149,7 @@ } static int ib_umad_init_port(struct ib_device *device, int port_num, + struct ib_umad_device *umad_dev, struct ib_umad_port *port) { int devnum; @@ -1074,9 +1159,9 @@ devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); if (devnum >= IB_UMAD_MAX_PORTS) { spin_unlock(&port_lock); - devnum = find_overflow_devnum(); + devnum = find_overflow_devnum(device); if (devnum < 0) - return -1; + return -1; spin_lock(&port_lock); port->dev_num = devnum + IB_UMAD_MAX_PORTS; @@ -1095,18 +1180,15 @@ mutex_init(&port->file_mutex); INIT_LIST_HEAD(&port->file_list); - port->cdev = cdev_alloc(); - if (!port->cdev) - goto err_cdev_c; - - port->cdev->ops = &umad_fops; - port->cdev->owner = THIS_MODULE; - kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num); - if (cdev_add(port->cdev, base, 1)) + cdev_init(&port->cdev, &umad_fops); + port->cdev.owner = THIS_MODULE; + port->cdev.kobj.parent = &umad_dev->kobj; + kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); + if (cdev_add(&port->cdev, base, 1)) goto err_cdev; port->dev = device_create(umad_class, device->dma_device, - port->cdev->dev, port, + port->cdev.dev, port, "umad%d", port->dev_num); if (IS_ERR(port->dev)) goto err_cdev; @@ -1117,18 +1199,15 @@ goto err_dev; base += IB_UMAD_MAX_PORTS; - port->sm_cdev = cdev_alloc(); - if (!port->sm_cdev) - goto err_dev; - - port->sm_cdev->ops = &umad_sm_fops; - port->sm_cdev->owner = THIS_MODULE; - kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num); - if (cdev_add(port->sm_cdev, base, 1)) + cdev_init(&port->sm_cdev, &umad_sm_fops); + port->sm_cdev.owner = THIS_MODULE; + port->sm_cdev.kobj.parent = &umad_dev->kobj; + kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); + if (cdev_add(&port->sm_cdev, base, 1)) goto err_sm_cdev; port->sm_dev = 
device_create(umad_class, device->dma_device, - port->sm_cdev->dev, port, + port->sm_cdev.dev, port, "issm%d", port->dev_num); if (IS_ERR(port->sm_dev)) goto err_sm_cdev; @@ -1141,17 +1220,16 @@ return 0; err_sm_dev: - device_destroy(umad_class, port->sm_cdev->dev); + device_destroy(umad_class, port->sm_cdev.dev); err_sm_cdev: - cdev_del(port->sm_cdev); + cdev_del(&port->sm_cdev); err_dev: - device_destroy(umad_class, port->cdev->dev); + device_destroy(umad_class, port->cdev.dev); err_cdev: - cdev_del(port->cdev); -err_cdev_c: + cdev_del(&port->cdev); if (port->dev_num < IB_UMAD_MAX_PORTS) clear_bit(devnum, dev_map); else @@ -1168,8 +1246,11 @@ dev_set_drvdata(port->dev, NULL); dev_set_drvdata(port->sm_dev, NULL); - device_destroy(umad_class, port->cdev->dev); - device_destroy(umad_class, port->sm_cdev->dev); + device_destroy(umad_class, port->cdev.dev); + device_destroy(umad_class, port->sm_cdev.dev); + + cdev_del(&port->cdev); + cdev_del(&port->sm_cdev); mutex_lock(&port->file_mutex); @@ -1186,6 +1267,11 @@ } mutex_unlock(&port->file_mutex); + + if (port->dev_num < IB_UMAD_MAX_PORTS) + clear_bit(port->dev_num, dev_map); + else + clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map); } static void ib_umad_add_one(struct ib_device *device) @@ -1209,19 +1295,17 @@ if (!umad_dev) return; - kref_init(&umad_dev->ref); + kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype); umad_dev->start_port = s; umad_dev->end_port = e; - for (i = 0; i <= e - s; ++i) - insert_port(&umad_dev->port[i]); - for (i = s; i <= e; ++i) { umad_dev->port[i - s].umad_dev = umad_dev; - if (ib_umad_init_port(device, i, &umad_dev->port[i - s])) - goto err; + if (ib_umad_init_port(device, i, umad_dev, + &umad_dev->port[i - s])) + goto err; } ib_set_client_data(device, &umad_client, umad_dev); @@ -1230,9 +1314,9 @@ err: while (--i >= s) - ib_umad_kill_port(&umad_dev->port[i - s]); + ib_umad_kill_port(&umad_dev->port[i - s]); - put_umad_dev(&umad_dev->ref); + kobject_put(&umad_dev->kobj); } static void ib_umad_remove_one(struct ib_device *device) @@ -1244,9 +1328,9 @@ return; for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) - ib_umad_kill_port(&umad_dev->port[i]); + ib_umad_kill_port(&umad_dev->port[i]); - put_umad_dev(&umad_dev->ref); + kobject_put(&umad_dev->kobj); } static char *umad_devnode(struct device *dev, umode_t *mode) @@ -1258,19 +1342,17 @@ { int ret; - INIT_LIST_HEAD(&ports_list); - ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2, "infiniband_mad"); if (ret) { - printk(KERN_ERR "user_mad: couldn't register device number\n"); + pr_err("couldn't register device number\n"); goto out; } umad_class = class_create(THIS_MODULE, "infiniband_mad"); if (IS_ERR(umad_class)) { ret = PTR_ERR(umad_class); - printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n"); + pr_err("couldn't create class infiniband_mad\n"); goto out_chrdev; } @@ -1278,13 +1360,13 @@ ret = class_create_file(umad_class, &class_attr_abi_version); if (ret) { - printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n"); + pr_err("couldn't create abi_version attribute\n"); goto out_class; } ret = ib_register_client(&umad_client); if (ret) { - printk(KERN_ERR "user_mad: couldn't register ib_umad client\n"); + pr_err("couldn't register ib_umad client\n"); goto out_class; } @@ -1309,5 +1391,5 @@ unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2); } -module_init(ib_umad_init); +module_init_order(ib_umad_init, SI_ORDER_THIRD); module_exit(ib_umad_cleanup); Index: 
sys/ofed/drivers/infiniband/core/uverbs.h =================================================================== --- sys/ofed/drivers/infiniband/core/uverbs.h +++ sys/ofed/drivers/infiniband/core/uverbs.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,18 @@ #include #include +#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \ + (udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL; \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + +enum uverbs_flags { + UVERBS_FLAG_DISASSOCIATE = 1 +}; + /* * Our lifetime rules for these structs are the following: * @@ -81,11 +94,17 @@ struct cdev cdev; struct rb_root xrcd_tree; struct mutex xrcd_tree_mutex; + struct mutex disassociate_mutex; /* protect lists of files. */ + int disassociated; + u32 flags; + struct srcu_struct disassociate_srcu; + struct list_head uverbs_file_list; + struct list_head uverbs_events_file_list; }; struct ib_uverbs_event_file { struct kref ref; - struct file *filp; + struct file *filp; int is_async; struct ib_uverbs_file *uverbs_file; spinlock_t lock; @@ -93,6 +112,7 @@ wait_queue_head_t poll_wait; struct fasync_struct *async_queue; struct list_head event_list; + struct list_head list; }; struct ib_uverbs_file { @@ -102,6 +122,7 @@ struct ib_ucontext *ucontext; struct ib_event_handler event_handler; struct ib_uverbs_event_file *async_file; + struct list_head list; }; struct ib_uverbs_event { @@ -152,7 +173,7 @@ }; struct ib_udct_object { - struct ib_uobject uobject; + struct ib_uevent_object uevent; }; extern spinlock_t ib_uverbs_idr_lock; @@ -166,6 +187,8 @@ extern struct idr ib_uverbs_xrcd_idr; extern struct idr ib_uverbs_rule_idr; extern struct idr ib_uverbs_dct_idr; +extern struct idr ib_uverbs_wq_idr; +extern struct idr ib_uverbs_rwq_ind_tbl_idr; void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); @@ -186,6 +209,7 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +void ib_uverbs_dct_event_handler(struct ib_event *event, void *context_ptr); struct ib_uverbs_flow_spec { union { @@ -215,6 +239,7 @@ IB_UVERBS_DECLARE_CMD(alloc_pd); IB_UVERBS_DECLARE_CMD(dealloc_pd); IB_UVERBS_DECLARE_CMD(reg_mr); +IB_UVERBS_DECLARE_CMD(rereg_mr); IB_UVERBS_DECLARE_CMD(dereg_mr); IB_UVERBS_DECLARE_CMD(alloc_mw); IB_UVERBS_DECLARE_CMD(dealloc_mw); @@ -245,17 +270,18 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); #define IB_UVERBS_DECLARE_EX_CMD(name) \ - int ib_uverbs_ex_##name(struct ib_uverbs_file *file,\ - struct ib_udata *ucore, \ + int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ + struct ib_udata *ucore, \ struct ib_udata *uhw) -#define IB_UVERBS_DECLARE_EXP_CMD(name) \ - ssize_t ib_uverbs_exp_##name(struct ib_uverbs_file *file, \ - struct ib_udata *ucore, \ - struct ib_udata *uhw) - IB_UVERBS_DECLARE_EX_CMD(create_flow); IB_UVERBS_DECLARE_EX_CMD(destroy_flow); +IB_UVERBS_DECLARE_EX_CMD(query_device); + +#define IB_UVERBS_DECLARE_EXP_CMD(name) \ + int ib_uverbs_exp_##name(struct ib_uverbs_file *file, \ + struct ib_udata *ucore, \ + struct ib_udata *uhw) IB_UVERBS_DECLARE_EXP_CMD(create_qp); IB_UVERBS_DECLARE_EXP_CMD(modify_cq); @@ -265,5 +291,17 @@ IB_UVERBS_DECLARE_EXP_CMD(create_dct); IB_UVERBS_DECLARE_EXP_CMD(destroy_dct); IB_UVERBS_DECLARE_EXP_CMD(query_dct); +IB_UVERBS_DECLARE_EXP_CMD(arm_dct); +IB_UVERBS_DECLARE_EXP_CMD(create_mr); 
+IB_UVERBS_DECLARE_EXP_CMD(query_mkey); +IB_UVERBS_DECLARE_EXP_CMD(reg_mr_ex); +IB_UVERBS_DECLARE_EXP_CMD(prefetch_mr); +IB_UVERBS_DECLARE_EXP_CMD(rereg_mr); +IB_UVERBS_DECLARE_EXP_CMD(create_wq); +IB_UVERBS_DECLARE_EXP_CMD(modify_wq); +IB_UVERBS_DECLARE_EXP_CMD(destroy_wq); +IB_UVERBS_DECLARE_EXP_CMD(create_rwq_ind_table); +IB_UVERBS_DECLARE_EXP_CMD(destroy_rwq_ind_table); +IB_UVERBS_DECLARE_EXP_CMD(create_flow); #endif /* UVERBS_H */ Index: sys/ofed/drivers/infiniband/core/uverbs_cmd.c =================================================================== --- sys/ofed/drivers/infiniband/core/uverbs_cmd.c +++ sys/ofed/drivers/infiniband/core/uverbs_cmd.c @@ -39,21 +39,24 @@ #include #include #include +#include #include #include #include +#include #include #include #include #include "uverbs.h" +#include "core_priv.h" static int disable_raw_qp_enforcement; module_param_named(disable_raw_qp_enforcement, disable_raw_qp_enforcement, int, - 0444); -MODULE_PARM_DESC(disable_raw_qp_enforcement, "Disable RAW QP enforcement for " - "being opened by root (default: 0)"); + 0444); +MODULE_PARM_DESC(disable_raw_qp_enforcement, "Disable RAW QP enforcement for " + "being opened by root (default: 0)"); struct uverbs_lock_class { struct lock_class_key key; @@ -69,6 +72,9 @@ static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; static struct uverbs_lock_class dct_lock_class = { .name = "DCT-uobj" }; +static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; +static struct uverbs_lock_class wq_lock_class = { .name = "WQ-uobj" }; +static struct uverbs_lock_class rwq_ind_table_lock_class = { .name = "IND_TBL-uobj" }; static int uverbs_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { @@ -92,11 +98,13 @@ (udata)->outbuf = (void __user *) (obuf); \ (udata)->inlen = (ilen); \ (udata)->outlen = (olen); \ + (udata)->src = IB_UDATA_LEGACY_CMD; \ } while (0) enum uverbs_cmd_type { IB_USER_VERBS_CMD_BASIC, - IB_USER_VERBS_CMD_EXTENDED + IB_USER_VERBS_CMD_EXTENDED, + IB_USER_VERBS_CMD_EXP }; /* @@ -283,6 +291,27 @@ return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0); } +static struct ib_wq *idr_read_wq(int wq_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_wq_idr, wq_handle, context, 0); +} + +static void put_wq_read(struct ib_wq *wq) +{ + put_uobj_read(wq->uobject); +} + +static struct ib_rwq_ind_table *idr_read_rwq_indirection_table(int ind_table_handle, + struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_rwq_ind_tbl_idr, ind_table_handle, context, 0); +} + +static void put_rwq_indirection_table_read(struct ib_rwq_ind_table *ind_table) +{ + put_uobj_read(ind_table->uobject); +} + static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context) { struct ib_uobject *uobj; @@ -333,6 +362,16 @@ put_uobj_read(uobj); } +static struct ib_mr *idr_read_mr(int mr_handle, struct ib_ucontext *context) +{ + return idr_read_obj(&ib_uverbs_mr_idr, mr_handle, context, 0); +} + +static void put_mr_read(struct ib_mr *mr) +{ + put_uobj_read(mr->uobject); +} + ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -376,19 +415,19 @@ INIT_LIST_HEAD(&ucontext->qp_list); INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); + INIT_LIST_HEAD(&ucontext->wq_list); + INIT_LIST_HEAD(&ucontext->rwq_ind_tbl_list); INIT_LIST_HEAD(&ucontext->xrcd_list); INIT_LIST_HEAD(&ucontext->rule_list); 
INIT_LIST_HEAD(&ucontext->dct_list); ucontext->closing = 0; - ucontext->peer_mem_private_data = NULL; - ucontext->peer_mem_name = NULL; resp.num_comp_vectors = file->device->num_comp_vectors; - ret = get_unused_fd(); + ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) goto err_free; - resp.async_fd = ret; + resp.async_fd = ret; filp = ib_uverbs_alloc_event_file(file, 1); if (IS_ERR(filp)) { @@ -396,6 +435,12 @@ goto err_fd; } + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_file; + } + file->async_file = filp->private_data; INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev, @@ -404,11 +449,6 @@ if (ret) goto err_file; - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) { - ret = -EFAULT; - goto err_file; - } kref_get(&file->async_file->ref); kref_get(&file->ref); file->ucontext = ucontext; @@ -433,53 +473,50 @@ return ret; } -static void ib_uverbs_query_device_assign( - struct ib_uverbs_query_device_resp *resp, - struct ib_device_attr *attr, - struct ib_uverbs_file *file) +static void copy_query_dev_fields(struct ib_uverbs_file *file, + struct ib_uverbs_query_device_resp *resp, + struct ib_device_attr *attr) { - memset(resp, 0, sizeof(*resp)); - - resp->fw_ver = attr->fw_ver; - resp->node_guid = file->device->ib_dev->node_guid; - resp->sys_image_guid = attr->sys_image_guid; - resp->max_mr_size = attr->max_mr_size; - resp->page_size_cap = attr->page_size_cap; - resp->vendor_id = attr->vendor_id; - resp->vendor_part_id = attr->vendor_part_id; - resp->hw_ver = attr->hw_ver; - resp->max_qp = attr->max_qp; - resp->max_qp_wr = attr->max_qp_wr; - resp->device_cap_flags = attr->device_cap_flags; - resp->max_sge = attr->max_sge; - resp->max_sge_rd = attr->max_sge_rd; - resp->max_cq = attr->max_cq; - resp->max_cqe = attr->max_cqe; - resp->max_mr = attr->max_mr; - resp->max_pd = attr->max_pd; - resp->max_qp_rd_atom = attr->max_qp_rd_atom; - resp->max_ee_rd_atom = attr->max_ee_rd_atom; - resp->max_res_rd_atom = attr->max_res_rd_atom; - resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; - resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; - resp->atomic_cap = attr->atomic_cap; - resp->max_ee = attr->max_ee; - resp->max_rdd = attr->max_rdd; - resp->max_mw = attr->max_mw; - resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; - resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; - resp->max_mcast_grp = attr->max_mcast_grp; - resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; - resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; - resp->max_ah = attr->max_ah; - resp->max_fmr = attr->max_fmr; - resp->max_map_per_fmr = attr->max_map_per_fmr; - resp->max_srq = attr->max_srq; - resp->max_srq_wr = attr->max_srq_wr; - resp->max_srq_sge = attr->max_srq_sge; - resp->max_pkeys = attr->max_pkeys; - resp->local_ca_ack_delay = attr->local_ca_ack_delay; - resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; + resp->fw_ver = attr->fw_ver; + resp->node_guid = file->device->ib_dev->node_guid; + resp->sys_image_guid = attr->sys_image_guid; + resp->max_mr_size = attr->max_mr_size; + resp->page_size_cap = attr->page_size_cap; + resp->vendor_id = attr->vendor_id; + resp->vendor_part_id = attr->vendor_part_id; + resp->hw_ver = attr->hw_ver; + resp->max_qp = attr->max_qp; + resp->max_qp_wr = attr->max_qp_wr; + resp->device_cap_flags = attr->device_cap_flags; + resp->max_sge = attr->max_sge; + resp->max_sge_rd = attr->max_sge_rd; + resp->max_cq = attr->max_cq; + resp->max_cqe = 
attr->max_cqe; + resp->max_mr = attr->max_mr; + resp->max_pd = attr->max_pd; + resp->max_qp_rd_atom = attr->max_qp_rd_atom; + resp->max_ee_rd_atom = attr->max_ee_rd_atom; + resp->max_res_rd_atom = attr->max_res_rd_atom; + resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; + resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; + resp->atomic_cap = attr->atomic_cap; + resp->max_ee = attr->max_ee; + resp->max_rdd = attr->max_rdd; + resp->max_mw = attr->max_mw; + resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; + resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; + resp->max_mcast_grp = attr->max_mcast_grp; + resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; + resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; + resp->max_ah = attr->max_ah; + resp->max_fmr = attr->max_fmr; + resp->max_map_per_fmr = attr->max_map_per_fmr; + resp->max_srq = attr->max_srq; + resp->max_srq_wr = attr->max_srq_wr; + resp->max_srq_sge = attr->max_srq_sge; + resp->max_pkeys = attr->max_pkeys; + resp->local_ca_ack_delay = attr->local_ca_ack_delay; + resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; } ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, @@ -501,10 +538,18 @@ if (ret) return ret; - ib_uverbs_query_device_assign(&resp, &attr, file); + memset(&resp, 0, sizeof resp); - if (copy_to_user((void __user *)(unsigned long) cmd.response, - &resp, sizeof(resp))) + copy_query_dev_fields(file, &resp, &attr); + + if (resp.atomic_cap > IB_ATOMIC_GLOB) + resp.atomic_cap = IB_ATOMIC_NONE; + + resp.device_cap_flags &= ~IB_EXP_DEVICE_MASK; + + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) return -EFAULT; return in_len; @@ -755,12 +800,12 @@ } ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_open_xrcd cmd; struct ib_uverbs_open_xrcd_resp resp; - struct ib_udata udata; + struct ib_udata udata; struct ib_uxrcd_object *obj; struct ib_xrcd *xrcd = NULL; struct fd f = {NULL}; @@ -776,7 +821,7 @@ INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + in_len - sizeof cmd, out_len - sizeof resp); mutex_lock(&file->device->xrcd_tree_mutex); @@ -797,7 +842,7 @@ } if (xrcd && cmd.oflags & O_EXCL) { - ret = -EINVAL; + ret = -EINVAL; goto err_tree_mutex_unlock; } } @@ -892,11 +937,11 @@ } ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_close_xrcd cmd; - struct ib_uobject *uobj; + struct ib_uobject *uobj; struct ib_xrcd *xrcd = NULL; struct inode *inode = NULL; struct ib_uxrcd_object *obj; @@ -924,8 +969,8 @@ if (!inode || atomic_dec_and_test(&xrcd->usecnt)) { ret = ib_dealloc_xrcd(uobj->object); - if (!ret) - uobj->live = 0; + if (!ret) + uobj->live = 0; } live = uobj->live; @@ -968,17 +1013,28 @@ xrcd_table_delete(dev, inode); } +#define KEEP_ACCESS_FLAGS (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | \ + IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_ATOMIC | \ + IB_ACCESS_MW_BIND) +static int translate_exp_access_flags(u64 exp_access_flags) +{ + int access_flags = exp_access_flags & KEEP_ACCESS_FLAGS; + if (exp_access_flags & IB_UVERBS_EXP_ACCESS_MW_ZERO_BASED) + access_flags |= IB_ZERO_BASED; + return access_flags; +} + ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int 
out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_reg_mr cmd; struct ib_uverbs_reg_mr_resp resp; - struct ib_udata udata; + struct ib_udata udata; struct ib_uobject *uobj; struct ib_pd *pd; struct ib_mr *mr; - int ret; + int ret; if (out_len < sizeof resp) return -ENOSPC; @@ -1006,9 +1062,10 @@ pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { - ret = -EINVAL; + ret = -EINVAL; goto err_free; } + /* We first get a new "obj id" to be passed later to reg mr for further use as mr_id. */ @@ -1020,7 +1077,7 @@ cmd.access_flags, &udata, uobj->id); if (IS_ERR(mr)) { ret = PTR_ERR(mr); - goto err_remove_uobj; + goto err_put; } mr->device = pd->device; @@ -1055,11 +1112,10 @@ return in_len; err_copy: - ib_dereg_mr(mr); - -err_remove_uobj: idr_remove_uobj(&ib_uverbs_mr_idr, uobj); + ib_dereg_mr(mr); + err_put: put_pd_read(pd); @@ -1068,14 +1124,195 @@ return ret; } +int ib_uverbs_exp_rereg_mr(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_exp_rereg_mr cmd; + struct ib_uverbs_exp_rereg_mr_resp resp; + struct ib_pd *pd = NULL; + struct ib_mr *mr; + struct ib_pd *old_pd; + int ret; + struct ib_uobject *uobj; + + if (ucore->outlen < sizeof(resp)) + return -ENOSPC; + + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.flags & ~IB_EXP_MR_REREG_SUPPORTED || !cmd.flags) + return -EINVAL; + + if ((cmd.flags & IB_EXP_MR_REREG_TRANS) && + (!cmd.start || !cmd.hca_va || 0 >= cmd.length || + (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) + return -EINVAL; + + uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, + file->ucontext); + + if (!uobj) + return -EINVAL; + + mr = uobj->object; + + if (cmd.flags & IB_EXP_MR_REREG_ACCESS) { + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + goto put_uobjs; + } + + if (cmd.flags & IB_EXP_MR_REREG_PD) { + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto put_uobjs; + } + } + + if (atomic_read(&mr->usecnt)) { + ret = -EBUSY; + goto put_uobj_pd; + } + + old_pd = mr->pd; + ret = mr->device->exp_rereg_user_mr(mr, cmd.flags, cmd.start, + cmd.length, cmd.hca_va, + cmd.access_flags, pd); + if (!ret) { + if (cmd.flags & IB_EXP_MR_REREG_PD) { + atomic_inc(&pd->usecnt); + mr->pd = pd; + atomic_dec(&old_pd->usecnt); + } + } else { + goto put_uobj_pd; + } + + memset(&resp, 0, sizeof(resp)); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + + ret = ib_copy_to_udata(ucore, + &resp, sizeof(resp)); +put_uobj_pd: + if (cmd.flags & IB_EXP_MR_REREG_PD) + put_pd_read(pd); + +put_uobjs: + + put_uobj_write(mr->uobject); + + return ret; +} + +ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_rereg_mr cmd; + struct ib_uverbs_rereg_mr_resp resp; + struct ib_udata udata; + struct ib_pd *pd = NULL; + struct ib_mr *mr; + struct ib_pd *old_pd; + int ret; + struct ib_uobject *uobj; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd), out_len - sizeof(resp)); + + if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) + return -EINVAL; + + if ((cmd.flags & IB_MR_REREG_TRANS) && + (!cmd.start || !cmd.hca_va || 0 >= cmd.length || + (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) + return -EINVAL; + + uobj = 
idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, + file->ucontext); + + if (!uobj) + return -EINVAL; + + mr = uobj->object; + + if (cmd.flags & IB_MR_REREG_ACCESS) { + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + goto put_uobjs; + } + + if (cmd.flags & IB_MR_REREG_PD) { + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto put_uobjs; + } + } + + if (atomic_read(&mr->usecnt)) { + ret = -EBUSY; + goto put_uobj_pd; + } + + old_pd = mr->pd; + ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start, + cmd.length, cmd.hca_va, + cmd.access_flags, pd, &udata); + if (!ret) { + if (cmd.flags & IB_MR_REREG_PD) { + atomic_inc(&pd->usecnt); + mr->pd = pd; + atomic_dec(&old_pd->usecnt); + } + } else { + goto put_uobj_pd; + } + + memset(&resp, 0, sizeof(resp)); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; + else + ret = in_len; + +put_uobj_pd: + if (cmd.flags & IB_MR_REREG_PD) + put_pd_read(pd); + +put_uobjs: + + put_uobj_write(mr->uobject); + + return ret; +} + ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_dereg_mr cmd; struct ib_mr *mr; struct ib_uobject *uobj; - int ret = -EINVAL; + int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -1107,8 +1344,8 @@ } ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_alloc_mw cmd; struct ib_uverbs_alloc_mw_resp resp; @@ -1189,13 +1426,13 @@ } ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_dealloc_mw cmd; struct ib_mw *mw; - struct ib_uobject *uobj; - int ret = -EINVAL; + struct ib_uobject *uobj; + int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; @@ -1226,67 +1463,195 @@ return in_len; } -ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +int ib_uverbs_exp_reg_mr_ex(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) { - struct ib_uverbs_create_comp_channel cmd; - struct ib_uverbs_create_comp_channel_resp resp; - struct file *filp; - int ret; - - if (out_len < sizeof resp) - return -ENOSPC; - - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; + struct ib_uverbs_exp_reg_mr_ex cmd; + struct ib_uverbs_exp_reg_mr_resp_ex resp; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_mr *mr; + int access_flags; + int ret; + const int min_cmd_size = offsetof(typeof(cmd), comp_mask) + + sizeof(cmd.comp_mask); + + if (ucore->inlen < min_cmd_size) { + pr_debug("ib_uverbs_reg_mr: command input length too short\n"); + return -EINVAL; + } - ret = get_unused_fd(); - if (ret < 0) + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) return ret; - resp.fd = ret; - filp = ib_uverbs_alloc_event_file(file, 0); - if (IS_ERR(filp)) { - put_unused_fd(resp.fd); - return PTR_ERR(filp); + if (cmd.comp_mask >= IB_UVERBS_EXP_REG_MR_EX_RESERVED) { + pr_debug("ib_uverbs_reg_mr: invalid bit in command comp_mask field\n"); + return -EINVAL; } - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) { - put_unused_fd(resp.fd); - fput(filp); 
- return -EFAULT; + if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) { + pr_debug("ib_uverbs_reg_mr: HCA virtual address doesn't match host address\n"); + return -EINVAL; } - fd_install(resp.fd, filp); - return in_len; -} - -static ssize_t create_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len, void *vcmd, int ex, - void __user *response) -{ - struct ib_uverbs_create_cq *cmd; - struct ib_uverbs_create_cq_ex *cmd_e; - struct ib_uverbs_create_cq_resp resp; - struct ib_udata udata; - struct ib_ucq_object *obj; - struct ib_uverbs_event_file *ev_file = NULL; - struct ib_cq *cq; - struct ib_cq_init_attr attr; - int cmd_sz; - int ret; + ret = ib_check_mr_access(cmd.exp_access_flags); + if (ret) + return ret; - if (out_len < sizeof resp) - return -ENOSPC; + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) + return -ENOMEM; - cmd = vcmd; - cmd_e = vcmd; - cmd_sz = ex ? sizeof(*cmd_e) : sizeof(*cmd); - INIT_UDATA(&udata, buf + cmd_sz, response + sizeof(resp), - in_len - sizeof(cmd), out_len - sizeof(resp)); + init_uobj(uobj, 0, file->ucontext, &mr_lock_class); + down_write(&uobj->mutex); + + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + pr_debug("ib_uverbs_reg_mr: invalid PD\n"); + ret = -EINVAL; + goto err_free; + } + + if (cmd.exp_access_flags & IB_UVERBS_EXP_ACCESS_ON_DEMAND) { +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct ib_exp_device_attr exp_attr; + ret = ib_exp_query_device(pd->device, &exp_attr); + if (ret || !(exp_attr.device_cap_flags2 & + IB_EXP_DEVICE_ODP)) { + pr_debug("ib_uverbs_reg_mr: ODP requested on device without ODP support\n"); + ret = -EINVAL; + goto err_put; + } +#else + pr_debug("ib_uverbs_reg_mr: ODP requested but the RDMA subsystem was compiled without ODP support\n"); + ret = -EINVAL; + goto err_put; +#endif + } + + /* We first get a new "obj id" to be passed later to reg mr for + further use as mr_id. 
+ */ + ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); + if (ret) + goto err_put; + + access_flags = translate_exp_access_flags(cmd.exp_access_flags); + mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, + access_flags, uhw, uobj->id); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto err_remove_uobj; + } + + mr->device = pd->device; + mr->pd = pd; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + atomic_set(&mr->usecnt, 0); + + uobj->object = mr; + + memset(&resp, 0, sizeof(resp)); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + resp.mr_handle = uobj->id; + + ret = ib_copy_to_udata(ucore, &resp, sizeof(resp)); + if (ret) + goto err_copy; + + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->mr_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + + return 0; + +err_copy: + ib_dereg_mr(mr); + +err_remove_uobj: + idr_remove_uobj(&ib_uverbs_mr_idr, uobj); + +err_put: + put_pd_read(pd); + +err_free: + put_uobj_write(uobj); + return ret; +} + +ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_comp_channel cmd; + struct ib_uverbs_create_comp_channel_resp resp; + struct file *filp; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) + return ret; + resp.fd = ret; + + filp = ib_uverbs_alloc_event_file(file, 0); + if (IS_ERR(filp)) { + put_unused_fd(resp.fd); + return PTR_ERR(filp); + } + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + put_unused_fd(resp.fd); + fput(filp); + return -EFAULT; + } + + /* Taking ref count on uverbs_file to make sure that file won't be + * freed till that event file is closed. It will enable accessing the + * uverbs_device fields as part of closing the events file and making + * sure that uverbs device is available by that time as well. + * Note: similar is already done for the async event file. 
+ */ + kref_get(&file->ref); + fd_install(resp.fd, filp); + return in_len; +} + +static ssize_t create_cq(struct ib_uverbs_file *file, + int in_len, + int out_len, void *vcmd, int ex, + void __user *response, struct ib_udata *udata) +{ + struct ib_uverbs_create_cq *cmd; + struct ib_uverbs_exp_create_cq *cmd_e; + struct ib_uverbs_create_cq_resp resp; + struct ib_ucq_object *obj; + struct ib_uverbs_event_file *ev_file = NULL; + struct ib_cq *cq; + struct ib_cq_init_attr attr; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + cmd = vcmd; + cmd_e = vcmd; if (cmd->comp_vector >= file->device->num_comp_vectors) return -EINVAL; @@ -1316,10 +1681,10 @@ memset(&attr, 0, sizeof(attr)); attr.cqe = cmd->cqe; attr.comp_vector = cmd->comp_vector; - if (ex && (cmd_e->comp_mask & IB_UVERBS_CREATE_CQ_EX_CAP_FLAGS)) + if (ex && (cmd_e->comp_mask & IB_UVERBS_EXP_CREATE_CQ_CAP_FLAGS)) attr.flags = cmd_e->create_flags; cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr, - file->ucontext, &udata); + file->ucontext, udata); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto err_file; @@ -1372,17 +1737,23 @@ } ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { - struct ib_uverbs_create_cq cmd; + struct ib_uverbs_create_cq cmd; + struct ib_udata udata; + struct ib_uverbs_create_cq_resp resp; - if (copy_from_user(&cmd, buf, sizeof(cmd))) + if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - return create_cq(file, buf, in_len, out_len, &cmd, - IB_USER_VERBS_CMD_BASIC, - (void __user *) (unsigned long) cmd.response); + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + return create_cq(file, in_len, out_len, &cmd, + IB_USER_VERBS_CMD_BASIC, (void __user *)cmd.response, + &udata); } ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, @@ -1449,8 +1820,8 @@ } ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_poll_cq cmd; struct ib_uverbs_poll_cq_resp resp; @@ -1458,7 +1829,7 @@ u8 __user *data_ptr; struct ib_cq *cq; struct ib_wc wc; - int ret; + int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -1577,7 +1948,7 @@ const char __user *buf, int in_len, int out_len) { - void __user *response; + void __user *response; struct ib_udata udata; struct ib_uqp_object *obj; struct ib_device *device; @@ -1587,43 +1958,50 @@ struct ib_cq *scq = NULL, *rcq = NULL; struct ib_srq *srq = NULL; struct ib_qp *qp; - struct ib_qp_init_attr attr; + struct ib_qp_init_attr *attr; int ret; - union { - struct ib_uverbs_create_qp basic; - } cmd_obj; - struct ib_uverbs_create_qp *cmd; - size_t cmd_size = 0; - union { - struct ib_uverbs_create_qp_resp basic; - } resp_obj; - struct ib_uverbs_create_qp_resp *resp; - size_t resp_size = 0; - - cmd_size = sizeof(cmd_obj.basic); - cmd = &cmd_obj.basic; - - resp_size = sizeof(resp_obj.basic); - resp = &resp_obj.basic; - - if (out_len < resp_size) - return -ENOSPC; + struct ib_uverbs_create_qp *cmd; + size_t cmd_size; + struct ib_uverbs_create_qp_resp *resp; + size_t resp_size; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + resp = kzalloc(sizeof(*resp), GFP_KERNEL); + if (!cmd || !attr || !resp) { + ret = -ENOMEM; + goto err_free; + } - if (copy_from_user(&cmd_obj, buf, cmd_size)) - return 
-EFAULT; + cmd_size = sizeof(*cmd); + resp_size = sizeof(*resp); - response = (void __user *) (unsigned long) cmd->response; + if (out_len < resp_size) { + ret = -ENOSPC; + goto err_free; + } + + if (copy_from_user(cmd, buf, cmd_size)) { + ret = -EFAULT; + goto err_free; + } + + response = (void __user *)cmd->response; if (!disable_raw_qp_enforcement && - cmd->qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, PRIV_NET_RAW)) - return -EPERM; + cmd->qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, PRIV_NET_RAW)) { + ret = -EPERM; + goto err_free; + } INIT_UDATA(&udata, buf + cmd_size, response + resp_size, in_len - cmd_size, out_len - resp_size); obj = kzalloc(sizeof *obj, GFP_KERNEL); - if (!obj) - return -ENOMEM; + if (!obj) { + ret = -ENOMEM; + goto err_free; + } init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &qp_lock_class); down_write(&obj->uevent.uobject.mutex); @@ -1636,24 +2014,25 @@ } device = xrcd->device; } else { - if (cmd->qp_type == IB_QPT_XRC_INI) { + if (cmd->qp_type == IB_QPT_XRC_INI || + cmd->qp_type == IB_EXP_QPT_DC_INI) { cmd->max_recv_wr = 0; cmd->max_recv_sge = 0; } else { if (cmd->is_srq) { srq = idr_read_srq(cmd->srq_handle, file->ucontext); if (!srq || srq->srq_type != IB_SRQT_BASIC) { - ret = -EINVAL; + ret = -EINVAL; goto err_put; - } - } + } + } if (cmd->recv_cq_handle != cmd->send_cq_handle) { rcq = idr_read_cq(cmd->recv_cq_handle, file->ucontext, 0); if (!rcq) { - ret = -EINVAL; + ret = -EINVAL; goto err_put; - } + } } } @@ -1663,36 +2042,35 @@ if (!pd || !scq) { ret = -EINVAL; goto err_put; - } + } device = pd->device; - } + } - memset(&attr, 0, sizeof attr); - attr.event_handler = ib_uverbs_qp_event_handler; - attr.qp_context = file; - attr.send_cq = scq; - attr.recv_cq = rcq; - attr.srq = srq; - attr.xrcd = xrcd; - attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; - attr.qp_type = cmd->qp_type; - attr.create_flags = 0; - - attr.cap.max_send_wr = cmd->max_send_wr; - attr.cap.max_recv_wr = cmd->max_recv_wr; - attr.cap.max_send_sge = cmd->max_send_sge; - attr.cap.max_recv_sge = cmd->max_recv_sge; - attr.cap.max_inline_data = cmd->max_inline_data; + attr->event_handler = ib_uverbs_qp_event_handler; + attr->qp_context = file; + attr->send_cq = scq; + attr->recv_cq = rcq; + attr->srq = srq; + attr->xrcd = xrcd; + attr->sq_sig_type = cmd->sq_sig_all ? 
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + attr->qp_type = cmd->qp_type; + attr->create_flags = 0; + + attr->cap.max_send_wr = cmd->max_send_wr; + attr->cap.max_recv_wr = cmd->max_recv_wr; + attr->cap.max_send_sge = cmd->max_send_sge; + attr->cap.max_recv_sge = cmd->max_recv_sge; + attr->cap.max_inline_data = cmd->max_inline_data; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); if (cmd->qp_type == IB_QPT_XRC_TGT) - qp = ib_create_qp(pd, &attr); + qp = ib_create_qp(pd, attr); else - qp = device->create_qp(pd, &attr, &udata); + qp = device->create_qp(pd, attr, &udata); if (IS_ERR(qp)) { ret = PTR_ERR(qp); @@ -1703,19 +2081,19 @@ qp->real_qp = qp; qp->device = device; qp->pd = pd; - qp->send_cq = attr.send_cq; - qp->recv_cq = attr.recv_cq; - qp->srq = attr.srq; - qp->event_handler = attr.event_handler; - qp->qp_context = attr.qp_context; - qp->qp_type = attr.qp_type; + qp->send_cq = attr->send_cq; + qp->recv_cq = attr->recv_cq; + qp->srq = attr->srq; + qp->event_handler = attr->event_handler; + qp->qp_context = attr->qp_context; + qp->qp_type = attr->qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); - atomic_inc(&attr.send_cq->usecnt); - if (attr.recv_cq) - atomic_inc(&attr.recv_cq->usecnt); - if (attr.srq) - atomic_inc(&attr.srq->usecnt); + atomic_inc(&attr->send_cq->usecnt); + if (attr->recv_cq) + atomic_inc(&attr->recv_cq->usecnt); + if (attr->srq) + atomic_inc(&attr->srq->usecnt); } qp->uobject = &obj->uevent.uobject; @@ -1724,25 +2102,25 @@ if (ret) goto err_destroy; - memset(&resp_obj, 0, sizeof(resp_obj)); resp->qpn = qp->qp_num; resp->qp_handle = obj->uevent.uobject.id; - resp->max_recv_sge = attr.cap.max_recv_sge; - resp->max_send_sge = attr.cap.max_send_sge; - resp->max_recv_wr = attr.cap.max_recv_wr; - resp->max_send_wr = attr.cap.max_send_wr; - resp->max_inline_data = attr.cap.max_inline_data; + resp->max_recv_sge = attr->cap.max_recv_sge; + resp->max_send_sge = attr->cap.max_send_sge; + resp->max_recv_wr = attr->cap.max_recv_wr; + resp->max_send_wr = attr->cap.max_send_wr; + resp->max_inline_data = attr->cap.max_inline_data; - if (copy_to_user(response, &resp_obj, resp_size)) { - ret = -EFAULT; + if (copy_to_user(response, resp, resp_size)) { + ret = -EFAULT; goto err_copy; - } + } if (xrcd) { - obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); atomic_inc(&obj->uxrcd->refcnt); put_xrcd_read(xrcd_uobj); - } + } if (pd) put_pd_read(pd); @@ -1760,6 +2138,9 @@ obj->uevent.uobject.live = 1; up_write(&obj->uevent.uobject.mutex); + kfree(attr); + kfree(cmd); + kfree(resp); return in_len; @@ -1782,6 +2163,11 @@ put_srq_read(srq); put_uobj_write(&obj->uevent.uobject); + +err_free: + kfree(attr); + kfree(cmd); + kfree(resp); return ret; } @@ -1834,7 +2220,7 @@ if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; - } + } qp->uobject = &obj->uevent.uobject; @@ -1862,6 +2248,7 @@ mutex_unlock(&file->mutex); obj->uevent.uobject.live = 1; + up_write(&obj->uevent.uobject.mutex); return in_len; @@ -1879,8 +2266,8 @@ } ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_query_qp cmd; struct ib_uverbs_query_qp_resp resp; @@ -1992,45 +2379,136 @@ } } -static ssize_t __uverbs_modify_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len, - enum uverbs_cmd_type cmd_type) +static ssize_t 
__uverbs_modify_qp(struct ib_uverbs_file *file, int cmd_len, + enum uverbs_cmd_type cmd_type, + struct ib_uverbs_exp_modify_qp *cmd, + int hw_len, + struct ib_udata *udata) { - struct ib_uverbs_modify_qp_ex cmd; - struct ib_udata udata; struct ib_qp *qp; struct ib_qp_attr *attr; - struct ib_qp_attr_ex *attrx; - int ret; - void *p; - union ib_gid sgid; - union ib_gid *dgid; + int ret; u8 port_num; + u32 exp_mask = 0; - if (cmd_type == IB_USER_VERBS_CMD_BASIC) { - p = &cmd; - p += sizeof(cmd.comp_mask); - if (copy_from_user(p, buf, - sizeof(struct ib_uverbs_modify_qp))) - return -EFAULT; + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + qp = idr_read_qp(cmd->qp_handle, file->ucontext); + if (!qp) { + kfree(attr); + return -EINVAL; + } + + attr->qp_state = cmd->qp_state; + attr->cur_qp_state = cmd->cur_qp_state; + attr->path_mtu = cmd->path_mtu; + attr->path_mig_state = cmd->path_mig_state; + attr->qkey = cmd->qkey; + attr->rq_psn = cmd->rq_psn & 0xffffff; + attr->sq_psn = cmd->sq_psn & 0xffffff; + attr->dest_qp_num = cmd->dest_qp_num; + attr->qp_access_flags = cmd->qp_access_flags; + attr->pkey_index = cmd->pkey_index; + attr->alt_pkey_index = cmd->alt_pkey_index; + attr->en_sqd_async_notify = cmd->en_sqd_async_notify; + attr->max_rd_atomic = cmd->max_rd_atomic; + attr->max_dest_rd_atomic = cmd->max_dest_rd_atomic; + attr->min_rnr_timer = cmd->min_rnr_timer; + attr->port_num = cmd->port_num; + attr->timeout = cmd->timeout; + attr->retry_cnt = cmd->retry_cnt; + attr->rnr_retry = cmd->rnr_retry; + attr->alt_port_num = cmd->alt_port_num; + attr->alt_timeout = cmd->alt_timeout; + if (cmd->comp_mask & IB_UVERBS_EXP_QP_ATTR_FLOW_ENTROPY) { + if (offsetof(typeof(*cmd), flow_entropy) + sizeof(cmd->flow_entropy) <= cmd_len) { + attr->flow_entropy = cmd->flow_entropy; + } else { + ret = -EINVAL; + goto out; + } + } + + memcpy(attr->ah_attr.grh.dgid.raw, cmd->dest.dgid, 16); + attr->ah_attr.grh.flow_label = cmd->dest.flow_label; + attr->ah_attr.grh.sgid_index = cmd->dest.sgid_index; + attr->ah_attr.grh.hop_limit = cmd->dest.hop_limit; + attr->ah_attr.grh.traffic_class = cmd->dest.traffic_class; + attr->ah_attr.dlid = cmd->dest.dlid; + attr->ah_attr.sl = cmd->dest.sl; + attr->ah_attr.src_path_bits = cmd->dest.src_path_bits; + attr->ah_attr.static_rate = cmd->dest.static_rate; + attr->ah_attr.ah_flags = cmd->dest.is_global ? IB_AH_GRH : 0; + attr->ah_attr.port_num = cmd->dest.port_num; + + memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd->alt_dest.dgid, 16); + attr->alt_ah_attr.grh.flow_label = cmd->alt_dest.flow_label; + attr->alt_ah_attr.grh.sgid_index = cmd->alt_dest.sgid_index; + attr->alt_ah_attr.grh.hop_limit = cmd->alt_dest.hop_limit; + attr->alt_ah_attr.grh.traffic_class = cmd->alt_dest.traffic_class; + attr->alt_ah_attr.dlid = cmd->alt_dest.dlid; + attr->alt_ah_attr.sl = cmd->alt_dest.sl; + attr->alt_ah_attr.src_path_bits = cmd->alt_dest.src_path_bits; + attr->alt_ah_attr.static_rate = cmd->alt_dest.static_rate; + attr->alt_ah_attr.ah_flags = cmd->alt_dest.is_global ? IB_AH_GRH : 0; + attr->alt_ah_attr.port_num = cmd->alt_dest.port_num; + port_num = (cmd->attr_mask & IB_QP_PORT) ? 
cmd->port_num : qp->port_num; + + if (cmd_type == IB_USER_VERBS_CMD_EXP) { + exp_mask = cmd->exp_attr_mask & IBV_EXP_QP_ATTR_MASK; + attr->dct_key = cmd->dct_key; + } + + if (qp->real_qp == qp) { + ret = ib_resolve_eth_dmac(qp, attr, &cmd->attr_mask); + if (ret) + goto out; + ret = qp->device->modify_qp(qp, attr, + modify_qp_mask(qp->qp_type, cmd->attr_mask | exp_mask), udata); + if (!ret && (cmd->attr_mask & IB_QP_PORT)) + qp->port_num = attr->port_num; } else { - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; + ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd->attr_mask | exp_mask)); } + if (ret) + goto out; + + ret = cmd_len + hw_len; + +out: + put_qp_read(qp); + kfree(attr); + + return ret; +} + +ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_modify_qp cmd; + struct ib_udata udata; + struct ib_qp *qp; + struct ib_qp_attr *attr; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, out_len); - attrx = kzalloc(sizeof(*attrx), GFP_KERNEL); - if (!attrx) + attr = kmalloc(sizeof *attr, GFP_KERNEL); + if (!attr) return -ENOMEM; - attr = (struct ib_qp_attr *)attrx; qp = idr_read_qp(cmd.qp_handle, file->ucontext); if (!qp) { - kfree(attrx); - return -EINVAL; + ret = -EINVAL; + goto out; } attr->qp_state = cmd.qp_state; @@ -2078,77 +2556,38 @@ attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate; attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0; attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; - port_num = (cmd.attr_mask & IB_QP_PORT) ? cmd.port_num : qp->port_num; - if ((cmd.attr_mask & IB_QP_AV) && port_num && - (rdma_port_get_link_layer(qp->device, port_num) == - IB_LINK_LAYER_ETHERNET)) { - ret = ib_query_gid(qp->device, port_num, - attr->ah_attr.grh.sgid_index, &sgid); - if (ret) - goto out; - dgid = &attr->ah_attr.grh.dgid; - if (rdma_link_local_addr((struct in6_addr *)dgid->raw)) { - rdma_get_ll_mac((struct in6_addr *)dgid->raw, - attr->ah_attr.dmac); - rdma_get_ll_mac((struct in6_addr *)sgid.raw, - attr->smac); - attr->vlan_id = rdma_get_vlan_id(&sgid); - } else { - ret = rdma_addr_find_dmac_by_grh(&sgid, dgid, - attr->ah_attr.dmac, - &attr->vlan_id, -1U); - if (ret) - goto out; - ret = rdma_addr_find_smac_by_sgid(&sgid, attr->smac, - NULL, -1U); - if (ret) - goto out; - } - cmd.attr_mask |= IB_QP_SMAC; - if (attr->vlan_id < 0xFFFF) - cmd.attr_mask |= IB_QP_VID; - } - if (cmd_type == IB_USER_VERBS_CMD_EXTENDED) { - if (cmd.comp_mask & IB_UVERBS_QP_ATTR_DCT_KEY) - attrx->dct_key = cmd.dct_key; - } if (qp->real_qp == qp) { + ret = ib_resolve_eth_dmac(qp, attr, &cmd.attr_mask); + if (ret) + goto release_qp; ret = qp->device->modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata); - if (!ret && (cmd.attr_mask & IB_QP_PORT)) - qp->port_num = attr->port_num; } else { ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask)); } if (ret) - goto out; + goto release_qp; ret = in_len; -out: +release_qp: put_qp_read(qp); - kfree(attrx); + +out: + kfree(attr); return ret; } -ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) -{ - return __uverbs_modify_qp(file, buf, in_len, out_len, - IB_USER_VERBS_CMD_BASIC); -} - ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_qp cmd; struct ib_uverbs_destroy_qp_resp 
resp; - struct ib_uobject *uobj; + struct ib_uobject *uobj; struct ib_qp *qp; struct ib_uqp_object *obj; int ret = -EINVAL; @@ -2201,14 +2640,14 @@ } ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_post_send cmd; struct ib_uverbs_post_send_resp resp; struct ib_uverbs_send_wr *user_wr; struct ib_send_wr *wr = NULL, *last, *next, *bad_wr; - struct ib_qp *qp; + struct ib_qp *qp; int i, sg_ind; int is_ud; ssize_t ret = -EINVAL; @@ -2251,13 +2690,13 @@ user_wr->num_sge * sizeof (struct ib_sge), GFP_KERNEL); if (!next) { - ret = -ENOMEM; - goto out_put; - } + ret = -ENOMEM; + goto out_put; + } if (!last) wr = next; - else + else last->next = next; last = next; @@ -2272,10 +2711,13 @@ file->ucontext); if (!next->wr.ud.ah) { ret = -EINVAL; - goto out_put; + goto out_put; } next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; + if (next->opcode == IB_WR_SEND_WITH_IMM) + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; } else { switch (next->opcode) { case IB_WR_RDMA_WRITE_WITH_IMM: @@ -2332,7 +2774,7 @@ for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) - break; + break; } if (copy_to_user((void __user *) (unsigned long) cmd.response, @@ -2366,7 +2808,7 @@ struct ib_recv_wr *wr = NULL, *last, *next; int sg_ind; int i; - int ret; + int ret; if (in_len < wqe_size * wr_count + sge_count * sizeof (struct ib_uverbs_sge)) @@ -2389,9 +2831,9 @@ } if (user_wr->num_sge + sg_ind > sge_count) { - ret = -EINVAL; - goto err; - } + ret = -EINVAL; + goto err; + } next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + user_wr->num_sge * sizeof (struct ib_sge), @@ -2399,7 +2841,7 @@ if (!next) { ret = -ENOMEM; goto err; - } + } if (!last) wr = next; @@ -2540,8 +2982,8 @@ } ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_create_ah cmd; struct ib_uverbs_create_ah_resp resp; @@ -2564,7 +3006,7 @@ init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class); down_write(&uobj->mutex); - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + pd = idr_read_pd(cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err; @@ -2580,6 +3022,7 @@ attr.grh.sgid_index = cmd.attr.grh.sgid_index; attr.grh.hop_limit = cmd.attr.grh.hop_limit; attr.grh.traffic_class = cmd.attr.grh.traffic_class; + memset(&attr.dmac, 0, sizeof(attr.dmac)); memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); ah = ib_create_ah(pd, &attr); @@ -2635,7 +3078,7 @@ struct ib_uverbs_destroy_ah cmd; struct ib_ah *ah; struct ib_uobject *uobj; - int ret; + int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -2713,14 +3156,14 @@ } ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_detach_mcast cmd; struct ib_uqp_object *obj; struct ib_qp *qp; struct ib_uverbs_mcast_entry *mcast; - int ret = -EINVAL; + int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -2789,7 +3232,7 @@ if (!pd) { ret = -EINVAL; goto err_put_cq; - } + } attr.event_handler = ib_uverbs_srq_event_handler; attr.srq_context = file; @@ -2883,13 +3326,13 @@ } ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int 
out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_create_srq cmd; struct ib_uverbs_create_xsrq xcmd; struct ib_uverbs_create_srq_resp resp; - struct ib_udata udata; + struct ib_udata udata; int ret; if (out_len < sizeof resp) @@ -2997,7 +3440,7 @@ put_srq_read(srq); if (ret) - return ret; + return ret; memset(&resp, 0, sizeof resp); @@ -3013,8 +3456,8 @@ } ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) + const char __user *buf, int in_len, + int out_len) { struct ib_uverbs_destroy_srq cmd; struct ib_uverbs_destroy_srq_resp resp; @@ -3069,19 +3512,60 @@ return ret ? ret : in_len; } -ssize_t ib_uverbs_exp_create_dct(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) +int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_query_device_resp resp; + struct ib_uverbs_ex_query_device cmd; + struct ib_device_attr attr; + struct ib_device *device; + int err; + + device = file->device->ib_dev; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + resp.response_length = sizeof(resp); + + if (ucore->outlen < resp.response_length) + return -ENOSPC; + + err = device->query_device(device, &attr); + if (err) + return err; + + copy_query_dev_fields(file, &resp.base, &attr); + resp.comp_mask = 0; + + err = ib_copy_to_udata(ucore, &resp, resp.response_length); + if (err) + return err; + + return 0; +} + +int ib_uverbs_exp_create_dct(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) { - int in_len = ucore->inlen + uhw->inlen; int out_len = ucore->outlen + uhw->outlen; - struct ib_uverbs_create_dct cmd; + struct ib_uverbs_create_dct *cmd; struct ib_uverbs_create_dct_resp resp; - struct ib_udata udata; struct ib_udct_object *obj; struct ib_dct *dct; int ret; - struct ib_dct_init_attr attr; + struct ib_dct_init_attr *attr; struct ib_pd *pd = NULL; struct ib_cq *cq = NULL; struct ib_srq *srq = NULL; @@ -3089,66 +3573,82 @@ if (out_len < sizeof(resp)) return -ENOSPC; - ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); + cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr || !cmd) { + ret = -ENOMEM; + goto err_cmd_attr; + } + + ret = ucore->ops->copy_from(cmd, ucore, sizeof(*cmd)); if (ret) - return ret; + goto err_cmd_attr; obj = kmalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return -ENOMEM; + if (!obj) { + ret = -ENOMEM; + goto err_cmd_attr; + } - init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, + init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &dct_lock_class); - down_write(&obj->uobject.mutex); + down_write(&obj->uevent.uobject.mutex); - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + pd = idr_read_pd(cmd->pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_pd; } - cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0); if (!cq) { ret = -EINVAL; goto err_put; } - srq = idr_read_srq(cmd.srq_handle, file->ucontext); + srq = idr_read_srq(cmd->srq_handle, file->ucontext); if (!srq) { ret = -EINVAL; goto err_put; } - attr.cq = cq; - attr.access_flags = cmd.access_flags; - attr.min_rnr_timer = cmd.min_rnr_timer; - attr.srq = srq; - attr.tclass = cmd.tclass; - attr.flow_label = 
cmd.flow_label; - attr.dc_key = cmd.dc_key; - attr.mtu = cmd.mtu; - attr.port = cmd.port; - attr.pkey_index = cmd.pkey_index; - attr.gid_index = cmd.gid_index; - attr.hop_limit = cmd.hop_limit; - attr.create_flags = cmd.create_flags; - - dct = ib_create_dct(pd, &attr, &udata); + if (cmd->create_flags & ~IB_DCT_CREATE_FLAGS_MASK) { + ret = -EINVAL; + goto err_put; + } + + attr->cq = cq; + attr->access_flags = cmd->access_flags; + attr->min_rnr_timer = cmd->min_rnr_timer; + attr->srq = srq; + attr->tclass = cmd->tclass; + attr->flow_label = cmd->flow_label; + attr->dc_key = cmd->dc_key; + attr->mtu = cmd->mtu; + attr->port = cmd->port; + attr->pkey_index = cmd->pkey_index; + attr->gid_index = cmd->gid_index; + attr->hop_limit = cmd->hop_limit; + attr->create_flags = cmd->create_flags; + + obj->uevent.events_reported = 0; + INIT_LIST_HEAD(&obj->uevent.event_list); + dct = ib_create_dct(pd, attr, uhw); if (IS_ERR(dct)) { ret = PTR_ERR(dct); goto err_put; } dct->device = file->device->ib_dev; - dct->uobject = &obj->uobject; + dct->uobject = &obj->uevent.uobject; - obj->uobject.object = dct; - ret = idr_add_uobj(&ib_uverbs_dct_idr, &obj->uobject); + obj->uevent.uobject.object = dct; + ret = idr_add_uobj(&ib_uverbs_dct_idr, &obj->uevent.uobject); if (ret) goto err_dct; memset(&resp, 0, sizeof(resp)); - resp.dct_handle = obj->uobject.id; + resp.dct_handle = obj->uevent.uobject.id; resp.dctn = dct->dct_num; ret = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); @@ -3156,21 +3656,23 @@ goto err_copy; mutex_lock(&file->mutex); - list_add_tail(&obj->uobject.list, &file->ucontext->dct_list); + list_add_tail(&obj->uevent.uobject.list, &file->ucontext->dct_list); mutex_unlock(&file->mutex); - obj->uobject.live = 1; + obj->uevent.uobject.live = 1; put_srq_read(srq); put_cq_read(cq); put_pd_read(pd); - up_write(&obj->uobject.mutex); + up_write(&obj->uevent.uobject.mutex); + kfree(attr); + kfree(cmd); - return in_len; + return 0; err_copy: - idr_remove_uobj(&ib_uverbs_dct_idr, &obj->uobject); + idr_remove_uobj(&ib_uverbs_dct_idr, &obj->uevent.uobject); err_dct: ib_destroy_dct(dct); @@ -3185,21 +3687,24 @@ put_pd_read(pd); err_pd: - put_uobj_write(&obj->uobject); + put_uobj_write(&obj->uevent.uobject); + +err_cmd_attr: + kfree(attr); + kfree(cmd); return ret; } -ssize_t ib_uverbs_exp_destroy_dct(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) +int ib_uverbs_exp_destroy_dct(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) { - int in_len = ucore->inlen + uhw->inlen; int out_len = ucore->outlen + uhw->outlen; struct ib_uverbs_destroy_dct cmd; struct ib_uverbs_destroy_dct_resp resp; - struct ib_uobject *uobj; - struct ib_dct *dct; + struct ib_uobject *uobj; struct ib_udct_object *obj; + struct ib_dct *dct; int ret; if (out_len < sizeof(resp)) @@ -3209,12 +3714,12 @@ if (ret) return ret; - uobj = idr_write_uobj(&ib_uverbs_dct_idr, cmd.user_handle, file->ucontext); + uobj = idr_write_uobj(&ib_uverbs_dct_idr, cmd.dct_handle, file->ucontext); if (!uobj) return -EINVAL; dct = uobj->object; - obj = container_of(dct->uobject, struct ib_udct_object, uobject); + obj = container_of(uobj, struct ib_udct_object, uevent.uobject); ret = ib_destroy_dct(dct); if (!ret) @@ -3232,6 +3737,7 @@ mutex_unlock(&file->mutex); memset(&resp, 0, sizeof(resp)); + resp.events_reported = obj->uevent.events_reported; put_uobj(uobj); @@ -3239,14 +3745,45 @@ if (ret) return ret; - return in_len; + return 0; +} + +int ib_uverbs_exp_arm_dct(struct ib_uverbs_file *file, + struct ib_udata 
*ucore, + struct ib_udata *uhw) +{ + int out_len = ucore->outlen + uhw->outlen; + struct ib_uverbs_arm_dct cmd; + struct ib_uverbs_arm_dct_resp resp; + struct ib_dct *dct; + int err; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + err = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + dct = idr_read_dct(cmd.dct_handle, file->ucontext); + if (!dct) + return -EINVAL; + + err = dct->device->exp_arm_dct(dct, uhw); + put_dct_read(dct); + if (err) + return err; + + memset(&resp, 0, sizeof(resp)); + err = ucore->ops->copy_to(ucore, &resp, sizeof(resp)); + + return err; } -ssize_t ib_uverbs_exp_query_dct(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) +int ib_uverbs_exp_query_dct(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) { - int in_len = ucore->inlen + uhw->inlen; int out_len = ucore->outlen + uhw->outlen; struct ib_uverbs_query_dct cmd; struct ib_uverbs_query_dct_resp resp; @@ -3287,7 +3824,7 @@ resp.flow_label = attr->flow_label; resp.key_violations = attr->key_violations; resp.port = attr->port; - resp.min_rnr_timer = attr->min_rnr_timer; + resp.min_rnr_timer = attr->min_rnr_timer; resp.tclass = attr->tclass; resp.mtu = attr->mtu; resp.pkey_index = attr->pkey_index; @@ -3300,23 +3837,29 @@ out: kfree(attr); - return err ? err : in_len; + return err; } - /* * Experimental functions */ -static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; - -static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, - union ib_flow_spec *ib_spec) +static int kern_spec_to_ib_spec(struct ib_uverbs_exp_flow_spec *kern_spec, + union ib_flow_spec *ib_spec, + int is_exp) { + if (kern_spec->reserved) + return -EINVAL; + + if (!is_exp && kern_spec->type == IB_FLOW_SPEC_IPV6) + return -EINVAL; + ib_spec->type = kern_spec->type; switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: ib_spec->eth.size = sizeof(struct ib_flow_spec_eth); + if (ib_spec->eth.size != kern_spec->eth.size) + return -EINVAL; memcpy(&ib_spec->eth.val, &kern_spec->eth.val, sizeof(struct ib_flow_eth_filter)); memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask, @@ -3331,14 +3874,27 @@ break; case IB_FLOW_SPEC_IPV4: ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4); + if (ib_spec->ipv4.size != kern_spec->ipv4.size) + return -EINVAL; memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val, sizeof(struct ib_flow_ipv4_filter)); memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask, sizeof(struct ib_flow_ipv4_filter)); break; + case IB_FLOW_SPEC_IPV6: + ib_spec->ipv6.size = sizeof(struct ib_flow_spec_ipv6); + if (ib_spec->ipv6.size != kern_spec->ipv6.size) + return -EINVAL; + memcpy(&ib_spec->ipv6.val, &kern_spec->ipv6.val, + sizeof(struct ib_flow_ipv6_filter)); + memcpy(&ib_spec->ipv6.mask, &kern_spec->ipv6.mask, + sizeof(struct ib_flow_ipv6_filter)); + break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp); + if (ib_spec->tcp_udp.size != kern_spec->tcp_udp.size) + return -EINVAL; memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val, sizeof(struct ib_flow_tcp_udp_filter)); memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask, @@ -3350,21 +3906,22 @@ return 0; } -int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) +int ib_uverbs_exp_create_wq(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) { - struct ib_uverbs_create_flow cmd; - struct ib_uverbs_create_flow_resp resp; - struct ib_uobject *uobj; 
- struct ib_flow *flow_id; - struct ib_uverbs_flow_attr *kern_flow_attr; - struct ib_flow_attr *flow_attr; - struct ib_qp *qp; + struct ib_uverbs_exp_create_wq cmd; + struct ib_uverbs_exp_create_wq_resp resp; + struct ib_uobject *uobj; int err = 0; - void *kern_spec; - void *ib_spec; - int i; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_wq *wq; + struct ib_srq *srq = NULL; + struct ib_wq_init_attr wq_init_attr; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; if (ucore->outlen < sizeof(resp)) return -ENOSPC; @@ -3373,114 +3930,529 @@ if (err) return err; - ucore->inbuf += sizeof(cmd); - ucore->inlen -= sizeof(cmd); - - if (cmd.comp_mask) - return -EINVAL; - - if (priv_check(curthread, PRIV_NET_RAW) && !disable_raw_qp_enforcement) - return -EPERM; - - if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) + if (cmd.reserved) return -EINVAL; - if (cmd.flow_attr.size > ucore->inlen || - cmd.flow_attr.size > - (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) + if (cmd.comp_mask) return -EINVAL; - if (cmd.flow_attr.num_of_specs) { - kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + - cmd.flow_attr.size, GFP_KERNEL); - if (!kern_flow_attr) + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) return -ENOMEM; - memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); - err = ib_copy_from_udata(kern_flow_attr + 1, ucore, - cmd.flow_attr.size); - if (err) - goto err_free_attr; - } else { - kern_flow_attr = &cmd.flow_attr; - } - - uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); - if (!uobj) { - err = -ENOMEM; - goto err_free_attr; - } - init_uobj(uobj, 0, file->ucontext, &rule_lock_class); + init_uobj(uobj, cmd.user_handle, file->ucontext, &wq_lock_class); down_write(&uobj->mutex); - - qp = idr_read_qp(cmd.qp_handle, file->ucontext); - if (!qp) { + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { err = -EINVAL; goto err_uobj; } - flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, - GFP_KERNEL); - if (!flow_attr) { - err = -ENOMEM; - goto err_put; + cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + if (!cq) { + err = -EINVAL; + goto err_put_pd; } - flow_attr->type = kern_flow_attr->type; - flow_attr->priority = kern_flow_attr->priority; - flow_attr->num_of_specs = kern_flow_attr->num_of_specs; - flow_attr->port = kern_flow_attr->port; - flow_attr->flags = kern_flow_attr->flags; - flow_attr->size = sizeof(*flow_attr); - - kern_spec = kern_flow_attr + 1; - ib_spec = flow_attr + 1; - for (i = 0; i < flow_attr->num_of_specs && - cmd.flow_attr.size > - offsetof(struct ib_uverbs_flow_spec, reserved) && - cmd.flow_attr.size >= - ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { - err = kern_spec_to_ib_spec(kern_spec, ib_spec); - if (err) - goto err_free; - flow_attr->size += - ((union ib_flow_spec *)ib_spec)->size; - cmd.flow_attr.size -= - ((struct ib_uverbs_flow_spec *)kern_spec)->size; - kern_spec += ((struct ib_uverbs_flow_spec *)kern_spec)->size; - ib_spec += ((union ib_flow_spec *)ib_spec)->size; - } - if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { - pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", - i, cmd.flow_attr.size); - goto err_free; + if (cmd.wq_type == IB_WQT_SRQ) { + cmd.max_recv_wr = 0; + cmd.max_recv_sge = 0; + srq = idr_read_srq(cmd.srq_handle, file->ucontext); + if (!srq || srq->srq_type != IB_SRQT_BASIC) { + err = -EINVAL; + goto err_put_srq; } - flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); - if (IS_ERR(flow_id)) { - err = PTR_ERR(flow_id); - goto err_free; } 
- flow_id->qp = qp; - flow_id->uobject = uobj; - uobj->object = flow_id; - err = idr_add_uobj(&ib_uverbs_rule_idr, uobj); + memset(&wq_init_attr, 0, sizeof(wq_init_attr)); + wq_init_attr.cq = cq; + wq_init_attr.max_recv_sge = cmd.max_recv_sge; + wq_init_attr.max_recv_wr = cmd.max_recv_wr; + wq_init_attr.srq = srq; + wq_init_attr.wq_context = file; + wq_init_attr.wq_type = cmd.wq_type; + wq = pd->device->create_wq(pd, &wq_init_attr, uhw); + + if (IS_ERR(wq)) { + err = PTR_ERR(wq); + goto err_put_srq; + } + + wq->uobject = uobj; + uobj->object = wq; + wq->wq_type = wq_init_attr.wq_type; + wq->cq = cq; + wq->pd = pd; + wq->srq = srq; + wq->device = pd->device; + wq->wq_context = wq_init_attr.wq_context; + atomic_set(&wq->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&cq->usecnt); + if (srq) + atomic_inc(&srq->usecnt); + + err = idr_add_uobj(&ib_uverbs_wq_idr, uobj); if (err) - goto destroy_flow; + goto destroy_wq; memset(&resp, 0, sizeof(resp)); - resp.flow_handle = uobj->id; - + resp.wq_handle = uobj->id; + resp.max_recv_sge = wq_init_attr.max_recv_sge; + resp.max_recv_wr = wq_init_attr.max_recv_wr; + resp.wqn = wq->wq_num; + resp.response_length = offsetof(typeof(resp), wqn) + sizeof(resp.wqn); err = ib_copy_to_udata(ucore, &resp, sizeof(resp)); if (err) goto err_copy; - put_qp_read(qp); - mutex_lock(&file->mutex); - list_add_tail(&uobj->list, &file->ucontext->rule_list); - mutex_unlock(&file->mutex); - + put_pd_read(pd); + put_cq_read(cq); + if (srq) + put_srq_read(srq); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->wq_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + + return 0; + +err_copy: + idr_remove_uobj(&ib_uverbs_wq_idr, uobj); +destroy_wq: + ib_destroy_wq(wq); +err_put_srq: + if (srq) + put_srq_read(srq); + put_cq_read(cq); +err_put_pd: + put_pd_read(pd); +err_uobj: + put_uobj_write(uobj); + + return err; +} + +int ib_uverbs_exp_destroy_wq(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_exp_destroy_wq cmd; + struct ib_wq *wq; + struct ib_uobject *uobj; + int ret; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EINVAL; + + uobj = idr_write_uobj(&ib_uverbs_wq_idr, cmd.wq_handle, + file->ucontext); + if (!uobj) + return -EINVAL; + + wq = uobj->object; + ret = ib_destroy_wq(wq); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_wq_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + return ret; +} + +int ib_uverbs_exp_modify_wq(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_exp_modify_wq cmd; + struct ib_wq *wq; + int ret; + struct ib_wq_attr wq_attr; + enum ib_wq_attr_mask attr_mask; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; + + if (!cmd.comp_mask) + return -EINVAL; + + attr_mask = cmd.comp_mask; + wq = idr_read_wq(cmd.wq_handle, file->ucontext); + if (!wq) + return -EINVAL; + + memset(&wq_attr, 0, sizeof(wq_attr)); + wq_attr.curr_wq_state = cmd.curr_wq_state; + wq_attr.wq_state = cmd.wq_state; + ret = wq->device->modify_wq(wq, &wq_attr, attr_mask, uhw); + put_wq_read(wq); + return ret; +} + +int ib_uverbs_exp_create_rwq_ind_table(struct ib_uverbs_file *file, + struct 
ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_exp_create_rwq_ind_table cmd; + struct ib_uverbs_exp_create_rwq_ind_table_resp resp; + struct ib_uobject *uobj; + int err = 0; + struct ib_pd *pd; + struct ib_rwq_ind_table_init_attr init_attr; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_wq **wqs = NULL; + u32 *wqs_handles = NULL; + struct ib_wq *wq = NULL; + int i, j, num_read_wqs; + u32 num_wq_handles; + u32 expected_in_size; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + if (ucore->outlen < sizeof(resp)) + return -ENOSPC; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + ucore->inbuf += sizeof(cmd); + ucore->inlen -= sizeof(cmd); + + if (cmd.comp_mask) + return -EINVAL; + + if (cmd.reserved) + return -EINVAL; + + num_wq_handles = 1 << cmd.log_ind_tbl_size; + expected_in_size = num_wq_handles * sizeof(__u32); + if (num_wq_handles == 1) + /* input size for wq handles is u64 aligned */ + expected_in_size += sizeof(__u32); + + if (ucore->inlen != expected_in_size) + return -EINVAL; + + wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles), + GFP_KERNEL); + if (!wqs_handles) + return -ENOMEM; + + err = ib_copy_from_udata(wqs_handles, ucore, + num_wq_handles * sizeof(__u32)); + if (err) + goto err_free; + + wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL); + if (!wqs) + goto err_free; + + for (num_read_wqs = 0; num_read_wqs < num_wq_handles; + num_read_wqs++) { + wq = idr_read_wq(wqs_handles[num_read_wqs], file->ucontext); + if (!wq) + goto put_wqs; + + wqs[num_read_wqs] = wq; + } + + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) { + err = -ENOMEM; + goto put_wqs; + } + + init_uobj(uobj, 0, file->ucontext, &rwq_ind_table_lock_class); + down_write(&uobj->mutex); + + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + err = -EINVAL; + goto err_uobj; + } + + memset(&init_attr, 0, sizeof(init_attr)); + init_attr.pd = pd; + init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; + init_attr.ind_tbl = wqs; + rwq_ind_tbl = pd->device->create_rwq_ind_table(pd->device, &init_attr, uhw); + + if (IS_ERR(rwq_ind_tbl)) { + err = PTR_ERR(rwq_ind_tbl); + goto put_pd; + } + + rwq_ind_tbl->ind_tbl = wqs; + rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size; + rwq_ind_tbl->uobject = uobj; + uobj->object = rwq_ind_tbl; + rwq_ind_tbl->device = pd->device; + rwq_ind_tbl->pd = pd; + atomic_set(&rwq_ind_tbl->usecnt, 0); + + for (i = 0; i < num_wq_handles; i++) + atomic_inc(&wqs[i]->usecnt); + + err = idr_add_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + if (err) + goto destroy_ind_tbl; + + memset(&resp, 0, sizeof(resp)); + resp.ind_tbl_handle = uobj->id; + resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num; + resp.response_length = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num); + + err = ib_copy_to_udata(ucore, + &resp, sizeof(resp)); + if (err) + goto err_copy; + + kfree(wqs_handles); + put_pd_read(pd); + + for (j = 0; j < num_read_wqs; j++) + put_wq_read(wqs[j]); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->rwq_ind_tbl_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + return 0; + +err_copy: + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); +destroy_ind_tbl: + ib_destroy_rwq_ind_table(rwq_ind_tbl); +put_pd: + put_pd_read(pd); +err_uobj: + put_uobj_write(uobj); +put_wqs: + for (j = 0; j < num_read_wqs; j++) + put_wq_read(wqs[j]); +err_free: + kfree(wqs_handles); + kfree(wqs); + return err; +} + +int 
ib_uverbs_exp_destroy_rwq_ind_table(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_exp_destroy_rwq_ind_table cmd; + struct ib_rwq_ind_table *rwq_ind_tbl; + struct ib_uobject *uobj; + int ret; + struct ib_wq **ind_tbl; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask) + return -EINVAL; + + uobj = idr_write_uobj(&ib_uverbs_rwq_ind_tbl_idr, cmd.ind_tbl_handle, + file->ucontext); + if (!uobj) + return -EINVAL; + rwq_ind_tbl = uobj->object; + ind_tbl = rwq_ind_tbl->ind_tbl; + + ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + kfree(ind_tbl); + return ret; +} + +static int common_create_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw, + bool is_exp) +{ + struct ib_uverbs_create_flow cmd; + struct ib_uverbs_create_flow_resp resp; + struct ib_uobject *uobj; + struct ib_flow *flow_id; + struct ib_uverbs_flow_attr *kern_flow_attr; + struct ib_flow_attr *flow_attr; + struct ib_qp *qp; + int err = 0; + void *kern_spec; + void *ib_spec; + int i; + unsigned long spec_size; + + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + if (ucore->outlen < sizeof(resp)) + return -ENOSPC; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + ucore->inbuf += sizeof(cmd); + ucore->inlen -= sizeof(cmd); + + if (cmd.comp_mask) + return -EINVAL; + + if (priv_check(curthread, PRIV_NET_RAW) && !disable_raw_qp_enforcement) + return -EPERM; + + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) + return -EINVAL; + + spec_size = (is_exp) ? 
sizeof(struct ib_uverbs_exp_flow_spec) : + sizeof(struct ib_uverbs_flow_spec); + if (cmd.flow_attr.size > ucore->inlen || + cmd.flow_attr.size > + (cmd.flow_attr.num_of_specs * spec_size)) + return -EINVAL; + + if (cmd.flow_attr.reserved[0] || + cmd.flow_attr.reserved[1]) + return -EINVAL; + + if (cmd.flow_attr.num_of_specs) { + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); + if (!kern_flow_attr) + return -ENOMEM; + + memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); + err = ib_copy_from_udata(kern_flow_attr + 1, ucore, + cmd.flow_attr.size); + if (err) + goto err_free_attr; + } else { + kern_flow_attr = &cmd.flow_attr; + } + + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) { + err = -ENOMEM; + goto err_free_attr; + } + init_uobj(uobj, 0, file->ucontext, &rule_lock_class); + down_write(&uobj->mutex); + + qp = idr_read_qp(cmd.qp_handle, file->ucontext); + if (!qp) { + err = -EINVAL; + goto err_uobj; + } + + flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL); + if (!flow_attr) { + err = -ENOMEM; + goto err_put; + } + + flow_attr->type = kern_flow_attr->type; + flow_attr->priority = kern_flow_attr->priority; + flow_attr->num_of_specs = kern_flow_attr->num_of_specs; + flow_attr->port = kern_flow_attr->port; + flow_attr->flags = kern_flow_attr->flags; + flow_attr->size = sizeof(*flow_attr); + + kern_spec = kern_flow_attr + 1; + ib_spec = flow_attr + 1; + for (i = 0; i < flow_attr->num_of_specs && + cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec_hdr, reserved) && + cmd.flow_attr.size >= + ((struct ib_uverbs_flow_spec_hdr *)kern_spec)->size; i++) { + err = kern_spec_to_ib_spec(kern_spec, ib_spec, is_exp); + if (err) + goto err_free; + flow_attr->size += + ((union ib_flow_spec *) ib_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec_hdr *)kern_spec)->size; + kern_spec += ((struct ib_uverbs_flow_spec_hdr *)kern_spec)->size; + ib_spec += ((union ib_flow_spec *) ib_spec)->size; + } + if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { + pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", + i, cmd.flow_attr.size); + err = -EINVAL; + goto err_free; + } + flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); + if (IS_ERR(flow_id)) { + err = PTR_ERR(flow_id); + goto err_free; + } + flow_id->qp = qp; + flow_id->uobject = uobj; + uobj->object = flow_id; + + err = idr_add_uobj(&ib_uverbs_rule_idr, uobj); + if (err) + goto destroy_flow; + + memset(&resp, 0, sizeof(resp)); + resp.flow_handle = uobj->id; + + err = ib_copy_to_udata(ucore, + &resp, sizeof(resp)); + if (err) + goto err_copy; + + put_qp_read(qp); + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->rule_list); + mutex_unlock(&file->mutex); + uobj->live = 1; up_write(&uobj->mutex); @@ -3504,6 +4476,20 @@ return err; } +int ib_uverbs_exp_create_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + return common_create_flow(file, ucore, uhw, true); +} + +int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + return common_create_flow(file, ucore, uhw, false); +} + int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) @@ -3513,10 +4499,16 @@ struct ib_uobject *uobj; int ret; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); if (ret) return ret; + if (cmd.comp_mask) + return -EINVAL; + uobj = 
idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, file->ucontext); if (!uobj) @@ -3540,45 +4532,70 @@ return ret; } -ssize_t ib_uverbs_exp_modify_qp(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) +int ib_uverbs_exp_modify_qp(struct ib_uverbs_file *file, + struct ib_udata *ucore, struct ib_udata *uhw) { - const char __user *buf = ucore->inbuf; - int in_len = ucore->inlen + uhw->inlen; - int out_len = ucore->outlen + uhw->outlen; + struct ib_uverbs_exp_modify_qp cmd; + int ret; + + if (ucore->inlen < offsetof(typeof(cmd), comp_mask) + sizeof(cmd.comp_mask)) + return -EINVAL; + + ret = ucore->ops->copy_from(&cmd, ucore, min(sizeof(cmd), ucore->inlen)); + + if (ret) + return ret; + + if (cmd.comp_mask >= IB_UVERBS_EXP_QP_ATTR_RESERVED) + return -ENOSYS; + + ret = __uverbs_modify_qp(file, ucore->inlen, + IB_USER_VERBS_CMD_EXP, &cmd, uhw->inlen, uhw); + if (ret < 0) + return ret; - return __uverbs_modify_qp(file, buf, in_len, out_len, - IB_USER_VERBS_CMD_EXTENDED); + return 0; } -ssize_t ib_uverbs_exp_create_cq(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) +int ib_uverbs_exp_create_cq(struct ib_uverbs_file *file, + struct ib_udata *ucore, struct ib_udata *uhw) { - const char __user *buf = ucore->inbuf; int in_len = ucore->inlen + uhw->inlen; int out_len = ucore->outlen + uhw->outlen; - struct ib_uverbs_create_cq_ex cmd; + struct ib_uverbs_exp_create_cq cmd; + int ret; - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; + ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; - return create_cq(file, buf, in_len, out_len, &cmd, - IB_USER_VERBS_CMD_EXTENDED, ucore->outbuf); + if (cmd.comp_mask >= IB_UVERBS_EXP_CREATE_CQ_ATTR_RESERVED) + return -ENOSYS; + + ret = create_cq(file, in_len, out_len, &cmd, + IB_USER_VERBS_CMD_EXP, ucore->outbuf, uhw); + if (ret < 0) + return ret; + + return 0; } -ssize_t ib_uverbs_exp_modify_cq(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) +int ib_uverbs_exp_modify_cq(struct ib_uverbs_file *file, + struct ib_udata *ucore, struct ib_udata *uhw) { - const char __user *buf = ucore->inbuf; - int in_len = ucore->inlen + uhw->inlen; - struct ib_uverbs_modify_cq_ex cmd; + struct ib_uverbs_exp_modify_cq cmd; struct ib_cq *cq; struct ib_cq_attr attr; - int ret; + int ret; - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; + memset(&cmd, 0, sizeof(cmd)); + ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; + + if (cmd.comp_mask >= IB_UVERBS_EXP_CQ_ATTR_RESERVED) + return -ENOSYS; cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); if (!cq) @@ -3592,80 +4609,141 @@ put_cq_read(cq); - return ret ? 
ret : in_len; + return ret; } -ssize_t ib_uverbs_exp_query_device(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) +int ib_uverbs_exp_query_device(struct ib_uverbs_file *file, + struct ib_udata *ucore, struct ib_udata *uhw) { - struct ib_uverbs_exp_query_device_resp resp; - struct ib_exp_device_attr exp_attr; + struct ib_uverbs_exp_query_device_resp *resp; + struct ib_uverbs_exp_query_device cmd; + struct ib_exp_device_attr *exp_attr; int ret; - if (ucore->outlen + uhw->outlen < sizeof(resp)) - return -ENOSPC; - - memset(&resp, 0, sizeof(resp)); - memset(&exp_attr, 0, sizeof(exp_attr)); - ret = ib_exp_query_device(file->device->ib_dev, &exp_attr); + ret = ucore->ops->copy_from(&cmd, ucore, sizeof(cmd)); if (ret) return ret; - ib_uverbs_query_device_assign(&resp.base, &exp_attr.base, file); + resp = kzalloc(sizeof(*resp), GFP_KERNEL); + exp_attr = kzalloc(sizeof(*exp_attr), GFP_KERNEL); + if (!exp_attr || !resp) { + ret = -ENOMEM; + goto out; + } + ret = ib_exp_query_device(file->device->ib_dev, exp_attr); + if (ret) + goto out; + + memset(resp, 0, sizeof(*resp)); + copy_query_dev_fields(file, &resp->base, &exp_attr->base); - resp.comp_mask = 0; - resp.device_cap_flags2 = 0; + resp->comp_mask = 0; + resp->device_cap_flags2 = 0; /* * Handle regular attr fields */ - if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK) { - resp.timestamp_mask = exp_attr.base.timestamp_mask; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK; + if (exp_attr->base.comp_mask & IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK) { + resp->timestamp_mask = exp_attr->base.timestamp_mask; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK; } - if (exp_attr.base.comp_mask & IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) { - resp.hca_core_clock = exp_attr.base.hca_core_clock; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; + if (exp_attr->base.comp_mask & IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) { + resp->hca_core_clock = exp_attr->base.hca_core_clock; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; } /* * Handle experimental attr fields */ - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_CAP_FLAGS2) { - resp.device_cap_flags2 = exp_attr.device_cap_flags2; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2; + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_CAP_FLAGS2 || + exp_attr->base.device_cap_flags & IB_EXP_DEVICE_MASK) { + resp->device_cap_flags2 = exp_attr->device_cap_flags2; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_CAP_FLAGS2; + resp->device_cap_flags2 |= IB_EXP_DEVICE_MASK & exp_attr->base.device_cap_flags; + resp->base.device_cap_flags &= ~IB_EXP_DEVICE_MASK; } - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_REQ_RD) { - resp.dc_rd_req = exp_attr.dc_rd_req; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_REQ_RD; + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_REQ_RD) { + resp->dc_rd_req = exp_attr->dc_rd_req; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_DC_REQ_RD; } - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_RES_RD) { - resp.dc_rd_res = exp_attr.dc_rd_res; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_DC_RES_RD; + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_DC_RES_RD) { + resp->dc_rd_res = exp_attr->dc_rd_res; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_DC_RES_RD; } - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ) { - resp.inline_recv_sz = exp_attr.inline_recv_sz; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ; + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_MAX_DCT) { + resp->max_dct = exp_attr->max_dct; + resp->comp_mask |= 
IB_EXP_DEVICE_ATTR_MAX_DCT; } - if (exp_attr.exp_comp_mask & IB_EXP_DEVICE_ATTR_RSS_TBL_SZ) { - resp.max_rss_tbl_sz = exp_attr.max_rss_tbl_sz; - resp.comp_mask |= IB_EXP_DEVICE_ATTR_RSS_TBL_SZ; + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ) { + resp->inline_recv_sz = exp_attr->inline_recv_sz; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_INLINE_RECV_SZ; } - if (copy_to_user(ucore->outbuf, &resp, sizeof(resp))) - return -EFAULT; + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_RSS_TBL_SZ) { + resp->max_rss_tbl_sz = exp_attr->max_rss_tbl_sz; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_RSS_TBL_SZ; + } + + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_EXT_ATOMIC_ARGS) { + resp->comp_mask |= IB_EXP_DEVICE_ATTR_EXT_ATOMIC_ARGS; + resp->atomic_arg_sizes = exp_attr->atomic_arg_sizes; + resp->max_fa_bit_boudary = exp_attr->max_fa_bit_boudary; + resp->log_max_atomic_inline_arg = exp_attr->log_max_atomic_inline_arg; + } + + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_UMR) { + resp->umr_caps.max_reg_descriptors = exp_attr->umr_caps.max_reg_descriptors; + resp->umr_caps.max_send_wqe_inline_klms = exp_attr->umr_caps.max_send_wqe_inline_klms; + resp->umr_caps.max_umr_recursion_depth = exp_attr->umr_caps.max_umr_recursion_depth; + resp->umr_caps.max_umr_stride_dimenson = exp_attr->umr_caps.max_umr_stride_dimenson; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_UMR; + } + + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN) { + resp->max_ctx_res_domain = exp_attr->max_ctx_res_domain; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN; + } + + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_MAX_WQ_TYPE_RQ) { + resp->max_wq_type_rq = exp_attr->max_wq_type_rq; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_MAX_WQ_TYPE_RQ; + } + + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_MAX_DEVICE_CTX) { + resp->max_device_ctx = exp_attr->max_device_ctx; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_MAX_DEVICE_CTX; + } - return ucore->inlen + uhw->inlen; + if (exp_attr->exp_comp_mask & IB_EXP_DEVICE_ATTR_RX_HASH) { + resp->rx_hash.max_rwq_indirection_tables = exp_attr->rx_hash_caps.max_rwq_indirection_tables; + resp->rx_hash.max_rwq_indirection_table_size = exp_attr->rx_hash_caps.max_rwq_indirection_table_size; + resp->rx_hash.supported_packet_fields = exp_attr->rx_hash_caps.supported_packet_fields; + resp->rx_hash.supported_qps = exp_attr->rx_hash_caps.supported_qps; + resp->rx_hash.supported_hash_functions = exp_attr->rx_hash_caps.supported_hash_functions; + resp->comp_mask |= IB_EXP_DEVICE_ATTR_RX_HASH; + } + + if (copy_to_user(ucore->outbuf, resp, min_t(size_t, sizeof(*resp), + ucore->outlen))) { + ret = -EFAULT; + goto out; + } + +out: + kfree(exp_attr); + kfree(resp); + + return ret; } -ssize_t ib_uverbs_exp_create_qp(struct ib_uverbs_file *file, - struct ib_udata *ucore, struct ib_udata *uhw) +int ib_uverbs_exp_create_qp(struct ib_uverbs_file *file, + struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uqp_object *obj; struct ib_device *device; @@ -3675,53 +4753,74 @@ struct ib_cq *scq = NULL, *rcq = NULL; struct ib_srq *srq = NULL; struct ib_qp *qp; - struct ib_exp_qp_init_attr attr; - int ret; - struct ib_uverbs_exp_create_qp cmd_exp; + struct ib_exp_qp_init_attr *attr; + struct ib_uverbs_exp_create_qp *cmd_exp; struct ib_uverbs_exp_create_qp_resp resp_exp; struct ib_qp *parentqp = NULL; + int ret; + struct ib_rx_hash_conf rx_hash_conf; + struct ib_rwq_ind_table *ind_tbl = NULL; + int rx_qp = 0; + int i; - memset(&cmd_exp, 0, sizeof(cmd_exp)); - - ret = 
ucore->ops->copy_from(&cmd_exp, ucore, sizeof(cmd_exp)); + cmd_exp = kzalloc(sizeof(*cmd_exp), GFP_KERNEL); + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!cmd_exp || !attr) { + ret = -ENOMEM; + goto err_cmd_attr; + } + ret = ucore->ops->copy_from(cmd_exp, ucore, sizeof(*cmd_exp)); if (ret) - return ret; + goto err_cmd_attr; if (!disable_raw_qp_enforcement && - cmd_exp.qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, - PRIV_NET_RAW)) - return -EPERM; + cmd_exp->qp_type == IB_QPT_RAW_PACKET && priv_check(curthread, + PRIV_NET_RAW)) { + ret = -EPERM; + goto err_cmd_attr; + } + + for (i = 0; i < sizeof(cmd_exp->reserved_2); i++) { + if (cmd_exp->reserved_2[i] != 0) { + ret = -EINVAL; + goto err_cmd_attr; + } + } obj = kzalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return -ENOMEM; + if (!obj) { + ret = -ENOMEM; + goto err_cmd_attr; + } - init_uobj(&obj->uevent.uobject, cmd_exp.user_handle, file->ucontext, + init_uobj(&obj->uevent.uobject, cmd_exp->user_handle, file->ucontext, &qp_lock_class); down_write(&obj->uevent.uobject.mutex); + rx_qp = cmd_exp->rx_hash_conf.rx_hash_function ? 1 : 0; - if (cmd_exp.qp_type == IB_QPT_XRC_TGT) { - xrcd = idr_read_xrcd(cmd_exp.pd_handle, file->ucontext, &xrcd_uobj); + if (cmd_exp->qp_type == IB_QPT_XRC_TGT) { + xrcd = idr_read_xrcd(cmd_exp->pd_handle, file->ucontext, &xrcd_uobj); if (!xrcd) { ret = -EINVAL; goto err_put; } device = xrcd->device; } else { - if (cmd_exp.qp_type == IB_QPT_XRC_INI) { - cmd_exp.max_recv_wr = 0; - cmd_exp.max_recv_sge = 0; + if (cmd_exp->qp_type == IB_QPT_XRC_INI || + cmd_exp->qp_type == IB_EXP_QPT_DC_INI) { + cmd_exp->max_recv_wr = 0; + cmd_exp->max_recv_sge = 0; } else { - if (cmd_exp.is_srq) { - srq = idr_read_srq(cmd_exp.srq_handle, file->ucontext); + if (cmd_exp->is_srq) { + srq = idr_read_srq(cmd_exp->srq_handle, file->ucontext); if (!srq || srq->srq_type != IB_SRQT_BASIC) { ret = -EINVAL; goto err_put; } } - if (cmd_exp.recv_cq_handle != cmd_exp.send_cq_handle) { - rcq = idr_read_cq(cmd_exp.recv_cq_handle, file->ucontext, 0); + if (cmd_exp->recv_cq_handle != cmd_exp->send_cq_handle) { + rcq = idr_read_cq(cmd_exp->recv_cq_handle, file->ucontext, 0); if (!rcq) { ret = -EINVAL; goto err_put; @@ -3729,10 +4828,11 @@ } } - scq = idr_read_cq(cmd_exp.send_cq_handle, file->ucontext, !!rcq); + if (!rx_qp) + scq = idr_read_cq(cmd_exp->send_cq_handle, file->ucontext, !!rcq); rcq = rcq ?: scq; - pd = idr_read_pd(cmd_exp.pd_handle, file->ucontext); - if (!pd || !scq) { + pd = idr_read_pd(cmd_exp->pd_handle, file->ucontext); + if (!pd || (!scq && !rx_qp)) { ret = -EINVAL; goto err_put; } @@ -3740,42 +4840,44 @@ device = pd->device; } - memset(&attr, 0, sizeof(attr)); - attr.event_handler = ib_uverbs_qp_event_handler; - attr.qp_context = file; - attr.send_cq = scq; - attr.recv_cq = rcq; - attr.srq = srq; - attr.xrcd = xrcd; - attr.sq_sig_type = cmd_exp.sq_sig_all ? 
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; - attr.qp_type = cmd_exp.qp_type; - attr.create_flags = 0; - - attr.cap.max_send_wr = cmd_exp.max_send_wr; - attr.cap.max_recv_wr = cmd_exp.max_recv_wr; - attr.cap.max_send_sge = cmd_exp.max_send_sge; - attr.cap.max_recv_sge = cmd_exp.max_recv_sge; - attr.cap.max_inline_data = cmd_exp.max_inline_data; - - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS) - attr.create_flags |= cmd_exp.qp_cap_flags & - (IB_QP_CREATE_CROSS_CHANNEL | - IB_QP_CREATE_MANAGED_SEND | - IB_QP_CREATE_MANAGED_RECV); - - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_QPG) { + attr->event_handler = ib_uverbs_qp_event_handler; + attr->qp_context = file; + attr->send_cq = scq; + attr->recv_cq = rcq; + attr->srq = srq; + attr->xrcd = xrcd; + attr->sq_sig_type = cmd_exp->sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + attr->qp_type = cmd_exp->qp_type; + attr->create_flags = 0; + + attr->cap.max_send_wr = cmd_exp->max_send_wr; + attr->cap.max_recv_wr = cmd_exp->max_recv_wr; + attr->cap.max_send_sge = cmd_exp->max_send_sge; + attr->cap.max_recv_sge = cmd_exp->max_recv_sge; + attr->cap.max_inline_data = cmd_exp->max_inline_data; + attr->rx_hash_conf = NULL; + + if (cmd_exp->comp_mask & IB_UVERBS_EXP_CREATE_QP_CAP_FLAGS) { + if (cmd_exp->qp_cap_flags & ~IBV_UVERBS_EXP_CREATE_QP_FLAGS) { + ret = -EINVAL; + goto err_put; + } + attr->create_flags |= cmd_exp->qp_cap_flags; + } + + if (cmd_exp->comp_mask & IB_UVERBS_EXP_CREATE_QP_QPG) { struct ib_uverbs_qpg *qpg; - if (cmd_exp.qp_type != IB_QPT_RAW_PACKET && - cmd_exp.qp_type != IB_QPT_UD) { + if (cmd_exp->qp_type != IB_QPT_RAW_PACKET && + cmd_exp->qp_type != IB_QPT_UD) { ret = -EINVAL; goto err_put; } - qpg = &cmd_exp.qpg; + qpg = &cmd_exp->qpg; switch (qpg->qpg_type) { case IB_QPG_PARENT: - attr.parent_attrib.rss_child_count = + attr->parent_attrib.rss_child_count = qpg->parent_attrib.rss_child_count; - attr.parent_attrib.tss_child_count = + attr->parent_attrib.tss_child_count = qpg->parent_attrib.tss_child_count; break; case IB_QPG_CHILD_RX: @@ -3786,49 +4888,70 @@ ret = -EINVAL; goto err_put; } - attr.qpg_parent = parentqp; + attr->qpg_parent = parentqp; break; default: ret = -EINVAL; goto err_put; } - attr.qpg_type = qpg->qpg_type; + attr->qpg_type = qpg->qpg_type; } - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) - attr.max_inl_recv = cmd_exp.max_inl_recv; + if (cmd_exp->comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) + attr->max_inl_recv = cmd_exp->max_inl_recv; + + /* No comp mask bit is needed, the value of rx_hash_function is used */ + if (cmd_exp->rx_hash_conf.rx_hash_function) { + ind_tbl = idr_read_rwq_indirection_table(cmd_exp->rx_hash_conf.rwq_ind_tbl_handle, + file->ucontext); + if (!ind_tbl) { + ret = -EINVAL; + goto err_put; + } + rx_hash_conf.rwq_ind_tbl = ind_tbl; + rx_hash_conf.rx_hash_fields_mask = cmd_exp->rx_hash_conf.rx_hash_fields_mask; + rx_hash_conf.rx_hash_function = cmd_exp->rx_hash_conf.rx_hash_function; + rx_hash_conf.rx_hash_key = cmd_exp->rx_hash_conf.rx_hash_key; + rx_hash_conf.rx_key_len = cmd_exp->rx_hash_conf.rx_key_len; + attr->rx_hash_conf = &rx_hash_conf; + } + attr->port_num = cmd_exp->port_num; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); INIT_LIST_HEAD(&obj->mcast_list); - if (cmd_exp.qp_type == IB_QPT_XRC_TGT) - qp = ib_create_qp(pd, (struct ib_qp_init_attr *)&attr); + if (cmd_exp->qp_type == IB_QPT_XRC_TGT) + qp = ib_create_qp(pd, (struct ib_qp_init_attr *)attr); else - qp = device->exp_create_qp(pd, &attr, uhw); + qp = device->exp_create_qp(pd, 
attr, uhw); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto err_put; } - if (cmd_exp.qp_type != IB_QPT_XRC_TGT) { + if (cmd_exp->qp_type != IB_QPT_XRC_TGT) { qp->real_qp = qp; qp->device = device; qp->pd = pd; - qp->send_cq = attr.send_cq; - qp->recv_cq = attr.recv_cq; - qp->srq = attr.srq; - qp->event_handler = attr.event_handler; - qp->qp_context = attr.qp_context; - qp->qp_type = attr.qp_type; + qp->send_cq = attr->send_cq; + qp->recv_cq = attr->recv_cq; + qp->srq = attr->srq; + qp->rwq_ind_tbl = ind_tbl; + qp->event_handler = attr->event_handler; + qp->qp_context = attr->qp_context; + qp->qp_type = attr->qp_type; atomic_set(&qp->usecnt, 0); atomic_inc(&pd->usecnt); - atomic_inc(&attr.send_cq->usecnt); - if (attr.recv_cq) - atomic_inc(&attr.recv_cq->usecnt); - if (attr.srq) - atomic_inc(&attr.srq->usecnt); + if (!rx_qp) + atomic_inc(&attr->send_cq->usecnt); + if (attr->recv_cq) + atomic_inc(&attr->recv_cq->usecnt); + if (attr->srq) + atomic_inc(&attr->srq->usecnt); + if (ind_tbl) + atomic_inc(&ind_tbl->usecnt); } qp->uobject = &obj->uevent.uobject; @@ -3840,15 +4963,15 @@ memset(&resp_exp, 0, sizeof(resp_exp)); resp_exp.qpn = qp->qp_num; resp_exp.qp_handle = obj->uevent.uobject.id; - resp_exp.max_recv_sge = attr.cap.max_recv_sge; - resp_exp.max_send_sge = attr.cap.max_send_sge; - resp_exp.max_recv_wr = attr.cap.max_recv_wr; - resp_exp.max_send_wr = attr.cap.max_send_wr; - resp_exp.max_inline_data = attr.cap.max_inline_data; + resp_exp.max_recv_sge = attr->cap.max_recv_sge; + resp_exp.max_send_sge = attr->cap.max_send_sge; + resp_exp.max_recv_wr = attr->cap.max_recv_wr; + resp_exp.max_send_wr = attr->cap.max_send_wr; + resp_exp.max_inline_data = attr->cap.max_inline_data; - if (cmd_exp.comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) { + if (cmd_exp->comp_mask & IB_UVERBS_EXP_CREATE_QP_INL_RECV) { resp_exp.comp_mask |= IB_UVERBS_EXP_CREATE_QP_RESP_INL_RECV; - resp_exp.max_inl_recv = attr.max_inl_recv; + resp_exp.max_inl_recv = attr->max_inl_recv; } ret = ucore->ops->copy_to(ucore, &resp_exp, sizeof(resp_exp)); @@ -3871,6 +4994,8 @@ put_srq_read(srq); if (parentqp) put_qp_read(parentqp); + if (ind_tbl) + put_rwq_indirection_table_read(ind_tbl); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); @@ -3879,8 +5004,10 @@ obj->uevent.uobject.live = 1; up_write(&obj->uevent.uobject.mutex); + kfree(attr); + kfree(cmd_exp); - return ucore->inlen + uhw->inlen; + return 0; err_copy: idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); @@ -3901,8 +5028,14 @@ put_srq_read(srq); if (parentqp) put_qp_read(parentqp); + if (ind_tbl) + put_rwq_indirection_table_read(ind_tbl); put_uobj_write(&obj->uevent.uobject); + +err_cmd_attr: + kfree(attr); + kfree(cmd_exp); return ret; } @@ -3911,4 +5044,128 @@ { return device->exp_query_device(device, device_attr); } -EXPORT_SYMBOL(ib_exp_query_device); + +int ib_uverbs_exp_create_mr(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_exp_create_mr cmd_exp; + struct ib_uverbs_exp_create_mr_resp resp_exp; + struct ib_pd *pd = NULL; + struct ib_mr *mr = NULL; + struct ib_uobject *uobj = NULL; + struct ib_mr_init_attr attr; + int ret; + + if (ucore->outlen + uhw->outlen < sizeof(resp_exp)) + return -ENOSPC; + + ret = ucore->ops->copy_from(&cmd_exp, ucore, sizeof(cmd_exp)); + if (ret) + return ret; + + uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) + return -ENOMEM; + + init_uobj(uobj, 0, file->ucontext, &mr_lock_class); + down_write(&uobj->mutex); + + pd = 
idr_read_pd(cmd_exp.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_free; + } + /* We first get a new "obj id" to be passed later to reg mr for + further use as mr_id. + */ + ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); + if (ret) + goto err_put; + + memset(&attr, 0, sizeof(attr)); + attr.flags = cmd_exp.create_flags; + attr.exp_access_flags = cmd_exp.exp_access_flags; + attr.max_reg_descriptors = cmd_exp.max_reg_descriptors; + mr = ib_create_mr(pd, &attr); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto err_remove_uobj; + } + + mr->device = pd->device; + mr->pd = pd; + mr->uobject = uobj; + atomic_set(&mr->usecnt, 0); + + uobj->object = mr; + + memset(&resp_exp, 0, sizeof(resp_exp)); + resp_exp.lkey = mr->lkey; + resp_exp.rkey = mr->rkey; + resp_exp.handle = uobj->id; + + ret = ucore->ops->copy_to(ucore, &resp_exp, sizeof(resp_exp)); + if (ret) + goto err_copy; + + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->mr_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + + return 0; + +err_copy: + ib_dereg_mr(mr); + +err_remove_uobj: + idr_remove_uobj(&ib_uverbs_mr_idr, uobj); + +err_put: + put_pd_read(pd); + +err_free: + put_uobj_write(uobj); + return ret; +} + +int ib_uverbs_exp_query_mkey(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_exp_query_mkey cmd_exp; + struct ib_uverbs_exp_query_mkey_resp resp_exp; + struct ib_mr *mr; + struct ib_mkey_attr mkey_attr; + int ret; + + memset(&cmd_exp, 0, sizeof(cmd_exp)); + ret = ucore->ops->copy_from(&cmd_exp, ucore, sizeof(cmd_exp)); + if (ret) + return ret; + + mr = idr_read_mr(cmd_exp.handle, file->ucontext); + if (!mr) + return -EINVAL; + + ret = ib_query_mkey(mr, 0, &mkey_attr); + if (ret) + return ret; + + put_mr_read(mr); + + memset(&resp_exp, 0, sizeof(resp_exp)); + resp_exp.max_reg_descriptors = mkey_attr.max_reg_descriptors; + + ret = ucore->ops->copy_to(ucore, &resp_exp, sizeof(resp_exp)); + if (ret) + return ret; + + return 0; +} Index: sys/ofed/drivers/infiniband/core/uverbs_main.c =================================================================== --- sys/ofed/drivers/infiniband/core/uverbs_main.c +++ sys/ofed/drivers/infiniband/core/uverbs_main.c @@ -51,9 +51,7 @@ #include "uverbs.h" -MODULE_AUTHOR("Roland Dreier"); -MODULE_DESCRIPTION("InfiniBand userspace verbs access"); -MODULE_LICENSE("Dual BSD/GPL"); +/* InfiniBand userspace verbs access */ enum { IB_UVERBS_MAJOR = 231, @@ -81,10 +79,11 @@ #define INIT_UDATA_EX(udata, ibuf, obuf, ilen, olen) \ do { \ (udata)->ops = &uverbs_copy_ex; \ - (udata)->inbuf = (void __user *)(unsigned long)(ibuf); \ - (udata)->outbuf = (void __user *)(unsigned long)(obuf); \ + (udata)->inbuf = (void __user *)(ibuf); \ + (udata)->outbuf = (void __user *)(obuf); \ (udata)->inlen = (ilen); \ (udata)->outlen = (olen); \ + (udata)->src = IB_UDATA_EX_CMD; \ } while (0) @@ -101,6 +100,8 @@ DEFINE_IDR(ib_uverbs_xrcd_idr); DEFINE_IDR(ib_uverbs_rule_idr); DEFINE_IDR(ib_uverbs_dct_idr); +DEFINE_IDR(ib_uverbs_wq_idr); +DEFINE_IDR(ib_uverbs_rwq_ind_tbl_idr); static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); @@ -108,36 +109,37 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) = { - [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, - [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device, - [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port, - 
[IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, - [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, - [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, - [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, + [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, + [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device, + [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port, + [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, + [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, + [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, + [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr, + [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw, [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw, [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel, - [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, - [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, - [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq, - [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq, - [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq, - [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp, - [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp, - [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp, - [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp, - [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send, - [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv, - [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv, - [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah, - [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah, - [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast, - [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast, - [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq, - [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, - [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, - [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, + [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, + [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq, + [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq, + [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq, + [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp, + [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp, + [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp, + [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp, + [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send, + [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv, + [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv, + [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah, + [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah, + [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast, + [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast, + [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq, + [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, + [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, + [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd, [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, @@ -150,10 +152,11 @@ [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, }; +typedef int (*uverbs_ex_cmd)(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw); -static ssize_t 
(*uverbs_exp_cmd_table[])(struct ib_uverbs_file *file, - struct ib_udata *ucore, - struct ib_udata *uhw) = { +static uverbs_ex_cmd uverbs_exp_cmd_table[] = { [IB_USER_VERBS_EXP_CMD_CREATE_QP] = ib_uverbs_exp_create_qp, [IB_USER_VERBS_EXP_CMD_MODIFY_CQ] = ib_uverbs_exp_modify_cq, [IB_USER_VERBS_EXP_CMD_MODIFY_QP] = ib_uverbs_exp_modify_qp, @@ -162,6 +165,20 @@ [IB_USER_VERBS_EXP_CMD_CREATE_DCT] = ib_uverbs_exp_create_dct, [IB_USER_VERBS_EXP_CMD_DESTROY_DCT] = ib_uverbs_exp_destroy_dct, [IB_USER_VERBS_EXP_CMD_QUERY_DCT] = ib_uverbs_exp_query_dct, + [IB_USER_VERBS_EXP_CMD_ARM_DCT] = ib_uverbs_exp_arm_dct, + [IB_USER_VERBS_EXP_CMD_CREATE_MR] = ib_uverbs_exp_create_mr, + [IB_USER_VERBS_EXP_CMD_QUERY_MKEY] = ib_uverbs_exp_query_mkey, + [IB_USER_VERBS_EXP_CMD_REG_MR_EX] = ib_uverbs_exp_reg_mr_ex, +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + [IB_USER_VERBS_EXP_CMD_PREFETCH_MR] = ib_uverbs_exp_prefetch_mr, +#endif + [IB_USER_VERBS_EXP_CMD_REREG_MR] = ib_uverbs_exp_rereg_mr, + [IB_USER_VERBS_EXP_CMD_CREATE_WQ] = ib_uverbs_exp_create_wq, + [IB_USER_VERBS_EXP_CMD_MODIFY_WQ] = ib_uverbs_exp_modify_wq, + [IB_USER_VERBS_EXP_CMD_DESTROY_WQ] = ib_uverbs_exp_destroy_wq, + [IB_USER_VERBS_EXP_CMD_CREATE_RWQ_IND_TBL] = ib_uverbs_exp_create_rwq_ind_table, + [IB_USER_VERBS_EXP_CMD_DESTROY_RWQ_IND_TBL] = ib_uverbs_exp_destroy_rwq_ind_table, + [IB_USER_VERBS_EXP_CMD_CREATE_FLOW] = ib_uverbs_exp_create_flow, }; static void ib_uverbs_add_one(struct ib_device *device); @@ -172,7 +189,12 @@ struct ib_uverbs_device *dev = container_of(ref, struct ib_uverbs_device, ref); - complete(&dev->comp); + if (dev->disassociated) { + cleanup_srcu_struct(&dev->disassociate_srcu); + kfree(dev); + } else { + complete(&dev->comp); + } } static void ib_uverbs_release_event_file(struct kref *ref) @@ -257,13 +279,10 @@ struct ib_mw *mw = uobj->object; idr_remove_uobj(&ib_uverbs_mw_idr, uobj); - err = ib_dealloc_mw(mw); - if (err) { - pr_info("user_verbs: couldn't deallocate MW during cleanup.\n"); - pr_info("user_verbs: the system may have become unstable.\n"); - } + ib_dealloc_mw(mw); kfree(uobj); } + list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) { struct ib_flow *flow_id = uobj->object; @@ -291,7 +310,7 @@ list_for_each_entry_safe(uobj, tmp, &context->dct_list, list) { struct ib_dct *dct = uobj->object; struct ib_udct_object *udct = - container_of(uobj, struct ib_udct_object, uobject); + container_of(uobj, struct ib_udct_object, uevent.uobject); idr_remove_uobj(&ib_uverbs_dct_idr, uobj); @@ -302,6 +321,29 @@ kfree(udct); } + list_for_each_entry_safe(uobj, tmp, &context->rwq_ind_tbl_list, list) { + struct ib_rwq_ind_table *rwq_ind_tbl = uobj->object; + struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; + + idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); + err = ib_destroy_rwq_ind_table(rwq_ind_tbl); + if (err) + pr_info("destroying uverbs rwq_ind_tbl failed: err %d\n", err); + + kfree(ind_tbl); + kfree(uobj); + } + + list_for_each_entry_safe(uobj, tmp, &context->wq_list, list) { + struct ib_wq *wq = uobj->object; + + idr_remove_uobj(&ib_uverbs_wq_idr, uobj); + err = ib_destroy_wq(wq); + if (err) + pr_info("destroying uverbs wq failed: err %d\n", err); + kfree(uobj); + } + list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { struct ib_srq *srq = uobj->object; struct ib_uevent_object *uevent = @@ -325,7 +367,6 @@ err = ib_destroy_cq(cq); if (err) pr_info("destroying uverbs cq failed: err %d\n", err); - ib_uverbs_release_ucq(file, ev_file, ucq); kfree(ucq); } @@ -370,7 +411,9 @@ struct ib_uverbs_file *file = 
container_of(ref, struct ib_uverbs_file, ref); - module_put(file->device->ib_dev->owner); + if (!(file->device->flags & UVERBS_FLAG_DISASSOCIATE)) + module_put(file->device->ib_dev->owner); + kref_put(&file->device->ref, ib_uverbs_release_dev); kfree(file); @@ -393,9 +436,15 @@ return -EAGAIN; if (wait_event_interruptible(file->poll_wait, - !list_empty(&file->event_list))) + (!list_empty(&file->event_list) || + file->uverbs_file->device->disassociated))) + /* will reach here in case signal has occoured */ return -ERESTARTSYS; + /* We reach here once list is not empty or once device was disassociated */ + if (list_empty(&file->event_list) && file->uverbs_file->device->disassociated) + return -EIO; + spin_lock_irq(&file->lock); } @@ -469,18 +518,23 @@ } spin_unlock_irq(&file->lock); - if (file->is_async) { - ib_unregister_event_handler(&file->uverbs_file->event_handler); - kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); + mutex_lock(&file->uverbs_file->device->disassociate_mutex); + if (!file->uverbs_file->device->disassociated) { + list_del(&file->list); + if (file->is_async) + ib_unregister_event_handler(&file->uverbs_file->event_handler); } - kref_put(&file->ref, ib_uverbs_release_event_file); + mutex_unlock(&file->uverbs_file->device->disassociate_mutex); + + kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); + kref_put(&file->ref, ib_uverbs_release_event_file); return 0; } static const struct file_operations uverbs_event_fops = { .owner = THIS_MODULE, - .read = ib_uverbs_event_read, + .read = ib_uverbs_event_read, .poll = ib_uverbs_event_poll, .release = ib_uverbs_event_close, .fasync = ib_uverbs_event_fasync, @@ -527,7 +581,7 @@ static void ib_uverbs_async_handler(struct ib_uverbs_file *file, __u64 element, __u64 event, struct list_head *obj_list, - u32 *counter) + u32 *counter, u32 rsc_type) { struct ib_uverbs_event *entry; unsigned long flags; @@ -547,6 +601,7 @@ entry->desc.async.element = element; entry->desc.async.event_type = event; entry->counter = counter; + entry->desc.async.rsc_type = rsc_type; list_add_tail(&entry->list, &file->async_file->event_list); if (obj_list) @@ -566,19 +621,23 @@ ib_uverbs_async_handler(uobj->uverbs_file, uobj->uobject.user_handle, event->event, &uobj->async_list, - &uobj->async_events_reported); + &uobj->async_events_reported, IB_EVENT_RSC_CQ); } void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr) { struct ib_uevent_object *uobj; + /* for XRC target qp's, check that qp is live */ + if (!event->element.qp->uobject || !event->element.qp->uobject->live) + return; + uobj = container_of(event->element.qp->uobject, struct ib_uevent_object, uobject); ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, event->event, &uobj->event_list, - &uobj->events_reported); + &uobj->events_reported, IB_EVENT_RSC_QP); } void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr) @@ -590,7 +649,7 @@ ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle, event->event, &uobj->event_list, - &uobj->events_reported); + &uobj->events_reported, IB_EVENT_RSC_SRQ); } void ib_uverbs_event_handler(struct ib_event_handler *handler, @@ -600,7 +659,7 @@ container_of(handler, struct ib_uverbs_file, event_handler); ib_uverbs_async_handler(file, event->element.port_num, event->event, - NULL, NULL); + NULL, NULL, IB_EVENT_RSC_DEVICE); } struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, @@ -621,19 +680,33 @@ ev_file->is_async = is_async; /* - * fops_get() can't fail here, because we're 
coming from a - * system call on a uverbs file, which will already have a - * module reference. - */ + * fops_get() can't fail here, because we're coming from a + * system call on a uverbs file, which will already have a + * module reference. + */ filp = alloc_file(FMODE_READ, fops_get(&uverbs_event_fops)); if (IS_ERR(filp)) { kfree(ev_file); + return filp; } else { - filp->private_data = ev_file; + filp->private_data = ev_file; + } + + mutex_lock(&uverbs_file->device->disassociate_mutex); + if (!uverbs_file->device->disassociated) { + list_add_tail(&ev_file->list, + &uverbs_file->device->uverbs_events_file_list); + mutex_unlock(&uverbs_file->device->disassociate_mutex); + + return filp; } - return filp; + mutex_unlock(&uverbs_file->device->disassociate_mutex); + + fput(filp); + kfree(ev_file); + return ERR_PTR(-EIO); } /* @@ -665,302 +738,164 @@ return ev_file; } -static const char *verbs_cmd_str(__u32 cmd) -{ - switch (cmd) { - case IB_USER_VERBS_CMD_GET_CONTEXT: - return "GET_CONTEXT"; - case IB_USER_VERBS_CMD_QUERY_DEVICE: - return "QUERY_DEVICE"; - case IB_USER_VERBS_CMD_QUERY_PORT: - return "QUERY_PORT"; - case IB_USER_VERBS_CMD_ALLOC_PD: - return "ALLOC_PD"; - case IB_USER_VERBS_CMD_DEALLOC_PD: - return "DEALLOC_PD"; - case IB_USER_VERBS_CMD_REG_MR: - return "REG_MR"; - case IB_USER_VERBS_CMD_DEREG_MR: - return "DEREG_MR"; - case IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL: - return "CREATE_COMP_CHANNEL"; - case IB_USER_VERBS_CMD_CREATE_CQ: - return "CREATE_CQ"; - case IB_USER_VERBS_CMD_RESIZE_CQ: - return "RESIZE_CQ"; - case IB_USER_VERBS_CMD_POLL_CQ: - return "POLL_CQ"; - case IB_USER_VERBS_CMD_REQ_NOTIFY_CQ: - return "REQ_NOTIFY_CQ"; - case IB_USER_VERBS_CMD_DESTROY_CQ: - return "DESTROY_CQ"; - case IB_USER_VERBS_CMD_CREATE_QP: - return "CREATE_QP"; - case IB_USER_VERBS_CMD_QUERY_QP: - return "QUERY_QP"; - case IB_USER_VERBS_CMD_MODIFY_QP: - return "MODIFY_QP"; - case IB_USER_VERBS_CMD_DESTROY_QP: - return "DESTROY_QP"; - case IB_USER_VERBS_CMD_POST_SEND: - return "POST_SEND"; - case IB_USER_VERBS_CMD_POST_RECV: - return "POST_RECV"; - case IB_USER_VERBS_CMD_POST_SRQ_RECV: - return "POST_SRQ_RECV"; - case IB_USER_VERBS_CMD_CREATE_AH: - return "CREATE_AH"; - case IB_USER_VERBS_CMD_DESTROY_AH: - return "DESTROY_AH"; - case IB_USER_VERBS_CMD_ATTACH_MCAST: - return "ATTACH_MCAST"; - case IB_USER_VERBS_CMD_DETACH_MCAST: - return "DETACH_MCAST"; - case IB_USER_VERBS_CMD_CREATE_SRQ: - return "CREATE_SRQ"; - case IB_USER_VERBS_CMD_MODIFY_SRQ: - return "MODIFY_SRQ"; - case IB_USER_VERBS_CMD_QUERY_SRQ: - return "QUERY_SRQ"; - case IB_USER_VERBS_CMD_DESTROY_SRQ: - return "DESTROY_SRQ"; - case IB_USER_VERBS_CMD_OPEN_XRCD: - return "OPEN_XRCD"; - case IB_USER_VERBS_CMD_CLOSE_XRCD: - return "CLOSE_XRCD"; - case IB_USER_VERBS_CMD_CREATE_XSRQ: - return "CREATE_XSRQ"; - case IB_USER_VERBS_CMD_OPEN_QP: - return "OPEN_QP"; - } - - return "Unknown command"; -} - enum { COMMAND_INFO_MASK = 0x1000, }; -static ssize_t ib_uverbs_exp_handle_cmd(struct ib_uverbs_file *file, - const char __user *buf, - struct ib_device *dev, - struct ib_uverbs_cmd_hdr *hdr, - size_t count, - int legacy_ex_cmd) -{ - struct ib_udata ucore; - struct ib_udata uhw; - struct ib_uverbs_ex_cmd_hdr ex_hdr; - __u32 command = hdr->command - IB_USER_VERBS_EXP_CMD_FIRST; - - if (hdr->command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) - return -EINVAL; - - if (command >= ARRAY_SIZE(uverbs_exp_cmd_table) || - !uverbs_exp_cmd_table[command]) - return -EINVAL; - - if (!file->ucontext) - return -EINVAL; - - if 
(!(dev->uverbs_exp_cmd_mask & (1ull << command))) - return -ENOSYS; - - if (legacy_ex_cmd) { - struct ib_uverbs_ex_cmd_hdr_legacy hxl; - struct ib_uverbs_ex_cmd_resp1_legacy resp1; - __u64 response; - ssize_t ret; - - if (count < sizeof(hxl)) - return -EINVAL; - - if (copy_from_user(&hxl, buf, sizeof(hxl))) - return -EFAULT; - - if (((hxl.in_words + hxl.provider_in_words) * 4) != count) - return -EINVAL; - - count -= sizeof(hxl); - buf += sizeof(hxl); - if (hxl.out_words || hxl.provider_out_words) { - if (count < sizeof(resp1)) - return -EINVAL; - if (copy_from_user(&resp1, buf, sizeof(resp1))) - return -EFAULT; - response = resp1.response; - if (!response) - return -EINVAL; - - /* - * Change user buffer to comply with new extension format. - */ - if (sizeof(resp1.comp_mask) != sizeof(resp1.response)) - return -EFAULT; - buf += sizeof(resp1.comp_mask); - if (copy_to_user(__DECONST(void __user *, buf), &resp1.comp_mask, - sizeof(resp1.response))) - return -EFAULT; - - } else { - response = 0; - } - - INIT_UDATA_EX(&ucore, - (hxl.in_words) ? buf : 0, - response, - hxl.in_words * 4, - hxl.out_words * 4); - - INIT_UDATA_EX(&uhw, - (hxl.provider_in_words) ? buf + ucore.inlen : 0, - (hxl.provider_out_words) ? response + ucore.outlen : 0, - hxl.provider_in_words * 4, - hxl.provider_out_words * 4); - - ret = uverbs_exp_cmd_table[command](file, &ucore, &uhw); - /* - * UnChange user buffer - */ - if (response && copy_to_user(__DECONST(void __user *, buf), &resp1.response, sizeof(resp1.response))) - return -EFAULT; - - return ret; - } else { - if (count < (sizeof(hdr) + sizeof(ex_hdr))) - return -EINVAL; - - if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) - return -EFAULT; - - buf += sizeof(hdr) + sizeof(ex_hdr); - - if ((hdr->in_words + ex_hdr.provider_in_words) * 8 != count) - return -EINVAL; - - if (ex_hdr.response) { - if (!hdr->out_words && !ex_hdr.provider_out_words) - return -EINVAL; - } else { - if (hdr->out_words || ex_hdr.provider_out_words) - return -EINVAL; - } - - INIT_UDATA_EX(&ucore, - (hdr->in_words) ? buf : 0, - (unsigned long)ex_hdr.response, - hdr->in_words * 8, - hdr->out_words * 8); - - INIT_UDATA_EX(&uhw, - (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0, - (ex_hdr.provider_out_words) ? ex_hdr.response + ucore.outlen : 0, - ex_hdr.provider_in_words * 8, - ex_hdr.provider_out_words * 8); - - return uverbs_exp_cmd_table[command](file, &ucore, &uhw); - } -} - static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_uverbs_file *file = filp->private_data; struct ib_device *dev = file->device->ib_dev; struct ib_uverbs_cmd_hdr hdr; - struct timespec ts1; - struct timespec ts2; - ktime_t t1, t2, delta; - s64 ds; - ssize_t ret; - u64 dividend; - u32 divisor; - __u32 flags; __u32 command; - int legacy_ex_cmd = 0; + int exp_cmd; size_t written_count = count; + __u32 flags; + int srcu_key; + ssize_t ret; - if (count < sizeof hdr) + if (count < sizeof hdr) { + pr_debug("ib_uverbs_write: header too short\n"); return -EINVAL; + } if (copy_from_user(&hdr, buf, sizeof hdr)) return -EFAULT; - /* - * For BWD compatibility change old style extension verbs commands - * to their equivalent experimental command. 
- */ - if ((hdr.command >= IB_USER_VERBS_LEGACY_CMD_FIRST) && - (hdr.command <= IB_USER_VERBS_LEGACY_EX_CMD_LAST)) { - hdr.command += IB_USER_VERBS_EXP_CMD_FIRST - - IB_USER_VERBS_LEGACY_CMD_FIRST; - legacy_ex_cmd = 1; - } - flags = (hdr.command & IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + exp_cmd = !flags && (command >= IB_USER_VERBS_EXP_CMD_FIRST); - ktime_get_ts(&ts1); - if (!flags && (command >= IB_USER_VERBS_EXP_CMD_FIRST)) { - ret = ib_uverbs_exp_handle_cmd(file, buf, dev, &hdr, count, legacy_ex_cmd); - } else if (!flags) { + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + if (file->device->disassociated) { + ret = -EIO; + goto out; + } + + if (!flags && !exp_cmd) { if (command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[command]) - return -EINVAL; + !uverbs_cmd_table[command]) { + pr_debug("ib_uverbs_write: unexpected command\n"); + ret = -EINVAL; + goto out; + } if (!file->ucontext && - command != IB_USER_VERBS_CMD_GET_CONTEXT) - return -EINVAL; + command != IB_USER_VERBS_CMD_GET_CONTEXT) { + pr_debug("ib_uverbs_write: invalid context\n"); + ret = -EINVAL; + goto out; + } - if (!(dev->uverbs_cmd_mask & (1ull << command))) - return -ENOSYS; + if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command))) { + pr_debug("ib_uverbs_write: command not support by the device\n"); + ret = -ENOSYS; + goto out; + } - if (hdr.in_words * 4 != count) - return -EINVAL; + if (hdr.in_words * 4 != count) { + pr_debug("ib_uverbs_write: header input length doesn't match written length\n"); + ret = -EINVAL; + goto out; + } ret = uverbs_cmd_table[command](file, - buf + sizeof(hdr), - hdr.in_words * 4, - hdr.out_words * 4); - } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { + buf + sizeof(hdr), + hdr.in_words * 4, + hdr.out_words * 4); + + } else if ((flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) || exp_cmd) { + struct ib_uverbs_ex_cmd_hdr ex_hdr; struct ib_udata ucore; struct ib_udata uhw; - struct ib_uverbs_ex_cmd_hdr ex_hdr; + int arr_size; + uverbs_ex_cmd *cmd_tbl; + u64 cmd_mask; + + if (exp_cmd) { + command = hdr.command - IB_USER_VERBS_EXP_CMD_FIRST; + arr_size = ARRAY_SIZE(uverbs_exp_cmd_table); + cmd_tbl = uverbs_exp_cmd_table; + cmd_mask = dev->uverbs_exp_cmd_mask; + } else { + arr_size = ARRAY_SIZE(uverbs_ex_cmd_table); + cmd_tbl = uverbs_ex_cmd_table; + cmd_mask = dev->uverbs_ex_cmd_mask; + } if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | - IB_USER_VERBS_CMD_COMMAND_MASK)) - return -EINVAL; + IB_USER_VERBS_CMD_COMMAND_MASK)) { + pr_debug("ib_uverbs_write: extended command invalid opcode\n"); + ret = -EINVAL; + goto out; + } - if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || - !uverbs_ex_cmd_table[command]) - return -EINVAL; + if (command >= arr_size || !cmd_tbl[command]) { + pr_debug("ib_uverbs_write: invalid extended command\n"); + ret = -EINVAL; + goto out; + } - if (!file->ucontext) - return -EINVAL; + if (!file->ucontext) { + pr_debug("ib_uverbs_write: invalid context in extended command\n"); + ret = -EINVAL; + goto out; + } - if (!(dev->uverbs_ex_cmd_mask & (1ull << command))) - return -ENOSYS; + if (!(cmd_mask & (1ull << command))) { + pr_debug("ib_uverbs_write: extended command not supported by driver\n"); + ret = -ENOSYS; + goto out; + } - if (count < (sizeof(hdr) + sizeof(ex_hdr))) - return -EINVAL; + if (count < (sizeof(hdr) + sizeof(ex_hdr))) { + pr_debug("ib_uverbs_write: ex header input length doesn't match written length\n"); + ret = -EINVAL; + goto out; + } 
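The checks around this point pin down the user/kernel framing that ib_uverbs_write() now applies to both extended and experimental commands: the buffer passed to write(2) starts with a struct ib_uverbs_cmd_hdr, followed by a struct ib_uverbs_ex_cmd_hdr, followed by (hdr.in_words + ex_hdr.provider_in_words) eight-byte words of payload; ex_hdr.cmd_hdr_reserved must be clear, and ex_hdr.response must be non-zero exactly when hdr.out_words or ex_hdr.provider_out_words requests output. The standalone validator below is a minimal sketch of those length rules, not part of the patch; it assumes only the header fields referenced in this hunk, with the struct definitions taken from the rdma/ib_user_verbs.h uapi header.

#include <stddef.h>
#include <rdma/ib_user_verbs.h>	/* struct ib_uverbs_cmd_hdr, struct ib_uverbs_ex_cmd_hdr */

/*
 * Return non-zero when the lengths encoded in an extended/experimental
 * command header agree with the number of bytes handed to write(2).
 * Mirrors the validation performed by ib_uverbs_write() in this hunk.
 */
static int ex_cmd_framing_ok(const struct ib_uverbs_cmd_hdr *hdr,
			     const struct ib_uverbs_ex_cmd_hdr *ex_hdr,
			     size_t count)
{
	size_t payload;

	/* both headers must be present before any payload */
	if (count < sizeof(*hdr) + sizeof(*ex_hdr))
		return 0;
	payload = count - sizeof(*hdr) - sizeof(*ex_hdr);

	/* core and provider input lengths are counted in 8-byte words */
	if ((size_t)(hdr->in_words + ex_hdr->provider_in_words) * 8 != payload)
		return 0;

	/* reserved header field must be zero */
	if (ex_hdr->cmd_hdr_reserved)
		return 0;

	/* a response pointer is required exactly when output words are requested */
	if (ex_hdr->response)
		return hdr->out_words || ex_hdr->provider_out_words;
	return !hdr->out_words && !ex_hdr->provider_out_words;
}

In other words, a well-formed call writes sizeof(struct ib_uverbs_cmd_hdr) + sizeof(struct ib_uverbs_ex_cmd_hdr) + (in_words + provider_in_words) * 8 bytes in a single write(2), which is exactly what the count comparison just below enforces.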
- if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) - return -EFAULT; + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) { + ret = -EFAULT; + goto out; + } count -= sizeof(hdr) + sizeof(ex_hdr); buf += sizeof(hdr) + sizeof(ex_hdr); - if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) - return -EINVAL; + if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) { + pr_debug("ib_uverbs_write: extended command doesn't match written length\n"); + ret = -EINVAL; + goto out; + } + + if (ex_hdr.cmd_hdr_reserved) { + ret = -EINVAL; + goto out; + } if (ex_hdr.response) { - if (!hdr.out_words && !ex_hdr.provider_out_words) - return -EINVAL; + if (!hdr.out_words && !ex_hdr.provider_out_words) { + pr_debug("ib_uverbs_write: got response pointer to a zero length buffer\n"); + ret = -EINVAL; + goto out; + } + +/* + if (!access_ok(VERIFY_WRITE, + (void __user *) (unsigned long) ex_hdr.response, + (hdr.out_words + ex_hdr.provider_out_words) * 8)) { + ret = -EFAULT; + goto out; + } +*/ } else { - if (hdr.out_words || ex_hdr.provider_out_words) - return -EINVAL; + if (hdr.out_words || ex_hdr.provider_out_words) { + pr_debug("ib_uverbs_write: got NULL response pointer but non-zero output length\n"); + ret = -EINVAL; + goto out; + } } INIT_UDATA_EX(&ucore, @@ -974,84 +909,78 @@ (ex_hdr.provider_out_words) ? ex_hdr.response + ucore.outlen : 0, ex_hdr.provider_in_words * 8, ex_hdr.provider_out_words * 8); + if (exp_cmd) { + ucore.src = IB_UDATA_EXP_CMD; + uhw.src = IB_UDATA_EXP_CMD; + } - ret = uverbs_ex_cmd_table[command](file, &ucore, &uhw); - - if (ret) - return ret; + ret = cmd_tbl[command](file, &ucore, &uhw); + if (!ret) + ret = written_count; - return written_count; + goto out; } else { - return -EFAULT; + ret = -EFAULT; + goto out; } - if ((dev->cmd_perf & (COMMAND_INFO_MASK - 1)) == hdr.command) { - ktime_get_ts(&ts2); - t1 = timespec_to_ktime(ts1); - t2 = timespec_to_ktime(ts2); - delta = ktime_sub(t2, t1); - ds = ktime_to_ns(delta); - spin_lock(&dev->cmd_perf_lock); - dividend = dev->cmd_avg * dev->cmd_n + ds; - ++dev->cmd_n; - divisor = dev->cmd_n; - do_div(dividend, divisor); - dev->cmd_avg = dividend; - spin_unlock(&dev->cmd_perf_lock); - if (dev->cmd_perf & COMMAND_INFO_MASK) { - pr_info("%s: %s execution time = %lld nsec\n", - file->device->ib_dev->name, - verbs_cmd_str(hdr.command), - (long long)ds); - } - } +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return ret; } static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) { struct ib_uverbs_file *file = filp->private_data; + int ret = 0; + int srcu_key; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + if (file->device->disassociated) { + ret = -EIO; + goto out; + } if (!file->ucontext) - return -ENODEV; + ret = -ENODEV; else - return file->device->ib_dev->mmap(file->ucontext, vma); + ret = file->device->ib_dev->mmap(file->ucontext, vma); +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return ret; } -/* XXX Not supported in FreeBSD */ -#if 0 -static unsigned long ib_uverbs_get_unmapped_area(struct file *filp, - unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) + +static long ib_uverbs_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) { struct ib_uverbs_file *file = filp->private_data; + long ret = 0; + int srcu_key; - if (!file->ucontext) - return -ENODEV; - else { - if (!file->device->ib_dev->get_unmapped_area) - return current->mm->get_unmapped_area(filp, addr, len, - pgoff, 
flags); + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); - return file->device->ib_dev->get_unmapped_area(filp, addr, len, - pgoff, flags); + if (file->device->disassociated) { + ret = -EIO; + goto out; } -} -#endif - -static long ib_uverbs_ioctl(struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct ib_uverbs_file *file = filp->private_data; - if (!file->device->ib_dev->ioctl) - return -ENOTSUPP; + if (!file->device->ib_dev->ioctl) { + ret = -ENOTSUPP; + goto out; + } - if (!file->ucontext) - return -ENODEV; - else + if (!file->ucontext) { + ret = -ENODEV; + goto out; + } else { /* provider should provide it's own locking mechanism */ - return file->device->ib_dev->ioctl(file->ucontext, cmd, arg); + ret = file->device->ib_dev->ioctl(file->ucontext, cmd, arg); + } + +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return ret; } /* @@ -1069,6 +998,7 @@ struct ib_uverbs_device *dev; struct ib_uverbs_file *file; int ret; + int module_dependent; dev = container_of(inode->i_cdev->si_drv1, struct ib_uverbs_device, cdev); if (dev) @@ -1076,15 +1006,31 @@ else return -ENXIO; - if (!try_module_get(dev->ib_dev->owner)) { - ret = -ENODEV; + mutex_lock(&dev->disassociate_mutex); + if (dev->disassociated) { + ret = -EIO; goto err; } - file = kmalloc(sizeof *file, GFP_KERNEL); + /* In case IB device supports disassociate ucontext, there is no hard + * dependency between uverbs device and its low level device. + */ + module_dependent = !(dev->flags & UVERBS_FLAG_DISASSOCIATE); + + if (module_dependent) { + if (!try_module_get(dev->ib_dev->owner)) { + ret = -ENODEV; + goto err; + } + } + + file = kzalloc(sizeof *file, GFP_KERNEL); if (!file) { ret = -ENOMEM; - goto err_module; + if (module_dependent) + goto err_module; + + goto err; } file->device = dev; @@ -1094,6 +1040,8 @@ mutex_init(&file->mutex); filp->private_data = file; + list_add_tail(&file->list, &dev->uverbs_file_list); + mutex_unlock(&dev->disassociate_mutex); return nonseekable_open(inode, filp); @@ -1101,6 +1049,7 @@ module_put(dev->ib_dev->owner); err: + mutex_unlock(&dev->disassociate_mutex); kref_put(&dev->ref, ib_uverbs_release_dev); return ret; } @@ -1108,9 +1057,26 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; + struct ib_ucontext *ucontext = NULL; + int srcu_key; + + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); + mutex_lock(&file->device->disassociate_mutex); + if (!file->device->disassociated) { + /* No need to remove from the list once alreday disassociated. + * Try doing that might race with ib_uverbs_free_hw_resources + * as mutex is not held by that time. 
+ */ + list_del(&file->list); + ucontext = file->ucontext; + } + + mutex_unlock(&file->device->disassociate_mutex); - ib_uverbs_cleanup_ucontext(file, file->ucontext); + if (ucontext) + ib_uverbs_cleanup_ucontext(file, ucontext); + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); if (file->async_file) kref_put(&file->async_file->ref, ib_uverbs_release_event_file); @@ -1120,25 +1086,21 @@ } static const struct file_operations uverbs_fops = { - .owner = THIS_MODULE, - .write = ib_uverbs_write, - .open = ib_uverbs_open, + .owner = THIS_MODULE, + .write = ib_uverbs_write, + .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, .unlocked_ioctl = ib_uverbs_ioctl, }; static const struct file_operations uverbs_mmap_fops = { - .owner = THIS_MODULE, - .write = ib_uverbs_write, + .owner = THIS_MODULE, + .write = ib_uverbs_write, .mmap = ib_uverbs_mmap, - .open = ib_uverbs_open, + .open = ib_uverbs_open, .release = ib_uverbs_close, .llseek = no_llseek, -/* XXX Not supported in FreeBSD */ -#if 0 - .get_unmapped_area = ib_uverbs_get_unmapped_area, -#endif .unlocked_ioctl = ib_uverbs_ioctl, }; @@ -1160,18 +1122,6 @@ } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); -static ssize_t show_dev_ref_cnt(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ib_uverbs_device *dev = dev_get_drvdata(device); - - if (!dev) - return -ENODEV; - - return sprintf(buf, "%d\n", atomic_read(&dev->ref.refcount)); -} -static DEVICE_ATTR(ref_cnt, S_IRUGO, show_dev_ref_cnt, NULL); - static ssize_t show_dev_abi_version(struct device *device, struct device_attribute *attr, char *buf) { @@ -1186,7 +1136,7 @@ static ssize_t show_abi_version(struct class *class, struct class_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION); + return sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION); } static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); @@ -1229,7 +1179,7 @@ return -ENODEV; return sprintf(buf, "0x%04x\n", - ((struct pci_dev *)dev->ib_dev->dma_device)->device); + ((struct pci_dev *)dev->ib_dev->dma_device)->device); } static DEVICE_ATTR(device, S_IRUGO, show_dev_device, NULL); @@ -1242,7 +1192,7 @@ return -ENODEV; return sprintf(buf, "0x%04x\n", - ((struct pci_dev *)dev->ib_dev->dma_device)->vendor); + ((struct pci_dev *)dev->ib_dev->dma_device)->vendor); } static DEVICE_ATTR(vendor, S_IRUGO, show_dev_vendor, NULL); @@ -1255,8 +1205,8 @@ }; static struct attribute_group device_group = { - .name = "device", - .attrs = device_attrs + .name = "device", + .attrs = device_attrs }; static void ib_uverbs_add_one(struct ib_device *device) @@ -1264,6 +1214,7 @@ int devnum; dev_t base; struct ib_uverbs_device *uverbs_dev; + int ret; if (!device->alloc_ucontext) return; @@ -1276,6 +1227,13 @@ init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); + mutex_init(&uverbs_dev->disassociate_mutex); + ret = init_srcu_struct(&uverbs_dev->disassociate_srcu); + if (ret) + goto err_init; + + INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); + INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); spin_lock(&map_lock); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); @@ -1283,7 +1241,7 @@ spin_unlock(&map_lock); devnum = find_overflow_devnum(); if (devnum < 0) - goto err; + goto err; spin_lock(&map_lock); uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES; @@ -1314,13 +1272,14 @@ if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev)) goto err_class; - if 
(device_create_file(uverbs_dev->dev, &dev_attr_ref_cnt)) - goto err_class; if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; if (sysfs_create_group(&uverbs_dev->dev->kobj, &device_group)) goto err_class; + if (device->disassociate_ucontext) + uverbs_dev->flags |= UVERBS_FLAG_DISASSOCIATE; + ib_set_client_data(device, &uverbs_client, uverbs_dev); return; @@ -1336,15 +1295,71 @@ clear_bit(devnum, overflow_map); err: + cleanup_srcu_struct(&uverbs_dev->disassociate_srcu); + +err_init: kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); wait_for_completion(&uverbs_dev->comp); kfree(uverbs_dev); return; } +static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev) +{ + struct ib_uverbs_file *file, *tmp_file; + struct ib_uverbs_event_file *event_file, *tmp_event_file; + struct ib_event event; + + mutex_lock(&uverbs_dev->disassociate_mutex); + uverbs_dev->disassociated = 1; + /* We must release the mutex before going ahead and calling + * disassociate_ucontext as a nested call to uverbs_close might + * be called as a result of freeing the resources (e.g mmput). + * In addition, we should take an extra ref count on files to prevent + * them being freed as part of parallel file closing, from other task + * or from event occurs internally from that one. + */ + list_for_each_entry(file, &uverbs_dev->uverbs_file_list, list) + kref_get(&file->ref); + list_for_each_entry(event_file, &uverbs_dev->uverbs_events_file_list, list) + kref_get(&event_file->ref); + mutex_unlock(&uverbs_dev->disassociate_mutex); + + /* pending running commands to terminate */ + synchronize_srcu(&uverbs_dev->disassociate_srcu); + event.event = IB_EVENT_DEVICE_FATAL; + event.element.port_num = 0; + event.device = uverbs_dev->ib_dev; + + list_for_each_entry(file, &uverbs_dev->uverbs_file_list, list) { + ib_uverbs_event_handler(&file->event_handler, &event); + uverbs_dev->ib_dev->disassociate_ucontext(file->ucontext); + ib_uverbs_cleanup_ucontext(file, file->ucontext); + } + + list_for_each_entry(event_file, &uverbs_dev->uverbs_events_file_list, list) { + if (event_file->is_async) { + /* ib_device is freed once that function/remove_one is + * finished, must unregister the event handler before. + */ + ib_unregister_event_handler(&event_file->uverbs_file->event_handler); + } + + wake_up_interruptible(&event_file->poll_wait); + kill_fasync(&event_file->async_queue, SIGIO, POLL_IN); + } + + /* we need a safe iterator as file might be freed as part of loop */ + list_for_each_entry_safe(file, tmp_file, &uverbs_dev->uverbs_file_list, list) + kref_put(&file->ref, ib_uverbs_release_file); + + list_for_each_entry_safe(event_file, tmp_event_file, &uverbs_dev->uverbs_events_file_list, list) + kref_put(&event_file->ref, ib_uverbs_release_event_file); +} static void ib_uverbs_remove_one(struct ib_device *device) { struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client); + int wait_clients = 1; if (!uverbs_dev) return; @@ -1355,13 +1370,33 @@ cdev_del(&uverbs_dev->cdev); if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) - clear_bit(uverbs_dev->devnum, dev_map); + clear_bit(uverbs_dev->devnum, dev_map); else clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); + if (uverbs_dev->flags & UVERBS_FLAG_DISASSOCIATE) { + /* We disassociate HW resources and immediately returning, not + * pending to active userspace clients. Upon returning ib_device + * may be freed internally and is not valid any more. 
+ * uverbs_device is still available, when all clients close + * their files, the uverbs device ref count will be zero and its + * resources will be freed. + * Note: At that step no more files can be opened on that cdev + * as it was deleted, however active clients can still issue + * commands and close their open files. + */ + ib_uverbs_free_hw_resources(uverbs_dev); + wait_clients = 0; + /* ib device can no longer be accessed. It is freed when this procedure returns. */ + uverbs_dev->ib_dev = NULL; + } + /* ref count taken as part of add one is put back in both modes.*/ kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); - wait_for_completion(&uverbs_dev->comp); - kfree(uverbs_dev); + if (wait_clients) { + wait_for_completion(&uverbs_dev->comp); + cleanup_srcu_struct(&uverbs_dev->disassociate_srcu); + kfree(uverbs_dev); + } } static char *uverbs_devnode(struct device *dev, umode_t *mode) @@ -1431,5 +1466,5 @@ idr_destroy(&ib_uverbs_srq_idr); } -module_init(ib_uverbs_init); +module_init_order(ib_uverbs_init, SI_ORDER_THIRD); module_exit(ib_uverbs_cleanup); Index: sys/ofed/drivers/infiniband/core/uverbs_marshall.c =================================================================== --- sys/ofed/drivers/infiniband/core/uverbs_marshall.c +++ sys/ofed/drivers/infiniband/core/uverbs_marshall.c @@ -140,5 +140,10 @@ dst->packet_life_time = src->packet_life_time; dst->preference = src->preference; dst->packet_life_time_selector = src->packet_life_time_selector; + + memset(dst->dmac, 0, sizeof(dst->dmac)); + dst->net = NULL; + dst->ifindex = 0; + dst->gid_type = IB_GID_TYPE_IB; } EXPORT_SYMBOL(ib_copy_path_rec_from_user); Index: sys/ofed/drivers/infiniband/core/verbs.c =================================================================== --- sys/ofed/drivers/infiniband/core/verbs.c +++ sys/ofed/drivers/infiniband/core/verbs.c @@ -39,14 +39,22 @@ #include #include #include +#include #include #include +#include +#include + +#include +#include #include #include #include -int ib_rate_to_mult(enum ib_rate rate) +#include "core_priv.h" + +__attribute_const__ int ib_rate_to_mult(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 1; @@ -63,7 +71,7 @@ } EXPORT_SYMBOL(ib_rate_to_mult); -enum ib_rate mult_to_ib_rate(int mult) +__attribute_const__ enum ib_rate mult_to_ib_rate(int mult) { switch (mult) { case 1: return IB_RATE_2_5_GBPS; @@ -80,7 +88,7 @@ } EXPORT_SYMBOL(mult_to_ib_rate); -int ib_rate_to_mbps(enum ib_rate rate) +__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 2500; @@ -105,7 +113,7 @@ } EXPORT_SYMBOL(ib_rate_to_mbps); -enum rdma_transport_type +__attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) { switch (node_type) { @@ -115,8 +123,6 @@ return RDMA_TRANSPORT_IB; case RDMA_NODE_RNIC: return RDMA_TRANSPORT_IWARP; - case RDMA_NODE_MIC: - return RDMA_TRANSPORT_SCIF; default: BUG(); return 0; @@ -134,8 +140,6 @@ return IB_LINK_LAYER_INFINIBAND; case RDMA_TRANSPORT_IWARP: return IB_LINK_LAYER_ETHERNET; - case RDMA_TRANSPORT_SCIF: - return IB_LINK_LAYER_SCIF; default: return IB_LINK_LAYER_UNSPECIFIED; } @@ -188,6 +192,122 @@ } EXPORT_SYMBOL(ib_create_ah); +int ib_get_grh_header_version(const void *h) +{ + const struct ip *ip4h = (struct ip *)(h + 20); + struct ip ip4h_checked; + const struct ip6_hdr *ip6h = (struct ip6_hdr *)h; + + if (((ip6h->ip6_vfc & IPV6_VERSION_MASK) >> 4) != 6) + return (ip4h->ip_v == 4) ? 
4 : 0; + /* version may be 6 or 4 */ + if (ip4h->ip_hl != 5) /* IPv4 header length must be 5 for RR */ + return 6; + /* Verify checksum. + We can't write on scattered buffers so we need to copy to + temp buffer. + */ + memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); + ip4h_checked.ip_sum = 0; + ip4h_checked.ip_sum = in_cksum_hdr(&ip4h_checked); + /* if the IPv4 header checksum is OK, believe it */ + if (ip4h->ip_sum == ip4h_checked.ip_sum) + return 4; + return 6; +} +EXPORT_SYMBOL(ib_get_grh_header_version); + +static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, + u8 port_num, + const struct ib_grh *grh) +{ + int grh_version; + + if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) + return RDMA_NETWORK_IB; + + grh_version = ib_get_grh_header_version(grh); + + if (grh_version == 4) + return RDMA_NETWORK_IPV4; + + if (grh->next_hdr == IPPROTO_UDP || grh->next_hdr == 0xfe) + return RDMA_NETWORK_IPV6; + + return RDMA_NETWORK_IB; +} + +struct find_gid_index_context { + u16 vlan_id; + enum ib_gid_type gid_type; +}; + +static bool find_gid_index(const union ib_gid *gid, + const struct ib_gid_attr *gid_attr, + void *context) +{ + u16 tag; + bool dev_supports_vlan = (VLAN_TAG(gid_attr->ndev, &tag) == 0); + struct find_gid_index_context *ctx = + (struct find_gid_index_context *)context; + + if (ctx->gid_type != gid_attr->gid_type) + return false; + + if (!!(ctx->vlan_id != 0xffff) == !dev_supports_vlan || + (dev_supports_vlan && tag != ctx->vlan_id)) + return false; + + return true; +} + +static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num, + u16 vlan_id, union ib_gid *sgid, + enum ib_gid_type gid_type, + u16 *gid_index) +{ + struct find_gid_index_context context = {.vlan_id = vlan_id, + .gid_type = gid_type}; + + return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index, + &context, gid_index); +} + +int ib_get_gids_from_grh(struct ib_grh *grh, enum rdma_network_type net_type, + union ib_gid *sgid, union ib_gid *dgid) +{ + union rdma_network_hdr *l3grh; + struct sockaddr_in src_in; + struct sockaddr_in dst_in; + __be32 src_saddr, dst_saddr; + + if (!sgid || !dgid) + return -EINVAL; + + if (net_type == RDMA_NETWORK_IPV4) { + l3grh = (union rdma_network_hdr *) + ((u8 *)grh + 20); + memcpy(&src_in.sin_addr.s_addr, + &l3grh->roce4grh.ip_src.s_addr, 4); + memcpy(&dst_in.sin_addr.s_addr, + &l3grh->roce4grh.ip_dst.s_addr, 4); + src_saddr = src_in.sin_addr.s_addr; + dst_saddr = dst_in.sin_addr.s_addr; + ipv6_addr_set_v4mapped(src_saddr, + (struct in6_addr *)sgid); + ipv6_addr_set_v4mapped(dst_saddr, + (struct in6_addr *)dgid); + return 0; + } else if (net_type == RDMA_NETWORK_IPV6 || + net_type == RDMA_NETWORK_IB) { + *dgid = grh->dgid; + *sgid = grh->sgid; + return 0; + } else + return -EINVAL; +} +EXPORT_SYMBOL(ib_get_gids_from_grh); + int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, struct ib_grh *grh, struct ib_ah_attr *ah_attr) { @@ -196,28 +316,50 @@ int ret; int is_eth = (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET); + enum rdma_network_type net_type = RDMA_NETWORK_IB; + enum ib_gid_type gid_type = IB_GID_TYPE_IB; + union ib_gid dgid; + union ib_gid sgid; memset(ah_attr, 0, sizeof *ah_attr); if (is_eth) { + if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) + net_type = wc->network_hdr_type; + else + net_type = ib_get_net_type_by_grh(device, port_num, grh); + gid_type = ib_network_to_gid_type(net_type, grh); + } + ret = ib_get_gids_from_grh(grh, net_type, &sgid, 
&dgid); + if (ret) + return ret; + + if (is_eth) { + u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? + wc->vlan_id : 0xffff; + if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; - if (wc->wc_flags & IB_WC_WITH_SMAC && - wc->wc_flags & IB_WC_WITH_VLAN) { - memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); - ah_attr->vlan_id = wc->vlan_id; - } else { + if (!(wc->wc_flags & IB_WC_WITH_SMAC) || + !(wc->wc_flags & IB_WC_WITH_VLAN)) { u32 scope_id = rdma_get_ipv6_scope_id(device, port_num); - ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid, - ah_attr->dmac, &ah_attr->vlan_id, + ret = rdma_addr_find_dmac_by_grh(&dgid, &sgid, + ah_attr->dmac, + wc->wc_flags & IB_WC_WITH_VLAN ? + NULL : &vlan_id, scope_id); if (ret) return ret; } - } else { - ah_attr->vlan_id = 0xffff; - } + ret = get_sgid_index_from_eth(device, port_num, vlan_id, + &dgid, gid_type, &gid_index); + if (ret) + return ret; + + if (wc->wc_flags & IB_WC_WITH_SMAC) + memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); + } ah_attr->dlid = wc->slid; ah_attr->sl = wc->sl; @@ -226,12 +368,16 @@ if (wc->wc_flags & IB_WC_GRH) { ah_attr->ah_flags = IB_AH_GRH; - ah_attr->grh.dgid = grh->sgid; + ah_attr->grh.dgid = sgid; - ret = ib_find_cached_gid(device, &grh->dgid, &port_num, - &gid_index); - if (ret) - return ret; + if (!is_eth) { + ret = ib_find_cached_gid_by_port(device, &dgid, + IB_GID_TYPE_IB, + port_num, NULL, 0, + &gid_index); + if (ret) + return ret; + } ah_attr->grh.sgid_index = (u8) gid_index; flow_class = be32_to_cpu(grh->version_tclass_flow); @@ -273,6 +419,15 @@ } EXPORT_SYMBOL(ib_query_ah); +int ib_query_values(struct ib_device *device, + int q_values, struct ib_device_values *values) +{ + return device->query_values ? + device->query_values(device, q_values, values) : + -ENOSYS; +} +EXPORT_SYMBOL(ib_query_values); + int ib_destroy_ah(struct ib_ah *ah) { struct ib_pd *pd; @@ -291,7 +446,7 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr) -{ +{ struct ib_srq *srq; if (!pd->device->create_srq) @@ -311,7 +466,7 @@ srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; atomic_inc(&srq->ext.xrc.xrcd->usecnt); atomic_inc(&srq->ext.xrc.cq->usecnt); - } + } atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); } @@ -338,14 +493,6 @@ } EXPORT_SYMBOL(ib_query_srq); -int ib_query_values(struct ib_device *device, - int q_values, struct ib_device_values *values) -{ - return device->query_values ? - device->query_values(device, q_values, values) : -ENOSYS; -} -EXPORT_SYMBOL(ib_query_values); - int ib_destroy_srq(struct ib_srq *srq) { struct ib_pd *pd; @@ -360,7 +507,7 @@ pd = srq->pd; srq_type = srq->srq_type; if (srq_type == IB_SRQT_XRC) { - xrcd = srq->ext.xrc.xrcd; + xrcd = srq->ext.xrc.xrcd; cq = srq->ext.xrc.cq; } @@ -384,9 +531,6 @@ struct ib_qp *qp = context; unsigned long flags; - /* The code below must be synced with deletions of existing qps (ib_close_qp) -- - * because a qp from the list may be closed during the scan, resulting in a kernel Oops. 
- */ spin_lock_irqsave(&qp->device->event_handler_lock, flags); list_for_each_entry(event->element.qp, &qp->open_list, open_list) if (event->element.qp->event_handler) @@ -461,7 +605,7 @@ if (!IS_ERR(qp)) { qp->device = device; qp->real_qp = qp; - qp->uobject = NULL; + qp->uobject = NULL; qp->qp_type = qp_init_attr->qp_type; atomic_set(&qp->usecnt, 0); @@ -483,17 +627,17 @@ else real_qp->device->destroy_qp(real_qp); } else { - qp->event_handler = qp_init_attr->event_handler; - qp->qp_context = qp_init_attr->qp_context; + qp->event_handler = qp_init_attr->event_handler; + qp->qp_context = qp_init_attr->qp_context; if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { qp->recv_cq = NULL; qp->srq = NULL; } else { qp->recv_cq = qp_init_attr->recv_cq; - atomic_inc(&qp_init_attr->recv_cq->usecnt); + atomic_inc(&qp_init_attr->recv_cq->usecnt); qp->srq = qp_init_attr->srq; if (qp->srq) - atomic_inc(&qp_init_attr->srq->usecnt); + atomic_inc(&qp_init_attr->srq->usecnt); } qp->pd = pd; @@ -512,9 +656,7 @@ static const struct { int valid; enum ib_qp_attr_mask req_param[IB_QPT_MAX]; - enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX]; enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; - enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX]; } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, @@ -531,13 +673,13 @@ [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), - [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX | + [IB_EXP_QPT_DC_INI] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS | - IB_QP_DC_KEY), + IB_QP_DC_KEY ), [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | - IB_QP_PORT | - IB_QP_ACCESS_FLAGS), + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), @@ -547,8 +689,13 @@ IB_QP_QKEY), }, .opt_param = { - [IB_QPT_UD] = IB_QP_GROUP_RSS, - [IB_QPT_RAW_PACKET] = IB_QP_GROUP_RSS + [IB_QPT_UD] = (IB_QP_GROUP_RSS | + IB_QP_FLOW_ENTROPY), + [IB_QPT_RAW_PACKET] = IB_QP_GROUP_RSS, + [IB_QPT_RC] = IB_QP_FLOW_ENTROPY, + [IB_QPT_UC] = IB_QP_FLOW_ENTROPY, + [IB_QPT_XRC_INI] = IB_QP_FLOW_ENTROPY, + [IB_QPT_XRC_TGT] = IB_QP_FLOW_ENTROPY, } }, }, @@ -567,7 +714,7 @@ [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), - [IB_QPT_DC_INI] = (IB_QP_PKEY_INDEX | + [IB_EXP_QPT_DC_INI] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | @@ -595,7 +742,7 @@ IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), - [IB_QPT_DC_INI] = (IB_QP_PATH_MTU | + [IB_EXP_QPT_DC_INI] = (IB_QP_PATH_MTU | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), [IB_QPT_XRC_INI] = (IB_QP_AV | @@ -609,12 +756,6 @@ IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), }, - .req_param_add_eth = { - [IB_QPT_RC] = (IB_QP_SMAC), - [IB_QPT_UC] = (IB_QP_SMAC), - [IB_QPT_XRC_INI] = (IB_QP_SMAC), - [IB_QPT_XRC_TGT] = (IB_QP_SMAC) - }, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), @@ -624,36 +765,18 @@ [IB_QPT_RC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), - [IB_QPT_DC_INI] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_PKEY_INDEX), + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), - [IB_QPT_RAW_PACKET] = IB_QP_AV, }, - .opt_param_add_eth = { - [IB_QPT_RC] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_UC] = 
(IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID), - [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC | - IB_QP_VID | - IB_QP_ALT_VID) - } - } + }, }, [IB_QPS_RTR] = { [IB_QPS_RESET] = { .valid = 1 }, @@ -668,7 +791,7 @@ IB_QP_RNR_RETRY | IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), - [IB_QPT_DC_INI] = (IB_QP_TIMEOUT | + [IB_EXP_QPT_DC_INI] = (IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC), @@ -694,7 +817,7 @@ IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), - [IB_QPT_DC_INI] = (IB_QP_CUR_STATE | + [IB_EXP_QPT_DC_INI] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | @@ -704,10 +827,10 @@ IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | - IB_QP_ALT_PATH | - IB_QP_ACCESS_FLAGS | - IB_QP_MIN_RNR_TIMER | - IB_QP_PATH_MIG_STATE), + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | @@ -732,7 +855,7 @@ IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER), - [IB_QPT_DC_INI] = (IB_QP_CUR_STATE | + [IB_EXP_QPT_DC_INI] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | @@ -889,13 +1012,6 @@ req_param = qp_state_table[cur_state][next_state].req_param[type]; opt_param = qp_state_table[cur_state][next_state].opt_param[type]; - if (ll == IB_LINK_LAYER_ETHERNET) { - req_param |= qp_state_table[cur_state][next_state]. - req_param_add_eth[type]; - opt_param |= qp_state_table[cur_state][next_state]. - opt_param_add_eth[type]; - } - if ((mask & req_param) != req_param) return 0; @@ -906,17 +1022,74 @@ } EXPORT_SYMBOL(ib_modify_qp_is_ok); +int ib_resolve_eth_dmac(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, int *qp_attr_mask) +{ + int ret = 0; + u8 start_port = qp->device->node_type == RDMA_NODE_IB_SWITCH ? 
0 : 1; + + if ((*qp_attr_mask & IB_QP_AV) && + (qp_attr->ah_attr.port_num >= start_port) && + (qp_attr->ah_attr.port_num < start_port + qp->device->phys_port_cnt) && + (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == + IB_LINK_LAYER_ETHERNET)) { + if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) { + rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, + qp_attr->ah_attr.dmac); + } else { + union ib_gid sgid; + struct ib_gid_attr sgid_attr; + u32 scope_id; + + rcu_read_lock(); + ret = ib_query_gid(qp->device, + qp_attr->ah_attr.port_num, + qp_attr->ah_attr.grh.sgid_index, + &sgid, &sgid_attr); + + if (ret || !sgid_attr.ndev) { + if (!ret) + ret = -ENXIO; + rcu_read_unlock(); + goto out; + } + if (sgid_attr.gid_type == IB_GID_TYPE_ROCE_V2 || + sgid_attr.gid_type == IB_GID_TYPE_ROCE_V1_5) + qp_attr->ah_attr.grh.hop_limit = + IPV6_DEFAULT_HOPLIMIT; + + dev_hold(sgid_attr.ndev); + scope_id = rdma_get_ipv6_scope_id(qp->device, + qp_attr->ah_attr.port_num); + + rcu_read_unlock(); + + ret = rdma_addr_find_dmac_by_grh(&sgid, + &qp_attr->ah_attr.grh.dgid, + qp_attr->ah_attr.dmac, + NULL, scope_id); + + dev_put(sgid_attr.ndev); + } + } +out: + return ret; +} +EXPORT_SYMBOL(ib_resolve_eth_dmac); + + int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) { int ret; - ret = qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); - if (!ret && (qp_attr_mask & IB_QP_PORT)) - qp->port_num = qp_attr->port_num; + ret = ib_resolve_eth_dmac(qp, qp_attr, &qp_attr_mask); - return ret; + if (ret) + return ret; + + return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); @@ -984,6 +1157,7 @@ struct ib_pd *pd; struct ib_cq *scq, *rcq; struct ib_srq *srq; + struct ib_rwq_ind_table *ind_tbl; int ret; if (atomic_read(&qp->usecnt)) @@ -992,21 +1166,25 @@ if (qp->real_qp != qp) return __ib_destroy_shared_qp(qp); - pd = qp->pd; - scq = qp->send_cq; - rcq = qp->recv_cq; - srq = qp->srq; + pd = qp->pd; + scq = qp->send_cq; + rcq = qp->recv_cq; + srq = qp->srq; + ind_tbl = qp->rwq_ind_tbl; ret = qp->device->destroy_qp(qp); if (!ret) { if (pd) - atomic_dec(&pd->usecnt); + atomic_dec(&pd->usecnt); if (scq) - atomic_dec(&scq->usecnt); + atomic_dec(&scq->usecnt); if (rcq) - atomic_dec(&rcq->usecnt); + atomic_dec(&rcq->usecnt); if (srq) atomic_dec(&srq->usecnt); + if (ind_tbl) + atomic_dec(&ind_tbl->usecnt); + } return ret; @@ -1360,19 +1538,9 @@ if (!qp->device->attach_mcast) return -ENOSYS; - - switch (rdma_node_get_transport(qp->device->node_type)) { - case RDMA_TRANSPORT_IB: - if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) && - qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: - if (qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - } + if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) && + qp->qp_type != IB_QPT_RAW_PACKET) + return -EINVAL; ret = qp->device->attach_mcast(qp, gid, lid); if (!ret) @@ -1387,20 +1555,9 @@ if (!qp->device->detach_mcast) return -ENOSYS; - - switch (rdma_node_get_transport(qp->device->node_type)) { - case RDMA_TRANSPORT_IB: - if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) && - qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - case RDMA_TRANSPORT_IWARP: - case RDMA_TRANSPORT_SCIF: - - if (qp->qp_type != IB_QPT_RAW_PACKET) - return -EINVAL; - break; - } + if ((gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) && + qp->qp_type != IB_QPT_RAW_PACKET) + 
return -EINVAL; ret = qp->device->detach_mcast(qp, gid, lid); if (!ret) @@ -1448,6 +1605,125 @@ } EXPORT_SYMBOL(ib_dealloc_xrcd); +struct ib_wq *ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *wq_attr) +{ + struct ib_wq *wq; + + if (!pd->device->create_wq) + return ERR_PTR(-ENOSYS); + + wq = pd->device->create_wq(pd, wq_attr, NULL); + if (!IS_ERR(wq)) { + wq->event_handler = wq_attr->event_handler; + wq->wq_context = wq_attr->wq_context; + wq->wq_type = wq_attr->wq_type; + wq->cq = wq_attr->cq; + wq->device = pd->device; + wq->pd = pd; + wq->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_inc(&wq_attr->cq->usecnt); + if (wq_attr->srq) { + wq->srq = wq_attr->srq; + atomic_inc(&wq_attr->srq->usecnt); + } + atomic_set(&wq->usecnt, 0); + } + return wq; +} +EXPORT_SYMBOL(ib_create_wq); + +int ib_destroy_wq(struct ib_wq *wq) +{ + int err; + struct ib_cq *cq = wq->cq; + struct ib_pd *pd = wq->pd; + struct ib_srq *srq = wq->srq; + + if (!wq->device->destroy_wq) + return -ENOSYS; + + if (atomic_read(&wq->usecnt)) + return -EBUSY; + + err = wq->device->destroy_wq(wq); + if (!err) { + atomic_dec(&pd->usecnt); + atomic_dec(&cq->usecnt); + if (srq) + atomic_dec(&srq->usecnt); + } + return err; +} +EXPORT_SYMBOL(ib_destroy_wq); + +int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + enum ib_wq_attr_mask attr_mask) +{ + int err; + + if (!wq->device->modify_wq) + return -ENOSYS; + + err = wq->device->modify_wq(wq, wq_attr, attr_mask, NULL); + return err; +} +EXPORT_SYMBOL(ib_modify_wq); + +struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr* + init_attr) +{ + struct ib_rwq_ind_table *rwq_ind_table; + int i; + u32 table_size; + + if (!device->create_rwq_ind_table) + return ERR_PTR(-ENOSYS); + + table_size = (1 << init_attr->log_ind_tbl_size); + rwq_ind_table = device->create_rwq_ind_table(device, + init_attr, NULL); + if (IS_ERR(rwq_ind_table)) + return rwq_ind_table; + + rwq_ind_table->ind_tbl = init_attr->ind_tbl; + rwq_ind_table->log_ind_tbl_size = init_attr->log_ind_tbl_size; + rwq_ind_table->device = device; + rwq_ind_table->pd = init_attr->pd; + rwq_ind_table->uobject = NULL; + atomic_set(&rwq_ind_table->usecnt, 0); + + for (i = 0; i < table_size; i++) + atomic_inc(&rwq_ind_table->ind_tbl[i]->usecnt); + + return rwq_ind_table; +} +EXPORT_SYMBOL(ib_create_rwq_ind_table); + +int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table) +{ + int err, i; + u32 table_size = (1 << rwq_ind_table->log_ind_tbl_size); + struct ib_wq **ind_tbl = rwq_ind_table->ind_tbl; + + if (atomic_read(&rwq_ind_table->usecnt)) + return -EBUSY; + + if (!rwq_ind_table->device->destroy_rwq_ind_table) + return -ENOSYS; + + err = rwq_ind_table->device->destroy_rwq_ind_table(rwq_ind_table); + if (!err) { + for (i = 0; i < table_size; i++) + atomic_dec(&ind_tbl[i]->usecnt); + } + + return err; +} +EXPORT_SYMBOL(ib_destroy_rwq_ind_table); + struct ib_flow *ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) @@ -1457,8 +1733,11 @@ return ERR_PTR(-ENOSYS); flow_id = qp->device->create_flow(qp, flow_attr, domain); - if (!IS_ERR(flow_id)) + if (!IS_ERR(flow_id)) { atomic_inc(&qp->usecnt); + flow_id->qp = qp; + flow_id->uobject = NULL; + } return flow_id; } EXPORT_SYMBOL(ib_create_flow); @@ -1466,13 +1745,8 @@ int ib_destroy_flow(struct ib_flow *flow_id) { int err; - struct ib_qp *qp; + struct ib_qp *qp = flow_id->qp; - if (!flow_id) - return -EINVAL; - qp = flow_id->qp; - if (!qp->device->destroy_flow) - return -ENOSYS; 
err = qp->device->destroy_flow(flow_id); if (!err) atomic_dec(&qp->usecnt); @@ -1504,16 +1778,22 @@ int ib_destroy_dct(struct ib_dct *dct) { + struct ib_srq *srq; + struct ib_cq *cq; + struct ib_pd *pd; int err; if (!dct->device->exp_destroy_dct) return -ENOSYS; + srq = dct->srq; + cq = dct->cq; + pd = dct->pd; err = dct->device->exp_destroy_dct(dct); if (!err) { - atomic_dec(&dct->srq->usecnt); - atomic_dec(&dct->cq->usecnt); - atomic_dec(&dct->pd->usecnt); + atomic_dec(&srq->usecnt); + atomic_dec(&cq->usecnt); + atomic_dec(&pd->usecnt); } return err; @@ -1529,6 +1809,15 @@ } EXPORT_SYMBOL(ib_query_dct); +int ib_arm_dct(struct ib_dct *dct) +{ + if (!dct->device->exp_arm_dct) + return -ENOSYS; + + return dct->device->exp_arm_dct(dct, NULL); +} +EXPORT_SYMBOL(ib_arm_dct); + int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status) { @@ -1536,3 +1825,50 @@ mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS; } EXPORT_SYMBOL(ib_check_mr_status); + +int ib_query_mkey(struct ib_mr *mr, u64 mkey_attr_mask, + struct ib_mkey_attr *mkey_attr) +{ + return mr->device->exp_query_mkey ? + mr->device->exp_query_mkey(mr, mkey_attr_mask, mkey_attr) : -ENOSYS; +} +EXPORT_SYMBOL(ib_query_mkey); + +int ib_roce_mode_is_over_ip(struct ib_device *ibdev, int port_num) +{ + struct ib_device_attr attr; + if ((rdma_port_get_link_layer(ibdev, port_num) == IB_LINK_LAYER_ETHERNET) && + !ib_query_device(ibdev, &attr) && + (attr.device_cap_flags & (IB_DEVICE_ROCE_MODE_1_5 | IB_DEVICE_ROCE_MODE_2))) + return 1; + return 0; +} +EXPORT_SYMBOL(ib_roce_mode_is_over_ip); + +struct ib_indir_reg_list * +ib_alloc_indir_reg_list(struct ib_device *device, + unsigned int max_indir_list_len) +{ + struct ib_indir_reg_list *indir_list; + + if (!device->alloc_indir_reg_list) + return ERR_PTR(-ENOSYS); + + indir_list = device->alloc_indir_reg_list(device, + max_indir_list_len); + if (!IS_ERR(indir_list)) { + indir_list->device = device; + indir_list->max_indir_list_len = max_indir_list_len; + } + + return indir_list; +} +EXPORT_SYMBOL(ib_alloc_indir_reg_list); + +void +ib_free_indir_reg_list(struct ib_indir_reg_list *indir_list) +{ + if (indir_list->device->free_indir_reg_list) + indir_list->device->free_indir_reg_list(indir_list); +} +EXPORT_SYMBOL(ib_free_indir_reg_list); Index: sys/ofed/drivers/infiniband/hw/mthca/mthca_av.c =================================================================== --- sys/ofed/drivers/infiniband/hw/mthca/mthca_av.c +++ sys/ofed/drivers/infiniband/hw/mthca/mthca_av.c @@ -281,7 +281,7 @@ ib_get_cached_gid(&dev->ib_dev, be32_to_cpu(ah->av->port_pd) >> 24, ah->av->gid_index % dev->limits.gid_table_len, - &header->grh.source_gid); + &header->grh.source_gid, NULL); memcpy(header->grh.destination_gid.raw, ah->av->dgid, 16); } Index: sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c =================================================================== --- sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c +++ sys/ofed/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1500,7 +1500,7 @@ ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0, mthca_ah_grh_present(to_mah(wr->wr.ud.ah)), - 0, + 0, 0, 0, &sqp->ud_header); err = mthca_read_ah(dev, to_mah(wr->wr.ud.ah), &sqp->ud_header); Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -633,7 +633,7 @@ IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | 
 IB_SA_PATH_REC_PKEY,
- 1000, GFP_ATOMIC,
+ 1000, 0, GFP_ATOMIC,
 path_rec_completion,
 path, &path->query);
 if (path->query_id < 0) {
@@ -991,7 +991,7 @@
 priv->broadcastaddr[8] = priv->pkey >> 8;
 priv->broadcastaddr[9] = priv->pkey & 0xff;

- result = ib_query_gid(hca, port, 0, &priv->local_gid);
+ result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
 if (result) {
 printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
 hca->name, port, result);
Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -480,7 +480,7 @@
 return;
 }

- if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
+ if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL))
 ipoib_warn(priv, "ib_query_gid() failed\n");
 else
 memcpy(IF_LLADDR(dev) + 4, priv->local_gid.raw, sizeof (union ib_gid));
Index: sys/ofed/include/rdma/ib_verbs.h
===================================================================
--- sys/ofed/include/rdma/ib_verbs.h
+++ sys/ofed/include/rdma/ib_verbs.h
@@ -211,8 +211,8 @@
 IB_DEVICE_SIGNATURE_HANDOVER = (1ULL<<32),
 IB_DEVICE_ROCE_MODE_1_5 = (1ULL<<34),
 IB_DEVICE_ROCE_MODE_2 = (1ULL<<35),
- IB_DEVICE_INDIR_REGISTRATION = (1ULL<<36)
-
+ IB_DEVICE_INDIR_REGISTRATION = (1ULL<<36),
+ IB_DEVICE_SIGNATURE_RESP_PIPE = (1ULL<<37),
 };

 enum ib_signature_prot_cap {
@@ -229,7 +229,8 @@
 enum ib_atomic_cap {
 IB_ATOMIC_NONE,
 IB_ATOMIC_HCA,
- IB_ATOMIC_GLOB
+ IB_ATOMIC_GLOB,
+ IB_ATOMIC_HCA_REPLY_BE = 0x40 /* host is LE and atomic reply is BE */
 };

 enum ib_cq_create_flags {
@@ -319,7 +320,8 @@
 IB_PORT_INIT = 2,
 IB_PORT_ARMED = 3,
 IB_PORT_ACTIVE = 4,
- IB_PORT_ACTIVE_DEFER = 5
+ IB_PORT_ACTIVE_DEFER = 5,
+ IB_PORT_DUMMY = -1, /* force enum signed */
 };

 enum ib_port_cap_flags {
@@ -496,6 +498,13 @@
 IB_EVENT_QP_LAST_WQE_REACHED,
 IB_EVENT_CLIENT_REREGISTER,
 IB_EVENT_GID_CHANGE,
+
+ /* New experimental events start here leaving enough
+ * room for 14 events which should be enough.
+ */
+ IB_EXP_EVENT_DCT_KEY_VIOLATION = 32,
+ IB_EXP_EVENT_DCT_ACCESS_ERR,
+ IB_EXP_EVENT_DCT_REQ_ERR,
 };

 struct ib_event {
@@ -504,6 +513,7 @@
 struct ib_cq *cq;
 struct ib_qp *qp;
 struct ib_srq *srq;
+ struct ib_dct *dct;
 u8 port_num;
 } element;
 enum ib_event_type event;
@@ -722,6 +732,16 @@
 };

 /**
+ * struct ib_mkey_attr - Memory key attributes
+ *
+ * @max_reg_descriptors: how many MRs we can register with this mkey
+ */
+struct ib_mkey_attr {
+ u32 max_reg_descriptors;
+};
+
+
+/**
 * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate
 * enum.
 * @mult: multiple to convert.
@@ -761,7 +781,8 @@
 IB_WC_INV_EEC_STATE_ERR,
 IB_WC_FATAL_ERR,
 IB_WC_RESP_TIMEOUT_ERR,
- IB_WC_GENERAL_ERR
+ IB_WC_GENERAL_ERR,
+ IB_WC_SIG_PIPELINE_CANCELED,
 };

 enum ib_wc_opcode {
@@ -783,7 +804,7 @@
 */
 IB_WC_RECV = 1 << 7,
 IB_WC_RECV_RDMA_WITH_IMM,
- IB_WC_DUMMY = -1 /* force enum signed */
+ IB_WC_DUMMY = -1, /* force enum signed */
 };

 enum ib_wc_flags {
@@ -892,7 +913,7 @@
 IB_QPT_RAW_PACKET = 8,
 IB_QPT_XRC_INI = 9,
 IB_QPT_XRC_TGT,
- IB_QPT_DC_INI,
+ IB_EXP_QPT_DC_INI = 32,
 IB_QPT_MAX,
 /* Reserve a range for qp types internal to the low level driver.
* These qp types will not be visible at the IB core layer, so the @@ -921,6 +942,7 @@ IB_QP_CREATE_USE_GFP_NOIO = 1 << 7, IB_QP_CREATE_ATOMIC_BE_REPLY = 1 << 8, IB_QP_CREATE_SIGNATURE_PIPELINE = 1 << 9, + IB_QP_CREATE_RX_END_PADDING = 1 << 11, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, @@ -977,6 +999,9 @@ u8 gid_index; u8 hop_limit; u32 create_flags; + u32 inline_size; + void (*event_handler)(struct ib_event *, void *); + void *dct_context; }; struct ib_dct_attr { @@ -1070,7 +1095,8 @@ IB_QPS_RTS, IB_QPS_SQD, IB_QPS_SQE, - IB_QPS_ERR + IB_QPS_ERR, + IB_QPS_DUMMY = -1, /* force enum signed */ }; enum ib_mig_state { @@ -1145,7 +1171,7 @@ IB_WR_RESERVED8, IB_WR_RESERVED9, IB_WR_RESERVED10, - IB_WR_DUMMY = -1 /* force enum signed */ + IB_WR_DUMMY = -1, /* force enum signed */ }; enum ib_send_flags { @@ -1154,6 +1180,7 @@ IB_SEND_SOLICITED = (1<<2), IB_SEND_INLINE = (1<<3), IB_SEND_IP_CSUM = (1<<4), + IB_SEND_SIG_PIPELINED = (1<<5), /* reserve bits 26-31 for low level drivers' internal use */ IB_SEND_RESERVED_START = (1 << 26), @@ -1285,7 +1312,8 @@ IB_ACCESS_REMOTE_ATOMIC = (1<<3), IB_ACCESS_MW_BIND = (1<<4), IB_ACCESS_ALLOCATE_MR = (1<<5), - IB_ZERO_BASED = (1<<13) + IB_ZERO_BASED = (1<<13), + IB_ACCESS_ON_DEMAND = (1<<14), }; struct ib_phys_buf { @@ -1474,6 +1502,27 @@ atomic_t usecnt; }; +enum ib_mp_rq_shifts { + IB_MP_RQ_NO_SHIFT = 0, + IB_MP_RQ_2BYTES_SHIFT = 1 << 0 +}; + +struct ib_wq_mp_rq { + uint8_t use_mp_rq; + enum ib_mp_rq_shifts use_shift; + uint8_t single_wqe_log_num_of_strides; + uint8_t single_stride_log_num_of_bytes; +}; + +enum ib_wq_vlan_offloads { + IB_WQ_CVLAN_STRIPPING = (1 << 0), +}; + +enum ibv_exp_wq_init_attr_flags { + IB_CREATE_WQ_FLAG_RX_END_PADDING = (1ULL << 0), + IB_CREATE_WQ_FLAG_RESERVED = (1ULL << 1) +}; + struct ib_wq_init_attr { void *wq_context; enum ib_wq_type wq_type; @@ -1482,16 +1531,22 @@ struct ib_cq *cq; struct ib_srq *srq; /* IB_WQT_SRQ only */ void (*event_handler)(struct ib_event *, void *); + struct ib_wq_mp_rq mp_rq; + u16 vlan_offloads; + u64 flags; }; enum ib_wq_attr_mask { IB_WQ_STATE = 1 << 0, IB_WQ_CUR_STATE = 1 << 1, + IB_WQ_VLAN_OFFLOADS = 1 << 2, + IB_WQ_MASK_ALL = (IB_WQ_VLAN_OFFLOADS << 1) - 1, }; struct ib_wq_attr { enum ib_wq_state wq_state; enum ib_wq_state curr_wq_state; + u16 vlan_offloads; }; struct ib_rwq_ind_table { @@ -1539,6 +1594,8 @@ struct ib_pd *pd; struct ib_cq *cq; struct ib_srq *srq; + void (*event_handler)(struct ib_event *, void *); + void *dct_context; u32 dct_num; }; @@ -2115,6 +2172,10 @@ struct ib_udata *udata); int (*exp_destroy_dct)(struct ib_dct *dct); int (*exp_query_dct)(struct ib_dct *dct, struct ib_dct_attr *attr); + int (*exp_arm_dct)(struct ib_dct *dct, struct ib_udata *udata); + int (*exp_query_mkey)(struct ib_mr *mr, + u64 mkey_attr_mask, + struct ib_mkey_attr *mkey_attr); /** * exp_rereg_user_mr - Modifies the attributes of an existing memory region. * Conceptually, this call performs the functions deregister memory region @@ -3232,6 +3293,11 @@ */ int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); +/* + * ib_arm_dct - Arm a DCT to generate DC key violations + * @dct: pointer to the DCT object + */ +int ib_arm_dct(struct ib_dct *dct); static inline void ib_active_speed_enum_to_rate(u8 active_speed, int *rate, @@ -3301,6 +3367,16 @@ struct ib_mr_status *mr_status); /** + * ib_query_mkey - Retrieves information about a specific memory + * key. 
+ * @mr: The memory region to retrieve information about.
+ * @mkey_attr_mask: Which attributes to get
+ * @mkey_attr: The attributes of the specified memory region.
+ */
+int ib_query_mkey(struct ib_mr *mr, u64 mkey_attr_mask,
+ struct ib_mkey_attr *mkey_attr);
+
+/**
 * ib_query_values - Query values from the HCA
 * @device: The device on which to query the values from
 * @q_values - combination of enum ib_values_mask flags to query
Index: sys/ofed/include/rdma/ib_verbs_exp.h
===================================================================
--- sys/ofed/include/rdma/ib_verbs_exp.h
+++ sys/ofed/include/rdma/ib_verbs_exp.h
@@ -134,6 +134,7 @@
 IB_EXP_DEVICE_ATTR_RX_HASH = 1ULL << 13,
 IB_EXP_DEVICE_ATTR_MAX_WQ_TYPE_RQ = 1ULL << 14,
 IB_EXP_DEVICE_ATTR_MAX_DEVICE_CTX = 1ULL << 15,
+ IB_EXP_DEVICE_ATTR_RX_PAD_END_ALIGN = 1ULL << 20,
 };

 struct ib_exp_device_attr {
@@ -160,6 +161,13 @@
 struct ib_exp_rx_hash_caps rx_hash_caps;
 uint32_t max_wq_type_rq;
 uint32_t max_device_ctx;
+ /*
+ * The alignment of the padding end address. When RX end-of-packet
+ * padding is enabled, the device pads the end of the RX packet up to
+ * the next address that is aligned to the rx_pad_end_addr_align
+ * size.
+ */
+ u16 rx_pad_end_addr_align;
 };

 struct ib_exp_qp_init_attr {