ifnet: add a global, immutable interface index next to the per-vnet one.

Review fixes folded into this revision of the patch:
 - ifindex_alloc() sets ifp->if_index again; the assignment had been
   dropped although if_free(), if_attach_internal() and if_vmove() all
   index V_ifindex_table with it.
 - Growing the global table skips free (NULL) per-vnet slots instead of
   doing pointer arithmetic on them.
 - ifindex_alloc() recomputes the global entry address from ifp->if_idx
   under IFNET_WLOCK for a moved interface; the pointer taken in
   if_vmove() is read before the lock is dropped and may refer to a table
   that has been reallocated since.
 - The ddb "show all ifnets" loop checks a per-vnet slot for NULL before
   dereferencing it.
 - The table comment no longer claims the per-vnet table has no holes.

Index: sys/net/if.c
===================================================================
--- sys/net/if.c
+++ sys/net/if.c
@@ -310,12 +310,42 @@
 VNET_DEFINE(struct ifnethead, ifnet);	/* depend on static init XXX */
 VNET_DEFINE(struct ifgrouphead, ifg_head);
 
-/* Table of ifnet by index. */
+/*
+ * Tables of ifnet by index.
+ *
+ * We keep a global if_table indexing all interfaces across all vnets.  The
+ * position in this table is immutable and doesn't change after if_vmove().
+ * This table can be used to serialize ifnet pointer to index and later
+ * restore it.
+ *
+ * Also every vnet keeps its own V_ifindex_table.  Indexes there start with 1.
+ * A slot is cleared when its interface is freed or moved to another vnet,
+ * so holes are possible; a new interface takes the lowest free slot.  It is
+ * used by all kinds of userland APIs like multicast setsockopt(2) or
+ * net.link.generic.ifdata sysctl MIB.  Entries in this array don't point at
+ * interface directly, but at the global table entry.
+ *
+ *                              lo0      em0      epair0a  lo0      epair0b
+ *                               /        /        /        /        /
+ * if_table                     [0 ]     [1 ]     [2 ]     [3 ]     [4 ]
+ *                               |        |        |        |        |
+ * vnet0_ifindex_table [0 NULL] [1 ]     [2 ]     [3 ]      |        |
+ * vnet1_ifindex_table [0 NULL]                            [1 ]     [2 ]
+ */
+
+static int if_idx;
+static int if_lim = 8;
+static struct ifindex_entry {
+	struct ifnet	*ife_ifnet;
+	uint16_t	ife_gencnt;
+} *if_table;
+
+/* Per-VNET API index. */
 VNET_DEFINE_STATIC(int, if_index);
 #define	V_if_index		VNET(if_index)
-VNET_DEFINE_STATIC(int, if_indexlim) = 8;
+VNET_DEFINE_STATIC(int, if_indexlim) = 2;
 #define	V_if_indexlim		VNET(if_indexlim)
-VNET_DEFINE_STATIC(struct ifnet **, ifindex_table);
+VNET_DEFINE_STATIC(struct ifindex_entry **, ifindex_table);
 #define	V_ifindex_table		VNET(ifindex_table)
 
 SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system,
@@ -352,13 +382,15 @@
 struct ifnet *
 ifnet_byindex(int idx)
 {
+	struct ifindex_entry *ife;
 
 	NET_EPOCH_ASSERT();
 
 	if (__predict_false(idx <= 0 || idx > V_if_index))
 		return (NULL);
 
-	return (ck_pr_load_ptr(&V_ifindex_table[idx]));
+	ife = ck_pr_load_ptr(&V_ifindex_table[idx]);
+	return (ife == NULL ? NULL : ck_pr_load_ptr(&ife->ife_ifnet));
 }
 
 struct ifnet *
@@ -375,30 +407,84 @@
 }
 
 /*
- * Allocate an ifindex array entry.
+ * Allocate global and virtual ifindex entries for ifnet.
+ * For a new interface ife = NULL.  For interface being if_vmove'd, ife is
+ * not NULL and the global entry already recorded in ifp->if_idx is reused.
  */
 static void
-ifindex_alloc(struct ifnet *ifp)
+ifindex_alloc(struct ifnet *ifp, struct ifindex_entry *ife)
 {
 	u_short idx;
 
 	IFNET_WLOCK();
-	/*
-	 * Try to find an empty slot below V_if_index.  If we fail, take the
-	 * next slot.
-	 */
+
+	if (ife != NULL) {
+		/*
+		 * The global table may have been reallocated since the
+		 * caller looked the entry up, so use its current address.
+		 */
+		ife = &if_table[ifp->if_idx];
+		goto virtual;
+	}
+
+	/* First allocate index in the global table. */
+	for (idx = 0; idx <= if_idx; idx++)
+		if (if_table[idx].ife_ifnet == NULL)
+			break;
+	if (idx >= if_lim) {	/* Need to grow. */
+		VNET_ITERATOR_DECL(vnet_iter);
+		struct ifindex_entry *new, *old;
+		int newlim;
+
+		newlim = if_lim << 1;
+		new = malloc(newlim * sizeof(*new), M_IFNET,
+		    M_WAITOK | M_ZERO);
+		memcpy(new, if_table, if_lim * sizeof(*new));
+		old = if_table;
+		ck_pr_store_ptr(&if_table, new);
+		if_lim = newlim;
+
+		/* Rewrite every vnet index to use new global index. */
+		VNET_LIST_RLOCK_NOSLEEP();
+		VNET_FOREACH(vnet_iter) {
+			CURVNET_SET(vnet_iter);
+			for (u_short i = 1; i <= V_if_index; i++) {
+				struct ifindex_entry *cur;
+
+				/* Free slots stay NULL. */
+				cur = V_ifindex_table[i];
+				if (cur == NULL)
+					continue;
+				ck_pr_store_ptr(&V_ifindex_table[i],
+				    if_table + (cur - old));
+			}
+			CURVNET_RESTORE();
+		}
+		VNET_LIST_RUNLOCK_NOSLEEP();
+
+		epoch_wait_preempt(net_epoch_preempt);
+		free(old, M_IFNET);
+	}
+	if (idx > if_idx)
+		if_idx = idx;
+	ifp->if_idx = idx;
+	ifp->if_idxgen = if_table[idx].ife_gencnt;
+	ck_pr_store_ptr(&if_table[idx].ife_ifnet, ifp);
+
+	ife = &if_table[idx];
+virtual:
+	/* Second, allocate entry in V_ifindex_table. */
 	for (idx = 1; idx <= V_if_index; idx++) {
 		if (V_ifindex_table[idx] == NULL)
 			break;
 	}
-
-	/* Catch if_index overflow. */
-	if (idx >= V_if_indexlim) {
-		struct ifnet **new, **old;
+	if (idx >= V_if_indexlim) {	/* Need to grow. */
+		struct ifindex_entry **new, **old;
 		int newlim;
 
 		newlim = V_if_indexlim << 1;
-		new = malloc(newlim * sizeof(*new), M_IFNET, M_WAITOK | M_ZERO);
+		new = malloc(newlim * sizeof(*new), M_IFNET,
+		    M_WAITOK | M_ZERO);
 		memcpy(new, V_ifindex_table, V_if_indexlim * sizeof(*new));
 		old = V_ifindex_table;
 		ck_pr_store_ptr(&V_ifindex_table, new);
@@ -408,24 +494,12 @@
 	}
 	if (idx > V_if_index)
 		V_if_index = idx;
 
 	ifp->if_index = idx;
-	ck_pr_store_ptr(&V_ifindex_table[idx], ifp);
+	ck_pr_store_ptr(&V_ifindex_table[idx], ife);
 	IFNET_WUNLOCK();
 }
 
-static void
-ifindex_free(u_short idx)
-{
-
-	IFNET_WLOCK_ASSERT();
-
-	ck_pr_store_ptr(&V_ifindex_table[idx], NULL);
-	while (V_if_index > 0 &&
-	    V_ifindex_table[V_if_index] == NULL)
-		V_if_index--;
-}
-
 struct ifaddr *
 ifaddr_byindex(u_short idx)
 {
@@ -447,6 +521,15 @@
  * parameters.
  */
 
+static void
+if_init(const void *arg __unused)
+{
+
+	if_table = malloc(if_lim * sizeof(*if_table), M_IFNET,
+	    M_WAITOK | M_ZERO);
+}
+SYSINIT(if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, if_init, NULL);
+
 static void
 vnet_if_init(const void *unused __unused)
 {
@@ -606,7 +689,7 @@
 	ifp->if_get_counter = if_get_counter_default;
 	ifp->if_pcp = IFNET_PCP_NONE;
 
-	ifindex_alloc(ifp);
+	ifindex_alloc(ifp, NULL);
 
 	return (ifp);
 }
@@ -668,26 +751,18 @@
 
 	ifp->if_flags |= IFF_DYING;			/* XXX: Locking */
 
-	/*
-	 * XXXGL: An interface index is really an alias to ifp pointer.
-	 * Why would we clear the alias now, and not in the deferred
-	 * context?  Indeed there is nothing wrong with some network
-	 * thread obtaining ifp via ifnet_byindex() inside the network
-	 * epoch and then dereferencing ifp while we peform if_free(),
-	 * and after if_free() finished, too.
-	 *
-	 * The reason is the VIMAGE.  For some reason it was designed
-	 * to require all sockets drained before destroying, but not all
-	 * ifnets.  A vnet destruction calls if_vmove() on ifnet, which
-	 * causes ID change.  But ID change and a possible misidentification
-	 * of an ifnet later is a lesser problem, as it doesn't crash kernel.
-	 * A worse problem is that removed interface may outlive the vnet it
-	 * belongs too!  The if_free_deferred() would see ifp->if_vnet freed.
-	 */
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	IFNET_WLOCK();
-	MPASS(V_ifindex_table[ifp->if_index] == ifp);
-	ifindex_free(ifp->if_index);
+	MPASS(V_ifindex_table[ifp->if_index]->ife_ifnet == ifp);
+	/* First, clear global index entry. */
+	ck_pr_store_ptr(&V_ifindex_table[ifp->if_index]->ife_ifnet, NULL);
+	V_ifindex_table[ifp->if_index]->ife_gencnt++;
+	while (if_idx > 0 && if_table[if_idx].ife_ifnet == NULL)
+		if_idx--;
+	/* Second, clear virtual index entry. */
+	ck_pr_store_ptr(&V_ifindex_table[ifp->if_index], NULL);
+	while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL)
+		V_if_index--;
 	IFNET_WUNLOCK();
 
 	if (refcount_release(&ifp->if_refcount))
@@ -836,7 +911,7 @@
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 
-	MPASS(V_ifindex_table[ifp->if_index] == ifp);
+	MPASS(V_ifindex_table[ifp->if_index]->ife_ifnet == ifp);
 
 #ifdef VIMAGE
 	ifp->if_vnet = curvnet;
@@ -1271,6 +1346,7 @@
 if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
 {
 	struct if_clone *ifc;
+	struct ifindex_entry *ife;
 #ifdef DEV_BPF
 	u_int bif_dlt, bif_hdrlen;
 #endif
@@ -1295,14 +1371,17 @@
 		return (rc);
 
 	/*
-	 * Unlink the ifnet from ifindex_table[] in current vnet, and shrink
+	 * Unlink the ifnet from V_ifindex_table in current vnet, and shrink
 	 * the if_index for that vnet if possible.
 	 *
 	 * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized,
 	 * or we'd lock on one vnet and unlock on another.
 	 */
 	IFNET_WLOCK();
-	ifindex_free(ifp->if_index);
+	ife = V_ifindex_table[ifp->if_index];
+	ck_pr_store_ptr(&V_ifindex_table[ifp->if_index], NULL);
+	while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL)
+		V_if_index--;
 	IFNET_WUNLOCK();
 
 	/*
@@ -1316,7 +1395,7 @@
 	 * Switch to the context of the target vnet.
 	 */
 	CURVNET_SET_QUIET(new_vnet);
-	ifindex_alloc(ifp);
+	ifindex_alloc(ifp, ife);
 	if_attach_internal(ifp, 1, ifc);
 
 #ifdef DEV_BPF
@@ -4545,8 +4624,9 @@
 	IF_DB_PRINTF("%d", if_dunit);
 	IF_DB_PRINTF("%s", if_description);
 	IF_DB_PRINTF("%u", if_index);
+	IF_DB_PRINTF("%u", if_idx);
+	IF_DB_PRINTF("%u", if_idxgen);
 	IF_DB_PRINTF("%u", if_refcount);
-	IF_DB_PRINTF("%d", if_index_reserved);
 	IF_DB_PRINTF("%p", if_softc);
 	IF_DB_PRINTF("%p", if_l2com);
 	IF_DB_PRINTF("%p", if_llsoftc);
@@ -4607,7 +4687,9 @@
 	db_printf("vnet=%p\n", curvnet);
 #endif
 	for (idx = 1; idx <= V_if_index; idx++) {
-		ifp = V_ifindex_table[idx];
+		if (V_ifindex_table[idx] == NULL)
+			continue;
+		ifp = V_ifindex_table[idx]->ife_ifnet;
 		if (ifp == NULL)
 			continue;
 		db_printf( "%20s ifp=%p\n", ifp->if_xname, ifp);
Index: sys/net/if_var.h
===================================================================
--- sys/net/if_var.h
+++ sys/net/if_var.h
@@ -301,16 +301,17 @@
 	LIST_ENTRY(ifnet) if_clones;	/* interfaces of a cloner */
 	CK_STAILQ_HEAD(, ifg_list) if_groups; /* linked list of groups per if (CK_) */
 					/* protected by if_addr_lock */
-	u_char	if_alloctype;		/* if_type at time of allocation */
-	uint8_t	if_numa_domain;		/* NUMA domain of device */
+	uint8_t	if_alloctype;		/* if_type at time of allocation */
+	uint8_t	if_numa_domain;		/* NUMA domain of device */
+	uint16_t if_index;		/* numeric abbreviation for this if */
+	uint16_t if_idx;		/* immutable index ... */
+	uint16_t if_idxgen;		/* ... and its generation count */
 	/* Driver and protocol specific information that remains stable. */
 	void	*if_softc;		/* pointer to driver state */
 	void	*if_llsoftc;		/* link layer softc */
 	void	*if_l2com;		/* pointer to protocol bits */
 	const char *if_dname;		/* driver name */
 	int	if_dunit;		/* unit or IF_DUNIT_NONE */
-	u_short	if_index;		/* numeric abbreviation for this if */
-	short	if_index_reserved;	/* spare space to grow if_index */
 	char	if_xname[IFNAMSIZ];	/* external name (name + unit) */
 	char	*if_description;	/* interface description */
 