Index: sys/netpfil/pf/if_pfsync.c
===================================================================
--- sys/netpfil/pf/if_pfsync.c
+++ sys/netpfil/pf/if_pfsync.c
@@ -77,6 +77,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <sys/smp.h>
 #include <...>
 #include <...>
 #include <...>
@@ -106,6 +107,8 @@
 	sizeof(struct pfsync_header) + \
 	sizeof(struct pfsync_subheader) )
 
+struct pfsync_bucket;
+
 struct pfsync_pkt {
 	struct ip *ip;
 	struct in_addr src;
@@ -164,7 +167,7 @@
 };
 
 static void	pfsync_q_ins(struct pf_state *, int, bool);
-static void	pfsync_q_del(struct pf_state *, bool);
+static void	pfsync_q_del(struct pf_state *, bool, struct pfsync_bucket *);
 
 static void	pfsync_update_state(struct pf_state *);
 
@@ -183,6 +186,34 @@
 	struct mbuf				*pd_m;
 };
 
+struct pfsync_softc;
+
+struct pfsync_bucket
+{
+	struct mtx			b_mtx;
+
+	int				b_id;
+	struct pfsync_softc		*b_sc;
+	int				b_flags;
+#define	PFSYNCF_BUCKET_PUSH	0x00000001
+
+	size_t				b_len;
+	TAILQ_HEAD(, pf_state)			b_qs[PFSYNC_S_COUNT];
+
+	/* Only in bucket 0 */
+	TAILQ_HEAD(, pfsync_upd_req_item)	b_upd_req_list;
+	TAILQ_HEAD(, pfsync_deferral)		b_deferrals;
+
+	u_int				b_deferred;
+	void				*b_plus;
+	size_t				b_pluslen;
+	/* End only in bucket 0 */
+
+	struct ifaltq			b_snd;
+
+	struct callout			b_tmo;
+};
+
 struct pfsync_softc {
 	/* Configuration */
 	struct ifnet		*sc_ifp;
@@ -192,20 +223,12 @@
 	uint32_t		sc_flags;
#define	PFSYNCF_OK		0x00000001
#define	PFSYNCF_DEFER		0x00000002
-#define	PFSYNCF_PUSH		0x00000004
 	uint8_t			sc_maxupdates;
 	struct ip		sc_template;
-	struct callout		sc_tmo;
 	struct mtx		sc_mtx;
 
 	/* Queued data */
-	size_t			sc_len;
-	TAILQ_HEAD(, pf_state)			sc_qs[PFSYNC_S_COUNT];
-	TAILQ_HEAD(, pfsync_upd_req_item)	sc_upd_req_list;
-	TAILQ_HEAD(, pfsync_deferral)		sc_deferrals;
-	u_int			sc_deferred;
-	void			*sc_plus;
-	size_t			sc_pluslen;
+	struct pfsync_bucket	*sc_buckets;
 
 	/* Bulk update info */
 	struct mtx		sc_bulk_mtx;
@@ -223,6 +246,10 @@
#define	PFSYNC_UNLOCK(sc)	mtx_unlock(&(sc)->sc_mtx)
#define	PFSYNC_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
 
+#define	PFSYNC_BUCKET_LOCK(b)	mtx_lock(&(b)->b_mtx)
+#define	PFSYNC_BUCKET_UNLOCK(b)	mtx_unlock(&(b)->b_mtx)
+#define	PFSYNC_BUCKET_ASSERT(b)	mtx_assert(&(b)->b_mtx, MA_OWNED)
+
#define	PFSYNC_BLOCK(sc)	mtx_lock(&(sc)->sc_bulk_mtx)
#define	PFSYNC_BUNLOCK(sc)	mtx_unlock(&(sc)->sc_bulk_mtx)
#define	PFSYNC_BLOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
@@ -239,7 +266,8 @@
#define	V_pfsync_carp_adj	VNET(pfsync_carp_adj)
 
 static void	pfsync_timeout(void *);
-static void	pfsync_push(struct pfsync_softc *);
+static void	pfsync_push(struct pfsync_bucket *);
+static void	pfsync_push_all(struct pfsync_softc *);
 static void	pfsyncintr(void *);
 static int	pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
 		    void *);
@@ -249,12 +277,16 @@
 static int	pfsync_init(void);
 static void	pfsync_uninit(void);
 
+static unsigned long pfsync_buckets;
+
 SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
 SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(pfsyncstats), pfsyncstats,
     "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
 SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
     &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
+SYSCTL_ULONG(_net_pfsync, OID_AUTO, pfsync_buckets, CTLFLAG_RDTUN,
+    &pfsync_buckets, 0, "Number of pfsync hash buckets");
 
 static int	pfsync_clone_create(struct if_clone *, int, caddr_t);
 static void	pfsync_clone_destroy(struct ifnet *);
@@ -273,7 +305,7 @@
 static void	pfsync_update_state_req(struct pf_state *);
 
 static void	pfsync_drop(struct pfsync_softc *);
-static void	pfsync_sendout(int);
+static void	pfsync_sendout(int, int);
 static void	pfsync_send_plus(void *, size_t);
 
 static void	pfsync_bulk_start(void);
@@ -285,6 +317,9 @@
#ifdef IPSEC
 static void	pfsync_update_net_tdb(struct pfsync_tdb *);
#endif
+static struct pfsync_bucket	*pfsync_get_bucket(struct pfsync_softc *,
+		    struct pf_state *);
+
 
#define PFSYNC_MAX_BULKTRIES	12
 
@@ -296,21 +331,15 @@
 {
 	struct pfsync_softc *sc;
 	struct ifnet *ifp;
-	int q;
+	int c, q;
 
 	if (unit != 0)
 		return (EINVAL);
 
-	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
-	sc->sc_flags |= PFSYNCF_OK;
-
-	for (q = 0; q < PFSYNC_S_COUNT; q++)
-		TAILQ_INIT(&sc->sc_qs[q]);
-
-	TAILQ_INIT(&sc->sc_upd_req_list);
-	TAILQ_INIT(&sc->sc_deferrals);
+	if (!pfsync_buckets)
+		pfsync_buckets = mp_ncpus * 2;
 
-	sc->sc_len = PFSYNC_MINPKT;
+	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
+	sc->sc_flags |= PFSYNCF_OK;
 	sc->sc_maxupdates = 128;
 
 	ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
@@ -318,17 +347,16 @@
 		free(sc, M_PFSYNC);
 		return (ENOSPC);
 	}
+
 	if_initname(ifp, pfsyncname, unit);
 	ifp->if_softc = sc;
 	ifp->if_ioctl = pfsyncioctl;
 	ifp->if_output = pfsyncoutput;
 	ifp->if_type = IFT_PFSYNC;
-	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	ifp->if_hdrlen = sizeof(struct pfsync_header);
 	ifp->if_mtu = ETHERMTU;
 	mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
 	mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
-	callout_init(&sc->sc_tmo, 1);
 	callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
 	callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
 
@@ -336,6 +364,25 @@
 
 	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
 
+	sc->sc_buckets = mallocarray(pfsync_buckets, sizeof(*sc->sc_buckets), M_PFSYNC, M_ZERO | M_WAITOK);
+	for (c = 0; c < pfsync_buckets; c++) {
+		mtx_init(&sc->sc_buckets[c].b_mtx, pfsyncname, NULL, MTX_DEF);
+
+		sc->sc_buckets[c].b_id = c;
+		sc->sc_buckets[c].b_sc = sc;
+		sc->sc_buckets[c].b_len = PFSYNC_MINPKT;
+
+		for (q = 0; q < PFSYNC_S_COUNT; q++)
+			TAILQ_INIT(&sc->sc_buckets[c].b_qs[q]);
+
+		TAILQ_INIT(&sc->sc_buckets[c].b_upd_req_list);
+		TAILQ_INIT(&sc->sc_buckets[c].b_deferrals);
+
+		callout_init(&sc->sc_buckets[c].b_tmo, 1);
+
+		sc->sc_buckets[c].b_snd.ifq_maxlen = ifqmaxlen;
+	}
+
 	V_pfsyncif = sc;
 
 	return (0);
@@ -345,29 +392,33 @@
 pfsync_clone_destroy(struct ifnet *ifp)
 {
 	struct pfsync_softc *sc = ifp->if_softc;
+	int c;
 
-	/*
-	 * At this stage, everything should have already been
-	 * cleared by pfsync_uninit(), and we have only to
-	 * drain callouts.
-	 */
-	while (sc->sc_deferred > 0) {
-		struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);
-
-		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
-		sc->sc_deferred--;
-		if (callout_stop(&pd->pd_tmo) > 0) {
-			pf_release_state(pd->pd_st);
-			m_freem(pd->pd_m);
-			free(pd, M_PFSYNC);
-		} else {
-			pd->pd_refs++;
-			callout_drain(&pd->pd_tmo);
-			free(pd, M_PFSYNC);
+	for (c = 0; c < pfsync_buckets; c++) {
+		/*
+		 * At this stage, everything should have already been
		 * cleared by pfsync_uninit(), and we have only to
		 * drain callouts.
+		 */
+		while (sc->sc_buckets[c].b_deferred > 0) {
+			struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_buckets[c].b_deferrals);
+
+			TAILQ_REMOVE(&sc->sc_buckets[c].b_deferrals, pd, pd_entry);
+			sc->sc_buckets[c].b_deferred--;
+			if (callout_stop(&pd->pd_tmo) > 0) {
+				pf_release_state(pd->pd_st);
+				m_freem(pd->pd_m);
+				free(pd, M_PFSYNC);
+			} else {
+				pd->pd_refs++;
+				callout_drain(&pd->pd_tmo);
+				free(pd, M_PFSYNC);
+			}
 		}
+
+		callout_drain(&sc->sc_buckets[c].b_tmo);
 	}
-	callout_drain(&sc->sc_tmo);
 	callout_drain(&sc->sc_bulkfail_tmo);
 	callout_drain(&sc->sc_bulk_tmo);
@@ -383,6 +434,8 @@
 		pfsync_multicast_cleanup(sc);
 	mtx_destroy(&sc->sc_mtx);
 	mtx_destroy(&sc->sc_bulk_mtx);
+
+	free(sc->sc_buckets, M_PFSYNC);
 	free(sc, M_PFSYNC);
 
 	V_pfsyncif = NULL;
@@ -546,7 +599,7 @@
 		st->state_flags &= ~PFSTATE_NOSYNC;
 		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_q_ins(st, PFSYNC_S_IACK, true);
-			pfsync_push(sc);
+			pfsync_push_all(sc);
 		}
 	}
 	st->state_flags &= ~PFSTATE_ACK;
@@ -785,9 +838,7 @@
 			continue;
 
 		if (st->state_flags & PFSTATE_ACK) {
-			PFSYNC_LOCK(V_pfsyncif);
 			pfsync_undefer_state(st, 0);
-			PFSYNC_UNLOCK(V_pfsyncif);
 		}
 		PF_STATE_UNLOCK(st);
 	}
@@ -876,9 +927,7 @@
 	}
 
 	if (st->state_flags & PFSTATE_ACK) {
-		PFSYNC_LOCK(sc);
 		pfsync_undefer_state(st, 1);
-		PFSYNC_UNLOCK(sc);
 	}
 
 	if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
@@ -912,9 +961,7 @@
 			pfsync_update_state(st);
 			PF_STATE_UNLOCK(st);
-			PFSYNC_LOCK(sc);
-			pfsync_push(sc);
-			PFSYNC_UNLOCK(sc);
+			pfsync_push_all(sc);
 
 			continue;
 		}
 		PF_STATE_UNLOCK(st);
@@ -960,16 +1007,14 @@
 		st = pf_find_state_byid(up->id, up->creatorid);
 		if (st == NULL) {
 			/* We don't have this state. Ask for it. */
-			PFSYNC_LOCK(sc);
+			PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
 			pfsync_request_update(up->creatorid, up->id);
-			PFSYNC_UNLOCK(sc);
+			PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
 			continue;
 		}
 
 		if (st->state_flags & PFSTATE_ACK) {
-			PFSYNC_LOCK(sc);
 			pfsync_undefer_state(st, 1);
-			PFSYNC_UNLOCK(sc);
 		}
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
@@ -1003,9 +1048,7 @@
 			pfsync_update_state(st);
 			PF_STATE_UNLOCK(st);
-			PFSYNC_LOCK(sc);
-			pfsync_push(sc);
-			PFSYNC_UNLOCK(sc);
+			pfsync_push_all(sc);
 
 			continue;
 		}
 		PF_STATE_UNLOCK(st);
@@ -1283,6 +1326,7 @@
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct pfsyncreq pfsyncr;
 	int error;
+	int c;
 
 	switch (cmd) {
 	case SIOCSIFFLAGS:
@@ -1303,10 +1347,12 @@
 		    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
 			return (EINVAL);
 		if (ifr->ifr_mtu < ifp->if_mtu) {
-			PFSYNC_LOCK(sc);
-			if (sc->sc_len > PFSYNC_MINPKT)
-				pfsync_sendout(1);
-			PFSYNC_UNLOCK(sc);
+			for (c = 0; c < pfsync_buckets; c++) {
+				PFSYNC_BUCKET_LOCK(&sc->sc_buckets[c]);
+				if (sc->sc_buckets[c].b_len > PFSYNC_MINPKT)
+					pfsync_sendout(1, c);
+				PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]);
+			}
 		}
 		ifp->if_mtu = ifr->ifr_mtu;
 		break;
@@ -1379,12 +1425,16 @@
 			break;
 		}
 
-		if (sc->sc_len > PFSYNC_MINPKT &&
-		    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
-		    (sc->sc_sync_if != NULL &&
-		    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
-		    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
-			pfsync_sendout(1);
+		for (c = 0; c < pfsync_buckets; c++) {
+			PFSYNC_BUCKET_LOCK(&sc->sc_buckets[c]);
+			if (sc->sc_buckets[c].b_len > PFSYNC_MINPKT &&
+			    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
+			    (sc->sc_sync_if != NULL &&
+			    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
+			    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
+				pfsync_sendout(1, c);
+			PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]);
+		}
 
 		if (imo->imo_membership)
 			pfsync_multicast_cleanup(sc);
@@ -1421,8 +1471,10 @@
 			sc->sc_flags &= ~PFSYNCF_OK;
 			if (V_pf_status.debug >= PF_DEBUG_MISC)
 				printf("pfsync: requesting bulk update\n");
-			pfsync_request_update(0, 0);
 			PFSYNC_UNLOCK(sc);
+			PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
+			pfsync_request_update(0, 0);
+			PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
 			PFSYNC_BLOCK(sc);
 			sc->sc_ureq_sent = time_uptime;
 			callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
 			    pfsync_bulk_fail, V_pfsyncif);
@@ -1483,33 +1535,34 @@
 {
 	struct pf_state *st, *next;
 	struct pfsync_upd_req_item *ur;
-	int q;
+	int c, q;
 
-	for (q = 0; q < PFSYNC_S_COUNT; q++) {
-		if (TAILQ_EMPTY(&sc->sc_qs[q]))
-			continue;
+	for (c = 0; c < pfsync_buckets; c++) {
+		for (q = 0; q < PFSYNC_S_COUNT; q++) {
+			if (TAILQ_EMPTY(&sc->sc_buckets[c].b_qs[q]))
+				continue;
 
-		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
-			KASSERT(st->sync_state == q,
-				("%s: st->sync_state == q",
-				__func__));
-			st->sync_state = PFSYNC_S_NONE;
-			pf_release_state(st);
+			TAILQ_FOREACH_SAFE(st, &sc->sc_buckets[c].b_qs[q], sync_list, next) {
+				KASSERT(st->sync_state == q,
+					("%s: st->sync_state == q",
+					__func__));
+				st->sync_state = PFSYNC_S_NONE;
+				pf_release_state(st);
+			}
+			TAILQ_INIT(&sc->sc_buckets[c].b_qs[q]);
 		}
-		TAILQ_INIT(&sc->sc_qs[q]);
-	}
 
-	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
-		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
-		free(ur, M_PFSYNC);
-	}
+		while ((ur = TAILQ_FIRST(&sc->sc_buckets[c].b_upd_req_list)) != NULL) {
+			TAILQ_REMOVE(&sc->sc_buckets[c].b_upd_req_list, ur, ur_entry);
+			free(ur, M_PFSYNC);
+		}
 
-	sc->sc_plus = NULL;
-	sc->sc_len = PFSYNC_MINPKT;
+		sc->sc_buckets[c].b_len = PFSYNC_MINPKT;
+		sc->sc_buckets[c].b_plus = NULL;
+	}
 }
 
 static void
-pfsync_sendout(int schedswi)
+pfsync_sendout(int schedswi, int c)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct ifnet *ifp = sc->sc_ifp;
@@ -1523,23 +1576,23 @@
 	int q, count = 0;
 
 	KASSERT(sc != NULL, ("%s: null sc", __func__));
-	KASSERT(sc->sc_len > PFSYNC_MINPKT,
-	    ("%s: sc_len %zu", __func__, sc->sc_len));
-	PFSYNC_LOCK_ASSERT(sc);
+	KASSERT(sc->sc_buckets[c].b_len > PFSYNC_MINPKT,
+	    ("%s: sc_len %zu", __func__, sc->sc_buckets[c].b_len));
+	PFSYNC_BUCKET_ASSERT(&sc->sc_buckets[c]);
 
 	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
 		pfsync_drop(sc);
 		return;
 	}
 
-	m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR);
+	m = m_get2(max_linkhdr + sc->sc_buckets[c].b_len, M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		V_pfsyncstats.pfsyncs_onomem++;
 		return;
 	}
 	m->m_data += max_linkhdr;
-	m->m_len = m->m_pkthdr.len = sc->sc_len;
+	m->m_len = m->m_pkthdr.len = sc->sc_buckets[c].b_len;
 
 	/* build the ip header */
 	ip = (struct ip *)m->m_data;
@@ -1555,19 +1608,19 @@
 	offset += sizeof(*ph);
 
 	ph->version = PFSYNC_VERSION;
-	ph->len = htons(sc->sc_len - sizeof(*ip));
+	ph->len = htons(sc->sc_buckets[c].b_len - sizeof(*ip));
 	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
 
 	/* walk the queues */
 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
-		if (TAILQ_EMPTY(&sc->sc_qs[q]))
+		if (TAILQ_EMPTY(&sc->sc_buckets[c].b_qs[q]))
 			continue;
 
 		subh = (struct pfsync_subheader *)(m->m_data + offset);
 		offset += sizeof(*subh);
 
 		count = 0;
-		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, st_next) {
+		TAILQ_FOREACH_SAFE(st, &sc->sc_buckets[c].b_qs[q], sync_list, st_next) {
 			KASSERT(st->sync_state == q,
 			    ("%s: st->sync_state == q",
 			    __func__));
@@ -1581,7 +1634,7 @@
 			pf_release_state(st);
 			count++;
 		}
-		TAILQ_INIT(&sc->sc_qs[q]);
+		TAILQ_INIT(&sc->sc_buckets[c].b_qs[q]);
 
 		bzero(subh, sizeof(*subh));
 		subh->action = pfsync_qs[q].action;
@@ -1589,13 +1642,13 @@
 		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
 	}
 
-	if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
+	if (!TAILQ_EMPTY(&sc->sc_buckets[c].b_upd_req_list)) {
 		subh = (struct pfsync_subheader *)(m->m_data + offset);
 		offset += sizeof(*subh);
 
 		count = 0;
-		while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
-			TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
+		while ((ur = TAILQ_FIRST(&sc->sc_buckets[c].b_upd_req_list)) != NULL) {
+			TAILQ_REMOVE(&sc->sc_buckets[c].b_upd_req_list, ur, ur_entry);
 
 			bcopy(&ur->ur_msg, m->m_data + offset,
 			    sizeof(ur->ur_msg));
@@ -1611,11 +1664,11 @@
 	}
 
 	/* has someone built a custom region for us to add? */
-	if (sc->sc_plus != NULL) {
-		bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
-		offset += sc->sc_pluslen;
+	if (sc->sc_buckets[c].b_plus != NULL) {
+		bcopy(sc->sc_buckets[c].b_plus, m->m_data + offset, sc->sc_buckets[c].b_pluslen);
+		offset += sc->sc_buckets[c].b_pluslen;
 
-		sc->sc_plus = NULL;
+		sc->sc_buckets[c].b_plus = NULL;
 	}
 
 	subh = (struct pfsync_subheader *)(m->m_data + offset);
@@ -1629,24 +1682,24 @@
 	/* we're done, let's put it on the wire */
 	if (ifp->if_bpf) {
 		m->m_data += sizeof(*ip);
-		m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
+		m->m_len = m->m_pkthdr.len = sc->sc_buckets[c].b_len - sizeof(*ip);
 		BPF_MTAP(ifp, m);
 		m->m_data -= sizeof(*ip);
-		m->m_len = m->m_pkthdr.len = sc->sc_len;
+		m->m_len = m->m_pkthdr.len = sc->sc_buckets[c].b_len;
 	}
 
 	if (sc->sc_sync_if == NULL) {
-		sc->sc_len = PFSYNC_MINPKT;
+		sc->sc_buckets[c].b_len = PFSYNC_MINPKT;
 		m_freem(m);
 		return;
 	}
 
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
-	sc->sc_len = PFSYNC_MINPKT;
+	sc->sc_buckets[c].b_len = PFSYNC_MINPKT;
 
-	if (!_IF_QFULL(&sc->sc_ifp->if_snd))
-		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
+	if (!_IF_QFULL(&sc->sc_buckets[c].b_snd))
+		_IF_ENQUEUE(&sc->sc_buckets[c].b_snd, m);
 	else {
 		m_freem(m);
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1);
@@ -1659,6 +1712,7 @@
 pfsync_insert_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	if (st->state_flags & PFSTATE_NOSYNC)
 		return;
@@ -1672,12 +1726,12 @@
 	KASSERT(st->sync_state == PFSYNC_S_NONE,
 		("%s: st->sync_state %u", __func__, st->sync_state));
 
-	PFSYNC_LOCK(sc);
-	if (sc->sc_len == PFSYNC_MINPKT)
-		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+	PFSYNC_BUCKET_LOCK(b);
+	if (b->b_len == PFSYNC_MINPKT)
+		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	pfsync_q_ins(st, PFSYNC_S_INS, true);
-	PFSYNC_UNLOCK(sc);
+	PFSYNC_BUCKET_UNLOCK(b);
 
 	st->sync_updates = 0;
 }
@@ -1687,6 +1741,7 @@
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_deferral *pd;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	if (m->m_flags & (M_BCAST|M_MCAST))
 		return (0);
@@ -1699,13 +1754,13 @@
 		return (0);
 	}
 
-	if (sc->sc_deferred >= 128)
-		pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);
+	if (b->b_deferred >= 128)
+		pfsync_undefer(TAILQ_FIRST(&b->b_deferrals), 0);
 
 	pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
 	if (pd == NULL)
 		return (0);
-	sc->sc_deferred++;
+	b->b_deferred++;
 
 	m->m_flags |= M_SKIP_FIREWALL;
 	st->state_flags |= PFSTATE_ACK;
@@ -1716,11 +1771,11 @@
 	pf_ref_state(st);
 	pd->pd_m = m;
 
-	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
-	callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
+	TAILQ_INSERT_TAIL(&b->b_deferrals, pd, pd_entry);
+	callout_init_mtx(&pd->pd_tmo, &b->b_mtx, CALLOUT_RETURNUNLOCKED);
 	callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
 
-	pfsync_push(sc);
+	pfsync_push(b);
 
 	return (1);
 }
@@ -1731,11 +1786,12 @@
 	struct pfsync_softc *sc = pd->pd_sc;
 	struct mbuf *m = pd->pd_m;
 	struct pf_state *st = pd->pd_st;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_ASSERT(b);
 
-	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
-	sc->sc_deferred--;
+	TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
+	b->b_deferred--;
 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
 	free(pd, M_PFSYNC);
 	pf_release_state(st);
@@ -1743,8 +1799,8 @@
 	if (drop)
 		m_freem(m);
 	else {
-		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
-		pfsync_push(sc);
+		_IF_ENQUEUE(&b->b_snd, m);
+		pfsync_push(b);
 	}
 }
 
@@ -1755,13 +1811,14 @@
 	struct pfsync_softc *sc = pd->pd_sc;
 	struct mbuf *m = pd->pd_m;
 	struct pf_state *st = pd->pd_st;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_ASSERT(b);
 
 	CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
 
-	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
-	sc->sc_deferred--;
+	TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
+	b->b_deferred--;
 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
 	if (pd->pd_refs == 0)
 		free(pd, M_PFSYNC);
@@ -1779,40 +1836,52 @@
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_deferral *pd;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_LOCK(b);
 
-	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
+	TAILQ_FOREACH(pd, &b->b_deferrals, pd_entry) {
 		if (pd->pd_st == st) {
 			if (callout_stop(&pd->pd_tmo) > 0)
 				pfsync_undefer(pd, drop);
+
+			PFSYNC_BUCKET_UNLOCK(b);
 			return;
 		}
 	}
+	PFSYNC_BUCKET_UNLOCK(b);
 
 	panic("%s: unable to find deferred state", __func__);
 }
 
+static struct pfsync_bucket*
+pfsync_get_bucket(struct pfsync_softc *sc, struct pf_state *st)
+{
+	int c = PF_IDHASH(st) % pfsync_buckets;
+
+	return (&sc->sc_buckets[c]);
+}
+
 static void
 pfsync_update_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	bool sync = false, ref = true;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PF_STATE_LOCK_ASSERT(st);
 
-	PFSYNC_LOCK(sc);
+	PFSYNC_BUCKET_LOCK(b);
 	if (st->state_flags & PFSTATE_ACK)
 		pfsync_undefer_state(st, 0);
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
-			pfsync_q_del(st, true);
-		PFSYNC_UNLOCK(sc);
+			pfsync_q_del(st, true, b);
+		PFSYNC_BUCKET_UNLOCK(b);
 		return;
 	}
 
-	if (sc->sc_len == PFSYNC_MINPKT)
-		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+	if (b->b_len == PFSYNC_MINPKT)
+		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	switch (st->sync_state) {
 	case PFSYNC_S_UPD_C:
@@ -1828,7 +1897,7 @@
 		break;
 
 	case PFSYNC_S_IACK:
-		pfsync_q_del(st, false);
+		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
@@ -1842,9 +1911,9 @@
 	}
 
 	if (sync || (time_uptime - st->pfsync_time) < 2)
-		pfsync_push(sc);
+		pfsync_push(b);
 
-	PFSYNC_UNLOCK(sc);
+	PFSYNC_BUCKET_UNLOCK(b);
 }
 
 static void
@@ -1854,14 +1923,14 @@
 	struct pfsync_upd_req_item *item;
 	size_t nlen = sizeof(struct pfsync_upd_req);
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_ASSERT(&sc->sc_buckets[0]);
 
 	/*
	 * This code does a bit to prevent multiple update requests for the
	 * same state being generated. It searches current subheader queue,
	 * but it doesn't lookup into queue of already packed datagrams.
 	 */
-	TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry)
+	TAILQ_FOREACH(item, &sc->sc_buckets[0].b_upd_req_list, ur_entry)
 		if (item->ur_msg.id == id &&
 		    item->ur_msg.creatorid == creatorid)
 			return;
@@ -1873,18 +1942,18 @@
 	item->ur_msg.id = id;
 	item->ur_msg.creatorid = creatorid;
 
-	if (TAILQ_EMPTY(&sc->sc_upd_req_list))
+	if (TAILQ_EMPTY(&sc->sc_buckets[0].b_upd_req_list))
 		nlen += sizeof(struct pfsync_subheader);
 
-	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
-		pfsync_sendout(1);
+	if (sc->sc_buckets[0].b_len + nlen > sc->sc_ifp->if_mtu) {
+		pfsync_sendout(1, 0);
 
 		nlen = sizeof(struct pfsync_subheader) +
 		    sizeof(struct pfsync_upd_req);
 	}
 
-	TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
-	sc->sc_len += nlen;
+	TAILQ_INSERT_TAIL(&sc->sc_buckets[0].b_upd_req_list, item, ur_entry);
+	sc->sc_buckets[0].b_len += nlen;
 }
 
 static void
@@ -1892,27 +1961,28 @@
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	bool ref = true;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PF_STATE_LOCK_ASSERT(st);
-	PFSYNC_LOCK(sc);
+	PFSYNC_BUCKET_LOCK(b);
 
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
-			pfsync_q_del(st, true);
-		PFSYNC_UNLOCK(sc);
+			pfsync_q_del(st, true, b);
+		PFSYNC_BUCKET_UNLOCK(b);
 		return;
 	}
 
 	switch (st->sync_state) {
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_IACK:
-		pfsync_q_del(st, false);
+		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_UPD, ref);
-		pfsync_push(sc);
+		pfsync_push(b);
 		break;
 
 	case PFSYNC_S_INS:
@@ -1925,38 +1995,39 @@
 		panic("%s: unexpected sync state %d",
 		    __func__, st->sync_state);
 	}
 
-	PFSYNC_UNLOCK(sc);
+	PFSYNC_BUCKET_UNLOCK(b);
 }
 
 static void
 pfsync_delete_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 	bool ref = true;
 
-	PFSYNC_LOCK(sc);
+	PFSYNC_BUCKET_LOCK(b);
 	if (st->state_flags & PFSTATE_ACK)
 		pfsync_undefer_state(st, 1);
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
-			pfsync_q_del(st, true);
-		PFSYNC_UNLOCK(sc);
+			pfsync_q_del(st, true, b);
+		PFSYNC_BUCKET_UNLOCK(b);
 		return;
 	}
 
-	if (sc->sc_len == PFSYNC_MINPKT)
-		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+	if (b->b_len == PFSYNC_MINPKT)
+		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	switch (st->sync_state) {
 	case PFSYNC_S_INS:
		/* We never got to tell the world so just forget about it.
 		 */
-		pfsync_q_del(st, true);
+		pfsync_q_del(st, true, b);
 		break;
 
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_UPD:
 	case PFSYNC_S_IACK:
-		pfsync_q_del(st, false);
+		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
@@ -1968,13 +2039,12 @@
 		panic("%s: unexpected sync state %d",
 		    __func__, st->sync_state);
 	}
 
-	PFSYNC_UNLOCK(sc);
+	PFSYNC_BUCKET_UNLOCK(b);
 }
 
 static void
 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
 {
-	struct pfsync_softc *sc = V_pfsyncif;
 	struct {
 		struct pfsync_subheader subh;
 		struct pfsync_clr clr;
@@ -1989,9 +2059,7 @@
 	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
 	r.clr.creatorid = creatorid;
 
-	PFSYNC_LOCK(sc);
 	pfsync_send_plus(&r, sizeof(r));
-	PFSYNC_UNLOCK(sc);
 }
 
 static void
@@ -1999,48 +2067,48 @@
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	size_t nlen = pfsync_qs[q].len;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_ASSERT(b);
 
 	KASSERT(st->sync_state == PFSYNC_S_NONE,
 		("%s: st->sync_state %u", __func__, st->sync_state));
-	KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
-	    sc->sc_len));
+	KASSERT(b->b_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
+	    b->b_len));
 
-	if (TAILQ_EMPTY(&sc->sc_qs[q]))
+	if (TAILQ_EMPTY(&b->b_qs[q]))
 		nlen += sizeof(struct pfsync_subheader);
 
-	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
-		pfsync_sendout(1);
+	if (b->b_len + nlen > sc->sc_ifp->if_mtu) {
+		pfsync_sendout(1, b->b_id);
 
 		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
 	}
 
-	sc->sc_len += nlen;
-	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
+	b->b_len += nlen;
+	TAILQ_INSERT_TAIL(&b->b_qs[q], st, sync_list);
 	st->sync_state = q;
 	if (ref)
 		pf_ref_state(st);
 }
 
 static void
-pfsync_q_del(struct pf_state *st, bool unref)
+pfsync_q_del(struct pf_state *st, bool unref, struct pfsync_bucket *b)
 {
-	struct pfsync_softc *sc = V_pfsyncif;
 	int q = st->sync_state;
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_ASSERT(b);
 	KASSERT(st->sync_state != PFSYNC_S_NONE,
 		("%s: st->sync_state != PFSYNC_S_NONE", __func__));
 
-	sc->sc_len -= pfsync_qs[q].len;
-	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
+	b->b_len -= pfsync_qs[q].len;
+	TAILQ_REMOVE(&b->b_qs[q], st, sync_list);
 	st->sync_state = PFSYNC_S_NONE;
 	if (unref)
 		pf_release_state(st);
 
-	if (TAILQ_EMPTY(&sc->sc_qs[q]))
-		sc->sc_len -= sizeof(struct pfsync_subheader);
+	if (TAILQ_EMPTY(&b->b_qs[q]))
+		b->b_len -= sizeof(struct pfsync_subheader);
 }
 
 static void
@@ -2094,7 +2162,7 @@
 	}
 
 	for (; s; s = LIST_NEXT(s, entry)) {
-
+#if 0
 		if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
 		    sizeof(struct pfsync_state)) {
 			/* We've filled a packet. */
@@ -2106,7 +2174,7 @@
 			    pfsync_bulk_update, sc);
 			goto full;
 		}
-
+#endif
 		if (s->sync_state == PFSYNC_S_NONE &&
 		    s->timeout < PFTM_MAX &&
 		    s->pfsync_time <= sc->sc_ureq_received) {
@@ -2119,8 +2187,9 @@
 
 	/* We're done. */
 	pfsync_bulk_status(PFSYNC_BUS_END);
-
+#if 0
 full:
+#endif
 	CURVNET_RESTORE();
 }
 
@@ -2144,9 +2213,7 @@
 	r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
 	r.bus.status = status;
 
-	PFSYNC_LOCK(sc);
 	pfsync_send_plus(&r, sizeof(r));
-	PFSYNC_UNLOCK(sc);
 }
 
 static void
@@ -2162,9 +2229,9 @@
 		/* Try again */
 		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
 		    pfsync_bulk_fail, V_pfsyncif);
-		PFSYNC_LOCK(sc);
+		PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
 		pfsync_request_update(0, 0);
-		PFSYNC_UNLOCK(sc);
+		PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
 	} else {
		/* Pretend like the transfer was ok.
 		 */
 		sc->sc_ureq_sent = 0;
@@ -2187,72 +2254,92 @@
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
 
-	if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu)
-		pfsync_sendout(1);
+	if (sc->sc_buckets[0].b_len + pluslen > sc->sc_ifp->if_mtu)
+		pfsync_sendout(1, 0);
 
-	sc->sc_plus = plus;
-	sc->sc_len += (sc->sc_pluslen = pluslen);
+	sc->sc_buckets[0].b_plus = plus;
+	sc->sc_buckets[0].b_len += (sc->sc_buckets[0].b_pluslen = pluslen);
 
-	pfsync_sendout(1);
+	pfsync_sendout(1, 0);
+	PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
 }
 
 static void
 pfsync_timeout(void *arg)
 {
-	struct pfsync_softc *sc = arg;
+	struct pfsync_bucket *b = arg;
+	struct pfsync_softc *sc = b->b_sc;
 
 	CURVNET_SET(sc->sc_ifp->if_vnet);
-	PFSYNC_LOCK(sc);
-	pfsync_push(sc);
-	PFSYNC_UNLOCK(sc);
+	PFSYNC_BUCKET_LOCK(b);
+	pfsync_push(b);
+	PFSYNC_BUCKET_UNLOCK(b);
 	CURVNET_RESTORE();
 }
 
 static void
-pfsync_push(struct pfsync_softc *sc)
+pfsync_push(struct pfsync_bucket *b)
 {
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_ASSERT(b);
 
-	sc->sc_flags |= PFSYNCF_PUSH;
+	b->b_flags |= PFSYNCF_BUCKET_PUSH;
 	swi_sched(V_pfsync_swi_cookie, 0);
 }
 
+static void
+pfsync_push_all(struct pfsync_softc *sc)
+{
+	int c;
+
+	for (c = 0; c < pfsync_buckets; c++) {
+		PFSYNC_BUCKET_LOCK(&sc->sc_buckets[c]);
+		pfsync_push(&sc->sc_buckets[c]);
+		PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]);
+	}
+}
+
 static void
 pfsyncintr(void *arg)
 {
 	struct pfsync_softc *sc = arg;
+	struct pfsync_bucket *b;
 	struct mbuf *m, *n;
+	int c;
 
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 
-	PFSYNC_LOCK(sc);
-	if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) {
-		pfsync_sendout(0);
-		sc->sc_flags &= ~PFSYNCF_PUSH;
-	}
-	_IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m);
-	PFSYNC_UNLOCK(sc);
-
-	for (; m != NULL; m = n) {
-
-		n = m->m_nextpkt;
-		m->m_nextpkt = NULL;
-
-		/*
-		 * We distinguish between a deferral packet and our
-		 * own pfsync packet based on M_SKIP_FIREWALL
-		 * flag. This is XXX.
-		 */
-		if (m->m_flags & M_SKIP_FIREWALL)
-			ip_output(m, NULL, NULL, 0, NULL, NULL);
-		else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
-		    NULL) == 0)
-			V_pfsyncstats.pfsyncs_opackets++;
-		else
-			V_pfsyncstats.pfsyncs_oerrors++;
+	for (c = 0; c < pfsync_buckets; c++) {
+		b = &sc->sc_buckets[c];
+
+		PFSYNC_BUCKET_LOCK(b);
+		if ((b->b_flags & PFSYNCF_BUCKET_PUSH) && b->b_len > PFSYNC_MINPKT) {
+			pfsync_sendout(0, b->b_id);
+			b->b_flags &= ~PFSYNCF_BUCKET_PUSH;
+		}
+		_IF_DEQUEUE_ALL(&b->b_snd, m);
+		PFSYNC_BUCKET_UNLOCK(b);
+
+		for (; m != NULL; m = n) {
+			n = m->m_nextpkt;
+			m->m_nextpkt = NULL;
+
+			/*
+			 * We distinguish between a deferral packet and our
+			 * own pfsync packet based on M_SKIP_FIREWALL
+			 * flag. This is XXX.
+			 */
+			if (m->m_flags & M_SKIP_FIREWALL)
+				ip_output(m, NULL, NULL, 0, NULL, NULL);
+			else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
+			    NULL) == 0)
+				V_pfsyncstats.pfsyncs_opackets++;
+			else
+				V_pfsyncstats.pfsyncs_oerrors++;
+		}
 	}
 	CURVNET_RESTORE();
 }
@@ -2366,6 +2453,7 @@
 
 	V_pfsync_cloner = if_clone_simple(pfsyncname,
 	    pfsync_clone_create, pfsync_clone_destroy, 1);
+
 	error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif, SWI_NET,
 	    INTR_MPSAFE, &V_pfsync_swi_cookie);
 	if (error) {
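
The heart of this patch is the state-to-bucket mapping in pfsync_get_bucket(): PF_IDHASH(st) % pfsync_buckets, where pfsync_buckets defaults to mp_ncpus * 2 and can be set at boot via the net.pfsync.pfsync_buckets tunable (CTLFLAG_RDTUN). Because the bucket is a pure function of the state ID, all queueing for a given state is serialized by that bucket's mutex, while states that hash to different buckets proceed in parallel; update requests and the custom "plus" region are always queued to bucket 0. The following minimal userspace sketch illustrates the same pattern; every name in it is a hypothetical stand-in for the kernel structures, not part of the patch.

/*
 * Illustrative sketch of the per-bucket locking scheme: hash a state's
 * ID to pick one of N buckets, then do all queueing for that state
 * under that bucket's own mutex.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct state {				/* stand-in for struct pf_state */
	uint64_t id;
};

struct bucket {				/* stand-in for struct pfsync_bucket */
	pthread_mutex_t	b_mtx;
	size_t		b_len;		/* bytes queued for the next packet */
};

static unsigned long nbuckets;
static struct bucket *buckets;

/* Mirrors pfsync_get_bucket(): a pure function of the state ID. */
static struct bucket *
get_bucket(struct state *st)
{

	return (&buckets[st->id % nbuckets]);
}

int
main(void)
{
	struct state st = { .id = 0xdeadbeef };
	struct bucket *b;
	unsigned long c;

	nbuckets = 8;			/* the patch defaults to mp_ncpus * 2 */
	buckets = calloc(nbuckets, sizeof(*buckets));
	for (c = 0; c < nbuckets; c++)
		pthread_mutex_init(&buckets[c].b_mtx, NULL);

	b = get_bucket(&st);
	pthread_mutex_lock(&b->b_mtx);	/* contends only on this bucket */
	b->b_len += 64;			/* pretend we queued a state update */
	pthread_mutex_unlock(&b->b_mtx);

	printf("state %#llx -> bucket %ld\n",
	    (unsigned long long)st.id, (long)(b - buckets));
	free(buckets);
	return (0);
}

Because the mapping is stable, there is no cross-bucket migration to worry about: a state's inserts, updates, and deletes always land in the same bucket, which is what lets the patch drop the single softc-wide mutex on the hot path.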