diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h --- a/sys/netinet/ip_mroute.h +++ b/sys/netinet/ip_mroute.h @@ -283,7 +283,10 @@ struct timeval mfc_last_assert; /* last time I sent an assert*/ uint8_t mfc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags */ struct in_addr mfc_rp; /* the RP address */ - struct bw_meter *mfc_bw_meter; /* list of bandwidth meters */ + struct bw_meter *mfc_bw_meter_leq; /* list of bandwidth meters + for Lower-or-EQual case */ + struct bw_meter *mfc_bw_meter_geq; /* list of bandwidth meters + for Greater-or-EQual case */ u_long mfc_nstall; /* # of packets awaiting mfc */ TAILQ_HEAD(, rtdetq) mfc_stall; /* q of packets awaiting mfc */ }; @@ -327,7 +330,6 @@ struct bw_meter { struct bw_meter *bm_mfc_next; /* next bw meter (same mfc) */ struct bw_meter *bm_time_next; /* next bw meter (same time) */ - uint32_t bm_time_hash; /* the time hash value */ struct mfc *bm_mfc; /* the corresponding mfc */ uint32_t bm_flags; /* misc flags (see below) */ #define BW_METER_UNIT_PACKETS (1 << 0) /* threshold (in packets) */ @@ -344,6 +346,10 @@ struct bw_data bm_threshold; /* the upcall threshold */ struct bw_data bm_measured; /* the measured bw */ struct timeval bm_start_time; /* abs. 
time */ +#ifdef _KERNEL + struct callout bm_meter_callout; /* Periodic callout */ + void* arg; /* custom argument */ +#endif }; #ifdef _KERNEL diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c --- a/sys/netinet/ip_mroute.c +++ b/sys/netinet/ip_mroute.c @@ -49,6 +49,7 @@ * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 * Modified by Hitoshi Asaeda, WIDE, August 2000 * Modified by Pavlin Radoslavov, ICSI, October 2002 + * Modified by Wojciech Macek, Semihalf, May 2021 * * MROUTING Revision: 3.5 * and PIM-SMv2 and PIM-DM support, advanced API support, @@ -202,16 +203,6 @@ * Bandwidth meter variables and constants */ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); -/* - * Pending timeouts are stored in a hash table, the key being the - * expiration time. Periodically, the entries are analysed and processed. - */ -#define BW_METER_BUCKETS 1024 -VNET_DEFINE_STATIC(struct bw_meter **, bw_meter_timers); -#define V_bw_meter_timers VNET(bw_meter_timers) -VNET_DEFINE_STATIC(struct callout, bw_meter_ch); -#define V_bw_meter_ch VNET(bw_meter_ch) -#define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ /* * Pending upcalls are stored in a vector which is flushed when @@ -320,14 +311,13 @@ static int add_vif(struct vifctl *); static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); static void bw_meter_process(void); -static void bw_meter_receive_packet(struct bw_meter *, int, +static void bw_meter_geq_receive_packet(struct bw_meter *, int, struct timeval *); static void bw_upcalls_send(void); static int del_bw_upcall(struct bw_upcall *); static int del_mfc(struct mfcctl2 *); static int del_vif(vifi_t); static int del_vif_locked(vifi_t); -static void expire_bw_meter_process(void *); static void expire_bw_upcalls_send(void *); static void expire_mfc(struct mfc *); static void expire_upcalls(void *); @@ -685,8 +675,6 @@ curvnet); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, 
expire_bw_upcalls_send, curvnet); - callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, - curvnet); V_ip_mrouter = so; ip_mrouter_cnt++; @@ -745,7 +733,6 @@ callout_stop(&V_expire_upcalls_ch); callout_stop(&V_bw_upcalls_ch); - callout_stop(&V_bw_meter_ch); MFC_LOCK(); @@ -766,7 +753,6 @@ bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); V_bw_upcalls_n = 0; - bzero(V_bw_meter_timers, BW_METER_BUCKETS * sizeof(*V_bw_meter_timers)); MFC_UNLOCK(); @@ -1036,7 +1022,8 @@ MFC_LOCK_ASSERT(); - free_bw_list(rt->mfc_bw_meter); + free_bw_list(rt->mfc_bw_meter_leq); + free_bw_list(rt->mfc_bw_meter_geq); TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { m_freem(rte->m); @@ -1139,7 +1126,8 @@ rt->mfc_nstall = 0; rt->mfc_expire = 0; - rt->mfc_bw_meter = NULL; + rt->mfc_bw_meter_leq = NULL; + rt->mfc_bw_meter_geq = NULL; /* insert new entry at head of hash chain */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); @@ -1179,8 +1167,10 @@ /* * free the bw_meter entries */ - free_bw_list(rt->mfc_bw_meter); - rt->mfc_bw_meter = NULL; + free_bw_list(rt->mfc_bw_meter_leq); + rt->mfc_bw_meter_leq = NULL; + free_bw_list(rt->mfc_bw_meter_geq); + rt->mfc_bw_meter_geq = NULL; LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); @@ -1393,7 +1383,8 @@ /* clear the RP address */ rt->mfc_rp.s_addr = INADDR_ANY; - rt->mfc_bw_meter = NULL; + rt->mfc_bw_meter_leq = NULL; + rt->mfc_bw_meter_geq = NULL; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; @@ -1459,16 +1450,6 @@ if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) continue; - /* - * free the bw_meter entries - */ - while (rt->mfc_bw_meter != NULL) { - struct bw_meter *x = rt->mfc_bw_meter; - - rt->mfc_bw_meter = x->bm_mfc_next; - free(x, M_BWMETER); - } - MRTSTAT_INC(mrts_cache_cleanups); CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__, (u_long)ntohl(rt->mfc_origin.s_addr), @@ -1602,14 +1583,22 @@ /* * Perform upcall-related bw measuring. 
*/ - if (rt->mfc_bw_meter != NULL) { + if ((rt->mfc_bw_meter_geq != NULL) || (rt->mfc_bw_meter_leq != NULL)) { struct bw_meter *x; struct timeval now; microtime(&now); MFC_LOCK_ASSERT(); - for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) - bw_meter_receive_packet(x, plen, &now); + /* Process meters for Greater-or-EQual case */ + for (x = rt->mfc_bw_meter_geq; x != NULL; x = x->bm_mfc_next) + bw_meter_geq_receive_packet(x, plen, &now); + + /* Process meters for Lower-or-EQual case */ + for (x = rt->mfc_bw_meter_leq; x != NULL; x = x->bm_mfc_next) { + /* Record that a packet is received */ + x->bm_measured.b_packets++; + x->bm_measured.b_bytes += plen; + } } return 0; @@ -1759,6 +1748,44 @@ return flags; } +static void +expire_bw_meter_leq(void *arg) +{ + struct bw_meter *x = arg; + struct timeval now; + /* + * INFO: + * callout is always executed with MFC_LOCK taken + */ + + CURVNET_SET((struct vnet *)x->arg); + + microtime(&now); + + /* + * Test if we should deliver an upcall + */ + if (((x->bm_flags & BW_METER_UNIT_PACKETS) && + (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || + ((x->bm_flags & BW_METER_UNIT_BYTES) && + (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { + /* Prepare an upcall for delivery */ + bw_meter_prepare_upcall(x, &now); + } + + /* Send all upcalls that are pending delivery */ + bw_upcalls_send(); + + /* Reset counters */ + x->bm_start_time = now; + x->bm_measured.b_bytes = 0; + x->bm_measured.b_packets = 0; + + callout_schedule(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time)); + + CURVNET_RESTORE(); +} + /* * Add a bw_meter entry */ @@ -1769,7 +1796,7 @@ struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; struct timeval now; - struct bw_meter *x; + struct bw_meter *x, **bwm_ptr; uint32_t flags; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) @@ -1799,12 +1826,22 @@ MFC_UNLOCK(); return EADDRNOTAVAIL; } - for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) 
{ + + /* Choose an appropriate bw_meter list */ + if (req->bu_flags & BW_UPCALL_GEQ) + bwm_ptr = &mfc->mfc_bw_meter_geq; + else + bwm_ptr = &mfc->mfc_bw_meter_leq; + + for (x = *bwm_ptr; x != NULL; x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, - &req->bu_threshold.b_time, ==)) && - (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && - (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && - (x->bm_flags & BW_METER_USER_FLAGS) == flags) { + &req->bu_threshold.b_time, ==)) + && (x->bm_threshold.b_packets + == req->bu_threshold.b_packets) + && (x->bm_threshold.b_bytes + == req->bu_threshold.b_bytes) + && (x->bm_flags & BW_METER_USER_FLAGS) + == flags) { MFC_UNLOCK(); return 0; /* XXX Already installed */ } @@ -1827,13 +1864,20 @@ x->bm_measured.b_bytes = 0; x->bm_flags = flags; x->bm_time_next = NULL; - x->bm_time_hash = BW_METER_BUCKETS; + x->bm_mfc = mfc; + x->arg = curvnet; + + /* For LEQ case create periodic callout */ + if (req->bu_flags & BW_UPCALL_LEQ) { + callout_init_mtx(&x->bm_meter_callout, &mfc_mtx, 0); + callout_reset(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time), + expire_bw_meter_leq, x); + } /* Add the new bw_meter entry to the front of entries for this MFC */ - x->bm_mfc = mfc; - x->bm_mfc_next = mfc->mfc_bw_meter; - mfc->mfc_bw_meter = x; - schedule_bw_meter(x, &now); + x->bm_mfc_next = *bwm_ptr; + *bwm_ptr = x; + MFC_UNLOCK(); return 0; @@ -1845,8 +1889,11 @@ while (list != NULL) { struct bw_meter *x = list; + /* MFC_LOCK is held: stop, never drain (drain may sleep/deadlock) */ + if (x->bm_flags & BW_METER_LEQ) + callout_stop(&x->bm_meter_callout); + list = list->bm_mfc_next; - unschedule_bw_meter(x); free(x, M_BWMETER); } } @@ -1858,7 +1905,7 @@ del_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; - struct bw_meter *x; + struct bw_meter *x, **bwm_ptr; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; @@ -1876,8 +1923,14 @@ */ struct bw_meter *list; - list = mfc->mfc_bw_meter; - mfc->mfc_bw_meter = NULL; + /* Free LEQ list */ + 
list = mfc->mfc_bw_meter_leq; + mfc->mfc_bw_meter_leq = NULL; + free_bw_list(list); + + /* Free GEQ list */ + list = mfc->mfc_bw_meter_geq; + mfc->mfc_bw_meter_geq = NULL; free_bw_list(list); MFC_UNLOCK(); return 0; @@ -1887,8 +1940,14 @@ flags = compute_bw_meter_flags(req); + /* Choose an appropriate bw_meter list */ + if (req->bu_flags & BW_UPCALL_GEQ) + bwm_ptr = &mfc->mfc_bw_meter_geq; + else + bwm_ptr = &mfc->mfc_bw_meter_leq; + /* Find the bw_meter entry to delete */ - for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; + for (prev = NULL, x = *bwm_ptr; x != NULL; prev = x, x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && @@ -1901,9 +1960,11 @@ if (prev != NULL) prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ else - x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ + *bwm_ptr = x->bm_mfc_next;/* new head of list */ + + if (req->bu_flags & BW_UPCALL_LEQ) + callout_stop(&x->bm_meter_callout); - unschedule_bw_meter(x); MFC_UNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); @@ -1920,7 +1981,7 @@ * Perform bandwidth measurement processing that may result in an upcall */ static void -bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) +bw_meter_geq_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) { struct timeval delta; @@ -1929,7 +1990,6 @@ delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); - if (x->bm_flags & BW_METER_GEQ) { /* * Processing for ">=" type of bw_meter entry */ @@ -1958,54 +2018,6 @@ x->bm_flags |= BW_METER_UPCALL_DELIVERED; } } - } else if (x->bm_flags & BW_METER_LEQ) { - /* - * Processing for "<=" type of bw_meter entry - */ - if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { - /* - * We are behind time with the multicast forwarding table - * scanning for "<=" type of bw_meter entries, so test now - * if we should deliver an upcall. 
- */ - if (((x->bm_flags & BW_METER_UNIT_PACKETS) && - (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || - ((x->bm_flags & BW_METER_UNIT_BYTES) && - (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { - /* Prepare an upcall for delivery */ - bw_meter_prepare_upcall(x, nowp); - } - /* Reschedule the bw_meter entry */ - unschedule_bw_meter(x); - schedule_bw_meter(x, nowp); - } - - /* Record that a packet is received */ - x->bm_measured.b_packets++; - x->bm_measured.b_bytes += plen; - - /* - * Test if we should restart the measuring interval - */ - if ((x->bm_flags & BW_METER_UNIT_PACKETS && - x->bm_measured.b_packets <= x->bm_threshold.b_packets) || - (x->bm_flags & BW_METER_UNIT_BYTES && - x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { - /* Don't restart the measuring interval */ - } else { - /* Do restart the measuring interval */ - /* - * XXX: note that we don't unschedule and schedule, because this - * might be too much overhead per packet. Instead, when we process - * all entries for a given timer hash bin, we check whether it is - * really a timeout. If not, we reschedule at that time. - */ - x->bm_start_time = *nowp; - x->bm_measured.b_packets = 0; - x->bm_measured.b_bytes = 0; - x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; - } - } } /* @@ -2103,183 +2115,6 @@ } } -/* - * Compute the timeout hash value for the bw_meter entries - */ -#define BW_METER_TIMEHASH(bw_meter, hash) \ - do { \ - struct timeval next_timeval = (bw_meter)->bm_start_time; \ - \ - BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ - (hash) = next_timeval.tv_sec; \ - if (next_timeval.tv_usec) \ - (hash)++; /* XXX: make sure we don't timeout early */ \ - (hash) %= BW_METER_BUCKETS; \ - } while (0) - -/* - * Schedule a timer to process periodically bw_meter entry of type "<=" - * by linking the entry in the proper hash bucket. 
- */ -static void -schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) -{ - int time_hash; - - MFC_LOCK_ASSERT(); - - if (!(x->bm_flags & BW_METER_LEQ)) - return; /* XXX: we schedule timers only for "<=" entries */ - - /* - * Reset the bw_meter entry - */ - x->bm_start_time = *nowp; - x->bm_measured.b_packets = 0; - x->bm_measured.b_bytes = 0; - x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; - - /* - * Compute the timeout hash value and insert the entry - */ - BW_METER_TIMEHASH(x, time_hash); - x->bm_time_next = V_bw_meter_timers[time_hash]; - V_bw_meter_timers[time_hash] = x; - x->bm_time_hash = time_hash; -} - -/* - * Unschedule the periodic timer that processes bw_meter entry of type "<=" - * by removing the entry from the proper hash bucket. - */ -static void -unschedule_bw_meter(struct bw_meter *x) -{ - int time_hash; - struct bw_meter *prev, *tmp; - - MFC_LOCK_ASSERT(); - - if (!(x->bm_flags & BW_METER_LEQ)) - return; /* XXX: we schedule timers only for "<=" entries */ - - /* - * Compute the timeout hash value and delete the entry - */ - time_hash = x->bm_time_hash; - if (time_hash >= BW_METER_BUCKETS) - return; /* Entry was not scheduled */ - - for (prev = NULL, tmp = V_bw_meter_timers[time_hash]; - tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) - if (tmp == x) - break; - - if (tmp == NULL) - panic("unschedule_bw_meter: bw_meter entry not found"); - - if (prev != NULL) - prev->bm_time_next = x->bm_time_next; - else - V_bw_meter_timers[time_hash] = x->bm_time_next; - - x->bm_time_next = NULL; - x->bm_time_hash = BW_METER_BUCKETS; -} - -/* - * Process all "<=" type of bw_meter that should be processed now, - * and for each entry prepare an upcall if necessary. Each processed - * entry is rescheduled again for the (periodic) processing. - * - * This is run periodically (once per second normally). On each round, - * all the potentially matching entries are in the hash slot that we are - * looking at. 
- */ -static void -bw_meter_process() -{ - uint32_t loops; - int i; - struct timeval now, process_endtime; - - microtime(&now); - if (V_last_tv_sec == now.tv_sec) - return; /* nothing to do */ - - loops = now.tv_sec - V_last_tv_sec; - V_last_tv_sec = now.tv_sec; - if (loops > BW_METER_BUCKETS) - loops = BW_METER_BUCKETS; - - MFC_LOCK(); - /* - * Process all bins of bw_meter entries from the one after the last - * processed to the current one. On entry, i points to the last bucket - * visited, so we need to increment i at the beginning of the loop. - */ - for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { - struct bw_meter *x, *tmp_list; - - if (++i >= BW_METER_BUCKETS) - i = 0; - - /* Disconnect the list of bw_meter entries from the bin */ - tmp_list = V_bw_meter_timers[i]; - V_bw_meter_timers[i] = NULL; - - /* Process the list of bw_meter entries */ - while (tmp_list != NULL) { - x = tmp_list; - tmp_list = tmp_list->bm_time_next; - - /* Test if the time interval is over */ - process_endtime = x->bm_start_time; - BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); - if (BW_TIMEVALCMP(&process_endtime, &now, >)) { - /* Not yet: reschedule, but don't reset */ - int time_hash; - - BW_METER_TIMEHASH(x, time_hash); - if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { - /* - * XXX: somehow the bin processing is a bit ahead of time. - * Put the entry in the next bin. 
- */ - if (++time_hash >= BW_METER_BUCKETS) - time_hash = 0; - } - x->bm_time_next = V_bw_meter_timers[time_hash]; - V_bw_meter_timers[time_hash] = x; - x->bm_time_hash = time_hash; - - continue; - } - - /* - * Test if we should deliver an upcall - */ - if (((x->bm_flags & BW_METER_UNIT_PACKETS) && - (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || - ((x->bm_flags & BW_METER_UNIT_BYTES) && - (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { - /* Prepare an upcall for delivery */ - bw_meter_prepare_upcall(x, &now); - } - - /* - * Reschedule for next processing - */ - schedule_bw_meter(x, &now); - } - } - - /* Send all upcalls that are pending delivery */ - bw_upcalls_send(); - - MFC_UNLOCK(); -} - /* * A periodic function for sending all upcalls that are pending delivery */ @@ -2297,23 +2132,6 @@ CURVNET_RESTORE(); } -/* - * A periodic function for periodic scanning of the multicast forwarding - * table for processing all "<=" bw_meter entries. - */ -static void -expire_bw_meter_process(void *arg) -{ - CURVNET_SET((struct vnet *) arg); - - if (V_mrt_api_config & MRT_MFC_BW_UPCALL) - bw_meter_process(); - - callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, - curvnet); - CURVNET_RESTORE(); -} - /* * End of bandwidth monitoring code */ @@ -2835,14 +2653,11 @@ V_viftable = mallocarray(MAXVIFS, sizeof(*V_viftable), M_MRTABLE, M_WAITOK|M_ZERO); - V_bw_meter_timers = mallocarray(BW_METER_BUCKETS, - sizeof(*V_bw_meter_timers), M_MRTABLE, M_WAITOK|M_ZERO); V_bw_upcalls = mallocarray(BW_UPCALLS_MAX, sizeof(*V_bw_upcalls), M_MRTABLE, M_WAITOK|M_ZERO); callout_init(&V_expire_upcalls_ch, 1); callout_init(&V_bw_upcalls_ch, 1); - callout_init(&V_bw_meter_ch, 1); } VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init, @@ -2853,7 +2668,6 @@ { free(V_bw_upcalls, M_MRTABLE); - free(V_bw_meter_timers, M_MRTABLE); free(V_viftable, M_MRTABLE); free(V_nexpire, M_MRTABLE); V_nexpire = NULL;