Index: head/sys/net/route.c =================================================================== --- head/sys/net/route.c (revision 297224) +++ head/sys/net/route.c (revision 297225) @@ -1,2293 +1,2304 @@ /*- * Copyright (c) 1980, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.c 8.3.1.1 (Berkeley) 2/23/95 * $FreeBSD$ */ /************************************************************************ * Note: In this file a 'fib' is a "forwarding information base" * * Which is the new name for an in kernel routing (next hop) table. * ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_mrouting.h" #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #define RT_MAXFIBS UINT16_MAX /* Kernel config default option. */ #ifdef ROUTETABLES #if ROUTETABLES <= 0 #error "ROUTETABLES defined too low" #endif #if ROUTETABLES > RT_MAXFIBS #error "ROUTETABLES defined too big" #endif #define RT_NUMFIBS ROUTETABLES #endif /* ROUTETABLES */ /* Initialize to default if not otherwise set. */ #ifndef RT_NUMFIBS #define RT_NUMFIBS 1 #endif #if defined(INET) || defined(INET6) #ifdef SCTP extern void sctp_addr_change(struct ifaddr *ifa, int cmd); #endif /* SCTP */ #endif /* This is read-only.. */ u_int rt_numfibs = RT_NUMFIBS; SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RDTUN, &rt_numfibs, 0, ""); /* * By default add routes to all fibs for new interfaces. * Once this is set to 0 then only allocate routes on interface * changes for the FIB of the caller when adding a new set of addresses * to an interface. XXX this is a shotgun aproach to a problem that needs * a more fine grained solution.. that will come. * XXX also has the problems getting the FIB from curthread which will not * always work given the fib can be overridden and prefixes can be added * from the network stack context. */ VNET_DEFINE(u_int, rt_add_addr_allfibs) = 1; SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(rt_add_addr_allfibs), 0, ""); VNET_DEFINE(struct rtstat, rtstat); #define V_rtstat VNET(rtstat) VNET_DEFINE(struct rib_head *, rt_tables); #define V_rt_tables VNET(rt_tables) VNET_DEFINE(int, rttrash); /* routes not in table but not freed */ #define V_rttrash VNET(rttrash) /* * Convert a 'struct radix_node *' to a 'struct rtentry *'. * The operation can be done safely (in this code) because a * 'struct rtentry' starts with two 'struct radix_node''s, the first * one representing leaf nodes in the routing tree, which is * what the code in radix.c passes us as a 'struct radix_node'. * * But because there are a lot of assumptions in this conversion, * do not cast explicitly, but always use the macro below. */ #define RNTORT(p) ((struct rtentry *)(p)) static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */ #define V_rtzone VNET(rtzone) static int rtrequest1_fib_change(struct rib_head *, struct rt_addrinfo *, struct rtentry **, u_int); static void rt_setmetrics(const struct rt_addrinfo *, struct rtentry *); static int rt_ifdelroute(const struct rtentry *rt, void *arg); static struct rtentry *rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror); static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info); #ifdef RADIX_MPATH static struct radix_node *rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror); #endif static int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags); struct if_mtuinfo { struct ifnet *ifp; int mtu; }; static int if_updatemtu_cb(struct radix_node *, void *); /* * handler for net.my_fibnum */ static int sysctl_my_fibnum(SYSCTL_HANDLER_ARGS) { int fibnum; int error; fibnum = curthread->td_proc->p_fibnum; error = sysctl_handle_int(oidp, &fibnum, 0, req); return (error); } SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller"); static __inline struct rib_head ** rt_tables_get_rnh_ptr(int table, int fam) { struct rib_head **rnh; KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.", __func__)); KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.", __func__)); /* rnh is [fib=0][af=0]. */ rnh = (struct rib_head **)V_rt_tables; /* Get the offset to the requested table and fam. */ rnh += table * (AF_MAX+1) + fam; return (rnh); } struct rib_head * rt_tables_get_rnh(int table, int fam) { return (*rt_tables_get_rnh_ptr(table, fam)); } +rt_gen_t +rt_tables_get_gen(int table, int fam) +{ + struct rib_head *rnh; + + rnh = *rt_tables_get_rnh_ptr(table, fam); + return (rnh->rnh_gen); +} + + /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. */ static void route_init(void) { /* whack the tunable ints into line. */ if (rt_numfibs > RT_MAXFIBS) rt_numfibs = RT_MAXFIBS; if (rt_numfibs == 0) rt_numfibs = 1; } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0); static int rtentry_zinit(void *mem, int size, int how) { struct rtentry *rt = mem; rt->rt_pksent = counter_u64_alloc(how); if (rt->rt_pksent == NULL) return (ENOMEM); RT_LOCK_INIT(rt); return (0); } static void rtentry_zfini(void *mem, int size) { struct rtentry *rt = mem; RT_LOCK_DESTROY(rt); counter_u64_free(rt->rt_pksent); } static int rtentry_ctor(void *mem, int size, void *arg, int how) { struct rtentry *rt = mem; bzero(rt, offsetof(struct rtentry, rt_endzero)); counter_u64_zero(rt->rt_pksent); rt->rt_chain = NULL; return (0); } static void rtentry_dtor(void *mem, int size, void *arg) { struct rtentry *rt = mem; RT_UNLOCK_COND(rt); } static void vnet_route_init(const void *unused __unused) { struct domain *dom; struct rib_head **rnh; int table; int fam; V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) * sizeof(struct rib_head *), M_RTABLE, M_WAITOK|M_ZERO); V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), rtentry_ctor, rtentry_dtor, rtentry_zinit, rtentry_zfini, UMA_ALIGN_PTR, 0); for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtattach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtattach((void **)rnh, 0); } } } VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, vnet_route_init, 0); #ifdef VIMAGE static void vnet_route_uninit(const void *unused __unused) { int table; int fam; struct domain *dom; struct rib_head **rnh; for (dom = domains; dom; dom = dom->dom_next) { if (dom->dom_rtdetach == NULL) continue; for (table = 0; table < rt_numfibs; table++) { fam = dom->dom_family; if (table != 0 && fam != AF_INET6 && fam != AF_INET) break; rnh = rt_tables_get_rnh_ptr(table, fam); if (rnh == NULL) panic("%s: rnh NULL", __func__); dom->dom_rtdetach((void **)rnh, 0); } } free(V_rt_tables, M_RTABLE); uma_zdestroy(V_rtzone); } VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_route_uninit, 0); #endif struct rib_head * rt_table_init(int offset) { struct rib_head *rh; rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); /* TODO: These details should be hidded inside radix.c */ /* Init masks tree */ rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); rh->head.rnh_masks = &rh->rmhead; /* Init locks */ rw_init(&rh->rib_lock, "rib head lock"); /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; rh->rnh_matchaddr = rn_match; rh->rnh_lookup = rn_lookup; rh->rnh_walktree = rn_walktree; rh->rnh_walktree_from = rn_walktree_from; return (rh); } static int rt_freeentry(struct radix_node *rn, void *arg) { struct radix_head * const rnh = arg; struct radix_node *x; x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); if (x != NULL) R_Free(x); return (0); } void rt_table_destroy(struct rib_head *rh) { rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); /* Assume table is already empty */ rw_destroy(&rh->rib_lock); free(rh, M_RTABLE); } #ifndef _SYS_SYSPROTO_H_ struct setfib_args { int fibnum; }; #endif int sys_setfib(struct thread *td, struct setfib_args *uap) { if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs) return EINVAL; td->td_proc->p_fibnum = uap->fibnum; return (0); } /* * Packet routing routines. */ void rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; RTFREE(rt); ro->ro_rt = NULL; } ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum); if (ro->ro_rt) RT_UNLOCK(ro->ro_rt); } /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. * * The returned route, if any, is locked. */ struct rtentry * rtalloc1(struct sockaddr *dst, int report, u_long ignflags) { return (rtalloc1_fib(dst, report, ignflags, RT_DEFAULT_FIB)); } struct rtentry * rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { struct rib_head *rh; struct radix_node *rn; struct rtentry *newrt; struct rt_addrinfo info; int err = 0, msgtype = RTM_MISS; KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); newrt = NULL; if (rh == NULL) goto miss; /* * Look up the address in the table for that Address Family */ RIB_RLOCK(rh); rn = rh->rnh_matchaddr(dst, &rh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { newrt = RNTORT(rn); RT_LOCK(newrt); RT_ADDREF(newrt); RIB_RUNLOCK(rh); return (newrt); } else RIB_RUNLOCK(rh); /* * Either we hit the root or couldn't find any match, * Which basically means * "caint get there frm here" */ miss: V_rtstat.rts_unreach++; if (report) { /* * If required, report the failure to the supervising * Authorities. * For a delete, this is not an error. (report == 0) */ bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; rt_missmsg_fib(msgtype, &info, 0, err, fibnum); } return (newrt); } /* * Remove a reference count from an rtentry. * If the count gets low enough, take it out of the routing table */ void rtfree(struct rtentry *rt) { struct rib_head *rnh; KASSERT(rt != NULL,("%s: NULL rt", __func__)); rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family); KASSERT(rnh != NULL,("%s: NULL rnh", __func__)); RT_LOCK_ASSERT(rt); /* * The callers should use RTFREE_LOCKED() or RTFREE(), so * we should come here exactly with the last reference. */ RT_REMREF(rt); if (rt->rt_refcnt > 0) { log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt); goto done; } /* * On last reference give the "close method" a chance * to cleanup private state. This also permits (for * IPv4 and IPv6) a chance to decide if the routing table * entry should be purged immediately or at a later time. * When an immediate purge is to happen the close routine * typically calls rtexpunge which clears the RTF_UP flag * on the entry so that the code below reclaims the storage. */ if (rt->rt_refcnt == 0 && rnh->rnh_close) rnh->rnh_close((struct radix_node *)rt, &rnh->head); /* * If we are no longer "up" (and ref == 0) * then we can free the resources associated * with the route. */ if ((rt->rt_flags & RTF_UP) == 0) { if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rtfree 2"); /* * the rtentry must have been removed from the routing table * so it is represented in rttrash.. remove that now. */ V_rttrash--; #ifdef DIAGNOSTIC if (rt->rt_refcnt < 0) { printf("rtfree: %p not freed (neg refs)\n", rt); goto done; } #endif /* * release references on items we hold them on.. * e.g other routes and ifaddrs. */ if (rt->rt_ifa) ifa_free(rt->rt_ifa); /* * The key is separatly alloc'd so free it (see rt_setgate()). * This also frees the gateway, as they are always malloc'd * together. */ R_Free(rt_key(rt)); /* * and the rtentry itself of course */ uma_zfree(V_rtzone, rt); return; } done: RT_UNLOCK(rt); } /* * Force a routing table entry to the specified * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. */ void rtredirect_fib(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { struct rtentry *rt; int error = 0; short *stat = NULL; struct rt_addrinfo info; struct ifaddr *ifa; struct rib_head *rnh; ifa = NULL; rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) { error = EAFNOSUPPORT; goto out; } /* verify the gateway is directly reachable */ if ((ifa = ifa_ifwithnet(gateway, 0, fibnum)) == NULL) { error = ENETUNREACH; goto out; } rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */ /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!(flags & RTF_DONE) && rt) { if (!sa_equal(src, rt->rt_gateway)) { error = EINVAL; goto done; } if (rt->rt_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) { error = EINVAL; goto done; } } if ((flags & RTF_GATEWAY) && ifa_ifwithaddr_check(gateway)) { error = EHOSTUNREACH; goto done; } /* * Create a new entry if we just got back a wildcard entry * or the lookup failed. This is necessary for hosts * which use routing redirects generated by smart gateways * to dynamically build the routing tables. */ if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2)) goto create; /* * Don't listen to the redirect if it's * for a route to an interface. */ if (rt->rt_flags & RTF_GATEWAY) { if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. * Create new route, rather than smashing route to net. */ create: if (rt != NULL) RTFREE_LOCKED(rt); flags |= RTF_DYNAMIC; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_ifa = ifa; info.rti_flags = flags; error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum); if (rt != NULL) { RT_LOCK(rt); flags = rt->rt_flags; } stat = &V_rtstat.rts_dynamic; } else { /* * Smash the current notion of the gateway to * this destination. Should check about netmask!!! */ if ((flags & RTF_GATEWAY) == 0) rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= RTF_MODIFIED; flags |= RTF_MODIFIED; stat = &V_rtstat.rts_newgateway; /* * add the key and gateway (in one malloc'd chunk). */ RT_UNLOCK(rt); RIB_WLOCK(rnh); RT_LOCK(rt); rt_setgate(rt, rt_key(rt), gateway); RIB_WUNLOCK(rnh); } } else error = EHOSTUNREACH; done: if (rt) RTFREE_LOCKED(rt); out: if (error) V_rtstat.rts_badredirect++; else if (stat != NULL) (*stat)++; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; info.rti_info[RTAX_AUTHOR] = src; rt_missmsg_fib(RTM_REDIRECT, &info, flags, error, fibnum); if (ifa != NULL) ifa_free(ifa); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, const struct sockaddr *dst, struct sockaddr *gateway, u_int fibnum) { struct ifaddr *ifa; int not_found = 0; if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct rtentry *rt = rtalloc1_fib(gateway, 0, 0, fibnum); if (rt == NULL) return (NULL); /* * dismiss a gateway that is reachable only * through the default router */ switch (gateway->sa_family) { case AF_INET: if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) not_found = 1; break; case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr)) not_found = 1; break; default: break; } if (!not_found && rt->rt_ifa != NULL) { ifa = rt->rt_ifa; ifa_ref(ifa); } RT_REMREF(rt); RT_UNLOCK(rt); if (not_found || ifa == NULL) return (NULL); } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; else ifa_free(oifa); } return (ifa); } /* * Do appropriate manipulations of a routing tree given * all the bits of info needed */ int rtrequest_fib(int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { struct rt_addrinfo info; if (dst->sa_len == 0) return(EINVAL); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = flags; info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; info.rti_info[RTAX_NETMASK] = netmask; return rtrequest1_fib(req, &info, ret_nrt, fibnum); } /* * Copy most of @rt data into @info. * * If @flags contains NHR_COPY, copies dst,netmask and gw to the * pointers specified by @info structure. Assume such pointers * are zeroed sockaddr-like structures with sa_len field initialized * to reflect size of the provided buffer. if no NHR_COPY is specified, * point dst,netmask and gw @info fields to appropriate @rt values. * * if @flags contains NHR_REF, do refcouting on rt_ifp. * * Returns 0 on success. */ int rt_exportinfo(struct rtentry *rt, struct rt_addrinfo *info, int flags) { struct rt_metrics *rmx; struct sockaddr *src, *dst; int sa_len; if (flags & NHR_COPY) { /* Copy destination if dst is non-zero */ src = rt_key(rt); dst = info->rti_info[RTAX_DST]; sa_len = src->sa_len; if (dst != NULL) { if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_DST; } /* Copy mask if set && dst is non-zero */ src = rt_mask(rt); dst = info->rti_info[RTAX_NETMASK]; if (src != NULL && dst != NULL) { /* * Radix stores different value in sa_len, * assume rt_mask() to have the same length * as rt_key() */ if (sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_NETMASK; } /* Copy gateway is set && dst is non-zero */ src = rt->rt_gateway; dst = info->rti_info[RTAX_GATEWAY]; if ((rt->rt_flags & RTF_GATEWAY) && src != NULL && dst != NULL){ if (src->sa_len > dst->sa_len) return (ENOMEM); memcpy(dst, src, src->sa_len); info->rti_addrs |= RTA_GATEWAY; } } else { info->rti_info[RTAX_DST] = rt_key(rt); info->rti_addrs |= RTA_DST; if (rt_mask(rt) != NULL) { info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_addrs |= RTA_NETMASK; } if (rt->rt_flags & RTF_GATEWAY) { info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; info->rti_addrs |= RTA_GATEWAY; } } rmx = info->rti_rmx; if (rmx != NULL) { info->rti_mflags |= RTV_MTU; rmx->rmx_mtu = rt->rt_mtu; } info->rti_flags = rt->rt_flags; info->rti_ifp = rt->rt_ifp; info->rti_ifa = rt->rt_ifa; if (flags & NHR_REF) { /* Do 'traditional' refcouting */ if_ref(info->rti_ifp); } return (0); } /* * Lookups up route entry for @dst in RIB database for fib @fibnum. * Exports entry data to @info using rt_exportinfo(). * * if @flags contains NHR_REF, refcouting is performed on rt_ifp. * All references can be released later by calling rib_free_info() * * Returns 0 on success. * Returns ENOENT for lookup failure, ENOMEM for export failure. */ int rib_lookup_info(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid, struct rt_addrinfo *info) { struct rib_head *rh; struct radix_node *rn; struct rtentry *rt; int error; KASSERT((fibnum < rt_numfibs), ("rib_lookup_rte: bad fibnum")); rh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rh == NULL) return (ENOENT); RIB_RLOCK(rh); rn = rh->rnh_matchaddr(__DECONST(void *, dst), &rh->head); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { rt = RNTORT(rn); /* Ensure route & ifp is UP */ if (RT_LINK_IS_UP(rt->rt_ifp)) { flags = (flags & NHR_REF) | NHR_COPY; error = rt_exportinfo(rt, info, flags); RIB_RUNLOCK(rh); return (error); } } RIB_RUNLOCK(rh); return (ENOENT); } /* * Releases all references acquired by rib_lookup_info() when * called with NHR_REF flags. */ void rib_free_info(struct rt_addrinfo *info) { if_rele(info->rti_ifp); } /* * Iterates over all existing fibs in system calling * @setwa_f function prior to traversing each fib. * Calls @wa_f function for each element in current fib. * If af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk(int af, rt_setwarg_t *setwa_f, rt_walktree_f_t *wa_f, void *arg) { struct rib_head *rnh; uint32_t fibnum; int i; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (af != AF_UNSPEC) { rnh = rt_tables_get_rnh(fibnum, af); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, af, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); continue; } for (i = 1; i <= AF_MAX; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; if (setwa_f != NULL) setwa_f(rnh, fibnum, i, arg); RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, (walktree_f_t *)wa_f,arg); RIB_WUNLOCK(rnh); } } } struct rt_delinfo { struct rt_addrinfo info; struct rib_head *rnh; struct rtentry *head; }; /* * Conditionally unlinks @rn from radix tree based * on info data passed in @arg. */ static int rt_checkdelroute(struct radix_node *rn, void *arg) { struct rt_delinfo *di; struct rt_addrinfo *info; struct rtentry *rt; int error; di = (struct rt_delinfo *)arg; rt = (struct rtentry *)rn; info = &di->info; error = 0; info->rti_info[RTAX_DST] = rt_key(rt); info->rti_info[RTAX_NETMASK] = rt_mask(rt); info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; rt = rt_unlinkrte(di->rnh, info, &error); if (rt == NULL) { /* Either not allowed or not matched. Skip entry */ return (0); } /* Entry was unlinked. Add to the list and return */ rt->rt_chain = di->head; di->head = rt; return (0); } /* * Iterates over all existing fibs in system. * Deletes each element for which @filter_f function returned * non-zero value. * If @af is not AF_UNSPEC, iterates over fibs in particular * address family. */ void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg) { struct rib_head *rnh; struct rt_delinfo di; struct rtentry *rt; uint32_t fibnum; int i, start, end; bzero(&di, sizeof(di)); di.info.rti_filter = filter_f; di.info.rti_filterdata = arg; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { /* Do we want some specific family? */ if (af != AF_UNSPEC) { start = af; end = af; } else { start = 1; end = AF_MAX; } for (i = start; i <= end; i++) { rnh = rt_tables_get_rnh(fibnum, i); if (rnh == NULL) continue; di.rnh = rnh; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di); RIB_WUNLOCK(rnh); if (di.head == NULL) continue; /* We might have something to reclaim */ while (di.head != NULL) { rt = di.head; di.head = rt->rt_chain; rt->rt_chain = NULL; /* TODO std rt -> rt_addrinfo export */ di.info.rti_info[RTAX_DST] = rt_key(rt); di.info.rti_info[RTAX_NETMASK] = rt_mask(rt); rt_notifydelete(rt, &di.info); RTFREE_LOCKED(rt); } } } } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rt pointer to rtentry * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated */ static int rt_ifdelroute(const struct rtentry *rt, void *arg) { struct ifnet *ifp = arg; if (rt->rt_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); return (1); } /* * Delete all remaining routes using this interface * Unfortuneatly the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ void rt_flushifroutes(struct ifnet *ifp) { rt_foreach_fib_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); } /* * Conditionally unlinks rtentry matching data inside @info from @rnh. * Returns unlinked, locked and referenced @rtentry on success, * Returns NULL and sets @perror to: * ESRCH - if prefix was not found, * EADDRINUSE - if trying to delete PINNED route without appropriate flag. * ENOENT - if supplied filter function returned 0 (not matched). */ static struct rtentry * rt_unlinkrte(struct rib_head *rnh, struct rt_addrinfo *info, int *perror) { struct sockaddr *dst, *netmask; struct rtentry *rt; struct radix_node *rn; dst = info->rti_info[RTAX_DST]; netmask = info->rti_info[RTAX_NETMASK]; rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head); if (rt == NULL) { *perror = ESRCH; return (NULL); } if ((info->rti_flags & RTF_PINNED) == 0) { /* Check if target route can be deleted */ if (rt->rt_flags & RTF_PINNED) { *perror = EADDRINUSE; return (NULL); } } if (info->rti_filter != NULL) { if (info->rti_filter(rt, info->rti_filterdata) == 0) { /* Not matched */ *perror = ENOENT; return (NULL); } /* * Filter function requested rte deletion. * Ease the caller work by filling in remaining info * from that particular entry. */ info->rti_info[RTAX_GATEWAY] = rt->rt_gateway; } /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ *perror = ESRCH; #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) rn = rt_mpath_unlink(rnh, info, rt, perror); else #endif rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); if (rn == NULL) return (NULL); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); rt = RNTORT(rn); RT_LOCK(rt); RT_ADDREF(rt); rt->rt_flags &= ~RTF_UP; *perror = 0; return (rt); } static void rt_notifydelete(struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa; /* * give the protocol a chance to keep things in sync. */ ifa = rt->rt_ifa; if (ifa != NULL && ifa->ifa_rtrequest != NULL) ifa->ifa_rtrequest(RTM_DELETE, rt, info); /* * One more rtentry floating around that is not * linked to the routing table. rttrash will be decremented * when RTFREE(rt) is eventually called. */ V_rttrash++; } /* * These (questionable) definitions of apparent local variables apply * to the next two functions. XXXXXX!!! */ #define dst info->rti_info[RTAX_DST] #define gateway info->rti_info[RTAX_GATEWAY] #define netmask info->rti_info[RTAX_NETMASK] #define ifaaddr info->rti_info[RTAX_IFA] #define ifpaddr info->rti_info[RTAX_IFP] #define flags info->rti_flags /* * Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined, * it will be referenced so the caller must free it. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { struct ifaddr *ifa; int error = 0; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ if (info->rti_ifp == NULL && ifpaddr != NULL && ifpaddr->sa_family == AF_LINK && (ifa = ifa_ifwithnet(ifpaddr, 0, fibnum)) != NULL) { info->rti_ifp = ifa->ifa_ifp; ifa_free(ifa); } if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if (info->rti_ifa == NULL) { struct sockaddr *sa; sa = ifaaddr != NULL ? ifaaddr : (gateway != NULL ? gateway : dst); if (sa != NULL && info->rti_ifp != NULL) info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if ((ifa = info->rti_ifa) != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = ifa->ifa_ifp; } else error = ENETUNREACH; return (error); } static int if_updatemtu_cb(struct radix_node *rn, void *arg) { struct rtentry *rt; struct if_mtuinfo *ifmtu; rt = (struct rtentry *)rn; ifmtu = (struct if_mtuinfo *)arg; if (rt->rt_ifp != ifmtu->ifp) return (0); if (rt->rt_mtu >= ifmtu->mtu) { /* We have to decrease mtu regardless of flags */ rt->rt_mtu = ifmtu->mtu; return (0); } /* * New MTU is bigger. Check if are allowed to alter it */ if ((rt->rt_flags & (RTF_FIXEDMTU | RTF_GATEWAY | RTF_HOST)) != 0) { /* * Skip routes with user-supplied MTU and * non-interface routes */ return (0); } /* We are safe to update route MTU */ rt->rt_mtu = ifmtu->mtu; return (0); } void rt_updatemtu(struct ifnet *ifp) { struct if_mtuinfo ifmtu; struct rib_head *rnh; int i, j; ifmtu.ifp = ifp; /* * Try to update rt_mtu for all routes using this interface * Unfortunately the only way to do this is to traverse all * routing tables in all fibs/domains. */ for (i = 1; i <= AF_MAX; i++) { ifmtu.mtu = if_getmtu_family(ifp, i); for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; RIB_WLOCK(rnh); rnh->rnh_walktree(&rnh->head, if_updatemtu_cb, &ifmtu); RIB_WUNLOCK(rnh); } } } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, rt->rt_gateway); } return (i); } #endif #ifdef RADIX_MPATH /* * Deletes key for single-path routes, unlinks rtentry with * gateway specified in @info from multi-path routes. * * Returnes unlinked entry. In case of failure, returns NULL * and sets @perror to ESRCH. */ static struct radix_node * rt_mpath_unlink(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry *rto, int *perror) { /* * if we got multipath routes, we require users to specify * a matching RTAX_GATEWAY. */ struct rtentry *rt; // *rto = NULL; struct radix_node *rn; struct sockaddr *gw; gw = info->rti_info[RTAX_GATEWAY]; rt = rt_mpath_matchgate(rto, gw); if (rt == NULL) { *perror = ESRCH; return (NULL); } /* * this is the first entry in the chain */ if (rto == rt) { rn = rn_mpath_next((struct radix_node *)rt); /* * there is another entry, now it's active */ if (rn) { rto = RNTORT(rn); RT_LOCK(rto); rto->rt_flags |= RTF_UP; RT_UNLOCK(rto); } else if (rt->rt_flags & RTF_GATEWAY) { /* * For gateway routes, we need to * make sure that we we are deleting * the correct gateway. * rt_mpath_matchgate() does not * check the case when there is only * one route in the chain. */ if (gw && (rt->rt_gateway->sa_len != gw->sa_len || memcmp(rt->rt_gateway, gw, gw->sa_len))) { *perror = ESRCH; return (NULL); } } /* * use the normal delete code to remove * the first entry */ rn = rnh->rnh_deladdr(dst, netmask, &rnh->head); *perror = 0; return (rn); } /* * if the entry is 2nd and on up */ if (rt_mpath_deldup(rto, rt) == 0) panic ("rtrequest1: rt_mpath_deldup"); *perror = 0; rn = (struct radix_node *)rt; return (rn); } #endif #ifdef FLOWTABLE static struct rtentry * rt_flowtable_check_route(struct rib_head *rnh, struct rt_addrinfo *info) { #if defined(INET6) || defined(INET) struct radix_node *rn; #endif struct rtentry *rt0; rt0 = NULL; /* "flow-table" only supports IPv6 and IPv4 at the moment. */ switch (dst->sa_family) { #ifdef INET6 case AF_INET6: #endif #ifdef INET case AF_INET: #endif #if defined(INET6) || defined(INET) rn = rnh->rnh_matchaddr(dst, &rnh->head); if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) { struct sockaddr *mask; u_char *m, *n; int len; /* * compare mask to see if the new route is * more specific than the existing one */ rt0 = RNTORT(rn); RT_LOCK(rt0); RT_ADDREF(rt0); RT_UNLOCK(rt0); /* * A host route is already present, so * leave the flow-table entries as is. */ if (rt0->rt_flags & RTF_HOST) { RTFREE(rt0); rt0 = NULL; } else if (!(flags & RTF_HOST) && netmask) { mask = rt_mask(rt0); len = mask->sa_len; m = (u_char *)mask; n = (u_char *)netmask; while (len-- > 0) { if (*n != *m) break; n++; m++; } if (len == 0 || (*n < *m)) { RTFREE(rt0); rt0 = NULL; } } } #endif/* INET6 || INET */ } return (rt0); } #endif int rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { int error = 0; struct rtentry *rt, *rt_old; #ifdef FLOWTABLE struct rtentry *rt0; #endif struct radix_node *rn; struct rib_head *rnh; struct ifaddr *ifa; struct sockaddr *ndst; struct sockaddr_storage mdst; KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum")); KASSERT((flags & RTF_RNH_LOCKED) == 0, ("rtrequest1_fib: locked")); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } /* * Find the correct routing tree to use for this Address Family */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) return (EAFNOSUPPORT); /* * If we are adding a host route then we don't want to put * a netmask in the tree, nor do we want to clone it. */ if (flags & RTF_HOST) netmask = NULL; switch (req) { case RTM_DELETE: if (netmask) { rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask); dst = (struct sockaddr *)&mdst; } RIB_WLOCK(rnh); rt = rt_unlinkrte(rnh, info, &error); RIB_WUNLOCK(rnh); if (error != 0) return (error); rt_notifydelete(rt, info); /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be * doing it. */ if (ret_nrt) { *ret_nrt = rt; RT_UNLOCK(rt); } else RTFREE_LOCKED(rt); break; case RTM_RESOLVE: /* * resolve was only used for route cloning * here for compat */ break; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) return (EINVAL); if (dst && gateway && (dst->sa_family != gateway->sa_family) && (gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK)) return (EINVAL); if (info->rti_ifa == NULL) { error = rt_getifa_fib(info, fibnum); if (error) return (error); } else ifa_ref(info->rti_ifa); ifa = info->rti_ifa; rt = uma_zalloc(V_rtzone, M_NOWAIT); if (rt == NULL) { ifa_free(ifa); return (ENOBUFS); } rt->rt_flags = RTF_UP | flags; rt->rt_fibnum = fibnum; /* * Add the gateway. Possibly re-malloc-ing the storage for it. */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { ifa_free(ifa); uma_zfree(V_rtzone, rt); return (error); } /* * point to the (possibly newly malloc'd) dest address. */ ndst = (struct sockaddr *)rt_key(rt); /* * make sure it contains the value we want (masked if needed). */ if (netmask) { rt_maskedcopy(dst, ndst, netmask); } else bcopy(dst, ndst, dst->sa_len); /* * We use the ifa reference returned by rt_getifa_fib(). * This moved from below so that rnh->rnh_addaddr() can * examine the ifa and ifa->ifa_ifp if it so desires. */ rt->rt_ifa = ifa; rt->rt_ifp = ifa->ifa_ifp; rt->rt_weight = 1; rt_setmetrics(info, rt); RIB_WLOCK(rnh); RT_LOCK(rt); #ifdef RADIX_MPATH /* do not permit exactly the same dst/mask/gw pair */ if (rt_mpath_capable(rnh) && rt_mpath_conflict(rnh, rt, netmask)) { RIB_WUNLOCK(rnh); ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); return (EEXIST); } #endif #ifdef FLOWTABLE rt0 = rt_flowtable_check_route(rnh, info); #endif /* FLOWTABLE */ /* XXX mtu manipulation will be done in rnh_addaddr -- itojun */ rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); rt_old = NULL; if (rn == NULL && (info->rti_flags & RTF_PINNED) != 0) { /* * Force removal and re-try addition * TODO: better multipath&pinned support */ struct sockaddr *info_dst = info->rti_info[RTAX_DST]; info->rti_info[RTAX_DST] = ndst; /* Do not delete existing PINNED(interface) routes */ info->rti_flags &= ~RTF_PINNED; rt_old = rt_unlinkrte(rnh, info, &error); info->rti_flags |= RTF_PINNED; info->rti_info[RTAX_DST] = info_dst; if (rt_old != NULL) rn = rnh->rnh_addaddr(ndst, netmask, &rnh->head, rt->rt_nodes); } RIB_WUNLOCK(rnh); if (rt_old != NULL) RT_UNLOCK(rt_old); /* * If it still failed to go into the tree, * then un-make it (this should be a function) */ if (rn == NULL) { ifa_free(rt->rt_ifa); R_Free(rt_key(rt)); uma_zfree(V_rtzone, rt); #ifdef FLOWTABLE if (rt0 != NULL) RTFREE(rt0); #endif return (EEXIST); } #ifdef FLOWTABLE else if (rt0 != NULL) { flowtable_route_flush(dst->sa_family, rt0); RTFREE(rt0); } #endif if (rt_old != NULL) { rt_notifydelete(rt_old, info); RTFREE(rt_old); } /* * If this protocol has something to add to this then * allow it to do that as well. */ if (ifa->ifa_rtrequest) ifa->ifa_rtrequest(req, rt, info); /* * actually return a resultant rtentry and * give the caller a single reference. */ if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } + rnh->rnh_gen++; /* Routing table updated */ RT_UNLOCK(rt); break; case RTM_CHANGE: RIB_WLOCK(rnh); error = rtrequest1_fib_change(rnh, info, ret_nrt, fibnum); RIB_WUNLOCK(rnh); break; default: error = EOPNOTSUPP; } return (error); } #undef dst #undef gateway #undef netmask #undef ifaaddr #undef ifpaddr #undef flags static int rtrequest1_fib_change(struct rib_head *rnh, struct rt_addrinfo *info, struct rtentry **ret_nrt, u_int fibnum) { struct rtentry *rt = NULL; int error = 0; int free_ifa = 0; int family, mtu; struct if_mtuinfo ifmtu; rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rt == NULL) return (ESRCH); #ifdef RADIX_MPATH /* * If we got multipath routes, * we require users to specify a matching RTAX_GATEWAY. */ if (rt_mpath_capable(rnh)) { rt = rt_mpath_matchgate(rt, info->rti_info[RTAX_GATEWAY]); if (rt == NULL) return (ESRCH); } #endif RT_LOCK(rt); rt_setmetrics(info, rt); /* * New gateway could require new ifaddr, ifp; * flags may also be different; ifp may be specified * by ll sockaddr when protocol address is ambiguous */ if (((rt->rt_flags & RTF_GATEWAY) && info->rti_info[RTAX_GATEWAY] != NULL) || info->rti_info[RTAX_IFP] != NULL || (info->rti_info[RTAX_IFA] != NULL && !sa_equal(info->rti_info[RTAX_IFA], rt->rt_ifa->ifa_addr))) { error = rt_getifa_fib(info, fibnum); if (info->rti_ifa != NULL) free_ifa = 1; if (error != 0) goto bad; } /* Check if outgoing interface has changed */ if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa && rt->rt_ifa != NULL && rt->rt_ifa->ifa_rtrequest != NULL) { rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, info); ifa_free(rt->rt_ifa); } /* Update gateway address */ if (info->rti_info[RTAX_GATEWAY] != NULL) { error = rt_setgate(rt, rt_key(rt), info->rti_info[RTAX_GATEWAY]); if (error != 0) goto bad; rt->rt_flags &= ~RTF_GATEWAY; rt->rt_flags |= (RTF_GATEWAY & info->rti_flags); } if (info->rti_ifa != NULL && info->rti_ifa != rt->rt_ifa) { ifa_ref(info->rti_ifa); rt->rt_ifa = info->rti_ifa; rt->rt_ifp = info->rti_ifp; } /* Allow some flags to be toggled on change. */ rt->rt_flags &= ~RTF_FMASK; rt->rt_flags |= info->rti_flags & RTF_FMASK; if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest != NULL) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info); /* Alter route MTU if necessary */ if (rt->rt_ifp != NULL) { family = info->rti_info[RTAX_DST]->sa_family; mtu = if_getmtu_family(rt->rt_ifp, family); /* Set default MTU */ if (rt->rt_mtu == 0) rt->rt_mtu = mtu; if (rt->rt_mtu != mtu) { /* Check if we really need to update */ ifmtu.ifp = rt->rt_ifp; ifmtu.mtu = mtu; if_updatemtu_cb(rt->rt_nodes, &ifmtu); } } if (ret_nrt) { *ret_nrt = rt; RT_ADDREF(rt); } bad: RT_UNLOCK(rt); if (free_ifa != 0) ifa_free(info->rti_ifa); return (error); } static void rt_setmetrics(const struct rt_addrinfo *info, struct rtentry *rt) { if (info->rti_mflags & RTV_MTU) { if (info->rti_rmx->rmx_mtu != 0) { /* * MTU was explicitly provided by user. * Keep it. */ rt->rt_flags |= RTF_FIXEDMTU; } else { /* * User explicitly sets MTU to 0. * Assume rollback to default. */ rt->rt_flags &= ~RTF_FIXEDMTU; } rt->rt_mtu = info->rti_rmx->rmx_mtu; } if (info->rti_mflags & RTV_WEIGHT) rt->rt_weight = info->rti_rmx->rmx_weight; /* Kernel -> userland timebase conversion. */ if (info->rti_mflags & RTV_EXPIRE) rt->rt_expire = info->rti_rmx->rmx_expire ? info->rti_rmx->rmx_expire - time_second + time_uptime : 0; } int rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { /* XXX dst may be overwritten, can we move this to below */ int dlen = SA_SIZE(dst), glen = SA_SIZE(gate); /* * Prepare to store the gateway in rt->rt_gateway. * Both dst and gateway are stored one after the other in the same * malloc'd chunk. If we have room, we can reuse the old buffer, * rt_gateway already points to the right place. * Otherwise, malloc a new block and update the 'dst' address. */ if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) { caddr_t new; R_Malloc(new, caddr_t, dlen + glen); if (new == NULL) return ENOBUFS; /* * XXX note, we copy from *dst and not *rt_key(rt) because * rt_setgate() can be called to initialize a newly * allocated route entry, in which case rt_key(rt) == NULL * (and also rt->rt_gateway == NULL). * Free()/free() handle a NULL argument just fine. */ bcopy(dst, new, dlen); R_Free(rt_key(rt)); /* free old block, if any */ rt_key(rt) = (struct sockaddr *)new; rt->rt_gateway = (struct sockaddr *)(new + dlen); } /* * Copy the new gateway value into the memory chunk. */ bcopy(gate, rt->rt_gateway, glen); return (0); } void rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask) { u_char *cp1 = (u_char *)src; u_char *cp2 = (u_char *)dst; u_char *cp3 = (u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Set up a routing table entry, normally * for an interface. */ #define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */ static inline int rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum) { struct sockaddr *dst; struct sockaddr *netmask; struct rtentry *rt = NULL; struct rt_addrinfo info; int error = 0; int startfib, endfib; char tempbuf[_SOCKADDR_TMPSIZE]; int didwork = 0; int a_failure = 0; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; struct rib_head *rnh; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; netmask = NULL; } else { dst = ifa->ifa_addr; netmask = ifa->ifa_netmask; } if (dst->sa_len == 0) return(EINVAL); switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We support multiple FIBs. */ break; default: fibnum = RT_DEFAULT_FIB; break; } if (fibnum == RT_ALL_FIBS) { if (V_rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) startfib = endfib = ifa->ifa_ifp->if_fib; else { startfib = 0; endfib = rt_numfibs - 1; } } else { KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum")); startfib = fibnum; endfib = fibnum; } /* * If it's a delete, check that if it exists, * it's on the correct interface or we might scrub * a route to another ifa which would * be confusing at best and possibly worse. */ if (cmd == RTM_DELETE) { /* * It's a delete, so it should already exist.. * If it's a net, mask off the host bits * (Assuming we have a mask) * XXX this is kinda inet specific.. */ if (netmask != NULL) { rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask); dst = (struct sockaddr *)tempbuf; } } /* * Now go through all the requested tables (fibs) and do the * requested action. Realistically, this will either be fib 0 * for protocols that don't do multiple tables or all the * tables for those that do. */ for ( fibnum = startfib; fibnum <= endfib; fibnum++) { if (cmd == RTM_DELETE) { struct radix_node *rn; /* * Look up an rtentry that is in the routing tree and * contains the correct info. */ rnh = rt_tables_get_rnh(fibnum, dst->sa_family); if (rnh == NULL) /* this table doesn't exist but others might */ continue; RIB_RLOCK(rnh); rn = rnh->rnh_lookup(dst, netmask, &rnh->head); #ifdef RADIX_MPATH if (rt_mpath_capable(rnh)) { if (rn == NULL) error = ESRCH; else { rt = RNTORT(rn); /* * for interface route the * rt->rt_gateway is sockaddr_intf * for cloning ARP entries, so * rt_mpath_matchgate must use the * interface address */ rt = rt_mpath_matchgate(rt, ifa->ifa_addr); if (rt == NULL) error = ESRCH; } } #endif error = (rn == NULL || (rn->rn_flags & RNF_ROOT) || RNTORT(rn)->rt_ifa != ifa); RIB_RUNLOCK(rnh); if (error) { /* this is only an error if bad on ALL tables */ continue; } } /* * Do the actual request */ bzero((caddr_t)&info, sizeof(info)); info.rti_ifa = ifa; info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF) | RTF_PINNED; info.rti_info[RTAX_DST] = dst; /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; else info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = netmask; error = rtrequest1_fib(cmd, &info, &rt, fibnum); if (error == 0 && rt != NULL) { /* * notify any listening routing agents of the change */ RT_LOCK(rt); #ifdef RADIX_MPATH /* * in case address alias finds the first address * e.g. ifconfig bge0 192.0.2.246/24 * e.g. ifconfig bge0 192.0.2.247/24 * the address set in the route is 192.0.2.246 * so we need to replace it with 192.0.2.247 */ if (memcmp(rt->rt_ifa->ifa_addr, ifa->ifa_addr, ifa->ifa_addr->sa_len)) { ifa_free(rt->rt_ifa); ifa_ref(ifa); rt->rt_ifp = ifa->ifa_ifp; rt->rt_ifa = ifa; } #endif /* * doing this for compatibility reasons */ if (cmd == RTM_ADD) { ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = rt->rt_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = rt->rt_ifp->if_index; } RT_ADDREF(rt); RT_UNLOCK(rt); rt_newaddrmsg_fib(cmd, ifa, error, rt, fibnum); RT_LOCK(rt); RT_REMREF(rt); if (cmd == RTM_DELETE) { /* * If we are deleting, and we found an entry, * then it's been removed from the tree.. * now throw it away. */ RTFREE_LOCKED(rt); } else { if (cmd == RTM_ADD) { /* * We just wanted to add it.. * we don't actually need a reference. */ RT_REMREF(rt); } RT_UNLOCK(rt); } didwork = 1; } if (error) a_failure = error; } if (cmd == RTM_DELETE) { if (didwork) { error = 0; } else { /* we only give an error if it wasn't in any table */ error = ((flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); } } else { if (a_failure) { /* return an error if any of them failed */ error = a_failure; } } return (error); } /* * Set up a routing table entry, normally * for an interface. */ int rtinit(struct ifaddr *ifa, int cmd, int flags) { struct sockaddr *dst; int fib = RT_DEFAULT_FIB; if (flags & RTF_HOST) { dst = ifa->ifa_dstaddr; } else { dst = ifa->ifa_addr; } switch (dst->sa_family) { case AF_INET6: case AF_INET: /* We do support multiple FIBs. */ fib = RT_ALL_FIBS; break; } return (rtinit1(ifa, cmd, flags, fib)); } /* * Announce interface address arrival/withdraw * Returns 0 on success. */ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); #if defined(INET) || defined(INET6) #ifdef SCTP /* * notify the SCTP stack * this will only get called when an address is added/deleted * XXX pass the ifaddr struct instead if ifa->ifa_addr... */ sctp_addr_change(ifa, cmd); #endif /* SCTP */ #endif return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce route addition/removal. * Users of this function MUST validate input data BEFORE calling. * However we have to be able to handle invalid data: * if some userland app sends us "invalid" route message (invalid mask, * no dst, wrong address families, etc...) we need to pass it back * to app (and any other rtsock consumers) with rtm_errno field set to * non-zero value. * Returns 0 on success. */ int rt_routemsg(int cmd, struct ifnet *ifp, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, ifp, error, rt, fibnum)); } void rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) { rt_newaddrmsg_fib(cmd, ifa, error, rt, RT_ALL_FIBS); } /* * This is called to generate messages from the routing socket * indicating a network interface has had addresses associated with it. */ void rt_newaddrmsg_fib(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %u", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); if (cmd == RTM_ADD) { rt_addrmsg(cmd, ifa, fibnum); if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); } else { if (rt != NULL) rt_routemsg(cmd, ifa->ifa_ifp, error, rt, fibnum); rt_addrmsg(cmd, ifa, fibnum); } } Index: head/sys/net/route.h =================================================================== --- head/sys/net/route.h (revision 297224) +++ head/sys/net/route.h (revision 297225) @@ -1,465 +1,488 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NET_ROUTE_H_ #define _NET_ROUTE_H_ #include #include /* * Kernel resident routing tables. * * The routing tables are initialized when interface addresses * are set by making entries for all directly connected interfaces. */ /* * Struct route consiste of a destination address, * a route entry pointer, link-layer prepend data pointer along * with its length. */ struct route { struct rtentry *ro_rt; char *ro_prepend; uint16_t ro_plen; uint16_t ro_flags; uint16_t ro_mtu; /* saved ro_rt mtu */ uint16_t spare; struct sockaddr ro_dst; }; #define RT_L2_ME_BIT 2 /* dst L2 addr is our address */ #define RT_MAY_LOOP_BIT 3 /* dst may require loop copy */ #define RT_HAS_HEADER_BIT 4 /* mbuf already have its header prepended */ #define RT_CACHING_CONTEXT 0x1 /* XXX: not used anywhere */ #define RT_NORTREF 0x2 /* doesn't hold reference on ro_rt */ #define RT_L2_ME (1 << RT_L2_ME_BIT) /* 0x0004 */ #define RT_MAY_LOOP (1 << RT_MAY_LOOP_BIT) /* 0x0008 */ #define RT_HAS_HEADER (1 << RT_HAS_HEADER_BIT) /* 0x0010 */ #define RT_REJECT 0x0020 /* Destination is reject */ #define RT_BLACKHOLE 0x0040 /* Destination is blackhole */ #define RT_HAS_GW 0x0080 /* Destination has GW */ struct rt_metrics { u_long rmx_locks; /* Kernel must leave these values alone */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_hopcount; /* max hops expected */ u_long rmx_expire; /* lifetime for route, e.g. redirect */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_pksent; /* packets sent using this route */ u_long rmx_weight; /* route weight */ u_long rmx_filler[3]; /* will be used for T/TCP later */ }; /* * rmx_rtt and rmx_rttvar are stored as microseconds; * RTTTOPRHZ(rtt) converts to a value suitable for use * by a protocol slowtimo counter. */ #define RTM_RTTUNIT 1000000 /* units for rtt, rttvar, as units per sec */ #define RTTTOPRHZ(r) ((r) / (RTM_RTTUNIT / PR_SLOWHZ)) /* lle state is exported in rmx_state rt_metrics field */ #define rmx_state rmx_weight +/* + * Keep a generation count of routing table, incremented on route addition, + * so we can invalidate caches. This is accessed without a lock, as precision + * is not required. + */ +typedef volatile u_int rt_gen_t; /* tree generation (for adds) */ +#define RT_GEN(fibnum, af) rt_tables_get_gen(fibnum, af) + #define RT_DEFAULT_FIB 0 /* Explicitly mark fib=0 restricted cases */ #define RT_ALL_FIBS -1 /* Announce event for every fib */ #ifdef _KERNEL extern u_int rt_numfibs; /* number of usable routing tables */ VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */ #define V_rt_add_addr_allfibs VNET(rt_add_addr_allfibs) #endif /* * We distinguish between routes to hosts and routes to networks, * preferring the former if available. For each route we infer * the interface to use from the gateway address supplied when * the route was entered. Routes that forward packets through * gateways are marked so that the output routines know to address the * gateway rather than the ultimate destination. */ #ifndef RNF_NORMAL #include #ifdef RADIX_MPATH #include #endif #endif #if defined(_KERNEL) || defined(_WANT_RTENTRY) struct rtentry { struct radix_node rt_nodes[2]; /* tree glue, and other values */ /* * XXX struct rtentry must begin with a struct radix_node (or two!) * because the code does some casts of a 'struct radix_node *' * to a 'struct rtentry *' */ #define rt_key(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_key))) #define rt_mask(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_mask))) struct sockaddr *rt_gateway; /* value */ struct ifnet *rt_ifp; /* the answer: interface to use */ struct ifaddr *rt_ifa; /* the answer: interface address to use */ int rt_flags; /* up/down?, host/net */ int rt_refcnt; /* # held references */ u_int rt_fibnum; /* which FIB */ u_long rt_mtu; /* MTU for this path */ u_long rt_weight; /* absolute weight */ u_long rt_expire; /* lifetime for route, e.g. redirect */ #define rt_endzero rt_pksent counter_u64_t rt_pksent; /* packets sent using this route */ struct mtx rt_mtx; /* mutex for routing entry */ struct rtentry *rt_chain; /* pointer to next rtentry to delete */ }; #endif /* _KERNEL || _WANT_RTENTRY */ #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ #define RTF_HOST 0x4 /* host entry (net otherwise) */ #define RTF_REJECT 0x8 /* host or net unreachable */ #define RTF_DYNAMIC 0x10 /* created dynamically (by redirect) */ #define RTF_MODIFIED 0x20 /* modified dynamically (by redirect) */ #define RTF_DONE 0x40 /* message confirmed */ /* 0x80 unused, was RTF_DELCLONE */ /* 0x100 unused, was RTF_CLONING */ #define RTF_XRESOLVE 0x200 /* external daemon resolves name */ #define RTF_LLINFO 0x400 /* DEPRECATED - exists ONLY for backward compatibility */ #define RTF_LLDATA 0x400 /* used by apps to add/del L2 entries */ #define RTF_STATIC 0x800 /* manually added */ #define RTF_BLACKHOLE 0x1000 /* just discard pkts (during updates) */ #define RTF_PROTO2 0x4000 /* protocol specific routing flag */ #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ /* 0x10000 unused, was RTF_PRCLONING */ /* 0x20000 unused, was RTF_WASCLONED */ #define RTF_PROTO3 0x40000 /* protocol specific routing flag */ #define RTF_FIXEDMTU 0x80000 /* MTU was explicitly specified */ #define RTF_PINNED 0x100000 /* route is immutable */ #define RTF_LOCAL 0x200000 /* route represents a local address */ #define RTF_BROADCAST 0x400000 /* route represents a bcast address */ #define RTF_MULTICAST 0x800000 /* route represents a mcast address */ /* 0x8000000 and up unassigned */ #define RTF_STICKY 0x10000000 /* always route dst->src */ #define RTF_RNH_LOCKED 0x40000000 /* unused */ #define RTF_GWFLAG_COMPAT 0x80000000 /* a compatibility bit for interacting with existing routing apps */ /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */ #define RTF_FMASK \ (RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \ RTF_REJECT | RTF_STATIC | RTF_STICKY) /* * fib_ nexthop API flags. */ /* Consumer-visible nexthop info flags */ #define NHF_REJECT 0x0010 /* RTF_REJECT */ #define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ #define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ #define NHF_DEFAULT 0x0080 /* Default route */ #define NHF_BROADCAST 0x0100 /* RTF_BROADCAST */ #define NHF_GATEWAY 0x0200 /* RTF_GATEWAY */ /* Nexthop request flags */ #define NHR_IFAIF 0x01 /* Return ifa_ifp interface */ #define NHR_REF 0x02 /* For future use */ /* Control plane route request flags */ #define NHR_COPY 0x100 /* Copy rte data */ #ifdef _KERNEL /* rte<>ro_flags translation */ static inline void rt_update_ro_flags(struct route *ro) { int rt_flags = ro->ro_rt->rt_flags; ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW); ro->ro_flags |= (rt_flags & RTF_REJECT) ? RT_REJECT : 0; ro->ro_flags |= (rt_flags & RTF_BLACKHOLE) ? RT_BLACKHOLE : 0; ro->ro_flags |= (rt_flags & RTF_GATEWAY) ? RT_HAS_GW : 0; } #endif /* * Routing statistics. */ struct rtstat { short rts_badredirect; /* bogus redirect calls */ short rts_dynamic; /* routes created by redirects */ short rts_newgateway; /* routes modified by redirects */ short rts_unreach; /* lookups which failed */ short rts_wildcard; /* lookups satisfied by a wildcard */ }; /* * Structures for routing messages. */ struct rt_msghdr { u_short rtm_msglen; /* to skip over non-understood messages */ u_char rtm_version; /* future binary compatibility */ u_char rtm_type; /* message type */ u_short rtm_index; /* index for associated ifp */ int rtm_flags; /* flags, incl. kern & message, e.g. DONE */ int rtm_addrs; /* bitmask identifying sockaddrs in msg */ pid_t rtm_pid; /* identify sender */ int rtm_seq; /* for sender to identify action */ int rtm_errno; /* why failed */ int rtm_fmask; /* bitmask used in RTM_CHANGE message */ u_long rtm_inits; /* which metrics we are initializing */ struct rt_metrics rtm_rmx; /* metrics themselves */ }; #define RTM_VERSION 5 /* Up the ante and ignore older versions */ /* * Message types. */ #define RTM_ADD 0x1 /* Add Route */ #define RTM_DELETE 0x2 /* Delete Route */ #define RTM_CHANGE 0x3 /* Change Metrics or flags */ #define RTM_GET 0x4 /* Report Metrics */ #define RTM_LOSING 0x5 /* Kernel Suspects Partitioning */ #define RTM_REDIRECT 0x6 /* Told to use different route */ #define RTM_MISS 0x7 /* Lookup failed on this address */ #define RTM_LOCK 0x8 /* fix specified metrics */ /* 0x9 */ /* 0xa */ #define RTM_RESOLVE 0xb /* req to resolve dst to LL addr */ #define RTM_NEWADDR 0xc /* address being added to iface */ #define RTM_DELADDR 0xd /* address being removed from iface */ #define RTM_IFINFO 0xe /* iface going up/down etc. */ #define RTM_NEWMADDR 0xf /* mcast group membership being added to if */ #define RTM_DELMADDR 0x10 /* mcast group membership being deleted */ #define RTM_IFANNOUNCE 0x11 /* iface arrival/departure */ #define RTM_IEEE80211 0x12 /* IEEE80211 wireless event */ /* * Bitmask values for rtm_inits and rmx_locks. */ #define RTV_MTU 0x1 /* init or lock _mtu */ #define RTV_HOPCOUNT 0x2 /* init or lock _hopcount */ #define RTV_EXPIRE 0x4 /* init or lock _expire */ #define RTV_RPIPE 0x8 /* init or lock _recvpipe */ #define RTV_SPIPE 0x10 /* init or lock _sendpipe */ #define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */ #define RTV_RTT 0x40 /* init or lock _rtt */ #define RTV_RTTVAR 0x80 /* init or lock _rttvar */ #define RTV_WEIGHT 0x100 /* init or lock _weight */ /* * Bitmask values for rtm_addrs. */ #define RTA_DST 0x1 /* destination sockaddr present */ #define RTA_GATEWAY 0x2 /* gateway sockaddr present */ #define RTA_NETMASK 0x4 /* netmask sockaddr present */ #define RTA_GENMASK 0x8 /* cloning mask sockaddr present */ #define RTA_IFP 0x10 /* interface name sockaddr present */ #define RTA_IFA 0x20 /* interface addr sockaddr present */ #define RTA_AUTHOR 0x40 /* sockaddr for author of redirect */ #define RTA_BRD 0x80 /* for NEWADDR, broadcast or p-p dest addr */ /* * Index offsets for sockaddr array for alternate internal encoding. */ #define RTAX_DST 0 /* destination sockaddr present */ #define RTAX_GATEWAY 1 /* gateway sockaddr present */ #define RTAX_NETMASK 2 /* netmask sockaddr present */ #define RTAX_GENMASK 3 /* cloning mask sockaddr present */ #define RTAX_IFP 4 /* interface name sockaddr present */ #define RTAX_IFA 5 /* interface addr sockaddr present */ #define RTAX_AUTHOR 6 /* sockaddr for author of redirect */ #define RTAX_BRD 7 /* for NEWADDR, broadcast or p-p dest addr */ #define RTAX_MAX 8 /* size of array to allocate */ typedef int rt_filter_f_t(const struct rtentry *, void *); struct rt_addrinfo { int rti_addrs; /* Route RTF_ flags */ int rti_flags; /* Route RTF_ flags */ struct sockaddr *rti_info[RTAX_MAX]; /* Sockaddr data */ struct ifaddr *rti_ifa; /* value of rt_ifa addr */ struct ifnet *rti_ifp; /* route interface */ rt_filter_f_t *rti_filter; /* filter function */ void *rti_filterdata; /* filter paramenters */ u_long rti_mflags; /* metrics RTV_ flags */ u_long rti_spare; /* Will be used for fib */ struct rt_metrics *rti_rmx; /* Pointer to route metrics */ }; /* * This macro returns the size of a struct sockaddr when passed * through a routing socket. Basically we round up sa_len to * a multiple of sizeof(long), with a minimum of sizeof(long). * The check for a NULL pointer is just a convenience, probably never used. * The case sa_len == 0 should only apply to empty structures. */ #define SA_SIZE(sa) \ ( (!(sa) || ((struct sockaddr *)(sa))->sa_len == 0) ? \ sizeof(long) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(long) - 1) ) ) #define sa_equal(a, b) ( \ (((const struct sockaddr *)(a))->sa_len == ((const struct sockaddr *)(b))->sa_len) && \ (bcmp((a), (b), ((const struct sockaddr *)(b))->sa_len) == 0)) #ifdef _KERNEL #define RT_LINK_IS_UP(ifp) (!((ifp)->if_capabilities & IFCAP_LINKSTATE) \ || (ifp)->if_link_state == LINK_STATE_UP) #define RT_LOCK_INIT(_rt) \ mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK) #define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx) #define RT_UNLOCK(_rt) mtx_unlock(&(_rt)->rt_mtx) #define RT_LOCK_DESTROY(_rt) mtx_destroy(&(_rt)->rt_mtx) #define RT_LOCK_ASSERT(_rt) mtx_assert(&(_rt)->rt_mtx, MA_OWNED) #define RT_UNLOCK_COND(_rt) do { \ if (mtx_owned(&(_rt)->rt_mtx)) \ mtx_unlock(&(_rt)->rt_mtx); \ } while (0) #define RT_ADDREF(_rt) do { \ RT_LOCK_ASSERT(_rt); \ KASSERT((_rt)->rt_refcnt >= 0, \ ("negative refcnt %d", (_rt)->rt_refcnt)); \ (_rt)->rt_refcnt++; \ } while (0) #define RT_REMREF(_rt) do { \ RT_LOCK_ASSERT(_rt); \ KASSERT((_rt)->rt_refcnt > 0, \ ("bogus refcnt %d", (_rt)->rt_refcnt)); \ (_rt)->rt_refcnt--; \ } while (0) #define RTFREE_LOCKED(_rt) do { \ if ((_rt)->rt_refcnt <= 1) \ rtfree(_rt); \ else { \ RT_REMREF(_rt); \ RT_UNLOCK(_rt); \ } \ /* guard against invalid refs */ \ _rt = 0; \ } while (0) #define RTFREE(_rt) do { \ RT_LOCK(_rt); \ RTFREE_LOCKED(_rt); \ } while (0) #define RO_RTFREE(_ro) do { \ if ((_ro)->ro_rt) { \ if ((_ro)->ro_flags & RT_NORTREF) { \ (_ro)->ro_flags &= ~RT_NORTREF; \ (_ro)->ro_rt = NULL; \ } else { \ RT_LOCK((_ro)->ro_rt); \ RTFREE_LOCKED((_ro)->ro_rt); \ } \ } \ } while (0) +/* + * Validate a cached route based on a supplied cookie. If there is an + * out-of-date cache, simply free it. Update the generation number + * for the new allocation + */ +#define RT_VALIDATE(ro, cookiep, fibnum) do { \ + rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family); \ + if (*(cookiep) != cookie && (ro)->ro_rt != NULL) { \ + RTFREE((ro)->ro_rt); \ + (ro)->ro_rt = NULL; \ + *(cookiep) = cookie; \ + } \ +} while (0) + struct ifmultiaddr; struct rib_head; void rt_ieee80211msg(struct ifnet *, int, void *, size_t); void rt_ifannouncemsg(struct ifnet *, int); void rt_ifmsg(struct ifnet *); void rt_missmsg(int, struct rt_addrinfo *, int, int); void rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int); void rt_newaddrmsg(int, struct ifaddr *, int, struct rtentry *); void rt_newaddrmsg_fib(int, struct ifaddr *, int, struct rtentry *, int); int rt_addrmsg(int, struct ifaddr *, int); int rt_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int); void rt_newmaddrmsg(int, struct ifmultiaddr *); int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *); void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); struct rib_head *rt_table_init(int); void rt_table_destroy(struct rib_head *); +rt_gen_t rt_tables_get_gen(int table, int fam); int rtsock_addrmsg(int, struct ifaddr *, int); int rtsock_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int); /* * Note the following locking behavior: * * rtalloc1() returns a locked rtentry * * rtfree() and RTFREE_LOCKED() require a locked rtentry * * RTFREE() uses an unlocked entry. */ void rtfree(struct rtentry *); void rt_updatemtu(struct ifnet *); typedef int rt_walktree_f_t(struct rtentry *, void *); typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *); void rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *); void rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg); void rt_flushifroutes(struct ifnet *ifp); /* XXX MRT COMPAT VERSIONS THAT SET UNIVERSE to 0 */ /* Thes are used by old code not yet converted to use multiple FIBS */ struct rtentry *rtalloc1(struct sockaddr *, int, u_long); int rtinit(struct ifaddr *, int, int); /* XXX MRT NEW VERSIONS THAT USE FIBs * For now the protocol indepedent versions are the same as the AF_INET ones * but this will change.. */ int rt_getifa_fib(struct rt_addrinfo *, u_int fibnum); void rtalloc_ign_fib(struct route *ro, u_long ignflags, u_int fibnum); struct rtentry *rtalloc1_fib(struct sockaddr *, int, u_long, u_int); int rtioctl_fib(u_long, caddr_t, u_int); void rtredirect_fib(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); int rtrequest_fib(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); int rtrequest1_fib(int, struct rt_addrinfo *, struct rtentry **, u_int); int rib_lookup_info(uint32_t, const struct sockaddr *, uint32_t, uint32_t, struct rt_addrinfo *); void rib_free_info(struct rt_addrinfo *info); #endif #endif Index: head/sys/net/route_var.h =================================================================== --- head/sys/net/route_var.h (revision 297224) +++ head/sys/net/route_var.h (revision 297225) @@ -1,76 +1,76 @@ /*- * Copyright (c) 2015-2016 * Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NET_ROUTE_VAR_H_ #define _NET_ROUTE_VAR_H_ struct rib_head { struct radix_head head; rn_matchaddr_f_t *rnh_matchaddr; /* longest match for sockaddr */ rn_addaddr_f_t *rnh_addaddr; /* add based on sockaddr*/ rn_deladdr_f_t *rnh_deladdr; /* remove based on sockaddr */ rn_lookup_f_t *rnh_lookup; /* exact match for sockaddr */ rn_walktree_t *rnh_walktree; /* traverse tree */ rn_walktree_from_t *rnh_walktree_from; /* traverse tree below a */ rn_close_t *rnh_close; /*do something when the last ref drops*/ - u_int rnh_gen; /* generation counter */ + rt_gen_t rnh_gen; /* generation counter */ int rnh_multipath; /* multipath capable ? */ struct radix_node rnh_nodes[3]; /* empty tree for common case */ struct rwlock rib_lock; /* config/data path lock */ struct radix_mask_head rmhead; /* masks radix head */ }; #define RIB_RLOCK(rh) rw_rlock(&(rh)->rib_lock) #define RIB_RUNLOCK(rh) rw_runlock(&(rh)->rib_lock) #define RIB_WLOCK(rh) rw_wlock(&(rh)->rib_lock) #define RIB_WUNLOCK(rh) rw_wunlock(&(rh)->rib_lock) #define RIB_LOCK_ASSERT(rh) rw_assert(&(rh)->rib_lock, RA_LOCKED) #define RIB_WLOCK_ASSERT(rh) rw_assert(&(rh)->rib_lock, RA_WLOCKED) struct rib_head *rt_tables_get_rnh(int fib, int family); /* rte<>nhop translation */ static inline uint16_t fib_rte_to_nh_flags(int rt_flags) { uint16_t res; res = (rt_flags & RTF_REJECT) ? NHF_REJECT : 0; res |= (rt_flags & RTF_BLACKHOLE) ? NHF_BLACKHOLE : 0; res |= (rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) ? NHF_REDIRECT : 0; res |= (rt_flags & RTF_BROADCAST) ? NHF_BROADCAST : 0; res |= (rt_flags & RTF_GATEWAY) ? NHF_GATEWAY : 0; return (res); } #endif Index: head/sys/netinet/in_pcb.c =================================================================== --- head/sys/netinet/in_pcb.c (revision 297224) +++ head/sys/netinet/in_pcb.c (revision 297225) @@ -1,2658 +1,2680 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #include #endif #ifdef INET #include #endif #ifdef INET6 #include #include #include #include #endif /* INET6 */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include static struct callout ipport_tick_callout; /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Variables dealing with random ephemeral port allocation. */ VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); static VNET_DEFINE(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " "allocations before switching to a sequental one"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); #endif /* INET */ /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini, uint32_t inpcbzone_flags, u_int hashfields) { INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR, inpcbzone_flags); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); INP_LIST_LOCK_DESTROY(pcbinfo); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_RLOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(inp, inp_zero_size); inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #ifdef IPSEC error = ipsec_init_policy(so, &inp->inp_sp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif INP_WLOCK(inp); INP_LIST_WLOCK(pcbinfo); LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ INP_LIST_WUNLOCK(pcbinfo); #if defined(IPSEC) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif /* * Select a local port (number) to use. */ #if defined(INET) || defined(INET6) int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, dorandom, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP(-Lite), use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || pcbinfo == &V_ulitecbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP(-Lite) packets in the count. */ if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET /* Make the compiler happy. */ laddr.s_addr = 0; if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p", __func__, inp)); laddr = *laddrp; } #endif tmpinp = NULL; /* Make compiler happy. */ lport = *lportp; if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } while (tmpinp != NULL); #ifdef INET if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) laddrp->s_addr = laddr.s_addr; #endif *lportp = lport; return (0); } /* * Return cached socket options. */ short inp_so_options(const struct inpcb *inp) { short so_options; so_options = 0; if ((inp->inp_flags2 & INP_REUSEPORT) != 0) so_options |= SO_REUSEPORT; if ((inp->inp_flags2 & INP_REUSEADDR) != 0) so_options |= SO_REUSEADDR; return (so_options); } #endif /* INET || INET6 */ /* * Check if a new BINDMULTI socket is allowed to be created. * * ni points to the new inp. * oi points to the exisitng inp. * * This checks whether the existing inp also has BINDMULTI and * whether the credentials match. */ int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) { /* Check permissions match */ if ((ni->inp_flags2 & INP_BINDMULTI) && (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)) return (0); /* Check the existing inp has BINDMULTI set */ if ((ni->inp_flags2 & INP_BINDMULTI) && ((oi->inp_flags2 & INP_BINDMULTI) == 0)) return (0); /* * We're okay - either INP_BINDMULTI isn't set on ni, or * it is and it matches the checks. */ return (1); } #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. */ if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_flags2 & INP_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (reuseport & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; in_pcbrehash_mbuf(inp, m); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in_pcbconnect_mbuf(inp, nam, cred, NULL)); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; int error; KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. */ if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); if (ia == NULL) { error = ENETUNREACH; goto done; } if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; ifa_free(&ia->ia_ifa); goto done; } ifp = ia->ia_ifp; ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (cred == NULL || !prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = sro.ro_rt->rt_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; struct in_ifaddr *ia; bzero(&sain, sizeof(struct sockaddr_in)); sain.sin_family = AF_INET; sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&sain))); if (cred == NULL || !prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; ifa_free(&ia->ia_ifa); goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { struct rm_priotracker in_ifa_tracker; struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; if (!TAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) { IN_IFADDR_RLOCK(&in_ifa_tracker); faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { IN_IFADDR_RLOCK(&in_ifa_tracker); if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&TAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (error) return (error); } oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } #endif /* INET */ /* * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, with TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() is deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, * but where the inpcb lock may already held, or when acquiring a reference * via a pcbgroup. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to * garbage collect the inpcb if it has been in_pcbfree()'d from another * context. Until in_pcbrele() has returned that the inpcb is still valid, * lock and rele are the *only* safe operations that may be performed on the * inpcb. * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to * revalidate any cached state on reacquiring the lock. Drop the reference * using in_pcbrele(). */ void in_pcbref(struct inpcb *inp) { KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); refcount_acquire(&inp->inp_refcount); } /* * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. * * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a * reference on an inpcb. Historically more work was done here (actually, in * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely * about memory stability (and continued use of the write lock). */ int in_pcbrele_rlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_RLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_RUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } int in_pcbrele_wlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_WLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_WUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* * Temporary wrapper. */ int in_pcbrele(struct inpcb *inp) { return (in_pcbrele_wlocked(inp)); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele(), but the inpcb is still unlocked. Almost all * work, including removal from global lists, is done in this context, where * the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_LOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); /* XXXRW: Do as much as possible here. */ #ifdef IPSEC if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif INP_LIST_WLOCK(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; in_pcbremlists(inp); INP_LIST_WUNLOCK(pcbinfo); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); if (inp->in6p_moptions != NULL) ip6_freemoptions(inp->in6p_moptions); } #endif if (inp->inp_options) (void)m_free(inp->inp_options); #ifdef INET if (inp->inp_moptions != NULL) inp_freemoptions(inp->inp_moptions); #endif + if (inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; + } + inp->inp_vflag = 0; inp->inp_flags2 |= INP_FREED; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with * the hash lock...? */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; int i, gap; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { in_delmulti(imo->imo_membership[i]); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (cred == NULL || prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } #ifdef RSS /* * For incoming connections, we may wish to do a wildcard * match for an RSS-local socket. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } #endif /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } /* if (lookupflags & INPLOOKUP_WILDCARD) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking bug", __func__); return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further * locking or reference operations on either the hash list or the connection. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) return (inp); else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; INP_HASH_RLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { in_pcbref(inp); INP_HASH_RUNLOCK(pcbinfo); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking bug", __func__); } else INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. */ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif /* INET */ /* * Insert PCB onto various hash lists. */ static int in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Go through port list and look for a head for this lport. */ LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } phd->phd_port = inp->inp_lport; LIST_INIT(&phd->phd_pcblist); LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (do_pcbgroup_update) in_pcbgroup_update(inp); #endif return (0); } /* * For now, there are two public interfaces to insert an inpcb into the hash * lists -- one that does update pcbgroups, and one that doesn't. The latter * is used only in the TCP syncache, where in_pcbinshash is called before the * full 4-tuple is set for the inpcb, and we don't want to install in the * pcbgroup until later. * * XXXRW: This seems like a misfeature. in_pcbinshash should always update * connection groups, and partially initialised inpcbs should not be exposed * to either reservation hash tables or pcbgroups. */ int in_pcbinshash(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 1)); } int in_pcbinshash_nopcbgroup(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 0)); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) in_pcbgroup_update_mbuf(inp, m); else in_pcbgroup_update(inp); #endif } void in_pcbrehash(struct inpcb *inp) { in_pcbrehash_mbuf(inp, NULL); } /* * Remove PCB from various lists. */ static void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_RLOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); INP_LIST_WLOCK_ASSERT(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif +} + +/* + * Check for alternatives when higher level complains + * about service problems. For now, invalidate cached + * routing information. If the route was created dynamically + * (by a redirect), time to try a default gateway again. + */ +void +in_losing(struct inpcb *inp) +{ + + if (inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; + } + return; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. */ static void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } static void ip_fini(void *xtp) { callout_stop(&ipport_tick_callout); } /* * The ipport_callout should start running at about the time we attach the * inet or inet6 domains. */ static void ipport_tick_init(const void *unused __unused) { /* Start ipport_tick. */ callout_init(&ipport_tick_callout, 1); callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL); void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANTS void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ Index: head/sys/netinet/in_pcb.h =================================================================== --- head/sys/netinet/in_pcb.h (revision 297224) +++ head/sys/netinet/in_pcb.h (revision 297225) @@ -1,733 +1,741 @@ /*- * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_IN_PCB_H_ #define _NETINET_IN_PCB_H_ #include #include #include #include +#include #ifdef _KERNEL #include #include #include #include #endif #define in6pcb inpcb /* for KAME src sync over BSD*'s */ #define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ struct inpcbpolicy; /* * struct inpcb is the common protocol control block structure used in most * IP transport protocols. * * Pointers to local and foreign host table entries, local and foreign socket * numbers, and pointers up (to a socket structure) and down (to a * protocol-specific control block) are stored here. */ LIST_HEAD(inpcbhead, inpcb); LIST_HEAD(inpcbporthead, inpcbport); typedef u_quad_t inp_gen_t; /* * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing * the following structure. */ struct in_addr_4in6 { u_int32_t ia46_pad32[3]; struct in_addr ia46_addr4; }; /* * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has * some extra padding to accomplish this. * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport, * lport, faddr to generate hash, so these fields shouldn't be moved. */ struct in_endpoints { u_int16_t ie_fport; /* foreign port */ u_int16_t ie_lport; /* local port */ /* protocol dependent part, local and foreign addr */ union { /* foreign host table entry */ struct in_addr_4in6 ie46_foreign; struct in6_addr ie6_foreign; } ie_dependfaddr; union { /* local host table entry */ struct in_addr_4in6 ie46_local; struct in6_addr ie6_local; } ie_dependladdr; u_int32_t ie6_zoneid; /* scope zone id */ }; #define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4 #define ie_laddr ie_dependladdr.ie46_local.ia46_addr4 #define ie6_faddr ie_dependfaddr.ie6_foreign #define ie6_laddr ie_dependladdr.ie6_local /* * XXX The defines for inc_* are hacks and should be changed to direct * references. */ struct in_conninfo { u_int8_t inc_flags; u_int8_t inc_len; u_int16_t inc_fibnum; /* XXX was pad, 16 bits is plenty */ /* protocol dependent part */ struct in_endpoints inc_ie; }; /* * Flags for inc_flags. */ #define INC_ISIPV6 0x01 #define inc_isipv6 inc_flags /* temp compatability */ #define inc_fport inc_ie.ie_fport #define inc_lport inc_ie.ie_lport #define inc_faddr inc_ie.ie_faddr #define inc_laddr inc_ie.ie_laddr #define inc6_faddr inc_ie.ie6_faddr #define inc6_laddr inc_ie.ie6_laddr #define inc6_zoneid inc_ie.ie6_zoneid struct icmp6_filter; /*- * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and * IPv6 sockets. In the case of TCP and UDP, further per-connection state is * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb * are static after creation or protected by a per-inpcb rwlock, inp_lock. A * few fields are protected by multiple locks as indicated in the locking notes * below. For these fields, all of the listed locks must be write-locked for * any modifications. However, these fields can be safely read while any one of * the listed locks are read-locked. This model can permit greater concurrency * for read operations. For example, connections can be looked up while only * holding a read lock on the global pcblist lock. This is important for * performance when attempting to find the connection for a packet given its IP * and port tuple. * * One noteworthy exception is that the global pcbinfo lock follows a different * set of rules in relation to the inp_list field. Rather than being * write-locked for modifications and read-locked for list iterations, it must * be read-locked during modifications and write-locked during list iterations. * This ensures that the relatively rare global list iterations safely walk a * stable snapshot of connections while allowing more common list modifications * to safely grab the pcblist lock just while adding or removing a connection * from the global list. * * Key: * (c) - Constant after initialization * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (l) - Protected by the pcblist lock for the inpcb * (h) - Protected by the pcbhash lock for the inpcb * (s) - Protected by another subsystem's locks * (x) - Undefined locking * * A few other notes: * * When a read lock is held, stability of the field is guaranteed; to write * to a field, a write lock must generally be held. * * netinet/netinet6-layer code should not assume that the inp_socket pointer * is safe to dereference without inp_lock being held, even for protocols * other than TCP (where the inpcb persists during TIMEWAIT even after the * socket has been freed), or there may be close(2)-related races. * * The inp_vflag field is overloaded, and would otherwise ideally be (c). * * TODO: Currently only the TCP stack is leveraging the global pcbinfo lock * read-lock usage during modification, this model can be applied to other * protocols (especially SCTP). */ struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */ LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */ /* (p[w]) for list iteration */ /* (p[r]/l) for addition/removal */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */ struct socket *inp_socket; /* (i) back pointer to socket */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ int inp_flags; /* (i) generic IP/datagram flags */ int inp_flags2; /* (i) generic IP/datagram flags #2*/ u_char inp_vflag; /* (i) IP version flag (v4/v6) */ u_char inp_ip_ttl; /* (i) time to live proto */ u_char inp_ip_p; /* (c) protocol proto */ u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_flowid; /* (x) flow id / queue id */ u_int inp_refcount; /* (i) refcount */ void *inp_pspare[5]; /* (x) route caching / general use */ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */ u_int inp_ispare[4]; /* (x) route caching / user cookie / * general use */ /* Local and foreign ports, local and foreign addr. */ struct in_conninfo inp_inc; /* (i) list for PCB's local port */ /* MAC and IPSEC policy information. */ struct label *inp_label; /* (i) MAC label */ struct inpcbpolicy *inp_sp; /* (s) for IPSEC */ /* Protocol-dependent part; options. */ struct { u_char inp4_ip_tos; /* (i) type of service proto */ struct mbuf *inp4_options; /* (i) IP options */ struct ip_moptions *inp4_moptions; /* (i) IP mcast options */ } inp_depend4; struct { /* (i) IP options */ struct mbuf *inp6_options; /* (i) IP6 options for outgoing packets */ struct ip6_pktopts *inp6_outputopts; /* (i) IP multicast options */ struct ip6_moptions *inp6_moptions; /* (i) ICMPv6 code type filter */ struct icmp6_filter *inp6_icmp6filt; /* (i) IPV6_CHECKSUM setsockopt */ int inp6_cksum; short inp6_hops; } inp_depend6; LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */ struct inpcbport *inp_phd; /* (i/h) head of this list */ #define inp_zero_size offsetof(struct inpcb, inp_gencnt) inp_gen_t inp_gencnt; /* (c) generation count */ struct llentry *inp_lle; /* cached L2 information */ - struct rtentry *inp_rt; /* cached L3 information */ struct rwlock inp_lock; + rt_gen_t inp_rt_cookie; /* generation for route entry */ + union { /* cached L3 information */ + struct route inpu_route; + struct route_in6 inpu_route6; + } inp_rtu; +#define inp_route inp_rtu.inpu_route +#define inp_route6 inp_rtu.inpu_route6 }; #define inp_fport inp_inc.inc_fport #define inp_lport inp_inc.inc_lport #define inp_faddr inp_inc.inc_faddr #define inp_laddr inp_inc.inc_laddr #define inp_ip_tos inp_depend4.inp4_ip_tos #define inp_options inp_depend4.inp4_options #define inp_moptions inp_depend4.inp4_moptions #define in6p_faddr inp_inc.inc6_faddr #define in6p_laddr inp_inc.inc6_laddr #define in6p_zoneid inp_inc.inc6_zoneid #define in6p_hops inp_depend6.inp6_hops /* default hop limit */ #define in6p_flowinfo inp_flow #define in6p_options inp_depend6.inp6_options #define in6p_outputopts inp_depend6.inp6_outputopts #define in6p_moptions inp_depend6.inp6_moptions #define in6p_icmp6filt inp_depend6.inp6_icmp6filt #define in6p_cksum inp_depend6.inp6_cksum #define inp_vnet inp_pcbinfo->ipi_vnet /* * The range of the generation count, as used in this implementation, is 9e19. * We would have to create 300 billion connections per second for this number * to roll over in a year. This seems sufficiently unlikely that we simply * don't concern ourselves with that possibility. */ /* * Interface exported to userland by various protocols which use inpcbs. Hack * alert -- only define if struct xsocket is in scope. */ #ifdef _SYS_SOCKETVAR_H_ struct xinpcb { size_t xi_len; /* length of this structure */ struct inpcb xi_inp; struct xsocket xi_socket; u_quad_t xi_alignment_hack; }; struct xinpgen { size_t xig_len; /* length of this structure */ u_int xig_count; /* number of PCBs at this time */ inp_gen_t xig_gen; /* generation count at this time */ so_gen_t xig_sogen; /* socket generation count at this time */ }; #endif /* _SYS_SOCKETVAR_H_ */ struct inpcbport { LIST_ENTRY(inpcbport) phd_hash; struct inpcbhead phd_pcblist; u_short phd_port; }; /*- * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. * * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and * ipi_list_lock: * - ipi_lock covering the global pcb list stability during loop iteration, * - ipi_hash_lock covering the hashed lookup tables, * - ipi_list_lock covering mutable global fields (such as the global * pcb list) * * The lock order is: * * ipi_lock (before) * inpcb locks (before) * ipi_list locks (before) * {ipi_hash_lock, pcbgroup locks} * * Locking key: * * (c) Constant or nearly constant after initialisation * (g) Locked by ipi_lock * (l) Locked by ipi_list_lock * (h) Read using either ipi_hash_lock or inpcb lock; write requires both * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { /* * Global lock protecting full inpcb list traversal */ struct rwlock ipi_lock; /* * Global list of inpcbs on the protocol. */ struct inpcbhead *ipi_listhead; /* (g/l) */ u_int ipi_count; /* (l) */ /* * Generation count -- incremented each time a connection is allocated * or freed. */ u_quad_t ipi_gencnt; /* (l) */ /* * Fields associated with port lookup and allocation. */ u_short ipi_lastport; /* (x) */ u_short ipi_lastlow; /* (x) */ u_short ipi_lasthi; /* (x) */ /* * UMA zone from which inpcbs are allocated for this protocol. */ struct uma_zone *ipi_zone; /* (c) */ /* * Connection groups associated with this protocol. These fields are * constant, but pcbgroup structures themselves are protected by * per-pcbgroup locks. */ struct inpcbgroup *ipi_pcbgroups; /* (c) */ u_int ipi_npcbgroups; /* (c) */ u_int ipi_hashfields; /* (c) */ /* * Global lock protecting non-pcbgroup hash lookup tables. */ struct rwlock ipi_hash_lock; /* * Global hash of inpcbs, hashed by local and foreign addresses and * port numbers. */ struct inpcbhead *ipi_hashbase; /* (h) */ u_long ipi_hashmask; /* (h) */ /* * Global hash of inpcbs, hashed by only local port number. */ struct inpcbporthead *ipi_porthashbase; /* (h) */ u_long ipi_porthashmask; /* (h) */ /* * List of wildcard inpcbs for use with pcbgroups. In the past, was * per-pcbgroup but is now global. All pcbgroup locks must be held * to modify the list, so any is sufficient to read it. */ struct inpcbhead *ipi_wildbase; /* (p) */ u_long ipi_wildmask; /* (p) */ /* * Pointer to network stack instance */ struct vnet *ipi_vnet; /* (c) */ /* * general use 2 */ void *ipi_pspare[2]; /* * Global lock protecting global inpcb list, inpcb count, etc. */ struct rwlock ipi_list_lock; }; #ifdef _KERNEL /* * Connection groups hold sets of connections that have similar CPU/thread * affinity. Each connection belongs to exactly one connection group. */ struct inpcbgroup { /* * Per-connection group hash of inpcbs, hashed by local and foreign * addresses and port numbers. */ struct inpcbhead *ipg_hashbase; /* (c) */ u_long ipg_hashmask; /* (c) */ /* * Notional affinity of this pcbgroup. */ u_int ipg_cpu; /* (p) */ /* * Per-connection group lock, not to be confused with ipi_lock. * Protects the hash table hung off the group, but also the global * wildcard list in inpcbinfo. */ struct mtx ipg_lock; } __aligned(CACHE_LINE_SIZE); #define INP_LOCK_INIT(inp, d, t) \ rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) #define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock) #define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock) #define INP_TRY_RLOCK(inp) rw_try_rlock(&(inp)->inp_lock) #define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock) #define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock) #define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock) #define INP_TRY_UPGRADE(inp) rw_try_upgrade(&(inp)->inp_lock) #define INP_DOWNGRADE(inp) rw_downgrade(&(inp)->inp_lock) #define INP_WLOCKED(inp) rw_wowned(&(inp)->inp_lock) #define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED) #define INP_RLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_RLOCKED) #define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED) #define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED) /* * These locking functions are for inpcb consumers outside of sys/netinet, * more specifically, they were added for the benefit of TOE drivers. The * macros are reserved for use by the stack. */ void inp_wlock(struct inpcb *); void inp_wunlock(struct inpcb *); void inp_rlock(struct inpcb *); void inp_runlock(struct inpcb *); #ifdef INVARIANTS void inp_lock_assert(struct inpcb *); void inp_unlock_assert(struct inpcb *); #else static __inline void inp_lock_assert(struct inpcb *inp __unused) { } static __inline void inp_unlock_assert(struct inpcb *inp __unused) { } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg); int inp_ip_tos_get(const struct inpcb *inp); void inp_ip_tos_set(struct inpcb *inp, int val); struct socket * inp_inpcbtosocket(struct inpcb *inp); struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp); void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp); short inp_so_options(const struct inpcb *inp); #endif /* _KERNEL */ #define INP_INFO_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE) #define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock) #define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock) #define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock) #define INP_INFO_WLOCKED(ipi) rw_wowned(&(ipi)->ipi_lock) #define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock) #define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock) #define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED) #define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED) #define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) #define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) #define INP_LIST_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) #define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock) #define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock) #define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock) #define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock) #define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock) #define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock) #define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock) #define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock) #define INP_LIST_LOCK_ASSERT(ipi) \ rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED) #define INP_LIST_RLOCK_ASSERT(ipi) \ rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED) #define INP_LIST_WLOCK_ASSERT(ipi) \ rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED) #define INP_LIST_UNLOCK_ASSERT(ipi) \ rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED) #define INP_HASH_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0) #define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock) #define INP_HASH_RLOCK(ipi) rw_rlock(&(ipi)->ipi_hash_lock) #define INP_HASH_WLOCK(ipi) rw_wlock(&(ipi)->ipi_hash_lock) #define INP_HASH_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_hash_lock) #define INP_HASH_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_hash_lock) #define INP_HASH_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ RA_LOCKED) #define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ RA_WLOCKED) #define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ MTX_DEF | MTX_DUPOK) #define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock) #define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock) #define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED) #define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock) #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) #define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3]) /* * Flags for inp_vflags -- historically version flags only */ #define INP_IPV4 0x1 #define INP_IPV6 0x2 #define INP_IPV6PROTO 0x4 /* opened under IPv6 protocol */ /* * Flags for inp_flags. */ #define INP_RECVOPTS 0x00000001 /* receive incoming IP options */ #define INP_RECVRETOPTS 0x00000002 /* receive IP options for reply */ #define INP_RECVDSTADDR 0x00000004 /* receive IP dst address */ #define INP_HDRINCL 0x00000008 /* user supplies entire IP header */ #define INP_HIGHPORT 0x00000010 /* user wants "high" port binding */ #define INP_LOWPORT 0x00000020 /* user wants "low" port binding */ #define INP_ANONPORT 0x00000040 /* port chosen for user */ #define INP_RECVIF 0x00000080 /* receive incoming interface */ #define INP_MTUDISC 0x00000100 /* user can do MTU discovery */ /* 0x000200 unused: was INP_FAITH */ #define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */ #define INP_DONTFRAG 0x00000800 /* don't fragment packet */ #define INP_BINDANY 0x00001000 /* allow bind to any address */ #define INP_INHASHLIST 0x00002000 /* in_pcbinshash() has been called */ #define INP_RECVTOS 0x00004000 /* receive incoming IP TOS */ #define IN6P_IPV6_V6ONLY 0x00008000 /* restrict AF_INET6 socket for v6 */ #define IN6P_PKTINFO 0x00010000 /* receive IP6 dst and I/F */ #define IN6P_HOPLIMIT 0x00020000 /* receive hoplimit */ #define IN6P_HOPOPTS 0x00040000 /* receive hop-by-hop options */ #define IN6P_DSTOPTS 0x00080000 /* receive dst options after rthdr */ #define IN6P_RTHDR 0x00100000 /* receive routing header */ #define IN6P_RTHDRDSTOPTS 0x00200000 /* receive dstoptions before rthdr */ #define IN6P_TCLASS 0x00400000 /* receive traffic class value */ #define IN6P_AUTOFLOWLABEL 0x00800000 /* attach flowlabel automatically */ #define INP_TIMEWAIT 0x01000000 /* in TIMEWAIT, ppcb is tcptw */ #define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */ #define INP_DROPPED 0x04000000 /* protocol drop flag */ #define INP_SOCKREF 0x08000000 /* strong socket reference */ #define INP_RESERVED_0 0x10000000 /* reserved field */ #define INP_RESERVED_1 0x20000000 /* reserved field */ #define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */ #define IN6P_MTU 0x80000000 /* receive path MTU */ #define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ INP_RECVIF|INP_RECVTTL|INP_RECVTOS|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\ IN6P_MTU) /* * Flags for inp_flags2. */ #define INP_LLE_VALID 0x00000001 /* cached lle is valid */ #define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ #define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ #define INP_FREED 0x00000010 /* inp itself is not valid */ #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ #define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */ #define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ #define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */ #define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */ /* * Flags passed to in_pcblookup*() functions. */ #define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */ #define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */ #define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */ #define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \ INPLOOKUP_WLOCKPCB) #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) /* * Constants for pcbinfo.ipi_hashfields. */ #define IPI_HASHFIELDS_NONE 0 #define IPI_HASHFIELDS_2TUPLE 1 #define IPI_HASHFIELDS_4TUPLE 2 #ifdef _KERNEL VNET_DECLARE(int, ipport_reservedhigh); VNET_DECLARE(int, ipport_reservedlow); VNET_DECLARE(int, ipport_lowfirstauto); VNET_DECLARE(int, ipport_lowlastauto); VNET_DECLARE(int, ipport_firstauto); VNET_DECLARE(int, ipport_lastauto); VNET_DECLARE(int, ipport_hifirstauto); VNET_DECLARE(int, ipport_hilastauto); VNET_DECLARE(int, ipport_randomized); VNET_DECLARE(int, ipport_randomcps); VNET_DECLARE(int, ipport_randomtime); VNET_DECLARE(int, ipport_stoprandom); VNET_DECLARE(int, ipport_tcpallocs); #define V_ipport_reservedhigh VNET(ipport_reservedhigh) #define V_ipport_reservedlow VNET(ipport_reservedlow) #define V_ipport_lowfirstauto VNET(ipport_lowfirstauto) #define V_ipport_lowlastauto VNET(ipport_lowlastauto) #define V_ipport_firstauto VNET(ipport_firstauto) #define V_ipport_lastauto VNET(ipport_lastauto) #define V_ipport_hifirstauto VNET(ipport_hifirstauto) #define V_ipport_hilastauto VNET(ipport_hilastauto) #define V_ipport_randomized VNET(ipport_randomized) #define V_ipport_randomcps VNET(ipport_randomcps) #define V_ipport_randomtime VNET(ipport_randomtime) #define V_ipport_stoprandom VNET(ipport_stoprandom) #define V_ipport_tcpallocs VNET(ipport_tcpallocs) void in_pcbinfo_destroy(struct inpcbinfo *); void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, int, int, char *, uma_init, uma_fini, uint32_t, u_int); int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi); struct inpcbgroup * in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); struct inpcbgroup * in_pcbgroup_byinpcb(struct inpcb *); struct inpcbgroup * in_pcbgroup_bytuple(struct inpcbinfo *, struct in_addr, u_short, struct in_addr, u_short); void in_pcbgroup_destroy(struct inpcbinfo *); int in_pcbgroup_enabled(struct inpcbinfo *); void in_pcbgroup_init(struct inpcbinfo *, u_int, int); void in_pcbgroup_remove(struct inpcb *); void in_pcbgroup_update(struct inpcb *); void in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *); void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *, struct ucred *, int); int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, struct ucred *); int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *); int in_pcbconnect_mbuf(struct inpcb *, struct sockaddr *, struct ucred *, struct mbuf *); int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *, u_short *, in_addr_t *, u_short *, struct inpcb **, struct ucred *); void in_pcbdetach(struct inpcb *); void in_pcbdisconnect(struct inpcb *); void in_pcbdrop(struct inpcb *); void in_pcbfree(struct inpcb *); int in_pcbinshash(struct inpcb *); int in_pcbinshash_nopcbgroup(struct inpcb *); int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *, struct ucred *); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); struct inpcb * in_pcblookup(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *, struct mbuf *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbref(struct inpcb *); void in_pcbrehash(struct inpcb *); void in_pcbrehash_mbuf(struct inpcb *, struct mbuf *); int in_pcbrele(struct inpcb *); int in_pcbrele_rlocked(struct inpcb *); int in_pcbrele_wlocked(struct inpcb *); +void in_losing(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); int in_getsockaddr(struct socket *so, struct sockaddr **nam); struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr); void in_pcbsosetlabel(struct socket *so); #endif /* _KERNEL */ #endif /* !_NETINET_IN_PCB_H_ */ Index: head/sys/netinet/ip_output.c =================================================================== --- head/sys/netinet/ip_output.c (revision 297224) +++ head/sys/netinet/ip_output.c (revision 297225) @@ -1,1392 +1,1411 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #ifdef IPSEC #include #include #endif /* IPSEC*/ #include #include #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback(struct ifnet *, const struct mbuf *, int); extern int in_mcast_loop; extern struct protosw inetsw[]; static inline int ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error) { struct m_tag *fwd_tag = NULL; struct mbuf *m; struct in_addr odst; struct ip *ip; m = *mp; ip = mtod(m, struct ip *); /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; *error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, PFIL_OUT, inp); m = *mp; if ((*error) != 0 || m == NULL) return 1; /* Finished */ ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; return -1; /* Reloop */ } /* See if fib was changed by packet filter. */ if ((*fibnum) != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; *fibnum = M_GETFIB(m); return -1; /* Reloop for FIB change */ } /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } /* Or forward to some other address? */ if ((m->m_flags & M_IP_NEXTHOP) && ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) { bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP_NEXTHOP; m_tag_delete(m, fwd_tag); return -1; /* Reloop for CHANGE of dst */ } return 0; } /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct rm_priotracker in_ifa_tracker; struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int error = 0; struct sockaddr_in *dst; const struct sockaddr_in *gw; struct in_ifaddr *ia; int isbroadcast; uint16_t ip_len, ip_off; struct route iproute; struct rtentry *rte; /* cache for ro->ro_rt */ uint32_t fibnum; int have_ia_ref; #ifdef IPSEC int no_route_but_check_spd = 0; #endif M_ASSERTPKTHDR(m); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } } if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); } #ifdef FLOWTABLE if (ro->ro_rt == NULL) (void )flowtable_lookup(AF_INET, m, ro); #endif if (opt) { int len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip_fillid(ip); IPSTAT_INC(ips_localout); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } /* * dst/gw handling: * * dst can be rewritten but always points to &ro->ro_dst. * gw is readonly but can point either to dst OR rt_gateway, * therefore we need restore gw if we're redoing lookup. */ gw = dst = (struct sockaddr_in *)&ro->ro_dst; fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); rte = ro->ro_rt; - /* - * The address family should also be checked in case of sharing - * the cache with IPv6. - */ - if (rte == NULL || dst->sin_family != AF_INET) { + if (rte == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } again: + /* + * Validate route against routing table additions; + * a better/more specific route might have been added. + */ + if (inp) + RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); + /* + * If there is a cached route, + * check that it is to the same destination + * and is still up. If not, free it and try again. + * The address family should also be checked in case of sharing the + * cache with IPv6. + * Also check whether routing cache needs invalidation. + */ + rte = ro->ro_rt; + if (rte && ((rte->rt_flags & RTF_UP) == 0 || + rte->rt_ifp == NULL || + !RT_LINK_IS_UP(rte->rt_ifp) || + dst->sin_family != AF_INET || + dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { + RTFREE(rte); + rte = ro->ro_rt = (struct rtentry *)NULL; + } ia = NULL; have_ia_ref = 0; /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } have_ia_ref = 1; ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = 1; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } have_ia_ref = 1; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia) have_ia_ref = 1; isbroadcast = 0; /* fool gcc */ } else { /* * We want to do any cloning requested by the link layer, * as this is probably required in all cases for correct * operation (as it is for ARP). */ if (rte == NULL) { #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), fibnum); #else in_rtalloc_ign(ro, 0, fibnum); #endif rte = ro->ro_rt; } if (rte == NULL || (rte->rt_flags & RTF_UP) == 0 || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) { #ifdef IPSEC /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; mtu = 0; /* Silence GCC warning. */ goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } ia = ifatoia(rte->rt_ifa); ifp = rte->rt_ifp; counter_u64_add(rte->rt_pksent, 1); rt_update_ro_flags(ro); if (rte->rt_flags & RTF_GATEWAY) gw = (struct sockaddr_in *)rte->rt_gateway; if (rte->rt_flags & RTF_HOST) isbroadcast = (rte->rt_flags & RTF_BROADCAST); else isbroadcast = in_broadcast(gw->sin_addr, ifp); } /* * Calculate MTU. If we have a route that is up, use that, * otherwise use the interface's MTU. */ if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) mtu = rte->rt_mtu; else mtu = ifp->if_mtu; /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p", __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ gw = dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) ip->ip_src = IA_SIN(ia)->sin_addr; } if ((imo == NULL && in_mcast_loop) || (imo && imo->imo_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we are not a member * of the group; ip_input() will filter it later, * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ ip_mloopback(ifp, m, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy. ip_input() will drop the copy if * this host does not belong to the destination group on * the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) { /* Interface may have no addresses. */ if (ia != NULL) { ip->ip_src = IA_SIN(ia)->sin_addr; } } /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #ifdef IPSEC switch(ip_ipsec_output(&m, inp, &error)) { case 1: goto bad; case -1: goto done; case 0: default: break; /* Continue with packet processing. */ } /* * Check if there was a route for this packet; return error if not. */ if (no_route_but_check_spd) { IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (PFIL_HOOKED(&V_inet_pfil_hook)) { switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) { case 1: /* Finished */ goto done; case 0: /* Continue normally */ ip = mtod(m, struct ip *); break; case -1: /* Need to try again */ /* Reset everything for a new round */ RO_RTFREE(ro); if (have_ia_ref) ifa_free(&ia->ia_ifa); ro->ro_prepend = NULL; rte = NULL; gw = dst; ip = mtod(m, struct ip *); goto again; } } /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); } else m_freem(m); } if (error == 0) IPSTAT_INC(ips_fragmented); done: if (ro == &iproute) RO_RTFREE(ro); else if (rte == NULL) /* * If the caller supplied a route but somehow the reference * to it has been released need to prevent the caller * calling RTFREE on it again. */ ro->ro_rt = NULL; if (have_ia_ref) ifa_free(&ia->ia_ifa); return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; uint16_t ip_len, ip_off; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { /* Fragmentation not allowed */ IPSTAT_INC(ips_cantfrag); return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; off = MIN(mtu, m0->m_pkthdr.len); /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copym(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ip_len) len = ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ IPSTAT_INC(ips_odropped); goto done; } m->m_pkthdr.len = mhlen + len; #ifdef MAC mac_netinet_fragment(m0, m); #endif mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; uint16_t csum, offset, ip_len; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; ip_len = ntohs(ip->ip_len); csum = in_cksum_skip(m, ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ /* find the mbuf in the chain where the checksum starts*/ while ((m != NULL) && (offset >= m->m_len)) { offset -= m->m_len; m = m->m_next; } KASSERT(m != NULL, ("in_delayed_cksum: checksum outside mbuf chain.")); KASSERT(offset + sizeof(u_short) <= m->m_len, ("in_delayed_cksum: checksum split between mbufs.")); *(u_short *)(m->m_data + offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; #ifdef RSS uint32_t rss_bucket; int retval; #endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; default: break; } } return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_BINDANY: if (sopt->sopt_td != NULL) { error = priv_check(sopt->sopt_td, PRIV_NETINET_BINDANY); if (error) break; } /* FALLTHROUGH */ case IP_BINDMULTI: #ifdef RSS case IP_RSS_LISTEN_BUCKET: #endif case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: case IP_RECVFLOWID: #ifdef RSS case IP_RECVRSSBUCKETID: #endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval >= 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; case IP_BINDANY: OPTSET(INP_BINDANY); break; case IP_RECVTOS: OPTSET(INP_RECVTOS); break; case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; case IP_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { inp->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; case IP_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif } break; #undef OPTSET #undef OPTSET2 /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #ifdef IPSEC case IP_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec_set_policy(inp, sopt->sopt_name, req, m->m_len, (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); else sopt->sopt_valsize = 0; break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: case IP_RECVRSSBUCKETID: #endif switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; case IP_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; case IP_FLOWID: optval = inp->inp_flowid; break; case IP_FLOWTYPE: optval = inp->inp_flowtype; break; case IP_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IP_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #ifdef IPSEC case IP_IPSEC_POLICY: { struct mbuf *m = NULL; caddr_t req = NULL; size_t len = 0; if (m != 0) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec_get_policy(sotoinpcb(so), req, len, &m); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen) { struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the pack in order to generate checksums. */ copym = m_dup(m, M_NOWAIT); if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); if_simloop(ifp, copym, AF_INET, 0); } } Index: head/sys/netinet/tcp_output.c =================================================================== --- head/sys/netinet/tcp_output.c (revision 297224) +++ head/sys/netinet/tcp_output.c (revision 297225) @@ -1,1810 +1,1806 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #ifdef TCP_RFC7413 #include #endif #include #define TCPOUTFLAGS #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #endif /*IPSEC*/ #include #include VNET_DEFINE(int, path_mtu_discovery) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); VNET_DEFINE(int, tcp_do_tso) = 1; #define V_tcp_do_tso VNET(tcp_do_tso) SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_sendspace) = 1024*32; #define V_tcp_sendspace VNET(tcp_sendspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, "Enable automatic send buffer sizing"); VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, long len, int tso); static void inline cc_after_idle(struct tcpcb *tp); /* * Wrapper for the TCP established output helper hook. */ static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, long len, int tso) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_data.len = len; hhook_data.tso = tso; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data, tp->osd); } } /* * CC wrapper hook functions */ static void inline cc_after_idle(struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); } /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; int off, flags, error = 0; /* Keep compiler happy */ struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef IPSEC unsigned ipsec_optlen = 0; #endif int idle, sendalot; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso, mtu; struct tcpopt to; #if 0 int maxburst = TCP_MAXBURST; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif #ifdef TCP_RFC7413 /* * For TFO connections in SYN_RECEIVED, only allow the initial * SYN|ACK and those sent by the retransmit timer. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED) && SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ return (0); #endif /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) cc_after_idle(tp); tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; tso = 0; mtu = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { long cwin; cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; goto after_sack_rexmit; } else /* Can rexmit part of the current hole */ len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((long)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); if (len > 0) { sack_rxmit = 1; sendalot = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, tp->t_maxseg)); } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCKBUF_LOCK(&so->so_snd); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < sbused(&so->so_snd)) flags &= ~TH_FIN; sendwin = 1; } else { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) len = ((long)ulmin(sbavail(&so->so_snd), sendwin) - off); else { long cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = ((long)ulmin(sbavail(&so->so_snd), tp->snd_wnd) - off); /* * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; len = lmin(len, cwin); } } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; #ifdef TCP_RFC7413 /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; #endif off--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } #ifdef TCP_RFC7413 /* * When retransmitting SYN|ACK on a passively-created TFO socket, * don't include data, as the presence of data may have caused the * original SYN|ACK to have been dropped by a middlebox. */ if ((tp->t_flags & TF_FASTOPEN) && (((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)) || (flags & TH_RST))) len = 0; #endif if (len <= 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. * * We also do a general check here to ensure that * we will set the persist timer when we have data * to send, but a 0-byte window. This makes sure * the persist timer is set even if the packet * hits one of the "goto send" lines below. */ len = 0; if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (off < (int) sbavail(&so->so_snd))) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_timer_active(tp, TT_PERSIST)) tcp_setpersist(tp); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwith product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwith (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. * * XXXGL: should there be used sbused() or sbavail()? */ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) && sbused(&so->so_snd) < V_tcp_autosndbuf_max && sendwin >= (sbused(&so->so_snd) - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max), so, curthread)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and * IP options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. */ #ifdef IPSEC /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ ipsec_optlen = ipsec_hdrsiz_tcp(tp); #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && #ifdef IPSEC ipsec_optlen == 0 && #endif tp->t_inpcb->inp_options == NULL && tp->t_inpcb->in6p_options == NULL) tso = 1; if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } recwin = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= sbavail(&so->so_snd) && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read * from the receive buffer, no matter how small, causes a window * update to be sent. We also should avoid sending a flurry of * window updates when the socket buffer had queued a lot of data * and the application is doing small reads. * * Prevent a flurry of pointless window updates by only sending * an update when we can increase the advertized window by more * than 1/4th of the socket buffer capacity. When the buffer is * getting full or is very small be more aggressive and send an * update whenever we can increase by two mss sized segments. * In all other situations the ACK's to new incoming data will * carry further window increases. * * Don't send an independent window update if a delayed * ACK is pending (it will get piggy-backed on it) or the * remote side already has done a half-close and won't send * more data. Skip this if the connection is in T/TCP * half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv; int oldwin; adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); adv -= oldwin; } else oldwin = 0; /* * If the new window size ends up being the same as the old * size when it is scaled, then don't force a window update. */ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (long)(2 * tp->t_maxseg) && (adv >= (long)(so->so_rcv.sb_hiwat / 4) || recwin <= (long)(so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) goto send; } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_timer_active(tp, TT_PERSIST) * is true when we are in persist state. * (tp->t_flags & TF_FORCEDATA) * is set when we are called to send a persist packet. * tcp_timer_active(tp, TT_REXMT) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(&so->so_snd); return (0); send: SOCKBUF_LOCK_ASSERT(&so->so_snd); if (len > 0) { if (len >= tp->t_maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); /* * Compute options for segment. * We only have to care about SYN and established connection * segments. Options for SYN-ACK segments are handled in TCP * syncache. */ to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); to.to_flags |= TOF_MSS; #ifdef TCP_RFC7413 /* * Only include the TFO option on the first * transmission of the SYN|ACK on a * passively-created TFO socket, as the presence of * the TFO option may have caused the original * SYN|ACK to have been dropped by a middlebox. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift == 0)) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; to.to_flags |= TOF_FASTOPEN; } #endif } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); } /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } } #ifdef TCP_SIGNATURE /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); } #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #ifdef IPSEC ipoptlen += ipsec_optlen; #endif /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxseg length. * Clear the FIN bit because we cut off the tail of * the segment. */ if (len + optlen + ipoptlen > tp->t_maxseg) { flags &= ~TH_FIN; if (tso) { u_int if_hw_tsomax; u_int if_hw_tsomaxsegcount; u_int if_hw_tsomaxsegsize; struct mbuf *mb; u_int moff; int max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; /* * Limit a TSO burst to prevent it from * overflowing or exceeding the maximum length * allowed by the network interface: */ KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Check if we should limit by maximum segment * size and count: */ if (if_hw_tsomaxsegcount != 0 && if_hw_tsomaxsegsize != 0) { /* * Subtract one segment for the LINK * and TCP/IP headers mbuf that will * be prepended to this mbuf chain * after the code in this section * limits the number of mbufs in the * chain to if_hw_tsomaxsegcount. */ if_hw_tsomaxsegcount -= 1; max_len = 0; mb = sbsndmbuf(&so->so_snd, off, &moff); while (mb != NULL && max_len < len) { u_int mlen; u_int frags; /* * Get length of mbuf fragment * and how many hardware frags, * rounded up, it would use: */ mlen = (mb->m_len - moff); frags = howmany(mlen, if_hw_tsomaxsegsize); /* Handle special case: Zero Length Mbuf */ if (frags == 0) frags = 1; /* * Check if the fragment limit * will be reached or exceeded: */ if (frags >= if_hw_tsomaxsegcount) { max_len += min(mlen, if_hw_tsomaxsegcount * if_hw_tsomaxsegsize); break; } max_len += mlen; if_hw_tsomaxsegcount -= frags; moff = 0; mb = mb->m_next; } if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Prevent the last segment from being * fractional unless the send sockbuf can be * emptied: */ max_len = (tp->t_maxseg - optlen); if ((off + len) < sbavail(&so->so_snd)) { moff = len % max_len; if (moff != 0) { len -= moff; sendalot = 1; } } /* * In case there are too many small fragments * don't use TSO: */ if (len <= max_len) { len = max_len; sendalot = 1; tso = 0; } /* * Send the FIN in a separate segment * after the bulk sending is done. * We don't trust the TSO implementations * to clear the FIN flag on all but the * last segment. */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; } } else tso = 0; KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. */ if (len) { struct mbuf *mb; u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) TCPSTAT_INC(tcps_sndprobe); else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf * to the offset in the socket buffer chain. */ mb = sbsndptr(&so->so_snd, off, len, &moff); if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t) + hdrlen); m->m_len += len; } else { m->m_next = m_copy(mb, moff, (int)len); if (m->m_next == NULL) { SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if ((off + len == sbused(&so->so_snd)) && !(flags & TH_SYN)) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { SOCKBUF_UNLOCK(&so->so_snd); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(tp->t_inpcb, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(tp->t_inpcb, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(tp->t_inpcb, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup * SYN packet. If we are on a retransmit, we may * resend those bits a number of times as per * RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE|TH_CWR; } else flags |= TH_ECE|TH_CWR; } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_ECN_PERMIT)) { /* * If the peer has ECN, mark data packets with * ECN capable transmission (ECT). * Ignore pure ack packets, retransmissions and window probes. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !((tp->t_flags & TF_FORCEDATA) && len == 1)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); else #endif ip->ip_tos |= IPTOS_ECN_ECT0; TCPSTAT_INC(tcps_ecn_ect0); } /* * Reply with proper ECN notifications. */ if (tp->t_flags & TF_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags &= ~TF_ECN_SND_CWR; } if (tp->t_flags & TF_ECN_SND_ECE) flags |= TH_ECE; } /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } else { th->th_seq = htonl(p->rxmit); p->rxmit += len; tp->sackhint.sack_bytes_rexmit += len; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) recwin = (long)(tp->rcv_adv - tp->rcv_nxt); if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) recwin = (long)TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. The * case is handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data than can be buffered prior to transmitting on * the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #ifdef TCP_SIGNATURE if (to.to_flags & TOF_SIGNATURE) { int sigoff = to.to_signature - opt; tcp_signature_compute(m, 0, len, optlen, (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); } #endif /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. */ if (tso) { KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } #ifdef IPSEC KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u", __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); #else KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u", __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); #endif /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ TCP_PROBE3(debug__output, tp, th, mtod(m, const char *)); /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before checksum calculation, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { struct route_in6 ro; bzero(&ro, sizeof(ro)); /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); /* * Set the packet size here for the benefit of DTrace probes. * ip6_output() will set it properly; it's supposed to include * the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); if (error == EMSGSIZE && ro.ro_rt != NULL) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { - struct route ro; - - bzero(&ro, sizeof(ro)); ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every packet. * This might not be the best thing to do according to RFC3390 * Section 2. However the tcp hostcache migitates the problem * so it affects only the first tcp connection with a host. * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { ip->ip_off |= htons(IP_DF); tp->t_flags2 |= TF2_PLPMTU_PMTUD; } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif - error = ip_output(m, tp->t_inpcb->inp_options, &ro, + error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); - if (error == EMSGSIZE && ro.ro_rt != NULL) - mtu = ro.ro_rt->rt_mtu; - RO_RTFREE(&ro); + if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL) + mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu; } #endif /* INET */ out: /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if ((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); } } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. */ timer: if (!tcp_timer_active(tp, TT_REXMT) && ((sack_rxmit && tp->snd_nxt != tp->snd_max) || (tp->snd_nxt != tp->snd_una))) { if (tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } else if (len == 0 && sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { /* * Avoid a situation where we do not set persist timer * after a zero window condition. For example: * 1) A -> B: packet with enough data to fill the window * 2) B -> A: ACK for #1 + new data (0 window * advertisement) * 3) A -> B: ACK for #2, 0 len packet * * In this case, A will not activate the persist timer, * because it chose to send a packet. Unless tcp_output * is called for some other reason (delayed ack timer, * another input packet from B, socket syscall), A will * not send zero window probes. * * So, if you send a 0-length packet, but there is data * in the socket buffer, and neither the rexmt or * persist timer is already set, then activate the * persist timer. */ tp->t_rxtshift = 0; tcp_setpersist(tp); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. */ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; } if (error) { /* * We know that the packet was lost, so back out the * sequence number advance, if any. * * If the error is EPERM the packet got blocked by the * local firewall. Normally we should terminate the * connection but the blocking may have been spurious * due to a firewall reconfiguration cycle. So we treat * it like a packet loss and let the retransmit timer and * timeouts do their work over time. * XXX: It is a POLA question whether calling tcp_drop right * away would be the really correct behavior instead. */ if (((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) && ((flags & TH_SYN) == 0) && (error != EPERM)) { if (sack_rxmit) { p->rxmit -= len; tp->sackhint.sack_bytes_rexmit -= len; KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); } else tp->snd_nxt -= len; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: if (!tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); tp->snd_cwnd = tp->t_maxseg; return (0); case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. * If TSO was active we either got an interface * without TSO capabilits or TSO was turned off. * If we obtained mtu from ip_output() then update * it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } return (error); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } /* FALLTHROUGH */ default: return (error); } } TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. */ if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); #if 0 /* * This completely breaks TCP if newreno is turned on. What happens * is that if delayed-acks are turned on on the receiver, this code * on the transmitter effectively destroys the TCP window, forcing * it to four packets (1.5Kx4 = 6K window). */ if (sendalot && --maxburst) goto again; #endif if (sendalot) goto again; return (0); } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; tp->t_flags &= ~TF_PREVVALID; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistance timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } /* * Insert TCP options according to the supplied parameters to the place * optp in a consistent way. Can handle unaligned destinations. * * The order of the option processing is crucial for optimal packing and * alignment for the scarce option space. * * The optimal order for a SYN/SYN-ACK segment is: * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. * * The SACK options should be last. SACK blocks consume 8*n+2 bytes. * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). * At minimum we need 10 bytes (to generate 1 SACK block). If both * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, * we only have 10 bytes for SACK options (40 - (12 + 18)). */ int tcp_addoptions(struct tcpopt *to, u_char *optp) { u_int32_t mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) continue; if (optlen == TCP_MAXOLEN) break; switch (to->to_flags & mask) { case TOF_MSS: while (optlen % 4) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) continue; optlen += TCPOLEN_MAXSEG; *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; to->to_mss = htons(to->to_mss); bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); optp += sizeof(to->to_mss); break; case TOF_SCALE: while (!optlen || optlen % 2 != 1) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) continue; optlen += TCPOLEN_WINDOW; *optp++ = TCPOPT_WINDOW; *optp++ = TCPOLEN_WINDOW; *optp++ = to->to_wscale; break; case TOF_SACKPERM: while (optlen % 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) continue; optlen += TCPOLEN_SACK_PERMITTED; *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; break; case TOF_TS: while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) continue; optlen += TCPOLEN_TIMESTAMP; *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; to->to_tsval = htonl(to->to_tsval); to->to_tsecr = htonl(to->to_tsecr); bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); optp += sizeof(to->to_tsval); bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; #ifdef TCP_SIGNATURE case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) continue; optlen += TCPOLEN_SIGNATURE; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; to->to_signature = optp; while (siglen--) *optp++ = 0; break; } #endif case TOF_SACK: { int sackblks = 0; struct sackblk *sack = (struct sackblk *)to->to_sacks; tcp_seq sack_seq; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) continue; optlen += TCPOLEN_SACKHDR; *optp++ = TCPOPT_SACK; sackblks = min(to->to_nsacks, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; while (sackblks--) { sack_seq = htonl(sack->start); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); sack_seq = htonl(sack->end); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); optlen += TCPOLEN_SACK; sack++; } TCPSTAT_INC(tcps_sack_send_blocks); break; } #ifdef TCP_RFC7413 case TOF_FASTOPEN: { int total_len; /* XXX is there any point to aligning this option? */ total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len; if (TCP_MAXOLEN - optlen < total_len) continue; *optp++ = TCPOPT_FAST_OPEN; *optp++ = total_len; if (to->to_tfo_len > 0) { bcopy(to->to_tfo_cookie, optp, to->to_tfo_len); optp += to->to_tfo_len; } optlen += total_len; break; } #endif default: panic("%s: unknown TCP option type", __func__); break; } } /* Terminate and pad TCP options to a 4 byte boundary. */ if (optlen % 4) { optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } /* * According to RFC 793 (STD0007): * "The content of the header beyond the End-of-Option option * must be header padding (i.e., zero)." * and later: "The padding is composed of zeros." */ while (optlen % 4) { optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); return (optlen); } Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c (revision 297224) +++ head/sys/netinet/tcp_subr.c (revision 297225) @@ -1,2993 +1,2997 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #endif #ifdef TCP_RFC7413 #include #endif #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #include #ifdef INET6 #include #endif #include #include #endif /*IPSEC*/ #include #include #include VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif struct rwlock tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); static VNET_DEFINE(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); #ifdef TCP_SIGNATURE static int tcp_sig_checksigs = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, signature_verify_input, CTLFLAG_RW, &tcp_sig_checksigs, 0, "Verify RFC2385 digests on inbound traffic"); #endif VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static void tcp_mtudisc(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); static void tcp_timer_discard(struct tcpcb *, uint32_t); static struct tcp_function_block tcp_def_funcblk = { "default", tcp_output, tcp_do_segment, tcp_default_ctloutput, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0 }; int t_functions_inited = 0; struct tcp_funchead t_functions; static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; static void init_tcp_functions(void) { if (t_functions_inited == 0) { TAILQ_INIT(&t_functions); rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0); t_functions_inited = 1; } } static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk=NULL; TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_fb->tfb_tcp_block_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return(blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk=NULL; struct tcp_function *f; TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(rblk); } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error=ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return(error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct tcp_function *f; char *buffer, *cp; size_t bufsz, outsz; cnt = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { cnt++; } rw_runlock(&tcp_function_lock); bufsz = (cnt+2) * (TCP_FUNCTION_NAME_LEN_MAX + 12) + 1; buffer = malloc(bufsz, M_TEMP, M_WAITOK); error = 0; cp = buffer; linesz = snprintf(cp, bufsz, "\n%-32s%c %s\n", "Stack", 'D', "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { linesz = snprintf(cp, bufsz, "%-32s%c %u\n", f->tf_fb->tfb_tcp_block_name, (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', f->tf_fb->tfb_refcnt); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } rw_runlock(&tcp_function_lock); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; struct cc_var ccv; struct osd osd; }; static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. */ static void tcp_zone_change(void *tag) { uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_tcpcb_zone, maxsockets); tcp_tw_zone_change(); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } int register_tcp_functions(struct tcp_function_block *blk, int wait) { struct tcp_function_block *lblk; struct tcp_function *n; struct tcp_function_set fs; if (t_functions_inited == 0) { init_tcp_functions(); } if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* * These functions are required and you * need a name. */ return (EINVAL); } if (blk->tfb_tcp_timer_stop_all || blk->tfb_tcp_timers_left || blk->tfb_tcp_timer_activate || blk->tfb_tcp_timer_active || blk->tfb_tcp_timer_stop) { /* * If you define one timer function you * must have them all. */ if ((blk->tfb_tcp_timer_stop_all == NULL) || (blk->tfb_tcp_timers_left == NULL) || (blk->tfb_tcp_timer_activate == NULL) || (blk->tfb_tcp_timer_active == NULL) || (blk->tfb_tcp_timer_stop == NULL)) { return (EINVAL); } } n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (n == NULL) { return (ENOMEM); } n->tf_fb = blk; strcpy(fs.function_set_name, blk->tfb_tcp_block_name); rw_wlock(&tcp_function_lock); lblk = find_tcp_functions_locked(&fs); if (lblk) { /* Duplicate name space not allowed */ rw_wunlock(&tcp_function_lock); free(n, M_TCPFUNCTIONS); return (EALREADY); } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_flags = 0; TAILQ_INSERT_TAIL(&t_functions, n, tf_next); rw_wunlock(&tcp_function_lock); return(0); } int deregister_tcp_functions(struct tcp_function_block *blk) { struct tcp_function_block *lblk; struct tcp_function *f; int error=ENOENT; if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { /* You can't un-register the default */ return (EPERM); } rw_wlock(&tcp_function_lock); if (blk == tcp_func_set_ptr) { /* You can't free the current default */ rw_wunlock(&tcp_function_lock); return (EBUSY); } if (blk->tfb_refcnt) { /* Still tcb attached, mark it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; rw_wunlock(&tcp_function_lock); return (EBUSY); } lblk = find_tcp_fb_locked(blk, &f); if (lblk) { /* Found */ TAILQ_REMOVE(&t_functions, f, tf_next); f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); error = 0; } rw_wunlock(&tcp_function_lock); return (error); } void tcp_init(void) { const char *tcbhash_tuneable; int hashsize; tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); if (hashsize == 0) { /* * Auto tune the hash size based on maxsockets. * A perfect hash would have a 1:1 mapping * (hashsize = maxsockets) however it's been * suggested that O(2) average is better. */ hashsize = maketcp_hashsize(maxsockets / 4); /* * Our historical default is 512, * do not autotune lower than this. */ if (hashsize < 512) hashsize = 512; if (bootverbose && IS_DEFAULT_VNET(curvnet)) printf("%s: %s auto tuned to %d\n", __func__, tcbhash_tuneable, hashsize); } /* * We require a hashsize to be a power of two. * Previously if it was not a power of two we would just reset it * back to 512, which could be a nasty surprise if you did not notice * the error message. * Instead what we do is clip it to the closest power of two lower * than the specified hash value. */ if (!powerof2(hashsize)) { int oldhashsize = hashsize; hashsize = maketcp_hashsize(hashsize); /* prevent absurdly low value */ if (hashsize < 16) hashsize = 16; printf("%s: WARNING: TCB hash size not a power of 2, " "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. */ V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_tcpcb_zone, maxsockets); uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); tcp_tw_init(); syncache_init(); tcp_hc_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; tcp_reass_global_init(); /* XXX virtualize those bellow? */ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_persmin = TCPTV_PERSMIN; tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; /* Setup the tcp function block list */ init_tcp_functions(); register_tcp_functions(&tcp_def_funcblk, M_WAITOK); if (tcp_soreceive_stream) { #ifdef INET tcp_usrreqs.pru_soreceive = soreceive_stream; #endif #ifdef INET6 tcp6_usrreqs.pru_soreceive = soreceive_stream; #endif /* INET6 */ } #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); #ifdef TCPPCAP tcp_pcap_init(); #endif #ifdef TCP_RFC7413 tcp_fastopen_init(); #endif } #ifdef VIMAGE void tcp_destroy(void) { int error; #ifdef TCP_RFC7413 tcp_fastopen_destroy(); #endif tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); in_pcbinfo_destroy(&V_tcbinfo); uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } } #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then th must point to *inside* the mbuf. */ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { struct tcpopt to; struct inpcb *inp; struct ip *ip; struct mbuf *optm; struct tcphdr *nth; u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int optlen, tlen, win; bool incl_opts; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); INP_WLOCK_ASSERT(inp); } else inp = NULL; incl_opts = false; win = 0; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } if ((tp->t_flags & TF_NOOPT) == 0) incl_opts = true; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { /* * reuse the mbuf. * XXX MRT We inherrit the FIB, which is lucky. */ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } tlen = 0; #ifdef INET6 if (isipv6) tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tlen = sizeof (struct tcpiphdr); #endif #ifdef INVARIANTS m->m_len = 0; KASSERT(M_TRAILINGSPACE(m) >= tlen, ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", m, tlen, (long)M_TRAILINGSPACE(m))); #endif m->m_len = tlen; to.to_flags = 0; if (incl_opts) { /* Make sure we have room. */ if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { m->m_next = m_get(M_NOWAIT, MT_DATA); if (m->m_next) { optp = mtod(m->m_next, u_char *); optm = m->m_next; } else incl_opts = false; } else { optp = (u_char *) (nth + 1); optm = m; } } if (incl_opts) { /* Timestamps. */ if (tp->t_flags & TF_RCVD_TSTMP) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } #ifdef TCP_SIGNATURE /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* Add the options. */ tlen += optlen = tcp_addoptions(&to, optp); /* Update m_len in the correct mbuf. */ optm->m_len += optlen; } else optlen = 0; #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(tlen); ip->ip_ttl = V_ip_defttl; if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); } #endif m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_WLOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #ifdef TCP_SIGNATURE if (to.to_flags & TOF_SIGNATURE) { tcp_signature_compute(m, 0, 0, optlen, to.to_signature, IPSEC_DIR_OUTBOUND); } #endif m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; nth->th_sum = in6_cksum_pseudo(ip6, tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : NULL, NULL); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); } #endif /* INET */ #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif TCP_PROBE3(debug__output, tp, th, mtod(m, const char *)); if (flags & TH_RST) TCP_PROBE5(accept__refused, NULL, NULL, mtod(m, const char *), tp, nth); TCP_PROBE5(send, NULL, tp, mtod(m, const char *), tp, nth); #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, 0, NULL, NULL, inp); #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET (void) ip_output(m, NULL, NULL, 0, NULL, inp); #endif } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; /* Initialise cc_var struct for this tcpcb. */ tp->ccv = &tm->ccv; tp->ccv->type = IPPROTO_TCP; tp->ccv->ccvc.tcp = tp; rw_rlock(&tcp_function_lock); tp->t_fb = tcp_func_set_ptr; refcount_acquire(&tp->t_fb->tfb_refcnt); rw_runlock(&tcp_function_lock); if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } /* * Use the current system default CC algorithm. */ CC_LIST_RLOCK(); KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); CC_ALGO(tp) = CC_DEFAULT(); CC_LIST_RUNLOCK(); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } tp->osd = &tm->osd; if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } #ifdef VIMAGE tp->t_vnet = inp->inp_vnet; #endif tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, 1); callout_init(&tp->t_timers->tt_persist, 1); callout_init(&tp->t_timers->tt_keep, 1); callout_init(&tp->t_timers->tt_2msl, 1); callout_init(&tp->t_timers->tt_delack, 1); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); /* * The tcpcb will hold a reference on its inpcb until tcp_discardcb() * is called. */ in_pcbref(inp); /* Reference for tcpcb */ tp->t_inpcb = inp; /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; #ifdef TCPPCAP /* * Init the TCP PCAP queues. */ tcp_pcap_tcpcb_init(tp); #endif return (tp); /* XXX */ } /* * Switch the congestion control algorithm back to NewReno for any active * control blocks using an algorithm which is about to go away. * This ensures the CC framework can allow the unload to proceed without leaving * any dangling pointers which would trigger a panic. * Returning non-zero would inform the CC framework that something went wrong * and it would be unsafe to allow the unload to proceed. However, there is no * way for this to occur with this implementation so we always return zero. */ int tcp_ccalgounload(struct cc_algo *unload_algo) { struct cc_algo *tmpalgo; struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); /* * Check all active control blocks across all network stacks and change * any that are using "unload_algo" back to NewReno. If "unload_algo" * requires cleanup code to be run, call it. */ VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INP_INFO_WLOCK(&V_tcbinfo); /* * New connections already part way through being initialised * with the CC algo we're removing will not race with this code * because the INP_INFO_WLOCK is held during initialisation. We * therefore don't enter the loop below until the connection * list has stabilised. */ LIST_FOREACH(inp, &V_tcb, inp_list) { INP_WLOCK(inp); /* Important to skip tcptw structs. */ if (!(inp->inp_flags & INP_TIMEWAIT) && (tp = intotcpcb(inp)) != NULL) { /* * By holding INP_WLOCK here, we are assured * that the connection is not currently * executing inside the CC module's functions * i.e. it is safe to make the switch back to * NewReno. */ if (CC_ALGO(tp) == unload_algo) { tmpalgo = CC_ALGO(tp); /* NewReno does not require any init. */ CC_ALGO(tp) = &newreno_cc_algo; if (tmpalgo->cb_destroy != NULL) tmpalgo->cb_destroy(tp->ccv); } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (0); } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); (void) tp->t_fb->tfb_tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ int released; INP_WLOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we delete the * PCB. * * If stopping a timer fails, we schedule a discard function in same * callout, and the last discard function called will take care of * deleting the tcpcb. */ tcp_timer_stop(tp, TT_REXMT); tcp_timer_stop(tp, TT_PERSIST); tcp_timer_stop(tp, TT_KEEP); tcp_timer_stop(tp, TT_2MSL); tcp_timer_stop(tp, TT_DELACK); if (tp->t_fb->tfb_tcp_timer_stop_all) { /* Call the stop-all function of the methods */ tp->t_fb->tfb_tcp_timer_stop_all(tp); } /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; u_long ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occured on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } /* free the reassembly queue, if any */ tcp_reass_flush(tp); #ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif tcp_free_sackholes(tp); #ifdef TCPPCAP /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tp->t_inpkts)); tcp_pcap_drain(&(tp->t_outpkts)); #endif /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); khelp_destroy_osd(tp->osd); CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; if ((tp->t_timers->tt_flags & TT_MASK) == 0) { /* We own the last reference on tcpcb, let's free it. */ if ((tp->t_fb->tfb_tcp_timers_left) && (tp->t_fb->tfb_tcp_timers_left(tp))) { /* Some fb timers left running! */ return; } if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); released = in_pcbrele_wlocked(inp); KASSERT(!released, ("%s: inp %p should not have been released " "here", __func__, inp)); } } void tcp_timer_2msl_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_2MSL); } void tcp_timer_keep_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_KEEP); } void tcp_timer_persist_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_PERSIST); } void tcp_timer_rexmt_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_REXMT); } void tcp_timer_delack_discard(void *xtp) { tcp_timer_discard((struct tcpcb *)xtp, TT_DELACK); } void tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type) { struct inpcb *inp; CURVNET_SET(tp->t_vnet); INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0, ("%s: tcpcb has to be stopped here", __func__)); KASSERT((tp->t_timers->tt_flags & timer_type) != 0, ("%s: discard callout should be running", __func__)); tp->t_timers->tt_flags &= ~timer_type; if ((tp->t_timers->tt_flags & TT_MASK) == 0) { /* We own the last reference on this tcpcb, let's free it. */ if ((tp->t_fb->tfb_tcp_timers_left) && (tp->t_fb->tfb_tcp_timers_left(tp))) { /* Some fb timers left running! */ goto leave; } if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); if (in_pcbrele_wlocked(inp)) { INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } } leave: INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif #ifdef TCP_RFC7413 /* * This releases the TFO pending counter resource for TFO listen * sockets as well as passively-created TFO sockets that transition * from SYN_RECEIVED to CLOSED. */ if (tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } #endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); TCPSTATES_DEC(tp->t_state); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_flags & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_flags & INP_TIMEWAIT) continue; INP_WLOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); } INP_WUNLOCK(inpb); } INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { + if (inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; + } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = V_tcbinfo.ipi_count + counter_u64_fetch(VNET(tcps_states)[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ INP_LIST_RLOCK(&V_tcbinfo); gencnt = V_tcbinfo.ipi_gencnt; n = V_tcbinfo.ipi_count; INP_LIST_RUNLOCK(&V_tcbinfo); m = counter_u64_fetch(VNET(tcps_states)[TCPS_SYN_RECEIVED]); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n + m; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req, m, &pcb_count); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); INP_INFO_WLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_flags & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. */ } else error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) { in_pcbref(inp); inp_list[i++] = inp; } } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb == NULL) bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); else if (inp->inp_flags & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; } else { bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); if (xt.xt_tp.t_timers) tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer); } if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { bzero(&xt.xt_socket, sizeof xt.xt_socket); xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else INP_RUNLOCK(inp); } INP_INFO_RLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_LIST_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + pcb_count; INP_LIST_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; - /* - * Redirects don't need to be handled up here. - */ - else if (PRC_IS_REDIRECT(cmd)) + else if (PRC_IS_REDIRECT(cmd)) { + /* signal EHOSTDOWN, as it flushes the cached route */ + in_pcbnotifyall(&tcbinfo, faddr, EHOSTDOWN, notify); return; + } /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip == NULL) { in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); return; } icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_RLOCK(&V_tcbinfo); inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { icmp_tcp_seq = ntohl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. */ if (!mtu) mtu = ip_next_mtu( ntohs(ip->ip_len), 1); if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) mtu = V_tcp_minmss + sizeof(struct tcpiphdr); /* * Only process the offered MTU if it * is smaller than the current one. */ if (mtu < tp->t_maxseg + sizeof(struct tcpiphdr)) { bzero(&inc, sizeof(inc)); inc.inc_faddr = faddr; inc.inc_fibnum = inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); } } else inp = (*notify)(inp, inetctlerrmap[cmd]); } } if (inp != NULL) INP_WUNLOCK(inp); } else { bzero(&inc, sizeof(inc)); inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; syncache_unreach(&inc, th); } INP_INFO_RUNLOCK(&V_tcbinfo); } #endif /* INET */ #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6 != NULL) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. */ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); bzero(&inc, sizeof(inc)); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_flags |= INC_ISIPV6; INP_INFO_RLOCK(&V_tcbinfo); syncache_unreach(&inc, &th); INP_INFO_RUNLOCK(&V_tcbinfo); } else in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) static VNET_DEFINE(u_char, isn_secret[32]); static VNET_DEFINE(int, isn_last); static VNET_DEFINE(int, isn_last_reseed); static VNET_DEFINE(u_int32_t, isn_offset); static VNET_DEFINE(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct tcpcb *tp) { MD5_CTX isn_ctx; u_int32_t md5_buffer[4]; tcp_seq new_isn; u_int32_t projected_offset; INP_WLOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&V_isn_secret, sizeof(V_isn_secret)); V_isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { tcp_mtudisc(inp, -1); return (inp); } static void tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return; tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); tp->t_fb->tfb_tcp_output(tp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ u_long tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop4_extended nh4; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); if (inc->inc_faddr.s_addr != INADDR_ANY) { if (fib4_lookup_nh_ext(inc->inc_fibnum, inc->inc_faddr, NHR_REF, 0, &nh4) != 0) return (0); ifp = nh4.nh_ifp; maxmtu = nh4.nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } fib4_free_nh_ext(inc->inc_fibnum, &nh4); } return (maxmtu); } #endif /* INET */ #ifdef INET6 u_long tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop6_extended nh6; struct in6_addr dst6; uint32_t scopeid; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); if (fib6_lookup_nh_ext(inc->inc_fibnum, &dst6, scopeid, 0, 0, &nh6) != 0) return (0); ifp = nh6.nh_ifp; maxmtu = nh6.nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } fib6_free_nh_ext(inc->inc_fibnum, &nh6); } return (maxmtu); } #endif /* INET6 */ /* * Calculate effective SMSS per RFC5681 definition for a given TCP * connection at its current state, taking into account SACK and etc. */ u_int tcp_maxseg(const struct tcpcb *tp) { u_int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We might make mistakes with padding here in some edge cases, * but this is harmless, since result of tcp_maxseg() is used * only in cwnd and ssthresh estimations. */ #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { optlen += TCPOLEN_SACKHDR; optlen += tp->rcv_numsacks * TCPOLEN_SACK; optlen = PAD(optlen); } } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PAD(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PAD(TCPOLEN_WINDOW); #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PAD(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL) || (!key_havesp(IPSEC_DIR_OUTBOUND))) return (0); m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) return (0); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcpip_fillheaders(inp, ip6, th); hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcpip_fillheaders(inp, ip, th); hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return (hdrsiz); } #endif /* IPSEC */ #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data * contained within an mbuf chain. */ static int tcp_signature_apply(void *fstate, void *data, u_int len) { MD5Update(fstate, (u_char *)data, len); return (0); } /* * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. This is hardcoded elsewhere to 1179 */ struct secasvar * tcp_get_sav(struct mbuf *m, u_int direction) { union sockaddr_union dst; struct secasvar *sav; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; char ip6buf[INET6_ADDRSTRLEN]; #endif /* Extract the destination from the IP header in the mbuf. */ bzero(&dst, sizeof(union sockaddr_union)); ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; /* Make the compiler happy. */ #endif switch (ip->ip_v) { #ifdef INET case IPVERSION: dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; break; #endif #ifdef INET6 case (IPV6_VERSION >> 4): ip6 = mtod(m, struct ip6_hdr *); dst.sa.sa_len = sizeof(struct sockaddr_in6); dst.sa.sa_family = AF_INET6; dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ? ip6->ip6_src : ip6->ip6_dst; break; #endif default: return (NULL); /* NOTREACHED */ break; } /* Look up an SADB entry which matches the address of the peer. */ sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); if (sav == NULL) { ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__, (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) : #ifdef INET6 (ip->ip_v == (IPV6_VERSION >> 4)) ? ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : #endif "(unsupported)")); } return (sav); } /* * Compute TCP-MD5 hash of a TCP segment. (RFC2385) * * Parameters: * m pointer to head of mbuf chain * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * sav pointer to security assosiation * * We do this over ip, tcphdr, segment data, and the key in the SADB. * When called from tcp_input(), we can be sure that th_sum has been * zeroed out and verified already. * * Releases reference to SADB key before return. * * Return 0 if successful, otherwise return -1. * */ int tcp_signature_do_compute(struct mbuf *m, int len, int optlen, u_char *buf, struct secasvar *sav) { #ifdef INET struct ippseudo ippseudo; #endif MD5_CTX ctx; int doff; struct ip *ip; #ifdef INET struct ipovly *ipovly; #endif struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6; struct in6_addr in6; uint32_t plen; uint16_t nhdr; #endif u_short savecsum; KASSERT(m != NULL, ("NULL mbuf chain")); KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; /* Make the compiler happy. */ #endif MD5Init(&ctx); /* * Step 1: Update MD5 hash with IP(v6) pseudo-header. * * XXX The ippseudo header MUST be digested in network byte order, * or else we'll fail the regression test. Assume all fields we've * been doing arithmetic on have been in host byte order. * XXX One cannot depend on ipovly->ih_len here. When called from * tcp_output(), the underlying ip_len member has not yet been set. */ switch (ip->ip_v) { #ifdef INET case IPVERSION: ipovly = (struct ipovly *)ip; ippseudo.ippseudo_src = ipovly->ih_src; ippseudo.ippseudo_dst = ipovly->ih_dst; ippseudo.ippseudo_pad = 0; ippseudo.ippseudo_p = IPPROTO_TCP; ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip)); doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen; break; #endif #ifdef INET6 /* * RFC 2385, 2.0 Proposal * For IPv6, the pseudo-header is as described in RFC 2460, namely the * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero- * extended next header value (to form 32 bits), and 32-bit segment * length. * Note: Upper-Layer Packet Length comes before Next Header. */ case (IPV6_VERSION >> 4): in6 = ip6->ip6_src; in6_clearscope(&in6); MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); in6 = ip6->ip6_dst; in6_clearscope(&in6); MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); plen = htonl(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&plen, sizeof(uint32_t)); nhdr = 0; MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); nhdr = IPPROTO_TCP; MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr)); doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen; break; #endif default: KEY_FREESAV(&sav); return (-1); /* NOTREACHED */ break; } /* * Step 2: Update MD5 hash with TCP header, excluding options. * The TCP checksum must be set to zero. */ savecsum = th->th_sum; th->th_sum = 0; MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); th->th_sum = savecsum; /* * Step 3: Update MD5 hash with TCP segment data. * Use m_apply() to avoid an early m_pullup(). */ if (len > 0) m_apply(m, doff, len, tcp_signature_apply, &ctx); /* * Step 4: Update MD5 hash with shared secret. */ MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth)); MD5Final(buf, &ctx); key_sa_recordxfer(sav, m); KEY_FREESAV(&sav); return (0); } /* * Compute TCP-MD5 hash of a TCP segment. (RFC2385) * * Return 0 if successful, otherwise return -1. */ int tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, u_char *buf, u_int direction) { struct secasvar *sav; if ((sav = tcp_get_sav(m, direction)) == NULL) return (-1); return (tcp_signature_do_compute(m, len, optlen, buf, sav)); } /* * Verify the TCP-MD5 hash of a TCP segment. (RFC2385) * * Parameters: * m pointer to head of mbuf chain * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) * * Return 1 if successful, otherwise return 0. */ int tcp_signature_verify(struct mbuf *m, int off0, int tlen, int optlen, struct tcpopt *to, struct tcphdr *th, u_int tcpbflag) { char tmpdigest[TCP_SIGLEN]; if (tcp_sig_checksigs == 0) return (1); if ((tcpbflag & TF_SIGNATURE) == 0) { if ((to->to_flags & TOF_SIGNATURE) != 0) { /* * If this socket is not expecting signature but * the segment contains signature just fail. */ TCPSTAT_INC(tcps_sig_err_sigopt); TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } /* Signature is not expected, and not present in segment. */ return (1); } /* * If this socket is expecting signature but the segment does not * contain any just fail. */ if ((to->to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_nosigopt); TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } if (tcp_signature_compute(m, off0, tlen, optlen, &tmpdigest[0], IPSEC_DIR_INBOUND) == -1) { TCPSTAT_INC(tcps_sig_err_buildsig); TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } if (bcmp(to->to_signature, &tmpdigest[0], TCP_SIGLEN) != 0) { TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } TCPSTAT_INC(tcps_sig_rcvgoodsig); return (1); } #endif /* TCP_SIGNATURE */ static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } INP_INFO_RLOCK(&V_tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { if (inp->inp_flags & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); else INP_WUNLOCK(inp); } else if (!(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; struct ip *ip; #ifdef INET6 const struct ip6_hdr *ip6; ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ ip = (struct ip *)ip4hdr; /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } /* * A subroutine which makes it easy to track TCP state changes with DTrace. * This function shouldn't be called for t_state initializations that don't * correspond to actual TCP state transitions. */ void tcp_state_change(struct tcpcb *tp, int newstate) { #if defined(KDTRACE_HOOKS) int pstate = tp->t_state; #endif TCPSTATES_DEC(tp->t_state); TCPSTATES_INC(newstate); tp->t_state = newstate; TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); } Index: head/sys/netinet/tcp_timer.c =================================================================== --- head/sys/netinet/tcp_timer.c (revision 297224) +++ head/sys/netinet/tcp_timer.c (revision 297225) @@ -1,1014 +1,1016 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif int tcp_persmin; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); int tcp_persmax; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); int tcp_keepinit; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); int tcp_keepidle; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); int tcp_keepintvl; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); int tcp_delacktime; SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", "Time before a delayed ACK is sent"); int tcp_msl; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); int tcp_rexmit_min; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", "Minimum Retransmission Timeout"); int tcp_rexmit_slop; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", "Retransmission Timer Slop"); static int always_keepalive = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); int tcp_fast_finwait2_recycle = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, &tcp_fast_finwait2_recycle, 0, "Recycle closed FIN_WAIT_2 connections faster"); int tcp_finwait2_timeout; SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); int tcp_keepcnt = TCPTV_KEEPCNT; SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, "Number of keepalive probes to send"); /* max idle probes */ int tcp_maxpersistidle; static int tcp_rexmit_drop_options = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_detect), 0, "Path MTU Discovery Black Hole Detection Enabled"); static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); #define V_tcp_pmtud_blackhole_activated \ VNET(tcp_pmtud_blackhole_activated) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, CTLFLAG_RD|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_activated), 0, "Path MTU Discovery Black Hole Detection, Activation Count"); static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); #define V_tcp_pmtud_blackhole_activated_min_mss \ VNET(tcp_pmtud_blackhole_activated_min_mss) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, CTLFLAG_RD|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, CTLFLAG_RD|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_failed), 0, "Path MTU Discovery Black Hole Detection, Failure Count"); #ifdef INET static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_pmtud_blackhole_mss), 0, "Path MTU Discovery Black Hole Detection lowered MSS"); #endif #ifdef INET6 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, CTLFLAG_RW|CTLFLAG_VNET, &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); #endif #ifdef RSS static int per_cpu_timers = 1; #else static int per_cpu_timers = 0; #endif SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, &per_cpu_timers , 0, "run tcp timers on all cpus"); #if 0 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) #endif /* * Map the given inp to a CPU id. * * This queries RSS if it's compiled in, else it defaults to the current * CPU ID. */ static inline int inp_to_cpuid(struct inpcb *inp) { u_int cpuid; #ifdef RSS if (per_cpu_timers) { cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) return (curcpu); /* XXX */ else return (cpuid); } #else /* Legacy, pre-RSS behaviour */ if (per_cpu_timers) { /* * We don't have a flowid -> cpuid mapping, so cheat and * just map unknown cpuids to curcpu. Not the best, but * apparently better than defaulting to swi 0. */ cpuid = inp->inp_flowid % (mp_maxid + 1); if (! CPU_ABSENT(cpuid)) return (cpuid); return (curcpu); } #endif /* Default for RSS and non-RSS - cpuid 0 */ else { return (0); } } /* * Tcp protocol timeout routine called every 500 ms. * Updates timestamps used for TCP * causes finite state machine actions if timers expire. */ void tcp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); (void) tcp_tw_2msl_scan(0); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; int tcp_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ /* * TCP timer processing. */ void tcp_timer_delack(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_delack) || !callout_active(&tp->t_timers->tt_delack)) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_delack); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0, ("%s: tp %p delack callout should be running", __func__, tp)); tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); } void tcp_timer_2msl(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); tcp_free_sackholes(tp); if (callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { INP_WUNLOCK(tp->t_inpcb); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_2msl); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0, ("%s: tp %p 2msl callout should be running", __func__, tp)); /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle * too long delete connection control block. Otherwise, check * again in a bit. * * If in TIME_WAIT state just ignore as this timeout is handled in * tcp_tw_2msl_scan(). * * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. * Ignore fact that there were recent incoming segments. */ if ((inp->inp_flags & INP_TIMEWAIT) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); tp = tcp_close(tp); } else { if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { if (!callout_reset(&tp->t_timers->tt_2msl, TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) { tp->t_timers->tt_flags &= ~TT_2MSL_RST; } } else tp = tcp_close(tp); } #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_keep(void *xtp) { struct tcpcb *tp = xtp; struct tcptemp *t_template; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_keep); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0, ("%s: tp %p keep callout should be running", __func__, tp)); /* * Keep-alive timer went off; send something * or drop connection if idle for too long. */ TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response * if the peer is up and reachable: * either an ACK if the connection is still alive, * or an RST if the peer has closed the connection * due to timeout or reboot. * Using sequence number tp->snd_una-1 * causes the transmitted zero-length segment * to lie outside the receive window; * by the protocol spec, this requires the * correspondent TCP to respond. */ TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), tcp_timer_keep, tp)) { tp->t_timers->tt_flags &= ~TT_KEEP_RST; } } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), tcp_timer_keep, tp)) { tp->t_timers->tt_flags &= ~TT_KEEP_RST; } #ifdef TCPDEBUG if (inp->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; dropit: TCPSTAT_INC(tcps_keepdrops); tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_persist(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_persist); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0, ("%s: tp %p persist callout should be running", __func__, tp)); /* * Persistance timer into zero window. * Force a byte to be output, if possible. */ TCPSTAT_INC(tcps_persisttimeo); /* * Hack: if the peer is dead/unreachable, we do not * time out if the window is closed. After a full * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); tp = tcp_drop(tp, ETIMEDOUT); goto out; } /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. */ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); tp = tcp_drop(tp, ETIMEDOUT); goto out; } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; (void) tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; out: #ifdef TCPDEBUG if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_rexmt(void * xtp) { struct tcpcb *tp = xtp; CURVNET_SET(tp->t_vnet); int rexmt; int headlocked; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_rexmt); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0, ("%s: tp %p rexmt callout should be running", __func__, tp)); tcp_free_sackholes(tp); /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. */ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); headlocked = 1; goto out; } INP_INFO_RUNLOCK(&V_tcbinfo); headlocked = 0; if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be * limited to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can * be recovered if this turns out to be a "bad" retransmit. * A retransmit is considered "bad" if an ACK for this * segment is received within RTT/2 interval; the assumption * here is that the ACK was already in flight. See * "On Estimating End-to-End Network Path Properties" by * Allman and Paxson for more details. */ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; if (IN_CONGRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple of * packets and process straight to FIN. In that case we won't catch * ESTABLISHED state. */ if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) || (tp->t_state == TCPS_FIN_WAIT_1))) { #ifdef INET6 int isipv6; #endif /* * Idea here is that at each stage of mtu probe (usually, 1448 * -> 1188 -> 524) should be given 2 chances to recover before * further clamping down. 'tp->t_rxtshift % 2 == 0' should * take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: * - Disable Path MTU Discovery (IP "DF" bit). * - Reduce MTU to lower value than what we * negotiated with peer. */ /* Record that we may have found a black hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ tp->t_pmtud_saved_maxseg = tp->t_maxseg; /* * Reduce the MSS to blackhole value or to the default * in an attempt to retransmit. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; if (isipv6 && tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; V_tcp_pmtud_blackhole_activated++; } else if (isipv6) { /* Use the default MSS. */ tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; V_tcp_pmtud_blackhole_activated_min_mss++; } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_pmtud_blackhole_mss; V_tcp_pmtud_blackhole_activated++; } else { /* Use the default MSS. */ tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch to * minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; V_tcp_pmtud_blackhole_activated_min_mss++; } #endif /* * Reset the slow-start flight size * as it may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole and * we restore the previous MSS and blackhole detection * flags. * The limit '6' is determined by giving each probe * stage (1448, 1188, 524) 2 chances to recover. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift > 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; V_tcp_pmtud_blackhole_failed++; /* * Reset the slow-start flight size as it * may depend on the new MSS. */ if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } } } /* * Disable RFC1323 and SACK if we haven't got any response to * our third SYN to work-around some broken terminal servers * (most of which have hopefully been retired) that have bad VJ * header compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current * retransmit times until then. */ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); + else #endif + in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } tp->snd_nxt = tp->snd_una; tp->snd_recover = tp->snd_max; /* * Force a segment to be sent. */ tp->t_flags |= TF_ACKNOW; /* * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; cc_cong_signal(tp, NULL, CC_RTO); (void) tp->t_fb->tfb_tcp_output(tp); out: #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); if (tp != NULL) INP_WUNLOCK(inp); if (headlocked) INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } void tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) { struct callout *t_callout; timeout_t *f_callout; struct inpcb *inp = tp->t_inpcb; int cpu = inp_to_cpuid(inp); uint32_t f_reset; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; #endif if (tp->t_timers->tt_flags & TT_STOPPED) return; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack; f_reset = TT_DELACK_RST; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt; f_reset = TT_REXMT_RST; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist; f_reset = TT_PERSIST_RST; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_callout = tcp_timer_keep; f_reset = TT_KEEP_RST; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl; f_reset = TT_2MSL_RST; break; default: if (tp->t_fb->tfb_tcp_timer_activate) { tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); return; } panic("tp %p bad timer_type %#x", tp, timer_type); } if (delta == 0) { if ((tp->t_timers->tt_flags & timer_type) && (callout_stop(t_callout) > 0) && (tp->t_timers->tt_flags & f_reset)) { tp->t_timers->tt_flags &= ~(timer_type | f_reset); } } else { if ((tp->t_timers->tt_flags & timer_type) == 0) { tp->t_timers->tt_flags |= (timer_type | f_reset); callout_reset_on(t_callout, delta, f_callout, tp, cpu); } else { /* Reset already running callout on the same CPU. */ if (!callout_reset(t_callout, delta, f_callout, tp)) { /* * Callout not cancelled, consider it as not * properly restarted. */ tp->t_timers->tt_flags &= ~f_reset; } } } } int tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; break; default: if (tp->t_fb->tfb_tcp_timer_active) { return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); } panic("tp %p bad timer_type %#x", tp, timer_type); } return callout_active(t_callout); } void tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; timeout_t *f_callout; uint32_t f_reset; tp->t_timers->tt_flags |= TT_STOPPED; switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack_discard; f_reset = TT_DELACK_RST; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt_discard; f_reset = TT_REXMT_RST; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist_discard; f_reset = TT_PERSIST_RST; break; case TT_KEEP: t_callout = &tp->t_timers->tt_keep; f_callout = tcp_timer_keep_discard; f_reset = TT_KEEP_RST; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl_discard; f_reset = TT_2MSL_RST; break; default: if (tp->t_fb->tfb_tcp_timer_stop) { /* * XXXrrs we need to look at this with the * stop case below (flags). */ tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); return; } panic("tp %p bad timer_type %#x", tp, timer_type); } if (tp->t_timers->tt_flags & timer_type) { if ((callout_stop(t_callout) > 0) && (tp->t_timers->tt_flags & f_reset)) { tp->t_timers->tt_flags &= ~(timer_type | f_reset); } else { /* * Can't stop the callout, defer tcpcb actual deletion * to the last tcp timer discard callout. * The TT_STOPPED flag will ensure that no tcp timer * callouts can be restarted on our behalf, and * past this point currently running callouts waiting * on inp lock will return right away after the * classical check for callout reset/stop events: * callout_pending() || !callout_active() */ callout_reset(t_callout, 1, f_callout, tp); } } } #define ticks_to_msecs(t) (1000*(t) / hz) void tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer) { sbintime_t now; bzero(xtimer, sizeof(*xtimer)); if (timer == NULL) return; now = getsbinuptime(); if (callout_active(&timer->tt_delack)) xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_rexmt)) xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_persist)) xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_keep)) xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_2msl)) xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); } Index: head/sys/netinet/udp_usrreq.c =================================================================== --- head/sys/netinet/udp_usrreq.c (revision 297224) +++ head/sys/netinet/udp_usrreq.c (revision 297225) @@ -1,1915 +1,1933 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef IPSEC #include #include #endif #include #include /* * UDP and UDP-Lite protocols implementation. * Per RFC 768, August, 1980. * Per RFC 3828, July, 2004. */ /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed * packets that would otherwise be discarded due to bad checksums, and may * cause problems (especially for NFS data blocks). */ VNET_DEFINE(int, udp_cksum) = 1; SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_cksum), 0, "compute udp checksum"); int udp_log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &udp_log_in_vain, 0, "Log all incoming UDP packets"); VNET_DEFINE(int, udp_blackhole) = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_blackhole), 0, "Do not send port unreachables for refused connects"); u_long udp_sendspace = 9216; /* really max datagram size */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */ VNET_DEFINE(struct inpcbinfo, udbinfo); VNET_DEFINE(struct inpcbhead, ulitecb); VNET_DEFINE(struct inpcbinfo, ulitecbinfo); static VNET_DEFINE(uma_zone_t, udpcb_zone); #define V_udpcb_zone VNET(udpcb_zone) #ifndef UDBHASHSIZE #define UDBHASHSIZE 128 #endif VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ VNET_PCPUSTAT_SYSINIT(udpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(udpstat); #endif /* VIMAGE */ #ifdef INET static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); #endif #ifdef IPSEC #ifdef IPSEC_NAT_T #define UF_ESPINUDP_ALL (UF_ESPINUDP_NON_IKE|UF_ESPINUDP) #ifdef INET static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int); #endif #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ static void udp_zone_change(void *tag) { uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_udpcb_zone, maxsockets); } static int udp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpinp"); return (0); } static int udplite_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpliteinp"); return (0); } void udp_init(void) { /* * For now default to 2-tuple UDP hashing - until the fragment * reassembly code can also update the flowid. * * Once we can calculate the flowid that way and re-establish * a 4-tuple, flip this to 4-tuple. */ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, "udp_inpcb", udp_inpcb_init, NULL, 0, IPI_HASHFIELDS_2TUPLE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_udpcb_zone, maxsockets); uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } void udplite_init(void) { in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE, UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL, 0, IPI_HASHFIELDS_2TUPLE); } /* * Kernel module interface for updating udpstat. The argument is an index * into udpstat treated as an array of u_long. While this encodes the * general layout of udpstat into the caller, it doesn't encode its location, * so that future changes to add, for example, per-CPU stats support won't * cause binary compatibility problems for kernel modules. */ void kmod_udpstat_inc(int statnum) { counter_u64_add(VNET(udpstat)[statnum], 1); } int udp_newudpcb(struct inpcb *inp) { struct udpcb *up; up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO); if (up == NULL) return (ENOBUFS); inp->inp_ppcb = up; return (0); } void udp_discardcb(struct udpcb *up) { uma_zfree(V_udpcb_zone, up); } #ifdef VIMAGE void udp_destroy(void) { in_pcbinfo_destroy(&V_udbinfo); uma_zdestroy(V_udpcb_zone); } void udplite_destroy(void) { in_pcbinfo_destroy(&V_ulitecbinfo); } #endif #ifdef INET /* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. * * In the normal case udp_append() will return 0, indicating that you * must unlock the inp. However if a tunneling protocol is in place we increment * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we * then decrement the reference count. If the inp_rele returns 1, indicating the * inp is gone, we return that to the caller to tell them *not* to unlock * the inp. In the case of multi-cast this will cause the distribution * to stop (though most tunneling protocols known currently do *not* use * multicast). */ static int udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; struct mbuf *opts = 0; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif struct udpcb *up; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { in_pcbref(inp); INP_RUNLOCK(inp); (*up->u_tun_func)(n, off, inp, (struct sockaddr *)udp_in, up->u_tun_ctx); INP_RLOCK(inp); return (in_pcbrele_rlocked(inp)); } off += sizeof(struct udphdr); #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { m_freem(n); return (0); } #ifdef IPSEC_NAT_T up = intoudpcb(inp); KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */ n = udp4_espdecap(inp, n, off); if (n == NULL) /* Consumed. */ return (0); } #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return (0); } #endif /* MAC */ if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) (void)ip6_savecontrol_v4(inp, n, &opts, NULL); else #endif /* INET6 */ ip_savecontrol(inp, &opts, ip, n); } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(udp_in, &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif /* INET6 */ append_sa = (struct sockaddr *)udp_in; m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); return (0); } int udp_input(struct mbuf **mp, int *offp, int proto) { struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; uint16_t len, ip_len; struct inpcbinfo *pcbinfo; struct ip save_ip; struct sockaddr_in udp_in; struct mbuf *m; struct m_tag *fwd_tag; int cscov_partial, iphlen; m = *mp; iphlen = *offp; ifp = m->m_pkthdr.rcvif; *mp = NULL; UDPSTAT_INC(udps_ipackets); /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0; /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in, sizeof(udp_in)); udp_in.sin_len = sizeof(udp_in); udp_in.sin_family = AF_INET; udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); ip_len = ntohs(ip->ip_len) - iphlen; if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) { /* Zero means checksum over the complete packet. */ if (len == 0) len = ip_len; cscov_partial = 0; } if (ip_len != len) { if (len > ip_len || len < sizeof(struct udphdr)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (proto == IPPROTO_UDP) m_adj(m, len - ip_len); } /* * Save a copy of the IP header in case we want restore it for * sending an ICMP error message in response. */ if (!V_udp_blackhole) save_ip = *ip; else memset(&save_ip, 0, sizeof(save_ip)); /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { u_short uh_sum; if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ? uh->uh_ulen : htons(ip_len); uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { UDPSTAT_INC(udps_badsum); m_freem(m); return (IPPROTO_DONE); } } else { if (proto == IPPROTO_UDP) { UDPSTAT_INC(udps_nosum); } else { /* UDPLite requires a checksum */ /* XXX: What is the right UDPLite MIB counter here? */ m_freem(m); return (IPPROTO_DONE); } } pcbinfo = udp_get_inpcbinfo(proto); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct inpcbhead *pcblist; struct ip_moptions *imo; INP_INFO_RLOCK(pcbinfo); pcblist = udp_get_pcblist(proto); last = NULL; LIST_FOREACH(inp, pcblist, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; INP_RLOCK(inp); /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ imo = inp->inp_moptions; if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct sockaddr_in group; int blocked; if (imo == NULL) { INP_RUNLOCK(inp); continue; } bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(imo, ifp, (struct sockaddr *)&group, (struct sockaddr *)&udp_in); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IPSTAT_INC(ips_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); INP_RUNLOCK(inp); continue; } } if (last != NULL) { struct mbuf *n; if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { UDP_PROBE(receive, NULL, last, ip, last, uh); if (udp_append(last, ip, n, iphlen, &udp_in)) { goto inp_lost; } } INP_RUNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ UDPSTAT_INC(udps_noportbcast); if (inp) INP_RUNLOCK(inp); INP_INFO_RUNLOCK(pcbinfo); goto badunlocked; } UDP_PROBE(receive, NULL, last, ip, last, uh); if (udp_append(last, ip, m, iphlen, &udp_in) == 0) INP_RUNLOCK(last); inp_lost: INP_INFO_RUNLOCK(pcbinfo); return (IPPROTO_DONE); } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } else inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); goto badunlocked; } if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badunlocked; *ip = save_ip; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); return (IPPROTO_DONE); } /* * Check the minimum TTL for socket. */ INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } if (cscov_partial) { struct udpcb *up; up = intoudpcb(inp); if (up->u_rxcslen == 0 || up->u_rxcslen > len) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } UDP_PROBE(receive, NULL, inp, ip, inp, uh); if (udp_append(inp, ip, m, iphlen, &udp_in) == 0) INP_RUNLOCK(inp); return (IPPROTO_DONE); badunlocked: m_freem(m); return (IPPROTO_DONE); } #endif /* INET */ /* * Notify a udp user of an asynchronous error; just wake up so that they can * collect error status. */ struct inpcb * udp_notify(struct inpcb *inp, int errno) { /* * While udp_ctlinput() always calls udp_notify() with a read lock * when invoking it directly, in_pcbnotifyall() currently uses write * locks due to sharing code with TCP. For now, accept either a read * or a write lock, but a read lock is sufficient. */ INP_LOCK_ASSERT(inp); + if ((errno == EHOSTUNREACH || errno == ENETUNREACH || + errno == EHOSTDOWN) && inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; + } inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return (inp); } #ifdef INET static void udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, struct inpcbinfo *pcbinfo) { struct ip *ip = vip; struct udphdr *uh; struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; - /* - * Redirects don't need to be handled up here. - */ - if (PRC_IS_REDIRECT(cmd)) + if (PRC_IS_REDIRECT(cmd)) { + /* signal EHOSTDOWN, as it flushes the cached route */ + in_pcbnotifyall(&udbinfo, faddr, EHOSTDOWN, udp_notify); return; + } /* * Hostdead is ugly because it goes linearly through all PCBs. * * XXX: We never get this from ICMP, otherwise it makes an excellent * DoS attack on machines with many connections. */ if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); } } else in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd], udp_notify); } void udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo)); } void udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo)); } #endif /* INET */ static int udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_udbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_udbinfo); gencnt = V_udbinfo.ipi_gencnt; n = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return (ENOMEM); INP_INFO_RLOCK(&V_udbinfo); for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_udbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); xi.xi_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_udbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_udbinfo); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ INP_INFO_RLOCK(&V_udbinfo); xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); #ifdef INET static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); #endif /* INET */ int udp_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; struct udpcb *up; int isudplite, error, optval; error = 0; isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case UDP_ENCAP: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); #ifdef IPSEC_NAT_T up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); #endif switch (optval) { case 0: /* Clear all UDP encap. */ #ifdef IPSEC_NAT_T up->u_flags &= ~UF_ESPINUDP_ALL; #endif break; #ifdef IPSEC_NAT_T case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->u_flags &= ~UF_ESPINUDP_ALL; if (optval == UDP_ENCAP_ESPINUDP) up->u_flags |= UF_ESPINUDP; else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE) up->u_flags |= UF_ESPINUDP_NON_IKE; break; #endif default: error = EINVAL; break; } INP_WUNLOCK(inp); break; case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if ((optval != 0 && optval < 8) || (optval > 65535)) { INP_WUNLOCK(inp); error = EINVAL; break; } if (sopt->sopt_name == UDPLITE_SEND_CSCOV) up->u_txcslen = optval; else up->u_rxcslen = optval; INP_WUNLOCK(inp); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { #ifdef IPSEC_NAT_T case UDP_ENCAP: up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); optval = up->u_flags & UF_ESPINUDP_ALL; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if (sopt->sopt_name == UDPLITE_SEND_CSCOV) optval = up->u_txcslen; else optval = up->u_rxcslen; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #ifdef INET #define UH_WLOCKED 2 #define UH_RLOCKED 1 #define UH_UNLOCKED 0 static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct udpiphdr *ui; int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; int cscov_partial = 0; int error = 0; int ipflags; u_short fport, lport; - int unlock_udbinfo; + int unlock_udbinfo, unlock_inp; u_char tos; uint8_t pr; uint16_t cscov = 0; uint32_t flowid = 0; uint8_t flowtype = M_HASHTYPE_NONE; /* * udp_output() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front whether we will need the * pcbinfo lock or not. Do any work to decide what is needed up * front before acquiring any locks. */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return (EMSGSIZE); } src.sin_family = 0; - INP_RLOCK(inp); + sin = (struct sockaddr_in *)addr; + if (sin == NULL || + (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { + INP_WLOCK(inp); + unlock_inp = UH_WLOCKED; + } else { + INP_RLOCK(inp); + unlock_inp = UH_RLOCKED; + } tos = inp->inp_ip_tos; if (control != NULL) { /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. */ if (control->m_next) { - INP_RUNLOCK(inp); + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); m_freem(control); m_freem(m); return (EINVAL); } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; case IP_TOS: if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) { error = EINVAL; break; } tos = *(u_char *)CMSG_DATA(cm); break; case IP_FLOWID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowid = *(uint32_t *) CMSG_DATA(cm); break; case IP_FLOWTYPE: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowtype = *(uint32_t *) CMSG_DATA(cm); break; #ifdef RSS case IP_RSSBUCKETID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } /* This is just a placeholder for now */ break; #endif /* RSS */ default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); } if (error) { - INP_RUNLOCK(inp); + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Depending on whether or not the application has bound or connected * the socket, we may have to do varying levels of work. The optimal * case is for a connected UDP socket, as a global lock isn't * required at all. * * In order to decide which we need, we require stability of the * inpcb binding, which we ensure by acquiring a read lock on the * inpcb. This doesn't strictly follow the lock order, so we play * the trylock and retry game; note that we may end up with more * conservative locks than required the second time around, so later * assertions have to accept that. Further analysis of the number of * misses under contention is required. * * XXXRW: Check that hash locking update here is correct. */ pr = inp->inp_socket->so_proto->pr_protocol; pcbinfo = udp_get_inpcbinfo(pr); sin = (struct sockaddr_in *)addr; if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { - INP_RUNLOCK(inp); - INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); unlock_udbinfo = UH_WLOCKED; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { INP_HASH_RLOCK(pcbinfo); unlock_udbinfo = UH_RLOCKED; } else unlock_udbinfo = UH_UNLOCKED; /* * If the IP_SENDSRCADDR control message was specified, override the * source address for this datagram. Its use is invalidated if the * address thus specified is incomplete or clobbers other inpcbs. */ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { INP_HASH_LOCK_ASSERT(pcbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); if (error) goto release; } /* * If a UDP socket has been connected, then a local address/port will * have been selected and bound. * * If a UDP socket has not been connected to, then an explicit * destination address must be used, in which case a local * address/port may not have been selected and bound. */ if (sin != NULL) { INP_LOCK_ASSERT(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Jail may rewrite the destination address, so let it do * that before we use it. */ error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error) goto release; /* * If a local address or port hasn't yet been selected, or if * the destination address needs to be rewritten due to using * a special INADDR_ constant, invoke in_pcbconnect_setup() * to do the heavy lifting. Once a port is selected, we * commit the binding back to the socket; we also commit the * binding of the address if in jail. * * If we already have a valid binding and we're not * requesting a destination address rewrite, use a fast path. */ if (inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { INP_HASH_LOCK_ASSERT(pcbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) goto release; /* * XXXRW: Why not commit the port if the address is * !INADDR_ANY? */ /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Remember addr if jailed, to prevent * rebinding. */ if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } } else { faddr = sin->sin_addr; fport = sin->sin_port; } } else { INP_LOCK_ASSERT(inp); faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf for UDP, IP, and possible * link-layer headers. Immediate slide the data pointer back forward * since we won't use that space at this layer. */ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } m->m_data += max_linkhdr; m->m_len -= max_linkhdr; m->m_pkthdr.len -= max_linkhdr; /* * Fill in mbuf with extended UDP header and addresses and length put * into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = pr; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); if (pr == IPPROTO_UDPLITE) { struct udpcb *up; uint16_t plen; up = intoudpcb(inp); cscov = up->u_txcslen; plen = (u_short)len + sizeof(struct udphdr); if (cscov >= plen) cscov = 0; ui->ui_len = htons(plen); ui->ui_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } else ui->ui_v = IPVERSION << 4; /* * Set the Don't Fragment bit in the IP header. */ if (inp->inp_flags & INP_DONTFRAG) { struct ip *ip; ip = (struct ip *)&ui->ui_i; ip->ip_off |= htons(IP_DF); } ipflags = 0; if (inp->inp_socket->so_options & SO_DONTROUTE) ipflags |= IP_ROUTETOIF; if (inp->inp_socket->so_options & SO_BROADCAST) ipflags |= IP_ALLOWBROADCAST; if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Set up checksum and output datagram. */ ui->ui_sum = 0; if (pr == IPPROTO_UDPLITE) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; if (cscov_partial) { if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0) ui->ui_sum = 0xffff; } else { if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0) ui->ui_sum = 0xffff; } } else if (V_udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + pr)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len); ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); /* * Setup flowid / RSS information for outbound socket. * * Once the UDP code decides to set a flowid some other way, * this allows the flowid to be overridden by userland. */ if (flowtype != M_HASHTYPE_NONE) { m->m_pkthdr.flowid = flowid; M_HASHTYPE_SET(m, flowtype); #ifdef RSS } else { uint32_t hash_val, hash_type; /* * Calculate an appropriate RSS hash for UDP and * UDP Lite. * * The called function will take care of figuring out * whether a 2-tuple or 4-tuple hash is required based * on the currently configured scheme. * * Later later on connected socket values should be * cached in the inpcb and reused, rather than constantly * re-calculating it. * * UDP Lite is a different protocol number and will * likely end up being hashed as a 2-tuple until * RSS / NICs grow UDP Lite protocol awareness. */ if (rss_proto_software_hash_v4(faddr, laddr, fport, lport, pr, &hash_val, &hash_type) == 0) { m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } #endif } #ifdef RSS /* * Don't override with the inp cached flowid value. * * Depending upon the kind of send being done, the inp * flowid/flowtype values may actually not be appropriate * for this particular socket send. * * We should either leave the flowid at zero (which is what is * currently done) or set it to some software generated * hash value based on the packet contents. */ ipflags |= IP_NODEFAULTFLOWID; #endif /* RSS */ if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) INP_HASH_RUNLOCK(pcbinfo); UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); - error = ip_output(m, inp->inp_options, NULL, ipflags, + error = ip_output(m, inp->inp_options, + (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags, inp->inp_moptions, inp); - if (unlock_udbinfo == UH_WLOCKED) + if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (error); release: if (unlock_udbinfo == UH_WLOCKED) { INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); } else if (unlock_udbinfo == UH_RLOCKED) { INP_HASH_RUNLOCK(pcbinfo); INP_RUNLOCK(inp); } else INP_RUNLOCK(inp); m_freem(m); return (error); } #if defined(IPSEC) && defined(IPSEC_NAT_T) /* * Potentially decap ESP in UDP frame. Check for an ESP header * and optional marker; if present, strip the UDP header and * push the result through IPSec. * * Returns mbuf to be processed (potentially re-allocated) or * NULL if consumed and/or processed. */ static struct mbuf * udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) { size_t minlen, payload, skip, iphlen; caddr_t data; struct udpcb *up; struct m_tag *tag; struct udphdr *udphdr; struct ip *ip; INP_RLOCK_ASSERT(inp); /* * Pull up data so the longest case is contiguous: * IP/UDP hdr + non ESP marker + ESP hdr. */ minlen = off + sizeof(uint64_t) + sizeof(struct esp); if (minlen > m->m_pkthdr.len) minlen = m->m_pkthdr.len; if ((m = m_pullup(m, minlen)) == NULL) { IPSECSTAT_INC(ips_in_inval); return (NULL); /* Bypass caller processing. */ } data = mtod(m, caddr_t); /* Points to ip header. */ payload = m->m_len - off; /* Size of payload. */ if (payload == 1 && data[off] == '\xff') return (m); /* NB: keepalive packet, no decap. */ up = intoudpcb(inp); KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0, ("u_flags 0x%x", up->u_flags)); /* * Check that the payload is large enough to hold an * ESP header and compute the amount of data to remove. * * NB: the caller has already done a pullup for us. * XXX can we assume alignment and eliminate bcopys? */ if (up->u_flags & UF_ESPINUDP_NON_IKE) { /* * draft-ietf-ipsec-nat-t-ike-0[01].txt and * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring * possible AH mode non-IKE marker+non-ESP marker * from draft-ietf-ipsec-udp-encaps-00.txt. */ uint64_t marker; if (payload <= sizeof(uint64_t) + sizeof(struct esp)) return (m); /* NB: no decap. */ bcopy(data + off, &marker, sizeof(uint64_t)); if (marker != 0) /* Non-IKE marker. */ return (m); /* NB: no decap. */ skip = sizeof(uint64_t) + sizeof(struct udphdr); } else { uint32_t spi; if (payload <= sizeof(struct esp)) { IPSECSTAT_INC(ips_in_inval); m_freem(m); return (NULL); /* Discard. */ } bcopy(data + off, &spi, sizeof(uint32_t)); if (spi == 0) /* Non-ESP marker. */ return (m); /* NB: no decap. */ skip = sizeof(struct udphdr); } /* * Setup a PACKET_TAG_IPSEC_NAT_T_PORT tag to remember * the UDP ports. This is required if we want to select * the right SPD for multiple hosts behind same NAT. * * NB: ports are maintained in network byte order everywhere * in the NAT-T code. */ tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS, 2 * sizeof(uint16_t), M_NOWAIT); if (tag == NULL) { IPSECSTAT_INC(ips_in_nomem); m_freem(m); return (NULL); /* Discard. */ } iphlen = off - sizeof(struct udphdr); udphdr = (struct udphdr *)(data + iphlen); ((uint16_t *)(tag + 1))[0] = udphdr->uh_sport; ((uint16_t *)(tag + 1))[1] = udphdr->uh_dport; m_tag_prepend(m, tag); /* * Remove the UDP header (and possibly the non ESP marker) * IP header length is iphlen * Before: * <--- off ---> * +----+------+-----+ * | IP | UDP | ESP | * +----+------+-----+ * <-skip-> * After: * +----+-----+ * | IP | ESP | * +----+-----+ * <-skip-> */ ovbcopy(data, data + skip, iphlen); m_adj(m, skip); ip = mtod(m, struct ip *); ip->ip_len = htons(ntohs(ip->ip_len) - skip); ip->ip_p = IPPROTO_ESP; /* * We cannot yet update the cksums so clear any * h/w cksum flags as they are no longer valid. */ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR); (void) ipsec_common_input(m, iphlen, offsetof(struct ip, ip_p), AF_INET, ip->ip_p); return (NULL); /* NB: consumed, bypass processing. */ } #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */ static void udp_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp != NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); if (error) { INP_INFO_WUNLOCK(pcbinfo); return (error); } inp = sotoinpcb(so); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = V_ip_defttl; error = udp_newudpcb(inp); if (error) { in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); return (0); } #endif /* INET */ int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, void *ctx) { struct inpcb *inp; struct udpcb *up; KASSERT(so->so_type == SOCK_DGRAM, ("udp_set_kernel_tunneling: !dgram")); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL")); INP_WLOCK(inp); up = intoudpcb(inp); if (up->u_tun_func != NULL) { INP_WUNLOCK(inp); return (EBUSY); } up->u_tun_func = f; up->u_tun_ctx = ctx; INP_WUNLOCK(inp); return (0); } #ifdef INET static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); return (EISCONN); } sin = (struct sockaddr_in *)nam; error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error != 0) { INP_WUNLOCK(inp); return (error); } INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); return (error); } static void udp_detach(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } static int udp_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); return (ENOTCONN); } INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); return (0); } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_send: inp == NULL")); return (udp_output(inp, m, addr, control, td)); } #endif /* INET */ int udp_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } #ifdef INET struct pr_usrreqs udp_usrreqs = { .pru_abort = udp_abort, .pru_attach = udp_attach, .pru_bind = udp_bind, .pru_connect = udp_connect, .pru_control = in_control, .pru_detach = udp_detach, .pru_disconnect = udp_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = udp_send, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_shutdown = udp_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp_close, }; #endif /* INET */ Index: head/sys/netinet6/in6_pcb.c =================================================================== --- head/sys/netinet6/in6_pcb.c (revision 297224) +++ head/sys/netinet6/in6_pcb.c (revision 297225) @@ -1,1275 +1,1278 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_pcb.c,v 1.31 2001/05/21 05:45:10 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct inpcb *in6_pcblookup_hash_locked(struct inpcbinfo *, struct in6_addr *, u_int, struct in6_addr *, u_int, int, struct ifnet *); int in6_pcbbind(register struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; u_short lport = 0; int error, lookupflags = 0; int reuseport = (so->so_options & SO_REUSEPORT); INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in6_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip6(cred, &inp->in6p_laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); } else { sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof(*sin6)) return (EINVAL); /* * family check. */ if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); if ((error = prison_local_ip6(cred, &sin6->sin6_addr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); lport = sin6->sin6_port; if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow compepte duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct ifaddr *ifa; sin6->sin6_port = 0; /* yech... */ if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin6)) == NULL && (inp->inp_flags & INP_BINDANY) == 0) { return (EADDRNOTAVAIL); } /* * XXX: bind to an anycast address might accidentally * cause sending a packet with anycast source address. * We should allow to bind to a deprecated address, since * the application dares to use it. */ if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { ifa_free(ifa); return (EADDRNOTAVAIL); } if (ifa != NULL) ifa_free(ifa); } if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, INPLOOKUP_WILDCARD, cred); if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) && (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) || (t->inp_flags2 & INP_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, INPLOOKUP_WILDCARD, cred); if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } #endif } t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && (reuseport & inp_so_options(t)) == 0) { return (EADDRINUSE); } #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 && IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, lookupflags, cred); if (t && t->inp_flags & INP_TIMEWAIT) { tw = intotw(t); if (tw == NULL) return (EADDRINUSE); if ((reuseport & tw->tw_so_options) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || ((inp->inp_vflag & INP_IPV6PROTO) == (t->inp_vflag & INP_IPV6PROTO)))) return (EADDRINUSE); } else if (t && (reuseport & inp_so_options(t)) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_vflag & INP_IPV6PROTO) != 0)) return (EADDRINUSE); } #endif } inp->in6p_laddr = sin6->sin6_addr; } if (lport == 0) { if ((error = in6_pcbsetport(&inp->in6p_laddr, inp, cred)) != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; return (error); } } else { inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; return (EAGAIN); } } return (0); } /* * Transform old in6_pcbconnect() into an inner subroutine for new * in6_pcbconnect(): Do some validity-checking on the remote * address (in mbuf 'nam') and then determine local host address * (i.e., which interface) to use to access that remote host. * * This preserves definition of in6_pcbconnect(), while supporting a * slightly different version for T/TCP. (This is more than * a bit of a kludge, but cleaning up the internal interfaces would * have forced minor changes in every protocol). */ static int in6_pcbladdr(register struct inpcb *inp, struct sockaddr *nam, struct in6_addr *plocal_addr6) { register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; int error = 0; int scope_ambiguous = 0; struct in6_addr in6a; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); /* XXXRW: why? */ if (nam->sa_len != sizeof (*sin6)) return (EINVAL); if (sin6->sin6_family != AF_INET6) return (EAFNOSUPPORT); if (sin6->sin6_port == 0) return (EADDRNOTAVAIL); if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return(error); if (!TAILQ_EMPTY(&V_in6_ifaddrhead)) { /* * If the destination address is UNSPECIFIED addr, * use the loopback addr, e.g ::1. */ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) sin6->sin6_addr = in6addr_loopback; } if ((error = prison_remote_ip6(inp->inp_cred, &sin6->sin6_addr)) != 0) return (error); error = in6_selectsrc_socket(sin6, inp->in6p_outputopts, inp, inp->inp_cred, scope_ambiguous, &in6a, NULL); if (error) return (error); /* * Do not update this earlier, in case we return with an error. * * XXX: this in6_selectsrc_socket result might replace the bound local * address with the address specified by setsockopt(IPV6_PKTINFO). * Is it the intended behavior? */ *plocal_addr6 = in6a; /* * Don't do pcblookup call here; return interface in * plocal_addr6 * and exit to caller, that will do the lookup. */ return (0); } /* * Outer subroutine: * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in6_pcbconnect_mbuf(register struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; register struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct in6_addr addr6; int error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Call inner routine, to assign local interface address. * in6_pcbladdr() may automatically fill in sin6_scope_id. */ if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0) return (error); if (in6_pcblookup_hash_locked(pcbinfo, &sin6->sin6_addr, sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? &addr6 : &inp->in6p_laddr, inp->inp_lport, 0, NULL) != NULL) { return (EADDRINUSE); } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, cred); if (error) return (error); } inp->in6p_laddr = addr6; } inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; if (inp->inp_flags & IN6P_AUTOFLOWLABEL) inp->inp_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); in_pcbrehash_mbuf(inp, m); return (0); } int in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in6_pcbconnect_mbuf(inp, nam, cred, NULL)); } void in6_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); bzero((caddr_t)&inp->in6p_faddr, sizeof(inp->in6p_faddr)); inp->inp_fport = 0; /* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; in_pcbrehash(inp); } struct sockaddr * in6_sockaddr(in_port_t port, struct in6_addr *addr_p) { struct sockaddr_in6 *sin6; sin6 = malloc(sizeof *sin6, M_SONAME, M_WAITOK); bzero(sin6, sizeof *sin6); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = port; sin6->sin6_addr = *addr_p; (void)sa6_recoverscope(sin6); /* XXX: should catch errors */ return (struct sockaddr *)sin6; } struct sockaddr * in6_v4mapsin6_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in sin; struct sockaddr_in6 *sin6_p; bzero(&sin, sizeof sin); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = port; sin.sin_addr = *addr_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); in6_sin_2_v4mapsin6(&sin, sin6_p); return (struct sockaddr *)sin6_p; } int in6_getsockaddr(struct socket *so, struct sockaddr **nam) { register struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->in6p_laddr; INP_RUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->in6p_faddr; INP_RUNLOCK(inp); *nam = in6_sockaddr(port, &addr); return 0; } int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_sockaddr: inp == NULL")); #ifdef INET if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getsockaddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else #endif { /* scope issues will be handled in in6_getsockaddr(). */ error = in6_getsockaddr(so, nam); } return error; } int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in6_mapped_peeraddr: inp == NULL")); #ifdef INET if ((inp->inp_vflag & (INP_IPV4 | INP_IPV6)) == INP_IPV4) { error = in_getpeeraddr(so, nam); if (error == 0) in6_sin_2_v4mapsin6_in_sock(nam); } else #endif /* scope issues will be handled in in6_getpeeraddr(). */ error = in6_getpeeraddr(so, nam); return error; } /* * Pass some notification to all connections of a protocol * associated with address dst. The local address and/or port numbers * may be specified to limit the search. The "usual action" will be * taken, depending on the ctlinput cmd. The caller must filter any * cmds that are uninteresting (e.g., no error in the map). * Call the protocol specific routine (if any) to report * any errors for each matching socket. */ void in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst, u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd, void *cmdarg, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; struct sockaddr_in6 sa6_src, *sa6_dst; u_short fport = fport_arg, lport = lport_arg; u_int32_t flowinfo; int errno; if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6) return; sa6_dst = (struct sockaddr_in6 *)dst; if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr)) return; /* * note that src can be NULL when we get notify by local fragmentation. */ sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src; flowinfo = sa6_src.sin6_flowinfo; /* * Redirects go to all references to the destination, * and use in6_rtchange to invalidate the route cache. * Dead host indications: also use in6_rtchange to invalidate * the cache, and deliver the error to all the sockets. * Otherwise, if we have knowledge of the local port and address, * deliver only to that socket. */ if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) { fport = 0; lport = 0; bzero((caddr_t)&sa6_src.sin6_addr, sizeof(sa6_src.sin6_addr)); if (cmd != PRC_HOSTDEAD) notify = in6_rtchange; } errno = inet6ctlerrmap[cmd]; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); if ((inp->inp_vflag & INP_IPV6) == 0) { INP_WUNLOCK(inp); continue; } /* * If the error designates a new path MTU for a destination * and the application (associated with this socket) wanted to * know the value, notify. * XXX: should we avoid to notify the value to TCP sockets? */ if (cmd == PRC_MSGSIZE && cmdarg != NULL) ip6_notify_pmtu(inp, (struct sockaddr_in6 *)dst, *(u_int32_t *)cmdarg); /* * Detect if we should notify the error. If no source and * destination ports are specifed, but non-zero flowinfo and * local address match, notify the error. This is the case * when the error is delivered with an encrypted buffer * by ESP. Otherwise, just compare addresses and ports * as usual. */ if (lport == 0 && fport == 0 && flowinfo && inp->inp_socket != NULL && flowinfo == (inp->inp_flow & IPV6_FLOWLABEL_MASK) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) goto do_notify; else if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr) || inp->inp_socket == 0 || (lport && inp->inp_lport != lport) || (!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) || (fport && inp->inp_fport != fport)) { INP_WUNLOCK(inp); continue; } do_notify: if (notify) { if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ struct inpcb * in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, u_short lport, int lookupflags, struct ucred *cred) { register struct inpcb *inp; int matchwild = 3, wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_WLOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_lport == lport) { /* Found. */ if (cred == NULL || prison_equal_ip6(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip6(cred->cr_prison, inp->inp_cred->cr_prison)) continue; /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) wildcard++; if (!IN6_IS_ADDR_UNSPECIFIED( &inp->in6p_laddr)) { if (IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; else if (!IN6_ARE_ADDR_EQUAL( &inp->in6p_laddr, laddr)) continue; } else { if (!IN6_IS_ADDR_UNSPECIFIED(laddr)) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } void in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *in6p; struct ip6_moptions *im6o; int i, gap; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH(in6p, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(in6p); im6o = in6p->in6p_moptions; if ((in6p->inp_vflag & INP_IPV6) && im6o != NULL) { /* * Unselect the outgoing ifp for multicast if it * is being detached. */ if (im6o->im6o_multicast_ifp == ifp) im6o->im6o_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ gap = 0; for (i = 0; i < im6o->im6o_num_memberships; i++) { if (im6o->im6o_membership[i]->in6m_ifp == ifp) { in6_mc_leave(im6o->im6o_membership[i], NULL); gap++; } else if (gap != 0) { im6o->im6o_membership[i - gap] = im6o->im6o_membership[i]; } } im6o->im6o_num_memberships -= gap; } INP_WUNLOCK(in6p); } INP_INFO_WUNLOCK(pcbinfo); } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in6_losing(struct inpcb *in6p) { - /* - * We don't store route pointers in the routing table anymore - */ + if (in6p->inp_route6.ro_rt) { + RTFREE(in6p->inp_route6.ro_rt); + in6p->inp_route6.ro_rt = (struct rtentry *)NULL; + } return; } /* * After a routing change, flush old routing * and allocate a (hopefully) better one. */ struct inpcb * in6_rtchange(struct inpcb *inp, int errno) { - /* - * We don't store route pointers in the routing table anymore - */ + + if (inp->inp_route6.ro_rt) { + RTFREE(inp->inp_route6.ro_rt); + inp->inp_route6.ro_rt = (struct rtentry *)NULL; + } return inp; } #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in6_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(faddr), lport, fport, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP6)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } /* * Then look for a wildcard match in the pcbgroup. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[ INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) goto found; else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; if (inp != NULL) goto found; } /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_wildmask)]; LIST_FOREACH(inp, head, inp_pcbgroup_wild) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) goto found; else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; if (inp != NULL) goto found; } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking buf", __func__); return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list. */ static struct inpcb * in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(faddr), lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP6)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH( INP6_PCBHASHKEY(&in6addr_any), lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || inp->inp_lport != lport) { continue; } injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr)) { if (injail) return (inp); else local_exact = inp; } else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ /* * Not found. */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; INP_HASH_RLOCK(pcbinfo); inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { in_pcbref(inp); INP_HASH_RUNLOCK(pcbinfo); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking bug", __func__); } else INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in6_pcbgroup.c. */ struct inpcb * in6_pcblookup(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. */ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in6_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && M_HASHTYPE_TEST(m, M_HASHTYPE_NONE) == 0) { pcbgroup = in6_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in6_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in6_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } void init_sin6(struct sockaddr_in6 *sin6, struct mbuf *m) { struct ip6_hdr *ip; ip = mtod(m, struct ip6_hdr *); bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(*sin6); sin6->sin6_family = AF_INET6; sin6->sin6_addr = ip->ip6_src; (void)sa6_recoverscope(sin6); /* XXX: should catch errors... */ return; } Index: head/sys/netinet6/ip6_output.c =================================================================== --- head/sys/netinet6/ip6_output.c (revision 297224) +++ head/sys/netinet6/ip6_output.c (revision 297225) @@ -1,3062 +1,3075 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_sctp.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #include #endif /* IPSEC */ #ifdef SCTP #include #include #endif #include #include #ifdef FLOWTABLE #include #endif extern int in6_mcast_loop; struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, struct ucred *, int); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *); static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *); static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, struct ucred *, int, int, int); static int ip6_copyexthdr(struct mbuf **, caddr_t, int); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); static int ip6_getpmtu(struct route_in6 *, int, struct ifnet *, struct in6_addr *, u_long *, int *, u_int); static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long, u_long *, int *); static int ip6_getpmtu_ctl(u_int, struct in6_addr *, u_long *); static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int); /* * Make an extension header from option data. hp is the source, and * mp is the destination. */ #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ } \ } while (/*CONSTCOND*/ 0) /* * Form a chain of extension headers. * m is the extension header mbuf * mp is the previous mbuf in the chain * p is the next header * i is the type of option. */ #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("assumption failed: hdr not split"); \ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (/*CONSTCOND*/ 0) void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset) { u_short csum; csum = in_cksum_skip(m, offset + plen, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("%s: delayed m_pullup, m->len: %d plen %u off %u " "csum_flags=%b\n", __func__, m->m_len, plen, offset, (int)m->m_pkthdr.csum_flags, CSUM_BITS); /* * XXX this should not happen, but if it does, the correct * behavior may be to insert the checksum in the appropriate * next mbuf in the chain. */ return; } *(u_short *)(m->m_data + offset) = csum; } int ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto, int mtu, uint32_t id) { struct mbuf *m, **mnext, *m_frgpart; struct ip6_hdr *ip6, *mhip6; struct ip6_frag *ip6f; int off; int error; int tlen = m0->m_pkthdr.len; m = m0; ip6 = mtod(m, struct ip6_hdr *); mnext = &m->m_nextpkt; for (off = hlen; off < tlen; off += mtu) { m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } m->m_flags = m0->m_flags & M_COPYFLAGS; *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { IP6STAT_INC(ip6s_odropped); return (error); } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); if (off + mtu >= tlen) mtu = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_short)(mtu + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copy(m0, off, mtu)) == 0) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } m_cat(m, m_frgpart); m->m_pkthdr.len = mtu + hlen + sizeof(*ip6f); m->m_pkthdr.fibnum = m0->m_pkthdr.fibnum; m->m_pkthdr.rcvif = NULL; ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; IP6STAT_INC(ip6s_ofragments); in6_ifstat_inc(ifp, ifs6_out_fragcreat); } return (0); } /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route_in6 ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one, * which is rt_mtu. * * ifpp - XXX: just for statistics */ /* * XXX TODO: no flowid is assigned for outbound flows? */ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, struct inpcb *inp) { struct ip6_hdr *ip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; struct mbuf *mprev = NULL; int hlen, tlen, len; struct route_in6 ip6route; struct rtentry *rt = NULL; struct sockaddr_in6 *dst, src_sa, dst_sa; struct in6_addr odst; int error = 0; struct in6_ifaddr *ia = NULL; u_long mtu; int alwaysfrag, dontfrag; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr finaldst, src0, dst0; u_int32_t zone; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int sw_csum, tso; int needfiblookup; uint32_t fibnum; struct m_tag *fwd_tag = NULL; uint32_t id; ip6 = mtod(m, struct ip6_hdr *); if (ip6 == NULL) { printf ("ip6 is NULL"); goto bad; } if (inp != NULL) { M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { /* unconditionally set flowid */ m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } } finaldst = ip6->ip6_dst; bzero(&exthdrs, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ if (opt->ip6po_rthdr) { /* * Destination options header(1st part) * This only makes sense with a routing header. * See Section 9.2 of RFC 3542. * Disabling this part just for MIP6 convenience is * a bad idea. We need to think carefully about a * way to make the advanced API coexist with MIP6 * options, which might automatically be inserted in * the kernel. */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); } /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } #ifdef IPSEC /* * IPSec checking which handles several cases. * FAST IPSEC: We re-injected the packet. * XXX: need scope argument. */ switch(ip6_ipsec_output(&m, inp, &error)) { case 1: /* Bad packet */ goto freehdrs; case -1: /* IPSec done */ goto done; case 0: /* No IPSec */ default: break; } #endif /* IPSEC */ /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; /* * If there is at least one extension header, * separate IP6 header from the payload. */ if (optlen && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ u_char *nexthdrp = &ip6->ip6_nxt; mprev = m; /* * we treat dest2 specially. this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. */ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("assumption failed: hdr not split"); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); /* * If there is a routing header, discard the packet. */ if (exthdrs.ip6e_rthdr) { error = EINVAL; goto bad; } /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } IP6STAT_INC(ip6s_localout); /* * Route packet. */ if (ro == 0) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); } ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; #ifdef FLOWTABLE if (ro->ro_rt == NULL) (void )flowtable_lookup(AF_INET6, m, (struct route *)ro); #endif fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); again: /* * if specified, try to fill in the traffic class field. * do not override if a non-zero value is already set. * we check the diffserv field and the ecn field separately. */ if (opt && opt->ip6po_tclass >= 0) { int mask = 0; if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) mask |= 0xfc; if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* fill in or override the hop limit field, if necessary. */ if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else ip6->ip6_hlim = V_ip6_defmcasthlim; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); - if (ro->ro_rt && fwd_tag == NULL) { + /* + * Validate route against routing table additions; + * a better/more specific route might have been added. + * Make sure address family is set in route. + */ + if (inp) { + ro->ro_dst.sin6_family = AF_INET6; + RT_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum); + } + if (ro->ro_rt && fwd_tag == NULL && (ro->ro_rt->rt_flags & RTF_UP) && + ro->ro_dst.sin6_family == AF_INET6 && + IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) { rt = ro->ro_rt; ifp = ro->ro_rt->rt_ifp; } else { if (fwd_tag == NULL) { bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; } error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp, &rt, fibnum); if (error != 0) { if (ifp != NULL) in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } } if (rt == NULL) { /* * If in6_selectroute() does not return a route entry, * dst may not have been updated. */ *dst = dst_sa; /* XXX */ } /* * then rt (for unicast) and ifp must be non-NULL valid values. */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); } if (rt != NULL) { ia = (struct in6_ifaddr *)(rt->rt_ifa); counter_u64_add(rt->rt_pksent, 1); } /* * The outgoing interface must be in the zone of source and * destination addresses. */ origifp = ifp; src0 = ip6->ip6_src; if (in6_setscope(&src0, origifp, &zone)) goto badscope; bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id) goto badscope; dst0 = ip6->ip6_dst; if (in6_setscope(&dst0, origifp, &zone)) goto badscope; /* re-initialize to be sure */ bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) { goto badscope; } /* We should use ia_ifp to support the case of * sending packets to an address of our own. */ if (ia != NULL && ia->ia_ifp) ifp = ia->ia_ifp; /* scope check is done. */ goto routefound; badscope: IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(origifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ goto bad; routefound: if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (opt && opt->ip6po_nextroute.ro_rt) { /* * The nexthop is explicitly specified by the * application. We assume the next hop is an IPv6 * address. */ dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } else if ((rt->rt_flags & RTF_GATEWAY)) dst = (struct sockaddr_in6 *)rt->rt_gateway; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ } else { m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; in6_ifstat_inc(ifp, ifs6_out_mcast); /* * Confirm that the outgoing interface supports multicast. */ if (!(ifp->if_flags & IFF_MULTICAST)) { IP6STAT_INC(ip6s_noroute); in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } if ((im6o == NULL && in6_mcast_loop) || (im6o && im6o->im6o_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we have not joined * the address; protocols will filter it later, * thus deferring a hash lookup and lock acquisition * at the expense of an m_copym(). */ ip6_mloopback(ifp, m); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { /* * XXX: ip6_mforward expects that rcvif is NULL * when it is called from the originating path. * However, it may not always be the case. */ m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * Fill the outgoing inteface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* Determine path MTU. */ if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &finaldst, &mtu, &alwaysfrag, fibnum)) != 0) goto bad; /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details. */ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy; /* XXX unused */ u_int32_t plen = 0; /* XXX: ip6_process will check the value */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not contiguous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * contiguous unless the flag is set. */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy, &plen) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto passout; odst = ip6->ip6_dst; /* Run through list of hooks for output packets. */ error = pfil_run_hooks(&V_inet6_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; ip6 = mtod(m, struct ip6_hdr *); needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip6_input(). */ if (in6_localip(&ip6->ip6_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } else needfiblookup = 1; /* Redo the routing table lookup. */ } /* See if fib was changed by packet filter. */ if (fibnum != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; fibnum = M_GETFIB(m); RO_RTFREE(ro); needfiblookup = 1; } if (needfiblookup) goto again; /* See if local, if yes, send it to netisr. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } /* Or forward to some other address? */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { dst = (struct sockaddr_in6 *)&ro->ro_dst; bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP6_NEXTHOP; m_tag_delete(m, fwd_tag); goto again; } passout: /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. * * the logic here is rather complex: * 1: normal case (dontfrag == 0, alwaysfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu * * 3: if we always need to attach fragment header (alwaysfrag == 1) * always fragment * * 4: if dontfrag == 1 && alwaysfrag == 1 * error, as we cannot handle this conflicting request */ sw_csum = m->m_pkthdr.csum_flags; if (!hdrsplit) { tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0; sw_csum &= ~ifp->if_hwassist; } else tso = 0; /* * If we added extension headers, we will not do TSO and calculate the * checksums ourselves for now. * XXX-BZ Need a framework to know when the NIC can handle it, even * with ext. hdrs. */ if (sw_csum & CSUM_DELAY_DATA_IPV6) { sw_csum &= ~CSUM_DELAY_DATA_IPV6; in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr)); } #ifdef SCTP if (sw_csum & CSUM_SCTP_IPV6) { sw_csum &= ~CSUM_SCTP_IPV6; sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); } #endif m->m_pkthdr.csum_flags &= ifp->if_hwassist; tlen = m->m_pkthdr.len; if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso) dontfrag = 1; else dontfrag = 0; if (dontfrag && alwaysfrag) { /* case 4 */ /* conflicting request - can't transmit */ error = EMSGSIZE; goto bad; } if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) { /* case 2-b */ /* * Even if the DONTFRAG option is specified, we cannot send the * packet when the data length is larger than the MTU of the * outgoing interface. * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. Also return an * error code (this is not described in the API spec). */ if (inp != NULL) ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu); error = EMSGSIZE; goto bad; } /* * transmit packet without fragmentation */ if (dontfrag || (!alwaysfrag && tlen <= mtu)) { /* case 1-a and 2-a */ struct in6_ifaddr *ia6; ip6 = mtod(m, struct ip6_hdr *); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* Record statistics for this interface address. */ counter_u64_add(ia6->ia_ifa.ifa_opackets, 1); counter_u64_add(ia6->ia_ifa.ifa_obytes, m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } - error = nd6_output_ifp(ifp, origifp, m, dst, NULL); + error = nd6_output_ifp(ifp, origifp, m, dst, + (struct route *)ro); goto done; } /* * try to fragment the packet. case 1-b and 3 */ if (mtu < IPV6_MMTU) { /* path MTU cannot be less than IPV6_MMTU */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { u_char nextproto; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ hlen = unfragpartlen; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } /* * If the interface will not calculate checksums on * fragmented packets, then do it here. * XXX-BZ handle the hw offloading case. Need flags. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { in6_delayed_cksum(m, plen, hlen); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { sctp_delayed_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. */ m0 = m; id = htonl(ip6_randomid()); if ((error = ip6_fragment(ifp, m, hlen, nextproto, len, id))) goto sendorfree; in6_ifstat_inc(ifp, ifs6_out_fragok); } /* * Remove leading garbages. */ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (m0 = m; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } - error = nd6_output_ifp(ifp, origifp, m, dst, NULL); + error = nd6_output_ifp(ifp, origifp, m, dst, + (struct route *)ro); } else m_freem(m); } if (error == 0) IP6STAT_INC(ip6s_fragmented); done: if (ro == &ip6route) RO_RTFREE(ro); return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: if (m) m_freem(m); goto done; } static int ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen) { struct mbuf *m; if (hlen > MCLBYTES) return (ENOBUFS); /* XXX */ if (hlen > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return (0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen) { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == 0) { mopt = m_get(M_NOWAIT, MT_DATA); if (mopt == NULL) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return (ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ n = m_getcl(M_NOWAIT, MT_DATA, 0); if (n == NULL) return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return (0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ static int ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, struct ip6_frag **frghdrp) { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_NOWAIT); if (n == 0) return (ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if (M_WRITABLE(mlast) && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; mfrg = m_get(M_NOWAIT, MT_DATA); if (mfrg == NULL) return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return (0); } /* * Calculates IPv6 path mtu for destination @dst. * Resulting MTU is stored in @mtup. * * Returns 0 on success. */ static int ip6_getpmtu_ctl(u_int fibnum, struct in6_addr *dst, u_long *mtup) { struct nhop6_extended nh6; struct in6_addr kdst; uint32_t scopeid; struct ifnet *ifp; u_long mtu; int error; in6_splitscope(dst, &kdst, &scopeid); if (fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0, &nh6) != 0) return (EHOSTUNREACH); ifp = nh6.nh_ifp; mtu = nh6.nh_mtu; error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL); fib6_free_nh_ext(fibnum, &nh6); return (error); } /* * Calculates IPv6 path MTU for @dst based on transmit @ifp, * and cached data in @ro_pmtu. * MTU from (successful) route lookup is saved (along with dst) * inside @ro_pmtu to avoid subsequent route lookups after packet * filter processing. * * Stores mtu and always-frag value into @mtup and @alwaysfragp. * Returns 0 on success. */ static int ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup, struct ifnet *ifp, struct in6_addr *dst, u_long *mtup, int *alwaysfragp, u_int fibnum) { struct nhop6_basic nh6; struct in6_addr kdst; uint32_t scopeid; struct sockaddr_in6 *sa6_dst; u_long mtu; mtu = 0; if (do_lookup) { /* * Here ro_pmtu has final destination address, while * ro might represent immediate destination. * Use ro_pmtu destination since mtu might differ. */ sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst)) ro_pmtu->ro_mtu = 0; if (ro_pmtu->ro_mtu == 0) { bzero(sa6_dst, sizeof(*sa6_dst)); sa6_dst->sin6_family = AF_INET6; sa6_dst->sin6_len = sizeof(struct sockaddr_in6); sa6_dst->sin6_addr = *dst; in6_splitscope(dst, &kdst, &scopeid); if (fib6_lookup_nh_basic(fibnum, &kdst, scopeid, 0, 0, &nh6) == 0) ro_pmtu->ro_mtu = nh6.nh_mtu; } mtu = ro_pmtu->ro_mtu; } if (ro_pmtu->ro_rt) mtu = ro_pmtu->ro_rt->rt_mtu; return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp)); } /* * Calculate MTU based on transmit @ifp, route mtu @rt_mtu and * hostcache data for @dst. * Stores mtu and always-frag value into @mtup and @alwaysfragp. * * Returns 0 on success. */ static int ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu, u_long *mtup, int *alwaysfragp) { u_long mtu = 0; int alwaysfrag = 0; int error = 0; if (rt_mtu > 0) { u_int32_t ifmtu; struct in_conninfo inc; bzero(&inc, sizeof(inc)); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; ifmtu = IN6_LINKMTU(ifp); mtu = tcp_hc_getmtu(&inc); if (mtu) mtu = min(mtu, rt_mtu); else mtu = rt_mtu; if (mtu == 0) mtu = ifmtu; else if (mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: * if we record ICMPv6 too big message with * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU * or smaller, with framgent header attached. * (fragment header is needed regardless from the * packet size, for translators to identify packets) */ alwaysfrag = 1; mtu = IPV6_MMTU; } } else if (ifp) { mtu = IN6_LINKMTU(ifp); } else error = EHOSTUNREACH; /* XXX */ *mtup = mtu; if (alwaysfragp) *alwaysfragp = alwaysfrag; return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(struct socket *so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; struct inpcb *in6p = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; #ifdef RSS uint32_t rss_bucket; int retval; #endif level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; error = 0; optval = 0; uproto = (int)so->so_proto->pr_protocol; if (level != IPPROTO_IPV6) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEADDR) != 0) in6p->inp_flags2 |= INP_REUSEADDR; else in6p->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(in6p); error = 0; break; case SO_REUSEPORT: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEPORT) != 0) in6p->inp_flags2 |= INP_REUSEPORT; else in6p->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(in6p); error = 0; break; case SO_SETFIB: INP_WLOCK(in6p); in6p->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(in6p); error = 0; break; default: break; } } } else { /* level == IPPROTO_IPV6 */ switch (op) { case SOPT_SET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif { struct mbuf *m; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; error = ip6_pcbopts(&in6p->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) break; } /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RECVRSSBUCKETID: #endif case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_BINDMULTI: #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: #endif if (optname == IPV6_BINDANY && td != NULL) { error = priv_check(td, PRIV_NETINET_BINDANY); if (error) break; } if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ in6p->in6p_hops = optval; if ((in6p->inp_vflag & INP_IPV4) != 0) in6p->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ INP_WLOCK(in6p); \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ INP_WLOCK(in6p); \ in6p->inp_flags |= IN6P_RFC2292; \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0) #define OPTSET2(bit, val) do { \ INP_WLOCK(in6p); \ if (val) \ in6p->inp_flags2 |= bit; \ else \ in6p->inp_flags2 &= ~bit; \ INP_WUNLOCK(in6p); \ } while (0) #define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0) case IPV6_RECVPKTINFO: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } optp = &in6p->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } case IPV6_RECVHOPLIMIT: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDRDSTOPTS); break; case IPV6_RECVRTHDR: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) */ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IPV6_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ if (in6p->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) in6p->inp_vflag &= ~INP_IPV4; else in6p->inp_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; case IPV6_BINDANY: OPTSET(INP_BINDANY); break; case IPV6_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { in6p->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; #endif } break; case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: if (optlen != sizeof(optval)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; { struct ip6_pktopts **optp; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_2292PKTINFO: OPTSET2292(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: OPTSET2292(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. */ if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_HOPOPTS); break; case IPV6_2292DSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_2292RTHDR: OPTSET2292(IN6P_RTHDR); break; } break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: { /* new advanced API (RFC3542) */ u_char *optbuf; u_char optbuf_storage[MCLBYTES]; int optlen; struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } /* * We only ensure valsize is not too large * here. Further validation will be done * later. */ error = sooptcopyin(sopt, optbuf_storage, sizeof(optbuf_storage), 0); if (error) break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: case IPV6_MSFILTER: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = ip6_setmoptions(in6p, sopt); break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(in6p); switch (optval) { case IPV6_PORTRANGE_DEFAULT: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: in6p->inp_flags &= ~(INP_HIGHPORT); in6p->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(in6p); break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec_set_policy(in6p, optname, req, m->m_len, (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif /* * RFC3542 (effectively) deprecated the * semantics of the 2292-style pktoptions. * Since it was not reliable in nature (i.e., * applications had to expect the lack of some * information after all), it would make sense * to simplify this part by always returning * empty data. */ sopt->sopt_valsize = 0; break; case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_FLOWID: case IPV6_FLOWTYPE: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RSSBUCKETID: case IPV6_RECVRSSBUCKETID: #endif case IPV6_BINDMULTI: switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: optval = OPTBIT(IN6P_RTHDRDSTOPTS); break; case IPV6_UNICAST_HOPS: optval = in6p->in6p_hops; break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = in6p->inp_flags; if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & INP_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; case IPV6_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IPV6_FLOWID: optval = in6p->inp_flowid; break; case IPV6_FLOWTYPE: optval = in6p->inp_flowtype; break; case IPV6_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IPV6_RSSBUCKETID: retval = rss_hash2bucket(in6p->inp_flowid, in6p->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IPV6_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IPV6_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } if (error) break; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* * XXX: we dot not consider the case of source * routing, or optional information to specify * the outgoing interface. */ error = ip6_getpmtu_ctl(so->so_fibnum, &in6p->in6p_faddr, &pmtu); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; bzero(&mtuinfo, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); error = sooptcopyout(sopt, optdata, optdatalen); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292RTHDR: case IPV6_2292DSTOPTS: switch (optname) { case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = ip6_getpcbopt(in6p->in6p_outputopts, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_MSFILTER: error = ip6_getmoptions(in6p, sopt); break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; struct mbuf *m = NULL; struct mbuf **mp = &m; size_t ovalsize = sopt->sopt_valsize; caddr_t oval = (caddr_t)sopt->sopt_val; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; sopt->sopt_valsize = ovalsize; sopt->sopt_val = oval; if (m) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec_get_policy(in6p, req, len, mp); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0 && m) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } } return (error); } int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct inpcb *in6p = sotoinpcb(so); int level, op, optname; level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; if (level != IPPROTO_IPV6) { return (EINVAL); } switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6 sockets, no modification allowed for checksum * offset, permit "no change" values to help existing apps. * * RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." * The current behavior does not meet RFC3542. */ switch (op) { case SOPT_SET: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; if ((optval % 2) != 0) { /* the API assumes even offset values */ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else in6p->in6p_cksum = optval; break; case SOPT_GET: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else optval = in6p->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m, struct socket *so, struct sockopt *sopt) { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; /* turn off any old options. */ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, -1); } else opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK); *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options, regardless of * whether the opt is just created or given. */ free(opt, M_IP6OPT); return (0); } /* set options specified by user. */ if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ? td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) { ip6_clearpktopts(opt, -1); /* XXX: discard all options */ free(opt, M_IP6OPT); return (error); } *pktopt = opt; return (0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void ip6_initpktopts(struct ip6_pktopts *opt) { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; } static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, struct ucred *cred, int uproto) { struct ip6_pktopts *opt; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_WAITOK); ip6_initpktopts(*pktopt); } opt = *pktopt; return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto)); } static int ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) { void *optdata = NULL; int optdatalen = 0; struct ip6_ext *ip6e; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; switch (optname) { case IPV6_PKTINFO: optdata = (void *)&null_pktinfo; if (pktopt && pktopt->ip6po_pktinfo) { bcopy(pktopt->ip6po_pktinfo, &null_pktinfo, sizeof(null_pktinfo)); in6_clearscope(&null_pktinfo.ipi6_addr); } else { /* XXX: we don't have to do this every time... */ bzero(&null_pktinfo, sizeof(null_pktinfo)); } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) optdata = (void *)&pktopt->ip6po_tclass; else optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: if (pktopt && pktopt->ip6po_hbh) { optdata = (void *)pktopt->ip6po_hbh; ip6e = (struct ip6_ext *)pktopt->ip6po_hbh; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDR: if (pktopt && pktopt->ip6po_rthdr) { optdata = (void *)pktopt->ip6po_rthdr; ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDRDSTOPTS: if (pktopt && pktopt->ip6po_dest1) { optdata = (void *)pktopt->ip6po_dest1; ip6e = (struct ip6_ext *)pktopt->ip6po_dest1; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_DSTOPTS: if (pktopt && pktopt->ip6po_dest2) { optdata = (void *)pktopt->ip6po_dest2; ip6e = (struct ip6_ext *)pktopt->ip6po_dest2; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_NEXTHOP: if (pktopt && pktopt->ip6po_nexthop) { optdata = (void *)pktopt->ip6po_nexthop; optdatalen = pktopt->ip6po_nexthop->sa_len; } break; case IPV6_USE_MIN_MTU: if (pktopt) optdata = (void *)&pktopt->ip6po_minmtu; else optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; case IPV6_PREFER_TEMPADDR: if (pktopt) optdata = (void *)&pktopt->ip6po_prefer_tempaddr; else optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif return (ENOPROTOOPT); } error = sooptcopyout(sopt, optdata, optdatalen); return (error); } void ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (pktopt == NULL) return; if (optname == -1 || optname == IPV6_PKTINFO) { if (pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_NEXTHOP) { if (pktopt->ip6po_nextroute.ro_rt) { RTFREE(pktopt->ip6po_nextroute.ro_rt); pktopt->ip6po_nextroute.ro_rt = NULL; } if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { RTFREE(pktopt->ip6po_route.ro_rt); pktopt->ip6po_route.ro_rt = NULL; } } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL && canwait == M_NOWAIT)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (/*CONSTCOND*/ 0) static int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait) { if (dst == NULL || src == NULL) { printf("ip6_clearpktopts: invalid argument\n"); return (EINVAL); } dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; dst->ip6po_minmtu = src->ip6po_minmtu; dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */ return (0); bad: ip6_clearpktopts(dst, -1); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY struct ip6_pktopts * ip6_copypktopts(struct ip6_pktopts *src, int canwait) { int error; struct ip6_pktopts *dst; dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL) return (NULL); ip6_initpktopts(dst); if ((error = copypktopts(dst, src, canwait)) != 0) { free(dst, M_IP6OPT); return (NULL); } return (dst); } void ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto) { struct cmsghdr *cm = 0; if (control == NULL || opt == NULL) return (EINVAL); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return (EINVAL); for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { int error; if (control->m_len < CMSG_LEN(0)) return (EINVAL); cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto); if (error) return (error); } return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. * We have 4 cases of combination of "sticky" and "cmsg": * "sticky=0, cmsg=0": impossible * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data * "sticky=1, cmsg=0": RFC3542 socket option * "sticky=1, cmsg=1": RFC2292 socket option */ static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, struct ucred *cred, int sticky, int cmsg, int uproto) { int minmtupolicy, preftemp; int error; if (!sticky && !cmsg) { #ifdef DIAGNOSTIC printf("ip6_setpktopt: impossible case\n"); #endif return (EINVAL); } /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542. Conversely, * RFC3542 types should not be specified in the context of RFC2292. */ if (!cmsg) { switch (optname) { case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292NEXTHOP: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: case IPV6_2292PKTOPTIONS: return (ENOPROTOOPT); } } if (sticky && cmsg) { switch (optname) { case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_NEXTHOP: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_USE_MIN_MTU: case IPV6_DONTFRAG: case IPV6_TCLASS: case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */ return (ENOPROTOOPT); } } switch (optname) { case IPV6_2292PKTINFO: case IPV6_PKTINFO: { struct ifnet *ifp = NULL; struct in6_pktinfo *pktinfo; if (len != sizeof(struct in6_pktinfo)) return (EINVAL); pktinfo = (struct in6_pktinfo *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by * doing a "regular" setsockopt with ipi6_addr being * in6addr_any and ipi6_ifindex being zero. * [RFC 3542, Section 6] */ if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 && IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ip6_clearpktopts(opt, optname); break; } if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { return (EINVAL); } if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr)) return (EINVAL); /* validate the interface index if specified. */ if (pktinfo->ipi6_ifindex > V_if_index) return (ENXIO); if (pktinfo->ipi6_ifindex) { ifp = ifnet_byindex(pktinfo->ipi6_ifindex); if (ifp == NULL) return (ENXIO); } if (ifp != NULL && ( ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) return (ENETDOWN); if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { struct in6_ifaddr *ia; in6_setscope(&pktinfo->ipi6_addr, ifp, NULL); ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr); if (ia == NULL) return (EADDRNOTAVAIL); ifa_free(&ia->ia_ifa); } /* * We store the address anyway, and let in6_selectsrc() * validate the specified address. This is because ipi6_addr * may not have enough information about its scope zone, and * we may need additional information (such as outgoing * interface or the scope zone of a destination address) to * disambiguate the scope. * XXX: the delay of the validation may confuse the * application when it is used as a sticky option. */ if (opt->ip6po_pktinfo == NULL) { opt->ip6po_pktinfo = malloc(sizeof(*pktinfo), M_IP6OPT, M_NOWAIT); if (opt->ip6po_pktinfo == NULL) return (ENOBUFS); } bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); break; } case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: { int *hlimp; /* * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT * to simplify the ordering among hoplimit options. */ if (optname == IPV6_HOPLIMIT && sticky) return (ENOPROTOOPT); if (len != sizeof(int)) return (EINVAL); hlimp = (int *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); opt->ip6po_hlim = *hlimp; break; } case IPV6_TCLASS: { int tclass; if (len != sizeof(int)) return (EINVAL); tclass = *(int *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); opt->ip6po_tclass = tclass; break; } case IPV6_2292NEXTHOP: case IPV6_NEXTHOP: if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { /* just remove the option */ ip6_clearpktopts(opt, IPV6_NEXTHOP); break; } /* check if cmsg_len is large enough for sa_len */ if (len < sizeof(struct sockaddr) || len < *buf) return (EINVAL); switch (((struct sockaddr *)buf)->sa_family) { case AF_INET6: { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; int error; if (sa6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { return (EINVAL); } if ((error = sa6_embedscope(sa6, V_ip6_use_defzone)) != 0) { return (error); } break; } case AF_LINK: /* should eventually be supported */ default: return (EAFNOSUPPORT); } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_NEXTHOP); opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT); if (opt->ip6po_nexthop == NULL) return (ENOBUFS); bcopy(buf, opt->ip6po_nexthop, *buf); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; /* * XXX: We don't allow a non-privileged user to set ANY HbH * options, since per-option restriction has too much * overhead. */ if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, IPV6_HOPOPTS); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); hbh = (struct ip6_hbh *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_HOPOPTS); opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_hbh == NULL) return (ENOBUFS); bcopy(hbh, opt->ip6po_hbh, hbhlen); break; } case IPV6_2292DSTOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { struct ip6_dest *dest, **newdest = NULL; int destlen; if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */ error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, optname); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); dest = (struct ip6_dest *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); /* * Determine the position that the destination options header * should be inserted; before or after the routing header. */ switch (optname) { case IPV6_2292DSTOPTS: /* * The old advacned API is ambiguous on this point. * Our approach is to determine the position based * according to the existence of a routing header. * Note, however, that this depends on the order of the * extension headers in the ancillary data; the 1st * part of the destination options header must appear * before the routing header in the ancillary data, * too. * RFC3542 solved the ambiguity by introducing * separate ancillary data or option types. */ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; break; case IPV6_RTHDRDSTOPTS: newdest = &opt->ip6po_dest1; break; case IPV6_DSTOPTS: newdest = &opt->ip6po_dest2; break; } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, optname); *newdest = malloc(destlen, M_IP6OPT, M_NOWAIT); if (*newdest == NULL) return (ENOBUFS); bcopy(dest, *newdest, destlen); break; } case IPV6_2292RTHDR: case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (len == 0) { ip6_clearpktopts(opt, IPV6_RTHDR); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); rth = (struct ip6_rthdr *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: if (rth->ip6r_len == 0) /* must contain one addr */ return (EINVAL); if (rth->ip6r_len % 2) /* length must be even */ return (EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return (EINVAL); break; default: return (EINVAL); /* not supported */ } /* turn off the previous option */ ip6_clearpktopts(opt, IPV6_RTHDR); opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_rthdr == NULL) return (ENOBUFS); bcopy(rth, opt->ip6po_rthdr, rthlen); break; } case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); minmtupolicy = *(int *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { return (EINVAL); } opt->ip6po_minmtu = minmtupolicy; break; case IPV6_DONTFRAG: if (len != sizeof(int)) return (EINVAL); if (uproto == IPPROTO_TCP || *(int *)buf == 0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) */ opt->ip6po_flags &= ~IP6PO_DONTFRAG; } else opt->ip6po_flags |= IP6PO_DONTFRAG; break; case IPV6_PREFER_TEMPADDR: if (len != sizeof(int)) return (EINVAL); preftemp = *(int *)buf; if (preftemp != IP6PO_TEMPADDR_SYSTEM && preftemp != IP6PO_TEMPADDR_NOTPREFER && preftemp != IP6PO_TEMPADDR_PREFER) { return (EINVAL); } opt->ip6po_prefer_tempaddr = preftemp; break; default: return (ENOPROTOOPT); } /* end of switch */ return (0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be &loif -- easier than replicating that code here. */ void ip6_mloopback(struct ifnet *ifp, struct mbuf *m) { struct mbuf *copym; struct ip6_hdr *ip6; copym = m_copy(m, 0, M_COPYALL); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if (!M_WRITABLE(copym) || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } ip6 = mtod(copym, struct ip6_hdr *); /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } if_simloop(ifp, copym, AF_INET6, 0); } /* * Chop IPv6 header off from the payload. */ static int ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { m_freem(m); return ENOBUFS; } m_move_pkthdr(mh, m); M_ALIGN(mh, sizeof(*ip6)); m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(struct inpcb *in6p) { int len; if (!in6p->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p->in6p_outputopts->ip6po_hbh); if (in6p->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ len += elen(in6p->in6p_outputopts->ip6po_dest1); len += elen(in6p->in6p_outputopts->ip6po_rthdr); len += elen(in6p->in6p_outputopts->ip6po_dest2); return len; #undef elen } Index: head/sys/netinet6/udp6_usrreq.c =================================================================== --- head/sys/netinet6/udp6_usrreq.c (revision 297224) +++ head/sys/netinet6/udp6_usrreq.c (revision 297225) @@ -1,1279 +1,1279 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $ * $KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipfw.h" #include "opt_ipsec.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #endif /* IPSEC */ #include /* * UDP protocol implementation. * Per RFC 768, August, 1980. */ extern struct protosw inetsw[]; static void udp6_detach(struct socket *so); static int udp6_append(struct inpcb *inp, struct mbuf *n, int off, struct sockaddr_in6 *fromsa) { struct socket *so; struct mbuf *opts; struct udpcb *up; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { in_pcbref(inp); INP_RUNLOCK(inp); (*up->u_tun_func)(n, off, inp, (struct sockaddr *)fromsa, up->u_tun_ctx); INP_RLOCK(inp); return (in_pcbrele_rlocked(inp)); } #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec6_in_reject(n, inp)) { m_freem(n); return (0); } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return (0); } #endif opts = NULL; if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(inp, n, &opts); m_adj(n, off + sizeof(struct udphdr)); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)fromsa, n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); return (0); } int udp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ifnet *ifp; struct ip6_hdr *ip6; struct udphdr *uh; struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; int off = *offp; int cscov_partial; int plen, ulen; struct sockaddr_in6 fromsa; struct m_tag *fwd_tag; uint16_t uh_sum; uint8_t nxt; ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); uh = (struct udphdr *)((caddr_t)ip6 + off); #else IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(*uh)); if (!uh) return (IPPROTO_DONE); #endif UDPSTAT_INC(udps_ipackets); /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6); ulen = ntohs((u_short)uh->uh_ulen); nxt = proto; cscov_partial = (nxt == IPPROTO_UDPLITE) ? 1 : 0; if (nxt == IPPROTO_UDPLITE) { /* Zero means checksum over the complete packet. */ if (ulen == 0) ulen = plen; if (ulen == plen) cscov_partial = 0; if ((ulen < sizeof(struct udphdr)) || (ulen > plen)) { /* XXX: What is the right UDPLite MIB counter? */ goto badunlocked; } if (uh->uh_sum == 0) { /* XXX: What is the right UDPLite MIB counter? */ goto badunlocked; } } else { if ((ulen < sizeof(struct udphdr)) || (plen != ulen)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (uh->uh_sum == 0) { UDPSTAT_INC(udps_nosum); goto badunlocked; } } if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in6_cksum_pseudo(ip6, ulen, nxt, m->m_pkthdr.csum_data); uh_sum ^= 0xffff; } else uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen); if (uh_sum != 0) { UDPSTAT_INC(udps_badsum); goto badunlocked; } /* * Construct sockaddr format source address. */ init_sin6(&fromsa, m); fromsa.sin6_port = uh->uh_sport; pcbinfo = udp_get_inpcbinfo(nxt); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct inpcb *last; struct inpcbhead *pcblist; struct ip6_moptions *imo; INP_INFO_RLOCK(pcbinfo); /* * In the event that laddr should be set to the link-local * address (this happens in RIPng), the multicast address * specified in the received packet will not match laddr. To * handle this situation, matching is relaxed if the * receiving interface is the same as one specified in the * socket and if the destination multicast address matches * one of the multicast groups specified in the socket. */ /* * KAME note: traditionally we dropped udpiphdr from mbuf * here. We need udphdr for IPsec processing so we do that * later. */ pcblist = udp_get_pcblist(nxt); last = NULL; LIST_FOREACH(inp, pcblist, inp_list) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; if (inp->inp_lport != uh->uh_dport) continue; if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) continue; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src) || inp->inp_fport != uh->uh_sport) continue; } /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is (supposed to be) held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ imo = inp->in6p_moptions; if (imo && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct sockaddr_in6 mcaddr; int blocked; INP_RLOCK(inp); bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ip6->ip6_dst; blocked = im6o_mc_filter(imo, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IP6STAT_INC(ip6s_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); INP_RUNLOCK(inp); /* XXX */ continue; } INP_RUNLOCK(inp); } if (last != NULL) { struct mbuf *n; if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { INP_RLOCK(last); UDP_PROBE(receive, NULL, last, ip6, last, uh); if (udp6_append(last, n, off, &fromsa)) goto inp_lost; INP_RUNLOCK(last); } } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ UDPSTAT_INC(udps_noport); UDPSTAT_INC(udps_noportmcast); goto badheadlocked; } INP_RLOCK(last); INP_INFO_RUNLOCK(pcbinfo); UDP_PROBE(receive, NULL, last, ip6, last, uh); if (udp6_append(last, m, off, &fromsa) == 0) INP_RUNLOCK(last); inp_lost: return (IPPROTO_DONE); } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(pcbinfo, &ip6->ip6_src, uh->uh_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? htons(next_hop6->sin6_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP6_NEXTHOP; } else inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src, uh->uh_sport, &ip6->ip6_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m); if (inp == NULL) { if (udp_log_in_vain) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; log(LOG_INFO, "Connection attempt to UDP [%s]:%d from [%s]:%d\n", ip6_sprintf(ip6bufd, &ip6->ip6_dst), ntohs(uh->uh_dport), ip6_sprintf(ip6bufs, &ip6->ip6_src), ntohs(uh->uh_sport)); } UDPSTAT_INC(udps_noport); if (m->m_flags & M_MCAST) { printf("UDP6: M_MCAST is set in a unicast packet.\n"); UDPSTAT_INC(udps_noportmcast); goto badunlocked; } if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP6_UNREACH) < 0) goto badunlocked; icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0); return (IPPROTO_DONE); } INP_RLOCK_ASSERT(inp); up = intoudpcb(inp); if (cscov_partial) { if (up->u_rxcslen == 0 || up->u_rxcslen > ulen) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } UDP_PROBE(receive, NULL, inp, ip6, inp, uh); if (udp6_append(inp, m, off, &fromsa) == 0) INP_RUNLOCK(inp); return (IPPROTO_DONE); badheadlocked: INP_INFO_RUNLOCK(pcbinfo); badunlocked: if (m) m_freem(m); return (IPPROTO_DONE); } static void udp6_common_ctlinput(int cmd, struct sockaddr *sa, void *d, struct inpcbinfo *pcbinfo) { struct udphdr uh; struct ip6_hdr *ip6; struct mbuf *m; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; void *cmdarg; struct inpcb *(*notify)(struct inpcb *, int) = udp_notify; struct udp_portonly { u_int16_t uh_sport; u_int16_t uh_dport; } *uhp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; if (PRC_IS_REDIRECT(cmd)) notify = in6_rtchange, d = NULL; else if (cmd == PRC_HOSTDEAD) d = NULL; else if (inet6ctlerrmap[cmd] == 0) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; cmdarg = NULL; sa6_src = &sa6_any; } if (ip6) { /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. */ /* Check if we can safely examine src and dst ports. */ if (m->m_pkthdr.len < off + sizeof(*uhp)) return; bzero(&uh, sizeof(uh)); m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh); (void)in6_pcbnotify(pcbinfo, sa, uh.uh_dport, (struct sockaddr *)ip6cp->ip6c_src, uh.uh_sport, cmd, cmdarg, notify); } else (void)in6_pcbnotify(pcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify); } void udp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { return (udp6_common_ctlinput(cmd, sa, d, &V_udbinfo)); } void udplite6_ctlinput(int cmd, struct sockaddr *sa, void *d) { return (udp6_common_ctlinput(cmd, sa, d, &V_ulitecbinfo)); } static int udp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); if (req->newlen != sizeof(addrs)) return (EINVAL); if (req->oldlen != sizeof(struct xucred)) return (EINVAL); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseesocket(req->td->td_ucred, inp->inp_socket); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection"); static int udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6, struct mbuf *control, struct thread *td) { u_int32_t ulen = m->m_pkthdr.len; u_int32_t plen = sizeof(struct udphdr) + ulen; struct ip6_hdr *ip6; struct udphdr *udp6; struct in6_addr *laddr, *faddr, in6a; struct sockaddr_in6 *sin6 = NULL; int cscov_partial = 0; int scope_ambiguous = 0; u_short fport; int error = 0; uint8_t nxt; uint16_t cscov = 0; struct ip6_pktopts *optp, opt; int af = AF_INET6, hlen = sizeof(struct ip6_hdr); int flags; struct sockaddr_in6 tmp; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (addr6) { /* addr6 has been validated in udp6_send(). */ sin6 = (struct sockaddr_in6 *)addr6; /* protect *sin6 from overwrites */ tmp = *sin6; sin6 = &tmp; /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as it should, so we need a * workaround. Even if an appropriate ID is not determined, * we'll see if we can determine the outgoing interface. If we * can, determine the zone ID based on the interface below. */ if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) return (error); } nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; if (control) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, td->td_ucred, nxt)) != 0) goto release; optp = &opt; } else optp = inp->in6p_outputopts; if (sin6) { faddr = &sin6->sin6_addr; /* * Since we saw no essential reason for calling in_pcbconnect, * we get rid of such kind of logic, and call in6_selectsrc * and in6_pcbsetport in order to fill in the local address * and the local port. */ if (sin6->sin6_port == 0) { error = EADDRNOTAVAIL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { /* how about ::ffff:0.0.0.0 case? */ error = EISCONN; goto release; } fport = sin6->sin6_port; /* allow 0 port */ if (IN6_IS_ADDR_V4MAPPED(faddr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { /* * I believe we should explicitly discard the * packet when mapped addresses are disabled, * rather than send the packet as an IPv6 one. * If we chose the latter approach, the packet * might be sent out on the wire based on the * default route, the situation which we'd * probably want to avoid. * (20010421 jinmei@kame.net) */ error = EINVAL; goto release; } if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) { /* * when remote addr is an IPv4-mapped address, * local addr should not be an IPv6 address, * since you cannot determine how to map IPv6 * source address to IPv4. */ error = EINVAL; goto release; } af = AF_INET; } if (!IN6_IS_ADDR_V4MAPPED(faddr)) { error = in6_selectsrc_socket(sin6, optp, inp, td->td_ucred, scope_ambiguous, &in6a, NULL); if (error) goto release; laddr = &in6a; } else laddr = &inp->in6p_laddr; /* XXX */ if (laddr == NULL) { if (error == 0) error = EADDRNOTAVAIL; goto release; } if (inp->inp_lport == 0 && (error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; goto release; } } else { if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto release; } if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) { if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { /* * XXX: this case would happen when the * application sets the V6ONLY flag after * connecting the foreign address. * Such applications should be fixed, * so we bark here. */ log(LOG_INFO, "udp6_output: IPV6_V6ONLY " "option was set for a connected socket\n"); error = EINVAL; goto release; } else af = AF_INET; } laddr = &inp->in6p_laddr; faddr = &inp->in6p_faddr; fport = inp->inp_fport; } if (af == AF_INET) hlen = sizeof(struct ip); /* * Calculate data length and get a mbuf * for UDP and IP6 headers. */ M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } /* * Stuff checksum and output datagram. */ udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen); udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; if (nxt == IPPROTO_UDPLITE) { struct udpcb *up; up = intoudpcb(inp); cscov = up->u_txcslen; if (cscov >= plen) cscov = 0; udp6->uh_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } else if (plen <= 0xffff) udp6->uh_ulen = htons((u_short)plen); else udp6->uh_ulen = 0; udp6->uh_sum = 0; switch (af) { case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons((u_short)plen); ip6->ip6_nxt = nxt; ip6->ip6_hlim = in6_selecthlim(inp, NULL); ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; if (cscov_partial) { if ((udp6->uh_sum = in6_cksum_partial(m, nxt, sizeof(struct ip6_hdr), plen, cscov)) == 0) udp6->uh_sum = 0xffff; } else { udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0); m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } #ifdef RSS { uint32_t hash_val, hash_type; uint8_t pr; pr = inp->inp_socket->so_proto->pr_protocol; /* * Calculate an appropriate RSS hash for UDP and * UDP Lite. * * The called function will take care of figuring out * whether a 2-tuple or 4-tuple hash is required based * on the currently configured scheme. * * Later later on connected socket values should be * cached in the inpcb and reused, rather than constantly * re-calculating it. * * UDP Lite is a different protocol number and will * likely end up being hashed as a 2-tuple until * RSS / NICs grow UDP Lite protocol awareness. */ if (rss_proto_software_hash_v6(faddr, laddr, fport, inp->inp_lport, pr, &hash_val, &hash_type) == 0) { m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } } #endif flags = 0; #ifdef RSS /* * Don't override with the inp cached flowid. * * Until the whole UDP path is vetted, it may actually * be incorrect. */ flags |= IP_NODEFAULTFLOWID; #endif UDP_PROBE(send, NULL, inp, ip6, inp, udp6); UDPSTAT_INC(udps_opackets); - error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions, - NULL, inp); + error = ip6_output(m, optp, &inp->inp_route6, flags, + inp->in6p_moptions, NULL, inp); break; case AF_INET: error = EAFNOSUPPORT; goto release; } goto releaseopt; release: m_freem(m); releaseopt: if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } return (error); } static void udp6_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_abort: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; INP_WUNLOCK(inp); pru = inetsw[ip_protox[nxt]].pr_usrreqs; (*pru->pru_abort)(so); return; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp6_attach: inp != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); } INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); if (error) { INP_INFO_WUNLOCK(pcbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; inp->in6p_hops = -1; /* use kernel default */ inp->in6p_cksum = -1; /* just to be sure */ /* * XXX: ugly!! * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; error = udp_newudpcb(inp); if (error) { in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); return (0); } static int udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_bind: inp == NULL")); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { struct sockaddr_in6 *sin6_p; sin6_p = (struct sockaddr_in6 *)nam; if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) inp->inp_vflag |= INP_IPV4; #ifdef INET else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6_p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); goto out; } #endif } error = in6_pcbbind(inp, nam, td->td_ucred); #ifdef INET out: #endif INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp6_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_close: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; INP_WUNLOCK(inp); pru = inetsw[ip_protox[nxt]].pr_usrreqs; (*pru->pru_disconnect)(so); return; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in6 *sin6; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); sin6 = (struct sockaddr_in6 *)nam; KASSERT(inp != NULL, ("udp6_connect: inp == NULL")); /* * XXXRW: Need to clarify locking of v4/v6 flags. */ INP_WLOCK(inp); #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto out; } in6_sin6_2_sin(&sin, sin6); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = prison_remote_ip4(td->td_ucred, &sin.sin_addr); if (error != 0) goto out; INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, (struct sockaddr *)&sin, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); goto out; } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = EISCONN; goto out; } inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr); if (error != 0) goto out; INP_HASH_WLOCK(pcbinfo); error = in6_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); out: INP_WUNLOCK(inp); return (error); } static void udp6_detach(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_detach: inp == NULL")); INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } static int udp6_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL")); INP_WLOCK(inp); #ifdef INET if (inp->inp_vflag & INP_IPV4) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; INP_WUNLOCK(inp); pru = inetsw[ip_protox[nxt]].pr_usrreqs; (void)(*pru->pru_disconnect)(so); return (0); } #endif if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto out; } INP_HASH_WLOCK(pcbinfo); in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); out: INP_WUNLOCK(inp); return (0); } static int udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error = 0; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_send: inp == NULL")); INP_WLOCK(inp); if (addr) { if (addr->sa_len != sizeof(struct sockaddr_in6)) { error = EINVAL; goto bad; } if (addr->sa_family != AF_INET6) { error = EAFNOSUPPORT; goto bad; } } #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { int hasv4addr; struct sockaddr_in6 *sin6 = 0; if (addr == 0) hasv4addr = (inp->inp_vflag & INP_IPV4); else { sin6 = (struct sockaddr_in6 *)addr; hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 1 : 0; } if (hasv4addr) { struct pr_usrreqs *pru; uint8_t nxt; nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; /* * XXXRW: We release UDP-layer locks before calling * udp_send() in order to avoid recursion. However, * this does mean there is a short window where inp's * fields are unstable. Could this lead to a * potential race in which the factors causing us to * select the UDPv4 output routine are invalidated? */ INP_WUNLOCK(inp); if (sin6) in6_sin6_2_sin_in_sock(addr); pru = inetsw[ip_protox[nxt]].pr_usrreqs; /* addr will just be freed in sendit(). */ return ((*pru->pru_send)(so, flags, m, addr, control, td)); } } #endif #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif INP_HASH_WLOCK(pcbinfo); error = udp6_output(inp, m, addr, control, td); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); bad: INP_WUNLOCK(inp); m_freem(m); return (error); } struct pr_usrreqs udp6_usrreqs = { .pru_abort = udp6_abort, .pru_attach = udp6_attach, .pru_bind = udp6_bind, .pru_connect = udp6_connect, .pru_control = in6_control, .pru_detach = udp6_detach, .pru_disconnect = udp6_disconnect, .pru_peeraddr = in6_mapped_peeraddr, .pru_send = udp6_send, .pru_shutdown = udp_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp6_close };