Index: head/sys/netinet6/udp6_usrreq.c =================================================================== --- head/sys/netinet6/udp6_usrreq.c +++ head/sys/netinet6/udp6_usrreq.c @@ -679,35 +679,38 @@ SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection"); +#define UH_WLOCKED 2 +#define UH_RLOCKED 1 +#define UH_UNLOCKED 0 static int -udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr6, - struct mbuf *control, struct thread *td) +udp6_output(struct socket *so, int flags_arg, struct mbuf *m, + struct sockaddr *addr6, struct mbuf *control, struct thread *td) { - u_int32_t ulen = m->m_pkthdr.len; - u_int32_t plen = sizeof(struct udphdr) + ulen; + struct inpcbinfo *pcbinfo; + struct inpcb *inp; struct ip6_hdr *ip6; struct udphdr *udp6; struct in6_addr *laddr, *faddr, in6a; - struct sockaddr_in6 *sin6 = NULL; - int cscov_partial = 0; - int scope_ambiguous = 0; - u_short fport; - int error = 0; - uint8_t nxt; - uint16_t cscov = 0; struct ip6_pktopts *optp, opt; - int af = AF_INET6, hlen = sizeof(struct ip6_hdr); - int flags; - struct sockaddr_in6 tmp; + struct sockaddr_in6 *sin6, tmp; + struct epoch_tracker et; + int cscov_partial, error, flags, hlen, scope_ambiguous; + u_int32_t ulen, plen; + uint16_t cscov; + u_short fport; + uint8_t nxt, unlock_udbinfo; - INP_WLOCK_ASSERT(inp); - INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); + /* addr6 has been validated in udp6_send(). */ + sin6 = (struct sockaddr_in6 *)addr6; - if (addr6) { - /* addr6 has been validated in udp6_send(). */ - sin6 = (struct sockaddr_in6 *)addr6; + /* + * In contrast to to IPv4 we do not validate the max. packet length + * here due to IPv6 Jumbograms (RFC2675). + */ - /* protect *sin6 from overwrites */ + scope_ambiguous = 0; + if (sin6) { + /* Protect *addr6 from overwrites. */ tmp = *sin6; sin6 = &tmp; @@ -721,22 +724,86 @@ */ if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; - if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) + if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) { + if (control) + m_freem(control); + m_freem(m); return (error); + } } + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); + INP_RLOCK(inp); nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; + +#ifdef INET + if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { + int hasv4addr; + + if (sin6 == NULL) + hasv4addr = (inp->inp_vflag & INP_IPV4); + else + hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) + ? 1 : 0; + if (hasv4addr) { + struct pr_usrreqs *pru; + + /* + * XXXRW: We release UDP-layer locks before calling + * udp_send() in order to avoid recursion. However, + * this does mean there is a short window where inp's + * fields are unstable. Could this lead to a + * potential race in which the factors causing us to + * select the UDPv4 output routine are invalidated? + */ + INP_RUNLOCK(inp); + if (sin6) + in6_sin6_2_sin_in_sock((struct sockaddr *)sin6); + pru = inetsw[ip_protox[nxt]].pr_usrreqs; + /* addr will just be freed in sendit(). */ + return ((*pru->pru_send)(so, flags_arg, m, + (struct sockaddr *)sin6, control, td)); + } + } +#endif + if (control) { if ((error = ip6_setpktopts(control, &opt, - inp->in6p_outputopts, td->td_ucred, nxt)) != 0) - goto release; + inp->in6p_outputopts, td->td_ucred, nxt)) != 0) { + INP_RUNLOCK(inp); + ip6_clearpktopts(&opt, -1); + if (control) + m_freem(control); + m_freem(m); + return (error); + } optp = &opt; } else optp = inp->in6p_outputopts; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); + if (sin6 != NULL && + IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0) { + INP_RUNLOCK(inp); + /* + * XXX there is a short window here which could lead to a race; + * should we re-check that what got us here is still valid? + */ + INP_WLOCK(inp); + INP_HASH_WLOCK(pcbinfo); + unlock_udbinfo = UH_WLOCKED; + } else if (sin6 != NULL && + (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || + IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) || + inp->inp_lport == 0)) { + INP_HASH_RLOCK_ET(pcbinfo, et); + unlock_udbinfo = UH_RLOCKED; + } else + unlock_udbinfo = UH_UNLOCKED; + if (sin6) { - faddr = &sin6->sin6_addr; /* * Since we saw no essential reason for calling in_pcbconnect, @@ -755,85 +822,47 @@ goto release; } - fport = sin6->sin6_port; /* allow 0 port */ + /* + * Given we handle the v4mapped case in the INET block above + * assert here that it must not happen anymore. + */ + KASSERT(!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr), + ("%s: sin6(%p)->sin6_addr is v4mapped which we " + "should have handled.", __func__, sin6)); - if (IN6_IS_ADDR_V4MAPPED(faddr)) { - if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { - /* - * I believe we should explicitly discard the - * packet when mapped addresses are disabled, - * rather than send the packet as an IPv6 one. - * If we chose the latter approach, the packet - * might be sent out on the wire based on the - * default route, the situation which we'd - * probably want to avoid. - * (20010421 jinmei@kame.net) - */ - error = EINVAL; - goto release; - } - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && - !IN6_IS_ADDR_V4MAPPED(&inp->in6p_laddr)) { - /* - * when remote addr is an IPv4-mapped address, - * local addr should not be an IPv6 address, - * since you cannot determine how to map IPv6 - * source address to IPv4. - */ - error = EINVAL; - goto release; - } + /* This only requires read-locking. */ + error = in6_selectsrc_socket(sin6, optp, inp, + td->td_ucred, scope_ambiguous, &in6a, NULL); + if (error) + goto release; + laddr = &in6a; - af = AF_INET; - } + if (inp->inp_lport == 0) { - if (!IN6_IS_ADDR_V4MAPPED(faddr)) { - error = in6_selectsrc_socket(sin6, optp, inp, - td->td_ucred, scope_ambiguous, &in6a, NULL); - if (error) + INP_WLOCK_ASSERT(inp); + error = in6_pcbsetport(laddr, inp, td->td_ucred); + if (error != 0) { + /* Undo an address bind that may have occurred. */ + inp->in6p_laddr = in6addr_any; goto release; - laddr = &in6a; - } else - laddr = &inp->in6p_laddr; /* XXX */ - if (laddr == NULL) { - if (error == 0) - error = EADDRNOTAVAIL; - goto release; + } } - if (inp->inp_lport == 0 && - (error = in6_pcbsetport(laddr, inp, td->td_ucred)) != 0) { - /* Undo an address bind that may have occurred. */ - inp->in6p_laddr = in6addr_any; - goto release; - } + faddr = &sin6->sin6_addr; + fport = sin6->sin6_port; /* allow 0 port */ + } else { if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { error = ENOTCONN; goto release; } - if (IN6_IS_ADDR_V4MAPPED(&inp->in6p_faddr)) { - if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) { - /* - * XXX: this case would happen when the - * application sets the V6ONLY flag after - * connecting the foreign address. - * Such applications should be fixed, - * so we bark here. - */ - log(LOG_INFO, "udp6_output: IPV6_V6ONLY " - "option was set for a connected socket\n"); - error = EINVAL; - goto release; - } else - af = AF_INET; - } laddr = &inp->in6p_laddr; faddr = &inp->in6p_faddr; fport = inp->inp_fport; } - if (af == AF_INET) - hlen = sizeof(struct ip); + ulen = m->m_pkthdr.len; + plen = sizeof(struct udphdr) + ulen; + hlen = sizeof(struct ip6_hdr); /* * Calculate data length and get a mbuf @@ -848,6 +877,7 @@ /* * Stuff checksum and output datagram. */ + cscov = cscov_partial = 0; udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen); udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */ udp6->uh_dport = fport; @@ -870,59 +900,59 @@ udp6->uh_ulen = 0; udp6->uh_sum = 0; - switch (af) { - case AF_INET6: - ip6 = mtod(m, struct ip6_hdr *); - ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK; - ip6->ip6_vfc &= ~IPV6_VERSION_MASK; - ip6->ip6_vfc |= IPV6_VERSION; - ip6->ip6_plen = htons((u_short)plen); - ip6->ip6_nxt = nxt; - ip6->ip6_hlim = in6_selecthlim(inp, NULL); - ip6->ip6_src = *laddr; - ip6->ip6_dst = *faddr; + ip6 = mtod(m, struct ip6_hdr *); + ip6->ip6_flow = inp->inp_flow & IPV6_FLOWINFO_MASK; + ip6->ip6_vfc &= ~IPV6_VERSION_MASK; + ip6->ip6_vfc |= IPV6_VERSION; + ip6->ip6_plen = htons((u_short)plen); + ip6->ip6_nxt = nxt; + ip6->ip6_hlim = in6_selecthlim(inp, NULL); + ip6->ip6_src = *laddr; + ip6->ip6_dst = *faddr; - if (cscov_partial) { - if ((udp6->uh_sum = in6_cksum_partial(m, nxt, - sizeof(struct ip6_hdr), plen, cscov)) == 0) - udp6->uh_sum = 0xffff; - } else { - udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0); - m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; - m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); - } +#ifdef MAC + mac_inpcb_create_mbuf(inp, m); +#endif + if (cscov_partial) { + if ((udp6->uh_sum = in6_cksum_partial(m, nxt, + sizeof(struct ip6_hdr), plen, cscov)) == 0) + udp6->uh_sum = 0xffff; + } else { + udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0); + m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + } + + flags = 0; #ifdef RSS - { - uint32_t hash_val, hash_type; - uint8_t pr; + { + uint32_t hash_val, hash_type; + uint8_t pr; - pr = inp->inp_socket->so_proto->pr_protocol; - /* - * Calculate an appropriate RSS hash for UDP and - * UDP Lite. - * - * The called function will take care of figuring out - * whether a 2-tuple or 4-tuple hash is required based - * on the currently configured scheme. - * - * Later later on connected socket values should be - * cached in the inpcb and reused, rather than constantly - * re-calculating it. - * - * UDP Lite is a different protocol number and will - * likely end up being hashed as a 2-tuple until - * RSS / NICs grow UDP Lite protocol awareness. - */ - if (rss_proto_software_hash_v6(faddr, laddr, fport, - inp->inp_lport, pr, &hash_val, &hash_type) == 0) { - m->m_pkthdr.flowid = hash_val; - M_HASHTYPE_SET(m, hash_type); - } + pr = inp->inp_socket->so_proto->pr_protocol; + /* + * Calculate an appropriate RSS hash for UDP and + * UDP Lite. + * + * The called function will take care of figuring out + * whether a 2-tuple or 4-tuple hash is required based + * on the currently configured scheme. + * + * Later later on connected socket values should be + * cached in the inpcb and reused, rather than constantly + * re-calculating it. + * + * UDP Lite is a different protocol number and will + * likely end up being hashed as a 2-tuple until + * RSS / NICs grow UDP Lite protocol awareness. + */ + if (rss_proto_software_hash_v6(faddr, laddr, fport, + inp->inp_lport, pr, &hash_val, &hash_type) == 0) { + m->m_pkthdr.flowid = hash_val; + M_HASHTYPE_SET(m, hash_type); } -#endif - flags = 0; -#ifdef RSS + /* * Don't override with the inp cached flowid. * @@ -932,28 +962,43 @@ flags |= IP_NODEFAULTFLOWID; #endif - if (nxt == IPPROTO_UDPLITE) - UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6); - else - UDP_PROBE(send, NULL, inp, ip6, inp, udp6); - UDPSTAT_INC(udps_opackets); - error = ip6_output(m, optp, &inp->inp_route6, flags, - inp->in6p_moptions, NULL, inp); - break; - case AF_INET: - error = EAFNOSUPPORT; - goto release; + UDPSTAT_INC(udps_opackets); + if (unlock_udbinfo == UH_WLOCKED) + INP_HASH_WUNLOCK(pcbinfo); + else if (unlock_udbinfo == UH_RLOCKED) + INP_HASH_RUNLOCK_ET(pcbinfo, et); + if (nxt == IPPROTO_UDPLITE) + UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6); + else + UDP_PROBE(send, NULL, inp, ip6, inp, udp6); + error = ip6_output(m, optp, &inp->inp_route6, flags, + inp->in6p_moptions, NULL, inp); + if (unlock_udbinfo == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); + + if (control) { + ip6_clearpktopts(&opt, -1); + m_freem(control); } - goto releaseopt; + return (error); release: - m_freem(m); - -releaseopt: + if (unlock_udbinfo == UH_WLOCKED) { + INP_HASH_WUNLOCK(pcbinfo); + INP_WUNLOCK(inp); + } else if (unlock_udbinfo == UH_RLOCKED) { + INP_HASH_RUNLOCK_ET(pcbinfo, et); + INP_RUNLOCK(inp); + } else + INP_RUNLOCK(inp); if (control) { ip6_clearpktopts(&opt, -1); m_freem(control); } + m_freem(m); + return (error); } @@ -1257,15 +1302,8 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { - struct inpcb *inp; - struct inpcbinfo *pcbinfo; - int error = 0; + int error; - pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("udp6_send: inp == NULL")); - - INP_WLOCK(inp); if (addr) { if (addr->sa_len != sizeof(struct sockaddr_in6)) { error = EINVAL; @@ -1277,53 +1315,11 @@ } } -#ifdef INET - if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { - int hasv4addr; - struct sockaddr_in6 *sin6 = NULL; + return (udp6_output(so, flags, m, addr, control, td)); - if (addr == NULL) - hasv4addr = (inp->inp_vflag & INP_IPV4); - else { - sin6 = (struct sockaddr_in6 *)addr; - hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) - ? 1 : 0; - } - if (hasv4addr) { - struct pr_usrreqs *pru; - uint8_t nxt; - - nxt = (inp->inp_socket->so_proto->pr_protocol == - IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; - /* - * XXXRW: We release UDP-layer locks before calling - * udp_send() in order to avoid recursion. However, - * this does mean there is a short window where inp's - * fields are unstable. Could this lead to a - * potential race in which the factors causing us to - * select the UDPv4 output routine are invalidated? - */ - INP_WUNLOCK(inp); - if (sin6) - in6_sin6_2_sin_in_sock(addr); - pru = inetsw[ip_protox[nxt]].pr_usrreqs; - /* addr will just be freed in sendit(). */ - return ((*pru->pru_send)(so, flags, m, addr, control, - td)); - } - } -#endif -#ifdef MAC - mac_inpcb_create_mbuf(inp, m); -#endif - INP_HASH_WLOCK(pcbinfo); - error = udp6_output(inp, m, addr, control, td); - INP_HASH_WUNLOCK(pcbinfo); - INP_WUNLOCK(inp); - return (error); - bad: - INP_WUNLOCK(inp); + if (control) + m_freem(control); m_freem(m); return (error); }