Index: sbin/ifconfig/ifgre.c =================================================================== --- sbin/ifconfig/ifgre.c +++ sbin/ifconfig/ifgre.c @@ -44,15 +44,16 @@ #include "ifconfig.h" -#define GREBITS "\020\01ENABLE_CSUM\02ENABLE_SEQ" +#define GREBITS "\020\01ENABLE_CSUM\02ENABLE_SEQ\03UDPENCAP" static void gre_status(int s); static void gre_status(int s) { - uint32_t opts = 0; + uint32_t opts, port; + opts = 0; ifr.ifr_data = (caddr_t)&opts; if (ioctl(s, GREGKEY, &ifr) == 0) if (opts != 0) @@ -60,6 +61,11 @@ opts = 0; if (ioctl(s, GREGOPTS, &ifr) != 0 || opts == 0) return; + + port = 0; + ifr.ifr_data = (caddr_t)&port; + if (ioctl(s, GREGPORT, &ifr) == 0 && port != 0) + printf("\tudpport: %u\n", port); printb("\toptions", opts, GREBITS); putchar('\n'); } @@ -76,6 +82,22 @@ warn("ioctl (set grekey)"); } +static void +setifgreport(const char *val, int dummy __unused, int s, + const struct afswtch *afp) +{ + uint32_t udpport = strtol(val, NULL, 0); + + if (udpport != 0 && (udpport < 0xC000 || udpport > 0xFFFF)) { + warnx("udpport should be chosen from the range 49152-65535"); + return; + } + strlcpy(ifr.ifr_name, name, sizeof (ifr.ifr_name)); + ifr.ifr_data = (caddr_t)&udpport; + if (ioctl(s, GRESPORT, (caddr_t)&ifr) < 0) + warn("ioctl (set udpport)"); +} + static void setifgreopts(const char *val, int d, int s, const struct afswtch *afp) { @@ -101,10 +123,13 @@ static struct cmd gre_cmds[] = { DEF_CMD_ARG("grekey", setifgrekey), + DEF_CMD_ARG("udpport", setifgreport), DEF_CMD("enable_csum", GRE_ENABLE_CSUM, setifgreopts), DEF_CMD("-enable_csum",-GRE_ENABLE_CSUM,setifgreopts), DEF_CMD("enable_seq", GRE_ENABLE_SEQ, setifgreopts), DEF_CMD("-enable_seq",-GRE_ENABLE_SEQ, setifgreopts), + DEF_CMD("udpencap", GRE_UDPENCAP, setifgreopts), + DEF_CMD("-udpencap",-GRE_UDPENCAP, setifgreopts), }; static struct afswtch af_gre = { .af_name = "af_gre", Index: share/man/man4/gre.4 =================================================================== --- share/man/man4/gre.4 +++ share/man/man4/gre.4 @@ -29,7 +29,7 @@ .\" .\" $FreeBSD$ .\" -.Dd June 2, 2015 +.Dd April 16, 2019 .Dt GRE 4 .Os .Sh NAME @@ -89,7 +89,45 @@ Enables checksum calculation for outgoing packets. .It Ar enable_seq Enables use of sequence number field in the GRE header for outgoing packets. +.It Ar udpencap +Enables UDP-in-GRE encapsulation (see the +.Sx GRE-IN-UDP ENCAPSULATION +Section below for details). +.It Ar udpport +Set the source UDP port for outgoing packets. +A value of 0 disables the persistence of source UDP port for outgoing packets. +See the +.Sx GRE-IN-UDP ENCAPSULATION +Section below for details. .El +.Sh GRE-IN-UDP ENCAPSULATION +The +.Nm +supports GRE in UDP encapsulation as defined in RFC 8086. +A GRE in UDP tunnel offers the possibility of better performance for +load-balancing GRE traffic in transit networks. +Encapsulating GRE in UDP enables use of the UDP source port to provide +entropy to ECMP hashing. +.Pp +The GRE in UDP tunnel uses single value 4754 as UDP destination port. +The UDP source port contains a 14-bit entropy value that is generated +by the encapsulator to identify a flow for the encapsulated packet. +The +.Ar udpport +option can be used to disable this behaviour and use single source UDP +port value. +The value of +.Ar udpport +should be within the ephemeral port range, i.e., 49152 to 65535. +.Pp +Note that a GRE in UDP tunnel is unidirectional; the tunnel traffic is not +expected to be returned back to the UDP source port values used to generate +entropy. +This may impact NAPT (Network Address Port Translator) middleboxes. +If such tunnels are expected to be used on a path with a middlebox, +the tunnel can be configured either to disable use of the UDP source port +for entropy or to enable middleboxes to pass packets with UDP source port +entropy. .Sh EXAMPLES .Bd -literal 192.168.1.* --- Router A -------tunnel-------- Router B --- 192.168.2.* Index: sys/conf/config.mk =================================================================== --- sys/conf/config.mk +++ sys/conf/config.mk @@ -27,6 +27,10 @@ echo "#define MROUTING 1" > ${.TARGET} opt_printf.h: echo "#define PRINTF_BUFR_SIZE 128" > ${.TARGET} +.if ${MK_RSS} != "no" +opt_rss.h: + echo "#define RSS 1" > ${.TARGET} +.endif opt_scsi.h: echo "#define SCSI_DELAY 15000" > ${.TARGET} opt_wlan.h: Index: sys/conf/kern.opts.mk =================================================================== --- sys/conf/kern.opts.mk +++ sys/conf/kern.opts.mk @@ -52,7 +52,8 @@ KERNEL_RETPOLINE \ NAND \ OFED \ - RATELIMIT + RATELIMIT \ + RSS # Some options are totally broken on some architectures. We disable # them. If you need to enable them on an experimental basis, you Index: sys/modules/if_gre/Makefile =================================================================== --- sys/modules/if_gre/Makefile +++ sys/modules/if_gre/Makefile @@ -5,7 +5,7 @@ .include "${SYSDIR}/conf/kern.opts.mk" KMOD= if_gre -SRCS= if_gre.c opt_inet.h opt_inet6.h +SRCS= if_gre.c opt_inet.h opt_inet6.h opt_rss.h SRCS.INET= ip_gre.c SRCS.INET6= ip6_gre.c Index: sys/net/if_gre.h =================================================================== --- sys/net/if_gre.h +++ sys/net/if_gre.h @@ -53,14 +53,35 @@ struct ip gi_ip; struct grehdr gi_gre; } __packed; -#endif + +struct greudp { + struct ip gi_ip; + struct udphdr gi_udp; + struct grehdr gi_gre; +} __packed; +#endif /* INET */ #ifdef INET6 struct greip6 { struct ip6_hdr gi6_ip6; struct grehdr gi6_gre; } __packed; -#endif + +struct greudp6 { + struct ip6_hdr gi6_ip6; + struct udphdr gi6_udp; + struct grehdr gi6_gre; +} __packed; +#endif /* INET6 */ + +CK_LIST_HEAD(gre_list, gre_softc); +CK_LIST_HEAD(gre_sockets, gre_socket); +struct gre_socket { + struct socket *so; + struct gre_list list; + CK_LIST_ENTRY(gre_socket) chain; + struct epoch_context epoch_ctx; +}; struct gre_softc { struct ifnet *gre_ifp; @@ -69,22 +90,26 @@ uint32_t gre_oseq; uint32_t gre_key; uint32_t gre_options; + uint32_t gre_csumflags; + uint32_t gre_port; u_int gre_fibnum; u_int gre_hlen; /* header size */ union { void *hdr; #ifdef INET - struct greip *gihdr; + struct greip *iphdr; + struct greudp *udphdr; #endif #ifdef INET6 - struct greip6 *gi6hdr; + struct greip6 *ip6hdr; + struct greudp6 *udp6hdr; #endif } gre_uhdr; + struct gre_socket *gre_so; CK_LIST_ENTRY(gre_softc) chain; CK_LIST_ENTRY(gre_softc) srchash; }; -CK_LIST_HEAD(gre_list, gre_softc); MALLOC_DECLARE(M_GRE); #ifndef GRE_HASH_SIZE @@ -98,28 +123,35 @@ #define GRE_WAIT() epoch_wait_preempt(net_epoch_preempt) #define gre_hdr gre_uhdr.hdr -#define gre_gihdr gre_uhdr.gihdr -#define gre_gi6hdr gre_uhdr.gi6hdr -#define gre_oip gre_gihdr->gi_ip -#define gre_oip6 gre_gi6hdr->gi6_ip6 +#define gre_iphdr gre_uhdr.iphdr +#define gre_ip6hdr gre_uhdr.ip6hdr +#define gre_udphdr gre_uhdr.udphdr +#define gre_udp6hdr gre_uhdr.udp6hdr + +#define gre_oip gre_iphdr->gi_ip +#define gre_udp gre_udphdr->gi_udp +#define gre_oip6 gre_ip6hdr->gi6_ip6 +#define gre_udp6 gre_udp6hdr->gi6_udp struct gre_list *gre_hashinit(void); void gre_hashdestroy(struct gre_list *); int gre_input(struct mbuf *, int, int, void *); -void gre_updatehdr(struct gre_softc *, struct grehdr *); +void gre_update_hdr(struct gre_softc *, struct grehdr *); +void gre_update_udphdr(struct gre_softc *, struct udphdr *, uint16_t); +void gre_sofree(epoch_context_t); void in_gre_init(void); void in_gre_uninit(void); -void in_gre_setopts(struct gre_softc *, u_long, uint32_t); +int in_gre_setopts(struct gre_softc *, u_long, uint32_t); int in_gre_ioctl(struct gre_softc *, u_long, caddr_t); int in_gre_output(struct mbuf *, int, int); void in6_gre_init(void); void in6_gre_uninit(void); -void in6_gre_setopts(struct gre_softc *, u_long, uint32_t); +int in6_gre_setopts(struct gre_softc *, u_long, uint32_t); int in6_gre_ioctl(struct gre_softc *, u_long, caddr_t); -int in6_gre_output(struct mbuf *, int, int); +int in6_gre_output(struct mbuf *, int, int, uint32_t); /* * CISCO uses special type for GRE tunnel created as part of WCCP * connection, while in fact those packets are just IPv4 encapsulated @@ -139,9 +171,15 @@ #define GRESKEY _IOW('i', 108, struct ifreq) #define GREGOPTS _IOWR('i', 109, struct ifreq) #define GRESOPTS _IOW('i', 110, struct ifreq) +#define GREGPORT _IOWR('i', 111, struct ifreq) +#define GRESPORT _IOW('i', 112, struct ifreq) + +/* GRE-in-UDP encapsulation destination port as defined in RFC8086 */ +#define GRE_UDPPORT 4754 #define GRE_ENABLE_CSUM 0x0001 #define GRE_ENABLE_SEQ 0x0002 -#define GRE_OPTMASK (GRE_ENABLE_CSUM|GRE_ENABLE_SEQ) +#define GRE_UDPENCAP 0x0004 +#define GRE_OPTMASK (GRE_ENABLE_CSUM|GRE_ENABLE_SEQ|GRE_UDPENCAP) #endif /* _NET_IF_GRE_H_ */ Index: sys/net/if_gre.c =================================================================== --- sys/net/if_gre.c +++ sys/net/if_gre.c @@ -39,6 +39,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_rss.h" #include #include @@ -49,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -69,15 +71,22 @@ #include #include #include +#ifdef RSS +#include +#endif #endif #ifdef INET6 #include #include #include +#ifdef RSS +#include +#endif #endif #include +#include #include #include @@ -151,6 +160,7 @@ #ifdef INET6 in6_gre_uninit(); #endif + /* XXX: epoch_call drain */ } VNET_SYSUNINIT(vnet_gre_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gre_uninit, NULL); @@ -266,6 +276,7 @@ break; case GRESKEY: case GRESOPTS: + case GRESPORT: if ((error = priv_check(curthread, PRIV_NET_GRE)) != 0) break; if ((error = copyin(ifr_data_get_ptr(ifr), &opt, @@ -281,23 +292,44 @@ } if (sc->gre_options == opt) break; + } else if (cmd == GRESPORT) { + if (opt != 0 && (opt < 0xC000 || opt > 0xFFFF)) { + error = EINVAL; + break; + } + if (sc->gre_port == opt) + break; + if ((sc->gre_options & GRE_UDPENCAP) == 0) { + /* + * UDP encapsulation is not enabled, thus + * there is no need to reattach softc. + */ + sc->gre_port = opt; + break; + } } switch (sc->gre_family) { #ifdef INET case AF_INET: - in_gre_setopts(sc, cmd, opt); + error = in_gre_setopts(sc, cmd, opt); break; #endif #ifdef INET6 case AF_INET6: - in6_gre_setopts(sc, cmd, opt); + error = in6_gre_setopts(sc, cmd, opt); break; #endif default: + /* + * Tunnel is not yet configured. + * We can just change any parameters. + */ if (cmd == GRESKEY) sc->gre_key = opt; - else + if (cmd == GRESOPTS) sc->gre_options = opt; + if (cmd == GRESPORT) + sc->gre_port = opt; break; } /* @@ -313,6 +345,10 @@ error = copyout(&sc->gre_options, ifr_data_get_ptr(ifr), sizeof(sc->gre_options)); break; + case GREGPORT: + error = copyout(&sc->gre_port, ifr_data_get_ptr(ifr), + sizeof(sc->gre_port)); + break; default: error = EINVAL; break; @@ -337,6 +373,7 @@ static void gre_delete_tunnel(struct gre_softc *sc) { + struct gre_socket *gs; sx_assert(&gre_ioctl_sx, SA_XLOCKED); if (sc->gre_family != 0) { @@ -346,6 +383,16 @@ free(sc->gre_hdr, M_GRE); sc->gre_family = 0; } + /* + * If this Tunnel was the last one that could use UDP socket, + * we should unlink socket from hash table and close it. + */ + if ((gs = sc->gre_so) != NULL && CK_LIST_EMPTY(&gs->list)) { + CK_LIST_REMOVE(gs, chain); + soclose(gs->so); + epoch_call(net_epoch_preempt, &gs->epoch_ctx, gre_sofree); + sc->gre_so = NULL; + } GRE2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; if_link_state_change(GRE2IFP(sc), LINK_STATE_DOWN); } @@ -372,7 +419,38 @@ } void -gre_updatehdr(struct gre_softc *sc, struct grehdr *gh) +gre_sofree(epoch_context_t ctx) +{ + struct gre_socket *gs; + + gs = __containerof(ctx, struct gre_socket, epoch_ctx); + free(gs, M_GRE); +} + +static __inline uint16_t +gre_cksum_add(uint16_t sum, uint16_t a) +{ + uint16_t res; + + res = sum + a; + return (res + (res < a)); +} + +void +gre_update_udphdr(struct gre_softc *sc, struct udphdr *udp, uint16_t csum) +{ + + sx_assert(&gre_ioctl_sx, SA_XLOCKED); + MPASS(sc->gre_options & GRE_UDPENCAP); + + udp->uh_dport = htons(GRE_UDPPORT); + udp->uh_sport = htons(sc->gre_port); + udp->uh_sum = csum; + udp->uh_ulen = 0; +} + +void +gre_update_hdr(struct gre_softc *sc, struct grehdr *gh) { uint32_t *opts; uint16_t flags; @@ -539,6 +617,52 @@ *opts = htonl(seq); } +static uint32_t +gre_flowid(struct gre_softc *sc, struct mbuf *m, uint32_t af) +{ + uint32_t flowid; + + if ((sc->gre_options & GRE_UDPENCAP) == 0 || sc->gre_port != 0) + return (0); +#ifndef RSS + switch (af) { +#ifdef INET + case AF_INET: + flowid = mtod(m, struct ip *)->ip_src.s_addr ^ + mtod(m, struct ip *)->ip_dst.s_addr; + break; +#endif +#ifdef INET6 + case AF_INET6: + flowid = mtod(m, struct ip6_hdr *)->ip6_src.s6_addr32[3] ^ + mtod(m, struct ip6_hdr *)->ip6_dst.s6_addr32[3]; + break; +#endif + default: + flowid = 0; + } +#else /* RSS */ + switch (af) { +#ifdef INET + case AF_INET: + flowid = rss_hash_ip4_2tuple(mtod(m, struct ip *)->ip_src, + mtod(m, struct ip *)->ip_dst); + break; +#endif +#ifdef INET6 + case AF_INET6: + flowid = rss_hash_ip6_2tuple( + &mtod(m, struct ip6_hdr *)->ip6_src, + &mtod(m, struct ip6_hdr *)->ip6_dst); + break; +#endif + default: + flowid = 0; + } +#endif + return (flowid); +} + #define MTAG_GRE 1307983903 static int gre_transmit(struct ifnet *ifp, struct mbuf *m) @@ -546,7 +670,8 @@ GRE_RLOCK_TRACKER; struct gre_softc *sc; struct grehdr *gh; - uint32_t af; + struct udphdr *uh; + uint32_t af, flowid; int error, len; uint16_t proto; @@ -573,6 +698,7 @@ af = m->m_pkthdr.csum_data; BPF_MTAP2(ifp, &af, sizeof(af), m); m->m_flags &= ~(M_BCAST|M_MCAST); + flowid = gre_flowid(sc, m, af); M_SETFIB(m, sc->gre_fibnum); M_PREPEND(m, sc->gre_hlen, M_NOWAIT); if (m == NULL) { @@ -614,6 +740,17 @@ error = ENETDOWN; goto drop; } + if (sc->gre_options & GRE_UDPENCAP) { + uh = (struct udphdr *)mtodo(m, len); + uh->uh_sport |= htons(0xC000) | (flowid >> 16) | + (flowid & 0xFFFF); + uh->uh_ulen = htons(m->m_pkthdr.len - len); + uh->uh_sum = gre_cksum_add(uh->uh_sum, + htons(m->m_pkthdr.len - len + IPPROTO_UDP)); + m->m_pkthdr.csum_flags = sc->gre_csumflags; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + len += sizeof(struct udphdr); + } gh = (struct grehdr *)mtodo(m, len); gh->gre_proto = proto; if (sc->gre_options & GRE_ENABLE_SEQ) @@ -631,7 +768,7 @@ #endif #ifdef INET6 case AF_INET6: - error = in6_gre_output(m, af, sc->gre_hlen); + error = in6_gre_output(m, af, sc->gre_hlen, flowid); break; #endif default: Index: sys/netinet/ip_gre.c =================================================================== --- sys/netinet/ip_gre.c +++ sys/netinet/ip_gre.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -58,15 +59,19 @@ #include #include +#include #include #include #include +#include +#include #ifdef INET6 #include #endif #include +#include #define GRE_TTL 30 VNET_DEFINE(int, ip_gre_ttl) = GRE_TTL; @@ -74,14 +79,22 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, grettl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_gre_ttl), 0, "Default TTL value for encapsulated packets"); +struct in_gre_socket { + struct gre_socket base; + in_addr_t addr; +}; +VNET_DEFINE_STATIC(struct gre_sockets *, ipv4_sockets) = NULL; VNET_DEFINE_STATIC(struct gre_list *, ipv4_hashtbl) = NULL; VNET_DEFINE_STATIC(struct gre_list *, ipv4_srchashtbl) = NULL; +#define V_ipv4_sockets VNET(ipv4_sockets) #define V_ipv4_hashtbl VNET(ipv4_hashtbl) #define V_ipv4_srchashtbl VNET(ipv4_srchashtbl) #define GRE_HASH(src, dst) (V_ipv4_hashtbl[\ in_gre_hashval((src), (dst)) & (GRE_HASH_SIZE - 1)]) #define GRE_SRCHASH(src) (V_ipv4_srchashtbl[\ fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)]) +#define GRE_SOCKHASH(src) (V_ipv4_sockets[\ + fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)]) #define GRE_HASH_SC(sc) GRE_HASH((sc)->gre_oip.ip_src.s_addr,\ (sc)->gre_oip.ip_dst.s_addr) @@ -94,17 +107,43 @@ return (fnv_32_buf(&dst, sizeof(dst), ret)); } +static struct gre_socket* +in_gre_lookup_socket(in_addr_t addr) +{ + struct gre_socket *gs; + struct in_gre_socket *s; + + CK_LIST_FOREACH(gs, &GRE_SOCKHASH(addr), chain) { + s = __containerof(gs, struct in_gre_socket, base); + if (s->addr == addr) + break; + } + return (gs); +} + static int -in_gre_checkdup(const struct gre_softc *sc, in_addr_t src, in_addr_t dst) +in_gre_checkdup(const struct gre_softc *sc, in_addr_t src, in_addr_t dst, + uint32_t opts) { + struct gre_list *head; struct gre_softc *tmp; + struct gre_socket *gs; if (sc->gre_family == AF_INET && sc->gre_oip.ip_src.s_addr == src && - sc->gre_oip.ip_dst.s_addr == dst) + sc->gre_oip.ip_dst.s_addr == dst && + (sc->gre_options & GRE_UDPENCAP) == (opts & GRE_UDPENCAP)) return (EEXIST); - CK_LIST_FOREACH(tmp, &GRE_HASH(src, dst), chain) { + if (opts & GRE_UDPENCAP) { + gs = in_gre_lookup_socket(src); + if (gs == NULL) + return (0); + head = &gs->list; + } else + head = &GRE_HASH(src, dst); + + CK_LIST_FOREACH(tmp, head, chain) { if (tmp == sc) continue; if (tmp->gre_oip.ip_src.s_addr == src && @@ -181,35 +220,228 @@ } static void +in_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp, + const struct sockaddr *sa, void *ctx) +{ + struct epoch_tracker et; + struct gre_socket *gs; + struct gre_softc *sc; + in_addr_t dst; + + NET_EPOCH_ENTER(et); + /* + * udp_append() holds reference to inp, it is safe to check + * inp_flags2 without INP_RLOCK(). + * If socket was closed before we have entered NET_EPOCH section, + * INP_FREED flag should be set. Otherwise it should be safe to + * make access to ctx data, because gre_so will be freed by + * gre_sofree() via epoch_call(). + */ + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + NET_EPOCH_EXIT(et); + m_freem(m); + return; + } + + gs = (struct gre_socket *)ctx; + dst = ((const struct sockaddr_in *)sa)->sin_addr.s_addr; + CK_LIST_FOREACH(sc, &gs->list, chain) { + if (sc->gre_oip.ip_dst.s_addr == dst) + break; + } + if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){ + gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc); + NET_EPOCH_EXIT(et); + return; + } + m_freem(m); + NET_EPOCH_EXIT(et); +} + +static int +in_gre_setup_socket(struct gre_softc *sc) +{ + struct sockopt sopt; + struct sockaddr_in sin; + struct in_gre_socket *s; + struct gre_socket *gs; + in_addr_t addr; + int error, value; + + /* + * NOTE: we are protected with gre_ioctl_sx lock. + * + * First check that socket is already configured. + * If so, check that source addres was not changed. + * If address is different, check that there are no other tunnels + * and close socket. + */ + addr = sc->gre_oip.ip_src.s_addr; + gs = sc->gre_so; + if (gs != NULL) { + s = __containerof(gs, struct in_gre_socket, base); + if (s->addr != addr) { + if (CK_LIST_EMPTY(&gs->list)) { + CK_LIST_REMOVE(gs, chain); + soclose(gs->so); + epoch_call(net_epoch_preempt, &gs->epoch_ctx, + gre_sofree); + } + gs = sc->gre_so = NULL; + } + } + + if (gs == NULL) { + /* + * Check that socket for given address is already + * configured. + */ + gs = in_gre_lookup_socket(addr); + if (gs == NULL) { + s = malloc(sizeof(*s), M_GRE, M_WAITOK | M_ZERO); + s->addr = addr; + gs = &s->base; + + error = socreate(sc->gre_family, &gs->so, + SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, + curthread); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot create socket: %d\n", error); + free(s, M_GRE); + return (error); + } + + error = udp_set_kernel_tunneling(gs->so, + in_gre_udp_input, NULL, gs); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot set UDP tunneling: %d\n", error); + goto fail; + } + + memset(&sopt, 0, sizeof(sopt)); + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = IPPROTO_IP; + sopt.sopt_name = IP_BINDANY; + sopt.sopt_val = &value; + sopt.sopt_valsize = sizeof(value); + value = 1; + error = sosetopt(gs->so, &sopt); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot set IP_BINDANY opt: %d\n", error); + goto fail; + } + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(sin); + sin.sin_addr.s_addr = addr; + sin.sin_port = htons(GRE_UDPPORT); + error = sobind(gs->so, (struct sockaddr *)&sin, + curthread); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot bind socket: %d\n", error); + goto fail; + } + /* Add socket to the chain */ + CK_LIST_INSERT_HEAD(&GRE_SOCKHASH(addr), gs, chain); + } + } + + /* Add softc to the socket's list */ + CK_LIST_INSERT_HEAD(&gs->list, sc, chain); + sc->gre_so = gs; + return (0); +fail: + soclose(gs->so); + free(s, M_GRE); + return (error); +} + +static int in_gre_attach(struct gre_softc *sc) { + struct grehdr *gh; + int error; - sc->gre_hlen = sizeof(struct greip); + if (sc->gre_options & GRE_UDPENCAP) { + sc->gre_csumflags = CSUM_UDP; + sc->gre_hlen = sizeof(struct greudp); + sc->gre_oip.ip_p = IPPROTO_UDP; + gh = &sc->gre_udphdr->gi_gre; + gre_update_udphdr(sc, &sc->gre_udp, + in_pseudo(sc->gre_oip.ip_src.s_addr, + sc->gre_oip.ip_dst.s_addr, 0)); + } else { + sc->gre_hlen = sizeof(struct greip); + sc->gre_oip.ip_p = IPPROTO_GRE; + gh = &sc->gre_iphdr->gi_gre; + } sc->gre_oip.ip_v = IPVERSION; sc->gre_oip.ip_hl = sizeof(struct ip) >> 2; - sc->gre_oip.ip_p = IPPROTO_GRE; - gre_updatehdr(sc, &sc->gre_gihdr->gi_gre); - CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain); + gre_update_hdr(sc, gh); + + /* + * If we return error, this means that sc is not linked, + * and caller should reset gre_family and free(sc->gre_hdr). + */ + if (sc->gre_options & GRE_UDPENCAP) { + error = in_gre_setup_socket(sc); + if (error != 0) + return (error); + } else + CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain); CK_LIST_INSERT_HEAD(&GRE_SRCHASH(sc->gre_oip.ip_src.s_addr), sc, srchash); + + /* Set IFF_DRV_RUNNING if interface is ready */ + in_gre_set_running(sc); + return (0); } -void +int in_gre_setopts(struct gre_softc *sc, u_long cmd, uint32_t value) { - - MPASS(cmd == GRESKEY || cmd == GRESOPTS); + int error; /* NOTE: we are protected with gre_ioctl_sx lock */ + MPASS(cmd == GRESKEY || cmd == GRESOPTS || cmd == GRESPORT); MPASS(sc->gre_family == AF_INET); + + /* + * If we are going to change encapsulation protocol, do check + * for duplicate tunnels. Return EEXIST here to do not confuse + * user. + */ + if (cmd == GRESOPTS && + (sc->gre_options & GRE_UDPENCAP) != (value & GRE_UDPENCAP) && + in_gre_checkdup(sc, sc->gre_oip.ip_src.s_addr, + sc->gre_oip.ip_dst.s_addr, value) == EADDRNOTAVAIL) + return (EEXIST); + CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); GRE_WAIT(); - if (cmd == GRESKEY) + switch (cmd) { + case GRESKEY: sc->gre_key = value; - else + break; + case GRESOPTS: sc->gre_options = value; - in_gre_attach(sc); + break; + case GRESPORT: + sc->gre_port = value; + break; + } + error = in_gre_attach(sc); + if (error != 0) { + sc->gre_family = 0; + free(sc->gre_hdr, M_GRE); + } + return (error); } int @@ -241,9 +473,10 @@ if (V_ipv4_hashtbl == NULL) { V_ipv4_hashtbl = gre_hashinit(); V_ipv4_srchashtbl = gre_hashinit(); + V_ipv4_sockets = (struct gre_sockets *)gre_hashinit(); } error = in_gre_checkdup(sc, src->sin_addr.s_addr, - dst->sin_addr.s_addr); + dst->sin_addr.s_addr, sc->gre_options); if (error == EADDRNOTAVAIL) break; if (error == EEXIST) { @@ -251,7 +484,7 @@ error = 0; break; } - ip = malloc(sizeof(struct greip) + 3 * sizeof(uint32_t), + ip = malloc(sizeof(struct greudp) + 3 * sizeof(uint32_t), M_GRE, M_WAITOK | M_ZERO); ip->ip_src.s_addr = src->sin_addr.s_addr; ip->ip_dst.s_addr = dst->sin_addr.s_addr; @@ -267,8 +500,11 @@ sc->gre_hdr = ip; sc->gre_oseq = 0; sc->gre_iseq = UINT32_MAX; - in_gre_attach(sc); - in_gre_set_running(sc); + error = in_gre_attach(sc); + if (error != 0) { + sc->gre_family = 0; + free(sc->gre_hdr, M_GRE); + } break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: @@ -354,5 +590,6 @@ V_ipv4_hashtbl = NULL; GRE_WAIT(); gre_hashdestroy(V_ipv4_srchashtbl); + gre_hashdestroy((struct gre_list *)V_ipv4_sockets); } } Index: sys/netinet6/ip6_gre.c =================================================================== --- sys/netinet6/ip6_gre.c +++ sys/netinet6/ip6_gre.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -51,8 +52,12 @@ #include #include #endif +#include #include +#include #include +#include +#include #include #include #include @@ -65,14 +70,22 @@ SYSCTL_INT(_net_inet6_ip6, OID_AUTO, grehlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_gre_hlim), 0, "Default hop limit for encapsulated packets"); +struct in6_gre_socket { + struct gre_socket base; + struct in6_addr addr; /* scope zone id is embedded */ +}; +VNET_DEFINE_STATIC(struct gre_sockets *, ipv6_sockets) = NULL; VNET_DEFINE_STATIC(struct gre_list *, ipv6_hashtbl) = NULL; VNET_DEFINE_STATIC(struct gre_list *, ipv6_srchashtbl) = NULL; +#define V_ipv6_sockets VNET(ipv6_sockets) #define V_ipv6_hashtbl VNET(ipv6_hashtbl) #define V_ipv6_srchashtbl VNET(ipv6_srchashtbl) #define GRE_HASH(src, dst) (V_ipv6_hashtbl[\ in6_gre_hashval((src), (dst)) & (GRE_HASH_SIZE - 1)]) #define GRE_SRCHASH(src) (V_ipv6_srchashtbl[\ fnv_32_buf((src), sizeof(*src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)]) +#define GRE_SOCKHASH(src) (V_ipv6_sockets[\ + fnv_32_buf((src), sizeof(*src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)]) #define GRE_HASH_SC(sc) GRE_HASH(&(sc)->gre_oip6.ip6_src,\ &(sc)->gre_oip6.ip6_dst) @@ -85,18 +98,43 @@ return (fnv_32_buf(dst, sizeof(*dst), ret)); } +static struct gre_socket* +in6_gre_lookup_socket(const struct in6_addr *addr) +{ + struct gre_socket *gs; + struct in6_gre_socket *s; + + CK_LIST_FOREACH(gs, &GRE_SOCKHASH(addr), chain) { + s = __containerof(gs, struct in6_gre_socket, base); + if (IN6_ARE_ADDR_EQUAL(&s->addr, addr)) + break; + } + return (gs); +} + static int in6_gre_checkdup(const struct gre_softc *sc, const struct in6_addr *src, - const struct in6_addr *dst) + const struct in6_addr *dst, uint32_t opts) { + struct gre_list *head; struct gre_softc *tmp; + struct gre_socket *gs; if (sc->gre_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_src, src) && - IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_dst, dst)) + IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_dst, dst) && + (sc->gre_options & GRE_UDPENCAP) == (opts & GRE_UDPENCAP)) return (EEXIST); - CK_LIST_FOREACH(tmp, &GRE_HASH(src, dst), chain) { + if (opts & GRE_UDPENCAP) { + gs = in6_gre_lookup_socket(src); + if (gs == NULL) + return (0); + head = &gs->list; + } else + head = &GRE_HASH(src, dst); + + CK_LIST_FOREACH(tmp, head, chain) { if (tmp == sc) continue; if (IN6_ARE_ADDR_EQUAL(&tmp->gre_oip6.ip6_src, src) && @@ -175,33 +213,237 @@ } static void +in6_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp, + const struct sockaddr *sa, void *ctx) +{ + struct epoch_tracker et; + struct gre_socket *gs; + struct gre_softc *sc; + struct sockaddr_in6 dst; + + NET_EPOCH_ENTER(et); + /* + * udp_append() holds reference to inp, it is safe to check + * inp_flags2 without INP_RLOCK(). + * If socket was closed before we have entered NET_EPOCH section, + * INP_FREED flag should be set. Otherwise it should be safe to + * make access to ctx data, because gre_so will be freed by + * gre_sofree() via epoch_call(). + */ + if (__predict_false(inp->inp_flags2 & INP_FREED)) { + NET_EPOCH_EXIT(et); + m_freem(m); + return; + } + + gs = (struct gre_socket *)ctx; + dst = *(const struct sockaddr_in6 *)sa; + if (sa6_embedscope(&dst, 0)) { + NET_EPOCH_EXIT(et); + m_freem(m); + return; + } + CK_LIST_FOREACH(sc, &gs->list, chain) { + if (IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_dst, &dst.sin6_addr)) + break; + } + if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){ + gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc); + NET_EPOCH_EXIT(et); + return; + } + m_freem(m); + NET_EPOCH_EXIT(et); +} + +static int +in6_gre_setup_socket(struct gre_softc *sc) +{ + struct sockopt sopt; + struct sockaddr_in6 sin6; + struct in6_gre_socket *s; + struct gre_socket *gs; + int error, value; + + /* + * NOTE: we are protected with gre_ioctl_sx lock. + * + * First check that socket is already configured. + * If so, check that source addres was not changed. + * If address is different, check that there are no other tunnels + * and close socket. + */ + gs = sc->gre_so; + if (gs != NULL) { + s = __containerof(gs, struct in6_gre_socket, base); + if (!IN6_ARE_ADDR_EQUAL(&s->addr, &sc->gre_oip6.ip6_src)) { + if (CK_LIST_EMPTY(&gs->list)) { + CK_LIST_REMOVE(gs, chain); + soclose(gs->so); + epoch_call(net_epoch_preempt, &gs->epoch_ctx, + gre_sofree); + } + gs = sc->gre_so = NULL; + } + } + + if (gs == NULL) { + /* + * Check that socket for given address is already + * configured. + */ + gs = in6_gre_lookup_socket(&sc->gre_oip6.ip6_src); + if (gs == NULL) { + s = malloc(sizeof(*s), M_GRE, M_WAITOK | M_ZERO); + s->addr = sc->gre_oip6.ip6_src; + gs = &s->base; + + error = socreate(sc->gre_family, &gs->so, + SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, + curthread); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot create socket: %d\n", error); + free(s, M_GRE); + return (error); + } + + error = udp_set_kernel_tunneling(gs->so, + in6_gre_udp_input, NULL, gs); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot set UDP tunneling: %d\n", error); + goto fail; + } + + memset(&sopt, 0, sizeof(sopt)); + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = IPPROTO_IPV6; + sopt.sopt_name = IPV6_BINDANY; + sopt.sopt_val = &value; + sopt.sopt_valsize = sizeof(value); + value = 1; + error = sosetopt(gs->so, &sopt); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot set IPV6_BINDANY opt: %d\n", + error); + goto fail; + } + + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(sin6); + sin6.sin6_addr = sc->gre_oip6.ip6_src; + sin6.sin6_port = htons(GRE_UDPPORT); + error = sa6_recoverscope(&sin6); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot determine scope zone id: %d\n", + error); + goto fail; + } + error = sobind(gs->so, (struct sockaddr *)&sin6, + curthread); + if (error != 0) { + if_printf(GRE2IFP(sc), + "cannot bind socket: %d\n", error); + goto fail; + } + /* Add socket to the chain */ + CK_LIST_INSERT_HEAD( + &GRE_SOCKHASH(&sc->gre_oip6.ip6_src), gs, chain); + } + } + + /* Add softc to the socket's list */ + CK_LIST_INSERT_HEAD(&gs->list, sc, chain); + sc->gre_so = gs; + return (0); +fail: + soclose(gs->so); + free(s, M_GRE); + return (error); +} + +static int in6_gre_attach(struct gre_softc *sc) { + struct grehdr *gh; + int error; - sc->gre_hlen = sizeof(struct greip6); + if (sc->gre_options & GRE_UDPENCAP) { + sc->gre_csumflags = CSUM_UDP_IPV6; + sc->gre_hlen = sizeof(struct greudp6); + sc->gre_oip6.ip6_nxt = IPPROTO_UDP; + gh = &sc->gre_udp6hdr->gi6_gre; + gre_update_udphdr(sc, &sc->gre_udp6, + in6_cksum_pseudo(&sc->gre_oip6, 0, 0, 0)); + } else { + sc->gre_hlen = sizeof(struct greip6); + sc->gre_oip6.ip6_nxt = IPPROTO_GRE; + gh = &sc->gre_ip6hdr->gi6_gre; + } sc->gre_oip6.ip6_vfc = IPV6_VERSION; - sc->gre_oip6.ip6_nxt = IPPROTO_GRE; - gre_updatehdr(sc, &sc->gre_gi6hdr->gi6_gre); - CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain); + gre_update_hdr(sc, gh); + + /* + * If we return error, this means that sc is not linked, + * and caller should reset gre_family and free(sc->gre_hdr). + */ + if (sc->gre_options & GRE_UDPENCAP) { + error = in6_gre_setup_socket(sc); + if (error != 0) + return (error); + } else + CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain); CK_LIST_INSERT_HEAD(&GRE_SRCHASH(&sc->gre_oip6.ip6_src), sc, srchash); + + /* Set IFF_DRV_RUNNING if interface is ready */ + in6_gre_set_running(sc); + return (0); } -void +int in6_gre_setopts(struct gre_softc *sc, u_long cmd, uint32_t value) { - - MPASS(cmd == GRESKEY || cmd == GRESOPTS); + int error; /* NOTE: we are protected with gre_ioctl_sx lock */ + MPASS(cmd == GRESKEY || cmd == GRESOPTS || cmd == GRESPORT); MPASS(sc->gre_family == AF_INET6); + + /* + * If we are going to change encapsulation protocol, do check + * for duplicate tunnels. Return EEXIST here to do not confuse + * user. + */ + if (cmd == GRESOPTS && + (sc->gre_options & GRE_UDPENCAP) != (value & GRE_UDPENCAP) && + in6_gre_checkdup(sc, &sc->gre_oip6.ip6_src, + &sc->gre_oip6.ip6_dst, value) == EADDRNOTAVAIL) + return (EEXIST); + CK_LIST_REMOVE(sc, chain); CK_LIST_REMOVE(sc, srchash); GRE_WAIT(); - if (cmd == GRESKEY) + switch (cmd) { + case GRESKEY: sc->gre_key = value; - else + break; + case GRESOPTS: sc->gre_options = value; - in6_gre_attach(sc); + break; + case GRESPORT: + sc->gre_port = value; + break; + } + error = in6_gre_attach(sc); + if (error != 0) { + sc->gre_family = 0; + free(sc->gre_hdr, M_GRE); + } + return (error); } int @@ -242,9 +484,10 @@ if (V_ipv6_hashtbl == NULL) { V_ipv6_hashtbl = gre_hashinit(); V_ipv6_srchashtbl = gre_hashinit(); + V_ipv6_sockets = (struct gre_sockets *)gre_hashinit(); } error = in6_gre_checkdup(sc, &src->sin6_addr, - &dst->sin6_addr); + &dst->sin6_addr, sc->gre_options); if (error == EADDRNOTAVAIL) break; if (error == EEXIST) { @@ -252,7 +495,7 @@ error = 0; break; } - ip6 = malloc(sizeof(struct greip6) + 3 * sizeof(uint32_t), + ip6 = malloc(sizeof(struct greudp6) + 3 * sizeof(uint32_t), M_GRE, M_WAITOK | M_ZERO); ip6->ip6_src = src->sin6_addr; ip6->ip6_dst = dst->sin6_addr; @@ -268,8 +511,11 @@ sc->gre_hdr = ip6; sc->gre_oseq = 0; sc->gre_iseq = UINT32_MAX; - in6_gre_attach(sc); - in6_gre_set_running(sc); + error = in6_gre_attach(sc); + if (error != 0) { + sc->gre_family = 0; + free(sc->gre_hdr, M_GRE); + } break; case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: @@ -294,12 +540,14 @@ } int -in6_gre_output(struct mbuf *m, int af __unused, int hlen __unused) +in6_gre_output(struct mbuf *m, int af __unused, int hlen __unused, + uint32_t flowid) { struct greip6 *gi6; gi6 = mtod(m, struct greip6 *); gi6->gi6_ip6.ip6_hlim = V_ip6_gre_hlim; + gi6->gi6_ip6.ip6_flow |= flowid & IPV6_FLOWLABEL_MASK; return (ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL, NULL)); } @@ -342,5 +590,6 @@ V_ipv6_hashtbl = NULL; GRE_WAIT(); gre_hashdestroy(V_ipv6_srchashtbl); + gre_hashdestroy((struct gre_list *)V_ipv6_sockets); } }