diff --git a/sbin/ifconfig/ifconfig.8 b/sbin/ifconfig/ifconfig.8 --- a/sbin/ifconfig/ifconfig.8 +++ b/sbin/ifconfig/ifconfig.8 @@ -2947,25 +2947,13 @@ Another name for the .Fl tunnel parameter. -.It Cm noclamp -This flag prevents the MTU from being clamped to 1280 bytes, the -minimum MTU for IPv6, when the outer protocol is IPv6. When the -flag is set, the MTU value configured on the interface will be -used instead of the fixed length of 1280 bytes. For more details, -please refer to the -.Ar MTU Configuration and Path MTU Discovery -section in -.Xr gif 4 . -.It Cm -noclamp -Clear the flag -.Cm noclamp . .It Cm ignore_source Set a flag to accept encapsulated packets destined to this host independently from source address. This may be useful for hosts, that receive encapsulated packets from the load balancers. .It Cm -ignore_source -Clear the flag +Clear a flag .Cm ignore_source . .El .Ss GRE Tunnel Parameters diff --git a/sbin/ifconfig/ifgif.c b/sbin/ifconfig/ifgif.c --- a/sbin/ifconfig/ifgif.c +++ b/sbin/ifconfig/ifgif.c @@ -49,7 +49,6 @@ #include "ifconfig.h" static const char *GIFBITS[] = { - [0] = "NOCLAMP", [1] = "IGNORE_SOURCE", }; @@ -91,8 +90,6 @@ } static struct cmd gif_cmds[] = { - DEF_CMD("noclamp", GIF_NOCLAMP, setgifopts), - DEF_CMD("-noclamp", -GIF_NOCLAMP, setgifopts), DEF_CMD("ignore_source", GIF_IGNORE_SOURCE, setgifopts), DEF_CMD("-ignore_source", -GIF_IGNORE_SOURCE, setgifopts), }; diff --git a/share/man/man4/gif.4 b/share/man/man4/gif.4 --- a/share/man/man4/gif.4 +++ b/share/man/man4/gif.4 @@ -1,7 +1,7 @@ .\" $KAME: gif.4,v 1.28 2001/05/18 13:15:56 itojun Exp $ .\" .\" Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. -.\" Copyright (C) 2024 Hiroki Sato +.\" Copyright (C) 2024, 2025 Hiroki Sato .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without @@ -28,7 +28,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. 
.\" -.Dd August 27, 2025 +.Dd September 12, 2025 .Dt GIF 4 .Os .Sh NAME @@ -102,41 +102,18 @@ .Ss MTU Configuration and Path MTU Discovery The .Nm -interface uses the fixed length, -.Li 1280 , -to determine whether the outgoing IPv6 packets are split. -This means the MTU value configured on the interface will be ignored -when the outer protocol is IPv6. -When the -.Dv NOCLAMP -interface flag is set, -.Nm -uses the same configured value as IPv4 communications. -This behavior prevents potential issues when the path MTU is -smaller than the interface MTU. -This section describes the reason why the default behavior is different. -The -.Dv NOCLAMP -interface flag can be set using the following command: -.Pp -.Dl ifconfig Ar gif0 Cm noclamp -.Pp -and clear the flag using the following: -.Pp -.Dl ifconfig Ar gif0 Cm -noclamp -.Pp -where -.Ar gif0 -is the actual interface name. +interface uses the configured MTU, +.Li 1280 +by default, +to determine whether the outgoing packets are split. +The default value is the most conservative choice to prevent packet loss. +This section explains the reason and the mechanism behind it. .Pp A tunnel interface always has an implicit smaller MTU for the inner protocol than the outer protocol because of the additional header. -Note that the interface MTU on a +The .Nm -interface, -the default value is -.Li 1280 , -is used as MTU for the outer protocol. +interface uses the interface MTU as the MTU for the outer protocol. This means that the MTU for the inner protocol varies depending on the outer protocol header length. If an outgoing packet bigger than the inner protocol MTU arrives at a @@ -160,7 +137,8 @@ .Nm interface helps to mitigate this reduced performance issue, it can also cause packet losses on the intermediate narrowest path -between the two communication endpoints in IPv6. +between the two communication endpoints, +especially in IPv6. IPv6 allows fragmentation only on the sender, not on the routers in the communication path. 
A big outgoing packet will be dropped on a router with a smaller MTU. @@ -191,13 +169,13 @@ In order to avoid this, a .Nm -interface silently splits a packet of over 1240 octets into fragments to make -the outer protocol packets equal or shorter than 1280 octets, -even when the interface MTU is configured as larger than 1280. -Note that this occurs only when the outer protocol is IPv6. +interface splits a packet of over 1240 octets into +fragments to make the outer protocol packets equal to or shorter +than 1280 octets for both IPv4 and IPv6. +The reason for the length .Li 1280 -is the smallest MTU in IPv6 and guarantees no packet loss occurs -on intermediate routers. +is that this is the smallest MTU in IPv6 and guarantees that no packet loss +occurs on intermediate routers. .Pp As mentioned earlier, the performance is sub-optimal if the actual path MTU is larger than @@ -214,24 +192,23 @@ interface as a member of .Xr if_bridge 4 interface. -The +Usually, .Xr if_bridge 4 -interface forcibly changes the MTU of the +sets the MTU of all member interfaces to that of the first member, +typically 1500, +except for .Nm -interface with those for the other member interfaces, -which are likely 1500. +interfaces. In this case, -a situation in which the MTU of the +fragmentation in 1280 octets always occurs at the .Nm -interface is 1500 but fragmentation in 1280 octets always occurs. +interface because its MTU is 1280. .Pp -The default behavior is most conservative to prevent confusing packet loss. +The default MTU is the most conservative value to prevent confusing packet loss. Depending on the network configuration, -enabling the -.Dv NOCLAMP -interface flag might be helpful for better performance. +increasing the MTU might be helpful for better performance. It is crucial to ensure that the path MTU is equal to or larger than -the interface MTU when enabling this flag. +the interface MTU. 
.Ss ECN friendly behavior The .Nm diff --git a/sys/net/if.c b/sys/net/if.c --- a/sys/net/if.c +++ b/sys/net/if.c @@ -2737,8 +2737,11 @@ return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); - /* Disallow MTU changes on bridge member interfaces. */ - if (ifp->if_bridge) + /* + * Disallow MTU changes on bridge member interfaces except + * for gif(4) interfaces. + */ + if (ifp->if_type != IFT_GIF && ifp->if_bridge != NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { diff --git a/sys/net/if_bridge.c b/sys/net/if_bridge.c --- a/sys/net/if_bridge.c +++ b/sys/net/if_bridge.c @@ -1103,6 +1103,14 @@ break; } CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + /* + * The gif(4) interface is used for EtherIP tunnel + * and the MTU does not always need to equal to + * the parent bridge(4) interface because it is + * for the outer protocol. + */ + if (bif->bif_ifp->if_type == IFT_GIF) + continue; error = (*bif->bif_ifp->if_ioctl)(bif->bif_ifp, SIOCSIFMTU, (caddr_t)ifr); if (error != 0) { @@ -1118,6 +1126,8 @@ /* Restore the previous MTU on all member interfaces. */ ifr->ifr_mtu = oldmtu; CK_LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { + if (bif->bif_ifp->if_type == IFT_GIF) + continue; (*bif->bif_ifp->if_ioctl)(bif->bif_ifp, SIOCSIFMTU, (caddr_t)ifr); } @@ -1478,7 +1488,8 @@ /* Allow the first Ethernet member to define the MTU */ if (CK_LIST_EMPTY(&sc->sc_iflist)) sc->sc_ifp->if_mtu = ifs->if_mtu; - else if (sc->sc_ifp->if_mtu != ifs->if_mtu) { + else if (ifs->if_type != IFT_GIF && /* gif(4) does not need the same MTU. 
*/ + sc->sc_ifp->if_mtu != ifs->if_mtu) { struct ifreq ifr; snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", diff --git a/sys/net/if_gif.h b/sys/net/if_gif.h --- a/sys/net/if_gif.h +++ b/sys/net/if_gif.h @@ -120,8 +120,7 @@ #define GIFGOPTS _IOWR('i', 150, struct ifreq) #define GIFSOPTS _IOW('i', 151, struct ifreq) -#define GIF_NOCLAMP 0x0001 #define GIF_IGNORE_SOURCE 0x0002 -#define GIF_OPTMASK (GIF_NOCLAMP|GIF_IGNORE_SOURCE) +#define GIF_OPTMASK (GIF_IGNORE_SOURCE) #endif /* _NET_IF_GIF_H_ */ diff --git a/sys/netinet/in_gif.c b/sys/netinet/in_gif.c --- a/sys/netinet/in_gif.c +++ b/sys/netinet/in_gif.c @@ -280,6 +280,7 @@ M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) return (ENOBUFS); + m->m_pkthdr.rcvif = ifp; /* For IP_NOROUTEMTU */ ip = mtod(m, struct ip *); MPASS(sc->gif_family == AF_INET); @@ -290,7 +291,7 @@ ip->ip_len = htons(m->m_pkthdr.len); ip->ip_tos = ecn; - return (ip_output(m, NULL, NULL, 0, NULL, NULL)); + return (ip_output(m, NULL, NULL, IP_NOROUTEMTU, NULL, NULL)); } static int diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -544,7 +544,18 @@ (in_broadcast(ip->ip_dst) || in_ifaddr_broadcast( ((const struct sockaddr_in *)gw)->sin_addr, ia))); } - + if (flags & IP_NOROUTEMTU) { + if (m->m_pkthdr.rcvif != NULL) { + /* + * The rcvif is set to NULL because this is used to + * tell which interface is used for the MTU when + * IP_NOROUTEMTU is specified. + */ + mtu = m->m_pkthdr.rcvif->if_mtu; + m->m_pkthdr.rcvif = NULL; + } else + mtu = ifp->if_mtu; + } /* Catch a possible divide by zero later. 
*/ KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p", __func__, mtu, ro, diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h --- a/sys/netinet/ip_var.h +++ b/sys/netinet/ip_var.h @@ -179,6 +179,7 @@ #define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */ #define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */ #define IP_NO_SND_TAG_RL 0x80 /* Don't send down the ratelimit tag */ +#define IP_NOROUTEMTU 0x100 /* Use interface MTU forcibly */ #ifdef __NO_STRICT_ALIGNMENT #define IP_HDR_ALIGNED_P(ip) 1 diff --git a/sys/netinet6/in6_gif.c b/sys/netinet6/in6_gif.c --- a/sys/netinet6/in6_gif.c +++ b/sys/netinet6/in6_gif.c @@ -194,11 +194,6 @@ sc->gif_options = options; in6_gif_attach(sc); } - - if ((options & GIF_NOCLAMP) != - (sc->gif_options & GIF_NOCLAMP)) { - sc->gif_options = options; - } return (0); } @@ -294,13 +289,13 @@ { struct gif_softc *sc = ifp->if_softc; struct ip6_hdr *ip6; - u_long mtu; /* prepend new IP header */ NET_EPOCH_ASSERT(); M_PREPEND(m, sizeof(struct ip6_hdr), M_NOWAIT); if (m == NULL) return (ENOBUFS); + m->m_pkthdr.rcvif = ifp; /* For IPV6_NOROUTEMTU */ ip6 = mtod(m, struct ip6_hdr *); MPASS(sc->gif_family == AF_INET6); @@ -309,16 +304,8 @@ ip6->ip6_flow |= htonl((uint32_t)ecn << 20); ip6->ip6_nxt = proto; ip6->ip6_hlim = V_ip6_gif_hlim; - /* - * Enforce fragmentation to minimum MTU, even if the interface MTU - * is larger, to avoid path MTU discovery when NOCLAMP is not - * set (default). IPv6 does not allow fragmentation on intermediate - * router nodes, so it is too painful to ask for resend of inner - * packet, to achieve path MTU discovery for encapsulated packets. - */ - mtu = ((sc->gif_options & GIF_NOCLAMP) == 0) ? 
IPV6_MINMTU : 0; - return (ip6_output(m, 0, NULL, mtu, 0, NULL, NULL)); + return (ip6_output(m, 0, NULL, IPV6_NOROUTEMTU, 0, NULL, NULL)); } static int diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -938,8 +938,21 @@ *ifpp = ifp; /* Determine path MTU. */ - ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst, &mtu, fibnum, - *nexthdrp); + if (flags & IPV6_NOROUTEMTU) { + if (m->m_pkthdr.rcvif != NULL) { + /* + * The rcvif is set to NULL because this is used to + * tell which interface is used for the MTU when + * IPV6_NOROUTEMTU is specified. + */ + mtu = m->m_pkthdr.rcvif->if_mtu; + m->m_pkthdr.rcvif = NULL; + } else + mtu = ifp->if_mtu; + } else + ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst, + &mtu, fibnum, *nexthdrp); + KASSERT(mtu > 0, ("%s:%d: mtu %ld, ro_pmtu %p ro %p ifp %p fibnum %u", __func__, __LINE__, mtu, ro_pmtu, ro, ifp, fibnum)); diff --git a/sys/netinet6/ip6_var.h b/sys/netinet6/ip6_var.h --- a/sys/netinet6/ip6_var.h +++ b/sys/netinet6/ip6_var.h @@ -286,6 +286,7 @@ #define IPV6_UNSPECSRC 0x01 /* allow :: as the source address */ #define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */ #define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */ +#define IPV6_NOROUTEMTU 0x08 /* Use interface MTU forcibly */ #ifdef __NO_STRICT_ALIGNMENT #define IP6_HDR_ALIGNED_P(ip) 1